From c4101a4d13906481258e81c3dc5194873304fe90 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Thu, 13 Apr 2023 12:46:35 +0800
Subject: [PATCH] Visualize populated dataset in admin (#96)

---
 tabby/admin/pages/Projects.py         |  8 ++++++--
 tabby/tools/build_dataset/__main__.py | 15 +++++++++++++++
 2 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/tabby/admin/pages/Projects.py b/tabby/admin/pages/Projects.py
index 9ff5e1b..e27b0fd 100644
--- a/tabby/admin/pages/Projects.py
+++ b/tabby/admin/pages/Projects.py
@@ -42,6 +42,10 @@ def dataset_info():
     with col2:
         st.bar_chart(count_by_language(dataset))
 
+    df = pd.DataFrame(dataset)
+    del df["id"]
+    st.dataframe(df, use_container_width=True, height=600)
+
 
 def project_list():
     if len(projects) <= 0:
@@ -63,6 +67,6 @@ def project_list():
         st.write(f"Status: `{sha}`")
 
 
-dataset_info()
-st.write("---")
 project_list()
+st.write("---")
+dataset_info()
diff --git a/tabby/tools/build_dataset/__main__.py b/tabby/tools/build_dataset/__main__.py
index cd07fc3..e76cf5f 100644
--- a/tabby/tools/build_dataset/__main__.py
+++ b/tabby/tools/build_dataset/__main__.py
@@ -84,6 +84,20 @@ def to_id(*args):
     return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
 
 
+def basic_filters(line_max=100, line_mean=100, alpha_frac=0.25):
+    def fn(example):
+        """Filter files based on line length and % alphanumeric characters"""
+        if example["max_line_length"] > line_max:
+            return False
+        elif example["avg_line_length"] > line_mean:
+            return False
+        elif example["alphanum_fraction"] < alpha_frac:
+            return False
+        return True
+
+    return fn
+
+
 if __name__ == "__main__":
     valid_extensions = read_valid_extensions()
 
@@ -103,6 +117,7 @@ if __name__ == "__main__":
     )
 
     ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
+    ds = ds.filter(basic_filters())
     ds.save_to_disk(args.output_dir)
     ds.to_json(os.path.join(args.output_dir, "dumps.json"))
 