Visualize populated dataset in admin (#96)

add-tracing
Meng Zhang 2023-04-13 12:46:35 +08:00 committed by GitHub
parent aa5608fcb3
commit c4101a4d13
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 2 deletions

View File

@ -42,6 +42,10 @@ def dataset_info():
with col2:
st.bar_chart(count_by_language(dataset))
df = pd.DataFrame(dataset)
del df["id"]
st.dataframe(df, use_container_width=True, height=600)
def project_list():
if len(projects) <= 0:
@ -63,6 +67,6 @@ def project_list():
st.write(f"Status: `{sha}`")
dataset_info()
st.write("---")
project_list()
st.write("---")
dataset_info()

View File

@ -84,6 +84,20 @@ def to_id(*args):
return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
def basic_filters(line_max=100, line_mean=100, alpha_frac=0.25):
    """Build a predicate that filters files on line length and alphanumeric share.

    The returned callable is meant to be passed to ``Dataset.filter``.

    Args:
        line_max: Largest accepted ``max_line_length`` (inclusive).
        line_mean: Largest accepted ``avg_line_length`` (inclusive).
        alpha_frac: Smallest accepted ``alphanum_fraction`` (inclusive).

    Returns:
        A function taking one example (a mapping with the keys
        ``max_line_length``, ``avg_line_length`` and ``alphanum_fraction``)
        and returning ``True`` when the example should be kept.
    """

    def fn(example):
        """Filter files based on line length and % alphanumeric characters"""
        # Each guard rejects as soon as one threshold is violated; the
        # comparisons are strict, so values equal to a threshold are kept.
        if example["max_line_length"] > line_max:
            return False
        if example["avg_line_length"] > line_mean:
            return False
        if example["alphanum_fraction"] < alpha_frac:
            return False
        return True

    return fn
if __name__ == "__main__":
valid_extensions = read_valid_extensions()
@ -103,6 +117,7 @@ if __name__ == "__main__":
)
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
ds = ds.filter(basic_filters())
ds.save_to_disk(args.output_dir)
ds.to_json(os.path.join(args.output_dir, "dumps.json"))