Visualize populated dataset in admin (#96)
parent
aa5608fcb3
commit
c4101a4d13
|
|
@ -42,6 +42,10 @@ def dataset_info():
|
|||
with col2:
|
||||
st.bar_chart(count_by_language(dataset))
|
||||
|
||||
df = pd.DataFrame(dataset)
|
||||
del df["id"]
|
||||
st.dataframe(df, use_container_width=True, height=600)
|
||||
|
||||
|
||||
def project_list():
|
||||
if len(projects) <= 0:
|
||||
|
|
@ -63,6 +67,6 @@ def project_list():
|
|||
st.write(f"Status: `{sha}`")
|
||||
|
||||
|
||||
dataset_info()
|
||||
st.write("---")
|
||||
project_list()
|
||||
st.write("---")
|
||||
dataset_info()
|
||||
|
|
|
|||
|
|
@ -84,6 +84,20 @@ def to_id(*args):
|
|||
return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
|
||||
|
||||
|
||||
def basic_filters(line_max=100, line_mean=100, alpha_frac=0.25):
    """Build a predicate that screens examples by line length and alphanumeric share.

    Args:
        line_max: Upper bound on an example's ``max_line_length``; anything
            strictly greater is rejected.
        line_mean: Upper bound on an example's ``avg_line_length``; anything
            strictly greater is rejected.
        alpha_frac: Lower bound on an example's ``alphanum_fraction``;
            anything strictly smaller is rejected.

    Returns:
        A one-argument callable that maps an example (a mapping holding the
        three keys above) to ``True`` when it should be kept and ``False``
        when it should be dropped.
    """

    def fn(example):
        """Filter files based on line length and % alphanumeric characters"""
        # The checks are evaluated left to right and stop at the first one
        # that fires, so later keys are only read when earlier checks pass.
        rejected = (
            example["max_line_length"] > line_max
            or example["avg_line_length"] > line_mean
            or example["alphanum_fraction"] < alpha_frac
        )
        return not rejected

    return fn
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
valid_extensions = read_valid_extensions()
|
||||
|
||||
|
|
@ -103,6 +117,7 @@ if __name__ == "__main__":
|
|||
)
|
||||
|
||||
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
|
||||
ds = ds.filter(basic_filters())
|
||||
ds.save_to_disk(args.output_dir)
|
||||
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue