Visualize populated dataset in admin (#96)
parent
aa5608fcb3
commit
c4101a4d13
|
|
@ -42,6 +42,10 @@ def dataset_info():
|
||||||
with col2:
|
with col2:
|
||||||
st.bar_chart(count_by_language(dataset))
|
st.bar_chart(count_by_language(dataset))
|
||||||
|
|
||||||
|
df = pd.DataFrame(dataset)
|
||||||
|
del df["id"]
|
||||||
|
st.dataframe(df, use_container_width=True, height=600)
|
||||||
|
|
||||||
|
|
||||||
def project_list():
|
def project_list():
|
||||||
if len(projects) <= 0:
|
if len(projects) <= 0:
|
||||||
|
|
@ -63,6 +67,6 @@ def project_list():
|
||||||
st.write(f"Status: `{sha}`")
|
st.write(f"Status: `{sha}`")
|
||||||
|
|
||||||
|
|
||||||
dataset_info()
|
|
||||||
st.write("---")
|
|
||||||
project_list()
|
project_list()
|
||||||
|
st.write("---")
|
||||||
|
dataset_info()
|
||||||
|
|
|
||||||
|
|
@ -84,6 +84,20 @@ def to_id(*args):
|
||||||
return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
|
return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
|
||||||
|
|
||||||
|
|
||||||
|
def basic_filters(line_max=100, line_mean=100, alpha_frac=0.25):
    """Build a keep/drop predicate based on line length and alphanumeric share.

    Args:
        line_max: Largest accepted value of ``example["max_line_length"]``.
        line_mean: Largest accepted value of ``example["avg_line_length"]``.
        alpha_frac: Smallest accepted value of ``example["alphanum_fraction"]``.

    Returns:
        A callable that takes one example (a mapping providing the three keys
        above) and returns ``True`` to keep it or ``False`` to drop it.
    """

    def fn(example):
        """Filter files based on line length and % alphanumeric characters"""
        # Checks run in a fixed order and stop at the first failure, so later
        # keys are only read when the earlier thresholds were satisfied.
        if example["max_line_length"] > line_max:
            return False
        if example["avg_line_length"] > line_mean:
            return False
        if example["alphanum_fraction"] < alpha_frac:
            return False
        return True

    return fn
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
valid_extensions = read_valid_extensions()
|
valid_extensions = read_valid_extensions()
|
||||||
|
|
||||||
|
|
@ -103,6 +117,7 @@ if __name__ == "__main__":
|
||||||
)
|
)
|
||||||
|
|
||||||
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
|
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
|
||||||
|
ds = ds.filter(basic_filters())
|
||||||
ds.save_to_disk(args.output_dir)
|
ds.save_to_disk(args.output_dir)
|
||||||
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
|
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue