update
parent
1b52a83dcc
commit
4d1d8965e0
|
|
@ -4,9 +4,10 @@ setup(
|
||||||
name="tabby",
|
name="tabby",
|
||||||
packages=find_packages(exclude=["tabby_tests"]),
|
packages=find_packages(exclude=["tabby_tests"]),
|
||||||
install_requires=[
|
install_requires=[
|
||||||
|
"datasets",
|
||||||
"dagster",
|
"dagster",
|
||||||
"dagster-cloud",
|
"dagster-cloud",
|
||||||
"dagster-pandas"
|
"dagster-pandas",
|
||||||
],
|
],
|
||||||
extras_require={"dev": ["dagster-webserver", "pytest"]},
|
extras_require={"dev": ["dagster-webserver", "pytest"]},
|
||||||
)
|
)
|
||||||
|
|
|
||||||
|
|
@ -24,7 +24,7 @@ DatasetDataFrame = create_dagster_pandas_dataframe_type(
|
||||||
|
|
||||||
@asset(dagster_type=DatasetDataFrame)
|
@asset(dagster_type=DatasetDataFrame)
|
||||||
def dataset():
|
def dataset():
|
||||||
"""Get source code information from TABBY_ROOT"""
|
"""Read source code dataset from TABBY_ROOT"""
|
||||||
|
|
||||||
ds = []
|
ds = []
|
||||||
for path in glob.glob(constants.TABBY_DATASET_FILEPATTERN):
|
for path in glob.glob(constants.TABBY_DATASET_FILEPATTERN):
|
||||||
|
|
@ -50,34 +50,16 @@ def dataset():
|
||||||
}
|
}
|
||||||
return Output(df, metadata=metadata)
|
return Output(df, metadata=metadata)
|
||||||
|
|
||||||
EventDataFrame = create_dagster_pandas_dataframe_type(
|
@asset
|
||||||
name="EventDataFrame",
|
def train_dataset(dataset):
|
||||||
columns=[
|
"""Filter source code dataset for training / evaluation"""
|
||||||
PandasColumn.integer_column("ts"),
|
from datasets import Dataset
|
||||||
PandasColumn.exists("event"),
|
|
||||||
],
|
|
||||||
)
|
|
||||||
|
|
||||||
@asset(dagster_type=EventDataFrame)
|
df = dataset
|
||||||
def events():
|
df = df[df["max_line_length"] < 300]
|
||||||
"""Get events information from TABBY_ROOT"""
|
df = df[df["avg_line_length"] < 150]
|
||||||
|
|
||||||
ds = []
|
|
||||||
for path in glob.glob(constants.TABBY_EVENTS_FILEPATTERN):
|
|
||||||
with open(path, "r") as f:
|
|
||||||
for line in f.readlines():
|
|
||||||
ds.append(json.loads(line))
|
|
||||||
|
|
||||||
df = pd.DataFrame(ds)
|
|
||||||
metadata = {
|
metadata = {
|
||||||
"num_records": len(df),
|
"num_records": len(df),
|
||||||
"preview": MetadataValue.md(
|
"num_filtered_records": len(dataset) - len(df)
|
||||||
df.head()[
|
|
||||||
[
|
|
||||||
"ts",
|
|
||||||
"event"
|
|
||||||
]
|
|
||||||
].to_markdown()
|
|
||||||
),
|
|
||||||
}
|
}
|
||||||
return Output(df, metadata=metadata)
|
return Output(Dataset.from_pandas(df), metadata=metadata)
|
||||||
Loading…
Reference in New Issue