add events pipeline
parent
d80e675211
commit
1b52a83dcc
|
|
@ -14,18 +14,16 @@ DatasetDataFrame = create_dagster_pandas_dataframe_type(
|
||||||
PandasColumn.string_column("filepath"),
|
PandasColumn.string_column("filepath"),
|
||||||
PandasColumn.string_column("content"),
|
PandasColumn.string_column("content"),
|
||||||
PandasColumn.string_column("language"),
|
PandasColumn.string_column("language"),
|
||||||
|
|
||||||
PandasColumn.integer_column("max_line_length"),
|
PandasColumn.integer_column("max_line_length"),
|
||||||
PandasColumn.float_column("avg_line_length"),
|
PandasColumn.float_column("avg_line_length"),
|
||||||
PandasColumn.float_column("alphanum_fraction"),
|
PandasColumn.float_column("alphanum_fraction"),
|
||||||
|
|
||||||
PandasColumn.exists("tags"),
|
PandasColumn.exists("tags"),
|
||||||
]
|
],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@asset(dagster_type=DatasetDataFrame)
|
@asset(dagster_type=DatasetDataFrame)
|
||||||
def dataset_files():
|
def dataset():
|
||||||
"""Get source code information from TABBY_ROOT"""
|
"""Get source code information from TABBY_ROOT"""
|
||||||
|
|
||||||
ds = []
|
ds = []
|
||||||
|
|
@ -35,4 +33,51 @@ def dataset_files():
|
||||||
ds.append(json.loads(line))
|
ds.append(json.loads(line))
|
||||||
|
|
||||||
df = pd.DataFrame(ds)
|
df = pd.DataFrame(ds)
|
||||||
return Output(df, metadata={"num_files": len(df) })
|
metadata = {
|
||||||
|
"num_records": len(df),
|
||||||
|
"preview": MetadataValue.md(
|
||||||
|
df.head()[
|
||||||
|
[
|
||||||
|
"git_url",
|
||||||
|
"filepath",
|
||||||
|
"language",
|
||||||
|
"max_line_length",
|
||||||
|
"avg_line_length",
|
||||||
|
"alphanum_fraction",
|
||||||
|
]
|
||||||
|
].to_markdown()
|
||||||
|
),
|
||||||
|
}
|
||||||
|
return Output(df, metadata=metadata)
|
||||||
|
|
||||||
|
EventDataFrame = create_dagster_pandas_dataframe_type(
|
||||||
|
name="EventDataFrame",
|
||||||
|
columns=[
|
||||||
|
PandasColumn.integer_column("ts"),
|
||||||
|
PandasColumn.exists("event"),
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
|
@asset(dagster_type=EventDataFrame)
|
||||||
|
def events():
|
||||||
|
"""Get events information from TABBY_ROOT"""
|
||||||
|
|
||||||
|
ds = []
|
||||||
|
for path in glob.glob(constants.TABBY_EVENTS_FILEPATTERN):
|
||||||
|
with open(path, "r") as f:
|
||||||
|
for line in f.readlines():
|
||||||
|
ds.append(json.loads(line))
|
||||||
|
|
||||||
|
df = pd.DataFrame(ds)
|
||||||
|
metadata = {
|
||||||
|
"num_records": len(df),
|
||||||
|
"preview": MetadataValue.md(
|
||||||
|
df.head()[
|
||||||
|
[
|
||||||
|
"ts",
|
||||||
|
"event"
|
||||||
|
]
|
||||||
|
].to_markdown()
|
||||||
|
),
|
||||||
|
}
|
||||||
|
return Output(df, metadata=metadata)
|
||||||
|
|
|
||||||
|
|
@ -3,3 +3,5 @@ import os
|
||||||
TABBY_ROOT = os.environ.get("TABBY_ROOT", os.path.expanduser("~/.tabby"))
|
TABBY_ROOT = os.environ.get("TABBY_ROOT", os.path.expanduser("~/.tabby"))
|
||||||
|
|
||||||
TABBY_DATASET_FILEPATTERN = os.path.join(TABBY_ROOT, "dataset/*.jsonl")
|
TABBY_DATASET_FILEPATTERN = os.path.join(TABBY_ROOT, "dataset/*.jsonl")
|
||||||
|
|
||||||
|
TABBY_EVENTS_FILEPATTERN = os.path.join(TABBY_ROOT, "events/*.json")
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue