From 1b52a83dcc0bdb868829c56651989b727e2c2469 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 13 Oct 2023 21:43:00 -0700 Subject: [PATCH] add events pipeline --- python/tabby/assets.py | 57 ++++++++++++++++++++++++++++++++++----- python/tabby/constants.py | 2 ++ 2 files changed, 53 insertions(+), 6 deletions(-) diff --git a/python/tabby/assets.py b/python/tabby/assets.py index a958f88..05a2a6c 100644 --- a/python/tabby/assets.py +++ b/python/tabby/assets.py @@ -9,23 +9,21 @@ from . import constants DatasetDataFrame = create_dagster_pandas_dataframe_type( name="DatasetDataFrame", - columns = [ + columns=[ PandasColumn.string_column("git_url"), PandasColumn.string_column("filepath"), PandasColumn.string_column("content"), PandasColumn.string_column("language"), - PandasColumn.integer_column("max_line_length"), PandasColumn.float_column("avg_line_length"), PandasColumn.float_column("alphanum_fraction"), - PandasColumn.exists("tags"), - ] + ], ) @asset(dagster_type=DatasetDataFrame) -def dataset_files(): +def dataset(): """Get source code information from TABBY_ROOT""" ds = [] @@ -35,4 +33,51 @@ def dataset_files(): ds.append(json.loads(line)) df = pd.DataFrame(ds) - return Output(df, metadata={"num_files": len(df) }) + metadata = { + "num_records": len(df), + "preview": MetadataValue.md( + df.head()[ + [ + "git_url", + "filepath", + "language", + "max_line_length", + "avg_line_length", + "alphanum_fraction", + ] + ].to_markdown() + ), + } + return Output(df, metadata=metadata) + +EventDataFrame = create_dagster_pandas_dataframe_type( + name="EventDataFrame", + columns=[ + PandasColumn.integer_column("ts"), + PandasColumn.exists("event"), + ], +) + +@asset(dagster_type=EventDataFrame) +def events(): + """Get events information from TABBY_ROOT""" + + ds = [] + for path in glob.glob(constants.TABBY_EVENTS_FILEPATTERN): + with open(path, "r") as f: + for line in f.readlines(): + ds.append(json.loads(line)) + + df = pd.DataFrame(ds) + metadata = { + "num_records": len(df), + "preview": MetadataValue.md( + df.head()[ + [ + "ts", + "event" + ] + ].to_markdown() + ), + } + return Output(df, metadata=metadata) diff --git a/python/tabby/constants.py b/python/tabby/constants.py index 2451097..83fa5df 100644 --- a/python/tabby/constants.py +++ b/python/tabby/constants.py @@ -3,3 +3,5 @@ import os TABBY_ROOT = os.environ.get("TABBY_ROOT", os.path.expanduser("~/.tabby")) TABBY_DATASET_FILEPATTERN = os.path.join(TABBY_ROOT, "dataset/*.jsonl") + +TABBY_EVENTS_FILEPATTERN = os.path.join(TABBY_ROOT, "events/*.json")