diff --git a/python/.gitignore b/python/.gitignore
new file mode 100644
index 0000000..d7d5db7
--- /dev/null
+++ b/python/.gitignore
@@ -0,0 +1,3 @@
+tabby.egg-info/
+__pycache__
+tmp*/
diff --git a/python/README.md b/python/README.md
new file mode 100644
index 0000000..c03f3e4
--- /dev/null
+++ b/python/README.md
@@ -0,0 +1,13 @@
+# tabby
+
+## Setup Development Environment
+
+```bash
+pip install -e ".[dev]"
+```
+
+Then, start the Dagster UI web server:
+
+```bash
+dagster dev
+```
diff --git a/python/pyproject.toml b/python/pyproject.toml
new file mode 100644
index 0000000..490b2e6
--- /dev/null
+++ b/python/pyproject.toml
@@ -0,0 +1,6 @@
+[build-system]
+requires = ["setuptools"]
+build-backend = "setuptools.build_meta"
+
+[tool.dagster]
+module_name = "tabby"
diff --git a/python/setup.cfg b/python/setup.cfg
new file mode 100644
index 0000000..afe0d2c
--- /dev/null
+++ b/python/setup.cfg
@@ -0,0 +1,2 @@
+[metadata]
+name = tabby
diff --git a/python/setup.py b/python/setup.py
new file mode 100644
index 0000000..23b4a35
--- /dev/null
+++ b/python/setup.py
@@ -0,0 +1,12 @@
+from setuptools import find_packages, setup
+
+setup(
+    name="tabby",
+    packages=find_packages(exclude=["tabby_tests"]),
+    install_requires=[
+        "dagster",
+        "dagster-cloud",
+        "dagster-pandas"
+    ],
+    extras_require={"dev": ["dagster-webserver", "pytest"]},
+)
diff --git a/python/tabby/__init__.py b/python/tabby/__init__.py
new file mode 100644
index 0000000..a36556a
--- /dev/null
+++ b/python/tabby/__init__.py
@@ -0,0 +1,9 @@
+from dagster import Definitions, load_assets_from_modules
+
+from . import assets
+
+all_assets = load_assets_from_modules([assets])
+
+defs = Definitions(
+    assets=all_assets,
+)
diff --git a/python/tabby/assets.py b/python/tabby/assets.py
new file mode 100644
index 0000000..a958f88
--- /dev/null
+++ b/python/tabby/assets.py
@@ -0,0 +1,51 @@
+from dagster import Out, Output, MetadataValue, asset
+from dagster_pandas import DataFrame, PandasColumn, create_dagster_pandas_dataframe_type
+
+import pandas as pd
+import json
+import glob
+
+from . import constants
+
+DatasetDataFrame = create_dagster_pandas_dataframe_type(
+    name="DatasetDataFrame",
+    columns = [
+        PandasColumn.string_column("git_url"),
+        PandasColumn.string_column("filepath"),
+        PandasColumn.string_column("content"),
+        PandasColumn.string_column("language"),
+
+        PandasColumn.integer_column("max_line_length"),
+        PandasColumn.float_column("avg_line_length"),
+        PandasColumn.float_column("alphanum_fraction"),
+
+        PandasColumn.exists("tags"),
+    ]
+)
+
+
+def _read_jsonl_records(paths):
+    """Parse every non-blank line of the JSON Lines files in ``paths``."""
+    records = []
+    for path in paths:
+        # JSON text is UTF-8; do not depend on the locale's default encoding.
+        with open(path, "r", encoding="utf-8") as f:
+            # Iterate the handle lazily instead of materializing readlines().
+            for line in f:
+                # Blank lines (e.g. a trailing newline) are not records.
+                if line.strip():
+                    records.append(json.loads(line))
+    return records
+
+
+@asset(dagster_type=DatasetDataFrame)
+def dataset_files():
+    """Load every dataset record matched by TABBY_DATASET_FILEPATTERN.
+
+    Returns an ``Output`` wrapping a DataFrame with one row per JSON Lines
+    record; the row count is attached as ``num_files`` metadata.
+    """
+    # glob order is arbitrary; sort so the row order is reproducible.
+    paths = sorted(glob.glob(constants.TABBY_DATASET_FILEPATTERN))
+    df = pd.DataFrame(_read_jsonl_records(paths))
+    return Output(df, metadata={"num_files": len(df)})
diff --git a/python/tabby/constants.py b/python/tabby/constants.py
new file mode 100644
index 0000000..2451097
--- /dev/null
+++ b/python/tabby/constants.py
@@ -0,0 +1,5 @@
+import os
+
+TABBY_ROOT = os.environ.get("TABBY_ROOT", os.path.expanduser("~/.tabby"))
+
+TABBY_DATASET_FILEPATTERN = os.path.join(TABBY_ROOT, "dataset/*.jsonl")
diff --git a/python/tabby_tests/__init__.py b/python/tabby_tests/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/python/tabby_tests/__init__.py
@@ -0,0 +1 @@
+
diff --git a/python/tabby_tests/test_assets.py b/python/tabby_tests/test_assets.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/python/tabby_tests/test_assets.py
@@ -0,0 +1 @@
+