temp init

add-dagster-data-pipeline
Meng Zhang 2023-10-13 18:59:43 -07:00
parent 99d1bf34bb
commit d80e675211
10 changed files with 90 additions and 0 deletions

3
python/.gitignore vendored Normal file
View File

@@ -0,0 +1,3 @@
tabby.egg-info/
__pycache__
tmp*/

13
python/README.md Normal file
View File

@@ -0,0 +1,13 @@
# tabby
## Setup Development Environment
```bash
pip install -e ".[dev]"
```
Then, start the Dagster UI web server:
```bash
dagster dev
```

6
python/pyproject.toml Normal file
View File

@@ -0,0 +1,6 @@
[build-system]
requires = ["setuptools"]
build-backend = "setuptools.build_meta"
[tool.dagster]
module_name = "tabby"

2
python/setup.cfg Normal file
View File

@@ -0,0 +1,2 @@
[metadata]
name = tabby

12
python/setup.py Normal file
View File

@@ -0,0 +1,12 @@
from setuptools import find_packages, setup
setup(
name="tabby",
packages=find_packages(exclude=["tabby_tests"]),
install_requires=[
"dagster",
"dagster-cloud",
"dagster-pandas"
],
extras_require={"dev": ["dagster-webserver", "pytest"]},
)

9
python/tabby/__init__.py Normal file
View File

@@ -0,0 +1,9 @@
from dagster import Definitions, load_assets_from_modules
from . import assets
all_assets = load_assets_from_modules([assets])
defs = Definitions(
assets=all_assets,
)

38
python/tabby/assets.py Normal file
View File

@@ -0,0 +1,38 @@
from dagster import Out, Output, MetadataValue, asset
from dagster_pandas import DataFrame, PandasColumn, create_dagster_pandas_dataframe_type
import pandas as pd
import json
import glob
from . import constants
DatasetDataFrame = create_dagster_pandas_dataframe_type(
name="DatasetDataFrame",
columns = [
PandasColumn.string_column("git_url"),
PandasColumn.string_column("filepath"),
PandasColumn.string_column("content"),
PandasColumn.string_column("language"),
PandasColumn.integer_column("max_line_length"),
PandasColumn.float_column("avg_line_length"),
PandasColumn.float_column("alphanum_fraction"),
PandasColumn.exists("tags"),
]
)
@asset(dagster_type=DatasetDataFrame)
def dataset_files():
"""Get source code information from TABBY_ROOT"""
ds = []
for path in glob.glob(constants.TABBY_DATASET_FILEPATTERN):
with open(path, "r") as f:
for line in f.readlines():
ds.append(json.loads(line))
df = pd.DataFrame(ds)
return Output(df, metadata={"num_files": len(df) })

View File

@@ -0,0 +1,5 @@
import os
TABBY_ROOT = os.environ.get("TABBY_ROOT", os.path.expanduser("~/.tabby"))
TABBY_DATASET_FILEPATTERN = os.path.join(TABBY_ROOT, "dataset/*.jsonl")

View File

@@ -0,0 +1 @@

View File

@@ -0,0 +1 @@