diff --git a/Dockerfile b/Dockerfile index c0412b0..460b4f8 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,9 +2,15 @@ FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime -# Install dagu (https://github.com/yohamta/dagu) +# Install utilities RUN < dagu.tar.gz tar zxvf dagu.tar.gz mv dagu /usr/local/bin rm dagu.tar.gz LICENSE.md README.md diff --git a/deployment/config/dags/trainer.yaml b/deployment/config/dags/trainer.yaml deleted file mode 100644 index a699f78..0000000 --- a/deployment/config/dags/trainer.yaml +++ /dev/null @@ -1,20 +0,0 @@ -schedule: "5 4 * * *" # Run at 04:05. - -env: - - APP_DIR: /app -steps: - - name: Create job dir - command: mktemp -d /tmp/job.XXXXX - output: JOB_DIR - - - name: Fetch source code - dir: $APP_DIR - command: python -m tabby.tools.download_project --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir $JOB_DIR/repository - depends: - - Create job dir - - - name: Generate dataset - dir: $APP_DIR - command: python -m tabby.tools.preprocess.preprocess_project --project_dir $JOB_DIR/repository --output_dir $JOB_DIR/dataset - depends: - - Fetch source code diff --git a/deployment/config/repository.toml b/deployment/config/repository.toml new file mode 100644 index 0000000..54f124a --- /dev/null +++ b/deployment/config/repository.toml @@ -0,0 +1,5 @@ +[repositories.quickjs] +url = "https://gitee.com/vsf-linux/quickjs.git" + +[repositories.transformers] +url = "https://gitee.com/angzhao/transformers.git" diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml index 8ebf0f7..9d849c9 100644 --- a/deployment/docker-compose.yml +++ b/deployment/docker-compose.yml @@ -7,11 +7,11 @@ services: command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000 environment: - PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped - - EVENTS_LOG_DIR=/logs/tabby-server + - EVENTS_LOG_DIR=/logs ports: - "5000:5000" volumes: - - ./data/logs:/logs + - ./data/logs/tabby-server:/logs - 
./data/hf_cache:/root/.cache/huggingface admin: @@ -35,7 +35,9 @@ services: image: tabbyml/tabby container_name: tabby-dagu command: dagu server --host 0.0.0.0 --port 8080 - ports: - - 8080:8080 volumes: - - ./config/dags:/root/.dagu/dags + - ./config:/config:ro + - ./data/repositories:/repositories + - ./data/dataset:/dataset + environment: + DAGU_DAGS: tabby/tasks diff --git a/poetry.lock b/poetry.lock index 8ec614f..a9ee434 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1013,6 +1013,22 @@ files = [ [package.dependencies] gitdb = ">=4.0.1,<5" +[[package]] +name = "gitup" +version = "0.5.1" +description = "Easily update multiple git repositories at once" +category = "main" +optional = false +python-versions = "*" +files = [ + {file = "gitup-0.5.1-py3-none-any.whl", hash = "sha256:ee5b0314062edbccee900b851c030233378613256c141422a22a831c6201d8f8"}, + {file = "gitup-0.5.1.tar.gz", hash = "sha256:4f787079cd65d8f60c5842181204635e1b72d3533ae91f0c619624c6b20846dd"}, +] + +[package.dependencies] +colorama = ">=0.3.9" +GitPython = ">=2.1.8" + [[package]] name = "greenlet" version = "2.0.2" @@ -3396,4 +3412,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"] [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "0f2c2fbfd49c0a69dccf125d3ab96a00115b844753c2063b68d936bc2e8e39d1" +content-hash = "c4e4670acf2dd356d6e625d590ec109eee7081c065fc54ce97e9f69bdcb21844" diff --git a/pyproject.toml b/pyproject.toml index 5e98fc8..ff2ffbb 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -15,6 +15,9 @@ uvicorn = "^0.21.1" tritonclient = {extras = ["all"], version = "^2.31.0"} streamlit = "^1.20.0" loguru = "^0.6.0" +gitup = "^0.5.1" +toml = "^0.10.2" +gitpython = "^3.1.31" [tool.poetry.group.dev.dependencies] diff --git a/tabby/tasks/trainer.yaml b/tabby/tasks/trainer.yaml new file mode 100644 index 0000000..b28f4a9 --- /dev/null +++ b/tabby/tasks/trainer.yaml @@ -0,0 +1,15 @@ +schedule: "5 4 * * *" # Run at 04:05. 
+ +env: + - PATH: "/opt/conda/bin:$PATH" + - APP_DIR: /app +steps: + - name: Update repositories + dir: $APP_DIR + command: python -m tabby.tools.repository.updater --data_dir=/repositories --config_file=/config/repository.toml + + - name: Generate dataset + dir: $APP_DIR + command: python -m tabby.tools.preprocess.build_dataset --project_dir /repositories --output_dir /dataset + depends: + - Update repositories diff --git a/tabby/tools/download_project.py b/tabby/tools/download_project.py deleted file mode 100644 index 02d5c5b..0000000 --- a/tabby/tools/download_project.py +++ /dev/null @@ -1,35 +0,0 @@ -import tarfile -from dataclasses import dataclass, field -from typing import Optional - -import requests -from transformers import HfArgumentParser - - -@dataclass -class DownloaderArgs: - url: str = field(metadata={"help": "URL to source code tar.gz file"}) - output_dir: str = field(metadata={"help": "Output save path directory"}) - - -def parse_args(): - parser = HfArgumentParser(DownloaderArgs) - return parser.parse_args() - - -def download_and_untar(url, output_dir): - response = requests.get(url, stream=True) - mode = "r" - if url.endswith(".gz"): - mode += "|gz" - elif url.endswith(".xz"): - mode += "|xz" - elif url.endswith(".bz2"): - mode += "|bz2" - file = tarfile.open(fileobj=response.raw, mode=mode) - file.extractall(output_dir) - - -if __name__ == "__main__": - args = parse_args() - download_and_untar(args.url, args.output_dir) diff --git a/tabby/tools/preprocess/preprocess_project.py b/tabby/tools/preprocess/build_dataset.py similarity index 98% rename from tabby/tools/preprocess/preprocess_project.py rename to tabby/tools/preprocess/build_dataset.py index b489faa..e9a52a2 100644 --- a/tabby/tools/preprocess/preprocess_project.py +++ b/tabby/tools/preprocess/build_dataset.py @@ -42,6 +42,7 @@ def dataset_iter(files): def gen(): mappings = read_extension_to_language_mappings() for x in files: + print("Processing", x) _, extname = os.path.splitext(x) 
with open(x) as f:
diff --git a/tabby/tools/repository/Makefile b/tabby/tools/repository/Makefile
new file mode 100644
index 0000000..94d910e
--- /dev/null
+++ b/tabby/tools/repository/Makefile
@@ -0,0 +1,11 @@
+all:
+
+CMD := poetry run python updater.py --data_dir=./testdata/repositories --config_file=testdata/config.toml
+
+test:
+	# Cleanup
+	rm -rf testdata/repositories/quickjs
+	# Initialize
+	$(CMD)
+	# Updating
+	$(CMD)
diff --git a/tabby/tools/repository/testdata/config.toml b/tabby/tools/repository/testdata/config.toml
new file mode 100644
index 0000000..dc681fe
--- /dev/null
+++ b/tabby/tools/repository/testdata/config.toml
@@ -0,0 +1,4 @@
+data_dir = "/repositories"
+
+[repositories.quickjs]
+url = "https://gitee.com/vsf-linux/quickjs.git"
diff --git a/tabby/tools/repository/testdata/repositories/.gitignore b/tabby/tools/repository/testdata/repositories/.gitignore
new file mode 100644
index 0000000..0d17186
--- /dev/null
+++ b/tabby/tools/repository/testdata/repositories/.gitignore
@@ -0,0 +1 @@
+quickjs
diff --git a/tabby/tools/repository/testdata/repositories/.gitkeep b/tabby/tools/repository/testdata/repositories/.gitkeep
new file mode 100644
index 0000000..e69de29
diff --git a/tabby/tools/repository/updater.py b/tabby/tools/repository/updater.py
new file mode 100644
index 0000000..dcb1fb2
--- /dev/null
+++ b/tabby/tools/repository/updater.py
@@ -0,0 +1,43 @@
+import os
+import pathlib
+import subprocess
+from dataclasses import dataclass, field
+
+import toml
+from git import Repo
+from transformers import HfArgumentParser
+
+
+@dataclass
+class Arguments:
+    """Command-line options for the repository updater."""
+
+    data_dir: str = field(metadata={"help": "Base dir for repositories"})
+    config_file: str = field(metadata={"help": "Configuration file for tabby updater"})
+
+
+def parse_args():
+    """Parse the command line using the fields declared on ``Arguments``."""
+    parser = HfArgumentParser(Arguments)
+    return parser.parse_args()
+
+
+if __name__ == "__main__":
+    args = parse_args()
+
+    repositories = toml.load(args.config_file)["repositories"]
+
+    for name, repository in repositories.items():
+        path = pathlib.Path(args.data_dir, name)
+        if path.is_dir():
+            # Opening an existing checkout makes GitPython raise early if the
+            # directory is not a git repository.
+            Repo(path)
+        else:
+            # First run for this entry: shallow-clone it into the data dir.
+            Repo.clone_from(repository["url"], path.absolute(), depth=1)
+
+    # Run gitup with an argument list instead of a shell string so that
+    # data_dir is passed verbatim (spaces/metacharacters are not interpreted),
+    # and fail the job when gitup exits non-zero.
+    subprocess.run(["gitup", args.data_dir], check=True)