Add dagu for data processing job orchestration (#7)

* Install dagu

* Move dagu install to first stage

* Fix metrics

* Add DAGs for create dataset from code repository
add-more-languages
Meng Zhang 2023-03-25 00:05:47 +08:00 committed by GitHub
parent c476f5f48f
commit 1038bb39a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 94 additions and 11 deletions

View File

@ -1,4 +1,5 @@
testdata testdata
deployment
**/.git **/.git
**/__pycache__ **/__pycache__

View File

@ -15,7 +15,7 @@ $(PRE_COMMIT_HOOK):
poetry run pre-commit install --install-hooks poetry run pre-commit install --install-hooks
$(LOCAL_MODEL): $(LOCAL_MODEL):
poetry run python tasks/converter/huggingface_gptneox_convert.py \ poetry run python scripts/huggingface_gptneox_convert.py \
-in_file EleutherAI/pythia-70m-deduped \ -in_file EleutherAI/pythia-70m-deduped \
-o $@ \ -o $@ \
-i_g 1 -m_n tiny-70M -p 1 -w fp16 -i_g 1 -m_n tiny-70M -p 1 -w fp16

View File

@ -12,7 +12,6 @@ An opensource / on-prem alternative to GitHub Copilot
* [`admin`](./admin): Admin panel for monitoring / settings purpose. * [`admin`](./admin): Admin panel for monitoring / settings purpose.
* [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality). * [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality).
* [`deployment`](./deployment): Container related deployment configs. * [`deployment`](./deployment): Container related deployment configs.
* [`tasks`](./tasks): Various data processing scripts.
## Development ## Development

View File

@ -2,6 +2,14 @@
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
# Install dagu (https://github.com/yohamta/dagu)
# NOTE(review): version pinned to v1.10.2, binary is Linux x86_64 — confirm
# this matches the base image architecture if the base image ever changes.
RUN <<EOF
# Download with the Python stdlib — curl/wget are not guaranteed in the base image.
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
tar zxvf dagu.tar.gz
mv dagu /usr/local/bin
# Remove the archive and bundled docs so they don't bloat the image layer.
rm dagu.tar.gz LICENSE.md README.md
EOF
ARG PYPI_INDEX_URL=https://pypi.org/simple ARG PYPI_INDEX_URL=https://pypi.org/simple
ARG POETRY_VERSION=1.4.0 ARG POETRY_VERSION=1.4.0

View File

@ -0,0 +1,20 @@
# Daily job: fetch a project tarball and preprocess it into a dataset.
schedule: "5 4 * * *" # Run at 04:05.
env:
  - APP_DIR: /app
steps:
  - name: Create job dir
    # Scratch workspace for this run; the path is captured into JOB_DIR.
    command: mktemp -d /tmp/job.XXXXX
    output: JOB_DIR
  - name: Fetch source code
    # Use ${VAR} form consistently (the original mixed $APP_DIR and ${APP_DIR}).
    dir: ${APP_DIR}
    command: python scripts/download_project.py --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir ${JOB_DIR}/repository
    depends:
      - Create job dir
  - name: Generate dataset
    dir: ${APP_DIR}
    command: python scripts/preprocess/preprocess_project.py --project_dir ${JOB_DIR}/repository --output_dir ${JOB_DIR}/dataset
    depends:
      - Fetch source code

View File

@ -19,3 +19,9 @@ services:
vector: vector:
environment: environment:
- VECTOR_WATCH_CONFIG=true - VECTOR_WATCH_CONFIG=true
# Dev override for the dagu service: expose the web UI and mount the
# repository root into the container at /app.
dagu:
ports:
- 8080:8080
volumes:
# NOTE(review): mounts the whole repo read-write into the container — confirm
# that is intended (DAG steps write job output under /tmp, not /app).
- ../:/app

View File

@ -34,3 +34,12 @@ services:
volumes: volumes:
- ./config/vector.toml:/etc/vector/vector.toml:ro - ./config/vector.toml:/etc/vector/vector.toml:ro
- ./data/logs:/logs - ./data/logs:/logs
# dagu scheduler/web server, reusing the tabby image (which bundles the
# dagu binary).
dagu:
image: tabbyml/tabby
container_name: tabby-dagu
# Serve the dagu web UI on all interfaces inside the container.
command: dagu server --host 0.0.0.0 --port 8080
ports:
- 8080:8080
volumes:
# DAG definitions mounted where dagu looks for them — presumably
# ~/.dagu/dags is the default dags dir; TODO confirm against dagu docs.
- ./config/dags:/root/.dagu/dags

View File

@ -0,0 +1,35 @@
import tarfile
from dataclasses import dataclass, field
from typing import Optional

import requests
from transformers import HfArgumentParser


@dataclass(kw_only=True)
class DownloaderArgs:
    """Command-line arguments for the project downloader."""

    url: str = field(metadata={"help": "URL to source code tar.gz file"})
    output_dir: str = field(metadata={"help": "Output save path directory"})


def parse_args():
    """Parse DownloaderArgs from sys.argv via HfArgumentParser."""
    parser = HfArgumentParser(DownloaderArgs)
    return parser.parse_args()


def download_and_untar(url, output_dir):
    """Stream the tarball at `url` and extract it under `output_dir`.

    Compression (gz / bz2 / xz / none) is detected transparently, so the
    URL no longer needs a recognized suffix.
    """
    response = requests.get(url, stream=True)
    # Fail fast on HTTP errors instead of trying to untar an error page.
    response.raise_for_status()
    # "r|*" is tarfile's non-seekable stream mode with transparent
    # compression detection. The previous suffix-sniffing fell back to the
    # seekable "r" mode for unknown suffixes, which raises on the raw
    # (non-seekable) HTTP response stream.
    with tarfile.open(fileobj=response.raw, mode="r|*") as file:
        # NOTE(review): extractall trusts member paths from the archive; on
        # Python 3.12+ pass filter="data" to block path traversal — TODO
        # confirm the runtime Python version.
        file.extractall(output_dir)


if __name__ == "__main__":
    args = parse_args()
    download_and_untar(args.url, args.output_dir)

View File

@ -1,16 +1,22 @@
def max_line_length(content): def max_line_length(content):
return max([len(x) for x in content.splitlines()]) return max([0] + [len(x) for x in content.splitlines()])
def avg_line_length(content): def avg_line_length(content):
lines = [len(x) for x in content.splitlines()] lines = [len(x) for x in content.splitlines()]
total = sum(lines) total = sum(lines)
return total / len(lines) if len(lines) != 0:
return total / len(lines)
else:
return 0
def alphanum_fraction(content): def alphanum_fraction(content):
alphanum = [x for x in content if x.isalpha() or x.isnumeric()] alphanum = [x for x in content if x.isalpha() or x.isnumeric()]
return len(alphanum) / len(content) if len(content) != 0:
return len(alphanum) / len(content)
else:
return 0
def compute(content): def compute(content):

View File

@ -45,7 +45,11 @@ def dataset_iter(files):
_, extname = os.path.splitext(x) _, extname = os.path.splitext(x)
with open(x) as f: with open(x) as f:
content = f.read() try:
content = f.read()
except UnicodeDecodeError:
print("Cannot decode unicode", x)
continue
yield dict( yield dict(
language=mappings[extname], language=mappings[extname],

View File

@ -1,5 +0,0 @@
## Tasks
* [`converter`](./converter): Converts a [transformers](https://huggingface.co/docs/transformers) causal LM model into TensorRT / FasterTransformer serving formats.
* [`preprocess`](./preprocess): Preprocess files into [datasets](https://huggingface.co/docs/datasets)
* [`tabformer`](./tabformer): *NOT RELEASED* Trainer(PEFT w/RLHF) for tabby models.