From 1038bb39a1a23d592cc066b7f1b2607c20528f94 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Sat, 25 Mar 2023 00:05:47 +0800 Subject: [PATCH] Add dagu for data processing job orchestration (#7) * Install dagu * Move dagu install to first stage * Fix metrics * Add DAGs for creating a dataset from a code repository --- .dockerignore | 1 + Makefile | 2 +- README.md | 1 - deployment/Dockerfile | 8 +++++ deployment/config/dags/train.yaml | 20 +++++++++++ deployment/docker-compose.dev.yml | 6 ++++ deployment/docker-compose.yml | 9 +++++ scripts/download_project.py | 35 +++++++++++++++++++ .../huggingface_gptneox_convert.py | 0 {tasks => scripts}/preprocess/args.py | 0 {tasks => scripts}/preprocess/filters.py | 0 {tasks => scripts}/preprocess/metrics.py | 12 +++++-- .../preprocess/preprocess_project.py | 6 +++- ...gramming-languages-to-file-extensions.json | 0 tasks/README.md | 5 --- 15 files changed, 94 insertions(+), 11 deletions(-) create mode 100644 deployment/config/dags/train.yaml create mode 100644 scripts/download_project.py rename {tasks/converter => scripts}/huggingface_gptneox_convert.py (100%) rename {tasks => scripts}/preprocess/args.py (100%) rename {tasks => scripts}/preprocess/filters.py (100%) rename {tasks => scripts}/preprocess/metrics.py (64%) rename {tasks => scripts}/preprocess/preprocess_project.py (90%) rename {tasks => scripts}/preprocess/programming-languages-to-file-extensions.json (100%) delete mode 100644 tasks/README.md diff --git a/.dockerignore b/.dockerignore index 8a66279..d0b1763 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,5 @@ testdata +deployment **/.git **/__pycache__ diff --git a/Makefile b/Makefile index 1b1968b..d849eb2 100644 --- a/Makefile +++ b/Makefile @@ -15,7 +15,7 @@ $(PRE_COMMIT_HOOK): poetry run pre-commit install --install-hooks $(LOCAL_MODEL): - poetry run python tasks/converter/huggingface_gptneox_convert.py \ + poetry run python scripts/huggingface_gptneox_convert.py \ -in_file EleutherAI/pythia-70m-deduped \ 
-o $@ \ -i_g 1 -m_n tiny-70M -p 1 -w fp16 diff --git a/README.md b/README.md index 9869647..d5a083b 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,6 @@ An opensource / on-prem alternative to GitHub Copilot * [`admin`](./admin): Admin panel for monitoring / settings purpose. * [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality). * [`deployment`](./deployment): Container related deployment configs. -* [`tasks`](./tasks): Various data processing scripts. ## Development diff --git a/deployment/Dockerfile b/deployment/Dockerfile index 08b06d2..c0412b0 100644 --- a/deployment/Dockerfile +++ b/deployment/Dockerfile @@ -2,6 +2,14 @@ FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime +# Install dagu (https://github.com/yohamta/dagu) +RUN <