Add dagu for data processing job orchestration (#7)

* Install dagu

* Move dagu install to first stage

* Fix metrics

* Add DAGs for create dataset from code repository
add-more-languages
Meng Zhang 2023-03-25 00:05:47 +08:00 committed by GitHub
parent c476f5f48f
commit 1038bb39a1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
15 changed files with 94 additions and 11 deletions

View File

@ -1,4 +1,5 @@
testdata testdata
deployment
**/.git **/.git
**/__pycache__ **/__pycache__

View File

@ -15,7 +15,7 @@ $(PRE_COMMIT_HOOK):
poetry run pre-commit install --install-hooks poetry run pre-commit install --install-hooks
$(LOCAL_MODEL): $(LOCAL_MODEL):
poetry run python tasks/converter/huggingface_gptneox_convert.py \ poetry run python scripts/huggingface_gptneox_convert.py \
-in_file EleutherAI/pythia-70m-deduped \ -in_file EleutherAI/pythia-70m-deduped \
-o $@ \ -o $@ \
-i_g 1 -m_n tiny-70M -p 1 -w fp16 -i_g 1 -m_n tiny-70M -p 1 -w fp16

View File

@ -12,7 +12,6 @@ An opensource / on-prem alternative to GitHub Copilot
* [`admin`](./admin): Admin panel for monitoring / settings purpose. * [`admin`](./admin): Admin panel for monitoring / settings purpose.
* [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality). * [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality).
* [`deployment`](./deployment): Container related deployment configs. * [`deployment`](./deployment): Container related deployment configs.
* [`tasks`](./tasks): Various data processing scripts.
## Development ## Development

View File

@ -2,6 +2,14 @@
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
# Install dagu (https://github.com/yohamta/dagu)
# NOTE(review): version pinned to v1.10.2, binary is Linux x86_64 — confirm
# this matches the base image architecture if the base image ever changes.
RUN <<EOF
# Download with the Python stdlib — curl/wget are not guaranteed in the base image.
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
tar zxvf dagu.tar.gz
mv dagu /usr/local/bin
# Remove the archive and bundled docs so they don't bloat the image layer.
rm dagu.tar.gz LICENSE.md README.md
EOF
ARG PYPI_INDEX_URL=https://pypi.org/simple ARG PYPI_INDEX_URL=https://pypi.org/simple
ARG POETRY_VERSION=1.4.0 ARG POETRY_VERSION=1.4.0

View File

@ -0,0 +1,20 @@
# Daily job: fetch a project tarball and preprocess it into a dataset.
schedule: "5 4 * * *" # Run at 04:05.
env:
  - APP_DIR: /app
steps:
  - name: Create job dir
    # Scratch workspace for this run; the path is captured into JOB_DIR.
    command: mktemp -d /tmp/job.XXXXX
    output: JOB_DIR
  - name: Fetch source code
    # Use ${VAR} form consistently (the original mixed $APP_DIR and ${APP_DIR}).
    dir: ${APP_DIR}
    command: python scripts/download_project.py --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir ${JOB_DIR}/repository
    depends:
      - Create job dir
  - name: Generate dataset
    dir: ${APP_DIR}
    command: python scripts/preprocess/preprocess_project.py --project_dir ${JOB_DIR}/repository --output_dir ${JOB_DIR}/dataset
    depends:
      - Fetch source code

View File

@ -19,3 +19,9 @@ services:
vector: vector:
environment: environment:
- VECTOR_WATCH_CONFIG=true - VECTOR_WATCH_CONFIG=true
# Dev override for the dagu service: expose the web UI and mount the
# repository root into the container at /app.
dagu:
ports:
- 8080:8080
volumes:
# NOTE(review): mounts the whole repo read-write into the container — confirm
# that is intended (DAG steps write job output under /tmp, not /app).
- ../:/app

View File

@ -34,3 +34,12 @@ services:
volumes: volumes:
- ./config/vector.toml:/etc/vector/vector.toml:ro - ./config/vector.toml:/etc/vector/vector.toml:ro
- ./data/logs:/logs - ./data/logs:/logs
# dagu scheduler/web server, reusing the tabby image (which bundles the
# dagu binary).
dagu:
image: tabbyml/tabby
container_name: tabby-dagu
# Serve the dagu web UI on all interfaces inside the container.
command: dagu server --host 0.0.0.0 --port 8080
ports:
- 8080:8080
volumes:
# DAG definitions mounted where dagu looks for them — presumably
# ~/.dagu/dags is the default dags dir; TODO confirm against dagu docs.
- ./config/dags:/root/.dagu/dags

View File

@ -0,0 +1,35 @@
import tarfile
from dataclasses import dataclass, field
from typing import Optional

import requests
from transformers import HfArgumentParser


@dataclass(kw_only=True)
class DownloaderArgs:
    """Command-line arguments for the project downloader."""

    url: str = field(metadata={"help": "URL to source code tar.gz file"})
    output_dir: str = field(metadata={"help": "Output save path directory"})


def parse_args():
    """Parse DownloaderArgs from sys.argv via HfArgumentParser."""
    parser = HfArgumentParser(DownloaderArgs)
    return parser.parse_args()


def download_and_untar(url, output_dir):
    """Stream the tarball at `url` and extract it under `output_dir`.

    Compression (gz / bz2 / xz / none) is detected transparently, so the
    URL no longer needs a recognized suffix.
    """
    response = requests.get(url, stream=True)
    # Fail fast on HTTP errors instead of trying to untar an error page.
    response.raise_for_status()
    # "r|*" is tarfile's non-seekable stream mode with transparent
    # compression detection. The previous suffix-sniffing fell back to the
    # seekable "r" mode for unknown suffixes, which raises on the raw
    # (non-seekable) HTTP response stream.
    with tarfile.open(fileobj=response.raw, mode="r|*") as file:
        # NOTE(review): extractall trusts member paths from the archive; on
        # Python 3.12+ pass filter="data" to block path traversal — TODO
        # confirm the runtime Python version.
        file.extractall(output_dir)


if __name__ == "__main__":
    args = parse_args()
    download_and_untar(args.url, args.output_dir)

View File

@ -1,16 +1,22 @@
def max_line_length(content): def max_line_length(content):
return max([len(x) for x in content.splitlines()]) return max([0] + [len(x) for x in content.splitlines()])
def avg_line_length(content): def avg_line_length(content):
lines = [len(x) for x in content.splitlines()] lines = [len(x) for x in content.splitlines()]
total = sum(lines) total = sum(lines)
return total / len(lines) if len(lines) != 0:
return total / len(lines)
else:
return 0
def alphanum_fraction(content): def alphanum_fraction(content):
alphanum = [x for x in content if x.isalpha() or x.isnumeric()] alphanum = [x for x in content if x.isalpha() or x.isnumeric()]
return len(alphanum) / len(content) if len(content) != 0:
return len(alphanum) / len(content)
else:
return 0
def compute(content): def compute(content):

View File

@ -45,7 +45,11 @@ def dataset_iter(files):
_, extname = os.path.splitext(x) _, extname = os.path.splitext(x)
with open(x) as f: with open(x) as f:
content = f.read() try:
content = f.read()
except UnicodeDecodeError:
print("Cannot decode unicode", x)
continue
yield dict( yield dict(
language=mappings[extname], language=mappings[extname],

View File

@ -1,5 +0,0 @@
## Tasks
* [`converter`](./converter): Converts a [transformers](https://huggingface.co/docs/transformers) causal LM model into TensorRT / FasterTransformer serving formats.
* [`preprocess`](./preprocess): Preprocess files into [datasets](https://huggingface.co/docs/datasets)
* [`tabformer`](./tabformer): *NOT RELEASED* Trainer(PEFT w/RLHF) for tabby models.