Add dagu for data processing job orchestration (#7)
* Install dagu * Move dagu install to first stage * Fix metrics * Add DAGs to create dataset from code repository
parent
c476f5f48f
commit
1038bb39a1
|
|
@ -1,4 +1,5 @@
|
||||||
testdata
|
testdata
|
||||||
|
deployment
|
||||||
|
|
||||||
**/.git
|
**/.git
|
||||||
**/__pycache__
|
**/__pycache__
|
||||||
|
|
|
||||||
2
Makefile
2
Makefile
|
|
@ -15,7 +15,7 @@ $(PRE_COMMIT_HOOK):
|
||||||
poetry run pre-commit install --install-hooks
|
poetry run pre-commit install --install-hooks
|
||||||
|
|
||||||
$(LOCAL_MODEL):
|
$(LOCAL_MODEL):
|
||||||
poetry run python tasks/converter/huggingface_gptneox_convert.py \
|
poetry run python scripts/huggingface_gptneox_convert.py \
|
||||||
-in_file EleutherAI/pythia-70m-deduped \
|
-in_file EleutherAI/pythia-70m-deduped \
|
||||||
-o $@ \
|
-o $@ \
|
||||||
-i_g 1 -m_n tiny-70M -p 1 -w fp16
|
-i_g 1 -m_n tiny-70M -p 1 -w fp16
|
||||||
|
|
|
||||||
|
|
@ -12,7 +12,6 @@ An opensource / on-prem alternative to GitHub Copilot
|
||||||
* [`admin`](./admin): Admin panel for monitoring / settings purpose.
|
* [`admin`](./admin): Admin panel for monitoring / settings purpose.
|
||||||
* [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality).
|
* [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality).
|
||||||
* [`deployment`](./deployment): Container related deployment configs.
|
* [`deployment`](./deployment): Container related deployment configs.
|
||||||
* [`tasks`](./tasks): Various data processing scripts.
|
|
||||||
|
|
||||||
## Development
|
## Development
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -2,6 +2,14 @@
|
||||||
|
|
||||||
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
|
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
|
||||||
|
|
||||||
|
# Install dagu (https://github.com/yohamta/dagu)
|
||||||
|
RUN <<EOF
|
||||||
|
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
|
||||||
|
tar zxvf dagu.tar.gz
|
||||||
|
mv dagu /usr/local/bin
|
||||||
|
rm dagu.tar.gz LICENSE.md README.md
|
||||||
|
EOF
|
||||||
|
|
||||||
ARG PYPI_INDEX_URL=https://pypi.org/simple
|
ARG PYPI_INDEX_URL=https://pypi.org/simple
|
||||||
ARG POETRY_VERSION=1.4.0
|
ARG POETRY_VERSION=1.4.0
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,20 @@
|
||||||
|
# dagu DAG: nightly pipeline that downloads a source-code release tarball
# and turns it into a training dataset.
schedule: "5 4 * * *" # Run at 04:05.

env:
  # Root of the application checkout inside the container (mounted at /app).
  - APP_DIR: /app

steps:
  # Scratch directory shared by the following steps via the JOB_DIR output var.
  - name: Create job dir
    command: mktemp -d /tmp/job.XXXXX
    output: JOB_DIR

  # Download and extract the project tarball into $JOB_DIR/repository.
  - name: Fetch source code
    dir: $APP_DIR
    command: python scripts/download_project.py --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir $JOB_DIR/repository
    depends:
      - Create job dir

  # Preprocess the extracted files into a dataset under $JOB_DIR/dataset.
  # NOTE(review): uses ${APP_DIR} while the previous step uses $APP_DIR —
  # both resolve the same way, but the style is inconsistent.
  - name: Generate dataset
    dir: ${APP_DIR}
    command: python scripts/preprocess/preprocess_project.py --project_dir $JOB_DIR/repository --output_dir $JOB_DIR/dataset
    depends:
      - Fetch source code
|
||||||
|
|
@ -19,3 +19,9 @@ services:
|
||||||
vector:
|
vector:
|
||||||
environment:
|
environment:
|
||||||
- VECTOR_WATCH_CONFIG=true
|
- VECTOR_WATCH_CONFIG=true
|
||||||
|
|
||||||
|
dagu:
|
||||||
|
ports:
|
||||||
|
- 8080:8080
|
||||||
|
volumes:
|
||||||
|
- ../:/app
|
||||||
|
|
|
||||||
|
|
@ -34,3 +34,12 @@ services:
|
||||||
volumes:
|
volumes:
|
||||||
- ./config/vector.toml:/etc/vector/vector.toml:ro
|
- ./config/vector.toml:/etc/vector/vector.toml:ro
|
||||||
- ./data/logs:/logs
|
- ./data/logs:/logs
|
||||||
|
|
||||||
|
dagu:
|
||||||
|
image: tabbyml/tabby
|
||||||
|
container_name: tabby-dagu
|
||||||
|
command: dagu server --host 0.0.0.0 --port 8080
|
||||||
|
ports:
|
||||||
|
- 8080:8080
|
||||||
|
volumes:
|
||||||
|
- ./config/dags:/root/.dagu/dags
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
import tarfile
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
import requests
|
||||||
|
from transformers import HfArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass(kw_only=True)  # kw_only requires Python 3.10+
class DownloaderArgs:
    """CLI arguments for the source-code downloader.

    Parsed by transformers.HfArgumentParser, which renders each field's
    metadata "help" string as argparse help text.
    """

    # URL of the tar archive to download (e.g. a project release tarball).
    url: str = field(metadata={"help": "URL to source code tar.gz file"})
    # Directory the archive contents are extracted into.
    output_dir: str = field(metadata={"help": "Output save path directory"})
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse command-line flags derived from the DownloaderArgs dataclass."""
    return HfArgumentParser(DownloaderArgs).parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def download_and_untar(url, output_dir):
    """Download a tar archive from *url* and extract it into *output_dir*.

    The archive is streamed straight from the HTTP response into tarfile
    without touching disk. Compression is chosen from the URL suffix
    (.gz / .xz / .bz2); any other suffix falls back to tarfile's
    autodetecting stream mode.

    Raises:
        requests.HTTPError: on a non-2xx response.
        tarfile.TarError: if the payload is not a valid tar archive.
    """
    # tarfile's "|" stream modes are required because response.raw is a
    # non-seekable stream; the original default "r" would try to seek.
    # "r|*" autodetects compression for plain .tar and anything unlisted.
    mode = "r|*"
    for suffix, stream_mode in ((".gz", "r|gz"), (".xz", "r|xz"), (".bz2", "r|bz2")):
        if url.endswith(suffix):
            mode = stream_mode
            break

    # Context managers ensure the connection and the tar handle are closed
    # even if extraction fails partway through.
    with requests.get(url, stream=True) as response:
        # Fail fast instead of feeding an HTML error page into tarfile.
        response.raise_for_status()
        with tarfile.open(fileobj=response.raw, mode=mode) as tar:
            # NOTE(review): extractall on a downloaded archive is vulnerable
            # to path traversal ("tar slip"); on Python 3.12+ pass
            # filter="data", or validate member names before extracting.
            tar.extractall(output_dir)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: download the archive given by --url and
    # extract it into --output_dir.
    args = parse_args()
    download_and_untar(args.url, args.output_dir)
|
||||||
|
|
@ -1,16 +1,22 @@
|
||||||
def max_line_length(content):
    """Return the length of the longest line in *content* (0 for empty text)."""
    return max(map(len, content.splitlines()), default=0)
|
||||||
|
|
||||||
|
|
||||||
def avg_line_length(content):
    """Return the mean line length of *content*; 0 when there are no lines."""
    lengths = [len(line) for line in content.splitlines()]
    if not lengths:
        return 0
    return sum(lengths) / len(lengths)
|
||||||
|
|
||||||
|
|
||||||
def alphanum_fraction(content):
    """Return the fraction of characters that are alphabetic or numeric.

    Empty input yields 0 rather than dividing by zero.
    """
    if not content:
        return 0
    alphanum_count = sum(1 for ch in content if ch.isalpha() or ch.isnumeric())
    return alphanum_count / len(content)
|
||||||
|
|
||||||
|
|
||||||
def compute(content):
|
def compute(content):
|
||||||
|
|
@ -45,7 +45,11 @@ def dataset_iter(files):
|
||||||
_, extname = os.path.splitext(x)
|
_, extname = os.path.splitext(x)
|
||||||
|
|
||||||
with open(x) as f:
|
with open(x) as f:
|
||||||
content = f.read()
|
try:
|
||||||
|
content = f.read()
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
print("Cannot decode unicode", x)
|
||||||
|
continue
|
||||||
|
|
||||||
yield dict(
|
yield dict(
|
||||||
language=mappings[extname],
|
language=mappings[extname],
|
||||||
|
|
@ -1,5 +0,0 @@
|
||||||
## Tasks
|
|
||||||
|
|
||||||
* [`converter`](./converter): Converts a [transformers](https://huggingface.co/docs/transformers) causal LM model into TensorRT / FasterTransformer serving formats.
|
|
||||||
* [`preprocess`](./preprocess): Preprocess files into [datasets](https://huggingface.co/docs/datasets)
|
|
||||||
* [`tabformer`](./tabformer): *NOT RELEASED* Trainer(PEFT w/RLHF) for tabby models.
|
|
||||||
Loading…
Reference in New Issue