Add dagu for data processing job orchestration (#7)
* Install dagu * Move dagu install to first stage * Fix metrics * Add DAGs for creating a dataset from a code repository
parent
c476f5f48f
commit
1038bb39a1
|
|
@ -1,4 +1,5 @@
|
|||
testdata
|
||||
deployment
|
||||
|
||||
**/.git
|
||||
**/__pycache__
|
||||
|
|
|
|||
2
Makefile
2
Makefile
|
|
@ -15,7 +15,7 @@ $(PRE_COMMIT_HOOK):
|
|||
poetry run pre-commit install --install-hooks
|
||||
|
||||
$(LOCAL_MODEL):
|
||||
poetry run python tasks/converter/huggingface_gptneox_convert.py \
|
||||
poetry run python scripts/huggingface_gptneox_convert.py \
|
||||
-in_file EleutherAI/pythia-70m-deduped \
|
||||
-o $@ \
|
||||
-i_g 1 -m_n tiny-70M -p 1 -w fp16
|
||||
|
|
|
|||
|
|
@ -12,7 +12,6 @@ An opensource / on-prem alternative to GitHub Copilot
|
|||
* [`admin`](./admin): Admin panel for monitoring / settings purpose.
|
||||
* [`server`](./server): API server for completion requests. It also logs users' selections (as feedback to model's quality).
|
||||
* [`deployment`](./deployment): Container related deployment configs.
|
||||
* [`tasks`](./tasks): Various data processing scripts.
|
||||
|
||||
## Development
|
||||
|
||||
|
|
|
|||
|
|
@ -2,6 +2,14 @@
|
|||
|
||||
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
|
||||
|
||||
# Install dagu (https://github.com/yohamta/dagu)
|
||||
RUN <<EOF
|
||||
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
|
||||
tar zxvf dagu.tar.gz
|
||||
mv dagu /usr/local/bin
|
||||
rm dagu.tar.gz LICENSE.md README.md
|
||||
EOF
|
||||
|
||||
ARG PYPI_INDEX_URL=https://pypi.org/simple
|
||||
ARG POETRY_VERSION=1.4.0
|
||||
|
||||
|
|
|
|||
|
|
@ -0,0 +1,20 @@
|
|||
schedule: "5 4 * * *" # Run at 04:05.
|
||||
|
||||
env:
|
||||
- APP_DIR: /app
|
||||
steps:
|
||||
- name: Create job dir
|
||||
command: mktemp -d /tmp/job.XXXXX
|
||||
output: JOB_DIR
|
||||
|
||||
- name: Fetch source code
|
||||
dir: $APP_DIR
|
||||
command: python scripts/download_project.py --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir $JOB_DIR/repository
|
||||
depends:
|
||||
- Create job dir
|
||||
|
||||
- name: Generate dataset
|
||||
dir: ${APP_DIR}
|
||||
command: python scripts/preprocess/preprocess_project.py --project_dir $JOB_DIR/repository --output_dir $JOB_DIR/dataset
|
||||
depends:
|
||||
- Fetch source code
|
||||
|
|
@ -19,3 +19,9 @@ services:
|
|||
vector:
|
||||
environment:
|
||||
- VECTOR_WATCH_CONFIG=true
|
||||
|
||||
dagu:
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ../:/app
|
||||
|
|
|
|||
|
|
@ -34,3 +34,12 @@ services:
|
|||
volumes:
|
||||
- ./config/vector.toml:/etc/vector/vector.toml:ro
|
||||
- ./data/logs:/logs
|
||||
|
||||
dagu:
|
||||
image: tabbyml/tabby
|
||||
container_name: tabby-dagu
|
||||
command: dagu server --host 0.0.0.0 --port 8080
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ./config/dags:/root/.dagu/dags
|
||||
|
|
|
|||
|
|
@ -0,0 +1,35 @@
|
|||
import tarfile
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
|
||||
@dataclass(kw_only=True)  # kw_only requires Python 3.10+
class DownloaderArgs:
    """Command-line arguments for the source-archive downloader script.

    Field metadata "help" strings are surfaced by HfArgumentParser as
    CLI flag descriptions (--url, --output_dir).
    """

    # URL of the tar archive to fetch (e.g. a .tar.gz release tarball).
    url: str = field(metadata={"help": "URL to source code tar.gz file"})
    # Directory the archive contents are extracted into.
    output_dir: str = field(metadata={"help": "Output save path directory"})
|
||||
|
||||
|
||||
def parse_args():
    """Build an HfArgumentParser over DownloaderArgs and parse sys.argv."""
    return HfArgumentParser(DownloaderArgs).parse_args()
|
||||
|
||||
|
||||
def download_and_untar(url, output_dir):
    """Stream a tar archive from *url* and extract it into *output_dir*.

    The compression mode is inferred from the URL suffix
    (.gz / .xz / .bz2); any other suffix is treated as an
    uncompressed tar stream.

    Raises:
        requests.HTTPError: if the server returns an error status.
        tarfile.TarError: if the payload is not a valid tar archive.
    """
    response = requests.get(url, stream=True)
    # Fail loudly on HTTP errors instead of feeding an error page to tarfile.
    response.raise_for_status()
    # response.raw is a non-seekable stream, so a streaming mode ("r|...")
    # is required. The previous fallback of plain "r" needed a seekable
    # file object and broke on uncompressed .tar URLs.
    mode = "r|"
    if url.endswith(".gz"):
        mode += "gz"
    elif url.endswith(".xz"):
        mode += "xz"
    elif url.endswith(".bz2"):
        mode += "bz2"
    # NOTE(review): extractall can write outside output_dir if the archive
    # contains "../" members (path traversal) — acceptable only because the
    # DAGs fetch from trusted mirrors; confirm before pointing at user input.
    with tarfile.open(fileobj=response.raw, mode=mode) as archive:
        archive.extractall(output_dir)
|
||||
|
||||
|
||||
# Script entry point: parse CLI flags, then download the archive named by
# --url and extract it into --output_dir.
if __name__ == "__main__":
    args = parse_args()
    download_and_untar(args.url, args.output_dir)
|
||||
|
|
@ -1,16 +1,22 @@
|
|||
def max_line_length(content):
    """Return the length of the longest line in *content* (0 if empty)."""
    return max((len(line) for line in content.splitlines()), default=0)
|
||||
|
||||
|
||||
def avg_line_length(content):
    """Return the mean line length of *content*, or 0 when there are no lines."""
    lengths = [len(line) for line in content.splitlines()]
    if not lengths:
        return 0
    return sum(lengths) / len(lengths)
|
||||
|
||||
|
||||
def alphanum_fraction(content):
    """Return the fraction of characters in *content* that are alphabetic
    or numeric (0 for an empty string)."""
    if not content:
        return 0
    alphanum_count = sum(1 for ch in content if ch.isalpha() or ch.isnumeric())
    return alphanum_count / len(content)
|
||||
|
||||
|
||||
def compute(content):
|
||||
|
|
@ -45,7 +45,11 @@ def dataset_iter(files):
|
|||
_, extname = os.path.splitext(x)
|
||||
|
||||
with open(x) as f:
|
||||
content = f.read()
|
||||
try:
|
||||
content = f.read()
|
||||
except UnicodeDecodeError:
|
||||
print("Cannot decode unicode", x)
|
||||
continue
|
||||
|
||||
yield dict(
|
||||
language=mappings[extname],
|
||||
|
|
@ -1,5 +0,0 @@
|
|||
## Tasks
|
||||
|
||||
* [`converter`](./converter): Converts a [transformers](https://huggingface.co/docs/transformers) causal LM model into TensorRT / FasterTransformer serving formats.
|
||||
* [`preprocess`](./preprocess): Preprocess files into [datasets](https://huggingface.co/docs/datasets)
|
||||
* [`tabformer`](./tabformer): *NOT RELEASED* Trainer(PEFT w/RLHF) for tabby models.
|
||||
Loading…
Reference in New Issue