Add tabby.tools.repository.updater for easier git repository synchronization. (#9)

* Move dags to tabby.tasks

* Add repository syncer

* Follow redirect for curl
add-more-languages
Meng Zhang 2023-03-25 14:44:46 +08:00 committed by GitHub
parent 8cf533016a
commit 8144e4f83a
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
14 changed files with 106 additions and 63 deletions

View File

@ -2,9 +2,15 @@
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
# Install dagu (https://github.com/yohamta/dagu)
# Install utilities
RUN <<EOF
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
apt-get -y update
apt-get -y install git curl
EOF
# Install dagu
RUN <<EOF
curl -L https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz > dagu.tar.gz
tar zxvf dagu.tar.gz
mv dagu /usr/local/bin
rm dagu.tar.gz LICENSE.md README.md

View File

@ -1,20 +0,0 @@
# Daily job: download a source archive into a scratch directory and build a
# dataset from it.
schedule: "5 4 * * *"  # Run at 04:05.
env:
  - APP_DIR: /app
steps:
  - name: Create job dir
    command: mktemp -d /tmp/job.XXXXX
    # Later steps refer to the value stored here as $JOB_DIR.
    output: JOB_DIR
  - name: Fetch source code
    dir: $APP_DIR
    command: >-
      python -m tabby.tools.download_project
      --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2
      --output_dir $JOB_DIR/repository
    depends:
      - Create job dir
  - name: Generate dataset
    dir: $APP_DIR
    command: >-
      python -m tabby.tools.preprocess.preprocess_project
      --project_dir $JOB_DIR/repository
      --output_dir $JOB_DIR/dataset
    depends:
      - Fetch source code

View File

@ -0,0 +1,5 @@
# Git repositories to mirror. Each [repositories.<name>] table supplies the
# clone URL; the updater script in this change uses <name> as the checkout
# directory under its --data_dir.
# NOTE(review): presumably mounted as /config/repository.toml — confirm.
[repositories.quickjs]
url = "https://gitee.com/vsf-linux/quickjs.git"
[repositories.transformers]
url = "https://gitee.com/angzhao/transformers.git"

View File

@ -7,11 +7,11 @@ services:
command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000
environment:
- PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped
- EVENTS_LOG_DIR=/logs/tabby-server
- EVENTS_LOG_DIR=/logs
ports:
- "5000:5000"
volumes:
- ./data/logs:/logs
- ./data/logs/tabby-server:/logs
- ./data/hf_cache:/root/.cache/huggingface
admin:
@ -35,7 +35,9 @@ services:
image: tabbyml/tabby
container_name: tabby-dagu
command: dagu server --host 0.0.0.0 --port 8080
ports:
- 8080:8080
volumes:
- ./config/dags:/root/.dagu/dags
- ./config:/config:ro
- ./data/repositories:/repositories
- ./data/dataset:/dataset
environment:
DAGU_DAGS: tabby/tasks

18
poetry.lock generated
View File

@ -1013,6 +1013,22 @@ files = [
[package.dependencies]
gitdb = ">=4.0.1,<5"
[[package]]
name = "gitup"
version = "0.5.1"
description = "Easily update multiple git repositories at once"
category = "main"
optional = false
python-versions = "*"
files = [
{file = "gitup-0.5.1-py3-none-any.whl", hash = "sha256:ee5b0314062edbccee900b851c030233378613256c141422a22a831c6201d8f8"},
{file = "gitup-0.5.1.tar.gz", hash = "sha256:4f787079cd65d8f60c5842181204635e1b72d3533ae91f0c619624c6b20846dd"},
]
[package.dependencies]
colorama = ">=0.3.9"
GitPython = ">=2.1.8"
[[package]]
name = "greenlet"
version = "2.0.2"
@ -3396,4 +3412,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
[metadata]
lock-version = "2.0"
python-versions = "^3.10"
content-hash = "0f2c2fbfd49c0a69dccf125d3ab96a00115b844753c2063b68d936bc2e8e39d1"
content-hash = "c4e4670acf2dd356d6e625d590ec109eee7081c065fc54ce97e9f69bdcb21844"

View File

@ -15,6 +15,9 @@ uvicorn = "^0.21.1"
tritonclient = {extras = ["all"], version = "^2.31.0"}
streamlit = "^1.20.0"
loguru = "^0.6.0"
gitup = "^0.5.1"
toml = "^0.10.2"
gitpython = "^3.1.31"
[tool.poetry.group.dev.dependencies]

15
tabby/tasks/trainer.yaml Normal file
View File

@ -0,0 +1,15 @@
# Daily job: sync the configured git repositories, then rebuild the dataset
# from the synced checkouts.
schedule: "5 4 * * *"  # Run at 04:05.
env:
  # Put /opt/conda/bin ahead of the inherited PATH for the `python` calls below.
  - PATH: "/opt/conda/bin:$PATH"
  - APP_DIR: /app
steps:
  - name: Update repositories
    dir: $APP_DIR
    command: >-
      python -m tabby.tools.repository.updater
      --data_dir=/repositories
      --config_file=/config/repository.toml
  - name: Generate dataset
    dir: $APP_DIR
    command: >-
      python -m tabby.tools.preprocess.build_dataset
      --project_dir /repositories
      --output_dir /dataset
    depends:
      - Update repositories

View File

@ -1,35 +0,0 @@
import tarfile
from dataclasses import dataclass, field
from typing import Optional
import requests
from transformers import HfArgumentParser
@dataclass
class DownloaderArgs:
    """Command-line arguments for the source-archive downloader."""

    # Compressed tarballs other than .tar.gz are accepted as well, so the help
    # text names the common suffixes instead of only "tar.gz".
    url: str = field(
        metadata={
            "help": "URL to source code tarball (e.g. .tar.gz, .tar.xz, .tar.bz2)"
        }
    )
    # Directory the archive is unpacked into.
    output_dir: str = field(metadata={"help": "Output save path directory"})
def parse_args():
    """Parse the command line using the options declared on DownloaderArgs.

    Returns:
        Whatever ``HfArgumentParser.parse_args()`` yields; callers read the
        ``url`` and ``output_dir`` attributes from it.
    """
    return HfArgumentParser(DownloaderArgs).parse_args()
def download_and_untar(url, output_dir):
    """Download a tar archive from ``url`` and unpack it into ``output_dir``.

    The HTTP body is streamed directly into ``tarfile`` rather than being
    saved to an intermediate file.

    Args:
        url: Location of a plain or gzip/bzip2/xz-compressed tar archive.
        output_dir: Directory the archive members are extracted into.

    Raises:
        requests.HTTPError: The server answered with a 4xx/5xx status.
        tarfile.TarError: The response body is not a readable tar archive.
    """
    # Release the connection even when extraction fails.
    with requests.get(url, stream=True) as response:
        # Fail on error statuses instead of feeding an error page to tarfile.
        response.raise_for_status()
        # "r|*" reads a non-seekable stream and detects the compression from
        # the data itself, so it no longer depends on the URL's suffix.
        with tarfile.open(fileobj=response.raw, mode="r|*") as archive:
            # The archive comes from the network: where the interpreter
            # supports extraction filters, reject members that would land
            # outside output_dir.
            extract_kwargs = (
                {"filter": "data"} if hasattr(tarfile, "data_filter") else {}
            )
            archive.extractall(output_dir, **extract_kwargs)
# Script entry point: read --url / --output_dir and fetch + unpack the archive.
if __name__ == "__main__":
    cli = parse_args()
    download_and_untar(url=cli.url, output_dir=cli.output_dir)

View File

@ -42,6 +42,7 @@ def dataset_iter(files):
def gen():
mappings = read_extension_to_language_mappings()
for x in files:
print("Processing", x)
_, extname = os.path.splitext(x)
with open(x) as f:

View File

@ -0,0 +1,11 @@
# Smoke test for updater.py. After the cleanup step the first invocation has
# no checkout to reuse; the second invocation runs against the checkout the
# first one left behind.

# Neither target produces a file of its own name, so mark both phony; without
# this a stray file or directory called `test` would make `make test` a no-op.
.PHONY: all test

all:

CMD := poetry run python updater.py --data_dir=./testdata/repositories --config_file=testdata/config.toml

test:
	# Cleanup
	rm -rf testdata/repositories/quickjs
	# Initialize
	$(CMD)
	# Updating
	$(CMD)

View File

@ -0,0 +1,4 @@
# Updater configuration with a single repository.
# NOTE(review): presumably the testdata/config.toml passed by the Makefile
# test target — confirm.
# NOTE(review): the updater script in this change reads only the
# `repositories` table and takes its data directory from --data_dir, so
# `data_dir` below looks unused — confirm before relying on it.
data_dir = "/repositories"
[repositories.quickjs]
url = "https://gitee.com/vsf-linux/quickjs.git"

View File

@ -0,0 +1 @@
quickjs

View File

View File

@ -0,0 +1,34 @@
import os
import pathlib
from dataclasses import dataclass, field
import toml
from git import Repo
from transformers import HfArgumentParser
@dataclass
class Arguments:
    """Command-line options accepted by the repository updater."""

    # Directory that holds one checkout per configured repository.
    data_dir: str = field(
        metadata={"help": "Base dir for repositories"},
    )
    # TOML file whose `repositories` table lists what to sync.
    config_file: str = field(
        metadata={"help": "Configuration file for tabby updater"},
    )
def parse_args():
    """Parse the command line using the options declared on Arguments.

    Returns:
        Whatever ``HfArgumentParser.parse_args()`` yields; callers read the
        ``data_dir`` and ``config_file`` attributes from it.
    """
    return HfArgumentParser(Arguments).parse_args()
# Script entry point: clone every configured repository that is missing from
# data_dir, then let `gitup` bring all checkouts under data_dir up to date.
if __name__ == "__main__":
    # Imported here because only the script entry point needs it.
    import subprocess

    args = parse_args()
    config = toml.load(args.config_file)
    repositories = config["repositories"]

    # A distinct loop variable keeps the loaded `config` from being shadowed.
    for name, repo_config in repositories.items():
        path = pathlib.Path(args.data_dir, name)
        if path.is_dir():
            # Opening the directory as a repository is kept as a sanity check
            # on the existing checkout; the object itself is not needed.
            Repo(path)
        else:
            # Shallow clone: only the latest revision is fetched.
            Repo.clone_from(repo_config["url"], path.absolute(), depth=1)

    # An argument list avoids shell interpretation of data_dir (spaces, quotes,
    # metacharacters), and check=True makes a failed update fail the script
    # instead of being silently ignored.
    subprocess.run(["gitup", args.data_dir], check=True)