Add tabby.tools.repository.updater for easier git repository synchronization. (#9)
* Move dags to tabby.tasks
* Add repository syncer
* Follow redirect for curl
add-more-languages
parent
8cf533016a
commit
8144e4f83a
10
Dockerfile
10
Dockerfile
|
|
@ -2,9 +2,15 @@
|
||||||
|
|
||||||
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
|
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
|
||||||
|
|
||||||
# Install dagu (https://github.com/yohamta/dagu)
|
# Install utilities
|
||||||
RUN <<EOF
|
RUN <<EOF
|
||||||
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
|
apt-get -y update
|
||||||
|
apt-get -y install git curl
|
||||||
|
EOF
|
||||||
|
|
||||||
|
# Install dagu
|
||||||
|
RUN <<EOF
|
||||||
|
curl -L https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz > dagu.tar.gz
|
||||||
tar zxvf dagu.tar.gz
|
tar zxvf dagu.tar.gz
|
||||||
mv dagu /usr/local/bin
|
mv dagu /usr/local/bin
|
||||||
rm dagu.tar.gz LICENSE.md README.md
|
rm dagu.tar.gz LICENSE.md README.md
|
||||||
|
|
|
||||||
|
|
@ -1,20 +0,0 @@
|
||||||
schedule: "5 4 * * *" # Run at 04:05.
|
|
||||||
|
|
||||||
env:
|
|
||||||
- APP_DIR: /app
|
|
||||||
steps:
|
|
||||||
- name: Create job dir
|
|
||||||
command: mktemp -d /tmp/job.XXXXX
|
|
||||||
output: JOB_DIR
|
|
||||||
|
|
||||||
- name: Fetch source code
|
|
||||||
dir: $APP_DIR
|
|
||||||
command: python -m tabby.tools.download_project --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir $JOB_DIR/repository
|
|
||||||
depends:
|
|
||||||
- Create job dir
|
|
||||||
|
|
||||||
- name: Generate dataset
|
|
||||||
dir: $APP_DIR
|
|
||||||
command: python -m tabby.tools.preprocess.preprocess_project --project_dir $JOB_DIR/repository --output_dir $JOB_DIR/dataset
|
|
||||||
depends:
|
|
||||||
- Fetch source code
|
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
[repositories.quickjs]
|
||||||
|
url = "https://gitee.com/vsf-linux/quickjs.git"
|
||||||
|
|
||||||
|
[repositories.transformers]
|
||||||
|
url = "https://gitee.com/angzhao/transformers.git"
|
||||||
|
|
@ -7,11 +7,11 @@ services:
|
||||||
command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000
|
command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000
|
||||||
environment:
|
environment:
|
||||||
- PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped
|
- PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped
|
||||||
- EVENTS_LOG_DIR=/logs/tabby-server
|
- EVENTS_LOG_DIR=/logs
|
||||||
ports:
|
ports:
|
||||||
- "5000:5000"
|
- "5000:5000"
|
||||||
volumes:
|
volumes:
|
||||||
- ./data/logs:/logs
|
- ./data/logs/tabby-server:/logs
|
||||||
- ./data/hf_cache:/root/.cache/huggingface
|
- ./data/hf_cache:/root/.cache/huggingface
|
||||||
|
|
||||||
admin:
|
admin:
|
||||||
|
|
@ -35,7 +35,9 @@ services:
|
||||||
image: tabbyml/tabby
|
image: tabbyml/tabby
|
||||||
container_name: tabby-dagu
|
container_name: tabby-dagu
|
||||||
command: dagu server --host 0.0.0.0 --port 8080
|
command: dagu server --host 0.0.0.0 --port 8080
|
||||||
ports:
|
|
||||||
- 8080:8080
|
|
||||||
volumes:
|
volumes:
|
||||||
- ./config/dags:/root/.dagu/dags
|
- ./config:/config:ro
|
||||||
|
- ./data/repositories:/repositories
|
||||||
|
- ./data/dataset:/dataset
|
||||||
|
environment:
|
||||||
|
DAGU_DAGS: tabby/tasks
|
||||||
|
|
|
||||||
|
|
@ -1013,6 +1013,22 @@ files = [
|
||||||
[package.dependencies]
|
[package.dependencies]
|
||||||
gitdb = ">=4.0.1,<5"
|
gitdb = ">=4.0.1,<5"
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "gitup"
|
||||||
|
version = "0.5.1"
|
||||||
|
description = "Easily update multiple git repositories at once"
|
||||||
|
category = "main"
|
||||||
|
optional = false
|
||||||
|
python-versions = "*"
|
||||||
|
files = [
|
||||||
|
{file = "gitup-0.5.1-py3-none-any.whl", hash = "sha256:ee5b0314062edbccee900b851c030233378613256c141422a22a831c6201d8f8"},
|
||||||
|
{file = "gitup-0.5.1.tar.gz", hash = "sha256:4f787079cd65d8f60c5842181204635e1b72d3533ae91f0c619624c6b20846dd"},
|
||||||
|
]
|
||||||
|
|
||||||
|
[package.dependencies]
|
||||||
|
colorama = ">=0.3.9"
|
||||||
|
GitPython = ">=2.1.8"
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "greenlet"
|
name = "greenlet"
|
||||||
version = "2.0.2"
|
version = "2.0.2"
|
||||||
|
|
@ -3396,4 +3412,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.10"
|
python-versions = "^3.10"
|
||||||
content-hash = "0f2c2fbfd49c0a69dccf125d3ab96a00115b844753c2063b68d936bc2e8e39d1"
|
content-hash = "c4e4670acf2dd356d6e625d590ec109eee7081c065fc54ce97e9f69bdcb21844"
|
||||||
|
|
|
||||||
|
|
@ -15,6 +15,9 @@ uvicorn = "^0.21.1"
|
||||||
tritonclient = {extras = ["all"], version = "^2.31.0"}
|
tritonclient = {extras = ["all"], version = "^2.31.0"}
|
||||||
streamlit = "^1.20.0"
|
streamlit = "^1.20.0"
|
||||||
loguru = "^0.6.0"
|
loguru = "^0.6.0"
|
||||||
|
gitup = "^0.5.1"
|
||||||
|
toml = "^0.10.2"
|
||||||
|
gitpython = "^3.1.31"
|
||||||
|
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,15 @@
|
||||||
|
schedule: "5 4 * * *" # Run at 04:05.
|
||||||
|
|
||||||
|
env:
|
||||||
|
- PATH: "/opt/conda/bin:$PATH"
|
||||||
|
- APP_DIR: /app
|
||||||
|
steps:
|
||||||
|
- name: Update repositories
|
||||||
|
dir: $APP_DIR
|
||||||
|
command: python -m tabby.tools.repository.updater --data_dir=/repositories --config_file=/config/repository.toml
|
||||||
|
|
||||||
|
- name: Generate dataset
|
||||||
|
dir: $APP_DIR
|
||||||
|
command: python -m tabby.tools.preprocess.build_dataset --project_dir /repositories --output_dir /dataset
|
||||||
|
depends:
|
||||||
|
- Update repositories
|
||||||
|
|
@ -1,35 +0,0 @@
|
||||||
import tarfile
|
|
||||||
from dataclasses import dataclass, field
|
|
||||||
from typing import Optional
|
|
||||||
|
|
||||||
import requests
|
|
||||||
from transformers import HfArgumentParser
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class DownloaderArgs:
    """Command-line arguments accepted by the project downloader."""

    # Location of the source archive to fetch.
    url: str = field(
        metadata={"help": "URL to source code tar.gz file"},
    )
    # Directory the archive contents are extracted into.
    output_dir: str = field(
        metadata={"help": "Output save path directory"},
    )
|
|
||||||
|
|
||||||
|
|
||||||
def parse_args():
    """Parse the process command line using the DownloaderArgs fields."""
    return HfArgumentParser(DownloaderArgs).parse_args()
|
|
||||||
|
|
||||||
|
|
||||||
def download_and_untar(url, output_dir):
    """Download a tar archive from *url* and extract it into *output_dir*.

    The archive is streamed straight from the HTTP response into ``tarfile``
    without being written to disk first.

    Args:
        url: Location of a tar archive (plain or gz/bz2/xz compressed).
        output_dir: Directory the archive members are extracted into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        tarfile.TarError: If the payload is not a readable tar archive.
    """
    with requests.get(url, stream=True) as response:
        # Fail early on 4xx/5xx instead of feeding an error page to tarfile.
        response.raise_for_status()

        # "r|*" reads a non-seekable stream and auto-detects the compression,
        # so plain tarballs and suffixes such as ".tgz" work as well.
        with tarfile.open(fileobj=response.raw, mode="r|*") as archive:
            # NOTE(security): extractall trusts member paths; only use this
            # with archives from trusted sources.
            archive.extractall(output_dir)
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Script entry point: fetch the archive named on the command line.
    cli = parse_args()
    download_and_untar(cli.url, cli.output_dir)
|
|
||||||
|
|
@ -42,6 +42,7 @@ def dataset_iter(files):
|
||||||
def gen():
|
def gen():
|
||||||
mappings = read_extension_to_language_mappings()
|
mappings = read_extension_to_language_mappings()
|
||||||
for x in files:
|
for x in files:
|
||||||
|
print("Processing", x)
|
||||||
_, extname = os.path.splitext(x)
|
_, extname = os.path.splitext(x)
|
||||||
|
|
||||||
with open(x) as f:
|
with open(x) as f:
|
||||||
|
|
@ -0,0 +1,11 @@
|
||||||
|
all:
|
||||||
|
|
||||||
|
CMD := poetry run python updater.py --data_dir=./testdata/repositories --config_file=testdata/config.toml
|
||||||
|
|
||||||
|
test:
|
||||||
|
# Cleanup
|
||||||
|
rm -rf testdata/repositories/quickjs
|
||||||
|
# Initialize
|
||||||
|
$(CMD)
|
||||||
|
# Updating
|
||||||
|
$(CMD)
|
||||||
|
|
@ -0,0 +1,4 @@
|
||||||
|
data_dir = "/repositories"
|
||||||
|
|
||||||
|
[repositories.quickjs]
|
||||||
|
url = "https://gitee.com/vsf-linux/quickjs.git"
|
||||||
|
|
@ -0,0 +1 @@
|
||||||
|
quickjs
|
||||||
|
|
@ -0,0 +1,34 @@
|
||||||
|
import os
|
||||||
|
import pathlib
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
|
||||||
|
import toml
|
||||||
|
from git import Repo
|
||||||
|
from transformers import HfArgumentParser
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class Arguments:
    """Command-line arguments accepted by the repository updater."""

    # Directory under which each configured repository gets its own folder.
    data_dir: str = field(
        metadata={"help": "Base dir for repositories"},
    )
    # Path of the TOML file listing the repositories to synchronize.
    config_file: str = field(
        metadata={"help": "Configuration file for tabby updater"},
    )
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the process command line using the Arguments fields."""
    return HfArgumentParser(Arguments).parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: clone every configured repository that is missing
    # from data_dir, then let `gitup` pull updates for everything in it.
    # Imported here so this block is self-contained.
    import subprocess
    import sys

    args = parse_args()

    # The config file holds one [repositories.<name>] table per repository,
    # each with a `url` key.
    repositories = toml.load(args.config_file)["repositories"]

    for name, repo_config in repositories.items():
        path = pathlib.Path(args.data_dir, name)
        if path.is_dir():
            # Already present: opening it is kept from the original code;
            # presumably GitPython raises here if the directory is not a
            # git repository -- TODO confirm.
            Repo(path)
        else:
            # Shallow clone; only the latest snapshot is fetched.
            Repo.clone_from(repo_config["url"], path.absolute(), depth=1)

    # Pass an argument list rather than a shell string so a data_dir with
    # spaces or shell metacharacters is handled safely, and propagate
    # gitup's exit status so the caller can see a failed sync.
    sys.exit(subprocess.run(["gitup", args.data_dir]).returncode)
|
||||||
Loading…
Reference in New Issue