Add tabby.tools.repository.updater for easier git repository synchronization. (#9)
* Move dags to tabby.tasks
* Add repository syncer
* Follow redirect for curl
add-more-languages
parent
8cf533016a
commit
8144e4f83a
10
Dockerfile
10
Dockerfile
|
|
@ -2,9 +2,15 @@
|
|||
|
||||
FROM pytorch/pytorch:2.0.0-cuda11.7-cudnn8-runtime
|
||||
|
||||
# Install dagu (https://github.com/yohamta/dagu)
|
||||
# Install utilities
|
||||
RUN <<EOF
|
||||
python -c "import urllib.request; urllib.request.urlretrieve('https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz', 'dagu.tar.gz')"
|
||||
apt-get -y update
|
||||
apt-get -y install git curl
|
||||
EOF
|
||||
|
||||
# Install dagu
|
||||
RUN <<EOF
|
||||
curl -L https://github.com/yohamta/dagu/releases/download/v1.10.2/dagu_1.10.2_Linux_x86_64.tar.gz > dagu.tar.gz
|
||||
tar zxvf dagu.tar.gz
|
||||
mv dagu /usr/local/bin
|
||||
rm dagu.tar.gz LICENSE.md README.md
|
||||
|
|
|
|||
|
|
@ -1,20 +0,0 @@
|
|||
schedule: "5 4 * * *" # Run at 04:05.
|
||||
|
||||
env:
|
||||
- APP_DIR: /app
|
||||
steps:
|
||||
- name: Create job dir
|
||||
command: mktemp -d /tmp/job.XXXXX
|
||||
output: JOB_DIR
|
||||
|
||||
- name: Fetch source code
|
||||
dir: $APP_DIR
|
||||
command: python -m tabby.tools.download_project --url http://download.savannah.gnu.org/releases/tinycc/tcc-0.9.24.tar.bz2 --output_dir $JOB_DIR/repository
|
||||
depends:
|
||||
- Create job dir
|
||||
|
||||
- name: Generate dataset
|
||||
dir: $APP_DIR
|
||||
command: python -m tabby.tools.preprocess.preprocess_project --project_dir $JOB_DIR/repository --output_dir $JOB_DIR/dataset
|
||||
depends:
|
||||
- Fetch source code
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
[repositories.quickjs]
|
||||
url = "https://gitee.com/vsf-linux/quickjs.git"
|
||||
|
||||
[repositories.transformers]
|
||||
url = "https://gitee.com/angzhao/transformers.git"
|
||||
|
|
@ -7,11 +7,11 @@ services:
|
|||
command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000
|
||||
environment:
|
||||
- PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped
|
||||
- EVENTS_LOG_DIR=/logs/tabby-server
|
||||
- EVENTS_LOG_DIR=/logs
|
||||
ports:
|
||||
- "5000:5000"
|
||||
volumes:
|
||||
- ./data/logs:/logs
|
||||
- ./data/logs/tabby-server:/logs
|
||||
- ./data/hf_cache:/root/.cache/huggingface
|
||||
|
||||
admin:
|
||||
|
|
@ -35,7 +35,9 @@ services:
|
|||
image: tabbyml/tabby
|
||||
container_name: tabby-dagu
|
||||
command: dagu server --host 0.0.0.0 --port 8080
|
||||
ports:
|
||||
- 8080:8080
|
||||
volumes:
|
||||
- ./config/dags:/root/.dagu/dags
|
||||
- ./config:/config:ro
|
||||
- ./data/repositories:/repositories
|
||||
- ./data/dataset:/dataset
|
||||
environment:
|
||||
DAGU_DAGS: tabby/tasks
|
||||
|
|
|
|||
|
|
@ -1013,6 +1013,22 @@ files = [
|
|||
[package.dependencies]
|
||||
gitdb = ">=4.0.1,<5"
|
||||
|
||||
[[package]]
|
||||
name = "gitup"
|
||||
version = "0.5.1"
|
||||
description = "Easily update multiple git repositories at once"
|
||||
category = "main"
|
||||
optional = false
|
||||
python-versions = "*"
|
||||
files = [
|
||||
{file = "gitup-0.5.1-py3-none-any.whl", hash = "sha256:ee5b0314062edbccee900b851c030233378613256c141422a22a831c6201d8f8"},
|
||||
{file = "gitup-0.5.1.tar.gz", hash = "sha256:4f787079cd65d8f60c5842181204635e1b72d3533ae91f0c619624c6b20846dd"},
|
||||
]
|
||||
|
||||
[package.dependencies]
|
||||
colorama = ">=0.3.9"
|
||||
GitPython = ">=2.1.8"
|
||||
|
||||
[[package]]
|
||||
name = "greenlet"
|
||||
version = "2.0.2"
|
||||
|
|
@ -3396,4 +3412,4 @@ testing = ["coverage (>=5.0.3)", "zope.event", "zope.testing"]
|
|||
[metadata]
|
||||
lock-version = "2.0"
|
||||
python-versions = "^3.10"
|
||||
content-hash = "0f2c2fbfd49c0a69dccf125d3ab96a00115b844753c2063b68d936bc2e8e39d1"
|
||||
content-hash = "c4e4670acf2dd356d6e625d590ec109eee7081c065fc54ce97e9f69bdcb21844"
|
||||
|
|
|
|||
|
|
@ -15,6 +15,9 @@ uvicorn = "^0.21.1"
|
|||
tritonclient = {extras = ["all"], version = "^2.31.0"}
|
||||
streamlit = "^1.20.0"
|
||||
loguru = "^0.6.0"
|
||||
gitup = "^0.5.1"
|
||||
toml = "^0.10.2"
|
||||
gitpython = "^3.1.31"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
|
|
|
|||
|
|
@ -0,0 +1,15 @@
|
|||
schedule: "5 4 * * *" # Run at 04:05.
|
||||
|
||||
env:
|
||||
- PATH: "/opt/conda/bin:$PATH"
|
||||
- APP_DIR: /app
|
||||
steps:
|
||||
- name: Update repositories
|
||||
dir: $APP_DIR
|
||||
command: python -m tabby.tools.repository.updater --data_dir=/repositories --config_file=/config/repository.toml
|
||||
|
||||
- name: Generate dataset
|
||||
dir: $APP_DIR
|
||||
command: python -m tabby.tools.preprocess.build_dataset --project_dir /repositories --output_dir /dataset
|
||||
depends:
|
||||
- Update repositories
|
||||
|
|
@ -1,35 +0,0 @@
|
|||
import tarfile
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
import requests
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
|
||||
@dataclass
class DownloaderArgs:
    """Options accepted by the downloader; consumed by ``HfArgumentParser``."""

    # Where the source archive is fetched from.
    url: str = field(metadata=dict(help="URL to source code tar.gz file"))
    # Directory the archive contents are extracted into.
    output_dir: str = field(metadata=dict(help="Output save path directory"))
|
||||
|
||||
|
||||
def parse_args():
    """Parse the process command line using the ``DownloaderArgs`` schema."""
    return HfArgumentParser(DownloaderArgs).parse_args()
|
||||
|
||||
|
||||
def download_and_untar(url, output_dir):
    """Download the tar archive at *url* and extract it into *output_dir*.

    The HTTP body is streamed straight into ``tarfile`` without being
    written to disk first.

    Args:
        url: Location of a tar archive (plain, gzip, bz2 or xz compressed).
        output_dir: Directory the archive members are extracted into.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        tarfile.TarError: If the body is not a readable tar archive.
    """
    with requests.get(url, stream=True) as response:
        # Fail on 4xx/5xx instead of feeding an error page to tarfile.
        response.raise_for_status()

        # ``response.raw`` is a non-seekable stream, so a stream mode ("r|...")
        # is required; "r|*" additionally detects the compression from the
        # data itself rather than from the URL suffix.
        with tarfile.open(fileobj=response.raw, mode="r|*") as archive:
            # NOTE(review): extractall() trusts member paths inside the
            # archive; only use this with archives from trusted sources.
            archive.extractall(output_dir)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: read the CLI options, then fetch and unpack.
    cli = parse_args()
    download_and_untar(url=cli.url, output_dir=cli.output_dir)
|
||||
|
|
@ -42,6 +42,7 @@ def dataset_iter(files):
|
|||
def gen():
|
||||
mappings = read_extension_to_language_mappings()
|
||||
for x in files:
|
||||
print("Processing", x)
|
||||
_, extname = os.path.splitext(x)
|
||||
|
||||
with open(x) as f:
|
||||
|
|
@ -0,0 +1,11 @@
|
|||
all:
|
||||
|
||||
CMD := poetry run python updater.py --data_dir=./testdata/repositories --config_file=testdata/config.toml
|
||||
|
||||
test:
|
||||
# Cleanup
|
||||
rm -rf testdata/repositories/quickjs
|
||||
# Initialize
|
||||
$(CMD)
|
||||
# Updating
|
||||
$(CMD)
|
||||
|
|
@ -0,0 +1,4 @@
|
|||
data_dir = "/repositories"
|
||||
|
||||
[repositories.quickjs]
|
||||
url = "https://gitee.com/vsf-linux/quickjs.git"
|
||||
|
|
@ -0,0 +1 @@
|
|||
quickjs
|
||||
|
|
@ -0,0 +1,34 @@
|
|||
import os
|
||||
import pathlib
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import toml
|
||||
from git import Repo
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
|
||||
@dataclass
class Arguments:
    """Options accepted by the repository updater; consumed by ``HfArgumentParser``."""

    # Directory under which each repository gets its own sub-directory.
    data_dir: str = field(metadata=dict(help="Base dir for repositories"))
    # TOML file that is loaded for the "repositories" table.
    config_file: str = field(
        metadata=dict(help="Configuration file for tabby updater")
    )
|
||||
|
||||
|
||||
def parse_args():
    """Parse the process command line using the ``Arguments`` schema."""
    return HfArgumentParser(Arguments).parse_args()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Local import: only this script body needs it.
    import subprocess

    args = parse_args()

    # The config file carries one [repositories.<name>] table per repository,
    # each providing a "url" key.
    repositories = toml.load(args.config_file)["repositories"]

    for name, repo_config in repositories.items():
        path = pathlib.Path(args.data_dir, name)
        if path.is_dir():
            # Opening the existing checkout makes GitPython reject a directory
            # that is not a git repository before gitup is pointed at it.
            Repo(path)
        else:
            # First run for this repository: shallow clone into place.
            Repo.clone_from(repo_config["url"], path.absolute(), depth=1)

    # Run gitup over the base directory. An argument list (no shell) keeps
    # paths with spaces or shell metacharacters intact.
    result = subprocess.run(["gitup", args.data_dir], check=False)

    # Surface gitup's exit status instead of silently discarding it.
    raise SystemExit(result.returncode)
|
||||
Loading…
Reference in New Issue