From 1c61ef39442b080de2129197e3190438f9825a81 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Mon, 3 Apr 2023 13:04:04 +0800
Subject: [PATCH] feat: integrate projects / dataset information in admin. (#38)

* feat: add projects page in admin

* feat: integrate update_dataset job

* feat: display dataset info in projects
---
Review notes (not part of the commit message):

- tabby.sh: DATASET_DIR took its override from REPOSITORIES_DIR, so setting
  REPOSITORIES_DIR pointed the dataset output at the repository checkout
  directory, which the updater cleans on every run. Each variable now reads
  its own name; GIT_REPOSITORIES_DIR still falls back to REPOSITORIES_DIR.
  $CONFIG_FILE expansions are quoted.
- updater.py: config["projects"] raised KeyError for the empty config file
  that tabby.sh creates with touch; config.get("projects", {}) matches what
  Projects.py already does.
- Projects.py: environment defaults mirror tabby.sh instead of None (which
  made toml.load / Path raise TypeError); the commit is read through
  repo.head so a detached HEAD does not raise; minor idiom cleanups.
- utils.py: an empty stop word is skipped, because str.find("") returns 0
  and would erase the whole completion.
- All fixes are in-line edits of added lines, so hunk line counts and the
  diffstat below are unchanged.

 deployment/docker-compose.yml                 |  2 -
 deployment/scripts/tabby.sh                   | 13 +++++
 development/config/repository.toml            |  5 --
 tabby/admin/pages/Projects.py                 | 51 +++++++++++++++++++
 tabby/server/backend/utils.py                 |  6 +--
 tabby/server/events.py                        |  2 +-
 .../{trainer.yaml => update_dataset.yaml}     | 11 ++--
 tabby/tools/repository/updater.py             | 13 ++++-
 8 files changed, 86 insertions(+), 17 deletions(-)
 delete mode 100644 development/config/repository.toml
 create mode 100644 tabby/admin/pages/Projects.py
 rename tabby/tasks/{trainer.yaml => update_dataset.yaml} (50%)

diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml
index 2891207..6979327 100644
--- a/deployment/docker-compose.yml
+++ b/deployment/docker-compose.yml
@@ -7,8 +7,6 @@ services:
     environment:
       MODEL_NAME: ${MODEL_NAME}
       MODEL_BACKEND: triton
-      LOGS_DIR: /data/logs
-      DAGU_DAGS: /app/tabby/tasks
     ports:
       - "5000:5000"
       - "8080:8080"
diff --git a/deployment/scripts/tabby.sh b/deployment/scripts/tabby.sh
index e16acdb..908a910 100755
--- a/deployment/scripts/tabby.sh
+++ b/deployment/scripts/tabby.sh
@@ -4,15 +4,28 @@ set -e
 # Shared environment variables
 export LOGS_DIR="${LOGS_DIR:-/data/logs}"
 export DB_FILE="${DB_FILE:-/data/logs/duckdb/duck.db}"
+export CONFIG_FILE="${CONFIG_FILE:-/data/config/tabby.toml}"
 
 # server
 export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
 export MODEL_BACKEND="${MODEL_BACKEND:-python}"
 
+# projects (REPOSITORIES_DIR is still honoured as a fallback for the checkout dir)
+export GIT_REPOSITORIES_DIR="${GIT_REPOSITORIES_DIR:-${REPOSITORIES_DIR:-/data/repositories}}"
+export DATASET_DIR="${DATASET_DIR:-/data/dataset}"
+
 # dagu
 export DAGU_DAGS="tabby/tasks"
 
 init() {
+if [ ! -f "$CONFIG_FILE" ]; then
+  mkdir -p "$(dirname "$CONFIG_FILE")"
+  touch "$CONFIG_FILE"
+fi
+
+# Disable safe directory check
+git config --global --add safe.directory '*'
+
 python -m tabby.tools.download_models --repo_id=$MODEL_NAME
 }
 
diff --git a/development/config/repository.toml b/development/config/repository.toml
deleted file mode 100644
index 54f124a..0000000
--- a/development/config/repository.toml
+++ /dev/null
@@ -1,5 +0,0 @@
-[repositories.quickjs]
-url = "https://gitee.com/vsf-linux/quickjs.git"
-
-[repositories.transformers]
-url = "https://gitee.com/angzhao/transformers.git"
diff --git a/tabby/admin/pages/Projects.py b/tabby/admin/pages/Projects.py
new file mode 100644
index 0000000..27ca3d0
--- /dev/null
+++ b/tabby/admin/pages/Projects.py
@@ -0,0 +1,51 @@
+import os
+from pathlib import Path
+
+import streamlit as st
+import toml
+from datasets import load_from_disk
+from git import Repo
+
+st.set_page_config(page_title="Tabby Admin - Projects")
+
+dataset_dir = os.environ.get("DATASET_DIR", "/data/dataset")
+git_repositories_dir = os.environ.get("GIT_REPOSITORIES_DIR", "/data/repositories")
+config_file = os.environ.get("CONFIG_FILE", "/data/config/tabby.toml")
+config = toml.load(config_file)
+
+
+def dataset_info():
+    st.subheader("Dataset")
+    if not Path(dataset_dir).is_dir():
+        st.write("*Not populated*")
+        return
+
+    info = load_from_disk(dataset_dir)
+    st.write("Source files: ", len(info))
+
+
+def project_list():
+    data = config.get("projects", {})
+
+    if not data:
+        st.write("Your project list is empty")
+        st.write(f"Edit `{config_file}` to add projects")
+        return
+
+    for k, v in data.items():
+        st.subheader(k)
+        st.write(f'Git: {v["git_url"]}')
+
+        git_repository = Path(git_repositories_dir, k)
+        if not git_repository.is_dir():
+            st.write("Status: *Before Initialization*")
+            continue
+
+        repo = Repo(git_repository)
+        sha = repo.head.commit.hexsha  # works for detached HEAD too
+        st.write(f"Status: `{sha}`")
+
+
+dataset_info()
+st.write("---")
+project_list()
diff --git a/tabby/server/backend/utils.py b/tabby/server/backend/utils.py
index 4d3b711..3a136dc 100644
--- a/tabby/server/backend/utils.py
+++ b/tabby/server/backend/utils.py
@@ -10,7 +10,7 @@ def random_completion_id():
 
 def trim_with_stop_words(output: str, stopwords: list) -> str:
     for w in sorted(stopwords, key=len, reverse=True):
-        if output.endswith(w):
-            output = output[: -len(w)]
-            break
+        index = output.find(w) if w else -1  # "" matches at 0; skip it
+        if index != -1:
+            output = output[:index]
     return output
diff --git a/tabby/server/events.py b/tabby/server/events.py
index d2d930d..1f4928d 100644
--- a/tabby/server/events.py
+++ b/tabby/server/events.py
@@ -9,7 +9,7 @@ from . import models
 
 def setup_logging(logdir):
     try:
-        shutil.rmtree(logdir + "/*")
+        shutil.rmtree(logdir)
     except FileNotFoundError:
         pass
 
diff --git a/tabby/tasks/trainer.yaml b/tabby/tasks/update_dataset.yaml
similarity index 50%
rename from tabby/tasks/trainer.yaml
rename to tabby/tasks/update_dataset.yaml
index 6e2b5df..90ef220 100644
--- a/tabby/tasks/trainer.yaml
+++ b/tabby/tasks/update_dataset.yaml
@@ -1,15 +1,18 @@
-# schedule: "5 4 * * *" # Run at 04:05.
+schedule: "5 4 * * *" # Run daily at 04:05.
 
 env:
-  - PATH: "/opt/conda/bin:$PATH"
+  - PATH: "$PATH"
   - APP_DIR: /app
+  - CONFIG_FILE: "$CONFIG_FILE"
+  - GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
+  - DATASET_DIR: "$DATASET_DIR"
 steps:
   - name: Update repositories
     dir: $APP_DIR
-    command: python -m tabby.tools.repository.updater --data_dir=/repositories --config_file=/config/repository.toml
+    command: python -m tabby.tools.repository.updater --data_dir=$GIT_REPOSITORIES_DIR --config_file=$CONFIG_FILE
 
   - name: Generate dataset
     dir: $APP_DIR
-    command: python -m tabby.tools.preprocess.build_dataset --project_dir /repositories --output_dir /dataset
+    command: python -m tabby.tools.preprocess.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
     depends:
       - Update repositories
diff --git a/tabby/tools/repository/updater.py b/tabby/tools/repository/updater.py
index dcb1fb2..6060500 100644
--- a/tabby/tools/repository/updater.py
+++ b/tabby/tools/repository/updater.py
@@ -1,5 +1,6 @@
 import os
 import pathlib
+import shutil
 from dataclasses import dataclass, field
 
 import toml
@@ -22,13 +23,21 @@ if __name__ == "__main__":
     args = parse_args()
 
     config = toml.load(args.config_file)
-    repositories = config["repositories"]
+    repositories = config.get("projects", {})  # empty config => no projects
+
+    for x in pathlib.Path(args.data_dir).glob("*"):
+        if x.is_dir() and x.name not in repositories:
+            print("Remove unused dir:", x)
+            shutil.rmtree(str(x))
+        elif x.is_file():
+            print("Remove unused file:", x)
+            x.unlink()
 
     for name, config in repositories.items():
         path = pathlib.Path(args.data_dir, name)
         if path.is_dir():
             repo = Repo(path)
         else:
-            Repo.clone_from(config["url"], path.absolute(), depth=1)
+            Repo.clone_from(config["git_url"], path.absolute(), depth=1)
 
     os.system(f"gitup {args.data_dir}")