feat: integrate projects / dataset information in admin. (#38)

* feat: add projects page in admin

* feat: integrate update_dataset job

* feat: display dataset info in projects
add-more-languages
Meng Zhang 2023-04-03 13:04:04 +08:00 committed by GitHub
parent a241c08fc3
commit 1c61ef3944
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 86 additions and 17 deletions

View File

@ -7,8 +7,6 @@ services:
environment:
MODEL_NAME: ${MODEL_NAME}
MODEL_BACKEND: triton
LOGS_DIR: /data/logs
DAGU_DAGS: /app/tabby/tasks
ports:
- "5000:5000"
- "8080:8080"

View File

@ -4,15 +4,28 @@ set -e
# Shared environment variables
# Every value can be overridden from the caller's environment; the part after
# ":-" is the default used when the variable is unset or empty.
export LOGS_DIR="${LOGS_DIR:-/data/logs}"
export DB_FILE="${DB_FILE:-/data/logs/duckdb/duck.db}"
export CONFIG_FILE="${CONFIG_FILE:-/data/config/tabby.toml}"

# server
export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
export MODEL_BACKEND="${MODEL_BACKEND:-python}"

# projects
# GIT_REPOSITORIES_DIR wins when set; REPOSITORIES_DIR is kept as a fallback
# so existing deployments that set it keep their checkout location.
export GIT_REPOSITORIES_DIR="${GIT_REPOSITORIES_DIR:-${REPOSITORIES_DIR:-/data/repositories}}"
# The dataset directory has its own override. Deriving it from
# REPOSITORIES_DIR would make it the same path as the checkout directory
# whenever that variable is set.
export DATASET_DIR="${DATASET_DIR:-/data/dataset}"

# dagu
export DAGU_DAGS="tabby/tasks"
# One-time setup: make sure the config file exists, relax git's safe-directory
# check, then run the model download tool for $MODEL_NAME.
init() {
  # Create an empty config file (and its parent directory) when it is missing.
  # Expansions are quoted so paths containing spaces or glob characters work.
  if [ ! -f "$CONFIG_FILE" ]; then
    mkdir -p "$(dirname "$CONFIG_FILE")"
    touch "$CONFIG_FILE"
  fi

  # Disable safe directory check
  git config --global --add safe.directory '*'

  python -m tabby.tools.download_models --repo_id="$MODEL_NAME"
}

View File

@ -1,5 +0,0 @@
[repositories.quickjs]
url = "https://gitee.com/vsf-linux/quickjs.git"
[repositories.transformers]
url = "https://gitee.com/angzhao/transformers.git"

View File

@ -0,0 +1,51 @@
import os
from pathlib import Path
import streamlit as st
import toml
from datasets import load_from_disk
from git import Repo
st.set_page_config(page_title="Tabby Admin - Projects")

# Locations come from the environment; each name is None when its variable
# is not set.
dataset_dir = os.environ.get("DATASET_DIR")
git_repositories_dir = os.environ.get("GIT_REPOSITORIES_DIR")
config_file = os.environ.get("CONFIG_FILE")

# Parsed once, when the page script runs.
config = toml.load(config_file)
def dataset_info():
    """Render the "Dataset" section of the page.

    Writes a "*Not populated*" placeholder when no dataset directory is
    configured or the directory does not exist. Otherwise loads the dataset
    from disk and writes its length next to the "Source files" label.
    """
    st.subheader("Dataset")

    # DATASET_DIR may be unset; Path(None) would raise TypeError, so treat a
    # missing setting the same way as a missing directory.
    if dataset_dir is None or not Path(dataset_dir).is_dir():
        st.write("*Not populated*")
        return

    dataset = load_from_disk(dataset_dir)
    st.write("Source files: ", len(dataset))
def project_list():
    """Render one section per project found under ``projects`` in the config.

    Each section shows the project's ``git_url`` and a status line: the commit
    hash that the local checkout's HEAD points at, or a "Before
    Initialization" marker when there is no checkout directory yet. When the
    config has no projects, a hint naming the config file is written instead.
    """
    projects = config.get("projects", {})
    if not projects:
        st.write("Your project list is empty")
        st.write(f"Edit `{config_file}` to add projects")
        return

    for name, project in projects.items():
        st.subheader(name)
        st.write(f'Git: {project["git_url"]}')

        # GIT_REPOSITORIES_DIR may be unset; Path(None, name) would raise
        # TypeError, so an unset directory is reported as "not initialised".
        checkout = None
        if git_repositories_dir is not None:
            checkout = Path(git_repositories_dir, name)
        if checkout is None or not checkout.is_dir():
            st.write("Status: *Before Initialization*")
            continue

        # Resolve the commit through HEAD itself rather than the active
        # branch, so a checkout with a detached HEAD is displayed too.
        sha = Repo(checkout).head.commit.hexsha
        st.write(f"Status: `{sha}`")
# Page body, top to bottom: the dataset section, a "---" separator, then the
# per-project sections.
dataset_info()
st.write("---")
project_list()

View File

@ -10,7 +10,7 @@ def random_completion_id():
def trim_with_stop_words(output: str, stopwords: list) -> str:
    """Cut *output* at the earliest occurrence of any stop word.

    Args:
        output: Text to trim.
        stopwords: Strings that end the text. Empty strings are ignored.

    Returns:
        The part of ``output`` that precedes the earliest stop word, or
        ``output`` unchanged when none of the stop words occurs in it.
    """
    cut = len(output)
    for word in stopwords:
        if not word:
            # "" is found at index 0 and would wipe the whole output.
            continue
        index = output.find(word)
        if index != -1:
            # Always search the untouched text, so a stop word that overlaps
            # another one is still seen; keep the smallest index.
            cut = min(cut, index)
    return output[:cut]

View File

@ -9,7 +9,7 @@ from . import models
def setup_logging(logdir):
    """Delete the log directory left behind by an earlier run.

    NOTE(review): only this part of the function is visible in the diff; any
    statements that follow it in the real file are outside this view.

    Args:
        logdir: Path of the log directory to remove. A path that does not
            exist is ignored.
    """
    try:
        # rmtree does not expand globs: a path ending in "/*" names a literal
        # entry called "*", so the directory itself is what gets removed.
        shutil.rmtree(logdir)
    except FileNotFoundError:
        pass

View File

@ -1,15 +1,18 @@
# Scheduled job: refresh the project checkouts, then rebuild the dataset.
schedule: "5 4 * * *" # Run daily at 04:05.
env:
  - PATH: "$PATH"
  - APP_DIR: /app
  # Passed through from the caller's environment so both steps use the same
  # locations.
  - CONFIG_FILE: "$CONFIG_FILE"
  - GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
  - DATASET_DIR: "$DATASET_DIR"
steps:
  - name: Update repositories
    dir: $APP_DIR
    command: python -m tabby.tools.repository.updater --data_dir=$GIT_REPOSITORIES_DIR --config_file=$CONFIG_FILE
  - name: Generate dataset
    dir: $APP_DIR
    command: python -m tabby.tools.preprocess.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
    # The dataset is built from the checkouts, so wait for them to be updated.
    depends:
      - Update repositories

View File

@ -1,5 +1,6 @@
import os
import pathlib
import shutil
from dataclasses import dataclass, field
import toml
@ -22,13 +23,21 @@ if __name__ == "__main__":
# Script body: sync the data directory with the projects listed in the config.
import subprocess

args = parse_args()
config = toml.load(args.config_file)
repositories = config["projects"]

# Remove everything in the data directory that no configured project owns:
# directories whose name is not a project name, and any plain file.
for entry in pathlib.Path(args.data_dir).glob("*"):
    if entry.is_dir() and entry.name not in repositories:
        print("Remove unused dir:", entry)
        shutil.rmtree(str(entry))
    elif entry.is_file():
        print("Remove unused file:", entry)
        entry.unlink()

# Make sure every configured project has a checkout.
for name, project in repositories.items():
    path = pathlib.Path(args.data_dir, name)
    if path.is_dir():
        # Open the existing checkout; presumably this fails loudly when the
        # directory is not a git repository (confirm against GitPython).
        Repo(path)
    else:
        Repo.clone_from(project["git_url"], path.absolute(), depth=1)

# Run gitup on the data directory. The command is passed as an argument list
# so a path containing spaces or shell metacharacters is handed over
# verbatim; as before, the exit status is not checked.
subprocess.run(["gitup", str(args.data_dir)], check=False)