feat: integrate projects / dataset information in admin. (#38)
* feat: add projects page in admin * feat: integrate update_dataset job * feat: display dataset info in projects
parent
a241c08fc3
commit
1c61ef3944
|
|
@ -7,8 +7,6 @@ services:
|
|||
environment:
|
||||
MODEL_NAME: ${MODEL_NAME}
|
||||
MODEL_BACKEND: triton
|
||||
LOGS_DIR: /data/logs
|
||||
DAGU_DAGS: /app/tabby/tasks
|
||||
ports:
|
||||
- "5000:5000"
|
||||
- "8080:8080"
|
||||
|
|
|
|||
|
|
@ -4,15 +4,28 @@ set -e
|
|||
# Shared environment variables
export LOGS_DIR="${LOGS_DIR:-/data/logs}"
export DB_FILE="${DB_FILE:-/data/logs/duckdb/duck.db}"
export CONFIG_FILE="${CONFIG_FILE:-/data/config/tabby.toml}"

# server
export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
export MODEL_BACKEND="${MODEL_BACKEND:-python}"

# projects
# NOTE: GIT_REPOSITORIES_DIR intentionally maps from the external
# REPOSITORIES_DIR variable — presumably the public knob; confirm.
export GIT_REPOSITORIES_DIR="${REPOSITORIES_DIR:-/data/repositories}"
# FIX: defaulted from REPOSITORIES_DIR (copy-paste from the line above),
# which would silently point the dataset at the repositories directory
# whenever REPOSITORIES_DIR was set. Must default from DATASET_DIR.
export DATASET_DIR="${DATASET_DIR:-/data/dataset}"

# dagu
export DAGU_DAGS="tabby/tasks"

# One-time container initialization: make sure the config file exists,
# relax git's ownership check (repos are cloned by another uid), and
# pre-download the model weights.
init() {
  # Quote all expansions so paths with spaces don't word-split.
  if [ ! -f "$CONFIG_FILE" ]; then
    mkdir -p "$(dirname "$CONFIG_FILE")"
    touch "$CONFIG_FILE"
  fi

  # Disable safe directory check
  git config --global --add safe.directory '*'

  python -m tabby.tools.download_models --repo_id="$MODEL_NAME"
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +0,0 @@
|
|||
[repositories.quickjs]
|
||||
url = "https://gitee.com/vsf-linux/quickjs.git"
|
||||
|
||||
[repositories.transformers]
|
||||
url = "https://gitee.com/angzhao/transformers.git"
|
||||
|
|
@ -0,0 +1,51 @@
|
|||
import os
|
||||
from pathlib import Path
|
||||
|
||||
import streamlit as st
|
||||
import toml
|
||||
from datasets import load_from_disk
|
||||
from git import Repo
|
||||
|
||||
# Configure the Streamlit page; must run before any other st.* call.
st.set_page_config(page_title="Tabby Admin - Projects")

# Paths injected by the container entrypoint; None when the env var is unset.
dataset_dir = os.environ.get("DATASET_DIR", None)
git_repositories_dir = os.environ.get("GIT_REPOSITORIES_DIR", None)
config_file = os.environ.get("CONFIG_FILE", None)
# NOTE(review): toml.load(None) raises if CONFIG_FILE is unset — this assumes
# the entrypoint always exports CONFIG_FILE; confirm against deployment.
config = toml.load(config_file)
|
||||
|
||||
|
||||
def dataset_info():
    """Render a summary of the preprocessed dataset, if one has been built.

    Reads the module-level ``dataset_dir`` (from the DATASET_DIR env var) and
    shows the number of source files in the saved HuggingFace dataset.
    """
    st.subheader("Dataset")
    # DATASET_DIR may be unset (dataset_dir is None) or the directory may not
    # exist yet because the update_dataset job hasn't run — treat both as
    # "not populated" instead of raising TypeError from Path(None).
    if not dataset_dir or not Path(dataset_dir).is_dir():
        st.write("*Not populated*")
        return

    info = load_from_disk(dataset_dir)
    st.write("Source files: ", len(info))
|
||||
|
||||
|
||||
def project_list():
    """Render each configured project with its git URL and checkout status.

    Projects come from the ``[projects]`` table of the loaded config; the
    local clone (if any) lives under ``git_repositories_dir / <name>``.
    """
    data = config.get("projects", {})

    # Empty (or missing) [projects] table — show a hint instead of a blank page.
    if not data:
        st.write("Your project list is empty")
        st.write(f"Edit `{config_file}` to add projects")
        return

    for name, project in data.items():
        st.subheader(name)
        st.write(f'Git: {project["git_url"]}')

        git_repository = Path(git_repositories_dir, name)
        if not git_repository.is_dir():
            # The updater job has not cloned this repository yet.
            # (was an f-string with no placeholders — plain literal suffices)
            st.write("Status: *Before Initialization*")
            continue

        repo = Repo(git_repository)
        # NOTE(review): active_branch raises on a detached HEAD — assumes the
        # updater always leaves a branch checked out; confirm.
        sha = repo.active_branch.commit.hexsha
        st.write(f"Status: `{sha}`")
|
||||
|
||||
|
||||
# Page layout: dataset summary on top, a divider, then the project list.
dataset_info()
st.write("---")
project_list()
|
||||
|
|
@ -10,7 +10,7 @@ def random_completion_id():
|
|||
|
||||
def trim_with_stop_words(output: str, stopwords: list) -> str:
    """Truncate *output* at the first occurrence of any stop word.

    The rendered diff interleaved the old body (endswith/break) with the new
    one (find/index); this is the coherent post-change version: a stop word
    anywhere in the text — not just at the end — cuts the output there.
    Longer stop words are tried first so an overlapping shorter word cannot
    pre-empt a longer match.

    Args:
        output: generated completion text.
        stopwords: list of stop-word strings; may be empty.

    Returns:
        The (possibly) truncated text.
    """
    for w in sorted(stopwords, key=len, reverse=True):
        index = output.find(w)
        if index != -1:
            output = output[:index]
    return output
|
||||
|
|
|
|||
|
|
@ -9,7 +9,7 @@ from . import models
|
|||
|
||||
def setup_logging(logdir):
    """Remove any logs left over from a previous run.

    The pre-change code called ``shutil.rmtree(logdir + "/*")`` — a real bug,
    since rmtree does not expand glob patterns and would always raise
    FileNotFoundError. The fixed version removes the directory itself; a
    missing directory (first run) is fine and ignored.

    Args:
        logdir: path to the log directory; removed recursively if present.
    """
    try:
        shutil.rmtree(logdir)
    except FileNotFoundError:
        # First run / already clean — nothing to remove.
        pass
|
||||
|
||||
|
|
|
|||
|
|
@ -1,15 +1,18 @@
|
|||
# Dagu DAG: refresh the configured git repositories, then rebuild the
# training dataset from them. Reconstructed post-change version of the
# diff (removed-before-added pairing); directories now come from the
# shared environment variables instead of hard-coded container paths.
schedule: "5 4 * * *" # Run daily at 04:05.

env:
  - PATH: "$PATH"
  - APP_DIR: /app
  - CONFIG_FILE: "$CONFIG_FILE"
  - GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
  - DATASET_DIR: "$DATASET_DIR"
steps:
  - name: Update repositories
    dir: $APP_DIR
    command: python -m tabby.tools.repository.updater --data_dir=$GIT_REPOSITORIES_DIR --config_file=$CONFIG_FILE

  - name: Generate dataset
    dir: $APP_DIR
    command: python -m tabby.tools.preprocess.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
    depends:
      - Update repositories
|
||||
|
|
@ -1,5 +1,6 @@
|
|||
import os
|
||||
import pathlib
|
||||
import shutil
|
||||
from dataclasses import dataclass, field
|
||||
|
||||
import toml
|
||||
|
|
@ -22,13 +23,21 @@ if __name__ == "__main__":
|
|||
args = parse_args()
|
||||
|
||||
config = toml.load(args.config_file)
|
||||
repositories = config["repositories"]
|
||||
repositories = config["projects"]
|
||||
|
||||
for x in pathlib.Path(args.data_dir).glob("*"):
|
||||
if x.is_dir() and x.name not in repositories:
|
||||
print("Remove unused dir:", x)
|
||||
shutil.rmtree(str(x))
|
||||
elif x.is_file():
|
||||
print("Remove unused file:", x)
|
||||
x.unlink()
|
||||
|
||||
for name, config in repositories.items():
|
||||
path = pathlib.Path(args.data_dir, name)
|
||||
if path.is_dir():
|
||||
repo = Repo(path)
|
||||
else:
|
||||
Repo.clone_from(config["url"], path.absolute(), depth=1)
|
||||
Repo.clone_from(config["git_url"], path.absolute(), depth=1)
|
||||
|
||||
os.system(f"gitup {args.data_dir}")
|
||||
|
|
|
|||
Loading…
Reference in New Issue