feat: integrate with meilisearch for snippet retrieval (#85)

* feat: extract json dump for dataset building

* feat: integrate with meilisearch for code index

* condition changes for meili with FLAG
add-tracing
Meng Zhang 2023-04-12 13:18:50 +08:00 committed by GitHub
parent 9c4b13174e
commit aa5608fcb3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 96 additions and 41 deletions

View File

@ -61,6 +61,12 @@ RUN <<EOF
rm caddy.tar.gz README.md LICENSE
EOF
# Install meilisearch: run the installer script fetched from install.meilisearch.com,
# then move the `meilisearch` file it leaves in the working directory into ~/.bin/.
# NOTE(review): the script is piped straight into bash with no version pin or
# checksum — confirm this is acceptable for reproducible image builds.
RUN <<EOF
curl -L https://install.meilisearch.com | bash
mv meilisearch ~/.bin/
EOF
# Setup file permissions
USER root
RUN mkdir -p /var/lib/vector

View File

@ -17,3 +17,6 @@ services:
STREAMLIT_HIDE_MENU: false
volumes:
- ../tabby:/home/app/tabby
ports:
- 8083:8083
- 8084:8084

View File

@ -1,14 +1,21 @@
import os
import streamlit as st
from utils.service_info import ServiceInfo
from utils.streamlit import set_page_config
# Services known to this module, each described by a label and a health-check URL.
# NOTE(review): "server" is listed twice (first and last entry) — this looks like the
# removed and added sides of a diff shown together; confirm only one entry is intended.
SERVICES = [
ServiceInfo(label="server", health_url="http://localhost:8081"),
ServiceInfo(label="triton", health_url="http://localhost:8002/metrics"),
ServiceInfo(label="vector", health_url="http://localhost:8686/health"),
ServiceInfo(label="dagu", health_url="http://localhost:8083"),
ServiceInfo(label="server", health_url="http://localhost:8081"),
]
# meilisearch is added only when FLAGS_enable_meilisearch is present in the environment
# with a non-empty value. Any non-empty string counts as enabled, including "0" or "false".
if os.environ.get("FLAGS_enable_meilisearch", False):
SERVICES.append(
ServiceInfo(label="meilisearch", health_url="http://localhost:8084")
)
def make_badge_markdown(x: ServiceInfo):
    """Return a Markdown image tag for a service.

    The service's ``label`` becomes the image alt text and its ``badge_url``
    becomes the image source.
    """
    alt_text = x.label
    image_src = x.badge_url
    return f"![{alt_text}]({image_src})"

1
tabby/scripts/flags.sh Normal file
View File

@ -0,0 +1 @@
# Feature flag for the Meilisearch integration.
# Empty (the default) means disabled. Set it to 1 to enable: the "refresh index"
# task step only runs when the value is exactly "1".
# A value already present in the environment is kept rather than overwritten, so
# the flag can be turned on from outside (e.g. container env) without editing this file.
export FLAGS_enable_meilisearch="${FLAGS_enable_meilisearch:-}"

View File

@ -1,18 +1,23 @@
#!/bin/bash
set -e
# Shared environment variables
export LOGS_DIR="${LOGS_DIR:-/data/logs}"
export DB_FILE="${DB_FILE:-/data/logs/duckdb/duck.db}"
export CONFIG_FILE=${CONFIG_FILE:-/data/config/tabby.toml}
# import flags
SCRIPT_DIR=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
source "$SCRIPT_DIR/flags.sh"
# server
# INPUT ENVIRONMENT ARGS
export DATA_DIR="${DATA_DIR:-/data}"
export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
export MODEL_BACKEND="${MODEL_BACKEND:-python}"
# Shared environment variables
export LOGS_DIR="$DATA_DIR/logs"
export DB_FILE="$LOGS_DIR/duckdb/duck.db"
export CONFIG_FILE="$DATA_DIR/config/tabby.toml"
# projects
export GIT_REPOSITORIES_DIR="${REPOSITORIES_DIR:-/data/repositories}"
export DATASET_DIR="${REPOSITORIES_DIR:-/data/dataset}"
export GIT_REPOSITORIES_DIR="$DATA_DIR/repositories"
export DATASET_DIR="$DATA_DIR/dataset"
# dagu
export DAGU_DAGS="tabby/tasks"
@ -54,6 +59,17 @@ command=caddy run --config tabby/config/Caddyfile $CADDY_ARGS
EOF
}
program:meilisearch() {
  # Print the supervisor [program:meilisearch] section on stdout.
  # Prints nothing (and succeeds) when FLAGS_enable_meilisearch is empty or unset.
  [[ -n "${FLAGS_enable_meilisearch}" ]] || return 0

  # The database and dumps both live under $DATA_DIR/meili.
  local meili_root="$DATA_DIR/meili"

  printf '%s\n' \
    "[program:meilisearch]" \
    "command=meilisearch --http-addr 0.0.0.0:8084 --db-path ${meili_root}/data.ms --dump-dir ${meili_root}/dumps/"
}
supervisor() {
# Create logs dir if not exists.
mkdir -p ${LOGS_DIR}
@ -81,6 +97,8 @@ command=dagu server --host 0.0.0.0 --port 8083
$(program:triton)
$(program:caddy)
$(program:meilisearch)
EOF
)
}

View File

@ -7,6 +7,7 @@ env:
- GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
- DATASET_DIR: "$DATASET_DIR"
- HOME: "$HOME"
- FLAGS_enable_meilisearch: "$FLAGS_enable_meilisearch"
steps:
- name: update repositories
dir: $APP_DIR
@ -17,3 +18,14 @@ steps:
command: python -m tabby.tools.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
depends:
- update repositories
# Reload the Meilisearch `dataset` index from the generated dump.
# Runs after "generate dataset", and only when FLAGS_enable_meilisearch is exactly "1".
# The first request sends DELETE to the index's documents endpoint; the second posts
# $DATASET_DIR/dumps.json as NDJSON with `id` as the primary key.
# NOTE(review): neither curl call passes --fail, so an HTTP error status presumably
# does not fail this step — confirm.
- name: refresh index
dir: $APP_DIR
preconditions:
- condition: "$FLAGS_enable_meilisearch"
expected: "1"
depends:
- generate dataset
command: |
curl -X DELETE 'http://localhost:8084/indexes/dataset/documents'
curl -X POST 'http://localhost:8084/indexes/dataset/documents?primaryKey=id' -H 'Content-Type: application/x-ndjson' --data-binary @$DATASET_DIR/dumps.json

View File

@ -1,3 +1,4 @@
import base64
import glob
import json
import os
@ -39,11 +40,10 @@ def read_extension_to_language_mappings():
return mappings
def dataset_iter(files):
def dataset_iter(project_dir, files):
def gen():
mappings = read_extension_to_language_mappings()
for x in files:
print("Processing", x)
_, extname = os.path.splitext(x)
with open(x) as f:
@ -53,7 +53,13 @@ def dataset_iter(files):
print("Cannot decode unicode", x)
continue
segments = x.removeprefix(project_dir).split(os.sep)
project = segments[1]
file = os.path.join(*segments[2:])
yield dict(
id=to_id(project, file),
project=project,
file=file,
language=mappings[extname],
content=content,
**metrics.compute(content),
@ -73,6 +79,11 @@ def count_by_language(dataset):
return df
def to_id(*args):
    """Build an identifier string from one or more string parts.

    The parts are joined with ":", encoded as UTF-8, and rendered as URL-safe
    base64 with the trailing "=" padding removed.
    """
    joined = ":".join(args)
    encoded = base64.urlsafe_b64encode(joined.encode("utf-8"))
    return encoded.decode("utf-8").rstrip("=")
if __name__ == "__main__":
valid_extensions = read_valid_extensions()
@ -91,8 +102,9 @@ if __name__ == "__main__":
filter(is_valid_file, glob.glob(args.project_dir + "/**/*", recursive=True))
)
ds = Dataset.from_generator(dataset_iter(files))
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
ds.save_to_disk(args.output_dir)
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
print("\n## Summary")
print("Number of source files", len(ds))

View File

@ -1,32 +1,28 @@
{
"Assembly": [".asm"],
"Batchfile": [".bat", ".cmd"],
"C": [".c", ".h"],
"C#": [".cs"],
"C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
"CMake": [".cmake"],
"CSS": [".css"],
"Dockerfile": [".dockerfile", "Dockerfile"],
"FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
"Go": [".go"],
"Haskell": [".hs"],
"HTML": [".html"],
"Java": [".java"],
"JavaScript": [".js"],
"Julia": [".jl"],
"Lua": [".lua"],
"Makefile": ["Makefile"],
"Markdown": [".md", ".markdown"],
"PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
"Perl": [".pl", ".pm", ".pod", ".perl"],
"PowerShell": [".ps1", ".psd1", ".psm1"],
"Python": [".py"],
"Ruby": [".rb"],
"Rust": [".rs"],
"SQL": [".sql"],
"Scala": [".scala"],
"Shell": [".sh", ".bash", ".command", ".zsh"],
"TypeScript": [".ts", ".tsx"],
"TeX": [".tex"],
"Visual Basic": [".vb"]
"c": [".c", ".h"],
"csharp": [".cs"],
"cpp": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
"css": [".css"],
"dockerfile": ["Dockerfile"],
"go": [".go"],
"haskell": [".hs"],
"html": [".html"],
"java": [".java"],
"javascript": [".js"],
"julia": [".jl"],
"lua": [".lua"],
"makefile": ["Makefile"],
"markdown": [".md", ".markdown"],
"php": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
"perl": [".pl", ".pm", ".pod", ".perl"],
"powershell": [".ps1", ".psd1", ".psm1"],
"python": [".py"],
"ruby": [".rb"],
"rust": [".rs"],
"sql": [".sql"],
"scala": [".scala"],
"shellscript": [".sh", ".bash", ".command", ".zsh"],
"typescript": [".ts", ".tsx"],
"tex": [".tex"],
"vb": [".vb"]
}