feat: integrate with meilisearch for snippet retrieval (#85)
* feat: extract json dump for dataset building
* feat: integrate with meilisearch for code index
* condition changes for meili with FLAG
add-tracing
parent
9c4b13174e
commit
aa5608fcb3
|
|
@ -61,6 +61,12 @@ RUN <<EOF
|
|||
rm caddy.tar.gz README.md LICENSE
|
||||
EOF
|
||||
|
||||
# Install meilisearch
|
||||
RUN <<EOF
|
||||
curl -L https://install.meilisearch.com | bash
|
||||
mv meilisearch ~/.bin/
|
||||
EOF
|
||||
|
||||
# Setup file permissions
|
||||
USER root
|
||||
RUN mkdir -p /var/lib/vector
|
||||
|
|
|
|||
|
|
@ -17,3 +17,6 @@ services:
|
|||
STREAMLIT_HIDE_MENU: false
|
||||
volumes:
|
||||
- ../tabby:/home/app/tabby
|
||||
ports:
|
||||
- 8083:8083
|
||||
- 8084:8084
|
||||
|
|
|
|||
|
|
@ -1,14 +1,21 @@
|
|||
import os
|
||||
|
||||
import streamlit as st
|
||||
from utils.service_info import ServiceInfo
|
||||
from utils.streamlit import set_page_config
|
||||
|
||||
# Known tabby sidecar services and the endpoints used to health-check them.
SERVICES = [
    ServiceInfo(label="server", health_url="http://localhost:8081"),
    ServiceInfo(label="triton", health_url="http://localhost:8002/metrics"),
    ServiceInfo(label="vector", health_url="http://localhost:8686/health"),
    ServiceInfo(label="dagu", health_url="http://localhost:8083"),
]

# Meilisearch is gated behind a feature flag. Environment variable values are
# always strings, so the service is registered whenever the flag is set to any
# non-empty value (flags.sh defaults it to ""). Note that "0" or "false" would
# also count as enabled — set the variable empty to disable.
if os.environ.get("FLAGS_enable_meilisearch"):
    SERVICES.append(
        ServiceInfo(label="meilisearch", health_url="http://localhost:8084")
    )
|
||||
|
||||
|
||||
def make_badge_markdown(x: ServiceInfo):
    """Render *x* as a markdown badge snippet.

    NOTE(review): the body returns an empty f-string — the badge template
    appears to be missing (possibly lost in extraction); confirm against
    the upstream source before relying on this output.
    """
    return f""
|
||||
|
|
|
|||
|
|
@ -0,0 +1 @@
|
|||
# Feature flag: set to any non-empty value to enable the meilisearch
# integration (supervisord program + index refresh task). Empty = disabled.
export FLAGS_enable_meilisearch=""
|
||||
|
|
@ -1,18 +1,23 @@
|
|||
#!/bin/bash
|
||||
set -e
|
||||
|
||||
# Shared environment variables
|
||||
export LOGS_DIR="${LOGS_DIR:-/data/logs}"
|
||||
export DB_FILE="${DB_FILE:-/data/logs/duckdb/duck.db}"
|
||||
export CONFIG_FILE=${CONFIG_FILE:-/data/config/tabby.toml}
|
||||
# import flags
|
||||
SCRIPT_DIR=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
|
||||
source "$SCRIPT_DIR/flags.sh"
|
||||
|
||||
# server
|
||||
# INPUT ENVIRONMENT ARGS
|
||||
export DATA_DIR="${DATA_DIR:-/data}"
|
||||
export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
|
||||
export MODEL_BACKEND="${MODEL_BACKEND:-python}"
|
||||
|
||||
# Shared environment variables
|
||||
export LOGS_DIR="$DATA_DIR/logs"
|
||||
export DB_FILE="$LOGS_DIR/duckdb/duck.db"
|
||||
export CONFIG_FILE="$DATA_DIR/config/tabby.toml"
|
||||
|
||||
# projects
|
||||
export GIT_REPOSITORIES_DIR="${REPOSITORIES_DIR:-/data/repositories}"
|
||||
export DATASET_DIR="${REPOSITORIES_DIR:-/data/dataset}"
|
||||
export GIT_REPOSITORIES_DIR="$DATA_DIR/repositories"
|
||||
export DATASET_DIR="$DATA_DIR/dataset"
|
||||
|
||||
# dagu
|
||||
export DAGU_DAGS="tabby/tasks"
|
||||
|
|
@ -54,6 +59,17 @@ command=caddy run --config tabby/config/Caddyfile $CADDY_ARGS
|
|||
EOF
|
||||
}
|
||||
|
||||
program:meilisearch() {
    # Emit a supervisord [program] section for meilisearch on stdout, but
    # only when the FLAGS_enable_meilisearch flag holds a non-empty value;
    # otherwise print nothing so the service is simply not managed.
    local meili_dir="$DATA_DIR/meili"

    if [[ -n "${FLAGS_enable_meilisearch}" ]]; then
        cat <<EOF
[program:meilisearch]
command=meilisearch --http-addr 0.0.0.0:8084 --db-path ${meili_dir}/data.ms --dump-dir ${meili_dir}/dumps/
EOF
    fi
}
|
||||
|
||||
supervisor() {
|
||||
# Create logs dir if not exists.
|
||||
mkdir -p ${LOGS_DIR}
|
||||
|
|
@ -81,6 +97,8 @@ command=dagu server --host 0.0.0.0 --port 8083
|
|||
$(program:triton)
|
||||
|
||||
$(program:caddy)
|
||||
|
||||
$(program:meilisearch)
|
||||
EOF
|
||||
)
|
||||
}
|
||||
|
|
|
|||
|
|
@ -7,6 +7,7 @@ env:
|
|||
- GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
|
||||
- DATASET_DIR: "$DATASET_DIR"
|
||||
- HOME: "$HOME"
|
||||
- FLAGS_enable_meilisearch: "$FLAGS_enable_meilisearch"
|
||||
steps:
|
||||
- name: update repositories
|
||||
dir: $APP_DIR
|
||||
|
|
@ -17,3 +18,14 @@ steps:
|
|||
command: python -m tabby.tools.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
|
||||
depends:
|
||||
- update repositories
|
||||
|
||||
- name: refresh index
|
||||
dir: $APP_DIR
|
||||
preconditions:
|
||||
- condition: "$FLAGS_enable_meilisearch"
|
||||
expected: "1"
|
||||
depends:
|
||||
- generate dataset
|
||||
command: |
|
||||
curl -X DELETE 'http://localhost:8084/indexes/dataset/documents'
|
||||
curl -X POST 'http://localhost:8084/indexes/dataset/documents?primaryKey=id' -H 'Content-Type: application/x-ndjson' --data-binary @$DATASET_DIR/dumps.json
|
||||
|
|
|
|||
|
|
@ -1,3 +1,4 @@
|
|||
import base64
|
||||
import glob
|
||||
import json
|
||||
import os
|
||||
|
|
@ -39,11 +40,10 @@ def read_extension_to_language_mappings():
|
|||
return mappings
|
||||
|
||||
|
||||
def dataset_iter(files):
|
||||
def dataset_iter(project_dir, files):
|
||||
def gen():
|
||||
mappings = read_extension_to_language_mappings()
|
||||
for x in files:
|
||||
print("Processing", x)
|
||||
_, extname = os.path.splitext(x)
|
||||
|
||||
with open(x) as f:
|
||||
|
|
@ -53,7 +53,13 @@ def dataset_iter(files):
|
|||
print("Cannot decode unicode", x)
|
||||
continue
|
||||
|
||||
segments = x.removeprefix(project_dir).split(os.sep)
|
||||
project = segments[1]
|
||||
file = os.path.join(*segments[2:])
|
||||
yield dict(
|
||||
id=to_id(project, file),
|
||||
project=project,
|
||||
file=file,
|
||||
language=mappings[extname],
|
||||
content=content,
|
||||
**metrics.compute(content),
|
||||
|
|
@ -73,6 +79,11 @@ def count_by_language(dataset):
|
|||
return df
|
||||
|
||||
|
||||
def to_id(*args):
    """Build a stable, URL-safe identifier from the given string parts.

    The parts are joined with ":" and base64url-encoded; the trailing "="
    padding is stripped so the id can be embedded in URLs and paths.
    """
    joined = ":".join(args).encode("utf-8")
    encoded = base64.urlsafe_b64encode(joined).decode("utf-8")
    return encoded.rstrip("=")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
valid_extensions = read_valid_extensions()
|
||||
|
||||
|
|
@ -91,8 +102,9 @@ if __name__ == "__main__":
|
|||
filter(is_valid_file, glob.glob(args.project_dir + "/**/*", recursive=True))
|
||||
)
|
||||
|
||||
ds = Dataset.from_generator(dataset_iter(files))
|
||||
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
|
||||
ds.save_to_disk(args.output_dir)
|
||||
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
|
||||
|
||||
print("\n## Summary")
|
||||
print("Number of source files", len(ds))
|
||||
|
|
|
|||
|
|
@ -1,32 +1,28 @@
|
|||
{
|
||||
"Assembly": [".asm"],
|
||||
"Batchfile": [".bat", ".cmd"],
|
||||
"C": [".c", ".h"],
|
||||
"C#": [".cs"],
|
||||
"C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
|
||||
"CMake": [".cmake"],
|
||||
"CSS": [".css"],
|
||||
"Dockerfile": [".dockerfile", "Dockerfile"],
|
||||
"FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
|
||||
"Go": [".go"],
|
||||
"Haskell": [".hs"],
|
||||
"HTML": [".html"],
|
||||
"Java": [".java"],
|
||||
"JavaScript": [".js"],
|
||||
"Julia": [".jl"],
|
||||
"Lua": [".lua"],
|
||||
"Makefile": ["Makefile"],
|
||||
"Markdown": [".md", ".markdown"],
|
||||
"PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
|
||||
"Perl": [".pl", ".pm", ".pod", ".perl"],
|
||||
"PowerShell": [".ps1", ".psd1", ".psm1"],
|
||||
"Python": [".py"],
|
||||
"Ruby": [".rb"],
|
||||
"Rust": [".rs"],
|
||||
"SQL": [".sql"],
|
||||
"Scala": [".scala"],
|
||||
"Shell": [".sh", ".bash", ".command", ".zsh"],
|
||||
"TypeScript": [".ts", ".tsx"],
|
||||
"TeX": [".tex"],
|
||||
"Visual Basic": [".vb"]
|
||||
"c": [".c", ".h"],
|
||||
"csharp": [".cs"],
|
||||
"cpp": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
|
||||
"css": [".css"],
|
||||
"dockerfile": ["Dockerfile"],
|
||||
"go": [".go"],
|
||||
"haskell": [".hs"],
|
||||
"html": [".html"],
|
||||
"java": [".java"],
|
||||
"javascript": [".js"],
|
||||
"julia": [".jl"],
|
||||
"lua": [".lua"],
|
||||
"makefile": ["Makefile"],
|
||||
"markdown": [".md", ".markdown"],
|
||||
"php": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
|
||||
"perl": [".pl", ".pm", ".pod", ".perl"],
|
||||
"powershell": [".ps1", ".psd1", ".psm1"],
|
||||
"python": [".py"],
|
||||
"ruby": [".rb"],
|
||||
"rust": [".rs"],
|
||||
"sql": [".sql"],
|
||||
"scala": [".scala"],
|
||||
"shellscript": [".sh", ".bash", ".command", ".zsh"],
|
||||
"typescript": [".ts", ".tsx"],
|
||||
"tex": [".tex"],
|
||||
"vb": [".vb"]
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue