feat: integrate with meilisearch for snippet retrieval (#85)
* feat: extract json dump for dataset building
* feat: integrate with meilisearch for code index
* condition changes for meili with FLAG

add-tracing
parent
9c4b13174e
commit
aa5608fcb3
|
|
@@ -61,6 +61,12 @@ RUN <<EOF
|
||||||
rm caddy.tar.gz README.md LICENSE
|
rm caddy.tar.gz README.md LICENSE
|
||||||
EOF
|
EOF
|
||||||
|
|
||||||
|
# Install meilisearch
|
||||||
|
RUN <<EOF
|
||||||
|
curl -L https://install.meilisearch.com | bash
|
||||||
|
mv meilisearch ~/.bin/
|
||||||
|
EOF
|
||||||
|
|
||||||
# Setup file permissions
|
# Setup file permissions
|
||||||
USER root
|
USER root
|
||||||
RUN mkdir -p /var/lib/vector
|
RUN mkdir -p /var/lib/vector
|
||||||
|
|
|
||||||
|
|
@@ -17,3 +17,6 @@ services:
|
||||||
STREAMLIT_HIDE_MENU: false
|
STREAMLIT_HIDE_MENU: false
|
||||||
volumes:
|
volumes:
|
||||||
- ../tabby:/home/app/tabby
|
- ../tabby:/home/app/tabby
|
||||||
|
ports:
|
||||||
|
- 8083:8083
|
||||||
|
- 8084:8084
|
||||||
|
|
|
||||||
|
|
@@ -1,14 +1,21 @@
|
||||||
|
import os
|
||||||
|
|
||||||
import streamlit as st
|
import streamlit as st
|
||||||
from utils.service_info import ServiceInfo
|
from utils.service_info import ServiceInfo
|
||||||
from utils.streamlit import set_page_config
|
from utils.streamlit import set_page_config
|
||||||
|
|
||||||
SERVICES = [
|
SERVICES = [
|
||||||
|
ServiceInfo(label="server", health_url="http://localhost:8081"),
|
||||||
ServiceInfo(label="triton", health_url="http://localhost:8002/metrics"),
|
ServiceInfo(label="triton", health_url="http://localhost:8002/metrics"),
|
||||||
ServiceInfo(label="vector", health_url="http://localhost:8686/health"),
|
ServiceInfo(label="vector", health_url="http://localhost:8686/health"),
|
||||||
ServiceInfo(label="dagu", health_url="http://localhost:8083"),
|
ServiceInfo(label="dagu", health_url="http://localhost:8083"),
|
||||||
ServiceInfo(label="server", health_url="http://localhost:8081"),
|
|
||||||
]
|
]
|
||||||
|
|
||||||
|
if os.environ.get("FLAGS_enable_meilisearch", False):
|
||||||
|
SERVICES.append(
|
||||||
|
ServiceInfo(label="meilisearch", health_url="http://localhost:8084")
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
def make_badge_markdown(x: ServiceInfo):
|
def make_badge_markdown(x: ServiceInfo):
|
||||||
return f""
|
return f""
|
||||||
|
|
|
||||||
|
|
@@ -0,0 +1 @@
|
||||||
|
export FLAGS_enable_meilisearch=""
|
||||||
|
|
@@ -1,18 +1,23 @@
|
||||||
#!/bin/bash
|
#!/bin/bash
|
||||||
set -e
|
set -e
|
||||||
|
|
||||||
# Shared environment variables
|
# import flags
|
||||||
export LOGS_DIR="${LOGS_DIR:-/data/logs}"
|
SCRIPT_DIR=$( cd "$(dirname "${BASH_SOURCE[0]}")" ; pwd -P )
|
||||||
export DB_FILE="${DB_FILE:-/data/logs/duckdb/duck.db}"
|
source "$SCRIPT_DIR/flags.sh"
|
||||||
export CONFIG_FILE=${CONFIG_FILE:-/data/config/tabby.toml}
|
|
||||||
|
|
||||||
# server
|
# INPUT ENVIRONMENT ARGS
|
||||||
|
export DATA_DIR="${DATA_DIR:-/data}"
|
||||||
export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
|
export MODEL_NAME="${MODEL_NAME:-TabbyML/J-350M}"
|
||||||
export MODEL_BACKEND="${MODEL_BACKEND:-python}"
|
export MODEL_BACKEND="${MODEL_BACKEND:-python}"
|
||||||
|
|
||||||
|
# Shared environment variables
|
||||||
|
export LOGS_DIR="$DATA_DIR/logs"
|
||||||
|
export DB_FILE="$LOGS_DIR/duckdb/duck.db"
|
||||||
|
export CONFIG_FILE="$DATA_DIR/config/tabby.toml"
|
||||||
|
|
||||||
# projects
|
# projects
|
||||||
export GIT_REPOSITORIES_DIR="${REPOSITORIES_DIR:-/data/repositories}"
|
export GIT_REPOSITORIES_DIR="$DATA_DIR/repositories"
|
||||||
export DATASET_DIR="${REPOSITORIES_DIR:-/data/dataset}"
|
export DATASET_DIR="$DATA_DIR/dataset"
|
||||||
|
|
||||||
# dagu
|
# dagu
|
||||||
export DAGU_DAGS="tabby/tasks"
|
export DAGU_DAGS="tabby/tasks"
|
||||||
|
|
@@ -54,6 +59,17 @@ command=caddy run --config tabby/config/Caddyfile $CADDY_ARGS
|
||||||
EOF
|
EOF
|
||||||
}
|
}
|
||||||
|
|
||||||
|
program:meilisearch() {
|
||||||
|
local MEILI_DIR="$DATA_DIR/meili"
|
||||||
|
|
||||||
|
if [[ ! -z ${FLAGS_enable_meilisearch} ]]; then
|
||||||
|
cat <<EOF
|
||||||
|
[program:meilisearch]
|
||||||
|
command=meilisearch --http-addr 0.0.0.0:8084 --db-path ${MEILI_DIR}/data.ms --dump-dir ${MEILI_DIR}/dumps/
|
||||||
|
EOF
|
||||||
|
fi
|
||||||
|
}
|
||||||
|
|
||||||
supervisor() {
|
supervisor() {
|
||||||
# Create logs dir if not exists.
|
# Create logs dir if not exists.
|
||||||
mkdir -p ${LOGS_DIR}
|
mkdir -p ${LOGS_DIR}
|
||||||
|
|
@@ -81,6 +97,8 @@ command=dagu server --host 0.0.0.0 --port 8083
|
||||||
$(program:triton)
|
$(program:triton)
|
||||||
|
|
||||||
$(program:caddy)
|
$(program:caddy)
|
||||||
|
|
||||||
|
$(program:meilisearch)
|
||||||
EOF
|
EOF
|
||||||
)
|
)
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@@ -7,6 +7,7 @@ env:
|
||||||
- GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
|
- GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR"
|
||||||
- DATASET_DIR: "$DATASET_DIR"
|
- DATASET_DIR: "$DATASET_DIR"
|
||||||
- HOME: "$HOME"
|
- HOME: "$HOME"
|
||||||
|
- FLAGS_enable_meilisearch: "$FLAGS_enable_meilisearch"
|
||||||
steps:
|
steps:
|
||||||
- name: update repositories
|
- name: update repositories
|
||||||
dir: $APP_DIR
|
dir: $APP_DIR
|
||||||
|
|
@@ -17,3 +18,14 @@ steps:
|
||||||
command: python -m tabby.tools.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
|
command: python -m tabby.tools.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR
|
||||||
depends:
|
depends:
|
||||||
- update repositories
|
- update repositories
|
||||||
|
|
||||||
|
- name: refresh index
|
||||||
|
dir: $APP_DIR
|
||||||
|
preconditions:
|
||||||
|
- condition: "$FLAGS_enable_meilisearch"
|
||||||
|
expected: "1"
|
||||||
|
depends:
|
||||||
|
- generate dataset
|
||||||
|
command: |
|
||||||
|
curl -X DELETE 'http://localhost:8084/indexes/dataset/documents'
|
||||||
|
curl -X POST 'http://localhost:8084/indexes/dataset/documents?primaryKey=id' -H 'Content-Type: application/x-ndjson' --data-binary @$DATASET_DIR/dumps.json
|
||||||
|
|
|
||||||
|
|
@@ -1,3 +1,4 @@
|
||||||
|
import base64
|
||||||
import glob
|
import glob
|
||||||
import json
|
import json
|
||||||
import os
|
import os
|
||||||
|
|
@@ -39,11 +40,10 @@ def read_extension_to_language_mappings():
|
||||||
return mappings
|
return mappings
|
||||||
|
|
||||||
|
|
||||||
def dataset_iter(files):
|
def dataset_iter(project_dir, files):
|
||||||
def gen():
|
def gen():
|
||||||
mappings = read_extension_to_language_mappings()
|
mappings = read_extension_to_language_mappings()
|
||||||
for x in files:
|
for x in files:
|
||||||
print("Processing", x)
|
|
||||||
_, extname = os.path.splitext(x)
|
_, extname = os.path.splitext(x)
|
||||||
|
|
||||||
with open(x) as f:
|
with open(x) as f:
|
||||||
|
|
@@ -53,7 +53,13 @@ def dataset_iter(files):
|
||||||
print("Cannot decode unicode", x)
|
print("Cannot decode unicode", x)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
segments = x.removeprefix(project_dir).split(os.sep)
|
||||||
|
project = segments[1]
|
||||||
|
file = os.path.join(*segments[2:])
|
||||||
yield dict(
|
yield dict(
|
||||||
|
id=to_id(project, file),
|
||||||
|
project=project,
|
||||||
|
file=file,
|
||||||
language=mappings[extname],
|
language=mappings[extname],
|
||||||
content=content,
|
content=content,
|
||||||
**metrics.compute(content),
|
**metrics.compute(content),
|
||||||
|
|
@@ -73,6 +79,11 @@ def count_by_language(dataset):
|
||||||
return df
|
return df
|
||||||
|
|
||||||
|
|
||||||
|
def to_id(*args):
|
||||||
|
token = ":".join(args)
|
||||||
|
return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
valid_extensions = read_valid_extensions()
|
valid_extensions = read_valid_extensions()
|
||||||
|
|
||||||
|
|
@@ -91,8 +102,9 @@ if __name__ == "__main__":
|
||||||
filter(is_valid_file, glob.glob(args.project_dir + "/**/*", recursive=True))
|
filter(is_valid_file, glob.glob(args.project_dir + "/**/*", recursive=True))
|
||||||
)
|
)
|
||||||
|
|
||||||
ds = Dataset.from_generator(dataset_iter(files))
|
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
|
||||||
ds.save_to_disk(args.output_dir)
|
ds.save_to_disk(args.output_dir)
|
||||||
|
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
|
||||||
|
|
||||||
print("\n## Summary")
|
print("\n## Summary")
|
||||||
print("Number of source files", len(ds))
|
print("Number of source files", len(ds))
|
||||||
|
|
|
||||||
|
|
@@ -1,32 +1,28 @@
|
||||||
{
|
{
|
||||||
"Assembly": [".asm"],
|
"c": [".c", ".h"],
|
||||||
"Batchfile": [".bat", ".cmd"],
|
"csharp": [".cs"],
|
||||||
"C": [".c", ".h"],
|
"cpp": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
|
||||||
"C#": [".cs"],
|
"css": [".css"],
|
||||||
"C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
|
"dockerfile": ["Dockerfile"],
|
||||||
"CMake": [".cmake"],
|
"go": [".go"],
|
||||||
"CSS": [".css"],
|
"haskell": [".hs"],
|
||||||
"Dockerfile": [".dockerfile", "Dockerfile"],
|
"html": [".html"],
|
||||||
"FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
|
"java": [".java"],
|
||||||
"Go": [".go"],
|
"javascript": [".js"],
|
||||||
"Haskell": [".hs"],
|
"julia": [".jl"],
|
||||||
"HTML": [".html"],
|
"lua": [".lua"],
|
||||||
"Java": [".java"],
|
"makefile": ["Makefile"],
|
||||||
"JavaScript": [".js"],
|
"markdown": [".md", ".markdown"],
|
||||||
"Julia": [".jl"],
|
"php": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
|
||||||
"Lua": [".lua"],
|
"perl": [".pl", ".pm", ".pod", ".perl"],
|
||||||
"Makefile": ["Makefile"],
|
"powershell": [".ps1", ".psd1", ".psm1"],
|
||||||
"Markdown": [".md", ".markdown"],
|
"python": [".py"],
|
||||||
"PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
|
"ruby": [".rb"],
|
||||||
"Perl": [".pl", ".pm", ".pod", ".perl"],
|
"rust": [".rs"],
|
||||||
"PowerShell": [".ps1", ".psd1", ".psm1"],
|
"sql": [".sql"],
|
||||||
"Python": [".py"],
|
"scala": [".scala"],
|
||||||
"Ruby": [".rb"],
|
"shellscript": [".sh", ".bash", ".command", ".zsh"],
|
||||||
"Rust": [".rs"],
|
"typescript": [".ts", ".tsx"],
|
||||||
"SQL": [".sql"],
|
"tex": [".tex"],
|
||||||
"Scala": [".scala"],
|
"vb": [".vb"]
|
||||||
"Shell": [".sh", ".bash", ".command", ".zsh"],
|
|
||||||
"TypeScript": [".ts", ".tsx"],
|
|
||||||
"TeX": [".tex"],
|
|
||||||
"Visual Basic": [".vb"]
|
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue