feat: add /experimental/search endpoint (#258)

* feat: add /experimental/search endpoint

* fix format
sweep/improve-logging-information
Meng Zhang 2023-06-22 13:47:32 -07:00 committed by GitHub
parent 8c1f6c2806
commit 04980160e5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 312 additions and 58 deletions

179
Cargo.lock generated
View File

@ -30,11 +30,11 @@ dependencies = [
[[package]] [[package]]
name = "ahash" name = "ahash"
version = "0.7.6" version = "0.8.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f"
dependencies = [ dependencies = [
"getrandom", "cfg-if",
"once_cell", "once_cell",
"version_check", "version_check",
] ]
@ -711,7 +711,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
dependencies = [ dependencies = [
"cfg-if", "cfg-if",
"hashbrown", "hashbrown 0.12.3",
"lock_api", "lock_api",
"once_cell", "once_cell",
"parking_lot_core", "parking_lot_core",
@ -862,20 +862,6 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04" checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04"
[[package]]
name = "fastfield_codecs"
version = "0.3.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "374a3a53c1bd5fb31b10084229290eafb0a05f260ec90f1f726afffda4877a8a"
dependencies = [
"fastdivide",
"itertools 0.10.5",
"log",
"ownedbytes",
"tantivy-bitpacker",
"tantivy-common",
]
[[package]] [[package]]
name = "fastrand" name = "fastrand"
version = "1.9.0" version = "1.9.0"
@ -972,6 +958,16 @@ dependencies = [
"winapi", "winapi",
] ]
[[package]]
name = "fs4"
version = "0.6.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7672706608ecb74ab2e055c68327ffc25ae4cac1e12349204fd5fb0f3487cce2"
dependencies = [
"rustix",
"windows-sys 0.48.0",
]
[[package]] [[package]]
name = "futures" name = "futures"
version = "0.3.28" version = "0.3.28"
@ -1131,6 +1127,12 @@ name = "hashbrown"
version = "0.12.3" version = "0.12.3"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888"
[[package]]
name = "hashbrown"
version = "0.13.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e"
dependencies = [ dependencies = [
"ahash", "ahash",
] ]
@ -1306,7 +1308,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99"
dependencies = [ dependencies = [
"autocfg", "autocfg",
"hashbrown", "hashbrown 0.12.3",
"serde", "serde",
] ]
@ -1526,18 +1528,18 @@ dependencies = [
[[package]] [[package]]
name = "lru" name = "lru"
version = "0.7.8" version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a" checksum = "03f1160296536f10c833a82dca22267d5486734230d47bf00bf435885814ba1e"
dependencies = [ dependencies = [
"hashbrown", "hashbrown 0.13.2",
] ]
[[package]] [[package]]
name = "lz4_flex" name = "lz4_flex"
version = "0.9.5" version = "0.10.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "1a8cbbb2831780bc3b9c15a41f5b49222ef756b6730a95f3decfdd15903eb5a3" checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83"
[[package]] [[package]]
name = "macro_rules_attribute" name = "macro_rules_attribute"
@ -1597,9 +1599,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
[[package]] [[package]]
name = "memmap2" name = "memmap2"
version = "0.5.10" version = "0.6.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872"
dependencies = [ dependencies = [
"libc", "libc",
] ]
@ -1694,12 +1696,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a"
[[package]] [[package]]
name = "murmurhash32" name = "murmurhash32"
version = "0.2.0" version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df"
dependencies = [
"byteorder",
]
[[package]] [[package]]
name = "native-tls" name = "native-tls"
@ -1957,9 +1956,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39"
[[package]] [[package]]
name = "ownedbytes" name = "ownedbytes"
version = "0.4.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8e957eaa64a299f39755416e5b3128c505e9d63a91d0453771ad2ccd3907f8db" checksum = "c718e498b20704d5fb5d51d07f414a22f61c19254c1708e117b93fd76860739c"
dependencies = [ dependencies = [
"stable_deref_trait", "stable_deref_trait",
] ]
@ -2661,6 +2660,15 @@ dependencies = [
"libc", "libc",
] ]
[[package]]
name = "sketches-ddsketch"
version = "0.2.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "slab" name = "slab"
version = "0.4.8" version = "0.4.8"
@ -2794,6 +2802,7 @@ dependencies = [
"tabby-common", "tabby-common",
"tabby-download", "tabby-download",
"tabby-scheduler", "tabby-scheduler",
"tantivy",
"tokio", "tokio",
"tower", "tower",
"tower-http 0.4.0", "tower-http 0.4.0",
@ -2855,14 +2864,14 @@ dependencies = [
[[package]] [[package]]
name = "tantivy" name = "tantivy"
version = "0.19.2" version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "5bb26a6b22c84d8be41d99a14016d6f04d30d8d31a2ea411a8ab553af5cc490d" checksum = "aec540e9cebc88f523f67f596dee213e491f0c55961de013566f267a0c31f5e9"
dependencies = [ dependencies = [
"aho-corasick 0.7.20", "aho-corasick 1.0.1",
"arc-swap", "arc-swap",
"async-trait", "async-trait",
"base64 0.13.1", "base64 0.21.2",
"bitpacking", "bitpacking",
"byteorder", "byteorder",
"census", "census",
@ -2871,8 +2880,7 @@ dependencies = [
"downcast-rs", "downcast-rs",
"fail", "fail",
"fastdivide", "fastdivide",
"fastfield_codecs", "fs4",
"fs2",
"htmlescape", "htmlescape",
"itertools 0.10.5", "itertools 0.10.5",
"levenshtein_automata", "levenshtein_automata",
@ -2885,19 +2893,21 @@ dependencies = [
"num_cpus", "num_cpus",
"once_cell", "once_cell",
"oneshot", "oneshot",
"ownedbytes",
"rayon", "rayon",
"regex", "regex",
"rust-stemmers", "rust-stemmers",
"rustc-hash", "rustc-hash",
"serde", "serde",
"serde_json", "serde_json",
"sketches-ddsketch",
"smallvec", "smallvec",
"stable_deref_trait",
"tantivy-bitpacker", "tantivy-bitpacker",
"tantivy-columnar",
"tantivy-common", "tantivy-common",
"tantivy-fst", "tantivy-fst",
"tantivy-query-grammar", "tantivy-query-grammar",
"tantivy-stacker",
"tantivy-tokenizer-api",
"tempfile", "tempfile",
"thiserror", "thiserror",
"time 0.3.21", "time 0.3.21",
@ -2907,18 +2917,40 @@ dependencies = [
[[package]] [[package]]
name = "tantivy-bitpacker" name = "tantivy-bitpacker"
version = "0.3.0" version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e71a0c95b82d4292b097a09b989a6380d28c3a86800c841a2d03bae1fc8b9fa6" checksum = "16099e96f0ede682084469b80d6909dc170aa2b11d2a45538b5b36b2a90090b9"
dependencies = [
"bitpacking",
]
[[package]]
name = "tantivy-columnar"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56e32b024b26eab93eb8648faf08004356bf9d47376557ee4409f4b210163656"
dependencies = [
"fastdivide",
"fnv",
"itertools 0.10.5",
"serde",
"tantivy-bitpacker",
"tantivy-common",
"tantivy-sstable",
"tantivy-stacker",
]
[[package]] [[package]]
name = "tantivy-common" name = "tantivy-common"
version = "0.4.0" version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "14fef4182bb60df9a4b92cd8ecab39ba2e50a05542934af17eef1f49660705cb" checksum = "e7d12fdd6ec0f7e0962f129c03c696a85ec567734950cbb2b89af4a293ce342f"
dependencies = [ dependencies = [
"async-trait",
"byteorder", "byteorder",
"ownedbytes", "ownedbytes",
"serde",
"time 0.3.21",
] ]
[[package]] [[package]]
@ -2934,15 +2966,45 @@ dependencies = [
[[package]] [[package]]
name = "tantivy-query-grammar" name = "tantivy-query-grammar"
version = "0.19.0" version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "343e3ada4c1c480953f6960f8a21ce9c76611480ffdd4f4e230fdddce0fc5331" checksum = "106d8f78ad1da4f0fdd526a0760c326c0573510d4dedabeb1962d35a35879797"
dependencies = [ dependencies = [
"combine", "combine",
"once_cell", "once_cell",
"regex", "regex",
] ]
[[package]]
name = "tantivy-sstable"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "eda34243d3ee64bd8f9ba74a3b0d05f4d07beff7767a727212e9b5a19c13dde7"
dependencies = [
"tantivy-common",
"tantivy-fst",
"zstd 0.12.3+zstd.1.5.2",
]
[[package]]
name = "tantivy-stacker"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "67b9e9470301b026ad3b95f79a791a2a3ee81f3ab16fbe412a9dd81ff834acf5"
dependencies = [
"murmurhash32",
"tantivy-common",
]
[[package]]
name = "tantivy-tokenizer-api"
version = "0.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "8bee3519354cbe6bfff4bf6be9cf3e2dfcd0a6ea748a42312fbf1242c0d66fc6"
dependencies = [
"serde",
]
[[package]] [[package]]
name = "tar" name = "tar"
version = "0.4.38" version = "0.4.38"
@ -4082,7 +4144,7 @@ dependencies = [
"pbkdf2", "pbkdf2",
"sha1", "sha1",
"time 0.3.21", "time 0.3.21",
"zstd", "zstd 0.11.2+zstd.1.5.2",
] ]
[[package]] [[package]]
@ -4091,7 +4153,16 @@ version = "0.11.2+zstd.1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index" source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4"
dependencies = [ dependencies = [
"zstd-safe", "zstd-safe 5.0.2+zstd.1.5.2",
]
[[package]]
name = "zstd"
version = "0.12.3+zstd.1.5.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806"
dependencies = [
"zstd-safe 6.0.5+zstd.1.5.4",
] ]
[[package]] [[package]]
@ -4104,6 +4175,16 @@ dependencies = [
"zstd-sys", "zstd-sys",
] ]
[[package]]
name = "zstd-safe"
version = "6.0.5+zstd.1.5.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d56d9e60b4b1758206c238a10165fbcae3ca37b01744e394c463463f6529d23b"
dependencies = [
"libc",
"zstd-sys",
]
[[package]] [[package]]
name = "zstd-sys" name = "zstd-sys"
version = "2.0.8+zstd.1.5.5" version = "2.0.8+zstd.1.5.5"

View File

@ -24,3 +24,4 @@ tracing = "0.1"
tracing-subscriber = "0.3" tracing-subscriber = "0.3"
anyhow = "1.0.71" anyhow = "1.0.71"
serde-jsonlines = "0.4.0" serde-jsonlines = "0.4.0"
tantivy = "0.20.2"

View File

@ -10,7 +10,7 @@ anyhow = { workspace = true }
filenamify = "0.1.0" filenamify = "0.1.0"
job_scheduler = "1.2.1" job_scheduler = "1.2.1"
tabby-common = { path = "../tabby-common" } tabby-common = { path = "../tabby-common" }
tantivy = "0.19.2" tantivy = { workspace = true }
tracing = { workspace = true } tracing = { workspace = true }
tree-sitter-javascript = "0.20.0" tree-sitter-javascript = "0.20.0"
tree-sitter-tags = "0.20.2" tree-sitter-tags = "0.20.2"

View File

@ -8,6 +8,7 @@ use tantivy::{
schema::{Schema, STORED, STRING, TEXT}, schema::{Schema, STORED, STRING, TEXT},
Index, Index,
}; };
use tracing::info;
pub fn index_repositories(_config: &Config) -> Result<()> { pub fn index_repositories(_config: &Config) -> Result<()> {
let mut builder = Schema::builder(); let mut builder = Schema::builder();
@ -24,15 +25,24 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
writer.delete_all_documents()?; writer.delete_all_documents()?;
for doc in Document::all()? { for doc in Document::all()? {
writer.add_document(doc!( if is_valid_doc(&doc) {
git_url => doc.git_url, writer.add_document(doc!(
filepath => doc.filepath, git_url => doc.git_url,
content => doc.content, filepath => doc.filepath,
language => doc.language, content => doc.content,
))?; language => doc.language,
))?;
} else {
info!("Skip {} - {}", doc.git_url, doc.filepath);
}
} }
info!("Finalize index...");
writer.commit()?; writer.commit()?;
Ok(()) Ok(())
} }
/// Returns `true` when a document passes all three indexing filters:
/// longest line under 1000, average line length under 100, and more than a
/// quarter of its characters alphanumeric.
fn is_valid_doc(x: &Document) -> bool {
    let longest_line_ok = x.max_line_length < 1000;
    let average_line_ok = x.avg_line_length < 100.0;
    let enough_alphanumerics = x.alphanum_fraction > 0.25;

    longest_line_ok && average_line_ok && enough_alphanumerics
}

View File

@ -30,7 +30,7 @@ opentelemetry = { version = "0.18.0", features = ["rt-tokio"] }
opentelemetry-otlp = "0.11.0" opentelemetry-otlp = "0.11.0"
axum-tracing-opentelemetry = "0.10.0" axum-tracing-opentelemetry = "0.10.0"
tracing-opentelemetry = "0.18.0" tracing-opentelemetry = "0.18.0"
tantivy = { workspace = true }
[dependencies.uuid] [dependencies.uuid]
version = "1.3.3" version = "1.3.3"

View File

@ -1,6 +1,7 @@
mod completions; mod completions;
mod events; mod events;
mod health; mod health;
mod search;
use std::{ use std::{
net::{Ipv4Addr, SocketAddr}, net::{Ipv4Addr, SocketAddr},
@ -30,7 +31,7 @@ OpenAPI documentation for [tabby](https://github.com/TabbyML/tabby), a self-host
(url = "https://playground.app.tabbyml.com/tabby", description = "Playground server"), (url = "https://playground.app.tabbyml.com/tabby", description = "Playground server"),
(url = "http://localhost:8080", description = "Local server"), (url = "http://localhost:8080", description = "Local server"),
), ),
paths(events::log_event, completions::completion, health::health), paths(events::log_event, completions::completion, health::health, search::search),
components(schemas( components(schemas(
events::LogEventRequest, events::LogEventRequest,
completions::CompletionRequest, completions::CompletionRequest,
@ -38,6 +39,8 @@ OpenAPI documentation for [tabby](https://github.com/TabbyML/tabby), a self-host
completions::Segments, completions::Segments,
completions::Choice, completions::Choice,
health::HealthState, health::HealthState,
search::SearchResponse,
search::Document,
)) ))
)] )]
struct ApiDoc; struct ApiDoc;
@ -123,6 +126,7 @@ pub async fn main(args: &ServeArgs) {
let app = Router::new() let app = Router::new()
.merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", ApiDoc::openapi())) .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", ApiDoc::openapi()))
.nest("/v1", api_router(args)) .nest("/v1", api_router(args))
.nest("/experimental", experimental_router())
.fallback(fallback()); .fallback(fallback());
let address = SocketAddr::from((Ipv4Addr::UNSPECIFIED, args.port)); let address = SocketAddr::from((Ipv4Addr::UNSPECIFIED, args.port));
@ -149,6 +153,22 @@ fn api_router(args: &ServeArgs) -> Router {
.layer(opentelemetry_tracing_layer()) .layer(opentelemetry_tracing_layer())
} }
/// Builds the router mounted under `/experimental`.
///
/// When the search state can be constructed, the router exposes `GET /search`
/// with permissive CORS and the OpenTelemetry tracing layer. Otherwise an
/// empty router is returned, so no experimental route is registered.
fn experimental_router() -> Router {
    let Some(search_state) = search::SearchState::new() else {
        return Router::new();
    };

    let search_route = routing::get(search::search).with_state(Arc::new(search_state));
    Router::new()
        .route("/search", search_route)
        .layer(CorsLayer::permissive())
        .layer(opentelemetry_tracing_layer())
}
fn fallback() -> routing::MethodRouter { fn fallback() -> routing::MethodRouter {
routing::get(|| async { axum::response::Redirect::temporary("/swagger-ui") }) routing::get(|| async { axum::response::Redirect::temporary("/swagger-ui") })
} }

View File

@ -0,0 +1,142 @@
use std::sync::Arc;
use axum::{
extract::{Query, State},
Json,
};
use hyper::StatusCode;
use serde::{Deserialize, Serialize};
use tabby_common::path::index_dir;
use tantivy::{
collector::TopDocs,
query::{QueryParser, QueryParserError},
schema::Field,
DocAddress, Index, IndexReader, Score, TantivyError,
};
use utoipa::{IntoParams, ToSchema};
/// Read-only state shared by the search handler: an index reader, a query
/// parser, and handles to the fields copied into each returned document.
pub struct SearchState {
    /// Produces the searchers that execute queries.
    reader: IndexReader,
    /// Parser whose only default field is `content`.
    query_parser: QueryParser,
    /// Handle to the `git_url` field.
    git_url_field: Field,
    /// Handle to the `language` field.
    language_field: Field,
    /// Handle to the `content` field.
    content_field: Field,
}

impl SearchState {
    /// Opens the index stored in `index_dir()` and prepares everything needed
    /// to answer queries.
    ///
    /// Returns `None` if the index cannot be opened, if any of the `content`,
    /// `language` or `git_url` fields cannot be resolved from the schema, or
    /// if the reader cannot be created.
    pub fn new() -> Option<Self> {
        let index = Index::open_in_dir(index_dir()).ok()?;

        let schema = index.schema();
        let content_field = schema.get_field("content").ok()?;
        let language_field = schema.get_field("language").ok()?;
        let git_url_field = schema.get_field("git_url").ok()?;

        let query_parser = QueryParser::for_index(&index, vec![content_field]);
        let reader = index.reader().ok()?;

        Some(Self {
            reader,
            query_parser,
            git_url_field,
            language_field,
            content_field,
        })
    }
}
// A single search hit as serialized in the response body.
// Each field holds text taken from the matching document in the index.
#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
pub struct Document {
// Text of the hit's `git_url` field.
git_url: String,
// Text of the hit's `language` field.
language: String,
// Text of the hit's `content` field.
content: String,
}
// JSON body returned by the search endpoint on success.
#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
pub struct SearchResponse {
// Matching documents collected by the search handler.
docs: Vec<Document>,
}
// Query-string parameters accepted by the search endpoint.
#[derive(Deserialize, IntoParams)]
pub struct SearchRequest {
// Query text handed to the query parser.
#[param(example = "function")]
q: String,
// Maximum number of hits to return; the handler uses 10 when omitted.
#[param(example = "10")]
limit: Option<usize>,
// Number of leading hits to skip; the handler uses 0 when omitted.
#[param(example = "0")]
offset: Option<usize>,
}
#[utoipa::path(
get,
path = "/experimental/search",
tag = "experimental",
params(SearchRequest),
responses(
(status = 200, description = "Success", body = SearchResponse, content_type = "application/json"),
(status = 501, description = "Not Implemented"),
)
)]
pub async fn search(
State(state): State<Arc<SearchState>>,
params: Query<SearchRequest>,
) -> Result<Json<SearchResponse>, StatusCode> {
let searcher = state.reader.searcher();
let query = state
.query_parser
.parse_query(&params.q)
.map_err(QueryParserError::status)?;
let search_options =
TopDocs::with_limit(params.limit.unwrap_or(10)).and_offset(params.offset.unwrap_or(0));
let top_docs: Vec<(Score, DocAddress)> = searcher
.search(&query, &search_options)
.map_err(TantivyError::status)?;
let docs = top_docs
.iter()
.filter_map(|(_score, doc_address)| {
let Some(retrieved_doc) = searcher.doc(*doc_address).ok() else {
return None;
};
Some(Document {
git_url: retrieved_doc
.get_first(state.git_url_field)
.unwrap()
.as_text()
.unwrap()
.to_owned(),
language: retrieved_doc
.get_first(state.language_field)
.unwrap()
.as_text()
.unwrap()
.to_owned(),
content: retrieved_doc
.get_first(state.content_field)
.unwrap()
.as_text()
.unwrap()
.to_owned(),
})
})
.collect();
Ok(Json(SearchResponse { docs }))
}
/// Converts an error value into the HTTP status code sent to the client.
trait Handler<T> {
    /// Consumes the error and returns the status code to respond with.
    fn status(self) -> StatusCode;
}

impl Handler<TantivyError> for TantivyError {
    /// Any index/search failure is reported as `500 Internal Server Error`.
    fn status(self) -> StatusCode {
        StatusCode::INTERNAL_SERVER_ERROR
    }
}

impl Handler<QueryParserError> for QueryParserError {
    /// A query that cannot be parsed is a problem with the request itself,
    /// so it is reported as `400 Bad Request` rather than a server error.
    fn status(self) -> StatusCode {
        StatusCode::BAD_REQUEST
    }
}