diff --git a/Cargo.lock b/Cargo.lock index fed0ccb..630eb22 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -30,11 +30,11 @@ dependencies = [ [[package]] name = "ahash" -version = "0.7.6" +version = "0.8.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fcb51a0695d8f838b1ee009b3fbf66bda078cd64590202a864a8f3e8c4315c47" +checksum = "2c99f64d1e06488f620f932677e24bc6e2897582980441ae90a671415bd7ec2f" dependencies = [ - "getrandom", + "cfg-if", "once_cell", "version_check", ] @@ -711,7 +711,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc" dependencies = [ "cfg-if", - "hashbrown", + "hashbrown 0.12.3", "lock_api", "once_cell", "parking_lot_core", @@ -862,20 +862,6 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "25c7df09945d65ea8d70b3321547ed414bbc540aad5bac6883d021b970f35b04" -[[package]] -name = "fastfield_codecs" -version = "0.3.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "374a3a53c1bd5fb31b10084229290eafb0a05f260ec90f1f726afffda4877a8a" -dependencies = [ - "fastdivide", - "itertools 0.10.5", - "log", - "ownedbytes", - "tantivy-bitpacker", - "tantivy-common", -] - [[package]] name = "fastrand" version = "1.9.0" @@ -972,6 +958,16 @@ dependencies = [ "winapi", ] +[[package]] +name = "fs4" +version = "0.6.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7672706608ecb74ab2e055c68327ffc25ae4cac1e12349204fd5fb0f3487cce2" +dependencies = [ + "rustix", + "windows-sys 0.48.0", +] + [[package]] name = "futures" version = "0.3.28" @@ -1131,6 +1127,12 @@ name = "hashbrown" version = "0.12.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8a9ee70c43aaf417c914396645a0fa852624801b24ebb7ae78fe8272889ac888" + +[[package]] +name = "hashbrown" +version = "0.13.2" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "43a3c133739dddd0d2990f9a4bdf8eb4b21ef50e4851ca85ab661199821d510e" dependencies = [ "ahash", ] @@ -1306,7 +1308,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "bd070e393353796e801d209ad339e89596eb4c8d430d18ede6a1cced8fafbd99" dependencies = [ "autocfg", - "hashbrown", + "hashbrown 0.12.3", "serde", ] @@ -1526,18 +1528,18 @@ dependencies = [ [[package]] name = "lru" -version = "0.7.8" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e999beba7b6e8345721bd280141ed958096a2e4abdf74f67ff4ce49b4b54e47a" +checksum = "03f1160296536f10c833a82dca22267d5486734230d47bf00bf435885814ba1e" dependencies = [ - "hashbrown", + "hashbrown 0.13.2", ] [[package]] name = "lz4_flex" -version = "0.9.5" +version = "0.10.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1a8cbbb2831780bc3b9c15a41f5b49222ef756b6730a95f3decfdd15903eb5a3" +checksum = "8b8c72594ac26bfd34f2d99dfced2edfaddfe8a476e3ff2ca0eb293d925c4f83" [[package]] name = "macro_rules_attribute" @@ -1597,9 +1599,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d" [[package]] name = "memmap2" -version = "0.5.10" +version = "0.6.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "83faa42c0a078c393f6b29d5db232d8be22776a891f8f56e5284faee4a20b327" +checksum = "6d28bba84adfe6646737845bc5ebbfa2c08424eb1c37e94a1fd2a82adb56a872" dependencies = [ "libc", ] @@ -1694,12 +1696,9 @@ checksum = "e5ce46fe64a9d73be07dcbe690a38ce1b293be448fd8ce1e6c1b8062c9f72c6a" [[package]] name = "murmurhash32" -version = "0.2.0" +version = "0.3.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d736ff882f0e85fe9689fb23db229616c4c00aee2b3ac282f666d8f20eb25d4a" -dependencies = [ - "byteorder", -] +checksum = "d9380db4c04d219ac5c51d14996bbf2c2e9a15229771b53f8671eb6c83cf44df" [[package]] name = 
"native-tls" @@ -1957,9 +1956,9 @@ checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" [[package]] name = "ownedbytes" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8e957eaa64a299f39755416e5b3128c505e9d63a91d0453771ad2ccd3907f8db" +checksum = "c718e498b20704d5fb5d51d07f414a22f61c19254c1708e117b93fd76860739c" dependencies = [ "stable_deref_trait", ] @@ -2661,6 +2660,15 @@ dependencies = [ "libc", ] +[[package]] +name = "sketches-ddsketch" +version = "0.2.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68a406c1882ed7f29cd5e248c9848a80e7cb6ae0fea82346d2746f2f941c07e1" +dependencies = [ + "serde", +] + [[package]] name = "slab" version = "0.4.8" @@ -2794,6 +2802,7 @@ dependencies = [ "tabby-common", "tabby-download", "tabby-scheduler", + "tantivy", "tokio", "tower", "tower-http 0.4.0", @@ -2855,14 +2864,14 @@ dependencies = [ [[package]] name = "tantivy" -version = "0.19.2" +version = "0.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5bb26a6b22c84d8be41d99a14016d6f04d30d8d31a2ea411a8ab553af5cc490d" +checksum = "aec540e9cebc88f523f67f596dee213e491f0c55961de013566f267a0c31f5e9" dependencies = [ - "aho-corasick 0.7.20", + "aho-corasick 1.0.1", "arc-swap", "async-trait", - "base64 0.13.1", + "base64 0.21.2", "bitpacking", "byteorder", "census", @@ -2871,8 +2880,7 @@ dependencies = [ "downcast-rs", "fail", "fastdivide", - "fastfield_codecs", - "fs2", + "fs4", "htmlescape", "itertools 0.10.5", "levenshtein_automata", @@ -2885,19 +2893,21 @@ dependencies = [ "num_cpus", "once_cell", "oneshot", - "ownedbytes", "rayon", "regex", "rust-stemmers", "rustc-hash", "serde", "serde_json", + "sketches-ddsketch", "smallvec", - "stable_deref_trait", "tantivy-bitpacker", + "tantivy-columnar", "tantivy-common", "tantivy-fst", "tantivy-query-grammar", + "tantivy-stacker", + "tantivy-tokenizer-api", "tempfile", "thiserror", 
"time 0.3.21", @@ -2907,18 +2917,40 @@ dependencies = [ [[package]] name = "tantivy-bitpacker" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e71a0c95b82d4292b097a09b989a6380d28c3a86800c841a2d03bae1fc8b9fa6" +checksum = "16099e96f0ede682084469b80d6909dc170aa2b11d2a45538b5b36b2a90090b9" +dependencies = [ + "bitpacking", +] + +[[package]] +name = "tantivy-columnar" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "56e32b024b26eab93eb8648faf08004356bf9d47376557ee4409f4b210163656" +dependencies = [ + "fastdivide", + "fnv", + "itertools 0.10.5", + "serde", + "tantivy-bitpacker", + "tantivy-common", + "tantivy-sstable", + "tantivy-stacker", +] [[package]] name = "tantivy-common" -version = "0.4.0" +version = "0.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "14fef4182bb60df9a4b92cd8ecab39ba2e50a05542934af17eef1f49660705cb" +checksum = "e7d12fdd6ec0f7e0962f129c03c696a85ec567734950cbb2b89af4a293ce342f" dependencies = [ + "async-trait", "byteorder", "ownedbytes", + "serde", + "time 0.3.21", ] [[package]] @@ -2934,15 +2966,45 @@ dependencies = [ [[package]] name = "tantivy-query-grammar" -version = "0.19.0" +version = "0.20.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "343e3ada4c1c480953f6960f8a21ce9c76611480ffdd4f4e230fdddce0fc5331" +checksum = "106d8f78ad1da4f0fdd526a0760c326c0573510d4dedabeb1962d35a35879797" dependencies = [ "combine", "once_cell", "regex", ] +[[package]] +name = "tantivy-sstable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eda34243d3ee64bd8f9ba74a3b0d05f4d07beff7767a727212e9b5a19c13dde7" +dependencies = [ + "tantivy-common", + "tantivy-fst", + "zstd 0.12.3+zstd.1.5.2", +] + +[[package]] +name = "tantivy-stacker" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"67b9e9470301b026ad3b95f79a791a2a3ee81f3ab16fbe412a9dd81ff834acf5" +dependencies = [ + "murmurhash32", + "tantivy-common", +] + +[[package]] +name = "tantivy-tokenizer-api" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8bee3519354cbe6bfff4bf6be9cf3e2dfcd0a6ea748a42312fbf1242c0d66fc6" +dependencies = [ + "serde", +] + [[package]] name = "tar" version = "0.4.38" @@ -4082,7 +4144,7 @@ dependencies = [ "pbkdf2", "sha1", "time 0.3.21", - "zstd", + "zstd 0.11.2+zstd.1.5.2", ] [[package]] @@ -4091,7 +4153,16 @@ version = "0.11.2+zstd.1.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "20cc960326ece64f010d2d2107537f26dc589a6573a316bd5b1dba685fa5fde4" dependencies = [ - "zstd-safe", + "zstd-safe 5.0.2+zstd.1.5.2", +] + +[[package]] +name = "zstd" +version = "0.12.3+zstd.1.5.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "76eea132fb024e0e13fd9c2f5d5d595d8a967aa72382ac2f9d39fcc95afd0806" +dependencies = [ + "zstd-safe 6.0.5+zstd.1.5.4", ] [[package]] @@ -4104,6 +4175,16 @@ dependencies = [ "zstd-sys", ] +[[package]] +name = "zstd-safe" +version = "6.0.5+zstd.1.5.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d56d9e60b4b1758206c238a10165fbcae3ca37b01744e394c463463f6529d23b" +dependencies = [ + "libc", + "zstd-sys", +] + [[package]] name = "zstd-sys" version = "2.0.8+zstd.1.5.5" diff --git a/Cargo.toml b/Cargo.toml index 28fa131..99228bb 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -24,3 +24,4 @@ tracing = "0.1" tracing-subscriber = "0.3" anyhow = "1.0.71" serde-jsonlines = "0.4.0" +tantivy = "0.20.2" diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index c72adc4..9707dc9 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -10,7 +10,7 @@ anyhow = { workspace = true } filenamify = "0.1.0" job_scheduler = "1.2.1" tabby-common = { path = "../tabby-common" } 
-tantivy = "0.19.2"
+tantivy = { workspace = true }
 tracing = { workspace = true }
 tree-sitter-javascript = "0.20.0"
 tree-sitter-tags = "0.20.2"
diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs
index 4e2d65b..a5953a8 100644
--- a/crates/tabby-scheduler/src/index.rs
+++ b/crates/tabby-scheduler/src/index.rs
@@ -8,6 +8,7 @@ use tantivy::{
     schema::{Schema, STORED, STRING, TEXT},
     Index,
 };
+use tracing::info;
 
 pub fn index_repositories(_config: &Config) -> Result<()> {
     let mut builder = Schema::builder();
@@ -24,15 +25,28 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
     writer.delete_all_documents()?;
 
     for doc in Document::all()? {
-        writer.add_document(doc!(
-            git_url => doc.git_url,
-            filepath => doc.filepath,
-            content => doc.content,
-            language => doc.language,
-        ))?;
+        if is_valid_doc(&doc) {
+            writer.add_document(doc!(
+                git_url => doc.git_url,
+                filepath => doc.filepath,
+                content => doc.content,
+                language => doc.language,
+            ))?;
+        } else {
+            info!("Skip {} - {}", doc.git_url, doc.filepath);
+        }
     }
 
+    info!("Finalize index...");
     writer.commit()?;
 
     Ok(())
 }
+
+/// Decides whether a document is indexed, based on its `max_line_length`,
+/// `avg_line_length` and `alphanum_fraction` statistics.
+fn is_valid_doc(x: &Document) -> bool {
+    let lines_are_short = x.max_line_length < 1000 && x.avg_line_length < 100.0;
+    let enough_alphanum = x.alphanum_fraction > 0.25;
+    lines_are_short && enough_alphanum
+}
diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml
index be6ee1a..1e3e486 100644
--- a/crates/tabby/Cargo.toml
+++ b/crates/tabby/Cargo.toml
@@ -30,7 +30,7 @@ opentelemetry = { version = "0.18.0", features = ["rt-tokio"] }
 opentelemetry-otlp = "0.11.0"
 axum-tracing-opentelemetry = "0.10.0"
 tracing-opentelemetry = "0.18.0"
-
+tantivy = { workspace = true }
 
 [dependencies.uuid]
 version = "1.3.3"
diff --git a/crates/tabby/src/serve/mod.rs b/crates/tabby/src/serve/mod.rs
index 3a2d482..dcbe698 100644
--- a/crates/tabby/src/serve/mod.rs
+++ b/crates/tabby/src/serve/mod.rs
@@ -1,6 +1,7 @@
 mod completions;
 mod events;
 mod health;
+mod search;
 
 use std::{
     net::{Ipv4Addr, SocketAddr},
@@ -30,7 +31,7 @@ OpenAPI documentation for
[tabby](https://github.com/TabbyML/tabby), a self-host
         (url = "https://playground.app.tabbyml.com/tabby", description = "Playground server"),
         (url = "http://localhost:8080", description = "Local server"),
     ),
-    paths(events::log_event, completions::completion, health::health),
+    paths(events::log_event, completions::completion, health::health, search::search),
     components(schemas(
         events::LogEventRequest,
         completions::CompletionRequest,
@@ -38,6 +39,8 @@ OpenAPI documentation for [tabby](https://github.com/TabbyML/tabby), a self-host
         completions::Segments,
         completions::Choice,
         health::HealthState,
+        search::SearchResponse,
+        search::Document,
     ))
 )]
 struct ApiDoc;
@@ -123,6 +126,7 @@ pub async fn main(args: &ServeArgs) {
     let app = Router::new()
         .merge(SwaggerUi::new("/swagger-ui").url("/api-docs/openapi.json", ApiDoc::openapi()))
         .nest("/v1", api_router(args))
+        .nest("/experimental", experimental_router())
         .fallback(fallback());
 
     let address = SocketAddr::from((Ipv4Addr::UNSPECIFIED, args.port));
@@ -149,6 +153,22 @@ fn api_router(args: &ServeArgs) -> Router {
         .layer(opentelemetry_tracing_layer())
 }
 
+/// Builds the router mounted under `/experimental`.
+///
+/// Exposes `GET /search` when `search::SearchState::new()` yields a state;
+/// otherwise the returned router has no routes.
+fn experimental_router() -> Router {
+    let Some(state) = search::SearchState::new() else {
+        return Router::new();
+    };
+
+    let search_route = routing::get(search::search).with_state(Arc::new(state));
+    Router::new()
+        .route("/search", search_route)
+        .layer(CorsLayer::permissive())
+        .layer(opentelemetry_tracing_layer())
+}
+
 fn fallback() -> routing::MethodRouter {
     routing::get(|| async { axum::response::Redirect::temporary("/swagger-ui") })
 }
diff --git a/crates/tabby/src/serve/search.rs b/crates/tabby/src/serve/search.rs
new file mode 100644
index 0000000..63fd152
--- /dev/null
+++ b/crates/tabby/src/serve/search.rs
@@ -0,0 +1,142 @@
+use std::sync::Arc;
+
+use axum::{
+    extract::{Query, State},
+    Json,
+};
+use hyper::StatusCode;
+use serde::{Deserialize, Serialize};
+use tabby_common::path::index_dir;
+use tantivy::{
+    collector::TopDocs,
+    query::{QueryParser,
QueryParserError},
+    schema::Field,
+    DocAddress, Index, IndexReader, Score, TantivyError,
+};
+use utoipa::{IntoParams, ToSchema};
+
+/// Index reader, query parser and schema fields used by the search handler.
+pub struct SearchState {
+    reader: IndexReader,
+    query_parser: QueryParser,
+    git_url_field: Field,
+    language_field: Field,
+    content_field: Field,
+}
+
+impl SearchState {
+    /// Returns `None` if the index, one of its fields, or a reader cannot be obtained.
+    pub fn new() -> Option<Self> {
+        let index = Index::open_in_dir(index_dir()).ok()?;
+        let content_field = index.schema().get_field("content").ok()?;
+        let language_field = index.schema().get_field("language").ok()?;
+        let git_url_field = index.schema().get_field("git_url").ok()?;
+
+        let query_parser = QueryParser::for_index(&index, vec![content_field]);
+        let state = SearchState {
+            reader: index.reader().ok()?,
+            query_parser,
+            content_field,
+            language_field,
+            git_url_field,
+        };
+
+        Some(state)
+    }
+}
+
+/// A single search hit as returned to the client.
+#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
+pub struct Document {
+    git_url: String,
+    language: String,
+    content: String,
+}
+
+/// Response body of the search endpoint.
+#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
+pub struct SearchResponse {
+    docs: Vec<Document>,
+}
+
+/// Query-string parameters of the search endpoint.
+#[derive(Deserialize, IntoParams)]
+pub struct SearchRequest {
+    #[param(example = "function")]
+    q: String,
+    #[param(example = "10")]
+    limit: Option<usize>,
+    #[param(example = "0")]
+    offset: Option<usize>,
+}
+
+/// Searches the index for `q` (default field: `content`) and returns the matching documents.
+///
+/// Responds with 400 when the query cannot be parsed and 500 when the search fails.
+#[utoipa::path(
+    get,
+    path = "/experimental/search",
+    tag = "experimental",
+    params(SearchRequest),
+    responses(
+        (status = 200, description = "Success", body = SearchResponse, content_type = "application/json"),
+        (status = 400, description = "Bad Request"),
+        (status = 501, description = "Not Implemented"),
+    )
+)]
+pub async fn search(
+    State(state): State<Arc<SearchState>>,
+    params: Query<SearchRequest>,
+) -> Result<Json<SearchResponse>, StatusCode> {
+    // `TopDocs::with_limit` panics on a zero limit, so answer that case directly.
+    let limit = params.limit.unwrap_or(10);
+    if limit == 0 {
+        return Ok(Json(SearchResponse { docs: Vec::new() }));
+    }
+
+    let query = state
+        .query_parser
+        .parse_query(&params.q)
+        .map_err(QueryParserError::status)?;
+
+    let searcher = state.reader.searcher();
+    let search_options = TopDocs::with_limit(limit).and_offset(params.offset.unwrap_or(0));
+    let top_docs: Vec<(Score, DocAddress)> = searcher
+        .search(&query, &search_options)
+        .map_err(TantivyError::status)?;
+
+    // Hits that cannot be loaded or lack a stored text field are skipped, not fatal.
+    let docs = top_docs
+        .iter()
+        .filter_map(|(_score, doc_address)| {
+            let retrieved_doc = searcher.doc(*doc_address).ok()?;
+            let text = |field: Field| Some(retrieved_doc.get_first(field)?.as_text()?.to_owned());
+
+            Some(Document {
+                git_url: text(state.git_url_field)?,
+                language: text(state.language_field)?,
+                content: text(state.content_field)?,
+            })
+        })
+        .collect();
+
+    Ok(Json(SearchResponse { docs }))
+}
+
+/// Maps an error from the search stack to the HTTP status sent to the client.
+trait Handler {
+    fn status(self) -> StatusCode;
+}
+
+impl Handler for TantivyError {
+    fn status(self) -> StatusCode {
+        StatusCode::INTERNAL_SERVER_ERROR
+    }
+}
+
+impl Handler for QueryParserError {
+    /// A query that fails to parse is the client's mistake, not a server fault.
+    fn status(self) -> StatusCode {
+        StatusCode::BAD_REQUEST
+    }
+}