From 07ff90156652a429d8c9bf50cff4185da62028d8 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Tue, 17 Oct 2023 14:39:46 -0700 Subject: [PATCH] feat: dedup snippet within same file, excluding highly overlapped snippets --- Cargo.lock | 1 + Cargo.toml | 1 + crates/tabby-scheduler/Cargo.toml | 1 + crates/tabby-scheduler/src/index.rs | 27 +++++++++++++++++++++++++-- crates/tabby/Cargo.toml | 2 +- 5 files changed, 29 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 44855c5..61ad125 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -3216,6 +3216,7 @@ dependencies = [ "tabby-common", "tantivy", "temp_testdir", + "textdistance", "tokio", "tracing", "tracing-test", diff --git a/Cargo.toml b/Cargo.toml index f0d25cd..1109622 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -38,3 +38,4 @@ futures = "0.3.28" async-stream = "0.3.5" regex = "1.10.0" thiserror = "1.0.49" +textdistance = "1.0.2" \ No newline at end of file diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index 531f6fd..c8f40e3 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -22,6 +22,7 @@ tree-sitter-python = "0.20.2" tree-sitter-rust = "0.20.3" tree-sitter-typescript = "0.20.3" tree-sitter-go = "0.20.0" +textdistance.workspace = true [dev-dependencies] temp_testdir = "0.2" diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs index ce4a179..196767d 100644 --- a/crates/tabby-scheduler/src/index.rs +++ b/crates/tabby-scheduler/src/index.rs @@ -13,11 +13,13 @@ use tantivy::{ schema::{Schema, TextFieldIndexing, TextOptions, STORED, STRING}, Index, }; +use textdistance::Algorithm; // Magic numbers static MAX_LINE_LENGTH_THRESHOLD: usize = 300; static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32; static MAX_BODY_LINES_THRESHOLD: usize = 15; +static MAX_SIMILARITY_THRESHOLD: f32 = 0.9; pub fn index_repositories(_config: &Config) -> Result<()> { let mut builder = Schema::builder(); @@ -73,6 +75,7 @@ pub fn 
index_repositories(_config: &Config) -> Result<()> { } /// Atomic repository document in index. +#[derive(Clone)] struct IndexedDocument { git_url: String, filepath: String, @@ -82,7 +85,17 @@ struct IndexedDocument { kind: String, } +impl IndexedDocument { + fn distance_to(&self, rhs: &IndexedDocument) -> f32 { + let distance = textdistance::LCSSeq::default() + .for_str(&self.body, &rhs.body) + .val(); + distance as f32 / std::cmp::max(self.body.len(), rhs.body.len()) as f32 + } +} + fn from_source_file(file: SourceFile) -> impl Iterator { + let mut docs: Vec = vec![]; file.tags.into_iter().filter_map(move |tag| { let name = file.content.get(tag.name_range).unwrap().to_owned(); let body = file.content.get(tag.range).unwrap().to_owned(); @@ -91,14 +104,24 @@ fn from_source_file(file: SourceFile) -> impl Iterator { return None; } - Some(IndexedDocument { + let doc = IndexedDocument { git_url: file.git_url.clone(), filepath: file.filepath.clone(), language: file.language.clone(), name, body, kind: tag.syntax_type_name, - }) + }; + + for x in &docs { + if x.distance_to(&doc) > MAX_SIMILARITY_THRESHOLD { + // Exclude snippets that are very similar to one already in the index. + return None; + } + } + + docs.push(doc.clone()); + Some(doc) }) } diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml index 24f4c02..f5f3dc1 100644 --- a/crates/tabby/Cargo.toml +++ b/crates/tabby/Cargo.toml @@ -40,7 +40,7 @@ futures = { workspace = true } async-stream = { workspace = true } axum-streams = { version = "0.9.1", features = ["json"] } minijinja = { version = "1.0.8", features = ["loader"] } -textdistance = "1.0.2" +textdistance.workspace = true regex.workspace = true thiserror.workspace = true