feat: dedup snippet within same file, excluding highly overlapped snippets

dedup-snippet-at-index
Meng Zhang 2023-10-17 14:39:46 -07:00
parent be5e76650f
commit 07ff901566
5 changed files with 29 additions and 3 deletions

1
Cargo.lock generated
View File

@ -3216,6 +3216,7 @@ dependencies = [
"tabby-common", "tabby-common",
"tantivy", "tantivy",
"temp_testdir", "temp_testdir",
"textdistance",
"tokio", "tokio",
"tracing", "tracing",
"tracing-test", "tracing-test",

View File

@ -38,3 +38,4 @@ futures = "0.3.28"
async-stream = "0.3.5" async-stream = "0.3.5"
regex = "1.10.0" regex = "1.10.0"
thiserror = "1.0.49" thiserror = "1.0.49"
textdistance = "1.0.2"

View File

@ -22,6 +22,7 @@ tree-sitter-python = "0.20.2"
tree-sitter-rust = "0.20.3" tree-sitter-rust = "0.20.3"
tree-sitter-typescript = "0.20.3" tree-sitter-typescript = "0.20.3"
tree-sitter-go = "0.20.0" tree-sitter-go = "0.20.0"
textdistance.workspace = true
[dev-dependencies] [dev-dependencies]
temp_testdir = "0.2" temp_testdir = "0.2"

View File

@ -13,11 +13,13 @@ use tantivy::{
schema::{Schema, TextFieldIndexing, TextOptions, STORED, STRING}, schema::{Schema, TextFieldIndexing, TextOptions, STORED, STRING},
Index, Index,
}; };
use textdistance::Algorithm;
// Magic numbers // Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300; static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32; static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
static MAX_BODY_LINES_THRESHOLD: usize = 15; static MAX_BODY_LINES_THRESHOLD: usize = 15;
// Similarity ratio above which a snippet is dropped as a near-duplicate of one
// already kept; the check is a strict `>`, so a ratio of exactly 0.9 is kept.
static MAX_SIMILARITY_THRESHOLD: f32 = 0.9;
pub fn index_repositories(_config: &Config) -> Result<()> { pub fn index_repositories(_config: &Config) -> Result<()> {
let mut builder = Schema::builder(); let mut builder = Schema::builder();
@ -73,6 +75,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
} }
/// Atomic repository document in index. /// Atomic repository document in index.
#[derive(Clone)]
struct IndexedDocument { struct IndexedDocument {
git_url: String, git_url: String,
filepath: String, filepath: String,
@ -82,7 +85,17 @@ struct IndexedDocument {
kind: String, kind: String,
} }
impl IndexedDocument {
    /// Returns how similar the bodies of `self` and `rhs` are, as a ratio in `[0.0, 1.0]`.
    ///
    /// Despite the name, this is a similarity score rather than a distance: it is the
    /// length of the longest common subsequence of the two bodies divided by the length
    /// of the longer body, so `1.0` means identical bodies and `0.0` means the bodies
    /// share no characters.
    ///
    /// Two empty bodies are treated as identical and yield `1.0`.
    fn distance_to(&self, rhs: &IndexedDocument) -> f32 {
        let common_len = textdistance::LCSSeq::default()
            .for_str(&self.body, &rhs.body)
            .val();

        // `for_str` compares the strings char by char, so normalize by char counts.
        // Dividing by `String::len()` (a byte count) would understate the similarity
        // of bodies that contain multi-byte UTF-8 characters.
        let longest_len = self
            .body
            .chars()
            .count()
            .max(rhs.body.chars().count());

        if longest_len == 0 {
            // Both bodies are empty; avoid `0.0 / 0.0`, which is NaN and would
            // compare as "not similar" against any threshold.
            return 1.0;
        }

        common_len as f32 / longest_len as f32
    }
}
fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> { fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
let mut docs: Vec<IndexedDocument> = vec![];
file.tags.into_iter().filter_map(move |tag| { file.tags.into_iter().filter_map(move |tag| {
let name = file.content.get(tag.name_range).unwrap().to_owned(); let name = file.content.get(tag.name_range).unwrap().to_owned();
let body = file.content.get(tag.range).unwrap().to_owned(); let body = file.content.get(tag.range).unwrap().to_owned();
@ -91,14 +104,24 @@ fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
return None; return None;
} }
Some(IndexedDocument { let doc = IndexedDocument {
git_url: file.git_url.clone(), git_url: file.git_url.clone(),
filepath: file.filepath.clone(), filepath: file.filepath.clone(),
language: file.language.clone(), language: file.language.clone(),
name, name,
body, body,
kind: tag.syntax_type_name, kind: tag.syntax_type_name,
}) };
for x in &docs {
if x.distance_to(&doc) > MAX_SIMILARITY_THRESHOLD {
// Exclude snippets that are very similar to one already kept from this file.
return None;
}
}
docs.push(doc.clone());
Some(doc)
}) })
} }

View File

@ -40,7 +40,7 @@ futures = { workspace = true }
async-stream = { workspace = true } async-stream = { workspace = true }
axum-streams = { version = "0.9.1", features = ["json"] } axum-streams = { version = "0.9.1", features = ["json"] }
minijinja = { version = "1.0.8", features = ["loader"] } minijinja = { version = "1.0.8", features = ["loader"] }
textdistance = "1.0.2" textdistance.workspace = true
regex.workspace = true regex.workspace = true
thiserror.workspace = true thiserror.workspace = true