feat: dedup snippet within same file, excluding highly overlapped snippets
parent
be5e76650f
commit
07ff901566
|
|
@ -3216,6 +3216,7 @@ dependencies = [
|
||||||
"tabby-common",
|
"tabby-common",
|
||||||
"tantivy",
|
"tantivy",
|
||||||
"temp_testdir",
|
"temp_testdir",
|
||||||
|
"textdistance",
|
||||||
"tokio",
|
"tokio",
|
||||||
"tracing",
|
"tracing",
|
||||||
"tracing-test",
|
"tracing-test",
|
||||||
|
|
|
||||||
|
|
@ -38,3 +38,4 @@ futures = "0.3.28"
|
||||||
async-stream = "0.3.5"
|
async-stream = "0.3.5"
|
||||||
regex = "1.10.0"
|
regex = "1.10.0"
|
||||||
thiserror = "1.0.49"
|
thiserror = "1.0.49"
|
||||||
|
textdistance = "1.0.2"
|
||||||
|
|
@ -22,6 +22,7 @@ tree-sitter-python = "0.20.2"
|
||||||
tree-sitter-rust = "0.20.3"
|
tree-sitter-rust = "0.20.3"
|
||||||
tree-sitter-typescript = "0.20.3"
|
tree-sitter-typescript = "0.20.3"
|
||||||
tree-sitter-go = "0.20.0"
|
tree-sitter-go = "0.20.0"
|
||||||
|
textdistance.workspace = true
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
temp_testdir = "0.2"
|
temp_testdir = "0.2"
|
||||||
|
|
|
||||||
|
|
@ -13,11 +13,13 @@ use tantivy::{
|
||||||
schema::{Schema, TextFieldIndexing, TextOptions, STORED, STRING},
|
schema::{Schema, TextFieldIndexing, TextOptions, STORED, STRING},
|
||||||
Index,
|
Index,
|
||||||
};
|
};
|
||||||
|
use textdistance::Algorithm;
|
||||||
|
|
||||||
// Magic numbers
|
// Magic numbers
|
||||||
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
|
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
|
||||||
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
|
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
|
||||||
static MAX_BODY_LINES_THRESHOLD: usize = 15;
|
static MAX_BODY_LINES_THRESHOLD: usize = 15;
|
||||||
|
static MAX_SIMILARITY_THRESHOLD: f32 = 0.9;
|
||||||
|
|
||||||
pub fn index_repositories(_config: &Config) -> Result<()> {
|
pub fn index_repositories(_config: &Config) -> Result<()> {
|
||||||
let mut builder = Schema::builder();
|
let mut builder = Schema::builder();
|
||||||
|
|
@ -73,6 +75,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Atomic repository document in index.
|
/// Atomic repository document in index.
|
||||||
|
#[derive(Clone)]
|
||||||
struct IndexedDocument {
|
struct IndexedDocument {
|
||||||
git_url: String,
|
git_url: String,
|
||||||
filepath: String,
|
filepath: String,
|
||||||
|
|
@ -82,7 +85,17 @@ struct IndexedDocument {
|
||||||
kind: String,
|
kind: String,
|
||||||
}
|
}
|
||||||
|
|
||||||
|
impl IndexedDocument {
|
||||||
|
fn distance_to(&self, rhs: &IndexedDocument) -> f32 {
|
||||||
|
let distance = textdistance::LCSSeq::default()
|
||||||
|
.for_str(&self.body, &rhs.body)
|
||||||
|
.val();
|
||||||
|
distance as f32 / std::cmp::max(self.body.len(), rhs.body.len()) as f32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
|
fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
|
||||||
|
let mut docs: Vec<IndexedDocument> = vec![];
|
||||||
file.tags.into_iter().filter_map(move |tag| {
|
file.tags.into_iter().filter_map(move |tag| {
|
||||||
let name = file.content.get(tag.name_range).unwrap().to_owned();
|
let name = file.content.get(tag.name_range).unwrap().to_owned();
|
||||||
let body = file.content.get(tag.range).unwrap().to_owned();
|
let body = file.content.get(tag.range).unwrap().to_owned();
|
||||||
|
|
@ -91,14 +104,24 @@ fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
|
||||||
return None;
|
return None;
|
||||||
}
|
}
|
||||||
|
|
||||||
Some(IndexedDocument {
|
let doc = IndexedDocument {
|
||||||
git_url: file.git_url.clone(),
|
git_url: file.git_url.clone(),
|
||||||
filepath: file.filepath.clone(),
|
filepath: file.filepath.clone(),
|
||||||
language: file.language.clone(),
|
language: file.language.clone(),
|
||||||
name,
|
name,
|
||||||
body,
|
body,
|
||||||
kind: tag.syntax_type_name,
|
kind: tag.syntax_type_name,
|
||||||
})
|
};
|
||||||
|
|
||||||
|
for x in &docs {
|
||||||
|
if x.distance_to(&doc) > MAX_SIMILARITY_THRESHOLD {
|
||||||
|
// Exclude snippets that are very similar to one already kept from this file.
|
||||||
|
return None;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
docs.push(doc.clone());
|
||||||
|
Some(doc)
|
||||||
})
|
})
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -40,7 +40,7 @@ futures = { workspace = true }
|
||||||
async-stream = { workspace = true }
|
async-stream = { workspace = true }
|
||||||
axum-streams = { version = "0.9.1", features = ["json"] }
|
axum-streams = { version = "0.9.1", features = ["json"] }
|
||||||
minijinja = { version = "1.0.8", features = ["loader"] }
|
minijinja = { version = "1.0.8", features = ["loader"] }
|
||||||
textdistance = "1.0.2"
|
textdistance.workspace = true
|
||||||
regex.workspace = true
|
regex.workspace = true
|
||||||
thiserror.workspace = true
|
thiserror.workspace = true
|
||||||
|
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue