feat: dedup snippet within same file, excluding highly overlapped snippets
parent
be5e76650f
commit
07ff901566
|
|
@ -3216,6 +3216,7 @@ dependencies = [
|
|||
"tabby-common",
|
||||
"tantivy",
|
||||
"temp_testdir",
|
||||
"textdistance",
|
||||
"tokio",
|
||||
"tracing",
|
||||
"tracing-test",
|
||||
|
|
|
|||
|
|
@ -38,3 +38,4 @@ futures = "0.3.28"
|
|||
async-stream = "0.3.5"
|
||||
regex = "1.10.0"
|
||||
thiserror = "1.0.49"
|
||||
textdistance = "1.0.2"
|
||||
|
|
@ -22,6 +22,7 @@ tree-sitter-python = "0.20.2"
|
|||
tree-sitter-rust = "0.20.3"
|
||||
tree-sitter-typescript = "0.20.3"
|
||||
tree-sitter-go = "0.20.0"
|
||||
textdistance.workspace = true
|
||||
|
||||
[dev-dependencies]
|
||||
temp_testdir = "0.2"
|
||||
|
|
|
|||
|
|
@ -13,11 +13,13 @@ use tantivy::{
|
|||
schema::{Schema, TextFieldIndexing, TextOptions, STORED, STRING},
|
||||
Index,
|
||||
};
|
||||
use textdistance::Algorithm;
|
||||
|
||||
// Magic numbers
|
||||
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
|
||||
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
|
||||
static MAX_BODY_LINES_THRESHOLD: usize = 15;
|
||||
static MAX_SIMILARITY_THRESHOLD: f32 = 0.9;
|
||||
|
||||
pub fn index_repositories(_config: &Config) -> Result<()> {
|
||||
let mut builder = Schema::builder();
|
||||
|
|
@ -73,6 +75,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
|
|||
}
|
||||
|
||||
/// Atomic repository document in index.
|
||||
#[derive(Clone)]
|
||||
struct IndexedDocument {
|
||||
git_url: String,
|
||||
filepath: String,
|
||||
|
|
@ -82,7 +85,17 @@ struct IndexedDocument {
|
|||
kind: String,
|
||||
}
|
||||
|
||||
impl IndexedDocument {
|
||||
fn distance_to(&self, rhs: &IndexedDocument) -> f32 {
|
||||
let distance = textdistance::LCSSeq::default()
|
||||
.for_str(&self.body, &rhs.body)
|
||||
.val();
|
||||
distance as f32 / std::cmp::max(self.body.len(), rhs.body.len()) as f32
|
||||
}
|
||||
}
|
||||
|
||||
fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
|
||||
let mut docs: Vec<IndexedDocument> = vec![];
|
||||
file.tags.into_iter().filter_map(move |tag| {
|
||||
let name = file.content.get(tag.name_range).unwrap().to_owned();
|
||||
let body = file.content.get(tag.range).unwrap().to_owned();
|
||||
|
|
@ -91,14 +104,24 @@ fn from_source_file(file: SourceFile) -> impl Iterator<Item = IndexedDocument> {
|
|||
return None;
|
||||
}
|
||||
|
||||
Some(IndexedDocument {
|
||||
let doc = IndexedDocument {
|
||||
git_url: file.git_url.clone(),
|
||||
filepath: file.filepath.clone(),
|
||||
language: file.language.clone(),
|
||||
name,
|
||||
body,
|
||||
kind: tag.syntax_type_name,
|
||||
})
|
||||
};
|
||||
|
||||
for x in &docs {
|
||||
if x.distance_to(&doc) > MAX_SIMILARITY_THRESHOLD {
|
||||
// Exclude snippets that are very similar to ones already collected for the index.
|
||||
return None;
|
||||
}
|
||||
}
|
||||
|
||||
docs.push(doc.clone());
|
||||
Some(doc)
|
||||
})
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -40,7 +40,7 @@ futures = { workspace = true }
|
|||
async-stream = { workspace = true }
|
||||
axum-streams = { version = "0.9.1", features = ["json"] }
|
||||
minijinja = { version = "1.0.8", features = ["loader"] }
|
||||
textdistance = "1.0.2"
|
||||
textdistance.workspace = true
|
||||
regex.workspace = true
|
||||
thiserror.workspace = true
|
||||
|
||||
|
|
|
|||
Loading…
Reference in New Issue