feat: dedup against existing snippets for retrieval augmented code completion (#582)

Meng Zhang 2023-10-17 16:10:01 -07:00 committed by GitHub
parent 99d1bf34bb
commit f9eb052659
1 changed file with 9 additions and 3 deletions


@@ -106,7 +106,7 @@ fn build_prefix(language: &str, prefix: &str, snippets: &[Snippet]) -> String {
 fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> Vec<Snippet> {
     let mut ret = Vec::new();
-    let tokens = tokenize_text(text);
+    let mut tokens = Box::new(tokenize_text(text));
     let sanitized_text = tokens.join(" ");
     let sanitized_text = sanitized_text.trim();
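The token list becomes boxed and mutable here because the snippet loop further down swaps in a merged list after each accepted hit (`*tokens = body_tokens` in the third hunk); `tokens.join(" ")` keeps working unchanged since the `Box` derefs to the inner `Vec`.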
@@ -131,7 +131,7 @@ fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> Vec<Snippet> {
     let mut count_characters = 0;
     for hit in serp.hits {
         let body = hit.doc.body;
-        let body_tokens = tokenize_text(&body);
+        let mut body_tokens = tokenize_text(&body);
         if count_characters + body.len() > MAX_SNIPPET_CHARS_IN_PROMPT {
             break;
@ -151,6 +151,11 @@ fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> V
continue; continue;
} }
// Prepend body tokens and update tokens, so future similarity calculation will consider
// added snippets.
body_tokens.append(&mut tokens);
*tokens = body_tokens;
count_characters += body.len(); count_characters += body.len();
ret.push(Snippet { ret.push(Snippet {
filepath: hit.doc.filepath, filepath: hit.doc.filepath,
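These two added statements are the whole dedup mechanism: once a hit is accepted, its tokens are merged into the query's token list, so the similarity check guarding the `continue` above (elided from this hunk) now compares each new candidate against the query plus everything already selected. A standalone sketch of the effect; `overlap_ratio` is a hypothetical stand-in for the real predicate, which is not shown in the diff:

use std::collections::HashSet;

// Hypothetical stand-in for the similarity check elided above the `continue`;
// the real predicate in this file is not shown in the hunk.
fn overlap_ratio(candidate: &[String], accepted: &[String]) -> f32 {
    let seen: HashSet<&str> = accepted.iter().map(String::as_str).collect();
    let hits = candidate.iter().filter(|t| seen.contains(t.as_str())).count();
    hits as f32 / candidate.len().max(1) as f32
}

fn main() {
    // Query tokens, boxed as in the diff.
    let mut tokens = Box::new(vec!["read".to_string(), "config".to_string()]);
    for body in ["fn read_config", "fn read_config"] {
        let mut body_tokens: Vec<String> =
            body.split_whitespace().map(str::to_owned).collect();
        // Prints 0.00 on the first pass and 1.00 on the second: the duplicate
        // only registers as similar because the first snippet's tokens were
        // merged into `tokens` below, so it would now be skipped.
        println!("{:.2}", overlap_ratio(&body_tokens, &tokens));
        body_tokens.append(&mut tokens); // snippet tokens first, then the old list
        *tokens = body_tokens;
    }
}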
@@ -166,10 +171,11 @@ lazy_static! {
     static ref TOKENIZER: Regex = Regex::new(r"[^\w]").unwrap();
 }
 
-fn tokenize_text(text: &str) -> Vec<&str> {
+fn tokenize_text(text: &str) -> Vec<String> {
     TOKENIZER
         .split(text)
         .filter(|s| *s != "AND" && *s != "OR" && *s != "NOT" && !s.is_empty())
+        .map(|x| x.to_owned())
         .collect()
 }
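The signature change from `Vec<&str>` to `Vec<String>` is forced by the merge above: borrowed tokens would point into each iteration's local `body` string and could not outlive it, so the function now returns owned `String`s via `.map(|x| x.to_owned())`. Below is a self-contained, runnable version for reference (`regex` and `lazy_static` are the crates the file already uses; the `main` is only a demonstration, and the AND/OR/NOT filter presumably strips the search backend's query operators):

use lazy_static::lazy_static;
use regex::Regex;

lazy_static! {
    static ref TOKENIZER: Regex = Regex::new(r"[^\w]").unwrap();
}

// The post-commit function: tokens are owned so they can be merged into a
// list that outlives each snippet's `body`.
fn tokenize_text(text: &str) -> Vec<String> {
    TOKENIZER
        .split(text)
        .filter(|s| *s != "AND" && *s != "OR" && *s != "NOT" && !s.is_empty())
        .map(|x| x.to_owned())
        .collect()
}

fn main() {
    // Non-word characters separate tokens; empty splits and the operators
    // AND/OR/NOT are dropped.
    assert_eq!(
        tokenize_text("fn read_config() AND more"),
        vec!["fn", "read_config", "more"]
    );
}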