fix: prevent special keywords (e.g. AND) from breaking query parsing (#695)

release-notes-05
Meng Zhang 2023-11-02 18:13:28 -07:00 committed by GitHub
parent 2adcc0726c
commit e4efcc4091
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 75 additions and 43 deletions

View File

@ -4,6 +4,7 @@ use lazy_static::lazy_static;
use regex::Regex; use regex::Regex;
use strfmt::strfmt; use strfmt::strfmt;
use tabby_common::languages::get_language; use tabby_common::languages::get_language;
use tantivy::{query::BooleanQuery, query_grammar::Occur};
use textdistance::Algorithm; use textdistance::Algorithm;
use tracing::warn; use tracing::warn;
@ -106,17 +107,16 @@ fn build_prefix(language: &str, prefix: &str, snippets: &[Snippet]) -> String {
fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> Vec<Snippet> { fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> Vec<Snippet> {
let mut ret = Vec::new(); let mut ret = Vec::new();
let mut tokens = Box::new(tokenize_text(text)); let mut tokens = tokenize_text(text);
let sanitized_text = tokens.join(" "); let language_query = index_server.language_query(language).unwrap();
let sanitized_text = sanitized_text.trim(); let body_query = index_server.body_query(&tokens).unwrap();
if sanitized_text.is_empty() { let query = BooleanQuery::new(vec![
return ret; (Occur::Must, language_query),
} (Occur::Must, body_query),
]);
let query_text = format!("language:{} AND ({})", language, sanitized_text); let serp = match index_server.search_with_query(&query, MAX_SNIPPETS_TO_FETCH, 0) {
let serp = match index_server.search(&query_text, MAX_SNIPPETS_TO_FETCH, 0) {
Ok(serp) => serp, Ok(serp) => serp,
Err(IndexServerError::NotReady) => { Err(IndexServerError::NotReady) => {
// Ignore. // Ignore.
@ -154,7 +154,7 @@ fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> V
// Prepend body tokens and update tokens, so future similarity calculation will consider // Prepend body tokens and update tokens, so future similarity calculation will consider
// added snippets. // added snippets.
body_tokens.append(&mut tokens); body_tokens.append(&mut tokens);
*tokens = body_tokens; tokens.append(&mut body_tokens);
count_characters += body.len(); count_characters += body.len();
ret.push(Snippet { ret.push(Snippet {
@ -172,11 +172,7 @@ lazy_static! {
} }
fn tokenize_text(text: &str) -> Vec<String> { fn tokenize_text(text: &str) -> Vec<String> {
TOKENIZER TOKENIZER.split(text).map(|x| x.to_owned()).collect()
.split(text)
.filter(|s| *s != "AND" && *s != "OR" && *s != "NOT" && !s.is_empty())
.map(|x| x.to_owned())
.collect()
} }
#[cfg(test)] #[cfg(test)]

View File

@ -10,13 +10,13 @@ use serde::{Deserialize, Serialize};
use tabby_common::{index::IndexExt, path}; use tabby_common::{index::IndexExt, path};
use tantivy::{ use tantivy::{
collector::{Count, TopDocs}, collector::{Count, TopDocs},
query::QueryParser, query::{QueryParser, TermQuery, TermSetQuery},
schema::Field, schema::{Field, IndexRecordOption},
DocAddress, Document, Index, IndexReader, DocAddress, Document, Index, IndexReader, Term,
}; };
use thiserror::Error; use thiserror::Error;
use tokio::{sync::OnceCell, task, time::sleep}; use tokio::{sync::OnceCell, task, time::sleep};
use tracing::{debug, instrument, log::info}; use tracing::{debug, instrument, log::info, warn};
use utoipa::{IntoParams, ToSchema}; use utoipa::{IntoParams, ToSchema};
#[derive(Deserialize, IntoParams)] #[derive(Deserialize, IntoParams)]
@ -70,15 +70,18 @@ pub async fn search(
State(state): State<Arc<IndexServer>>, State(state): State<Arc<IndexServer>>,
query: Query<SearchQuery>, query: Query<SearchQuery>,
) -> Result<Json<SearchResponse>, StatusCode> { ) -> Result<Json<SearchResponse>, StatusCode> {
let Ok(serp) = state.search( match state.search(
&query.q, &query.q,
query.limit.unwrap_or(20), query.limit.unwrap_or(20),
query.offset.unwrap_or(0), query.offset.unwrap_or(0),
) else { ) {
return Err(StatusCode::NOT_IMPLEMENTED); Ok(serp) => Ok(Json(serp)),
}; Err(IndexServerError::NotReady) => Err(StatusCode::NOT_IMPLEMENTED),
Err(IndexServerError::TantivyError(err)) => {
Ok(Json(serp)) warn!("{}", err);
Err(StatusCode::INTERNAL_SERVER_ERROR)
}
}
} }
struct IndexServerImpl { struct IndexServerImpl {
@ -119,17 +122,19 @@ impl IndexServerImpl {
} }
pub fn search(&self, q: &str, limit: usize, offset: usize) -> tantivy::Result<SearchResponse> { pub fn search(&self, q: &str, limit: usize, offset: usize) -> tantivy::Result<SearchResponse> {
let query = self let query = self.query_parser.parse_query(q)?;
.query_parser self.search_with_query(&query, limit, offset)
.parse_query(q) }
.expect("Parsing the query failed");
pub fn search_with_query(
&self,
q: &dyn tantivy::query::Query,
limit: usize,
offset: usize,
) -> tantivy::Result<SearchResponse> {
let searcher = self.reader.searcher(); let searcher = self.reader.searcher();
let (top_docs, num_hits) = { let (top_docs, num_hits) =
searcher.search( { searcher.search(q, &(TopDocs::with_limit(limit).and_offset(offset), Count))? };
&query,
&(TopDocs::with_limit(limit).and_offset(offset), Count),
)?
};
let hits: Vec<Hit> = { let hits: Vec<Hit> = {
top_docs top_docs
.iter() .iter()
@ -179,8 +184,15 @@ impl IndexServer {
Self {} Self {}
} }
fn get_cell(&self) -> Option<&IndexServerImpl> { fn with_impl<T, F>(&self, op: F) -> Result<T, IndexServerError>
IMPL.get() where
F: FnOnce(&IndexServerImpl) -> Result<T, IndexServerError>,
{
if let Some(imp) = IMPL.get() {
op(imp)
} else {
Err(IndexServerError::NotReady)
}
} }
async fn worker() -> IndexServerImpl { async fn worker() -> IndexServerImpl {
@ -199,17 +211,41 @@ impl IndexServer {
} }
} }
pub fn language_query(&self, language: &str) -> Result<Box<TermQuery>, IndexServerError> {
self.with_impl(|imp| {
Ok(Box::new(TermQuery::new(
Term::from_field_text(imp.field_language, language),
IndexRecordOption::WithFreqsAndPositions,
)))
})
}
pub fn body_query(&self, tokens: &[String]) -> Result<Box<TermSetQuery>, IndexServerError> {
self.with_impl(|imp| {
Ok(Box::new(TermSetQuery::new(
tokens
.iter()
.map(|x| Term::from_field_text(imp.field_body, x)),
)))
})
}
pub fn search( pub fn search(
&self, &self,
q: &str, q: &str,
limit: usize, limit: usize,
offset: usize, offset: usize,
) -> Result<SearchResponse, IndexServerError> { ) -> Result<SearchResponse, IndexServerError> {
if let Some(imp) = self.get_cell() { self.with_impl(|imp| Ok(imp.search(q, limit, offset)?))
Ok(imp.search(q, limit, offset)?)
} else {
Err(IndexServerError::NotReady)
} }
pub fn search_with_query(
&self,
q: &dyn tantivy::query::Query,
limit: usize,
offset: usize,
) -> Result<SearchResponse, IndexServerError> {
self.with_impl(|imp| Ok(imp.search_with_query(q, limit, offset)?))
} }
} }
@ -218,6 +254,6 @@ pub enum IndexServerError {
#[error("index not ready")] #[error("index not ready")]
NotReady, NotReady,
#[error("underlying tantivy error")] #[error("{0}")]
TantivyError(#[from] tantivy::TantivyError), TantivyError(#[from] tantivy::TantivyError),
} }