fix: avoid query parsing failures caused by special keywords (e.g. AND) (#695)
parent
2adcc0726c
commit
e4efcc4091
|
|
@ -4,6 +4,7 @@ use lazy_static::lazy_static;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use strfmt::strfmt;
|
use strfmt::strfmt;
|
||||||
use tabby_common::languages::get_language;
|
use tabby_common::languages::get_language;
|
||||||
|
use tantivy::{query::BooleanQuery, query_grammar::Occur};
|
||||||
use textdistance::Algorithm;
|
use textdistance::Algorithm;
|
||||||
use tracing::warn;
|
use tracing::warn;
|
||||||
|
|
||||||
|
|
@ -106,17 +107,16 @@ fn build_prefix(language: &str, prefix: &str, snippets: &[Snippet]) -> String {
|
||||||
|
|
||||||
fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> Vec<Snippet> {
|
fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> Vec<Snippet> {
|
||||||
let mut ret = Vec::new();
|
let mut ret = Vec::new();
|
||||||
let mut tokens = Box::new(tokenize_text(text));
|
let mut tokens = tokenize_text(text);
|
||||||
|
|
||||||
let sanitized_text = tokens.join(" ");
|
let language_query = index_server.language_query(language).unwrap();
|
||||||
let sanitized_text = sanitized_text.trim();
|
let body_query = index_server.body_query(&tokens).unwrap();
|
||||||
if sanitized_text.is_empty() {
|
let query = BooleanQuery::new(vec![
|
||||||
return ret;
|
(Occur::Must, language_query),
|
||||||
}
|
(Occur::Must, body_query),
|
||||||
|
]);
|
||||||
|
|
||||||
let query_text = format!("language:{} AND ({})", language, sanitized_text);
|
let serp = match index_server.search_with_query(&query, MAX_SNIPPETS_TO_FETCH, 0) {
|
||||||
|
|
||||||
let serp = match index_server.search(&query_text, MAX_SNIPPETS_TO_FETCH, 0) {
|
|
||||||
Ok(serp) => serp,
|
Ok(serp) => serp,
|
||||||
Err(IndexServerError::NotReady) => {
|
Err(IndexServerError::NotReady) => {
|
||||||
// Ignore.
|
// Ignore.
|
||||||
|
|
@ -154,7 +154,7 @@ fn collect_snippets(index_server: &IndexServer, language: &str, text: &str) -> V
|
||||||
// Prepend body tokens and update tokens, so future similarity calculation will consider
|
// Prepend body tokens and update tokens, so future similarity calculation will consider
|
||||||
// added snippets.
|
// added snippets.
|
||||||
body_tokens.append(&mut tokens);
|
body_tokens.append(&mut tokens);
|
||||||
*tokens = body_tokens;
|
tokens.append(&mut body_tokens);
|
||||||
|
|
||||||
count_characters += body.len();
|
count_characters += body.len();
|
||||||
ret.push(Snippet {
|
ret.push(Snippet {
|
||||||
|
|
@ -172,11 +172,7 @@ lazy_static! {
|
||||||
}
|
}
|
||||||
|
|
||||||
fn tokenize_text(text: &str) -> Vec<String> {
|
fn tokenize_text(text: &str) -> Vec<String> {
|
||||||
TOKENIZER
|
TOKENIZER.split(text).map(|x| x.to_owned()).collect()
|
||||||
.split(text)
|
|
||||||
.filter(|s| *s != "AND" && *s != "OR" && *s != "NOT" && !s.is_empty())
|
|
||||||
.map(|x| x.to_owned())
|
|
||||||
.collect()
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#[cfg(test)]
|
#[cfg(test)]
|
||||||
|
|
|
||||||
|
|
@ -10,13 +10,13 @@ use serde::{Deserialize, Serialize};
|
||||||
use tabby_common::{index::IndexExt, path};
|
use tabby_common::{index::IndexExt, path};
|
||||||
use tantivy::{
|
use tantivy::{
|
||||||
collector::{Count, TopDocs},
|
collector::{Count, TopDocs},
|
||||||
query::QueryParser,
|
query::{QueryParser, TermQuery, TermSetQuery},
|
||||||
schema::Field,
|
schema::{Field, IndexRecordOption},
|
||||||
DocAddress, Document, Index, IndexReader,
|
DocAddress, Document, Index, IndexReader, Term,
|
||||||
};
|
};
|
||||||
use thiserror::Error;
|
use thiserror::Error;
|
||||||
use tokio::{sync::OnceCell, task, time::sleep};
|
use tokio::{sync::OnceCell, task, time::sleep};
|
||||||
use tracing::{debug, instrument, log::info};
|
use tracing::{debug, instrument, log::info, warn};
|
||||||
use utoipa::{IntoParams, ToSchema};
|
use utoipa::{IntoParams, ToSchema};
|
||||||
|
|
||||||
#[derive(Deserialize, IntoParams)]
|
#[derive(Deserialize, IntoParams)]
|
||||||
|
|
@ -70,15 +70,18 @@ pub async fn search(
|
||||||
State(state): State<Arc<IndexServer>>,
|
State(state): State<Arc<IndexServer>>,
|
||||||
query: Query<SearchQuery>,
|
query: Query<SearchQuery>,
|
||||||
) -> Result<Json<SearchResponse>, StatusCode> {
|
) -> Result<Json<SearchResponse>, StatusCode> {
|
||||||
let Ok(serp) = state.search(
|
match state.search(
|
||||||
&query.q,
|
&query.q,
|
||||||
query.limit.unwrap_or(20),
|
query.limit.unwrap_or(20),
|
||||||
query.offset.unwrap_or(0),
|
query.offset.unwrap_or(0),
|
||||||
) else {
|
) {
|
||||||
return Err(StatusCode::NOT_IMPLEMENTED);
|
Ok(serp) => Ok(Json(serp)),
|
||||||
};
|
Err(IndexServerError::NotReady) => Err(StatusCode::NOT_IMPLEMENTED),
|
||||||
|
Err(IndexServerError::TantivyError(err)) => {
|
||||||
Ok(Json(serp))
|
warn!("{}", err);
|
||||||
|
Err(StatusCode::INTERNAL_SERVER_ERROR)
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
struct IndexServerImpl {
|
struct IndexServerImpl {
|
||||||
|
|
@ -119,17 +122,19 @@ impl IndexServerImpl {
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn search(&self, q: &str, limit: usize, offset: usize) -> tantivy::Result<SearchResponse> {
|
pub fn search(&self, q: &str, limit: usize, offset: usize) -> tantivy::Result<SearchResponse> {
|
||||||
let query = self
|
let query = self.query_parser.parse_query(q)?;
|
||||||
.query_parser
|
self.search_with_query(&query, limit, offset)
|
||||||
.parse_query(q)
|
}
|
||||||
.expect("Parsing the query failed");
|
|
||||||
|
pub fn search_with_query(
|
||||||
|
&self,
|
||||||
|
q: &dyn tantivy::query::Query,
|
||||||
|
limit: usize,
|
||||||
|
offset: usize,
|
||||||
|
) -> tantivy::Result<SearchResponse> {
|
||||||
let searcher = self.reader.searcher();
|
let searcher = self.reader.searcher();
|
||||||
let (top_docs, num_hits) = {
|
let (top_docs, num_hits) =
|
||||||
searcher.search(
|
{ searcher.search(q, &(TopDocs::with_limit(limit).and_offset(offset), Count))? };
|
||||||
&query,
|
|
||||||
&(TopDocs::with_limit(limit).and_offset(offset), Count),
|
|
||||||
)?
|
|
||||||
};
|
|
||||||
let hits: Vec<Hit> = {
|
let hits: Vec<Hit> = {
|
||||||
top_docs
|
top_docs
|
||||||
.iter()
|
.iter()
|
||||||
|
|
@ -179,8 +184,15 @@ impl IndexServer {
|
||||||
Self {}
|
Self {}
|
||||||
}
|
}
|
||||||
|
|
||||||
fn get_cell(&self) -> Option<&IndexServerImpl> {
|
fn with_impl<T, F>(&self, op: F) -> Result<T, IndexServerError>
|
||||||
IMPL.get()
|
where
|
||||||
|
F: FnOnce(&IndexServerImpl) -> Result<T, IndexServerError>,
|
||||||
|
{
|
||||||
|
if let Some(imp) = IMPL.get() {
|
||||||
|
op(imp)
|
||||||
|
} else {
|
||||||
|
Err(IndexServerError::NotReady)
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async fn worker() -> IndexServerImpl {
|
async fn worker() -> IndexServerImpl {
|
||||||
|
|
@ -199,17 +211,41 @@ impl IndexServer {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn language_query(&self, language: &str) -> Result<Box<TermQuery>, IndexServerError> {
|
||||||
|
self.with_impl(|imp| {
|
||||||
|
Ok(Box::new(TermQuery::new(
|
||||||
|
Term::from_field_text(imp.field_language, language),
|
||||||
|
IndexRecordOption::WithFreqsAndPositions,
|
||||||
|
)))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn body_query(&self, tokens: &[String]) -> Result<Box<TermSetQuery>, IndexServerError> {
|
||||||
|
self.with_impl(|imp| {
|
||||||
|
Ok(Box::new(TermSetQuery::new(
|
||||||
|
tokens
|
||||||
|
.iter()
|
||||||
|
.map(|x| Term::from_field_text(imp.field_body, x)),
|
||||||
|
)))
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
pub fn search(
|
pub fn search(
|
||||||
&self,
|
&self,
|
||||||
q: &str,
|
q: &str,
|
||||||
limit: usize,
|
limit: usize,
|
||||||
offset: usize,
|
offset: usize,
|
||||||
) -> Result<SearchResponse, IndexServerError> {
|
) -> Result<SearchResponse, IndexServerError> {
|
||||||
if let Some(imp) = self.get_cell() {
|
self.with_impl(|imp| Ok(imp.search(q, limit, offset)?))
|
||||||
Ok(imp.search(q, limit, offset)?)
|
|
||||||
} else {
|
|
||||||
Err(IndexServerError::NotReady)
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn search_with_query(
|
||||||
|
&self,
|
||||||
|
q: &dyn tantivy::query::Query,
|
||||||
|
limit: usize,
|
||||||
|
offset: usize,
|
||||||
|
) -> Result<SearchResponse, IndexServerError> {
|
||||||
|
self.with_impl(|imp| Ok(imp.search_with_query(q, limit, offset)?))
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -218,6 +254,6 @@ pub enum IndexServerError {
|
||||||
#[error("index not ready")]
|
#[error("index not ready")]
|
||||||
NotReady,
|
NotReady,
|
||||||
|
|
||||||
#[error("underlying tantivy error")]
|
#[error("{0}")]
|
||||||
TantivyError(#[from] tantivy::TantivyError),
|
TantivyError(#[from] tantivy::TantivyError),
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue