2023-10-06 18:54:12 +00:00
|
|
|
use tantivy::{
|
2023-11-03 01:04:33 +00:00
|
|
|
tokenizer::{NgramTokenizer, RegexTokenizer, RemoveLongFilter, TextAnalyzer},
|
2023-10-06 18:54:12 +00:00
|
|
|
Index,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/// Extension trait that lets an index install the tokenizers it needs.
///
/// In this file it is implemented for `tantivy::Index`.
pub trait IndexExt {
    /// Registers the tokenizers required by the implementor.
    ///
    /// Takes `&self`, so registration must work through shared access.
    fn register_tokenizer(&self);
}
|
|
|
|
|
|
|
|
|
|
/// Name under which the regex-based code analyzer is registered.
pub static CODE_TOKENIZER: &str = "code";

/// Name under which the prefix n-gram identifier analyzer is registered.
pub static IDENTIFIER_TOKENIZER: &str = "identifier";
|
2023-10-06 18:54:12 +00:00
|
|
|
|
|
|
|
|
impl IndexExt for Index {
|
|
|
|
|
fn register_tokenizer(&self) {
|
|
|
|
|
let code_tokenizer = TextAnalyzer::builder(RegexTokenizer::new(r"(?:\w+)").unwrap())
|
|
|
|
|
.filter(RemoveLongFilter::limit(128))
|
|
|
|
|
.build();
|
|
|
|
|
|
|
|
|
|
self.tokenizers().register(CODE_TOKENIZER, code_tokenizer);
|
2023-11-03 01:04:33 +00:00
|
|
|
|
|
|
|
|
let identifier_tokenzier =
|
|
|
|
|
TextAnalyzer::builder(NgramTokenizer::prefix_only(2, 5).unwrap()).build();
|
|
|
|
|
|
|
|
|
|
self.tokenizers()
|
|
|
|
|
.register(IDENTIFIER_TOKENIZER, identifier_tokenzier);
|
2023-10-06 18:54:12 +00:00
|
|
|
}
|
|
|
|
|
}
|