add cache for stop words regexp
parent
030f694261
commit
2430a18599
|
|
@ -578,6 +578,7 @@ dependencies = [
|
||||||
"cmake",
|
"cmake",
|
||||||
"cxx",
|
"cxx",
|
||||||
"cxx-build",
|
"cxx-build",
|
||||||
|
"dashmap",
|
||||||
"derive_builder",
|
"derive_builder",
|
||||||
"regex",
|
"regex",
|
||||||
"rust-cxx-cmake-bridge",
|
"rust-cxx-cmake-bridge",
|
||||||
|
|
@ -665,6 +666,19 @@ dependencies = [
|
||||||
"syn 1.0.109",
|
"syn 1.0.109",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "dashmap"
|
||||||
|
version = "5.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "907076dfda823b0b36d2a1bb5f90c96660a5bbcd7729e10727f07858f22c4edc"
|
||||||
|
dependencies = [
|
||||||
|
"cfg-if",
|
||||||
|
"hashbrown",
|
||||||
|
"lock_api",
|
||||||
|
"once_cell",
|
||||||
|
"parking_lot_core",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "derive_builder"
|
name = "derive_builder"
|
||||||
version = "0.12.0"
|
version = "0.12.0"
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,7 @@ edition = "2021"
|
||||||
|
|
||||||
[dependencies]
|
[dependencies]
|
||||||
cxx = "1.0"
|
cxx = "1.0"
|
||||||
|
dashmap = "5.4.0"
|
||||||
derive_builder = "0.12.0"
|
derive_builder = "0.12.0"
|
||||||
regex = "1.8.4"
|
regex = "1.8.4"
|
||||||
tokenizers = "0.13.3"
|
tokenizers = "0.13.3"
|
||||||
|
|
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
use dashmap::DashMap;
|
||||||
use regex::Regex;
|
use regex::Regex;
|
||||||
use tokenizers::tokenizer::Tokenizer;
|
use tokenizers::tokenizer::Tokenizer;
|
||||||
use tokio_util::sync::CancellationToken;
|
use tokio_util::sync::CancellationToken;
|
||||||
|
|
@ -91,6 +92,7 @@ impl InferenceContext {
|
||||||
pub struct TextInferenceEngine {
|
pub struct TextInferenceEngine {
|
||||||
engine: cxx::SharedPtr<ffi::TextInferenceEngine>,
|
engine: cxx::SharedPtr<ffi::TextInferenceEngine>,
|
||||||
tokenizer: Tokenizer,
|
tokenizer: Tokenizer,
|
||||||
|
stop_regex_cache: DashMap<&'static Vec<&'static str>, Regex>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl TextInferenceEngine {
|
impl TextInferenceEngine {
|
||||||
|
|
@ -104,6 +106,7 @@ impl TextInferenceEngine {
|
||||||
);
|
);
|
||||||
return TextInferenceEngine {
|
return TextInferenceEngine {
|
||||||
engine,
|
engine,
|
||||||
|
stop_regex_cache: DashMap::new(),
|
||||||
tokenizer: Tokenizer::from_file(&options.tokenizer_path).unwrap(),
|
tokenizer: Tokenizer::from_file(&options.tokenizer_path).unwrap(),
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
@ -116,10 +119,11 @@ impl TextInferenceEngine {
|
||||||
let cancel_for_inference = cancel.clone();
|
let cancel_for_inference = cancel.clone();
|
||||||
let _guard = cancel.drop_guard();
|
let _guard = cancel.drop_guard();
|
||||||
|
|
||||||
let stop_re = if options.stop_words.is_empty() {
|
let stop_re: Option<Regex> = if options.stop_words.is_empty() {
|
||||||
None
|
None
|
||||||
} else {
|
} else {
|
||||||
// FIXME(meng): consider cache the regexp.
|
let mut re = self.stop_regex_cache.get(options.stop_words);
|
||||||
|
if re.is_none() {
|
||||||
let encodings = self
|
let encodings = self
|
||||||
.tokenizer
|
.tokenizer
|
||||||
.encode_batch(options.stop_words.clone(), false)
|
.encode_batch(options.stop_words.clone(), false)
|
||||||
|
|
@ -133,7 +137,11 @@ impl TextInferenceEngine {
|
||||||
|
|
||||||
// \A means absolute begins of string.
|
// \A means absolute begins of string.
|
||||||
let regex_string = r"(?m)\A".to_owned() + &stop_tokens.join("|");
|
let regex_string = r"(?m)\A".to_owned() + &stop_tokens.join("|");
|
||||||
Some(Regex::new(&regex_string).unwrap())
|
let regex = Regex::new(&regex_string).unwrap();
|
||||||
|
self.stop_regex_cache.insert(options.stop_words, regex);
|
||||||
|
re = self.stop_regex_cache.get(options.stop_words);
|
||||||
|
}
|
||||||
|
re.map(|x| x.value().clone())
|
||||||
};
|
};
|
||||||
|
|
||||||
let context = InferenceContext::new(stop_re, cancel_for_inference);
|
let context = InferenceContext::new(stop_re, cancel_for_inference);
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue