fix: add additional whitespace to match tokens that combine space and li… (#270)

* fix: add additional whitespace to match tokens that combine spaces and line breaks

* fix lint
sweep/improve-logging-information
Meng Zhang 2023-06-24 18:15:52 -07:00 committed by GitHub
parent af517fb15b
commit 9ca1f7e5f1
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 35 additions and 5 deletions

View File

@ -23,7 +23,6 @@ pub async fn scheduler(now: bool) -> Result<()> {
let ret = dataset::create_dataset(&config);
if let Err(err) = ret {
error!("Failed to build dataset, err: '{}'", err);
return;
}
};

View File

@ -3,18 +3,37 @@ use std::collections::HashMap;
use lazy_static::lazy_static;
lazy_static! {
static ref DEFAULT: Vec<&'static str> = vec!("\n\n");
static ref DEFAULT: Vec<&'static str> = vec![
"\n\n",
"\n\n ",
"\n\n ",
"\n\n ",
"\n\n ",
"\n\n ",
"\n\n ",
"\n\n ",
"\n\n",
"\n\n\t",
"\n\n\t\t",
"\n\n\t\t\t",
"\n\n\t\t\t\t",
"\n\n\t\t\t\t\t",
"\n\n\t\t\t\t\t\t",
"\n\n\t\t\t\t\t\t\t",
];
static ref LANGUAGES: HashMap<&'static str, Vec<&'static str>> = {
let mut map = HashMap::new();
map.insert("python", vec!["\n\n", "\ndef", "\n#", "\nfrom", "\nclass"]);
map.insert(
"python",
vec!["\ndef", "\n#", "\nfrom", "\nclass"].with_default(),
);
map.insert(
"javascript",
vec!["\n\n", "\nfunction", "\n//", "\nimport", "\nclass"],
vec!["\nfunction", "\n//", "\nimport", "\nclass"],
);
map.insert(
"typescript",
vec![
"\n\n",
"\nfunction",
"\n//",
"\nimport",
@ -27,6 +46,18 @@ lazy_static! {
};
}
/// Extension trait for values that can be combined with a set of defaults.
///
/// In this file it is implemented for `Vec<&'static str>`, where the
/// implementation appends the entries of `DEFAULT` to the list.
trait WithDefault {
/// Consumes `self` and returns it with the default entries merged in.
fn with_default(self) -> Self;
}
impl WithDefault for Vec<&'static str> {
    /// Appends every entry of `DEFAULT` after the entries already in the list
    /// and returns the combined list.
    ///
    /// Existing entries keep their order; the defaults follow in the order
    /// they appear in `DEFAULT`.
    fn with_default(mut self) -> Self {
        // Copy the `&'static str` entries straight out of the shared list
        // instead of cloning the whole vector into a temporary first.
        self.extend_from_slice(DEFAULT.as_slice());
        self
    }
}
/// Looks up the stop words registered for `language`.
///
/// Returns the entry stored in `LANGUAGES` under `language`; when no entry
/// exists for that key, the shared `DEFAULT` list is returned instead.
pub fn get_stop_words(language: &str) -> &'static Vec<&'static str> {
    match LANGUAGES.get(language) {
        Some(stop_words) => stop_words,
        // Explicitly dereference the `lazy_static!` wrapper to reach the vector.
        None => &*DEFAULT,
    }
}