tabby/crates/tabby-scheduler/src/index.rs

79 lines
2.5 KiB
Rust

use std::{collections::HashMap, fs};
use anyhow::Result;
use lazy_static::lazy_static;
use tabby_common::{config::Config, path::index_dir, Document};
use tantivy::{
directory::MmapDirectory,
doc,
schema::{Schema, STORED, STRING, TEXT},
Index,
};
/// Rebuilds the code-snippet search index from scratch.
///
/// Opens (or creates) the tantivy index stored under `index_dir()`, deletes all
/// previously indexed documents, adds one index entry per eligible tag of every
/// `Document` returned by `Document::all()`, and commits the result.
///
/// A tag is indexed only when all of the following hold:
/// - its name is at least 5 bytes long,
/// - its body spans between 3 and 10 lines (inclusive),
/// - its name is not listed in `LANGUAGE_NAME_BLACKLIST` for the document's language.
///
/// Tags whose `name_range` or `range` cannot be sliced out of the document content
/// (`get` returns `None`, e.g. out of bounds) are skipped rather than aborting the
/// whole indexing run with a panic.
///
/// `_config` is currently unused.
///
/// # Errors
///
/// Propagates any error from creating/opening the index directory, opening the
/// index, creating the writer, loading the documents, adding a document, or
/// committing.
pub fn index_repositories(_config: &Config) -> Result<()> {
    let mut builder = Schema::builder();

    // `STRING` fields are indexed as-is; only `body` uses the tokenized `TEXT` option.
    let field_git_url = builder.add_text_field("git_url", STRING | STORED);
    let field_filepath = builder.add_text_field("filepath", STRING | STORED);
    let field_language = builder.add_text_field("language", STRING | STORED);
    let field_name = builder.add_text_field("name", STRING | STORED);
    let field_kind = builder.add_text_field("kind", STRING | STORED);
    let field_body = builder.add_text_field("body", TEXT | STORED);

    let schema = builder.build();

    fs::create_dir_all(index_dir())?;
    let directory = MmapDirectory::open(index_dir())?;
    let index = Index::open_or_create(directory, schema)?;
    let mut writer = index.writer(10_000_000)?;

    // Start from a clean slate: the index is rebuilt in full on every run.
    writer.delete_all_documents()?;

    for doc in Document::all()? {
        // Both values depend only on the document, so compute them once per
        // document instead of once per tag.
        let language = reduce_language_if_needed(&doc.language);
        let blacklist = LANGUAGE_NAME_BLACKLIST.get(doc.language.as_str());

        for tag in doc.tags {
            // A range that does not fit the content must not bring down the
            // whole indexing job; skip just this tag.
            let name = match doc.content.get(tag.name_range) {
                Some(name) => name,
                None => continue,
            };
            if name.len() < 5 {
                continue;
            }

            let body = match doc.content.get(tag.range) {
                Some(body) => body,
                None => continue,
            };
            let count_body_lines = body.lines().count();
            if !(3..=10).contains(&count_body_lines) {
                continue;
            }

            if let Some(names) = blacklist {
                if names.contains(&name) {
                    continue;
                }
            }

            writer.add_document(doc!(
                field_git_url => doc.git_url.clone(),
                field_filepath => doc.filepath.clone(),
                field_language => language,
                field_name => name,
                field_body => body,
                field_kind => tag.syntax_type_name,
            ))?;
        }
    }

    writer.commit()?;

    Ok(())
}
/// Maps the JavaScript/TypeScript family of language names onto one shared name.
///
/// Returns `"javascript-typescript"` when `language` is exactly `"javascript"`,
/// `"jsx"`, `"typescript"` or `"tsx"`; any other input is returned unchanged.
/// The comparison is case-sensitive.
fn reduce_language_if_needed(language: &str) -> &str {
    match language {
        "javascript" | "jsx" | "typescript" | "tsx" => "javascript-typescript",
        other => other,
    }
}
lazy_static! {
    /// Per-language list of tag names that are never indexed.
    ///
    /// Keys are language names as they appear on a document; values are the
    /// exact tag names to exclude for that language.
    static ref LANGUAGE_NAME_BLACKLIST: HashMap<&'static str, Vec<&'static str>> = {
        let mut excluded_names = HashMap::new();
        excluded_names.insert("python", vec!["__init__"]);
        excluded_names
    };
}