feat: switch index to tag-based (#265)

* add tag based index

* update for python

* fix lint
sweep/improve-logging-information
Meng Zhang 2023-07-13 14:25:39 +08:00 committed by GitHub
parent be90047477
commit 24d17dc1ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 50 deletions

11
Cargo.lock generated
View File

@ -2847,7 +2847,6 @@ dependencies = [
"temp_testdir",
"tracing",
"tracing-test",
"tree-sitter-java",
"tree-sitter-javascript",
"tree-sitter-python",
"tree-sitter-tags",
@ -3507,16 +3506,6 @@ dependencies = [
"regex",
]
[[package]]
name = "tree-sitter-java"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0bf5d3f508cbffcbfe1805834101c0d24297a8b6c2184ad9c595556c46d2420"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-javascript"
version = "0.20.0"

View File

@ -19,7 +19,6 @@ lazy_static = { workspace = true }
serde = { workspace = true }
serde-jsonlines = { workspace = true }
file-rotate = "0.7.5"
tree-sitter-java = "0.20.0"
tree-sitter-python = "0.20.2"
[dev-dependencies]

View File

@ -207,7 +207,8 @@ lazy_static! {
("sql", vec!["sql"]),
("scala", vec!["scala"]),
("shellscript", vec!["sh", "bash", "command", "zsh"]),
("typescript", vec!["ts", "tsx"]),
("typescript", vec!["ts"]),
("tsx", vec!["tsx"]),
("tex", vec!["tex"]),
("vb", vec!["vb"]),
])
@ -223,19 +224,7 @@ lazy_static! {
map
};
static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
HashMap::from([
(
"javascript",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_javascript::language(),
tree_sitter_javascript::TAGGING_QUERY,
tree_sitter_javascript::LOCALS_QUERY,
)
.unwrap(),
),
),
(
HashMap::from([(
"python",
TagsConfigurationSync(
TagsConfiguration::new(
@ -245,18 +234,6 @@ lazy_static! {
)
.unwrap(),
),
),
(
"java",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_java::language(),
tree_sitter_java::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
])
)])
};
}

View File

@ -1,6 +1,7 @@
use std::fs;
use std::{collections::HashMap, fs};
use anyhow::Result;
use lazy_static::lazy_static;
use tabby_common::{config::Config, path::index_dir, Document};
use tantivy::{
directory::MmapDirectory,
@ -11,8 +12,8 @@ use tantivy::{
pub fn index_repositories(_config: &Config) -> Result<()> {
let mut builder = Schema::builder();
let git_url = builder.add_text_field("git_url", STRING | STORED);
let filepath = builder.add_text_field("filepath", STRING | STORED);
let git_url = builder.add_text_field("name", STRING | STORED);
let filepath = builder.add_text_field("body", STRING | STORED);
let content = builder.add_text_field("content", TEXT | STORED);
let language = builder.add_text_field("language", TEXT | STORED);
let schema = builder.build();
@ -24,6 +25,24 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
writer.delete_all_documents()?;
for doc in Document::all()? {
for tag in doc.tags {
let name = doc.content.get(tag.name_range).unwrap();
if name.len() < 5 {
continue;
}
let body = doc.content.get(tag.range).unwrap();
let count_body_lines = body.lines().count();
if !(3..=10).contains(&count_body_lines) {
continue;
}
if let Some(blacklist) = LANGUAGE_NAME_BLACKLIST.get(doc.language.as_str()) {
if blacklist.contains(&name) {
continue;
}
}
}
writer.add_document(doc!(
git_url => doc.git_url,
filepath => doc.filepath,
@ -36,3 +55,8 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
Ok(())
}
lazy_static! {
    /// Tag names to skip while indexing, keyed by language name.
    ///
    /// `index_repositories` looks up the document's language here and skips
    /// any tag whose name appears in the matching list.
    static ref LANGUAGE_NAME_BLACKLIST: HashMap<&'static str, Vec<&'static str>> = {
        let mut skipped_names = HashMap::new();
        skipped_names.insert("python", vec!["__init__"]);
        skipped_names
    };
}