feat: switching index to tag based (#265)

* add tag based index

* update for python

* fix lint
sweep/improve-logging-information
Meng Zhang 2023-07-13 14:25:39 +08:00 committed by GitHub
parent be90047477
commit 24d17dc1ed
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 39 additions and 50 deletions

11
Cargo.lock generated
View File

@ -2847,7 +2847,6 @@ dependencies = [
"temp_testdir", "temp_testdir",
"tracing", "tracing",
"tracing-test", "tracing-test",
"tree-sitter-java",
"tree-sitter-javascript", "tree-sitter-javascript",
"tree-sitter-python", "tree-sitter-python",
"tree-sitter-tags", "tree-sitter-tags",
@ -3507,16 +3506,6 @@ dependencies = [
"regex", "regex",
] ]
[[package]]
name = "tree-sitter-java"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0bf5d3f508cbffcbfe1805834101c0d24297a8b6c2184ad9c595556c46d2420"
dependencies = [
"cc",
"tree-sitter",
]
[[package]] [[package]]
name = "tree-sitter-javascript" name = "tree-sitter-javascript"
version = "0.20.0" version = "0.20.0"

View File

@ -19,7 +19,6 @@ lazy_static = { workspace = true }
serde = { workspace = true } serde = { workspace = true }
serde-jsonlines = { workspace = true } serde-jsonlines = { workspace = true }
file-rotate = "0.7.5" file-rotate = "0.7.5"
tree-sitter-java = "0.20.0"
tree-sitter-python = "0.20.2" tree-sitter-python = "0.20.2"
[dev-dependencies] [dev-dependencies]

View File

@ -207,7 +207,8 @@ lazy_static! {
("sql", vec!["sql"]), ("sql", vec!["sql"]),
("scala", vec!["scala"]), ("scala", vec!["scala"]),
("shellscript", vec!["sh", "bash", "command", "zsh"]), ("shellscript", vec!["sh", "bash", "command", "zsh"]),
("typescript", vec!["ts", "tsx"]), ("typescript", vec!["ts"]),
("tsx", vec!["tsx"]),
("tex", vec!["tex"]), ("tex", vec!["tex"]),
("vb", vec!["vb"]), ("vb", vec!["vb"]),
]) ])
@ -223,19 +224,7 @@ lazy_static! {
map map
}; };
static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = { static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
HashMap::from([ HashMap::from([(
(
"javascript",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_javascript::language(),
tree_sitter_javascript::TAGGING_QUERY,
tree_sitter_javascript::LOCALS_QUERY,
)
.unwrap(),
),
),
(
"python", "python",
TagsConfigurationSync( TagsConfigurationSync(
TagsConfiguration::new( TagsConfiguration::new(
@ -245,18 +234,6 @@ lazy_static! {
) )
.unwrap(), .unwrap(),
), ),
), )])
(
"java",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_java::language(),
tree_sitter_java::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
])
}; };
} }

View File

@ -1,6 +1,7 @@
use std::fs; use std::{collections::HashMap, fs};
use anyhow::Result; use anyhow::Result;
use lazy_static::lazy_static;
use tabby_common::{config::Config, path::index_dir, Document}; use tabby_common::{config::Config, path::index_dir, Document};
use tantivy::{ use tantivy::{
directory::MmapDirectory, directory::MmapDirectory,
@ -11,8 +12,8 @@ use tantivy::{
pub fn index_repositories(_config: &Config) -> Result<()> { pub fn index_repositories(_config: &Config) -> Result<()> {
let mut builder = Schema::builder(); let mut builder = Schema::builder();
let git_url = builder.add_text_field("git_url", STRING | STORED); let git_url = builder.add_text_field("name", STRING | STORED);
let filepath = builder.add_text_field("filepath", STRING | STORED); let filepath = builder.add_text_field("body", STRING | STORED);
let content = builder.add_text_field("content", TEXT | STORED); let content = builder.add_text_field("content", TEXT | STORED);
let language = builder.add_text_field("language", TEXT | STORED); let language = builder.add_text_field("language", TEXT | STORED);
let schema = builder.build(); let schema = builder.build();
@ -24,6 +25,24 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
writer.delete_all_documents()?; writer.delete_all_documents()?;
for doc in Document::all()? { for doc in Document::all()? {
for tag in doc.tags {
let name = doc.content.get(tag.name_range).unwrap();
if name.len() < 5 {
continue;
}
let body = doc.content.get(tag.range).unwrap();
let count_body_lines = body.lines().count();
if !(3..=10).contains(&count_body_lines) {
continue;
}
if let Some(blacklist) = LANGUAGE_NAME_BLACKLIST.get(doc.language.as_str()) {
if blacklist.contains(&name) {
continue;
}
}
}
writer.add_document(doc!( writer.add_document(doc!(
git_url => doc.git_url, git_url => doc.git_url,
filepath => doc.filepath, filepath => doc.filepath,
@ -36,3 +55,8 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
Ok(()) Ok(())
} }
lazy_static! {
    /// Tag names that are skipped during indexing, grouped by language.
    ///
    /// Keys are the language identifiers stored on each document; values are
    /// the tag names that `index_repositories` ignores for that language.
    static ref LANGUAGE_NAME_BLACKLIST: HashMap<&'static str, Vec<&'static str>> = {
        let mut names_by_language = HashMap::new();
        names_by_language.insert("python", vec!["__init__"]);
        names_by_language
    };
}