diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e7d205..097d4f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,9 @@ ## Features +## Fixes and Improvements +* Fix the slow repository indexing due to the constrained memory arena in the tantivy index writer. + # v0.5.5 ## Fixes and Improvements diff --git a/Cargo.lock b/Cargo.lock index 609ed6c..21de5d3 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2041,6 +2041,16 @@ dependencies = [ "wasm-bindgen", ] +[[package]] +name = "kdam" +version = "0.5.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e352f4e1acc6a3d0919eaeb014ca63e5da9450a12ef7106fe2936a07a1648d44" +dependencies = [ + "terminal_size", + "windows-sys 0.48.0", +] + [[package]] name = "kv-log-macro" version = "1.0.7" @@ -4152,6 +4162,7 @@ dependencies = [ "file-rotate", "ignore", "job_scheduler", + "kdam", "lazy_static", "serde-jsonlines", "serde_json", @@ -4360,6 +4371,16 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "terminal_size" +version = "0.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7" +dependencies = [ + "rustix 0.38.17", + "windows-sys 0.48.0", +] + [[package]] name = "textdistance" version = "1.0.2" diff --git a/Cargo.toml b/Cargo.toml index dae1755..ec445aa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -34,4 +34,4 @@ futures = "0.3.28" async-stream = "0.3.5" regex = "1.10.0" thiserror = "1.0.49" -utoipa = "3.3" \ No newline at end of file +utoipa = "3.3" diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index e0c701c..912dd02 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -22,6 +22,7 @@ tree-sitter-typescript = "0.20.3" tree-sitter-go = "0.20.0" tree-sitter-ruby= "0.20.0" ignore = "0.4.20" +kdam = { version = "0.5.0" } [dev-dependencies] temp_testdir = "0.2" diff --git a/crates/tabby-scheduler/src/dataset.rs 
b/crates/tabby-scheduler/src/dataset.rs index 9366e86..bddbb97 100644 --- a/crates/tabby-scheduler/src/dataset.rs +++ b/crates/tabby-scheduler/src/dataset.rs @@ -8,6 +8,7 @@ use std::{ use anyhow::Result; use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate}; use ignore::{DirEntry, Walk}; +use kdam::BarExt; use lazy_static::lazy_static; use serde_jsonlines::WriteExt; use tabby_common::{ @@ -15,9 +16,11 @@ use tabby_common::{ path::dataset_dir, SourceFile, }; -use tracing::{error, info}; +use tracing::error; use tree_sitter_tags::{TagsConfiguration, TagsContext}; +use crate::utils::tqdm; + trait RepositoryExt { fn create_dataset(&self, writer: &mut impl Write) -> Result<()>; } @@ -26,19 +29,24 @@ impl RepositoryExt for RepositoryConfig { fn create_dataset(&self, writer: &mut impl Write) -> Result<()> { let dir = self.dir(); - info!("Start indexing repository {}", self.git_url); - let walk_dir = Walk::new(dir.as_path()) - .filter_map(Result::ok) - .filter(is_source_code); + let walk_dir_iter = || { + Walk::new(dir.as_path()) + .filter_map(Result::ok) + .filter(is_source_code) + }; + + let mut pb = tqdm(walk_dir_iter().count()); + let walk_dir = walk_dir_iter(); let mut context = TagsContext::new(); for entry in walk_dir { + pb.update(1)?; + let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap(); let language = get_language(relative_path.extension().unwrap()) .unwrap() .to_owned(); if let Ok(file_content) = read_to_string(entry.path()) { - info!("Building {:?}", relative_path); let source_file = SourceFile { git_url: self.git_url.clone(), filepath: relative_path.display().to_string(), diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs index 85feba8..f9de49e 100644 --- a/crates/tabby-scheduler/src/index.rs +++ b/crates/tabby-scheduler/src/index.rs @@ -1,6 +1,7 @@ use std::fs; use anyhow::Result; +use kdam::BarExt; use tabby_common::{ config::Config, index::{register_tokenizers, 
CodeSearchSchema}, @@ -9,6 +10,8 @@ use tabby_common::{ }; use tantivy::{directory::MmapDirectory, doc, Index}; +use crate::utils::tqdm; + // Magic numbers static MAX_LINE_LENGTH_THRESHOLD: usize = 300; static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32; @@ -22,10 +25,13 @@ pub fn index_repositories(_config: &Config) -> Result<()> { let index = Index::open_or_create(directory, code.schema)?; register_tokenizers(&index); + // Initialize the search index writer with an initial arena size of 150 MB. let mut writer = index.writer(150_000_000)?; writer.delete_all_documents()?; + let mut pb = tqdm(SourceFile::all()?.count()); for file in SourceFile::all()? { + pb.update(1)?; if file.max_line_length > MAX_LINE_LENGTH_THRESHOLD { continue; } @@ -47,6 +53,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> { } writer.commit()?; + writer.wait_merging_threads()?; Ok(()) } diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index f9d12c5..b15c250 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -1,6 +1,7 @@ mod dataset; mod index; mod repository; +mod utils; use anyhow::Result; use job_scheduler::{Job, JobScheduler}; @@ -12,26 +13,28 @@ pub async fn scheduler(now: bool) -> Result<()> { let mut scheduler = JobScheduler::new(); let job1 = || { - info!("Syncing repositories..."); + println!("Syncing repositories..."); let ret = repository::sync_repositories(&config); if let Err(err) = ret { error!("Failed to sync repositories, err: '{}'", err); return; } - info!("Building dataset..."); + println!("Building dataset..."); let ret = dataset::create_dataset(&config); if let Err(err) = ret { error!("Failed to build dataset, err: '{}'", err); } + println!(); }; let job2 = || { - info!("Indexing repositories..."); + println!("Indexing repositories..."); let ret = index::index_repositories(&config); if let Err(err) = ret { error!("Failed to index repositories, err: '{}'", err); } + println!() }; if now { diff 
--git a/crates/tabby-scheduler/src/utils.rs b/crates/tabby-scheduler/src/utils.rs new file mode 100644 index 0000000..b0f3b82 --- /dev/null +++ b/crates/tabby-scheduler/src/utils.rs @@ -0,0 +1,5 @@ +use kdam::{tqdm, Bar}; + +pub fn tqdm(total: usize) -> Bar { + tqdm!(total = total, ncols = 40, force_refresh = true) +}