feat(scheduler): add a tqdm bar for scheduler job to better present the remaining time. (#754)

* feat(scheduler): add a tqdm bar for scheduler job to better present the
remaining time.

* update

* add changelog
refactor-extract-code
Meng Zhang 2023-11-10 11:52:07 -08:00 committed by GitHub
parent 3600ef77fc
commit 73a76a3d8e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 58 additions and 10 deletions

View File

@ -2,6 +2,9 @@
## Features ## Features
## Fixes and Improvements
* Fix the slow repository indexing caused by the constrained memory arena in the tantivy index writer.
# v0.5.5 # v0.5.5
## Fixes and Improvements ## Fixes and Improvements

21
Cargo.lock generated
View File

@ -2041,6 +2041,16 @@ dependencies = [
"wasm-bindgen", "wasm-bindgen",
] ]
[[package]]
name = "kdam"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e352f4e1acc6a3d0919eaeb014ca63e5da9450a12ef7106fe2936a07a1648d44"
dependencies = [
"terminal_size",
"windows-sys 0.48.0",
]
[[package]] [[package]]
name = "kv-log-macro" name = "kv-log-macro"
version = "1.0.7" version = "1.0.7"
@ -4152,6 +4162,7 @@ dependencies = [
"file-rotate", "file-rotate",
"ignore", "ignore",
"job_scheduler", "job_scheduler",
"kdam",
"lazy_static", "lazy_static",
"serde-jsonlines", "serde-jsonlines",
"serde_json", "serde_json",
@ -4360,6 +4371,16 @@ dependencies = [
"winapi-util", "winapi-util",
] ]
[[package]]
name = "terminal_size"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
dependencies = [
"rustix 0.38.17",
"windows-sys 0.48.0",
]
[[package]] [[package]]
name = "textdistance" name = "textdistance"
version = "1.0.2" version = "1.0.2"

View File

@ -22,6 +22,7 @@ tree-sitter-typescript = "0.20.3"
tree-sitter-go = "0.20.0" tree-sitter-go = "0.20.0"
tree-sitter-ruby= "0.20.0" tree-sitter-ruby= "0.20.0"
ignore = "0.4.20" ignore = "0.4.20"
kdam = { version = "0.5.0" }
[dev-dependencies] [dev-dependencies]
temp_testdir = "0.2" temp_testdir = "0.2"

View File

@ -8,6 +8,7 @@ use std::{
use anyhow::Result; use anyhow::Result;
use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate}; use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate};
use ignore::{DirEntry, Walk}; use ignore::{DirEntry, Walk};
use kdam::BarExt;
use lazy_static::lazy_static; use lazy_static::lazy_static;
use serde_jsonlines::WriteExt; use serde_jsonlines::WriteExt;
use tabby_common::{ use tabby_common::{
@ -15,9 +16,11 @@ use tabby_common::{
path::dataset_dir, path::dataset_dir,
SourceFile, SourceFile,
}; };
use tracing::{error, info}; use tracing::error;
use tree_sitter_tags::{TagsConfiguration, TagsContext}; use tree_sitter_tags::{TagsConfiguration, TagsContext};
use crate::utils::tqdm;
trait RepositoryExt { trait RepositoryExt {
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>; fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
} }
@ -26,19 +29,24 @@ impl RepositoryExt for RepositoryConfig {
fn create_dataset(&self, writer: &mut impl Write) -> Result<()> { fn create_dataset(&self, writer: &mut impl Write) -> Result<()> {
let dir = self.dir(); let dir = self.dir();
info!("Start indexing repository {}", self.git_url); let walk_dir_iter = || {
let walk_dir = Walk::new(dir.as_path()) Walk::new(dir.as_path())
.filter_map(Result::ok) .filter_map(Result::ok)
.filter(is_source_code); .filter(is_source_code)
};
let mut pb = tqdm(walk_dir_iter().count());
let walk_dir = walk_dir_iter();
let mut context = TagsContext::new(); let mut context = TagsContext::new();
for entry in walk_dir { for entry in walk_dir {
pb.update(1)?;
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap(); let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
let language = get_language(relative_path.extension().unwrap()) let language = get_language(relative_path.extension().unwrap())
.unwrap() .unwrap()
.to_owned(); .to_owned();
if let Ok(file_content) = read_to_string(entry.path()) { if let Ok(file_content) = read_to_string(entry.path()) {
info!("Building {:?}", relative_path);
let source_file = SourceFile { let source_file = SourceFile {
git_url: self.git_url.clone(), git_url: self.git_url.clone(),
filepath: relative_path.display().to_string(), filepath: relative_path.display().to_string(),

View File

@ -1,6 +1,7 @@
use std::fs; use std::fs;
use anyhow::Result; use anyhow::Result;
use kdam::BarExt;
use tabby_common::{ use tabby_common::{
config::Config, config::Config,
index::{register_tokenizers, CodeSearchSchema}, index::{register_tokenizers, CodeSearchSchema},
@ -9,6 +10,8 @@ use tabby_common::{
}; };
use tantivy::{directory::MmapDirectory, doc, Index}; use tantivy::{directory::MmapDirectory, doc, Index};
use crate::utils::tqdm;
// Magic numbers // Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300; static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32; static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
@ -22,10 +25,13 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
let index = Index::open_or_create(directory, code.schema)?; let index = Index::open_or_create(directory, code.schema)?;
register_tokenizers(&index); register_tokenizers(&index);
// Initialize the search index writer with an initial arena size of 150 MB.
let mut writer = index.writer(150_000_000)?; let mut writer = index.writer(150_000_000)?;
writer.delete_all_documents()?; writer.delete_all_documents()?;
let mut pb = tqdm(SourceFile::all()?.count());
for file in SourceFile::all()? { for file in SourceFile::all()? {
pb.update(1)?;
if file.max_line_length > MAX_LINE_LENGTH_THRESHOLD { if file.max_line_length > MAX_LINE_LENGTH_THRESHOLD {
continue; continue;
} }
@ -47,6 +53,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
} }
writer.commit()?; writer.commit()?;
writer.wait_merging_threads()?;
Ok(()) Ok(())
} }

View File

@ -1,6 +1,7 @@
mod dataset; mod dataset;
mod index; mod index;
mod repository; mod repository;
mod utils;
use anyhow::Result; use anyhow::Result;
use job_scheduler::{Job, JobScheduler}; use job_scheduler::{Job, JobScheduler};
@ -12,26 +13,28 @@ pub async fn scheduler(now: bool) -> Result<()> {
let mut scheduler = JobScheduler::new(); let mut scheduler = JobScheduler::new();
let job1 = || { let job1 = || {
info!("Syncing repositories..."); println!("Syncing repositories...");
let ret = repository::sync_repositories(&config); let ret = repository::sync_repositories(&config);
if let Err(err) = ret { if let Err(err) = ret {
error!("Failed to sync repositories, err: '{}'", err); error!("Failed to sync repositories, err: '{}'", err);
return; return;
} }
info!("Building dataset..."); println!("Building dataset...");
let ret = dataset::create_dataset(&config); let ret = dataset::create_dataset(&config);
if let Err(err) = ret { if let Err(err) = ret {
error!("Failed to build dataset, err: '{}'", err); error!("Failed to build dataset, err: '{}'", err);
} }
println!();
}; };
let job2 = || { let job2 = || {
info!("Indexing repositories..."); println!("Indexing repositories...");
let ret = index::index_repositories(&config); let ret = index::index_repositories(&config);
if let Err(err) = ret { if let Err(err) = ret {
error!("Failed to index repositories, err: '{}'", err); error!("Failed to index repositories, err: '{}'", err);
} }
println!()
}; };
if now { if now {

View File

@ -0,0 +1,5 @@
use kdam::{tqdm, Bar};
/// Builds the progress bar used by the scheduler's dataset and index jobs.
///
/// Thin wrapper around kdam's `tqdm!` macro that keeps the bar settings in one
/// place: `total` is passed through as the bar's total, `ncols` is set to 40 and
/// `force_refresh` is enabled. Callers advance the returned [`Bar`] themselves
/// (via `BarExt::update`).
///
/// NOTE(review): `ncols` is presumably the bar width in terminal columns and
/// `force_refresh` presumably redraws the bar on every update — confirm against
/// the kdam documentation.
pub fn tqdm(total: usize) -> Bar {
tqdm!(total = total, ncols = 40, force_refresh = true)
}