feat(scheduler): add a tqdm bar for scheduler job to better present the remaining time. (#754)

* feat(scheduler): add a tqdm bar for scheduler job to better present the
remaining time.

* update

* add changelog
refactor-extract-code
Meng Zhang 2023-11-10 11:52:07 -08:00 committed by GitHub
parent 3600ef77fc
commit 73a76a3d8e
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
8 changed files with 58 additions and 10 deletions

View File

@ -2,6 +2,9 @@
## Features
## Fixes and Improvements
* Fix the slow repository indexing due to constrained memory arena in tantivy index writer.
# v0.5.5
## Fixes and Improvements

21
Cargo.lock generated
View File

@ -2041,6 +2041,16 @@ dependencies = [
"wasm-bindgen",
]
[[package]]
name = "kdam"
version = "0.5.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e352f4e1acc6a3d0919eaeb014ca63e5da9450a12ef7106fe2936a07a1648d44"
dependencies = [
"terminal_size",
"windows-sys 0.48.0",
]
[[package]]
name = "kv-log-macro"
version = "1.0.7"
@ -4152,6 +4162,7 @@ dependencies = [
"file-rotate",
"ignore",
"job_scheduler",
"kdam",
"lazy_static",
"serde-jsonlines",
"serde_json",
@ -4360,6 +4371,16 @@ dependencies = [
"winapi-util",
]
[[package]]
name = "terminal_size"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
dependencies = [
"rustix 0.38.17",
"windows-sys 0.48.0",
]
[[package]]
name = "textdistance"
version = "1.0.2"

View File

@ -34,4 +34,4 @@ futures = "0.3.28"
async-stream = "0.3.5"
regex = "1.10.0"
thiserror = "1.0.49"
utoipa = "3.3"
utoipa = "3.3"

View File

@ -22,6 +22,7 @@ tree-sitter-typescript = "0.20.3"
tree-sitter-go = "0.20.0"
tree-sitter-ruby= "0.20.0"
ignore = "0.4.20"
kdam = { version = "0.5.0" }
[dev-dependencies]
temp_testdir = "0.2"

View File

@ -8,6 +8,7 @@ use std::{
use anyhow::Result;
use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate};
use ignore::{DirEntry, Walk};
use kdam::BarExt;
use lazy_static::lazy_static;
use serde_jsonlines::WriteExt;
use tabby_common::{
@ -15,9 +16,11 @@ use tabby_common::{
path::dataset_dir,
SourceFile,
};
use tracing::{error, info};
use tracing::error;
use tree_sitter_tags::{TagsConfiguration, TagsContext};
use crate::utils::tqdm;
/// Extension trait that adds dataset generation to a repository configuration;
/// it is implemented for `RepositoryConfig` in this file.
trait RepositoryExt {
/// Builds the dataset for this repository, emitting output through `writer`.
///
/// Returns an `anyhow::Result`; the exact record format written is decided by
/// the implementation (presumably one entry per source file — confirm there).
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
}
@ -26,19 +29,24 @@ impl RepositoryExt for RepositoryConfig {
fn create_dataset(&self, writer: &mut impl Write) -> Result<()> {
let dir = self.dir();
info!("Start indexing repository {}", self.git_url);
let walk_dir = Walk::new(dir.as_path())
.filter_map(Result::ok)
.filter(is_source_code);
let walk_dir_iter = || {
Walk::new(dir.as_path())
.filter_map(Result::ok)
.filter(is_source_code)
};
let mut pb = tqdm(walk_dir_iter().count());
let walk_dir = walk_dir_iter();
let mut context = TagsContext::new();
for entry in walk_dir {
pb.update(1)?;
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
let language = get_language(relative_path.extension().unwrap())
.unwrap()
.to_owned();
if let Ok(file_content) = read_to_string(entry.path()) {
info!("Building {:?}", relative_path);
let source_file = SourceFile {
git_url: self.git_url.clone(),
filepath: relative_path.display().to_string(),

View File

@ -1,6 +1,7 @@
use std::fs;
use anyhow::Result;
use kdam::BarExt;
use tabby_common::{
config::Config,
index::{register_tokenizers, CodeSearchSchema},
@ -9,6 +10,8 @@ use tabby_common::{
};
use tantivy::{directory::MmapDirectory, doc, Index};
use crate::utils::tqdm;
// Magic numbers
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
@ -22,10 +25,13 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
let index = Index::open_or_create(directory, code.schema)?;
register_tokenizers(&index);
// Initialize the search index writer with an initial arena size of 150 MB.
let mut writer = index.writer(150_000_000)?;
writer.delete_all_documents()?;
let mut pb = tqdm(SourceFile::all()?.count());
for file in SourceFile::all()? {
pb.update(1)?;
if file.max_line_length > MAX_LINE_LENGTH_THRESHOLD {
continue;
}
@ -47,6 +53,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
}
writer.commit()?;
writer.wait_merging_threads()?;
Ok(())
}

View File

@ -1,6 +1,7 @@
mod dataset;
mod index;
mod repository;
mod utils;
use anyhow::Result;
use job_scheduler::{Job, JobScheduler};
@ -12,26 +13,28 @@ pub async fn scheduler(now: bool) -> Result<()> {
let mut scheduler = JobScheduler::new();
let job1 = || {
info!("Syncing repositories...");
println!("Syncing repositories...");
let ret = repository::sync_repositories(&config);
if let Err(err) = ret {
error!("Failed to sync repositories, err: '{}'", err);
return;
}
info!("Building dataset...");
println!("Building dataset...");
let ret = dataset::create_dataset(&config);
if let Err(err) = ret {
error!("Failed to build dataset, err: '{}'", err);
}
println!();
};
let job2 = || {
info!("Indexing repositories...");
println!("Indexing repositories...");
let ret = index::index_repositories(&config);
if let Err(err) = ret {
error!("Failed to index repositories, err: '{}'", err);
}
println!()
};
if now {

View File

@ -0,0 +1,5 @@
use kdam::{tqdm, Bar};
/// Creates a kdam progress bar (`Bar`) configured for `total` steps.
///
/// The bar is built through kdam's `tqdm!` macro with `ncols = 40` and
/// `force_refresh = true`. Callers in this crate advance it with
/// `BarExt::update`.
pub fn tqdm(total: usize) -> Bar {
// Macros and functions live in separate namespaces, so this invokes kdam's
// `tqdm!` macro rather than recursing into this function.
tqdm!(total = total, ncols = 40, force_refresh = true)
}