feat(scheduler): add a tqdm bar for scheduler job to better present the remaining time. (#754)
* feat(scheduler): add a tqdm bar for scheduler job to better present the remaining time. * update * add changelog
refactor-extract-code
parent
3600ef77fc
commit
73a76a3d8e
|
|
@ -2,6 +2,9 @@
|
|||
|
||||
## Features
|
||||
|
||||
## Fixes and Improvements
|
||||
* Fix slow repository indexing caused by the constrained memory arena in the tantivy index writer.
|
||||
|
||||
# v0.5.5
|
||||
|
||||
## Fixes and Improvements
|
||||
|
|
|
|||
|
|
@ -2041,6 +2041,16 @@ dependencies = [
|
|||
"wasm-bindgen",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kdam"
|
||||
version = "0.5.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e352f4e1acc6a3d0919eaeb014ca63e5da9450a12ef7106fe2936a07a1648d44"
|
||||
dependencies = [
|
||||
"terminal_size",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "kv-log-macro"
|
||||
version = "1.0.7"
|
||||
|
|
@ -4152,6 +4162,7 @@ dependencies = [
|
|||
"file-rotate",
|
||||
"ignore",
|
||||
"job_scheduler",
|
||||
"kdam",
|
||||
"lazy_static",
|
||||
"serde-jsonlines",
|
||||
"serde_json",
|
||||
|
|
@ -4360,6 +4371,16 @@ dependencies = [
|
|||
"winapi-util",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "terminal_size"
|
||||
version = "0.3.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "21bebf2b7c9e0a515f6e0f8c51dc0f8e4696391e6f1ff30379559f8365fb0df7"
|
||||
dependencies = [
|
||||
"rustix 0.38.17",
|
||||
"windows-sys 0.48.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "textdistance"
|
||||
version = "1.0.2"
|
||||
|
|
|
|||
|
|
@ -34,4 +34,4 @@ futures = "0.3.28"
|
|||
async-stream = "0.3.5"
|
||||
regex = "1.10.0"
|
||||
thiserror = "1.0.49"
|
||||
utoipa = "3.3"
|
||||
utoipa = "3.3"
|
||||
|
|
|
|||
|
|
@ -22,6 +22,7 @@ tree-sitter-typescript = "0.20.3"
|
|||
tree-sitter-go = "0.20.0"
|
||||
tree-sitter-ruby= "0.20.0"
|
||||
ignore = "0.4.20"
|
||||
kdam = { version = "0.5.0" }
|
||||
|
||||
[dev-dependencies]
|
||||
temp_testdir = "0.2"
|
||||
|
|
|
|||
|
|
@ -8,6 +8,7 @@ use std::{
|
|||
use anyhow::Result;
|
||||
use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate};
|
||||
use ignore::{DirEntry, Walk};
|
||||
use kdam::BarExt;
|
||||
use lazy_static::lazy_static;
|
||||
use serde_jsonlines::WriteExt;
|
||||
use tabby_common::{
|
||||
|
|
@ -15,9 +16,11 @@ use tabby_common::{
|
|||
path::dataset_dir,
|
||||
SourceFile,
|
||||
};
|
||||
use tracing::{error, info};
|
||||
use tracing::error;
|
||||
use tree_sitter_tags::{TagsConfiguration, TagsContext};
|
||||
|
||||
use crate::utils::tqdm;
|
||||
|
||||
/// Private extension trait that adds dataset generation to a repository
/// configuration type (this file implements it for `RepositoryConfig`).
trait RepositoryExt {
|
||||
/// Builds the dataset for this repository, using `writer` as the output sink.
///
/// Returns `Err` if any step fails.
/// NOTE(review): the code that actually writes to `writer` is outside the
/// visible hunk — presumably one record per source file; confirm against
/// the full implementation.
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
|
||||
}
|
||||
|
|
@ -26,19 +29,24 @@ impl RepositoryExt for RepositoryConfig {
|
|||
fn create_dataset(&self, writer: &mut impl Write) -> Result<()> {
|
||||
let dir = self.dir();
|
||||
|
||||
info!("Start indexing repository {}", self.git_url);
|
||||
let walk_dir = Walk::new(dir.as_path())
|
||||
.filter_map(Result::ok)
|
||||
.filter(is_source_code);
|
||||
let walk_dir_iter = || {
|
||||
Walk::new(dir.as_path())
|
||||
.filter_map(Result::ok)
|
||||
.filter(is_source_code)
|
||||
};
|
||||
|
||||
let mut pb = tqdm(walk_dir_iter().count());
|
||||
let walk_dir = walk_dir_iter();
|
||||
|
||||
let mut context = TagsContext::new();
|
||||
for entry in walk_dir {
|
||||
pb.update(1)?;
|
||||
|
||||
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
|
||||
let language = get_language(relative_path.extension().unwrap())
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
if let Ok(file_content) = read_to_string(entry.path()) {
|
||||
info!("Building {:?}", relative_path);
|
||||
let source_file = SourceFile {
|
||||
git_url: self.git_url.clone(),
|
||||
filepath: relative_path.display().to_string(),
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
use std::fs;
|
||||
|
||||
use anyhow::Result;
|
||||
use kdam::BarExt;
|
||||
use tabby_common::{
|
||||
config::Config,
|
||||
index::{register_tokenizers, CodeSearchSchema},
|
||||
|
|
@ -9,6 +10,8 @@ use tabby_common::{
|
|||
};
|
||||
use tantivy::{directory::MmapDirectory, doc, Index};
|
||||
|
||||
use crate::utils::tqdm;
|
||||
|
||||
// Magic numbers
|
||||
static MAX_LINE_LENGTH_THRESHOLD: usize = 300;
|
||||
static AVG_LINE_LENGTH_THRESHOLD: f32 = 150f32;
|
||||
|
|
@ -22,10 +25,13 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
|
|||
let index = Index::open_or_create(directory, code.schema)?;
|
||||
register_tokenizers(&index);
|
||||
|
||||
// Initialize the search index writer with an initial arena size of 150 MB.
|
||||
let mut writer = index.writer(150_000_000)?;
|
||||
writer.delete_all_documents()?;
|
||||
|
||||
let mut pb = tqdm(SourceFile::all()?.count());
|
||||
for file in SourceFile::all()? {
|
||||
pb.update(1)?;
|
||||
if file.max_line_length > MAX_LINE_LENGTH_THRESHOLD {
|
||||
continue;
|
||||
}
|
||||
|
|
@ -47,6 +53,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
|
|||
}
|
||||
|
||||
writer.commit()?;
|
||||
writer.wait_merging_threads()?;
|
||||
|
||||
Ok(())
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,6 +1,7 @@
|
|||
mod dataset;
|
||||
mod index;
|
||||
mod repository;
|
||||
mod utils;
|
||||
|
||||
use anyhow::Result;
|
||||
use job_scheduler::{Job, JobScheduler};
|
||||
|
|
@ -12,26 +13,28 @@ pub async fn scheduler(now: bool) -> Result<()> {
|
|||
let mut scheduler = JobScheduler::new();
|
||||
|
||||
let job1 = || {
|
||||
info!("Syncing repositories...");
|
||||
println!("Syncing repositories...");
|
||||
let ret = repository::sync_repositories(&config);
|
||||
if let Err(err) = ret {
|
||||
error!("Failed to sync repositories, err: '{}'", err);
|
||||
return;
|
||||
}
|
||||
|
||||
info!("Building dataset...");
|
||||
println!("Building dataset...");
|
||||
let ret = dataset::create_dataset(&config);
|
||||
if let Err(err) = ret {
|
||||
error!("Failed to build dataset, err: '{}'", err);
|
||||
}
|
||||
println!();
|
||||
};
|
||||
|
||||
let job2 = || {
|
||||
info!("Indexing repositories...");
|
||||
println!("Indexing repositories...");
|
||||
let ret = index::index_repositories(&config);
|
||||
if let Err(err) = ret {
|
||||
error!("Failed to index repositories, err: '{}'", err);
|
||||
}
|
||||
println!()
|
||||
};
|
||||
|
||||
if now {
|
||||
|
|
|
|||
|
|
@ -0,0 +1,5 @@
|
|||
use kdam::{tqdm, Bar};
|
||||
|
||||
/// Creates the progress bar shared by the scheduler jobs.
///
/// `total` is the number of items the caller expects to process; callers
/// advance the returned bar with `update(1)` once per item.
///
/// The bar is built through kdam's `tqdm!` macro with `ncols = 40` and
/// `force_refresh = true`, so every job gets an identically configured bar.
pub fn tqdm(total: usize) -> Bar {
|
||||
tqdm!(total = total, ncols = 40, force_refresh = true)
|
||||
}
|
||||
Loading…
Reference in New Issue