From 9af6607f85acf6b207882103ba841f7cb7363e21 Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Mon, 12 Jun 2023 08:19:43 -0700 Subject: [PATCH] feat: add dataset building (#231) * feat: build dataset in scheduler * add file rotate --- Cargo.lock | 24 +++++ crates/tabby-common/src/path.rs | 4 + crates/tabby-scheduler/Cargo.toml | 4 + crates/tabby-scheduler/src/dataset.rs | 149 ++++++++++++++++++++++++++ crates/tabby-scheduler/src/lib.rs | 17 ++- 5 files changed, 195 insertions(+), 3 deletions(-) create mode 100644 crates/tabby-scheduler/src/dataset.rs diff --git a/Cargo.lock b/Cargo.lock index 19dea27..97d96fb 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -885,6 +885,16 @@ dependencies = [ "instant", ] +[[package]] +name = "file-rotate" +version = "0.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ddf221ceec4517f3cb764dae3541b2bd87666fc8832e51322fbb97250b468c71" +dependencies = [ + "chrono", + "flate2", +] + [[package]] name = "filenamify" version = "0.1.0" @@ -2527,6 +2537,16 @@ dependencies = [ "serde_derive", ] +[[package]] +name = "serde-jsonlines" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4684abdec99c1de7e507a3516c82385ff74d54c385973846b079bfd9f5920d35" +dependencies = [ + "serde", + "serde_json", +] + [[package]] name = "serde_derive" version = "1.0.163" @@ -2814,8 +2834,12 @@ name = "tabby-scheduler" version = "0.1.0" dependencies = [ "anyhow", + "file-rotate", "filenamify", "job_scheduler", + "lazy_static", + "serde", + "serde-jsonlines", "tabby-common", "tantivy", "temp_testdir", diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs index f17ff54..c6e2fec 100644 --- a/crates/tabby-common/src/path.rs +++ b/crates/tabby-common/src/path.rs @@ -35,6 +35,10 @@ pub fn index_dir() -> PathBuf { tabby_root().join("index") } +pub fn dataset_dir() -> PathBuf { + tabby_root().join("dataset") +} + pub fn models_dir() -> PathBuf { tabby_root().join("models") } diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index fb97f8c..ba105b8 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -13,6 +13,10 @@ tabby-common = { path = "../tabby-common" } tantivy = "0.19.2" tracing = { workspace = true } walkdir = "2.3.3" +lazy_static = { workspace = true } +serde = { workspace = true } +serde-jsonlines = "0.4.0" +file-rotate = "0.7.5" [dev-dependencies] temp_testdir = "0.2" diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs new file mode 100644 index 0000000..228321b --- /dev/null +++ b/crates/tabby-scheduler/src/dataset.rs @@ -0,0 +1,149 @@ +use std::{ + collections::HashMap, + ffi::OsStr, + fs::{self, read_to_string}, + io::Write, +}; + +use anyhow::Result; +use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate}; +use lazy_static::lazy_static; +use serde::Serialize; +use serde_jsonlines::WriteExt; +use tabby_common::{ + config::{Config, Repository}, + path::dataset_dir, +}; +use tracing::{error, info}; +use walkdir::{DirEntry, WalkDir}; + +lazy_static! { + static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { + HashMap::from([ + ("c", vec!["c", "h"]), + ("csharp", vec!["cs"]), + ( + "cpp", + vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"], + ), + ("css", vec!["css"]), + ("dockerfile", vec!["Dockerfile"]), + ("go", vec!["go"]), + ("haskell", vec!["hs"]), + ("html", vec!["html"]), + ("java", vec!["java"]), + ("javascript", vec!["js"]), + ("julia", vec!["jl"]), + ("lua", vec!["lua"]), + ("makefile", vec!["Makefile"]), + ("markdown", vec!["md", "markdown"]), + ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]), + ("perl", vec!["pl", "pm", "pod", "perl"]), + ("powershell", vec!["ps1", "psd1", "psm1"]), + ("python", vec!["py"]), + ("ruby", vec!["rb"]), + ("rust", vec!["rs"]), + ("sql", vec!["sql"]), + ("scala", vec!["scala"]), + ("shellscript", vec!["sh", "bash", "command", "zsh"]), + ("typescript", vec!["ts", "tsx"]), + ("tex", vec!["tex"]), + ("vb", vec!["vb"]), + ]) + }; + static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = { + let mut map = HashMap::new(); + for (lang, exts) in &*LANGUAGE_EXTENSION { + for ext in exts { + map.insert(*ext, *lang); + } + } + + map + }; +} + +trait RepositoryExt { + fn create_dataset(&self, writer: &mut impl Write) -> Result<()>; +} + +impl RepositoryExt for Repository { + fn create_dataset(&self, writer: &mut impl Write) -> Result<()> { + let dir = self.dir(); + + info!("Start indexing repository {}", self.git_url); + let walk_dir = WalkDir::new(dir.as_path()) + .into_iter() + .filter_entry(is_not_hidden) + .filter_map(Result::ok) + .filter(is_source_code); + + for entry in walk_dir { + let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap(); + if let Ok(file_content) = read_to_string(entry.path()) { + info!("Building {:?}", relative_path); + writer.write_json_lines([Document { + git_url: self.git_url.clone(), + filepath: relative_path.display().to_string(), + content: file_content, + language: get_language(relative_path.extension().unwrap()) + .unwrap() + .to_owned(), + }])?; + } else { + error!("Cannot read {:?}", relative_path); + } + } + + Ok(()) + } +} + +#[derive(Serialize)] +struct Document { + git_url: String, + filepath: String, + content: String, + language: String, +} + +fn get_language(ext: &OsStr) -> Option<&str> { + let ext = ext.to_str().unwrap_or(""); + EXTENSION_LANGUAGE.get(ext).copied() +} + +fn is_source_code(entry: &DirEntry) -> bool { + if entry.file_type().is_file() { + entry.path().extension().and_then(get_language).is_some() + } else { + false + } +} + +fn is_not_hidden(entry: &DirEntry) -> bool { + entry + .file_name() + .to_str() + .map(|s| entry.depth() == 0 || !s.starts_with('.')) + .unwrap_or(false) +} + +pub fn create_dataset(config: &Config) -> Result<()> { + fs::remove_dir_all(dataset_dir()).ok(); + fs::create_dir_all(dataset_dir())?; + let mut writer = FileRotate::new( + dataset_dir().join("data.jsonl"), + AppendCount::new(usize::max_value()), + ContentLimit::Lines(1000), + Compression::None, + #[cfg(unix)] + None, + ); + + for repository in config.repositories.as_slice() { + repository.create_dataset(&mut writer)?; + } + + writer.flush()?; + Ok(()) +} diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index 6e39a34..5597470 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -1,3 +1,4 @@ +mod dataset; mod index; mod repository; @@ -18,10 +19,19 @@ pub async fn scheduler(now: bool) -> Result<()> { return; } - info!("Indexing repositories..."); - let ret = index::index_repositories(&config); + info!("Building dataset..."); + let ret = dataset::create_dataset(&config); if let Err(err) = ret { - error!("Failed to index repositories, err: '{}'", err); + error!("Failed to build dataset, err: '{}'", err); + return; + } + + if false { + info!("Indexing repositories..."); + let ret = index::index_repositories(&config); + if let Err(err) = ret { + error!("Failed to index repositories, err: '{}'", err); + } } }; @@ -66,6 +76,7 @@ mod tests { }; repository::sync_repositories(&config).unwrap(); + dataset::create_dataset(&config).unwrap(); index::index_repositories(&config).unwrap(); } }