feat: add dataset building (#231)
* feat: build dataset in scheduler
* add file rotate
improve-workflow
parent
795c34e9c4
commit
9af6607f85
|
|
@ -885,6 +885,16 @@ dependencies = [
|
||||||
"instant",
|
"instant",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "file-rotate"
|
||||||
|
version = "0.7.5"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "ddf221ceec4517f3cb764dae3541b2bd87666fc8832e51322fbb97250b468c71"
|
||||||
|
dependencies = [
|
||||||
|
"chrono",
|
||||||
|
"flate2",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "filenamify"
|
name = "filenamify"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
|
|
@ -2527,6 +2537,16 @@ dependencies = [
|
||||||
"serde_derive",
|
"serde_derive",
|
||||||
]
|
]
|
||||||
|
|
||||||
|
[[package]]
|
||||||
|
name = "serde-jsonlines"
|
||||||
|
version = "0.4.0"
|
||||||
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||||
|
checksum = "4684abdec99c1de7e507a3516c82385ff74d54c385973846b079bfd9f5920d35"
|
||||||
|
dependencies = [
|
||||||
|
"serde",
|
||||||
|
"serde_json",
|
||||||
|
]
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "serde_derive"
|
name = "serde_derive"
|
||||||
version = "1.0.163"
|
version = "1.0.163"
|
||||||
|
|
@ -2814,8 +2834,12 @@ name = "tabby-scheduler"
|
||||||
version = "0.1.0"
|
version = "0.1.0"
|
||||||
dependencies = [
|
dependencies = [
|
||||||
"anyhow",
|
"anyhow",
|
||||||
|
"file-rotate",
|
||||||
"filenamify",
|
"filenamify",
|
||||||
"job_scheduler",
|
"job_scheduler",
|
||||||
|
"lazy_static",
|
||||||
|
"serde",
|
||||||
|
"serde-jsonlines",
|
||||||
"tabby-common",
|
"tabby-common",
|
||||||
"tantivy",
|
"tantivy",
|
||||||
"temp_testdir",
|
"temp_testdir",
|
||||||
|
|
|
||||||
|
|
@ -35,6 +35,10 @@ pub fn index_dir() -> PathBuf {
|
||||||
tabby_root().join("index")
|
tabby_root().join("index")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn dataset_dir() -> PathBuf {
|
||||||
|
tabby_root().join("dataset")
|
||||||
|
}
|
||||||
|
|
||||||
pub fn models_dir() -> PathBuf {
|
pub fn models_dir() -> PathBuf {
|
||||||
tabby_root().join("models")
|
tabby_root().join("models")
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -13,6 +13,10 @@ tabby-common = { path = "../tabby-common" }
|
||||||
tantivy = "0.19.2"
|
tantivy = "0.19.2"
|
||||||
tracing = { workspace = true }
|
tracing = { workspace = true }
|
||||||
walkdir = "2.3.3"
|
walkdir = "2.3.3"
|
||||||
|
lazy_static = { workspace = true }
|
||||||
|
serde = { workspace = true }
|
||||||
|
serde-jsonlines = "0.4.0"
|
||||||
|
file-rotate = "0.7.5"
|
||||||
|
|
||||||
[dev-dependencies]
|
[dev-dependencies]
|
||||||
temp_testdir = "0.2"
|
temp_testdir = "0.2"
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,149 @@
|
||||||
|
use std::{
|
||||||
|
collections::HashMap,
|
||||||
|
ffi::OsStr,
|
||||||
|
fs::{self, read_to_string},
|
||||||
|
io::Write,
|
||||||
|
};
|
||||||
|
|
||||||
|
use anyhow::Result;
|
||||||
|
use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate};
|
||||||
|
use lazy_static::lazy_static;
|
||||||
|
use serde::Serialize;
|
||||||
|
use serde_jsonlines::WriteExt;
|
||||||
|
use tabby_common::{
|
||||||
|
config::{Config, Repository},
|
||||||
|
path::dataset_dir,
|
||||||
|
};
|
||||||
|
use tracing::{error, info};
|
||||||
|
use walkdir::{DirEntry, WalkDir};
|
||||||
|
|
||||||
|
lazy_static! {
    /// Language name -> file extensions recognised for that language.
    ///
    /// NOTE(review): within this file the keys of the derived table are only
    /// compared against `Path::extension()`, which is `None` for files named
    /// exactly `Dockerfile` or `Makefile`. Those two entries therefore only
    /// match names such as `foo.Dockerfile` — confirm whether that is intended.
    static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
        HashMap::from([
            ("c", vec!["c", "h"]),
            ("csharp", vec!["cs"]),
            (
                "cpp",
                vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"],
            ),
            ("css", vec!["css"]),
            ("dockerfile", vec!["Dockerfile"]),
            ("go", vec!["go"]),
            ("haskell", vec!["hs"]),
            ("html", vec!["html"]),
            ("java", vec!["java"]),
            ("javascript", vec!["js"]),
            ("julia", vec!["jl"]),
            ("lua", vec!["lua"]),
            ("makefile", vec!["Makefile"]),
            ("markdown", vec!["md", "markdown"]),
            ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]),
            ("perl", vec!["pl", "pm", "pod", "perl"]),
            ("powershell", vec!["ps1", "psd1", "psm1"]),
            ("python", vec!["py"]),
            ("ruby", vec!["rb"]),
            ("rust", vec!["rs"]),
            ("sql", vec!["sql"]),
            ("scala", vec!["scala"]),
            ("shellscript", vec!["sh", "bash", "command", "zsh"]),
            ("typescript", vec!["ts", "tsx"]),
            ("tex", vec!["tex"]),
            ("vb", vec!["vb"]),
        ])
    };

    /// Reverse lookup derived from `LANGUAGE_EXTENSION`: extension -> language.
    ///
    /// Lookups are case-sensitive (`"C"` maps to `cpp`, `"c"` maps to `c`).
    static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = {
        LANGUAGE_EXTENSION
            .iter()
            .flat_map(|(language, extensions)| {
                extensions
                    .iter()
                    .map(move |extension| (*extension, *language))
            })
            .collect()
    };
}
|
||||||
|
|
||||||
|
trait RepositoryExt {
|
||||||
|
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
|
||||||
|
}
|
||||||
|
|
||||||
|
impl RepositoryExt for Repository {
|
||||||
|
fn create_dataset(&self, writer: &mut impl Write) -> Result<()> {
|
||||||
|
let dir = self.dir();
|
||||||
|
|
||||||
|
info!("Start indexing repository {}", self.git_url);
|
||||||
|
let walk_dir = WalkDir::new(dir.as_path())
|
||||||
|
.into_iter()
|
||||||
|
.filter_entry(is_not_hidden)
|
||||||
|
.filter_map(Result::ok)
|
||||||
|
.filter(is_source_code);
|
||||||
|
|
||||||
|
for entry in walk_dir {
|
||||||
|
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
|
||||||
|
if let Ok(file_content) = read_to_string(entry.path()) {
|
||||||
|
info!("Building {:?}", relative_path);
|
||||||
|
writer.write_json_lines([Document {
|
||||||
|
git_url: self.git_url.clone(),
|
||||||
|
filepath: relative_path.display().to_string(),
|
||||||
|
content: file_content,
|
||||||
|
language: get_language(relative_path.extension().unwrap())
|
||||||
|
.unwrap()
|
||||||
|
.to_owned(),
|
||||||
|
}])?;
|
||||||
|
} else {
|
||||||
|
error!("Cannot read {:?}", relative_path);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
#[derive(Serialize)]
|
||||||
|
struct Document {
|
||||||
|
git_url: String,
|
||||||
|
filepath: String,
|
||||||
|
content: String,
|
||||||
|
language: String,
|
||||||
|
}
|
||||||
|
|
||||||
|
fn get_language(ext: &OsStr) -> Option<&str> {
|
||||||
|
let ext = ext.to_str().unwrap_or("");
|
||||||
|
EXTENSION_LANGUAGE.get(ext).copied()
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_source_code(entry: &DirEntry) -> bool {
|
||||||
|
if entry.file_type().is_file() {
|
||||||
|
entry.path().extension().and_then(get_language).is_some()
|
||||||
|
} else {
|
||||||
|
false
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn is_not_hidden(entry: &DirEntry) -> bool {
|
||||||
|
entry
|
||||||
|
.file_name()
|
||||||
|
.to_str()
|
||||||
|
.map(|s| entry.depth() == 0 || !s.starts_with('.'))
|
||||||
|
.unwrap_or(false)
|
||||||
|
}
|
||||||
|
|
||||||
|
pub fn create_dataset(config: &Config) -> Result<()> {
|
||||||
|
fs::remove_dir_all(dataset_dir()).ok();
|
||||||
|
fs::create_dir_all(dataset_dir())?;
|
||||||
|
let mut writer = FileRotate::new(
|
||||||
|
dataset_dir().join("data.jsonl"),
|
||||||
|
AppendCount::new(usize::max_value()),
|
||||||
|
ContentLimit::Lines(1000),
|
||||||
|
Compression::None,
|
||||||
|
#[cfg(unix)]
|
||||||
|
None,
|
||||||
|
);
|
||||||
|
|
||||||
|
for repository in config.repositories.as_slice() {
|
||||||
|
repository.create_dataset(&mut writer)?;
|
||||||
|
}
|
||||||
|
|
||||||
|
writer.flush()?;
|
||||||
|
Ok(())
|
||||||
|
}
|
||||||
|
|
@ -1,3 +1,4 @@
|
||||||
|
mod dataset;
|
||||||
mod index;
|
mod index;
|
||||||
mod repository;
|
mod repository;
|
||||||
|
|
||||||
|
|
@ -18,11 +19,20 @@ pub async fn scheduler(now: bool) -> Result<()> {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
info!("Building dataset...");
|
||||||
|
let ret = dataset::create_dataset(&config);
|
||||||
|
if let Err(err) = ret {
|
||||||
|
error!("Failed to build dataset, err: '{}'", err);
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
if false {
|
||||||
info!("Indexing repositories...");
|
info!("Indexing repositories...");
|
||||||
let ret = index::index_repositories(&config);
|
let ret = index::index_repositories(&config);
|
||||||
if let Err(err) = ret {
|
if let Err(err) = ret {
|
||||||
error!("Failed to index repositories, err: '{}'", err);
|
error!("Failed to index repositories, err: '{}'", err);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
if now {
|
if now {
|
||||||
|
|
@ -66,6 +76,7 @@ mod tests {
|
||||||
};
|
};
|
||||||
|
|
||||||
repository::sync_repositories(&config).unwrap();
|
repository::sync_repositories(&config).unwrap();
|
||||||
|
dataset::create_dataset(&config).unwrap();
|
||||||
index::index_repositories(&config).unwrap();
|
index::index_repositories(&config).unwrap();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue