diff --git a/Makefile b/Makefile index 192e894..30e9e41 100644 --- a/Makefile +++ b/Makefile @@ -3,3 +3,6 @@ smoke: loadtest: k6 run tests/*.loadtest.js + +fix: + cargo clippy --fix --allow-dirty --allow-staged && cargo +nightly fmt diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs index 228321b..7941432 100644 --- a/crates/tabby-scheduler/src/dataset.rs +++ b/crates/tabby-scheduler/src/dataset.rs @@ -8,7 +8,6 @@ use std::{ use anyhow::Result; use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate}; use lazy_static::lazy_static; -use serde::Serialize; use serde_jsonlines::WriteExt; use tabby_common::{ config::{Config, Repository}, @@ -17,6 +16,8 @@ use tabby_common::{ use tracing::{error, info}; use walkdir::{DirEntry, WalkDir}; +use crate::document::Document; + lazy_static! { static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { HashMap::from([ @@ -99,14 +100,6 @@ impl RepositoryExt for Repository { } } -#[derive(Serialize)] -struct Document { - git_url: String, - filepath: String, - content: String, - language: String, -} - fn get_language(ext: &OsStr) -> Option<&str> { let ext = ext.to_str().unwrap_or(""); EXTENSION_LANGUAGE.get(ext).copied() diff --git a/crates/tabby-scheduler/src/document.rs b/crates/tabby-scheduler/src/document.rs new file mode 100644 index 0000000..c027be5 --- /dev/null +++ b/crates/tabby-scheduler/src/document.rs @@ -0,0 +1,9 @@ +use serde::{Deserialize, Serialize}; + +#[derive(Serialize, Deserialize)] +pub struct Document { + pub git_url: String, + pub filepath: String, + pub content: String, + pub language: String, +} diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs index 8ac2499..e3abe23 100644 --- a/crates/tabby-scheduler/src/index.rs +++ b/crates/tabby-scheduler/src/index.rs @@ -1,82 +1,53 @@ -use std::fs::{self, read_to_string}; +use std::{ + fs::{self, File}, + io::BufReader, +}; use anyhow::Result; 
+use serde_jsonlines::JsonLinesReader; use tabby_common::{ - config::{Config, Repository}, - path::index_dir, + config::Config, + path::{dataset_dir, index_dir}, }; use tantivy::{ directory::MmapDirectory, doc, schema::{Schema, STORED, STRING, TEXT}, - Index, IndexWriter, + Index, }; -use tracing::{info, warn}; -use walkdir::{DirEntry, WalkDir}; +use tracing::info; -trait RepositoryExt { - fn index(&self, schema: &Schema, writer: &mut IndexWriter) -> Result<()>; -} +use crate::document::Document; -impl RepositoryExt for Repository { - fn index(&self, schema: &Schema, writer: &mut IndexWriter) -> Result<()> { - let git_url = schema.get_field("git_url").unwrap(); - let filepath = schema.get_field("filepath").unwrap(); - let content = schema.get_field("content").unwrap(); - let dir = self.dir(); - - info!("Start indexing repository {}", self.git_url); - let walk_dir = WalkDir::new(dir.as_path()) - .into_iter() - .filter_entry(is_not_hidden) - .filter_map(Result::ok) - .filter(|e| !e.file_type().is_dir()); - - for entry in walk_dir { - let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap(); - if let Ok(file_content) = read_to_string(entry.path()) { - info!("Indexing {:?}", relative_path); - writer.add_document(doc!( - git_url => self.git_url.clone(), - filepath => relative_path.display().to_string(), - content => file_content, - ))?; - } else { - warn!("Skip {:?}", relative_path); - } - } - - Ok(()) - } -} - -fn is_not_hidden(entry: &DirEntry) -> bool { - entry - .file_name() - .to_str() - .map(|s| entry.depth() == 0 || !s.starts_with('.')) - .unwrap_or(false) -} - -fn create_schema() -> Schema { +pub fn index_repositories(_config: &Config) -> Result<()> { let mut builder = Schema::builder(); - builder.add_text_field("git_url", STRING | STORED); - builder.add_text_field("filepath", STRING | STORED); - builder.add_text_field("content", TEXT | STORED); - builder.build() -} - -pub fn index_repositories(config: &Config) -> Result<()> { - let schema = 
create_schema(); + let git_url = builder.add_text_field("git_url", STRING | STORED); + let filepath = builder.add_text_field("filepath", STRING | STORED); + let content = builder.add_text_field("content", TEXT | STORED); + let language = builder.add_text_field("language", TEXT | STORED); + let schema = builder.build(); fs::create_dir_all(index_dir())?; let directory = MmapDirectory::open(index_dir())?; - let index = Index::open_or_create(directory, schema.clone())?; + let index = Index::open_or_create(directory, schema)?; let mut writer = index.writer(10_000_000)?; - writer.delete_all_documents()?; - for repository in config.repositories.as_slice() { - repository.index(&schema, &mut writer)?; + + for path in dataset_dir().read_dir()? { + let path = path?.path(); + info!("Indexing {:?}", path.as_path()); + + let fp = BufReader::new(File::open(path.as_path())?); + let reader = JsonLinesReader::new(fp); + for doc in reader.read_all::<Document>() { + let doc = doc?; + writer.add_document(doc!( + git_url => doc.git_url, + filepath => doc.filepath, + content => doc.content, + language => doc.language, + ))?; + } } writer.commit()?; diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index 5597470..0a05d8e 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -1,4 +1,5 @@ mod dataset; +mod document; mod index; mod repository; @@ -26,12 +27,10 @@ pub async fn scheduler(now: bool) -> Result<()> { return; } - if false { - info!("Indexing repositories..."); - let ret = index::index_repositories(&config); - if let Err(err) = ret { - error!("Failed to index repositories, err: '{}'", err); - } + info!("Indexing repositories..."); + let ret = index::index_repositories(&config); + if let Err(err) = ret { + error!("Failed to index repositories, err: '{}'", err); } };