feat: build index from dataset (#234)

improve-workflow
Meng Zhang 2023-06-12 12:21:27 -07:00 committed by GitHub
parent 32c805c3be
commit 2983f70d22
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
5 changed files with 52 additions and 77 deletions

View File

@ -3,3 +3,6 @@ smoke:
loadtest: loadtest:
k6 run tests/*.loadtest.js k6 run tests/*.loadtest.js
fix:
cargo clippy --fix --allow-dirty --allow-staged && cargo +nightly fmt

View File

@ -8,7 +8,6 @@ use std::{
use anyhow::Result; use anyhow::Result;
use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate}; use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate};
use lazy_static::lazy_static; use lazy_static::lazy_static;
use serde::Serialize;
use serde_jsonlines::WriteExt; use serde_jsonlines::WriteExt;
use tabby_common::{ use tabby_common::{
config::{Config, Repository}, config::{Config, Repository},
@ -17,6 +16,8 @@ use tabby_common::{
use tracing::{error, info}; use tracing::{error, info};
use walkdir::{DirEntry, WalkDir}; use walkdir::{DirEntry, WalkDir};
use crate::document::Document;
lazy_static! { lazy_static! {
static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
HashMap::from([ HashMap::from([
@ -99,14 +100,6 @@ impl RepositoryExt for Repository {
} }
} }
// A source file captured for indexing; serialized (JSON) when written out.
// NOTE(review): superseded by the standalone `document` module version that
// also derives `Deserialize` — this copy is write-only.
#[derive(Serialize)]
struct Document {
// Git remote URL of the repository the file belongs to.
git_url: String,
// Path of the file relative to the repository root.
filepath: String,
// Full text content of the file.
content: String,
// Language label — presumably derived from the file extension via the
// LANGUAGE_EXTENSION map in this file; confirm against the caller.
language: String,
}
fn get_language(ext: &OsStr) -> Option<&str> { fn get_language(ext: &OsStr) -> Option<&str> {
let ext = ext.to_str().unwrap_or(""); let ext = ext.to_str().unwrap_or("");
EXTENSION_LANGUAGE.get(ext).copied() EXTENSION_LANGUAGE.get(ext).copied()

View File

@ -0,0 +1,9 @@
use serde::{Deserialize, Serialize};
/// A single source file extracted from a repository.
///
/// Written as one JSON line per file into the dataset, and read back when
/// building the tantivy search index (each field maps 1:1 onto an index
/// field of the same name).
//
// `Debug` added so instances can be logged/inspected; purely additive and
// backward-compatible with existing derives.
#[derive(Serialize, Deserialize, Debug)]
pub struct Document {
    /// Git remote URL of the repository this file came from.
    pub git_url: String,
    /// Path of the file relative to the repository root.
    pub filepath: String,
    /// Full text content of the file.
    pub content: String,
    /// Language label — presumably keyed off the file extension when the
    /// dataset is produced; verify against the dataset writer.
    pub language: String,
}

View File

@ -1,82 +1,53 @@
use std::fs::{self, read_to_string}; use std::{
fs::{self, File},
io::BufReader,
};
use anyhow::Result; use anyhow::Result;
use serde_jsonlines::JsonLinesReader;
use tabby_common::{ use tabby_common::{
config::{Config, Repository}, config::Config,
path::index_dir, path::{dataset_dir, index_dir},
}; };
use tantivy::{ use tantivy::{
directory::MmapDirectory, directory::MmapDirectory,
doc, doc,
schema::{Schema, STORED, STRING, TEXT}, schema::{Schema, STORED, STRING, TEXT},
Index, IndexWriter, Index,
}; };
use tracing::{info, warn}; use tracing::info;
use walkdir::{DirEntry, WalkDir};
trait RepositoryExt { use crate::document::Document;
fn index(&self, schema: &Schema, writer: &mut IndexWriter) -> Result<()>;
}
impl RepositoryExt for Repository { pub fn index_repositories(_config: &Config) -> Result<()> {
// Walk this repository's checkout on disk and add every readable,
// non-hidden file to the tantivy index via `writer`.
//
// `schema` must contain the "git_url", "filepath" and "content" text
// fields (the `unwrap`s below panic otherwise — an invariant of the
// schema built in this crate, not user input).
fn index(&self, schema: &Schema, writer: &mut IndexWriter) -> Result<()> {
let git_url = schema.get_field("git_url").unwrap();
let filepath = schema.get_field("filepath").unwrap();
let content = schema.get_field("content").unwrap();
// Root directory of the local repository checkout.
let dir = self.dir();
info!("Start indexing repository {}", self.git_url);
// Recursively walk the checkout, pruning hidden entries (dot-files)
// before descent, dropping unreadable entries, and keeping files only.
let walk_dir = WalkDir::new(dir.as_path())
.into_iter()
.filter_entry(is_not_hidden)
.filter_map(Result::ok)
.filter(|e| !e.file_type().is_dir());
for entry in walk_dir {
// Path stored in the index is relative to the repo root; the prefix
// strip cannot fail because every entry came from walking `dir`.
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
if let Ok(file_content) = read_to_string(entry.path()) {
info!("Indexing {:?}", relative_path);
writer.add_document(doc!(
git_url => self.git_url.clone(),
filepath => relative_path.display().to_string(),
content => file_content,
))?;
} else {
// Best-effort: files that are not valid UTF-8 (or unreadable) are
// skipped with a warning rather than aborting the whole run.
warn!("Skip {:?}", relative_path);
}
}
Ok(())
}
}
// Walk filter: keep an entry unless it is a hidden (dot-prefixed) file or
// directory. The walk root itself (depth 0) is always kept, even if its
// name starts with '.'. Entries whose names are not valid UTF-8 are
// rejected.
fn is_not_hidden(entry: &DirEntry) -> bool {
    match entry.file_name().to_str() {
        Some(name) => entry.depth() == 0 || !name.starts_with('.'),
        None => false,
    }
}
fn create_schema() -> Schema {
let mut builder = Schema::builder(); let mut builder = Schema::builder();
builder.add_text_field("git_url", STRING | STORED); let git_url = builder.add_text_field("git_url", STRING | STORED);
builder.add_text_field("filepath", STRING | STORED); let filepath = builder.add_text_field("filepath", STRING | STORED);
builder.add_text_field("content", TEXT | STORED); let content = builder.add_text_field("content", TEXT | STORED);
builder.build() let language = builder.add_text_field("language", TEXT | STORED);
} let schema = builder.build();
pub fn index_repositories(config: &Config) -> Result<()> {
let schema = create_schema();
fs::create_dir_all(index_dir())?; fs::create_dir_all(index_dir())?;
let directory = MmapDirectory::open(index_dir())?; let directory = MmapDirectory::open(index_dir())?;
let index = Index::open_or_create(directory, schema.clone())?; let index = Index::open_or_create(directory, schema)?;
let mut writer = index.writer(10_000_000)?; let mut writer = index.writer(10_000_000)?;
writer.delete_all_documents()?; writer.delete_all_documents()?;
for repository in config.repositories.as_slice() {
repository.index(&schema, &mut writer)?; for path in dataset_dir().read_dir()? {
let path = path?.path();
info!("Indexing {:?}", path.as_path());
let fp = BufReader::new(File::open(path.as_path())?);
let reader = JsonLinesReader::new(fp);
for doc in reader.read_all::<Document>() {
let doc = doc?;
writer.add_document(doc!(
git_url => doc.git_url,
filepath => doc.filepath,
content => doc.content,
language => doc.language,
))?;
}
} }
writer.commit()?; writer.commit()?;

View File

@ -1,4 +1,5 @@
mod dataset; mod dataset;
mod document;
mod index; mod index;
mod repository; mod repository;
@ -26,12 +27,10 @@ pub async fn scheduler(now: bool) -> Result<()> {
return; return;
} }
if false { info!("Indexing repositories...");
info!("Indexing repositories..."); let ret = index::index_repositories(&config);
let ret = index::index_repositories(&config); if let Err(err) = ret {
if let Err(err) = ret { error!("Failed to index repositories, err: '{}'", err);
error!("Failed to index repositories, err: '{}'", err);
}
} }
}; };