feat: experiment ctags support in scheduler (#207)
* experiment ctags support * add document.rs * extract Document to common * integrate tags into dataset builder * skip if none * do not add scheduler in client binary * fix fmt
sweep/improve-logging-information
parent
41c747e295
commit
7ed5dd584d
|
|
@ -82,7 +82,7 @@ jobs:
|
|||
~/.cargo/git
|
||||
- run: bash ./ci/prepare_build_environment.sh
|
||||
- name: Build release binary
|
||||
run: cargo build --release --target ${{ matrix.target }}
|
||||
run: cargo build --no-default-features --release --target ${{ matrix.target }}
|
||||
|
||||
- name: Rename release binary
|
||||
run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
|
||||
|
|
|
|||
|
|
@ -2813,6 +2813,7 @@ dependencies = [
|
|||
"filenamify",
|
||||
"lazy_static",
|
||||
"serde",
|
||||
"serde-jsonlines",
|
||||
"serdeconv",
|
||||
]
|
||||
|
||||
|
|
@ -2845,6 +2846,8 @@ dependencies = [
|
|||
"temp_testdir",
|
||||
"tracing",
|
||||
"tracing-test",
|
||||
"tree-sitter-javascript",
|
||||
"tree-sitter-tags",
|
||||
"walkdir",
|
||||
]
|
||||
|
||||
|
|
@ -3480,6 +3483,38 @@ dependencies = [
|
|||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter"
|
||||
version = "0.20.10"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "e747b1f9b7b931ed39a548c1fae149101497de3c1fc8d9e18c62c1a66c683d3d"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"regex",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-javascript"
|
||||
version = "0.20.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed"
|
||||
dependencies = [
|
||||
"cc",
|
||||
"tree-sitter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "tree-sitter-tags"
|
||||
version = "0.20.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "ccb3f1376219530a37a809751ecf65aa35fd8b9c1c4ab6d4faf5f6a9eeda2c05"
|
||||
dependencies = [
|
||||
"memchr",
|
||||
"regex",
|
||||
"thiserror",
|
||||
"tree-sitter",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "try-lock"
|
||||
version = "0.2.4"
|
||||
|
|
|
|||
|
|
@ -23,3 +23,4 @@ tokio-util = "0.7"
|
|||
tracing = "0.1"
|
||||
tracing-subscriber = "0.3"
|
||||
anyhow = "1.0.71"
|
||||
serde-jsonlines = "0.4.0"
|
||||
|
|
|
|||
|
|
@ -9,6 +9,7 @@ filenamify = "0.1.0"
|
|||
lazy_static = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serdeconv = { workspace = true }
|
||||
serde-jsonlines = { workspace = true }
|
||||
|
||||
[features]
|
||||
testutils = []
|
||||
|
|
|
|||
|
|
@ -1,3 +1,48 @@
|
|||
pub mod config;
|
||||
pub mod events;
|
||||
pub mod path;
|
||||
|
||||
use std::{
|
||||
fs::File,
|
||||
io::{BufReader, Error},
|
||||
ops::Range,
|
||||
};
|
||||
|
||||
use path::dataset_dir;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use serde_jsonlines::JsonLinesReader;
|
||||
|
||||
/// One source file of a repository, serialized as a single JSON-lines record.
///
/// Records are written by the scheduler's dataset builder and read back from
/// the dataset directory by `Document::all`.
#[derive(Serialize, Deserialize)]
|
||||
pub struct Document {
|
||||
/// Git URL of the repository the file was taken from.
pub git_url: String,
|
||||
/// Path of the file with the walked directory prefix stripped.
pub filepath: String,
|
||||
/// Full text of the file.
pub content: String,
|
||||
/// Language name looked up from the file extension.
pub language: String,
|
||||
/// Value produced by the dataset builder's `metrics::max_line_length` for `content`.
pub max_line_length: usize,
|
||||
/// Value produced by the dataset builder's `metrics::avg_line_length` for `content`.
pub avg_line_length: f32,
|
||||
/// Value produced by the dataset builder's `metrics::alphanum_fraction` for `content`.
pub alphanum_fraction: f32,
|
||||
/// Tags collected for `content`; empty when no tags could be collected.
pub tags: Vec<Tag>,
|
||||
}
|
||||
|
||||
impl Document {
|
||||
/// Returns an iterator over every `Document` stored in the dataset directory.
///
/// Each entry of `dataset_dir()` is opened as a file and parsed as JSON lines.
///
/// # Errors
///
/// Returns the I/O error if the dataset directory itself cannot be read.
///
/// # Panics
///
/// The returned iterator panics while being consumed if a directory entry
/// cannot be read, a file cannot be opened, or a line fails to deserialize
/// into a `Document` (all three are `unwrap`ped).
pub fn all() -> Result<impl Iterator<Item = Self>, Error> {
|
||||
// Only the directory listing is fallible up front; per-file work happens
// inside the closure as the iterator is driven.
let iter = dataset_dir().read_dir()?.flat_map(|path| {
|
||||
let path = path.unwrap().path();
|
||||
let fp = BufReader::new(File::open(path).unwrap());
|
||||
let reader = JsonLinesReader::new(fp);
|
||||
reader.read_all::<Document>().map(|x| x.unwrap())
|
||||
});
|
||||
Ok(iter)
|
||||
}
|
||||
}
|
||||
|
||||
/// A single tag attached to a `Document`, in serializable form.
///
/// The scheduler fills each field from the same-named field of a
/// `tree_sitter_tags` tag, except `syntax_type_name`, which is resolved to a string.
// NOTE(review): the three ranges are presumably byte offsets into the
// document content — confirm against the tree_sitter_tags documentation.
#[derive(Serialize, Deserialize)]
|
||||
pub struct Tag {
|
||||
/// `range` of the source tag.
pub range: Range<usize>,
|
||||
/// `name_range` of the source tag.
pub name_range: Range<usize>,
|
||||
/// `line_range` of the source tag.
pub line_range: Range<usize>,
|
||||
/// `docs` of the source tag; omitted from the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
|
||||
pub docs: Option<String>,
|
||||
/// `is_definition` flag of the source tag.
pub is_definition: bool,
|
||||
/// Name of the tag's syntax type, as resolved by the tagging configuration.
pub syntax_type_name: String,
|
||||
}
|
||||
|
|
|
|||
|
|
@ -12,10 +12,12 @@ job_scheduler = "1.2.1"
|
|||
tabby-common = { path = "../tabby-common" }
|
||||
tantivy = "0.19.2"
|
||||
tracing = { workspace = true }
|
||||
tree-sitter-javascript = "0.20.0"
|
||||
tree-sitter-tags = "0.20.2"
|
||||
walkdir = "2.3.3"
|
||||
lazy_static = { workspace = true }
|
||||
serde = { workspace = true }
|
||||
serde-jsonlines = "0.4.0"
|
||||
serde-jsonlines = { workspace = true }
|
||||
file-rotate = "0.7.5"
|
||||
|
||||
[dev-dependencies]
|
||||
|
|
|
|||
|
|
@ -12,58 +12,12 @@ use serde_jsonlines::WriteExt;
|
|||
use tabby_common::{
|
||||
config::{Config, Repository},
|
||||
path::dataset_dir,
|
||||
Document,
|
||||
};
|
||||
use tracing::{error, info};
|
||||
use tree_sitter_tags::{TagsConfiguration, TagsContext};
|
||||
use walkdir::{DirEntry, WalkDir};
|
||||
|
||||
use crate::document::Document;
|
||||
|
||||
lazy_static! {
|
||||
static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
|
||||
HashMap::from([
|
||||
("c", vec!["c", "h"]),
|
||||
("csharp", vec!["cs"]),
|
||||
(
|
||||
"cpp",
|
||||
vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"],
|
||||
),
|
||||
("css", vec!["css"]),
|
||||
("dockerfile", vec!["Dockerfile"]),
|
||||
("go", vec!["go"]),
|
||||
("haskell", vec!["hs"]),
|
||||
("html", vec!["html"]),
|
||||
("java", vec!["java"]),
|
||||
("javascript", vec!["js"]),
|
||||
("julia", vec!["jl"]),
|
||||
("lua", vec!["lua"]),
|
||||
("makefile", vec!["Makefile"]),
|
||||
("markdown", vec!["md", "markdown"]),
|
||||
("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]),
|
||||
("perl", vec!["pl", "pm", "pod", "perl"]),
|
||||
("powershell", vec!["ps1", "psd1", "psm1"]),
|
||||
("python", vec!["py"]),
|
||||
("ruby", vec!["rb"]),
|
||||
("rust", vec!["rs"]),
|
||||
("sql", vec!["sql"]),
|
||||
("scala", vec!["scala"]),
|
||||
("shellscript", vec!["sh", "bash", "command", "zsh"]),
|
||||
("typescript", vec!["ts", "tsx"]),
|
||||
("tex", vec!["tex"]),
|
||||
("vb", vec!["vb"]),
|
||||
])
|
||||
};
|
||||
static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = {
|
||||
let mut map = HashMap::new();
|
||||
for (lang, exts) in &*LANGUAGE_EXTENSION {
|
||||
for ext in exts {
|
||||
map.insert(*ext, *lang);
|
||||
}
|
||||
}
|
||||
|
||||
map
|
||||
};
|
||||
}
|
||||
|
||||
/// Extension trait that adds dataset generation to `Repository`.
trait RepositoryExt {
|
||||
/// Writes the repository's source files to `writer` as JSON-lines records.
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
|
||||
}
|
||||
|
|
@ -79,21 +33,25 @@ impl RepositoryExt for Repository {
|
|||
.filter_map(Result::ok)
|
||||
.filter(is_source_code);
|
||||
|
||||
let mut context = TagsContext::new();
|
||||
for entry in walk_dir {
|
||||
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
|
||||
let language = get_language(relative_path.extension().unwrap())
|
||||
.unwrap()
|
||||
.to_owned();
|
||||
if let Ok(file_content) = read_to_string(entry.path()) {
|
||||
info!("Building {:?}", relative_path);
|
||||
writer.write_json_lines([Document {
|
||||
let doc = Document {
|
||||
git_url: self.git_url.clone(),
|
||||
filepath: relative_path.display().to_string(),
|
||||
language: get_language(relative_path.extension().unwrap())
|
||||
.unwrap()
|
||||
.to_owned(),
|
||||
max_line_length: metrics::max_line_length(&file_content),
|
||||
avg_line_length: metrics::avg_line_length(&file_content),
|
||||
alphanum_fraction: metrics::alphanum_fraction(&file_content),
|
||||
tags: tags::collect(&mut context, &language, &file_content),
|
||||
language,
|
||||
content: file_content,
|
||||
}])?;
|
||||
};
|
||||
writer.write_json_lines([doc])?;
|
||||
} else {
|
||||
error!("Cannot read {:?}", relative_path);
|
||||
}
|
||||
|
|
@ -159,7 +117,11 @@ mod metrics {
|
|||
total += x.len();
|
||||
}
|
||||
|
||||
if len > 0 {
|
||||
total as f32 / len as f32
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
|
||||
pub fn alphanum_fraction(content: &str) -> f32 {
|
||||
|
|
@ -167,6 +129,109 @@ mod metrics {
|
|||
.chars()
|
||||
.map(|x| f32::from(u8::from(x.is_alphanumeric())))
|
||||
.sum();
|
||||
if !content.is_empty() {
|
||||
num_alphanumn / content.len() as f32
|
||||
} else {
|
||||
0.0
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Tag extraction for documents, backed by the configurations in `LANGUAGE_TAGS`.
mod tags {
|
||||
use tabby_common::Tag;
|
||||
use tree_sitter_tags::TagsContext;
|
||||

||||
use super::LANGUAGE_TAGS;
|
||||

||||
/// Collects the tags of `content` for the given `language`.
///
/// Returns an empty vector when `language` has no entry in `LANGUAGE_TAGS`,
/// when `generate_tags` fails, or when it reports `has_error`. Individual
/// tags that come back as `Err` are dropped.
pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec<Tag> {
|
||||
let config = LANGUAGE_TAGS.get(language);
|
||||
let empty = Vec::new();
|
||||

||||
// No tagging configuration registered for this language.
let Some(config) = config else {
|
||||
return empty;
|
||||
};
|
||||

||||
let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None) else {
|
||||
return empty;
|
||||
};
|
||||

||||
// All tags are discarded when an error is reported, not just the failing ones.
if has_error {
|
||||
return empty;
|
||||
}
|
||||

||||
tags.filter_map(|x| x.ok())
|
||||
.map(|x| Tag {
|
||||
range: x.range,
|
||||
name_range: x.name_range,
|
||||
line_range: x.line_range,
|
||||
docs: x.docs,
|
||||
is_definition: x.is_definition,
|
||||
// Store the syntax type as its name rather than the numeric id.
syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(),
|
||||
})
|
||||
.collect()
|
||||
}
|
||||
}
|
||||
|
||||
// Newtype that marks `TagsConfiguration` as `Send` + `Sync` so it can be
// stored in the `LANGUAGE_TAGS` static.
//
// NOTE(review): nothing here states why sharing a `TagsConfiguration`
// across threads is sound — a SAFETY justification should be confirmed and
// recorded for the two `unsafe impl`s below.
struct TagsConfigurationSync(TagsConfiguration);
|
||||
unsafe impl Send for TagsConfigurationSync {}
|
||||
unsafe impl Sync for TagsConfigurationSync {}
|
||||
|
||||
// Lookup tables shared by the dataset builder.
lazy_static! {
|
||||
// Language name -> file extensions treated as that language.
//
// NOTE(review): "Dockerfile" and "Makefile" are file names rather than
// extensions; verify that the extension-based lookup can actually match them.
static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
|
||||
HashMap::from([
|
||||
("c", vec!["c", "h"]),
|
||||
("csharp", vec!["cs"]),
|
||||
(
|
||||
"cpp",
|
||||
vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"],
|
||||
),
|
||||
("css", vec!["css"]),
|
||||
("dockerfile", vec!["Dockerfile"]),
|
||||
("go", vec!["go"]),
|
||||
("haskell", vec!["hs"]),
|
||||
("html", vec!["html"]),
|
||||
("java", vec!["java"]),
|
||||
("javascript", vec!["js"]),
|
||||
("julia", vec!["jl"]),
|
||||
("lua", vec!["lua"]),
|
||||
("makefile", vec!["Makefile"]),
|
||||
("markdown", vec!["md", "markdown"]),
|
||||
("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]),
|
||||
("perl", vec!["pl", "pm", "pod", "perl"]),
|
||||
("powershell", vec!["ps1", "psd1", "psm1"]),
|
||||
("python", vec!["py"]),
|
||||
("ruby", vec!["rb"]),
|
||||
("rust", vec!["rs"]),
|
||||
("sql", vec!["sql"]),
|
||||
("scala", vec!["scala"]),
|
||||
("shellscript", vec!["sh", "bash", "command", "zsh"]),
|
||||
("typescript", vec!["ts", "tsx"]),
|
||||
("tex", vec!["tex"]),
|
||||
("vb", vec!["vb"]),
|
||||
])
|
||||
};
|
||||
// Inverse of LANGUAGE_EXTENSION: file extension -> language name.
static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = {
|
||||
let mut map = HashMap::new();
|
||||
for (lang, exts) in &*LANGUAGE_EXTENSION {
|
||||
for ext in exts {
|
||||
map.insert(*ext, *lang);
|
||||
}
|
||||
}
|
||||

||||
map
|
||||
};
|
||||
// Language name -> tree-sitter tagging configuration. Only "javascript"
// is registered; other languages get no tags.
static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
|
||||
HashMap::from([(
|
||||
"javascript",
|
||||
TagsConfigurationSync(
|
||||
TagsConfiguration::new(
|
||||
tree_sitter_javascript::language(),
|
||||
tree_sitter_javascript::TAGGING_QUERY,
|
||||
tree_sitter_javascript::LOCALS_QUERY,
|
||||
)
|
||||
// Panics during initialization if the configuration cannot be built.
.unwrap(),
|
||||
),
|
||||
)])
|
||||
};
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,12 +0,0 @@
|
|||
use serde::{Deserialize, Serialize};
|
||||
|
||||
#[derive(Serialize, Deserialize)]
|
||||
pub struct Document {
|
||||
pub git_url: String,
|
||||
pub filepath: String,
|
||||
pub content: String,
|
||||
pub language: String,
|
||||
pub max_line_length: usize,
|
||||
pub avg_line_length: f32,
|
||||
pub alphanum_fraction: f32,
|
||||
}
|
||||
|
|
@ -1,23 +1,13 @@
|
|||
use std::{
|
||||
fs::{self, File},
|
||||
io::BufReader,
|
||||
};
|
||||
use std::fs;
|
||||
|
||||
use anyhow::Result;
|
||||
use serde_jsonlines::JsonLinesReader;
|
||||
use tabby_common::{
|
||||
config::Config,
|
||||
path::{dataset_dir, index_dir},
|
||||
};
|
||||
use tabby_common::{config::Config, path::index_dir, Document};
|
||||
use tantivy::{
|
||||
directory::MmapDirectory,
|
||||
doc,
|
||||
schema::{Schema, STORED, STRING, TEXT},
|
||||
Index,
|
||||
};
|
||||
use tracing::info;
|
||||
|
||||
use crate::document::Document;
|
||||
|
||||
pub fn index_repositories(_config: &Config) -> Result<()> {
|
||||
let mut builder = Schema::builder();
|
||||
|
|
@ -33,14 +23,7 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
|
|||
let mut writer = index.writer(10_000_000)?;
|
||||
writer.delete_all_documents()?;
|
||||
|
||||
for path in dataset_dir().read_dir()? {
|
||||
let path = path?.path();
|
||||
info!("Indexing {:?}", path.as_path());
|
||||
|
||||
let fp = BufReader::new(File::open(path.as_path())?);
|
||||
let reader = JsonLinesReader::new(fp);
|
||||
for doc in reader.read_all::<Document>() {
|
||||
let doc = doc?;
|
||||
for doc in Document::all()? {
|
||||
writer.add_document(doc!(
|
||||
git_url => doc.git_url,
|
||||
filepath => doc.filepath,
|
||||
|
|
@ -48,7 +31,6 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
|
|||
language => doc.language,
|
||||
))?;
|
||||
}
|
||||
}
|
||||
|
||||
writer.commit()?;
|
||||
|
||||
|
|
|
|||
|
|
@ -1,5 +1,4 @@
|
|||
mod dataset;
|
||||
mod document;
|
||||
mod index;
|
||||
mod repository;
|
||||
|
||||
|
|
|
|||
|
|
@ -6,7 +6,7 @@ edition = "2021"
|
|||
[dependencies]
|
||||
ctranslate2-bindings = { path = "../ctranslate2-bindings" }
|
||||
tabby-common = { path = "../tabby-common" }
|
||||
tabby-scheduler = { path = "../tabby-scheduler" }
|
||||
tabby-scheduler = { path = "../tabby-scheduler", optional = true }
|
||||
tabby-download = { path = "../tabby-download" }
|
||||
axum = "0.6"
|
||||
hyper = { version = "0.14", features = ["full"] }
|
||||
|
|
@ -41,4 +41,6 @@ features = [
|
|||
]
|
||||
|
||||
[features]
|
||||
default = [ "scheduler" ]
|
||||
link_shared = ["ctranslate2-bindings/link_shared"]
|
||||
scheduler = [ "tabby-scheduler" ]
|
||||
|
|
|
|||
|
|
@ -31,6 +31,7 @@ pub enum Commands {
|
|||
Download(download::DownloadArgs),
|
||||
|
||||
/// Run scheduler progress for cron jobs integrating external code repositories.
|
||||
#[cfg(feature = "scheduler")]
|
||||
Scheduler(SchedulerArgs),
|
||||
}
|
||||
|
||||
|
|
@ -49,6 +50,7 @@ async fn main() {
|
|||
match &cli.command {
|
||||
Commands::Serve(args) => serve::main(args).await,
|
||||
Commands::Download(args) => download::main(args).await,
|
||||
#[cfg(feature = "scheduler")]
|
||||
Commands::Scheduler(args) => tabby_scheduler::scheduler(args.now)
|
||||
.await
|
||||
.unwrap_or_else(|err| fatal!("Scheduler failed due to '{}'", err)),
|
||||
|
|
|
|||
Loading…
Reference in New Issue