diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b59e7b0..1a321f5 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -82,7 +82,7 @@ jobs: ~/.cargo/git - run: bash ./ci/prepare_build_environment.sh - name: Bulid release binary - run: cargo build --release --target ${{ matrix.target }} + run: cargo build --no-default-features --release --target ${{ matrix.target }} - name: Rename release binary run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }} diff --git a/Cargo.lock b/Cargo.lock index 97d96fb..be71472 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2813,6 +2813,7 @@ dependencies = [ "filenamify", "lazy_static", "serde", + "serde-jsonlines", "serdeconv", ] @@ -2845,6 +2846,8 @@ dependencies = [ "temp_testdir", "tracing", "tracing-test", + "tree-sitter-javascript", + "tree-sitter-tags", "walkdir", ] @@ -3480,6 +3483,38 @@ dependencies = [ "syn 1.0.109", ] +[[package]] +name = "tree-sitter" +version = "0.20.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e747b1f9b7b931ed39a548c1fae149101497de3c1fc8d9e18c62c1a66c683d3d" +dependencies = [ + "cc", + "regex", +] + +[[package]] +name = "tree-sitter-javascript" +version = "0.20.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed" +dependencies = [ + "cc", + "tree-sitter", +] + +[[package]] +name = "tree-sitter-tags" +version = "0.20.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ccb3f1376219530a37a809751ecf65aa35fd8b9c1c4ab6d4faf5f6a9eeda2c05" +dependencies = [ + "memchr", + "regex", + "thiserror", + "tree-sitter", +] + [[package]] name = "try-lock" version = "0.2.4" diff --git a/Cargo.toml b/Cargo.toml index 16964d2..28fa131 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -23,3 +23,4 @@ tokio-util = "0.7" tracing = "0.1" tracing-subscriber = "0.3" anyhow = "1.0.71" +serde-jsonlines = "0.4.0" diff --git a/crates/tabby-common/Cargo.toml b/crates/tabby-common/Cargo.toml index 1a705c7..77e217b 100644 --- a/crates/tabby-common/Cargo.toml +++ b/crates/tabby-common/Cargo.toml @@ -9,6 +9,7 @@ filenamify = "0.1.0" lazy_static = { workspace = true } serde = { workspace = true } serdeconv = { workspace = true } +serde-jsonlines = { workspace = true } [features] testutils = [] diff --git a/crates/tabby-common/src/lib.rs b/crates/tabby-common/src/lib.rs index 9f22303..60362c7 100644 --- a/crates/tabby-common/src/lib.rs +++ b/crates/tabby-common/src/lib.rs @@ -1,3 +1,48 @@ pub mod config; pub mod events; pub mod path; + +use std::{ + fs::File, + io::{BufReader, Error}, + ops::Range, +}; + +use path::dataset_dir; +use serde::{Deserialize, Serialize}; +use serde_jsonlines::JsonLinesReader; + +#[derive(Serialize, Deserialize)] +pub struct Document { + pub git_url: String, + pub filepath: String, + pub content: String, + pub language: String, + pub max_line_length: usize, + pub avg_line_length: f32, + pub alphanum_fraction: f32, + pub tags: Vec, +} + +impl Document { + pub fn all() -> Result, Error> { + let iter = dataset_dir().read_dir()?.flat_map(|path| { + let path = path.unwrap().path(); + let fp = BufReader::new(File::open(path).unwrap()); + let reader = JsonLinesReader::new(fp); + reader.read_all::().map(|x| x.unwrap()) + }); + Ok(iter) + } +} + +#[derive(Serialize, Deserialize)] +pub struct Tag { + pub range: Range, + pub name_range: Range, + pub line_range: Range, + #[serde(skip_serializing_if = "Option::is_none")] + pub docs: Option, + pub is_definition: bool, + pub syntax_type_name: String, +} diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml index ba105b8..b5b383e 100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -12,10 +12,12 @@ job_scheduler = "1.2.1" tabby-common = { path = "../tabby-common" } tantivy = "0.19.2" tracing = { workspace = true } +tree-sitter-javascript = "0.20.0" +tree-sitter-tags = "0.20.2" walkdir = "2.3.3" lazy_static = { workspace = true } serde = { workspace = true } -serde-jsonlines = "0.4.0" +serde-jsonlines = { workspace = true } file-rotate = "0.7.5" [dev-dependencies] diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs index d54c25e..c926dc0 100644 --- a/crates/tabby-scheduler/src/dataset.rs +++ b/crates/tabby-scheduler/src/dataset.rs @@ -12,58 +12,12 @@ use serde_jsonlines::WriteExt; use tabby_common::{ config::{Config, Repository}, path::dataset_dir, + Document, }; use tracing::{error, info}; +use tree_sitter_tags::{TagsConfiguration, TagsContext}; use walkdir::{DirEntry, WalkDir}; -use crate::document::Document; - -lazy_static! { - static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { - HashMap::from([ - ("c", vec!["c", "h"]), - ("csharp", vec!["cs"]), - ( - "cpp", - vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"], - ), - ("css", vec!["css"]), - ("dockerfile", vec!["Dockerfile"]), - ("go", vec!["go"]), - ("haskell", vec!["hs"]), - ("html", vec!["html"]), - ("java", vec!["java"]), - ("javascript", vec!["js"]), - ("julia", vec!["jl"]), - ("lua", vec!["lua"]), - ("makefile", vec!["Makefile"]), - ("markdown", vec!["md", "markdown"]), - ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]), - ("perl", vec!["pl", "pm", "pod", "perl"]), - ("powershell", vec!["ps1", "psd1", "psm1"]), - ("python", vec!["py"]), - ("ruby", vec!["rb"]), - ("rust", vec!["rs"]), - ("sql", vec!["sql"]), - ("scala", vec!["scala"]), - ("shellscript", vec!["sh", "bash", "command", "zsh"]), - ("typescript", vec!["ts", "tsx"]), - ("tex", vec!["tex"]), - ("vb", vec!["vb"]), - ]) - }; - static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = { - let mut map = HashMap::new(); - for (lang, exts) in &*LANGUAGE_EXTENSION { - for ext in exts { - map.insert(*ext, *lang); - } - } - - map - }; -} - trait RepositoryExt { fn create_dataset(&self, writer: &mut impl Write) -> Result<()>; } @@ -79,21 +33,25 @@ impl RepositoryExt for Repository { .filter_map(Result::ok) .filter(is_source_code); + let mut context = TagsContext::new(); for entry in walk_dir { let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap(); + let language = get_language(relative_path.extension().unwrap()) + .unwrap() + .to_owned(); if let Ok(file_content) = read_to_string(entry.path()) { info!("Building {:?}", relative_path); - writer.write_json_lines([Document { + let doc = Document { git_url: self.git_url.clone(), filepath: relative_path.display().to_string(), - language: get_language(relative_path.extension().unwrap()) - .unwrap() - .to_owned(), max_line_length: metrics::max_line_length(&file_content), avg_line_length: metrics::avg_line_length(&file_content), alphanum_fraction: metrics::alphanum_fraction(&file_content), + tags: tags::collect(&mut context, &language, &file_content), + language, content: file_content, - }])?; + }; + writer.write_json_lines([doc])?; } else { error!("Cannot read {:?}", relative_path); } @@ -159,7 +117,11 @@ mod metrics { total += x.len(); } - total as f32 / len as f32 + if len > 0 { + total as f32 / len as f32 + } else { + 0.0 + } } pub fn alphanum_fraction(content: &str) -> f32 { @@ -167,6 +129,109 @@ mod metrics { .chars() .map(|x| f32::from(u8::from(x.is_alphanumeric()))) .sum(); - num_alphanumn / content.len() as f32 + if !content.is_empty() { + num_alphanumn / content.len() as f32 + } else { + 0.0 + } } } + +mod tags { + use tabby_common::Tag; + use tree_sitter_tags::TagsContext; + + use super::LANGUAGE_TAGS; + + pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec { + let config = LANGUAGE_TAGS.get(language); + let empty = Vec::new(); + + let Some(config) = config else { + return empty; + }; + + let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None) else { + return empty; + }; + + if has_error { + return empty; + } + + tags.filter_map(|x| x.ok()) + .map(|x| Tag { + range: x.range, + name_range: x.name_range, + line_range: x.line_range, + docs: x.docs, + is_definition: x.is_definition, + syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(), + }) + .collect() + } +} + +// Mark TagsConfiguration as thread sync / safe. +struct TagsConfigurationSync(TagsConfiguration); +unsafe impl Send for TagsConfigurationSync {} +unsafe impl Sync for TagsConfigurationSync {} + +lazy_static! { + static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { + HashMap::from([ + ("c", vec!["c", "h"]), + ("csharp", vec!["cs"]), + ( + "cpp", + vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"], + ), + ("css", vec!["css"]), + ("dockerfile", vec!["Dockerfile"]), + ("go", vec!["go"]), + ("haskell", vec!["hs"]), + ("html", vec!["html"]), + ("java", vec!["java"]), + ("javascript", vec!["js"]), + ("julia", vec!["jl"]), + ("lua", vec!["lua"]), + ("makefile", vec!["Makefile"]), + ("markdown", vec!["md", "markdown"]), + ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]), + ("perl", vec!["pl", "pm", "pod", "perl"]), + ("powershell", vec!["ps1", "psd1", "psm1"]), + ("python", vec!["py"]), + ("ruby", vec!["rb"]), + ("rust", vec!["rs"]), + ("sql", vec!["sql"]), + ("scala", vec!["scala"]), + ("shellscript", vec!["sh", "bash", "command", "zsh"]), + ("typescript", vec!["ts", "tsx"]), + ("tex", vec!["tex"]), + ("vb", vec!["vb"]), + ]) + }; + static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = { + let mut map = HashMap::new(); + for (lang, exts) in &*LANGUAGE_EXTENSION { + for ext in exts { + map.insert(*ext, *lang); + } + } + + map + }; + static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = { + HashMap::from([( + "javascript", + TagsConfigurationSync( + TagsConfiguration::new( + tree_sitter_javascript::language(), + tree_sitter_javascript::TAGGING_QUERY, + tree_sitter_javascript::LOCALS_QUERY, + ) + .unwrap(), + ), + )]) + }; +} diff --git a/crates/tabby-scheduler/src/document.rs b/crates/tabby-scheduler/src/document.rs deleted file mode 100644 index cc816bf..0000000 --- a/crates/tabby-scheduler/src/document.rs +++ /dev/null @@ -1,12 +0,0 @@ -use serde::{Deserialize, Serialize}; - -#[derive(Serialize, Deserialize)] -pub struct Document { - pub git_url: String, - pub filepath: String, - pub content: String, - pub language: String, - pub max_line_length: usize, - pub avg_line_length: f32, - pub alphanum_fraction: f32, -} diff --git a/crates/tabby-scheduler/src/index.rs b/crates/tabby-scheduler/src/index.rs index e3abe23..4e2d65b 100644 --- a/crates/tabby-scheduler/src/index.rs +++ b/crates/tabby-scheduler/src/index.rs @@ -1,23 +1,13 @@ -use std::{ - fs::{self, File}, - io::BufReader, -}; +use std::fs; use anyhow::Result; -use serde_jsonlines::JsonLinesReader; -use tabby_common::{ - config::Config, - path::{dataset_dir, index_dir}, -}; +use tabby_common::{config::Config, path::index_dir, Document}; use tantivy::{ directory::MmapDirectory, doc, schema::{Schema, STORED, STRING, TEXT}, Index, }; -use tracing::info; - -use crate::document::Document; pub fn index_repositories(_config: &Config) -> Result<()> { let mut builder = Schema::builder(); @@ -33,21 +23,13 @@ pub fn index_repositories(_config: &Config) -> Result<()> { let mut writer = index.writer(10_000_000)?; writer.delete_all_documents()?; - for path in dataset_dir().read_dir()? { - let path = path?.path(); - info!("Indexing {:?}", path.as_path()); - - let fp = BufReader::new(File::open(path.as_path())?); - let reader = JsonLinesReader::new(fp); - for doc in reader.read_all::() { - let doc = doc?; - writer.add_document(doc!( - git_url => doc.git_url, - filepath => doc.filepath, - content => doc.content, - language => doc.language, - ))?; - } + for doc in Document::all()? { + writer.add_document(doc!( + git_url => doc.git_url, + filepath => doc.filepath, + content => doc.content, + language => doc.language, + ))?; } writer.commit()?; diff --git a/crates/tabby-scheduler/src/lib.rs b/crates/tabby-scheduler/src/lib.rs index 0a05d8e..5d2c08f 100644 --- a/crates/tabby-scheduler/src/lib.rs +++ b/crates/tabby-scheduler/src/lib.rs @@ -1,5 +1,4 @@ mod dataset; -mod document; mod index; mod repository; diff --git a/crates/tabby/Cargo.toml b/crates/tabby/Cargo.toml index 1d53364..be6ee1a 100644 --- a/crates/tabby/Cargo.toml +++ b/crates/tabby/Cargo.toml @@ -6,7 +6,7 @@ edition = "2021" [dependencies] ctranslate2-bindings = { path = "../ctranslate2-bindings" } tabby-common = { path = "../tabby-common" } -tabby-scheduler = { path = "../tabby-scheduler" } +tabby-scheduler = { path = "../tabby-scheduler", optional = true } tabby-download = { path = "../tabby-download" } axum = "0.6" hyper = { version = "0.14", features = ["full"] } @@ -41,4 +41,6 @@ features = [ ] [features] +default = [ "scheduler" ] link_shared = ["ctranslate2-bindings/link_shared"] +scheduler = [ "tabby-scheduler" ] diff --git a/crates/tabby/src/main.rs b/crates/tabby/src/main.rs index 4891ef9..1c2f9c8 100644 --- a/crates/tabby/src/main.rs +++ b/crates/tabby/src/main.rs @@ -31,6 +31,7 @@ pub enum Commands { Download(download::DownloadArgs), /// Run scheduler progress for cron jobs integrating external code repositories. + #[cfg(feature = "scheduler")] Scheduler(SchedulerArgs), } @@ -49,6 +50,7 @@ async fn main() { match &cli.command { Commands::Serve(args) => serve::main(args).await, Commands::Download(args) => download::main(args).await, + #[cfg(feature = "scheduler")] Commands::Scheduler(args) => tabby_scheduler::scheduler(args.now) .await .unwrap_or_else(|err| fatal!("Scheduler failed due to '{}'", err)),