feat: experiment ctags support in scheduler (#207)

* experiment ctags support

* add document.rs

* extract Document to common

* integrate tags into dataset builder

* skip if none

* do not add scheduler in client binary

* fix fmt
sweep/improve-logging-information
Meng Zhang 2023-06-21 19:48:13 -07:00 committed by GitHub
parent 41c747e295
commit 7ed5dd584d
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
12 changed files with 220 additions and 98 deletions

View File

@ -82,7 +82,7 @@ jobs:
~/.cargo/git
- run: bash ./ci/prepare_build_environment.sh
- name: Build release binary
run: cargo build --release --target ${{ matrix.target }}
run: cargo build --no-default-features --release --target ${{ matrix.target }}
- name: Rename release binary
run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}

35
Cargo.lock generated
View File

@ -2813,6 +2813,7 @@ dependencies = [
"filenamify",
"lazy_static",
"serde",
"serde-jsonlines",
"serdeconv",
]
@ -2845,6 +2846,8 @@ dependencies = [
"temp_testdir",
"tracing",
"tracing-test",
"tree-sitter-javascript",
"tree-sitter-tags",
"walkdir",
]
@ -3480,6 +3483,38 @@ dependencies = [
"syn 1.0.109",
]
[[package]]
name = "tree-sitter"
version = "0.20.10"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "e747b1f9b7b931ed39a548c1fae149101497de3c1fc8d9e18c62c1a66c683d3d"
dependencies = [
"cc",
"regex",
]
[[package]]
name = "tree-sitter-javascript"
version = "0.20.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2490fab08630b2c8943c320f7b63473cbf65511c8d83aec551beb9b4375906ed"
dependencies = [
"cc",
"tree-sitter",
]
[[package]]
name = "tree-sitter-tags"
version = "0.20.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ccb3f1376219530a37a809751ecf65aa35fd8b9c1c4ab6d4faf5f6a9eeda2c05"
dependencies = [
"memchr",
"regex",
"thiserror",
"tree-sitter",
]
[[package]]
name = "try-lock"
version = "0.2.4"

View File

@ -23,3 +23,4 @@ tokio-util = "0.7"
tracing = "0.1"
tracing-subscriber = "0.3"
anyhow = "1.0.71"
serde-jsonlines = "0.4.0"

View File

@ -9,6 +9,7 @@ filenamify = "0.1.0"
lazy_static = { workspace = true }
serde = { workspace = true }
serdeconv = { workspace = true }
serde-jsonlines = { workspace = true }
[features]
testutils = []

View File

@ -1,3 +1,48 @@
pub mod config;
pub mod events;
pub mod path;
use std::{
fs::File,
io::{BufReader, Error},
ops::Range,
};
use path::dataset_dir;
use serde::{Deserialize, Serialize};
use serde_jsonlines::JsonLinesReader;
/// One source file captured in the dataset, serialized as a JSON-lines
/// record, together with per-file metrics and extracted code tags.
#[derive(Serialize, Deserialize)]
pub struct Document {
    /// URL of the git repository this file came from.
    pub git_url: String,
    /// Path of the file relative to the repository root.
    pub filepath: String,
    /// Full text content of the file.
    pub content: String,
    /// Language identifier — presumably derived from the file extension by
    /// the dataset builder; TODO confirm against the writer side.
    pub language: String,
    /// Length of the longest line in `content`.
    pub max_line_length: usize,
    /// Mean line length across `content`.
    pub avg_line_length: f32,
    /// Fraction of characters in `content` that are alphanumeric (0.0..=1.0).
    pub alphanum_fraction: f32,
    /// Tags extracted via tree-sitter for this file (empty when the
    /// language has no tagging configuration).
    pub tags: Vec<Tag>,
}
impl Document {
    /// Iterate over every `Document` stored under `dataset_dir()`.
    ///
    /// Each file in the dataset directory is expected to be a JSON-lines
    /// file of serialized documents. Returns `Err` only when the dataset
    /// directory itself cannot be listed.
    ///
    /// # Panics
    ///
    /// Panics lazily (while iterating) if a directory entry cannot be read,
    /// a file cannot be opened, or a line fails to deserialize.
    pub fn all() -> Result<impl Iterator<Item = Self>, Error> {
        let entries = dataset_dir().read_dir()?;
        let documents = entries.flat_map(|entry| {
            let file = File::open(entry.unwrap().path()).unwrap();
            let lines = JsonLinesReader::new(BufReader::new(file));
            lines.read_all::<Document>().map(|line| line.unwrap())
        });
        Ok(documents)
    }
}
/// A code-navigation tag produced by tree-sitter-tags, with ranges
/// pointing back into the owning document's `content`.
#[derive(Serialize, Deserialize)]
pub struct Tag {
    /// Range of the whole tagged node — assumed to be byte offsets into the
    /// file content, per tree-sitter-tags conventions; TODO confirm.
    pub range: Range<usize>,
    /// Range covering just the tag's name (e.g. the function identifier).
    pub name_range: Range<usize>,
    /// Range of lines the tag spans — presumably 0-based line numbers from
    /// tree-sitter-tags; TODO confirm units against consumers.
    pub line_range: Range<usize>,
    /// Associated documentation comment, when the grammar captured one.
    /// Omitted from the serialized form when absent.
    #[serde(skip_serializing_if = "Option::is_none")]
    pub docs: Option<String>,
    /// True for a definition site, false for a reference.
    pub is_definition: bool,
    /// Grammar-specific syntax kind for this tag (e.g. "function", "class").
    pub syntax_type_name: String,
}

View File

@ -12,10 +12,12 @@ job_scheduler = "1.2.1"
tabby-common = { path = "../tabby-common" }
tantivy = "0.19.2"
tracing = { workspace = true }
tree-sitter-javascript = "0.20.0"
tree-sitter-tags = "0.20.2"
walkdir = "2.3.3"
lazy_static = { workspace = true }
serde = { workspace = true }
serde-jsonlines = "0.4.0"
serde-jsonlines = { workspace = true }
file-rotate = "0.7.5"
[dev-dependencies]

View File

@ -12,58 +12,12 @@ use serde_jsonlines::WriteExt;
use tabby_common::{
config::{Config, Repository},
path::dataset_dir,
Document,
};
use tracing::{error, info};
use tree_sitter_tags::{TagsConfiguration, TagsContext};
use walkdir::{DirEntry, WalkDir};
use crate::document::Document;
lazy_static! {
static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
HashMap::from([
("c", vec!["c", "h"]),
("csharp", vec!["cs"]),
(
"cpp",
vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"],
),
("css", vec!["css"]),
("dockerfile", vec!["Dockerfile"]),
("go", vec!["go"]),
("haskell", vec!["hs"]),
("html", vec!["html"]),
("java", vec!["java"]),
("javascript", vec!["js"]),
("julia", vec!["jl"]),
("lua", vec!["lua"]),
("makefile", vec!["Makefile"]),
("markdown", vec!["md", "markdown"]),
("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]),
("perl", vec!["pl", "pm", "pod", "perl"]),
("powershell", vec!["ps1", "psd1", "psm1"]),
("python", vec!["py"]),
("ruby", vec!["rb"]),
("rust", vec!["rs"]),
("sql", vec!["sql"]),
("scala", vec!["scala"]),
("shellscript", vec!["sh", "bash", "command", "zsh"]),
("typescript", vec!["ts", "tsx"]),
("tex", vec!["tex"]),
("vb", vec!["vb"]),
])
};
static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = {
let mut map = HashMap::new();
for (lang, exts) in &*LANGUAGE_EXTENSION {
for ext in exts {
map.insert(*ext, *lang);
}
}
map
};
}
trait RepositoryExt {
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
}
@ -79,21 +33,25 @@ impl RepositoryExt for Repository {
.filter_map(Result::ok)
.filter(is_source_code);
let mut context = TagsContext::new();
for entry in walk_dir {
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
let language = get_language(relative_path.extension().unwrap())
.unwrap()
.to_owned();
if let Ok(file_content) = read_to_string(entry.path()) {
info!("Building {:?}", relative_path);
writer.write_json_lines([Document {
let doc = Document {
git_url: self.git_url.clone(),
filepath: relative_path.display().to_string(),
language: get_language(relative_path.extension().unwrap())
.unwrap()
.to_owned(),
max_line_length: metrics::max_line_length(&file_content),
avg_line_length: metrics::avg_line_length(&file_content),
alphanum_fraction: metrics::alphanum_fraction(&file_content),
tags: tags::collect(&mut context, &language, &file_content),
language,
content: file_content,
}])?;
};
writer.write_json_lines([doc])?;
} else {
error!("Cannot read {:?}", relative_path);
}
@ -159,7 +117,11 @@ mod metrics {
total += x.len();
}
total as f32 / len as f32
if len > 0 {
total as f32 / len as f32
} else {
0.0
}
}
pub fn alphanum_fraction(content: &str) -> f32 {
@ -167,6 +129,109 @@ mod metrics {
.chars()
.map(|x| f32::from(u8::from(x.is_alphanumeric())))
.sum();
num_alphanumn / content.len() as f32
if !content.is_empty() {
num_alphanumn / content.len() as f32
} else {
0.0
}
}
}
mod tags {
    use tabby_common::Tag;
    use tree_sitter_tags::TagsContext;

    use super::LANGUAGE_TAGS;

    /// Extract code-navigation tags from `content` using the tree-sitter
    /// tagging configuration registered for `language`.
    ///
    /// Returns an empty vector when the language has no registered
    /// configuration, when tag generation fails, or when the parse
    /// reported errors.
    pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec<Tag> {
        let Some(config) = LANGUAGE_TAGS.get(language) else {
            return Vec::new();
        };
        match context.generate_tags(&config.0, content.as_bytes(), None) {
            // Only keep results from a clean parse (`has_error == false`).
            Ok((tags, false)) => tags
                .filter_map(Result::ok)
                .map(|tag| Tag {
                    range: tag.range,
                    name_range: tag.name_range,
                    line_range: tag.line_range,
                    docs: tag.docs,
                    is_definition: tag.is_definition,
                    syntax_type_name: config.0.syntax_type_name(tag.syntax_type_id).to_owned(),
                })
                .collect(),
            // Generation error or parse error: fall back to no tags.
            _ => Vec::new(),
        }
    }
}
// Mark TagsConfiguration as thread sync / safe.
//
// SAFETY(review): this wrapper exists only so `TagsConfiguration` can live in
// a `lazy_static` (which requires `Sync`). After construction it is only ever
// read through shared references (see `LANGUAGE_TAGS`). This is sound only if
// `TagsConfiguration` has no interior mutability — TODO confirm against the
// tree-sitter-tags crate before extending usage.
struct TagsConfigurationSync(TagsConfiguration);
unsafe impl Send for TagsConfigurationSync {}
unsafe impl Sync for TagsConfigurationSync {}
lazy_static! {
    /// Maps a canonical language name to the file extensions (or bare
    /// filenames such as "Makefile" and "Dockerfile") associated with it.
    static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
        HashMap::from([
            ("c", vec!["c", "h"]),
            ("csharp", vec!["cs"]),
            (
                "cpp",
                vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"],
            ),
            ("css", vec!["css"]),
            ("dockerfile", vec!["Dockerfile"]),
            ("go", vec!["go"]),
            ("haskell", vec!["hs"]),
            ("html", vec!["html"]),
            ("java", vec!["java"]),
            ("javascript", vec!["js"]),
            ("julia", vec!["jl"]),
            ("lua", vec!["lua"]),
            ("makefile", vec!["Makefile"]),
            ("markdown", vec!["md", "markdown"]),
            ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]),
            ("perl", vec!["pl", "pm", "pod", "perl"]),
            ("powershell", vec!["ps1", "psd1", "psm1"]),
            ("python", vec!["py"]),
            ("ruby", vec!["rb"]),
            ("rust", vec!["rs"]),
            ("sql", vec!["sql"]),
            ("scala", vec!["scala"]),
            ("shellscript", vec!["sh", "bash", "command", "zsh"]),
            ("typescript", vec!["ts", "tsx"]),
            ("tex", vec!["tex"]),
            ("vb", vec!["vb"]),
        ])
    };
    /// Inverse of `LANGUAGE_EXTENSION`: maps each extension back to its
    /// language name.
    ///
    /// NOTE(review): if an extension were ever listed under two languages,
    /// the later insertion would silently win — confirm entries stay disjoint
    /// when adding languages.
    static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = {
        let mut map = HashMap::new();
        for (lang, exts) in &*LANGUAGE_EXTENSION {
            for ext in exts {
                map.insert(*ext, *lang);
            }
        }
        map
    };
    /// Per-language tree-sitter tagging configurations used by
    /// `tags::collect`. Only JavaScript is registered so far; files in
    /// other languages produce an empty tag list.
    static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
        HashMap::from([(
            "javascript",
            TagsConfigurationSync(
                TagsConfiguration::new(
                    tree_sitter_javascript::language(),
                    tree_sitter_javascript::TAGGING_QUERY,
                    tree_sitter_javascript::LOCALS_QUERY,
                )
                .unwrap(),
            ),
        )])
    };
}

View File

@ -1,12 +0,0 @@
use serde::{Deserialize, Serialize};
#[derive(Serialize, Deserialize)]
pub struct Document {
pub git_url: String,
pub filepath: String,
pub content: String,
pub language: String,
pub max_line_length: usize,
pub avg_line_length: f32,
pub alphanum_fraction: f32,
}

View File

@ -1,23 +1,13 @@
use std::{
fs::{self, File},
io::BufReader,
};
use std::fs;
use anyhow::Result;
use serde_jsonlines::JsonLinesReader;
use tabby_common::{
config::Config,
path::{dataset_dir, index_dir},
};
use tabby_common::{config::Config, path::index_dir, Document};
use tantivy::{
directory::MmapDirectory,
doc,
schema::{Schema, STORED, STRING, TEXT},
Index,
};
use tracing::info;
use crate::document::Document;
pub fn index_repositories(_config: &Config) -> Result<()> {
let mut builder = Schema::builder();
@ -33,21 +23,13 @@ pub fn index_repositories(_config: &Config) -> Result<()> {
let mut writer = index.writer(10_000_000)?;
writer.delete_all_documents()?;
for path in dataset_dir().read_dir()? {
let path = path?.path();
info!("Indexing {:?}", path.as_path());
let fp = BufReader::new(File::open(path.as_path())?);
let reader = JsonLinesReader::new(fp);
for doc in reader.read_all::<Document>() {
let doc = doc?;
writer.add_document(doc!(
git_url => doc.git_url,
filepath => doc.filepath,
content => doc.content,
language => doc.language,
))?;
}
for doc in Document::all()? {
writer.add_document(doc!(
git_url => doc.git_url,
filepath => doc.filepath,
content => doc.content,
language => doc.language,
))?;
}
writer.commit()?;

View File

@ -1,5 +1,4 @@
mod dataset;
mod document;
mod index;
mod repository;

View File

@ -6,7 +6,7 @@ edition = "2021"
[dependencies]
ctranslate2-bindings = { path = "../ctranslate2-bindings" }
tabby-common = { path = "../tabby-common" }
tabby-scheduler = { path = "../tabby-scheduler" }
tabby-scheduler = { path = "../tabby-scheduler", optional = true }
tabby-download = { path = "../tabby-download" }
axum = "0.6"
hyper = { version = "0.14", features = ["full"] }
@ -41,4 +41,6 @@ features = [
]
[features]
default = [ "scheduler" ]
link_shared = ["ctranslate2-bindings/link_shared"]
scheduler = [ "tabby-scheduler" ]

View File

@ -31,6 +31,7 @@ pub enum Commands {
Download(download::DownloadArgs),
/// Run scheduler progress for cron jobs integrating external code repositories.
#[cfg(feature = "scheduler")]
Scheduler(SchedulerArgs),
}
@ -49,6 +50,7 @@ async fn main() {
match &cli.command {
Commands::Serve(args) => serve::main(args).await,
Commands::Download(args) => download::main(args).await,
#[cfg(feature = "scheduler")]
Commands::Scheduler(args) => tabby_scheduler::scheduler(args.now)
.await
.unwrap_or_else(|err| fatal!("Scheduler failed due to '{}'", err)),