diff --git a/Cargo.lock b/Cargo.lock index cf095c7..3aef1df 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1723,6 +1723,16 @@ dependencies = [ "regex", ] +[[package]] +name = "globwalk" +version = "0.7.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d9db17aec586697a93219b19726b5b68307eba92898c34b170857343fe67c99d" +dependencies = [ + "ignore", + "walkdir", +] + [[package]] name = "gloo-timers" version = "0.2.6" @@ -3029,6 +3039,51 @@ version = "2.2.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e" +[[package]] +name = "pest" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ae9cee2a55a544be8b89dc6848072af97a20f2422603c10865be2a42b580fff5" +dependencies = [ + "memchr", + "thiserror", + "ucd-trie", +] + +[[package]] +name = "pest_derive" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81d78524685f5ef2a3b3bd1cafbc9fcabb036253d9b1463e726a91cd16e2dfc2" +dependencies = [ + "pest", + "pest_generator", +] + +[[package]] +name = "pest_generator" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "68bd1206e71118b5356dae5ddc61c8b11e28b09ef6a31acbd15ea48a28e0c227" +dependencies = [ + "pest", + "pest_meta", + "proc-macro2", + "quote", + "syn 2.0.28", +] + +[[package]] +name = "pest_meta" +version = "2.7.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7c747191d4ad9e4a4ab9c8798f1e82a39affe7ef9648390b7e5548d18e099de6" +dependencies = [ + "once_cell", + "pest", + "sha2", +] + [[package]] name = "petgraph" version = "0.6.3" @@ -3444,6 +3499,19 @@ version = "0.8.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "56d84fdd47036b038fc80dd333d10b6aab10d5d31f4a366e20014def75328d33" +[[package]] +name = "requirements" +version = "0.3.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2643e903f79d8e6bc310ee0def974d12a33561d14e0728511b6ba5e8be0791c3"
+dependencies = [
+ "globwalk",
+ "pest",
+ "pest_derive",
+ "regex",
+ "walkdir",
+]
+
 [[package]]
 name = "reqwest"
 version = "0.11.22"
@@ -4440,8 +4508,11 @@ dependencies = [
  "job_scheduler",
  "kdam",
  "lazy_static",
+ "requirements",
+ "serde",
  "serde-jsonlines",
  "serde_json",
+ "serdeconv",
  "tabby-common",
  "tantivy",
  "temp_testdir",
@@ -5399,6 +5470,12 @@ version = "1.16.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
 
+[[package]]
+name = "ucd-trie"
+version = "0.1.6"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
+
 [[package]]
 name = "unicase"
 version = "2.7.0"
diff --git a/crates/tabby-common/src/lib.rs b/crates/tabby-common/src/lib.rs
index ec17a42..c202833 100644
--- a/crates/tabby-common/src/lib.rs
+++ b/crates/tabby-common/src/lib.rs
@@ -49,3 +49,22 @@ pub struct Tag {
     pub is_definition: bool,
     pub syntax_type_name: String,
 }
+
+/// A single third-party package that a repository depends on.
+#[derive(Default, Serialize, Deserialize, Clone)]
+pub struct Dependency {
+    /// Language ecosystem of the package, e.g. `"python"` or `"rust"`.
+    pub language: String,
+    /// Package name as written in the manifest or lock file.
+    pub name: String,
+    /// Package version when known; left out of the serialized form when `None`.
+    #[serde(skip_serializing_if = "Option::is_none")]
+    pub version: Option<String>,
+}
+
+/// Serializable document holding a list of [`Dependency`] entries.
+#[derive(Default, Serialize, Deserialize)]
+pub struct DependencyFile {
+    /// The dependency entries.
+    pub deps: Vec<Dependency>,
+}
diff --git a/crates/tabby-common/src/path.rs b/crates/tabby-common/src/path.rs
index 55f757e..aaec53a 100644
--- a/crates/tabby-common/src/path.rs
+++ b/crates/tabby-common/src/path.rs
@@ -35,6 +35,11 @@ pub fn repositories_dir() -> PathBuf {
     tabby_root().join("repositories")
 }
 
+/// Path of the `deps.json` file, located inside [`repositories_dir`].
+pub fn dependency_file() -> PathBuf {
+    repositories_dir().join("deps.json")
+}
+
 pub fn index_dir() -> PathBuf {
     tabby_root().join("index")
 }
diff --git a/crates/tabby-scheduler/Cargo.toml b/crates/tabby-scheduler/Cargo.toml
index 912dd02..188ec7f
100644 --- a/crates/tabby-scheduler/Cargo.toml +++ b/crates/tabby-scheduler/Cargo.toml @@ -23,6 +23,9 @@ tree-sitter-go = "0.20.0" tree-sitter-ruby= "0.20.0" ignore = "0.4.20" kdam = { version = "0.5.0" } +requirements = "0.3.0" +serde.workspace = true +serdeconv.workspace = true [dev-dependencies] temp_testdir = "0.2" diff --git a/crates/tabby-scheduler/src/dataset.rs b/crates/tabby-scheduler/src/dataset.rs index bddbb97..e0ada78 100644 --- a/crates/tabby-scheduler/src/dataset.rs +++ b/crates/tabby-scheduler/src/dataset.rs @@ -1,3 +1,6 @@ +mod deps; +mod tags; + use std::{ collections::HashMap, ffi::OsStr, @@ -13,11 +16,11 @@ use lazy_static::lazy_static; use serde_jsonlines::WriteExt; use tabby_common::{ config::{Config, RepositoryConfig}, - path::dataset_dir, - SourceFile, + path::{dataset_dir, dependency_file}, + DependencyFile, SourceFile, }; use tracing::error; -use tree_sitter_tags::{TagsConfiguration, TagsContext}; +use tree_sitter_tags::TagsContext; use crate::utils::tqdm; @@ -92,10 +95,14 @@ pub fn create_dataset(config: &Config) -> Result<()> { None, ); + let mut deps = DependencyFile::default(); for repository in config.repositories.as_slice() { + deps::collect(repository.dir().as_path(), &mut deps); repository.create_dataset(&mut writer)?; } + serdeconv::to_json_file(&deps, dependency_file())?; + writer.flush()?; Ok(()) } @@ -135,47 +142,6 @@ mod metrics { } } -mod tags { - use tabby_common::Tag; - use tree_sitter_tags::TagsContext; - - use super::LANGUAGE_TAGS; - - pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec { - let config = LANGUAGE_TAGS.get(language); - let empty = Vec::new(); - - let Some(config) = config else { - return empty; - }; - - let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None) - else { - return empty; - }; - - if has_error { - return empty; - } - - tags.filter_map(|x| x.ok()) - .map(|x| Tag { - range: x.range, - name_range: x.name_range, - line_range: x.line_range, - 
docs: x.docs, - is_definition: x.is_definition, - syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(), - }) - .collect() - } -} - -// Mark TagsConfiguration as thread sync / safe. -struct TagsConfigurationSync(TagsConfiguration); -unsafe impl Send for TagsConfigurationSync {} -unsafe impl Sync for TagsConfigurationSync {} - lazy_static! { static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = { HashMap::from([ @@ -222,74 +188,4 @@ lazy_static! { map }; - static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = { - HashMap::from([ - ( - "python", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_python::language(), - tree_sitter_python::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "rust", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_rust::language(), - tree_sitter_rust::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "java", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_java::language(), - tree_sitter_java::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ( - "javascript-typescript", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_typescript::language_tsx(), - include_str!("../queries/tsx.scm"), - "", - ) - .unwrap(), - ), - ), - ( - "go", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_go::language(), - include_str!("../queries/go.scm"), - "", - ) - .unwrap(), - ), - ), - ( - "ruby", - TagsConfigurationSync( - TagsConfiguration::new( - tree_sitter_ruby::language(), - tree_sitter_ruby::TAGGING_QUERY, - "", - ) - .unwrap(), - ), - ), - ]) - }; } diff --git a/crates/tabby-scheduler/src/dataset/deps.rs b/crates/tabby-scheduler/src/dataset/deps.rs new file mode 100644 index 0000000..a8d4eb0 --- /dev/null +++ b/crates/tabby-scheduler/src/dataset/deps.rs @@ -0,0 +1,86 @@ +use std::{collections::HashSet, path::Path}; + +use anyhow::Result; +use serde::Deserialize; +use tabby_common::{Dependency, DependencyFile}; 
+use tracing::warn;
+
+/// Collects the dependencies declared by the repository at `path` into `file`.
+///
+/// Looks for `requirements.txt`, `poetry.lock` and `Cargo.lock` directly under `path`. Collection
+/// is best-effort: a source that fails is logged and skipped, and a source that does not exist
+/// contributes nothing. Afterwards `file` is deduplicated by `(language, name)`.
+pub fn collect(path: &Path, file: &mut DependencyFile) {
+    let sources = [
+        ("requirements.txt", process_requirements_txt(path)),
+        ("poetry.lock", process_lock_file(path, "poetry.lock", "python")),
+        ("Cargo.lock", process_lock_file(path, "Cargo.lock", "rust")),
+    ];
+
+    for (filename, result) in sources {
+        match result {
+            Ok(deps) => file.deps.extend(deps),
+            // Best-effort: report the failure and keep going with the remaining sources.
+            Err(err) => warn!(
+                "Failed to collect dependencies from {}: {}",
+                path.join(filename).display(),
+                err
+            ),
+        }
+    }
+
+    remove_duplicates(file);
+}
+
+/// Parses `requirements.txt` under `path` into unversioned Python dependencies.
+///
+/// Returns an empty list when the file does not exist, or when it cannot be parsed (the parse
+/// error is logged). Returns `Err` only when an existing file cannot be read.
+fn process_requirements_txt(path: &Path) -> Result<Vec<Dependency>> {
+    let requirements_txt = path.join("requirements.txt");
+    if !requirements_txt.is_file() {
+        return Ok(Vec::new());
+    }
+    let content = std::fs::read_to_string(requirements_txt)?;
+
+    let requirements = match requirements::parse_str(&content) {
+        Ok(requirements) => requirements,
+        Err(err) => {
+            warn!("Failed to parse requirements.txt: {}", err);
+            return Ok(Vec::new());
+        }
+    };
+
+    Ok(requirements
+        .into_iter()
+        // Entries without a package name are skipped.
+        .filter_map(|requirement| requirement.name)
+        .map(|name| Dependency {
+            language: "python".to_owned(),
+            name,
+            version: None, // requirements.txt doesn't come with accurate version information.
+        })
+        .collect())
+}
+
+/// One `package` entry of a lock file; only the name and version are read.
+#[derive(Deserialize)]
+struct LockFileDependency {
+    name: String,
+    version: String,
+}
+
+/// The part of a TOML lock file (`poetry.lock` / `Cargo.lock`) that is read here.
+#[derive(Deserialize)]
+struct LockFile {
+    // A lock file with no `package` entries is treated as having no dependencies.
+    #[serde(default)]
+    package: Vec<LockFileDependency>,
+}
+
+/// Reads the TOML lock file `filename` under `path`, labelling every package with `language`.
+///
+/// Returns an empty list when the file does not exist, and `Err` when an existing file cannot be
+/// read or deserialized.
+fn process_lock_file(path: &Path, filename: &str, language: &str) -> Result<Vec<Dependency>> {
+    let lock_file = path.join(filename);
+    if !lock_file.is_file() {
+        return Ok(Vec::new());
+    }
+
+    let parsed: LockFile = serdeconv::from_toml_file(lock_file)?;
+    Ok(parsed
+        .package
+        .into_iter()
+        .map(|x| Dependency {
+            language: language.to_string(),
+            name: x.name,
+            version: Some(x.version),
+        })
+        .collect())
+}
+
+/// Drops entries that repeat an earlier `(language, name)` pair, keeping first-seen order.
+///
+/// The version is not part of the key, so only the first version seen for a package is kept.
+fn remove_duplicates(file: &mut DependencyFile) {
+    let mut seen: HashSet<(String, String)> = HashSet::new();
+    // `insert` returns `false` for a key that is already present, so `retain` drops the repeat.
+    file.deps
+        .retain(|dep| seen.insert((dep.language.clone(), dep.name.clone())));
+}
diff --git a/crates/tabby-scheduler/src/dataset/tags.rs b/crates/tabby-scheduler/src/dataset/tags.rs
new file mode 100644
index
0000000..7fbf87d
--- /dev/null
+++ b/crates/tabby-scheduler/src/dataset/tags.rs
@@ -0,0 +1,81 @@
+use std::collections::HashMap;
+
+use lazy_static::lazy_static;
+use tabby_common::Tag;
+use tree_sitter_tags::{TagsConfiguration, TagsContext};
+
+/// Extracts tags from `content` using the configuration registered for `language`.
+///
+/// Returns an empty list when `language` has no entry in `LANGUAGE_TAGS`, or when tag
+/// generation fails or reports an error. Individual tags that come back as `Err` are skipped.
+pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec<Tag> {
+    let Some(config) = LANGUAGE_TAGS.get(language) else {
+        return Vec::new();
+    };
+
+    let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None) else {
+        return Vec::new();
+    };
+
+    if has_error {
+        return Vec::new();
+    }
+
+    tags.filter_map(|x| x.ok())
+        .map(|x| Tag {
+            range: x.range,
+            name_range: x.name_range,
+            line_range: x.line_range,
+            docs: x.docs,
+            is_definition: x.is_definition,
+            syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(),
+        })
+        .collect()
+}
+
+// Mark TagsConfiguration as thread sync / safe.
+// NOTE(review): the compiler cannot check these impls; they assume a built `TagsConfiguration`
+// can be shared between threads -- confirm against tree-sitter-tags.
+struct TagsConfigurationSync(TagsConfiguration);
+unsafe impl Send for TagsConfigurationSync {}
+unsafe impl Sync for TagsConfigurationSync {}
+
+lazy_static! {
+    /// Tagging configuration per language key, built on first access.
+    static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
+        // Every entry is built the same way; an invalid grammar/query pair panics here.
+        let config = |language, query: &str| {
+            TagsConfigurationSync(TagsConfiguration::new(language, query, "").unwrap())
+        };
+
+        HashMap::from([
+            (
+                "python",
+                config(tree_sitter_python::language(), tree_sitter_python::TAGGING_QUERY),
+            ),
+            (
+                "rust",
+                config(tree_sitter_rust::language(), tree_sitter_rust::TAGGING_QUERY),
+            ),
+            (
+                "java",
+                config(tree_sitter_java::language(), tree_sitter_java::TAGGING_QUERY),
+            ),
+            (
+                "javascript-typescript",
+                config(
+                    tree_sitter_typescript::language_tsx(),
+                    include_str!("../../queries/tsx.scm"),
+                ),
+            ),
+            (
+                "go",
+                config(tree_sitter_go::language(), include_str!("../../queries/go.scm")),
+            ),
+            (
+                "ruby",
+                config(tree_sitter_ruby::language(), tree_sitter_ruby::TAGGING_QUERY),
+            ),
+        ])
+    };
+}