feat: implement basic dependency detection for python / rust (#825)

* refactor: extract tags mod for dataset.rs

* feat: implement basic dependency detection for python / rust
release-fix-intellij-update-support-version-range
Meng Zhang 2023-11-18 00:05:48 -08:00 committed by GitHub
parent 5e7ca4f569
commit 1fe0922c72
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
7 changed files with 304 additions and 114 deletions

77
Cargo.lock generated
View File

@ -1723,6 +1723,16 @@ dependencies = [
"regex",
]
[[package]]
name = "globwalk"
version = "0.7.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9db17aec586697a93219b19726b5b68307eba92898c34b170857343fe67c99d"
dependencies = [
"ignore",
"walkdir",
]
[[package]]
name = "gloo-timers"
version = "0.2.6"
@ -3029,6 +3039,51 @@ version = "2.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "478c572c3d73181ff3c2539045f6eb99e5491218eae919370993b890cdbdd98e"
[[package]]
name = "pest"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ae9cee2a55a544be8b89dc6848072af97a20f2422603c10865be2a42b580fff5"
dependencies = [
"memchr",
"thiserror",
"ucd-trie",
]
[[package]]
name = "pest_derive"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "81d78524685f5ef2a3b3bd1cafbc9fcabb036253d9b1463e726a91cd16e2dfc2"
dependencies = [
"pest",
"pest_generator",
]
[[package]]
name = "pest_generator"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "68bd1206e71118b5356dae5ddc61c8b11e28b09ef6a31acbd15ea48a28e0c227"
dependencies = [
"pest",
"pest_meta",
"proc-macro2",
"quote",
"syn 2.0.28",
]
[[package]]
name = "pest_meta"
version = "2.7.5"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7c747191d4ad9e4a4ab9c8798f1e82a39affe7ef9648390b7e5548d18e099de6"
dependencies = [
"once_cell",
"pest",
"sha2",
]
[[package]]
name = "petgraph"
version = "0.6.3"
@ -3444,6 +3499,19 @@ version = "0.8.1"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "56d84fdd47036b038fc80dd333d10b6aab10d5d31f4a366e20014def75328d33"
[[package]]
name = "requirements"
version = "0.3.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2643e903f79d8e6bc310ee0def974d12a33561d14e0728511b6ba5e8be0791c3"
dependencies = [
"globwalk",
"pest",
"pest_derive",
"regex",
"walkdir",
]
[[package]]
name = "reqwest"
version = "0.11.22"
@ -4440,8 +4508,11 @@ dependencies = [
"job_scheduler",
"kdam",
"lazy_static",
"requirements",
"serde",
"serde-jsonlines",
"serde_json",
"serdeconv",
"tabby-common",
"tantivy",
"temp_testdir",
@ -5399,6 +5470,12 @@ version = "1.16.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "497961ef93d974e23eb6f433eb5fe1b7930b659f06d12dec6fc44a8f554c0bba"
[[package]]
name = "ucd-trie"
version = "0.1.6"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "ed646292ffc8188ef8ea4d1e0e0150fb15a5c2e12ad9b8fc191ae7a8a7f3c4b9"
[[package]]
name = "unicase"
version = "2.7.0"

View File

@ -49,3 +49,16 @@ pub struct Tag {
pub is_definition: bool,
pub syntax_type_name: String,
}
/// A single package dependency detected in a repository.
#[derive(Default, Serialize, Deserialize, Clone)]
pub struct Dependency {
/// Name of the language ecosystem the dependency belongs to.
pub language: String,
/// Package name as it appears in the manifest / lock file it was read from.
pub name: String,
/// Version of the package, when known. Left out of the serialized output when `None`.
#[serde(skip_serializing_if = "Option::is_none")]
pub version: Option<String>,
}
/// Serializable collection of detected dependencies.
#[derive(Default, Serialize, Deserialize)]
pub struct DependencyFile {
/// Every dependency recorded so far.
pub deps: Vec<Dependency>,
}

View File

@ -35,6 +35,10 @@ pub fn repositories_dir() -> PathBuf {
tabby_root().join("repositories")
}
/// Returns the location of `deps.json` inside the repositories directory.
pub fn dependency_file() -> PathBuf {
    let mut location = repositories_dir();
    location.push("deps.json");
    location
}
/// Returns the `index` directory located directly under the tabby root.
pub fn index_dir() -> PathBuf {
    let mut location = tabby_root();
    location.push("index");
    location
}

View File

@ -23,6 +23,9 @@ tree-sitter-go = "0.20.0"
tree-sitter-ruby= "0.20.0"
ignore = "0.4.20"
kdam = { version = "0.5.0" }
requirements = "0.3.0"
serde.workspace = true
serdeconv.workspace = true
[dev-dependencies]
temp_testdir = "0.2"

View File

@ -1,3 +1,6 @@
mod deps;
mod tags;
use std::{
collections::HashMap,
ffi::OsStr,
@ -13,11 +16,11 @@ use lazy_static::lazy_static;
use serde_jsonlines::WriteExt;
use tabby_common::{
config::{Config, RepositoryConfig},
path::dataset_dir,
SourceFile,
path::{dataset_dir, dependency_file},
DependencyFile, SourceFile,
};
use tracing::error;
use tree_sitter_tags::{TagsConfiguration, TagsContext};
use tree_sitter_tags::TagsContext;
use crate::utils::tqdm;
@ -92,10 +95,14 @@ pub fn create_dataset(config: &Config) -> Result<()> {
None,
);
let mut deps = DependencyFile::default();
for repository in config.repositories.as_slice() {
deps::collect(repository.dir().as_path(), &mut deps);
repository.create_dataset(&mut writer)?;
}
serdeconv::to_json_file(&deps, dependency_file())?;
writer.flush()?;
Ok(())
}
@ -135,47 +142,6 @@ mod metrics {
}
}
/// Tag extraction built on top of `tree_sitter_tags`.
mod tags {
use tabby_common::Tag;
use tree_sitter_tags::TagsContext;
use super::LANGUAGE_TAGS;
/// Generates the tags found in `content` for the given `language`.
///
/// Returns an empty vector when `language` has no entry in `LANGUAGE_TAGS`,
/// when tag generation fails, or when tag generation reports an error flag.
/// Individual tags that fail to resolve are skipped.
pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec<Tag> {
let config = LANGUAGE_TAGS.get(language);
let empty = Vec::new();
// No tagging configuration registered for this language.
let Some(config) = config else {
return empty;
};
let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None)
else {
return empty;
};
// The whole result is discarded when the error flag is set.
if has_error {
return empty;
}
// Keep only the successfully produced tags and convert them to the crate's own `Tag` type.
tags.filter_map(|x| x.ok())
.map(|x| Tag {
range: x.range,
name_range: x.name_range,
line_range: x.line_range,
docs: x.docs,
is_definition: x.is_definition,
syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(),
})
.collect()
}
}
// Newtype wrapper that marks `TagsConfiguration` as `Send` + `Sync`, which lets it
// be stored inside a `lazy_static!` map.
// NOTE(review): the unsafe impls below assert thread safety that the compiler cannot
// check; presumably this relies on the configuration only being read after it is
// built — confirm against `tree_sitter_tags`.
struct TagsConfigurationSync(TagsConfiguration);
unsafe impl Send for TagsConfigurationSync {}
unsafe impl Sync for TagsConfigurationSync {}
lazy_static! {
static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
HashMap::from([
@ -222,74 +188,4 @@ lazy_static! {
map
};
static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
HashMap::from([
(
"python",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_python::language(),
tree_sitter_python::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"rust",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_rust::language(),
tree_sitter_rust::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"java",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_java::language(),
tree_sitter_java::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
(
"javascript-typescript",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_typescript::language_tsx(),
include_str!("../queries/tsx.scm"),
"",
)
.unwrap(),
),
),
(
"go",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_go::language(),
include_str!("../queries/go.scm"),
"",
)
.unwrap(),
),
),
(
"ruby",
TagsConfigurationSync(
TagsConfiguration::new(
tree_sitter_ruby::language(),
tree_sitter_ruby::TAGGING_QUERY,
"",
)
.unwrap(),
),
),
])
};
}

View File

@ -0,0 +1,86 @@
use std::{collections::HashSet, path::Path};
use anyhow::Result;
use serde::Deserialize;
use tabby_common::{Dependency, DependencyFile};
use tracing::warn;
/// Gathers the dependencies declared under `path` and adds them to `file`.
///
/// Three sources are consulted, in this order: `requirements.txt`,
/// `poetry.lock` and `Cargo.lock`. A source that cannot be read or parsed
/// contributes nothing. Duplicates are removed from `file` afterwards.
pub fn collect(path: &Path, file: &mut DependencyFile) {
    // Each `Result` acts as an iterator of zero or one `Vec`; failed sources
    // therefore simply vanish from the chain.
    let discovered = process_requirements_txt(path)
        .into_iter()
        .chain(process_lock_file(path, "poetry.lock", "python"))
        .chain(process_lock_file(path, "Cargo.lock", "rust"))
        .flatten();
    file.deps.extend(discovered);

    remove_duplicates(file);
}
/// Reads `requirements.txt` under `path` and lists the named python requirements.
///
/// # Errors
///
/// Fails when the file cannot be read. A file that is readable but cannot be
/// parsed is only logged as a warning and yields an empty list.
fn process_requirements_txt(path: &Path) -> Result<Vec<Dependency>> {
    let content = std::fs::read_to_string(path.join("requirements.txt"))?;

    let parsed = match requirements::parse_str(&content) {
        Ok(parsed) => parsed,
        Err(err) => {
            warn!("Failed to parse requirements.txt: {}", err);
            return Ok(Vec::new());
        }
    };

    // Entries without a name are dropped.
    let deps = parsed
        .into_iter()
        .filter_map(|requirement| requirement.name)
        .map(|name| Dependency {
            language: "python".to_owned(),
            name,
            // requirements.txt doesn't carry accurate version information.
            version: None,
        })
        .collect();
    Ok(deps)
}
/// One element of a lock file's `package` array; only the fields read here are declared.
#[derive(Deserialize)]
struct LockFileDependency {
/// Package name.
name: String,
/// Version recorded for the package in the lock file.
version: String,
}
/// Subset of a TOML lock file that this module deserializes.
#[derive(Deserialize)]
struct LockFile {
/// Entries of the top-level `package` array.
package: Vec<LockFileDependency>,
}
fn process_lock_file(path: &Path, filename: &str, language: &str) -> Result<Vec<Dependency>> {
let poetry_lock = path.join(filename);
let deps: LockFile = serdeconv::from_toml_file(poetry_lock)?;
Ok(deps
.package
.into_iter()
.map(|x| Dependency {
language: language.to_string(),
name: x.name,
version: Some(x.version),
})
.collect())
}
/// Removes duplicate dependencies from `file`, in place.
///
/// Two dependencies are duplicates when they share the same `(language, name)`
/// pair; the version is not part of the key. The first occurrence is kept and
/// the relative order of the surviving entries is preserved.
fn remove_duplicates(file: &mut DependencyFile) {
    let mut seen: HashSet<(String, String)> = HashSet::with_capacity(file.deps.len());
    // `HashSet::insert` returns `false` for a key that is already present, so it
    // serves as the keep/drop predicate with a single lookup per element and
    // without cloning the whole `Dependency`.
    file.deps
        .retain(|dep| seen.insert((dep.language.clone(), dep.name.clone())));
}

View File

@ -0,0 +1,111 @@
use std::collections::HashMap;
use lazy_static::lazy_static;
use tabby_common::Tag;
use tree_sitter_tags::{TagsConfiguration, TagsContext};
pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec<Tag> {
let config = LANGUAGE_TAGS.get(language);
let empty = Vec::new();
let Some(config) = config else {
return empty;
};
let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None) else {
return empty;
};
if has_error {
return empty;
}
tags.filter_map(|x| x.ok())
.map(|x| Tag {
range: x.range,
name_range: x.name_range,
line_range: x.line_range,
docs: x.docs,
is_definition: x.is_definition,
syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(),
})
.collect()
}
// Newtype wrapper that marks `TagsConfiguration` as `Send` + `Sync`, which lets it
// be stored inside the `lazy_static!` map below.
// NOTE(review): the unsafe impls assert thread safety that the compiler cannot
// check; presumably this relies on the configuration only being read after it is
// built — confirm against `tree_sitter_tags`.
struct TagsConfigurationSync(TagsConfiguration);
unsafe impl Send for TagsConfigurationSync {}
unsafe impl Sync for TagsConfigurationSync {}
lazy_static! {
    /// Tagging configuration for every supported language, keyed by language name.
    static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
        // Builds the configuration for one language from its grammar and tags
        // query (no locals query). Panics if the configuration cannot be built.
        let configure = |language, tags_query: &str| {
            TagsConfigurationSync(TagsConfiguration::new(language, tags_query, "").unwrap())
        };

        HashMap::from([
            (
                "python",
                configure(
                    tree_sitter_python::language(),
                    tree_sitter_python::TAGGING_QUERY,
                ),
            ),
            (
                "rust",
                configure(
                    tree_sitter_rust::language(),
                    tree_sitter_rust::TAGGING_QUERY,
                ),
            ),
            (
                "java",
                configure(
                    tree_sitter_java::language(),
                    tree_sitter_java::TAGGING_QUERY,
                ),
            ),
            (
                "javascript-typescript",
                configure(
                    tree_sitter_typescript::language_tsx(),
                    include_str!("../../queries/tsx.scm"),
                ),
            ),
            (
                "go",
                configure(
                    tree_sitter_go::language(),
                    include_str!("../../queries/go.scm"),
                ),
            ),
            (
                "ruby",
                configure(
                    tree_sitter_ruby::language(),
                    tree_sitter_ruby::TAGGING_QUERY,
                ),
            ),
        ])
    };
}