2023-06-12 15:19:43 +00:00
|
|
|
use std::{
|
|
|
|
|
collections::HashMap,
|
|
|
|
|
ffi::OsStr,
|
|
|
|
|
fs::{self, read_to_string},
|
|
|
|
|
io::Write,
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
use anyhow::Result;
|
|
|
|
|
use file_rotate::{compression::Compression, suffix::AppendCount, ContentLimit, FileRotate};
|
2023-10-24 01:29:38 +00:00
|
|
|
use ignore::{DirEntry, Walk};
|
2023-06-12 15:19:43 +00:00
|
|
|
use lazy_static::lazy_static;
|
|
|
|
|
use serde_jsonlines::WriteExt;
|
|
|
|
|
use tabby_common::{
|
2023-10-25 22:05:23 +00:00
|
|
|
config::{Config, RepositoryConfig},
|
2023-06-12 15:19:43 +00:00
|
|
|
path::dataset_dir,
|
2023-10-04 23:27:19 +00:00
|
|
|
SourceFile,
|
2023-06-12 15:19:43 +00:00
|
|
|
};
|
|
|
|
|
use tracing::{error, info};
|
2023-06-22 02:48:13 +00:00
|
|
|
use tree_sitter_tags::{TagsConfiguration, TagsContext};
|
2023-06-12 15:19:43 +00:00
|
|
|
|
|
|
|
|
trait RepositoryExt {
|
|
|
|
|
fn create_dataset(&self, writer: &mut impl Write) -> Result<()>;
|
|
|
|
|
}
|
|
|
|
|
|
2023-10-25 22:05:23 +00:00
|
|
|
impl RepositoryExt for RepositoryConfig {
|
2023-06-12 15:19:43 +00:00
|
|
|
fn create_dataset(&self, writer: &mut impl Write) -> Result<()> {
|
|
|
|
|
let dir = self.dir();
|
|
|
|
|
|
|
|
|
|
info!("Start indexing repository {}", self.git_url);
|
2023-10-24 01:29:38 +00:00
|
|
|
let walk_dir = Walk::new(dir.as_path())
|
2023-06-12 15:19:43 +00:00
|
|
|
.filter_map(Result::ok)
|
|
|
|
|
.filter(is_source_code);
|
|
|
|
|
|
2023-06-22 02:48:13 +00:00
|
|
|
let mut context = TagsContext::new();
|
2023-06-12 15:19:43 +00:00
|
|
|
for entry in walk_dir {
|
|
|
|
|
let relative_path = entry.path().strip_prefix(dir.as_path()).unwrap();
|
2023-06-22 02:48:13 +00:00
|
|
|
let language = get_language(relative_path.extension().unwrap())
|
|
|
|
|
.unwrap()
|
|
|
|
|
.to_owned();
|
2023-06-12 15:19:43 +00:00
|
|
|
if let Ok(file_content) = read_to_string(entry.path()) {
|
|
|
|
|
info!("Building {:?}", relative_path);
|
2023-10-04 23:27:19 +00:00
|
|
|
let source_file = SourceFile {
|
2023-06-12 15:19:43 +00:00
|
|
|
git_url: self.git_url.clone(),
|
|
|
|
|
filepath: relative_path.display().to_string(),
|
2023-06-12 19:39:43 +00:00
|
|
|
max_line_length: metrics::max_line_length(&file_content),
|
|
|
|
|
avg_line_length: metrics::avg_line_length(&file_content),
|
|
|
|
|
alphanum_fraction: metrics::alphanum_fraction(&file_content),
|
2023-06-22 02:48:13 +00:00
|
|
|
tags: tags::collect(&mut context, &language, &file_content),
|
|
|
|
|
language,
|
2023-06-12 19:39:43 +00:00
|
|
|
content: file_content,
|
2023-06-22 02:48:13 +00:00
|
|
|
};
|
2023-10-04 23:27:19 +00:00
|
|
|
writer.write_json_lines([source_file])?;
|
2023-06-12 15:19:43 +00:00
|
|
|
} else {
|
|
|
|
|
error!("Cannot read {:?}", relative_path);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn get_language(ext: &OsStr) -> Option<&str> {
|
|
|
|
|
let ext = ext.to_str().unwrap_or("");
|
|
|
|
|
EXTENSION_LANGUAGE.get(ext).copied()
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
fn is_source_code(entry: &DirEntry) -> bool {
|
2023-10-24 01:29:38 +00:00
|
|
|
if entry.file_type().is_some_and(|x| x.is_file()) {
|
2023-06-12 15:19:43 +00:00
|
|
|
entry.path().extension().and_then(get_language).is_some()
|
|
|
|
|
} else {
|
|
|
|
|
false
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
pub fn create_dataset(config: &Config) -> Result<()> {
|
|
|
|
|
fs::remove_dir_all(dataset_dir()).ok();
|
|
|
|
|
fs::create_dir_all(dataset_dir())?;
|
|
|
|
|
let mut writer = FileRotate::new(
|
|
|
|
|
dataset_dir().join("data.jsonl"),
|
|
|
|
|
AppendCount::new(usize::max_value()),
|
|
|
|
|
ContentLimit::Lines(1000),
|
|
|
|
|
Compression::None,
|
|
|
|
|
#[cfg(unix)]
|
|
|
|
|
None,
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
for repository in config.repositories.as_slice() {
|
|
|
|
|
repository.create_dataset(&mut writer)?;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
writer.flush()?;
|
|
|
|
|
Ok(())
|
|
|
|
|
}
|
2023-06-12 19:39:43 +00:00
|
|
|
|
|
|
|
|
/// Simple text statistics computed over a file's content.
mod metrics {
    /// Returns the length, in bytes, of the longest line in `content`.
    ///
    /// Yields `0` when `content` has no lines.
    pub fn max_line_length(content: &str) -> usize {
        content.lines().map(str::len).max().unwrap_or(0)
    }

    /// Returns the mean line length, in bytes, over all lines of `content`.
    ///
    /// Yields `0.0` when `content` has no lines.
    pub fn avg_line_length(content: &str) -> f32 {
        let (line_count, total_bytes) = content
            .lines()
            .fold((0usize, 0usize), |(count, total), line| {
                (count + 1, total + line.len())
            });

        if line_count == 0 {
            0.0
        } else {
            total_bytes as f32 / line_count as f32
        }
    }

    /// Returns the fraction of characters in `content` that are alphanumeric.
    ///
    /// Both the numerator and the denominator are counted in `char`s, so
    /// multi-byte UTF-8 characters are weighted the same as ASCII ones.
    /// Counting is done with integers and converted once at the end, which
    /// avoids the precision loss of accumulating many `1.0`s in an `f32`.
    ///
    /// Yields `0.0` for empty content.
    pub fn alphanum_fraction(content: &str) -> f32 {
        let (total_chars, alphanumeric_chars) = content
            .chars()
            .fold((0usize, 0usize), |(total, alnum), ch| {
                (total + 1, alnum + usize::from(ch.is_alphanumeric()))
            });

        if total_chars == 0 {
            0.0
        } else {
            alphanumeric_chars as f32 / total_chars as f32
        }
    }
}
|
|
|
|
|
|
|
|
|
|
mod tags {
|
|
|
|
|
use tabby_common::Tag;
|
|
|
|
|
use tree_sitter_tags::TagsContext;
|
|
|
|
|
|
|
|
|
|
use super::LANGUAGE_TAGS;
|
|
|
|
|
|
|
|
|
|
pub fn collect(context: &mut TagsContext, language: &str, content: &str) -> Vec<Tag> {
|
|
|
|
|
let config = LANGUAGE_TAGS.get(language);
|
|
|
|
|
let empty = Vec::new();
|
|
|
|
|
|
|
|
|
|
let Some(config) = config else {
|
2023-07-09 05:18:19 +00:00
|
|
|
return empty;
|
|
|
|
|
};
|
2023-06-22 02:48:13 +00:00
|
|
|
|
2023-07-09 05:18:19 +00:00
|
|
|
let Ok((tags, has_error)) = context.generate_tags(&config.0, content.as_bytes(), None)
|
|
|
|
|
else {
|
|
|
|
|
return empty;
|
|
|
|
|
};
|
2023-06-22 02:48:13 +00:00
|
|
|
|
|
|
|
|
if has_error {
|
|
|
|
|
return empty;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
tags.filter_map(|x| x.ok())
|
|
|
|
|
.map(|x| Tag {
|
|
|
|
|
range: x.range,
|
|
|
|
|
name_range: x.name_range,
|
|
|
|
|
line_range: x.line_range,
|
|
|
|
|
docs: x.docs,
|
|
|
|
|
is_definition: x.is_definition,
|
|
|
|
|
syntax_type_name: config.0.syntax_type_name(x.syntax_type_id).to_owned(),
|
|
|
|
|
})
|
|
|
|
|
.collect()
|
2023-06-12 19:39:43 +00:00
|
|
|
}
|
|
|
|
|
}
|
2023-06-22 02:48:13 +00:00
|
|
|
|
|
|
|
|
// Mark TagsConfiguration as thread sync / safe.
|
|
|
|
|
struct TagsConfigurationSync(TagsConfiguration);
|
|
|
|
|
unsafe impl Send for TagsConfigurationSync {}
|
|
|
|
|
unsafe impl Sync for TagsConfigurationSync {}
|
|
|
|
|
|
|
|
|
|
lazy_static! {
    /// Language name -> file extensions recognised for that language.
    ///
    /// NOTE(review): `is_source_code` matches these against `Path::extension`,
    /// which is `None` for files named exactly `Dockerfile` or `Makefile`, so
    /// those two entries only match names such as `foo.Dockerfile` — confirm
    /// whether that is intended.
    static ref LANGUAGE_EXTENSION: HashMap<&'static str, Vec<&'static str>> = {
        HashMap::from([
            ("c", vec!["c", "h"]),
            ("csharp", vec!["cs"]),
            (
                "cpp",
                vec!["cpp", "hpp", "c++", "h++", "cc", "hh", "C", "H"],
            ),
            ("css", vec!["css"]),
            ("dockerfile", vec!["Dockerfile"]),
            ("go", vec!["go"]),
            ("haskell", vec!["hs"]),
            ("html", vec!["html"]),
            ("java", vec!["java"]),
            ("julia", vec!["jl"]),
            ("lua", vec!["lua"]),
            ("makefile", vec!["Makefile"]),
            ("markdown", vec!["md", "markdown"]),
            ("php", vec!["php", "php3", "php4", "php5", "phps", "phpt"]),
            ("perl", vec!["pl", "pm", "pod", "perl"]),
            ("powershell", vec!["ps1", "psd1", "psm1"]),
            ("python", vec!["py"]),
            ("ruby", vec!["rb"]),
            ("rust", vec!["rs"]),
            ("sql", vec!["sql"]),
            ("scala", vec!["scala"]),
            ("shellscript", vec!["sh", "bash", "command", "zsh"]),
            (
                "javascript-typescript",
                vec!["ts", "mts", "js", "mjs", "jsx", "tsx"],
            ),
            ("tex", vec!["tex"]),
            ("vb", vec!["vb"]),
        ])
    };

    /// File extension -> language name; the inverse of `LANGUAGE_EXTENSION`.
    static ref EXTENSION_LANGUAGE: HashMap<&'static str, &'static str> = {
        LANGUAGE_EXTENSION
            .iter()
            .flat_map(|(language, extensions)| {
                extensions.iter().map(move |extension| (*extension, *language))
            })
            .collect()
    };

    /// Language name -> tree-sitter tagging configuration.
    ///
    /// Languages without an entry here produce no tags. Building a
    /// configuration panics (via `unwrap`) if `TagsConfiguration::new`
    /// returns an error.
    static ref LANGUAGE_TAGS: HashMap<&'static str, TagsConfigurationSync> = {
        // Wraps a grammar and its tagging query; the locals query is always empty.
        let build = |grammar, tagging_query: &str| {
            TagsConfigurationSync(TagsConfiguration::new(grammar, tagging_query, "").unwrap())
        };

        HashMap::from([
            (
                "python",
                build(
                    tree_sitter_python::language(),
                    tree_sitter_python::TAGGING_QUERY,
                ),
            ),
            (
                "rust",
                build(
                    tree_sitter_rust::language(),
                    tree_sitter_rust::TAGGING_QUERY,
                ),
            ),
            (
                "javascript-typescript",
                build(
                    tree_sitter_typescript::language_tsx(),
                    include_str!("../queries/tsx.scm"),
                ),
            ),
            (
                "go",
                build(
                    tree_sitter_go::language(),
                    include_str!("../queries/go.scm"),
                ),
            ),
            (
                "ruby",
                build(
                    tree_sitter_ruby::language(),
                    tree_sitter_ruby::TAGGING_QUERY,
                ),
            ),
        ])
    };
}
|