/// Text statistics computed over a file's content.
///
/// Line lengths are measured in UTF-8 bytes (`str::len`); the alphanumeric
/// fraction is measured in Unicode scalar values (`char`s). Every function
/// returns zero for empty input instead of propagating a `NaN`.
mod metrics {
    /// Returns the byte length of the longest line in `content`.
    ///
    /// Lines are split with [`str::lines`], so line terminators are not
    /// counted. Returns `0` when `content` has no lines.
    pub fn max_line_length(content: &str) -> usize {
        content.lines().map(str::len).max().unwrap_or(0)
    }

    /// Returns the mean byte length of the lines in `content`.
    ///
    /// Lines are split with [`str::lines`], so line terminators are not
    /// counted. Returns `0.0` when `content` has no lines.
    pub fn avg_line_length(content: &str) -> f32 {
        let (line_count, total_bytes) = content
            .lines()
            .fold((0usize, 0usize), |(count, bytes), line| {
                (count + 1, bytes + line.len())
            });

        // Guard the division: 0.0 / 0.0 would otherwise yield NaN.
        if line_count == 0 {
            return 0.0;
        }
        total_bytes as f32 / line_count as f32
    }

    /// Returns the fraction of characters in `content` for which
    /// [`char::is_alphanumeric`] is true, in the range `0.0..=1.0`.
    ///
    /// Returns `0.0` when `content` is empty.
    pub fn alphanum_fraction(content: &str) -> f32 {
        // Count both totals as integers in one pass. Numerator and denominator
        // are both in characters, so multi-byte text is not under-reported,
        // and integer counting stays exact where an f32 running sum would not.
        let (char_count, alphanum_count) = content
            .chars()
            .fold((0usize, 0usize), |(chars, alphanum), c| {
                (chars + 1, alphanum + usize::from(c.is_alphanumeric()))
            });

        // Guard the division: 0.0 / 0.0 would otherwise yield NaN.
        if char_count == 0 {
            return 0.0;
        }
        alphanum_count as f32 / char_count as f32
    }
}
Document { pub filepath: String, pub content: String, pub language: String, + pub max_line_length: usize, + pub avg_line_length: f32, + pub alphanum_fraction: f32, }