feat: add metrics in dataset (#235)

improve-workflow
Meng Zhang 2023-06-12 12:39:43 -07:00 committed by GitHub
parent 2983f70d22
commit a8a145d9b3
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 34 additions and 1 deletions

View File

@ -86,10 +86,13 @@ impl RepositoryExt for Repository {
writer.write_json_lines([Document {
git_url: self.git_url.clone(),
filepath: relative_path.display().to_string(),
content: file_content,
language: get_language(relative_path.extension().unwrap())
.unwrap()
.to_owned(),
max_line_length: metrics::max_line_length(&file_content),
avg_line_length: metrics::avg_line_length(&file_content),
alphanum_fraction: metrics::alphanum_fraction(&file_content),
content: file_content,
}])?;
} else {
error!("Cannot read {:?}", relative_path);
@ -140,3 +143,30 @@ pub fn create_dataset(config: &Config) -> Result<()> {
writer.flush()?;
Ok(())
}
mod metrics {
    //! Lightweight text statistics computed over a file's content.
    //!
    //! Line lengths are measured in bytes (`str::len`) with the line
    //! terminator excluded, as produced by `str::lines`. Every function
    //! returns zero for empty input instead of a NaN from dividing by zero.

    /// Returns the byte length of the longest line in `content`.
    ///
    /// Returns `0` when `content` contains no lines.
    pub fn max_line_length(content: &str) -> usize {
        content.lines().map(str::len).max().unwrap_or(0)
    }

    /// Returns the mean byte length of the lines in `content`.
    ///
    /// Returns `0.0` when `content` contains no lines, so callers never
    /// receive the NaN that `0 / 0` would otherwise produce.
    pub fn avg_line_length(content: &str) -> f32 {
        let (total_bytes, line_count) = content
            .lines()
            .fold((0usize, 0usize), |(bytes, lines), line| {
                (bytes + line.len(), lines + 1)
            });
        if line_count == 0 {
            return 0.0;
        }
        total_bytes as f32 / line_count as f32
    }

    /// Returns the fraction of characters in `content` that are alphanumeric.
    ///
    /// Both the numerator and the denominator count `char`s, so multi-byte
    /// UTF-8 text is not under-reported. Returns `0.0` for empty content.
    pub fn alphanum_fraction(content: &str) -> f32 {
        // Count with integers: accumulating `1.0_f32` stops increasing once
        // the running sum reaches 2^24.
        let (alphanumeric, total_chars) = content
            .chars()
            .fold((0usize, 0usize), |(alnum, total), ch| {
                (alnum + usize::from(ch.is_alphanumeric()), total + 1)
            });
        if total_chars == 0 {
            return 0.0;
        }
        alphanumeric as f32 / total_chars as f32
    }
}

View File

@ -6,4 +6,7 @@ pub struct Document {
pub filepath: String,
pub content: String,
pub language: String,
pub max_line_length: usize,
pub avg_line_length: f32,
pub alphanum_fraction: f32,
}