diff --git a/tabby/tasks/update_dataset.yaml b/tabby/tasks/update_dataset.yaml index 90ef220..ba0ccd5 100644 --- a/tabby/tasks/update_dataset.yaml +++ b/tabby/tasks/update_dataset.yaml @@ -7,12 +7,12 @@ env: - GIT_REPOSITORIES_DIR: "$GIT_REPOSITORIES_DIR" - DATASET_DIR: "$DATASET_DIR" steps: - - name: Update repositories + - name: update repositories dir: $APP_DIR command: python -m tabby.tools.repository.updater --data_dir=$GIT_REPOSITORIES_DIR --config_file=$CONFIG_FILE - - name: Generate dataset + - name: generate dataset dir: $APP_DIR - command: python -m tabby.tools.preprocess.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR + command: python -m tabby.tools.build_dataset --project_dir=$GIT_REPOSITORIES_DIR --output_dir=$DATASET_DIR depends: - - Update repositories + - update repositories diff --git a/tabby/tools/preprocess/build_dataset.py b/tabby/tools/build_dataset/__main__.py similarity index 83% rename from tabby/tools/preprocess/build_dataset.py rename to tabby/tools/build_dataset/__main__.py index e9a52a2..36645c3 100644 --- a/tabby/tools/preprocess/build_dataset.py +++ b/tabby/tools/build_dataset/__main__.py @@ -2,10 +2,11 @@ import glob import json import os +import pandas as pd from datasets import Dataset from transformers import HfArgumentParser -from . import filters, metrics +from . import metrics from .args import PreprocessProjectArgs @@ -61,6 +62,17 @@ def dataset_iter(files): return gen +def count_by_language(dataset): + key = "language" + df = ( + pd.DataFrame(dataset[key], columns=[key]) + .groupby([key]) + .size() + .to_frame("count") + ) + return df + + if __name__ == "__main__": valid_extensions = read_valid_extensions() @@ -80,5 +92,8 @@ if __name__ == "__main__": ) ds = Dataset.from_generator(dataset_iter(files)) - ds = ds.filter(filters.basic_filters(args)) ds.save_to_disk(args.output_dir) + + print("\n## Summary") + print("Number of source files", len(ds)) + print("Number of source files by languages", count_by_language(ds).to_json()) diff --git a/tabby/tools/build_dataset/args.py b/tabby/tools/build_dataset/args.py new file mode 100644 index 0000000..99c4af3 --- /dev/null +++ b/tabby/tools/build_dataset/args.py @@ -0,0 +1,14 @@ +from dataclasses import dataclass, field +from typing import Optional + + +@dataclass +class PreprocessProjectArgs: + # add arguments in the following format + project_dir: Optional[str] = field( + metadata={"help": "Project directory."}, + ) + + output_dir: Optional[str] = field( + metadata={"help": "Output save path directory."}, + ) diff --git a/tabby/tools/preprocess/metrics.py b/tabby/tools/build_dataset/metrics.py similarity index 100% rename from tabby/tools/preprocess/metrics.py rename to tabby/tools/build_dataset/metrics.py diff --git a/tabby/tools/preprocess/programming-languages-to-file-extensions.json b/tabby/tools/build_dataset/programming-languages-to-file-extensions.json similarity index 100% rename from tabby/tools/preprocess/programming-languages-to-file-extensions.json rename to tabby/tools/build_dataset/programming-languages-to-file-extensions.json diff --git a/tabby/tools/preprocess/args.py b/tabby/tools/preprocess/args.py deleted file mode 100644 index f55b113..0000000 --- a/tabby/tools/preprocess/args.py +++ /dev/null @@ -1,30 +0,0 @@ -from dataclasses import dataclass, field -from typing import Optional - - -@dataclass(kw_only=True) -class FilterArgs: - line_max: Optional[int] = field( - default=1000, - metadata={"help": "Max line length allowed"}, - ) - line_mean: Optional[int] = field( - default=100, - metadata={"help": "Mean line length allowed"}, - ) - alpha_frac: Optional[float] = field( - default=0.25, - metadata={"help": "Minimum fraction of alphanumeric characters allowed."}, - ) - - -@dataclass -class PreprocessProjectArgs(FilterArgs): - # add arguments in the following format - project_dir: Optional[str] = field( - metadata={"help": "Project directory."}, - ) - - output_dir: Optional[str] = field( - metadata={"help": "Output save path directory."}, - ) diff --git a/tabby/tools/preprocess/filters.py b/tabby/tools/preprocess/filters.py deleted file mode 100644 index 11c7fe3..0000000 --- a/tabby/tools/preprocess/filters.py +++ /dev/null @@ -1,15 +0,0 @@ -from .args import FilterArgs - - -def basic_filters(args: FilterArgs): - def fn(example): - """Filter files based on line length and % alphanumeric characters""" - if example["max_line_length"] > args.line_max: - return False - elif example["avg_line_length"] > args.line_mean: - return False - elif example["alphanum_fraction"] < args.alpha_frac: - return False - return True - - return fn