Add project preprocessing

add-more-languages
Meng Zhang 2023-03-16 17:26:43 +08:00
parent c2c1785389
commit cf27c1a504
9 changed files with 1623 additions and 2 deletions

1
.gitignore vendored Normal file
View File

@ -0,0 +1 @@
__pycache__

2
Makefile Normal file
View File

@ -0,0 +1,2 @@
# Run the black formatter, through poetry, on the Python files matched by **/*.py.
format:
poetry run python -m black **/*.py

5
README.md Normal file
View File

@ -0,0 +1,5 @@
## 🐾 Tabby
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
> **Warning**
> This repository is under heavy construction; everything changes fast.

1467
poetry.lock generated

File diff suppressed because it is too large Load Diff

14
preprocess/args.py Normal file
View File

@ -0,0 +1,14 @@
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class PreprocessProjectArgs:
    """Arguments accepted by the project preprocessing script.

    Neither field declares a default value, so both must be supplied when
    the dataclass is instantiated. The ``help`` entry in each field's
    metadata carries the human-readable description of the argument.
    """

    # add arguments in the following format
    project_dir: Optional[str] = field(metadata=dict(help="Project directory."))
    output_dir: Optional[str] = field(
        metadata=dict(help="Output save path directory.")
    )

21
preprocess/metrics.py Normal file
View File

@ -0,0 +1,21 @@
def max_line_length(content):
    """Return the length of the longest line in ``content``.

    Lines are obtained with ``str.splitlines``, so line terminators are not
    counted. Content without any line (for example an empty file) yields 0
    instead of raising ``ValueError`` from ``max`` on an empty sequence.
    """
    return max((len(line) for line in content.splitlines()), default=0)
def avg_line_length(content):
    """Return the mean line length of ``content``.

    Lines are obtained with ``str.splitlines``, so line terminators are not
    counted. Content without any line (for example an empty file) yields 0.0
    instead of raising ``ZeroDivisionError``.
    """
    lengths = [len(line) for line in content.splitlines()]
    if not lengths:
        # No lines at all: there is nothing to average.
        return 0.0
    return sum(lengths) / len(lengths)
def alphanum_fraction(content):
    """Return the fraction of characters in ``content`` that are alphanumeric.

    A character counts when ``str.isalpha`` or ``str.isnumeric`` is true for
    it. Empty content yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not content:
        # Avoid dividing by a zero length.
        return 0.0
    alphanum_count = sum(1 for ch in content if ch.isalpha() or ch.isnumeric())
    return alphanum_count / len(content)
def compute(content):
    """Collect every metric for ``content`` into a single dictionary.

    The keys are ``max_line_length``, ``avg_line_length`` and
    ``alphanum_fraction``; each value is the result of the function of the
    same name applied to ``content``.
    """
    return {
        "max_line_length": max_line_length(content),
        "avg_line_length": avg_line_length(content),
        "alphanum_fraction": alphanum_fraction(content),
    }

View File

@ -0,0 +1,78 @@
import os
import glob
import json
from datasets import Dataset
from transformers import HfArgumentParser
import metrics
from args import PreprocessProjectArgs
def parse_args():
    """Parse the command line using the fields of ``PreprocessProjectArgs``.

    Builds an ``HfArgumentParser`` from the dataclass and returns the result
    of its ``parse_args`` call.
    """
    return HfArgumentParser(PreprocessProjectArgs).parse_args()
def read_languages_to_file_extensions():
    """Load the language-to-extensions table stored next to this module.

    The file ``programming-languages-to-file-extensions.json`` is looked up
    in the directory that contains this source file, and its parsed JSON
    content is returned.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    json_path = os.path.join(
        module_dir, "programming-languages-to-file-extensions.json"
    )
    with open(json_path) as handle:
        return json.load(handle)
def read_valid_extensions():
    """Return the set of every extension listed for any language."""
    table = read_languages_to_file_extensions()
    return {ext for exts in table.values() for ext in exts}
def read_extension_to_language_mappings():
    """Return a dict mapping each listed extension to its language name.

    The language table is walked in order, so when an extension is listed
    under several languages the last one encountered wins.
    """
    table = read_languages_to_file_extensions()
    return {ext: language for language, exts in table.items() for ext in exts}
def dataset_iter(files):
    """Build a zero-argument generator function over ``files``.

    Args:
        files: Iterable of file paths to read.

    Returns:
        A function that, when called, yields one dict per file with the keys
        ``language`` (looked up from the file's extension), ``content`` (the
        file's text) and the metrics returned by ``metrics.compute``.

    Raises (while iterating the generator):
        KeyError: if a file's extension is missing from the extension table.
        UnicodeDecodeError: if a file is not valid UTF-8.
    """

    def gen():
        mappings = read_extension_to_language_mappings()
        for path in files:
            _, extname = os.path.splitext(path)
            # Decode explicitly as UTF-8 so the text read does not depend on
            # the platform's locale encoding.
            with open(path, encoding="utf-8") as f:
                content = f.read()
            yield dict(
                language=mappings[extname],
                content=content,
                **metrics.compute(content),
            )

    return gen
if __name__ == "__main__":
    valid_extensions = read_valid_extensions()

    def is_valid_file(x):
        """Tell whether ``x`` is a regular file with a known extension.

        NOTE(review): extension-less names listed in the table, such as
        "Makefile" or "Dockerfile", yield an empty extension from
        ``os.path.splitext`` and therefore never match here.
        """
        _, extname = os.path.splitext(x)
        return os.path.isfile(x) and extname in valid_extensions

    args = parse_args()
    # Recursively enumerate the project tree and keep only the valid files.
    files = [
        path
        for path in glob.glob(args.project_dir + "/**/*", recursive=True)
        if is_valid_file(path)
    ]
    ds = Dataset.from_generator(dataset_iter(files))
    ds.save_to_disk(args.output_dir)

View File

@ -0,0 +1,32 @@
{
"Assembly": [".asm"],
"Batchfile": [".bat", ".cmd"],
"C": [".c", ".h"],
"C#": [".cs"],
"C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
"CMake": [".cmake"],
"CSS": [".css"],
"Dockerfile": [".dockerfile", "Dockerfile"],
"FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
"Go": [".go"],
"Haskell": [".hs"],
"HTML": [".html"],
"Java": [".java"],
"JavaScript": [".js"],
"Julia": [".jl"],
"Lua": [".lua"],
"Makefile": ["Makefile"],
"Markdown": [".md", ".markdown"],
"PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
"Perl": [".pl", ".pm", ".pod", ".perl"],
"PowerShell": [".ps1", ".psd1", ".psm1"],
"Python": [".py"],
"Ruby": [".rb"],
"Rust": [".rs"],
"SQL": [".sql"],
"Scala": [".scala"],
"Shell": [".sh", ".bash", ".command", ".zsh"],
"TypeScript": [".ts", ".tsx"],
"TeX": [".tex"],
"Visual Basic": [".vb"]
}

View File

@ -8,8 +8,13 @@ readme = "README.md"
[tool.poetry.dependencies]
python = "^3.10"
datasets = "^2.10.1"
transformers = "^4.27.1"
[tool.poetry.group.dev.dependencies]
black = "^23.1.0"
[build-system]
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"