Add project preprocessing
parent
c2c1785389
commit
cf27c1a504
|
|
@ -0,0 +1 @@
|
|||
__pycache__
|
||||
|
|
@ -0,0 +1,5 @@
|
|||
## 🐾 Tabby
|
||||
[](https://github.com/psf/black)
|
||||
|
||||
> **Warning**
|
||||
> This repository is under heavy construction; everything changes fast.
|
||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,14 @@
|
|||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
|
||||
|
||||
@dataclass
class PreprocessProjectArgs:
    """Command-line arguments for the project preprocessing script.

    Parsed via ``transformers.HfArgumentParser``; each ``field`` below
    becomes one CLI option whose help text comes from ``metadata["help"]``.
    """

    # No default is supplied, so the parser treats this as a required option.
    project_dir: Optional[str] = field(
        metadata={"help": "Project directory."},
    )

    # Also required; destination passed to Dataset.save_to_disk downstream.
    output_dir: Optional[str] = field(
        metadata={"help": "Output save path directory."},
    )
|
||||
|
|
@ -0,0 +1,21 @@
|
|||
def max_line_length(content):
    """Return the length (in characters) of the longest line in *content*.

    Returns 0 for empty content; the original raised ``ValueError`` from
    ``max()`` over an empty sequence when *content* had no lines.
    """
    # Generator + default avoids materializing a list and handles "" safely.
    return max((len(line) for line in content.splitlines()), default=0)
|
||||
|
||||
|
||||
def avg_line_length(content):
    """Return the mean line length (in characters) of *content*.

    Returns 0.0 for empty content; the original raised
    ``ZeroDivisionError`` when ``splitlines()`` produced no lines.
    """
    lengths = [len(line) for line in content.splitlines()]
    if not lengths:
        return 0.0
    return sum(lengths) / len(lengths)
|
||||
|
||||
|
||||
def alphanum_fraction(content):
    """Return the fraction of characters in *content* that are alphabetic
    or numeric.

    Returns 0.0 for empty content; the original raised
    ``ZeroDivisionError`` on ``""``.
    """
    if not content:
        return 0.0
    # Count without materializing an intermediate list.
    # Kept as isalpha()/isnumeric() (not isalnum()) to match the original
    # character classification exactly.
    alphanumeric = sum(1 for ch in content if ch.isalpha() or ch.isnumeric())
    return alphanumeric / len(content)
|
||||
|
||||
|
||||
def compute(content):
    """Compute all content metrics for *content* and return them as a dict
    keyed by metric name (max_line_length, avg_line_length,
    alphanum_fraction).
    """
    metric_fns = {
        "max_line_length": max_line_length,
        "avg_line_length": avg_line_length,
        "alphanum_fraction": alphanum_fraction,
    }
    return {name: fn(content) for name, fn in metric_fns.items()}
|
||||
|
|
@ -0,0 +1,78 @@
|
|||
import os
|
||||
import glob
|
||||
import json
|
||||
|
||||
from datasets import Dataset
|
||||
from transformers import HfArgumentParser
|
||||
|
||||
import metrics
|
||||
from args import PreprocessProjectArgs
|
||||
|
||||
|
||||
def parse_args():
    """Parse command-line options declared by PreprocessProjectArgs."""
    return HfArgumentParser(PreprocessProjectArgs).parse_args()
|
||||
|
||||
|
||||
def read_languages_to_file_extensions():
    """Load the bundled language -> file-extensions mapping.

    The JSON file is expected to live alongside this module.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    mapping_path = os.path.join(
        module_dir, "programming-languages-to-file-extensions.json"
    )
    with open(mapping_path) as fp:
        return json.load(fp)
|
||||
|
||||
|
||||
def read_valid_extensions():
    """Return the set of every file extension recognized by any language.

    The original iterated ``.items()`` while discarding the keys and grew a
    list via repeated concatenation; a set comprehension over ``.values()``
    does the same work in one pass.
    """
    content = read_languages_to_file_extensions()
    return {ext for extensions in content.values() for ext in extensions}
|
||||
|
||||
|
||||
def read_extension_to_language_mappings():
    """Invert the language -> extensions map into extension -> language.

    When two languages claim the same extension, the language appearing
    later in the JSON wins — identical last-write-wins behavior to the
    original explicit loop.
    """
    content = read_languages_to_file_extensions()
    return {
        extension: language
        for language, extensions in content.items()
        for extension in extensions
    }
|
||||
|
||||
|
||||
def dataset_iter(files):
    """Return a generator factory producing one record per file in *files*.

    Each yielded record contains the language inferred from the file
    extension, the raw file content, and the computed content metrics.
    NOTE(review): files are read with the platform default encoding and a
    missing extension raises KeyError — callers are expected to pre-filter
    with the valid-extension set.
    """

    def gen():
        ext_to_language = read_extension_to_language_mappings()
        for path in files:
            extname = os.path.splitext(path)[1]

            with open(path) as fp:
                text = fp.read()

            record = dict(language=ext_to_language[extname], content=text)
            record.update(metrics.compute(text))
            yield record

    return gen
|
||||
|
||||
|
||||
if __name__ == "__main__":
    valid_extensions = read_valid_extensions()

    def is_valid_file(path):
        """Keep only regular files whose extension is in the known set."""
        # NOTE(review): extensionless entries in the mapping JSON (e.g.
        # "Makefile", "Dockerfile") can never match here, since splitext()
        # yields "" for such files — confirm whether basename matching is
        # also intended.
        if not os.path.isfile(path):
            return False
        extension = os.path.splitext(path)[1]
        return extension in valid_extensions

    args = parse_args()
    candidates = glob.glob(args.project_dir + "/**/*", recursive=True)
    files = [path for path in candidates if is_valid_file(path)]

    ds = Dataset.from_generator(dataset_iter(files))
    ds.save_to_disk(args.output_dir)
|
||||
|
|
@ -0,0 +1,32 @@
|
|||
{
|
||||
"Assembly": [".asm"],
|
||||
"Batchfile": [".bat", ".cmd"],
|
||||
"C": [".c", ".h"],
|
||||
"C#": [".cs"],
|
||||
"C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
|
||||
"CMake": [".cmake"],
|
||||
"CSS": [".css"],
|
||||
"Dockerfile": [".dockerfile", "Dockerfile"],
|
||||
"FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
|
||||
"Go": [".go"],
|
||||
"Haskell": [".hs"],
|
||||
"HTML": [".html"],
|
||||
"Java": [".java"],
|
||||
"JavaScript": [".js"],
|
||||
"Julia": [".jl"],
|
||||
"Lua": [".lua"],
|
||||
"Makefile": ["Makefile"],
|
||||
"Markdown": [".md", ".markdown"],
|
||||
"PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
|
||||
"Perl": [".pl", ".pm", ".pod", ".perl"],
|
||||
"PowerShell": [".ps1", ".psd1", ".psm1"],
|
||||
"Python": [".py"],
|
||||
"Ruby": [".rb"],
|
||||
"Rust": [".rs"],
|
||||
"SQL": [".sql"],
|
||||
"Scala": [".scala"],
|
||||
"Shell": [".sh", ".bash", ".command", ".zsh"],
|
||||
"TypeScript": [".ts", ".tsx"],
|
||||
"TeX": [".tex"],
|
||||
"Visual Basic": [".vb"]
|
||||
}
|
||||
|
|
@ -8,8 +8,13 @@ readme = "README.md"
|
|||
|
||||
[tool.poetry.dependencies]
|
||||
python = "^3.10"
|
||||
datasets = "^2.10.1"
|
||||
transformers = "^4.27.1"
|
||||
|
||||
|
||||
[tool.poetry.group.dev.dependencies]
|
||||
black = "^23.1.0"
|
||||
|
||||
[build-system]
|
||||
requires = ["poetry-core"]
|
||||
build-backend = "poetry.core.masonry.api"
|
||||
|
|
|
|||
Loading…
Reference in New Issue