Add project preprocessing
parent
c2c1785389
commit
cf27c1a504
|
|
@ -0,0 +1 @@
|
||||||
|
__pycache__
|
||||||
|
|
@ -0,0 +1,5 @@
|
||||||
|
## 🐾 Tabby
|
||||||
|
[![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/psf/black)
|
||||||
|
|
||||||
|
> **Warning**
|
||||||
|
> This repository is under heavy construction; everything changes fast.
|
||||||
File diff suppressed because it is too large
Load Diff
|
|
@ -0,0 +1,14 @@
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class PreprocessProjectArgs:
    """Arguments accepted by the project preprocessing script.

    Neither field declares a default value, so both must be supplied when
    the dataclass is constructed. They are therefore annotated as plain
    ``str`` rather than ``Optional[str]``.
    """

    # Add further arguments using the same ``field(metadata={"help": ...})``
    # format as the entries below.

    # Directory of the project to preprocess.
    project_dir: str = field(
        metadata={"help": "Project directory."},
    )

    # Directory the preprocessed output is saved to.
    output_dir: str = field(
        metadata={"help": "Output save path directory."},
    )
|
||||||
|
|
@ -0,0 +1,21 @@
|
||||||
|
def max_line_length(content):
    """Return the length of the longest line in ``content``.

    Lines are obtained with ``str.splitlines``, so line terminators are not
    counted. Content without any lines (for example an empty string) yields
    0 instead of raising ``ValueError``.
    """
    return max((len(line) for line in content.splitlines()), default=0)
|
||||||
|
|
||||||
|
|
||||||
|
def avg_line_length(content):
    """Return the mean line length of ``content``.

    Lines are obtained with ``str.splitlines``, so line terminators are not
    counted. Content without any lines (for example an empty string) yields
    0.0 instead of raising ``ZeroDivisionError``.
    """
    lengths = [len(line) for line in content.splitlines()]
    if not lengths:
        return 0.0
    return sum(lengths) / len(lengths)
|
||||||
|
|
||||||
|
|
||||||
|
def alphanum_fraction(content):
    """Return the fraction of characters in ``content`` that are alphanumeric.

    A character counts when ``str.isalnum`` is true for it, which covers the
    same characters as ``isalpha() or isnumeric()``. Empty content yields 0.0
    instead of raising ``ZeroDivisionError``.
    """
    if not content:
        return 0.0
    # Count directly rather than materializing a list of matching characters.
    alphanum_count = sum(1 for ch in content if ch.isalnum())
    return alphanum_count / len(content)
|
||||||
|
|
||||||
|
|
||||||
|
def compute(content):
    """Compute all text metrics for ``content`` and return them as a dict.

    The result maps each metric name (``max_line_length``,
    ``avg_line_length``, ``alphanum_fraction``) to the value returned by the
    function of the same name.
    """
    return {
        "max_line_length": max_line_length(content),
        "avg_line_length": avg_line_length(content),
        "alphanum_fraction": alphanum_fraction(content),
    }
|
||||||
|
|
@ -0,0 +1,78 @@
|
||||||
|
import os
|
||||||
|
import glob
|
||||||
|
import json
|
||||||
|
|
||||||
|
from datasets import Dataset
|
||||||
|
from transformers import HfArgumentParser
|
||||||
|
|
||||||
|
import metrics
|
||||||
|
from args import PreprocessProjectArgs
|
||||||
|
|
||||||
|
|
||||||
|
def parse_args():
    """Parse the command line using the ``PreprocessProjectArgs`` fields.

    Returns whatever ``HfArgumentParser.parse_args`` produces for a parser
    built from ``PreprocessProjectArgs``.
    """
    return HfArgumentParser(PreprocessProjectArgs).parse_args()
|
||||||
|
|
||||||
|
|
||||||
|
def read_languages_to_file_extensions():
    """Load the language-to-file-extensions mapping shipped next to this module.

    Reads ``programming-languages-to-file-extensions.json`` from the directory
    containing this file and returns the decoded JSON content.
    """
    module_dir = os.path.abspath(os.path.dirname(__file__))
    path = os.path.join(module_dir, "programming-languages-to-file-extensions.json")
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's locale-default encoding.
    with open(path, encoding="utf-8") as f:
        return json.load(f)
|
||||||
|
|
||||||
|
|
||||||
|
def read_valid_extensions():
    """Return the set of every extension listed in the language mapping."""
    by_language = read_languages_to_file_extensions()
    return {ext for exts in by_language.values() for ext in exts}
|
||||||
|
|
||||||
|
|
||||||
|
def read_extension_to_language_mappings():
    """Return a dict mapping each listed extension to its language name.

    This inverts the language-to-extensions mapping. If an extension is
    listed under several languages, the one appearing last wins.
    """
    by_language = read_languages_to_file_extensions()
    return {
        ext: language
        for language, exts in by_language.items()
        for ext in exts
    }
|
||||||
|
|
||||||
|
|
||||||
|
def dataset_iter(files):
    """Build a generator function that yields one record per file in ``files``.

    Args:
        files: Iterable of paths to source files.

    Returns:
        A zero-argument generator function. Each yielded record is a dict
        with the file's ``language``, its ``content`` and the metrics from
        ``metrics.compute(content)``.

    Raises:
        KeyError: From the generator, when a file's extension (or its bare
            file name, if it has no extension) is not in the mapping.
    """

    def gen():
        mappings = read_extension_to_language_mappings()
        for path in files:
            _, extname = os.path.splitext(path)
            # Mapping entries such as "Makefile" or "Dockerfile" are keyed by
            # file name; splitext gives them an empty extension, so fall back
            # to the base name in that case.
            key = extname or os.path.basename(path)

            # Decode as UTF-8 explicitly so results do not depend on the
            # platform's locale-default encoding.
            with open(path, encoding="utf-8") as f:
                content = f.read()

            yield dict(
                language=mappings[key],
                content=content,
                **metrics.compute(content),
            )

    return gen
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
    # Script entry point: collect every recognized source file under the
    # project directory, turn the files into a dataset and save it to disk.
    valid_extensions = read_valid_extensions()

    def is_valid_file(x):
        """Return True if ``x`` is a regular file with a recognized extension."""
        if not os.path.isfile(x):
            return False

        _, extname = os.path.splitext(x)
        return extname in valid_extensions

    args = parse_args()
    # Build the recursive pattern with os.path.join rather than string
    # concatenation so separators are handled consistently.
    pattern = os.path.join(args.project_dir, "**", "*")
    files = list(filter(is_valid_file, glob.glob(pattern, recursive=True)))

    ds = Dataset.from_generator(dataset_iter(files))
    ds.save_to_disk(args.output_dir)
|
||||||
|
|
@ -0,0 +1,32 @@
|
||||||
|
{
|
||||||
|
"Assembly": [".asm"],
|
||||||
|
"Batchfile": [".bat", ".cmd"],
|
||||||
|
"C": [".c", ".h"],
|
||||||
|
"C#": [".cs"],
|
||||||
|
"C++": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
|
||||||
|
"CMake": [".cmake"],
|
||||||
|
"CSS": [".css"],
|
||||||
|
"Dockerfile": [".dockerfile", "Dockerfile"],
|
||||||
|
"FORTRAN": [".f90", ".f", ".f03", ".f08", ".f77", ".f95", ".for", ".fpp"],
|
||||||
|
"Go": [".go"],
|
||||||
|
"Haskell": [".hs"],
|
||||||
|
"HTML": [".html"],
|
||||||
|
"Java": [".java"],
|
||||||
|
"JavaScript": [".js"],
|
||||||
|
"Julia": [".jl"],
|
||||||
|
"Lua": [".lua"],
|
||||||
|
"Makefile": ["Makefile"],
|
||||||
|
"Markdown": [".md", ".markdown"],
|
||||||
|
"PHP": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
|
||||||
|
"Perl": [".pl", ".pm", ".pod", ".perl"],
|
||||||
|
"PowerShell": [".ps1", ".psd1", ".psm1"],
|
||||||
|
"Python": [".py"],
|
||||||
|
"Ruby": [".rb"],
|
||||||
|
"Rust": [".rs"],
|
||||||
|
"SQL": [".sql"],
|
||||||
|
"Scala": [".scala"],
|
||||||
|
"Shell": [".sh", ".bash", ".command", ".zsh"],
|
||||||
|
"TypeScript": [".ts", ".tsx"],
|
||||||
|
"TeX": [".tex"],
|
||||||
|
"Visual Basic": [".vb"]
|
||||||
|
}
|
||||||
|
|
@ -8,8 +8,13 @@ readme = "README.md"
|
||||||
|
|
||||||
[tool.poetry.dependencies]
|
[tool.poetry.dependencies]
|
||||||
python = "^3.10"
|
python = "^3.10"
|
||||||
|
datasets = "^2.10.1"
|
||||||
|
transformers = "^4.27.1"
|
||||||
|
|
||||||
|
|
||||||
|
[tool.poetry.group.dev.dependencies]
|
||||||
|
black = "^23.1.0"
|
||||||
|
|
||||||
[build-system]
|
[build-system]
|
||||||
requires = ["poetry-core"]
|
requires = ["poetry-core"]
|
||||||
build-backend = "poetry.core.masonry.api"
|
build-backend = "poetry.core.masonry.api"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue