90 lines
2.0 KiB
Python
90 lines
2.0 KiB
Python
from typing import Iterator
|
|
|
|
import glob
|
|
import json
|
|
from dataclasses import dataclass
|
|
from transformers import HfArgumentParser
|
|
|
|
|
|
@dataclass
class Item:
    """One tagged code definition together with its surrounding context.

    Instances are built by ``iter_items``: the first three fields are copied
    from the source document, the remaining four are slices of the
    document's ``content`` derived from a tag's ranges.
    """

    # Copied verbatim from the source document.
    git_url: str
    filepath: str
    language: str

    # Text covered by the tag's ``name_range`` and ``range`` respectively.
    name: str
    body: str
    # Text immediately before ``range["start"]`` / after ``range["end"]``.
    prefix: str
    suffix: str
|
|
|
|
|
|
def iter_items(doc) -> Iterator[Item]:
    """Yield an ``Item`` for every tag in ``doc``, unless the document is filtered out.

    A document produces nothing when any of these hold:

    * ``max_line_length`` is greater than 500,
    * ``avg_line_length`` is below 10 or above 200,
    * ``alphanum_fraction`` is below 0.25.

    Otherwise each entry of ``doc["tags"]`` becomes one ``Item`` whose
    ``name``/``body`` are the slices of ``doc["content"]`` selected by the
    tag's ``name_range``/``range``, and whose ``prefix``/``suffix`` are the
    context returned by ``get_prefix``/``get_suffix`` around ``range``.
    """
    # NOTE(review): the ranges are used as ``str`` indices; if the producer
    # emits byte offsets they would drift on non-ASCII content -- confirm.
    if (
        doc["max_line_length"] > 500
        or doc["avg_line_length"] < 10
        or doc["avg_line_length"] > 200
        or doc["alphanum_fraction"] < 0.25
    ):
        return

    for tag in doc["tags"]:
        source = doc["content"]
        yield Item(
            name=get_content(source, tag["name_range"]),
            body=get_content(source, tag["range"]),
            prefix=get_prefix(source, tag["range"]["start"]),
            suffix=get_suffix(source, tag["range"]["end"]),
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )
|
|
|
|
|
|
def iter_docs(filepattern: str):
    """Yield the parsed JSON value of each line in every file matching ``filepattern``.

    Files are located with ``glob.glob`` and read lazily, one line at a
    time. Whitespace-only lines are skipped.

    Args:
        filepattern: Glob pattern selecting the line-delimited JSON files.

    Yields:
        The object decoded by ``json.loads`` for each non-blank line.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    for filepath in glob.glob(filepattern):
        # Decode explicitly as UTF-8 so the result does not depend on the
        # platform's default locale encoding.
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                # A stray blank line (e.g. a trailing newline at the end of
                # the file) must not abort the whole stream.
                if not line.strip():
                    continue
                yield json.loads(line)
|
|
|
|
|
|
def get_content(content: str, range: dict):
    """Return the slice of ``content`` from ``range["start"]`` up to ``range["end"]``.

    The end offset is exclusive, as in ordinary Python slicing.
    """
    first, last = range["start"], range["end"]
    return content[first:last]
|
|
|
|
|
|
def get_prefix(content: str, start: int, max: int = 20) -> str:
    """Return the text of up to ``max`` lines that ends right before ``start``.

    The result always ends at offset ``start``. It begins just after the
    ``max``-th newline found when scanning backwards from ``start``; that
    newline itself is not included. If fewer than ``max`` newlines precede
    ``start``, everything from the beginning of ``content`` is returned.

    Args:
        content: Full text to take the context from.
        start: Offset in ``content`` at which the prefix ends (exclusive).
        max: Maximum number of newlines to walk back over.

    Returns:
        The preceding context, or ``""`` when ``start`` or ``max`` is not
        positive.
    """
    if max <= 0 or start <= 0:
        return ""

    boundary = start
    for _ in range(max):
        # Search only in content[0:boundary], i.e. strictly before the
        # previously found newline (or before ``start`` on the first pass).
        boundary = content.rfind("\n", 0, boundary)
        if boundary == -1:
            # Ran out of newlines: the prefix reaches the start of the text.
            return content[:start]

    return content[boundary + 1 : start]
|
|
|
|
|
|
def get_suffix(content: str, end: int, max: int = 20) -> str:
    """Return the text of up to ``max`` lines that starts at ``end``.

    The result always begins at offset ``end``. It stops just before the
    ``max``-th newline found when scanning forwards from ``end``; that
    newline itself is not included, mirroring ``get_prefix``. If fewer than
    ``max`` newlines follow ``end``, everything up to the end of ``content``
    is returned.

    Args:
        content: Full text to take the context from.
        end: Offset in ``content`` at which the suffix starts; assumed to be
            non-negative.
        max: Maximum number of newlines to walk forward over.

    Returns:
        The following context, or ``""`` when ``max`` is not positive or
        ``end`` is at/after the end of ``content``.
    """
    if max <= 0:
        return ""

    boundary = end - 1
    for _ in range(max):
        # Resume one past the previous newline (or at ``end`` on the first pass).
        boundary = content.find("\n", boundary + 1)
        if boundary == -1:
            # Ran out of newlines: the suffix reaches the end of the text.
            return content[end:]

    return content[end:boundary]
|
|
|
|
|
|
def items_from_filepattern(filepattern: str):
    """Yield every item that ``iter_items`` produces for each document from ``iter_docs(filepattern)``.

    Documents are consumed lazily, one at a time, in the order ``iter_docs``
    yields them.
    """
    documents = iter_docs(filepattern)
    for document in documents:
        yield from iter_items(document)
|