refactor: remove unused code

improve-workflow
Meng Zhang 2023-06-12 16:14:12 -07:00
parent a8a145d9b3
commit 0f72788d82
13 changed files with 0 additions and 812 deletions

View File

@@ -1,74 +0,0 @@
#!/bin/bash
set -e
DB_FILE=${DB_FILE:-"/data/logs/duckdb/duck.db"}
LOGS_DIR=${LOGS_DIR:-"/data/logs"}
TABBY_SERVER_LOGS="${LOGS_DIR}/events/tabby-server/*.json"
# Init schema
function init_schema() {
mkdir -p "$(dirname "$DB_FILE")"
cat <<EOF | duckdb
CREATE TABLE IF NOT EXISTS completion_events (
id STRING,
created uint64,
prompt STRING,
choices STRUCT(index UINT64, text STRING)[],
view BOOLEAN,
"select" BOOLEAN
);
CREATE UNIQUE INDEX IF NOT EXISTS completion_events_id ON completion_events (id);
EOF
}
# Update table
function collect_tabby_server_logs() {
# Execute the collect job only when matching log files exist.
if compgen -G "${TABBY_SERVER_LOGS}" > /dev/null; then
cat <<EOF | duckdb
CREATE TEMP TABLE events AS
SELECT data.* FROM '${TABBY_SERVER_LOGS}';
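-- Deduplicate completion events by id and derive view/select flags from the per-completion view/select events.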
CREATE TEMP TABLE t AS
SELECT id, created, prompt, choices, IFNULL(rhs.view, false) AS view, IFNULL(rhs.select, false) AS select
FROM
(
SELECT
id,
FIRST(created) AS created,
FIRST(prompt) AS prompt,
FIRST(choices) AS choices
FROM events WHERE id IS NOT NULL GROUP BY 1) lhs
LEFT JOIN (
SELECT
completion_id,
(SUM(IF(type == 'view', 1, 0)) > 0) AS view,
(SUM(IF(type == 'select', 1, 0)) > 0) AS select
FROM events
WHERE completion_id IS NOT NULL
GROUP BY 1
) rhs ON (lhs.id = rhs.completion_id);
INSERT INTO completion_events SELECT t.* FROM t LEFT JOIN completion_events rhs ON (t.id = rhs.id) WHERE rhs.id IS NULL;
EOF
else
echo "No files match ${TABBY_SERVER_LOGS}"
fi
}
function duckdb() {
local SQL=$(tee)
cat << EOF | python3 -
import sys
import duckdb
conn = duckdb.connect('$DB_FILE')
print(conn.sql("""
$SQL
"""))
EOF
}
init_schema
"$@"

View File

@@ -1,126 +0,0 @@
import base64
import glob
import json
import os
import pandas as pd
from datasets import Dataset
from transformers import HfArgumentParser
from . import metrics
from .args import PreprocessProjectArgs
def parse_args():
parser = HfArgumentParser(PreprocessProjectArgs)
return parser.parse_args()
def read_languages_to_file_extensions():
path = os.path.abspath(os.path.dirname(__file__))
path = os.path.join(path, "programming-languages-to-file-extensions.json")
with open(path) as f:
return json.load(f)
def read_valid_extensions():
content = read_languages_to_file_extensions()
extensions = []
for exts in content.values():
extensions += exts
return set(extensions)
def read_extension_to_language_mappings():
content = read_languages_to_file_extensions()
mappings = dict()
for lang, exts in content.items():
for x in exts:
mappings[x] = lang
return mappings
def dataset_iter(project_dir, files):
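# Yield one record per source file: id, project, project-relative path, language (mapped from the file extension), content, and basic text metrics.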
def gen():
mappings = read_extension_to_language_mappings()
for x in files:
_, extname = os.path.splitext(x)
with open(x) as f:
try:
content = f.read()
except UnicodeDecodeError:
print("Cannot decode unicode", x)
continue
segments = x.removeprefix(project_dir).split(os.sep)
project = segments[1]
file = os.path.join(*segments[2:])
yield dict(
id=to_id(project, file),
project=project,
file=file,
language=mappings[extname],
content=content,
**metrics.compute(content),
)
return gen
def count_by_language(dataset):
key = "language"
df = (
pd.DataFrame(dataset[key], columns=[key])
.groupby([key])
.size()
.to_frame("count")
)
return df
def to_id(*args):
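# Derive a stable, URL-safe id by base64url-encoding "project:file" and stripping the padding.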
token = ":".join(args)
return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=")
def basic_filters(line_max=100, line_mean=100, alpha_frac=0.25):
def fn(example):
"""Filter files based on line length and % alphanumeric characters"""
if example["max_line_length"] > line_max:
return False
elif example["avg_line_length"] > line_mean:
return False
elif example["alphanum_fraction"] < alpha_frac:
return False
return True
return fn
if __name__ == "__main__":
valid_extensions = read_valid_extensions()
def is_valid_file(x):
if not os.path.isfile(x):
return False
_, extname = os.path.splitext(x)
if extname not in valid_extensions:
return False
return True
args = parse_args()
files = list(
filter(is_valid_file, glob.glob(args.project_dir + "/**/*", recursive=True))
)
ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files))
ds = ds.filter(basic_filters())
ds.save_to_disk(args.output_dir)
ds.to_json(os.path.join(args.output_dir, "dumps.json"))
print("\n## Summary")
print("Number of source files", len(ds))
print("Number of source files by languages", count_by_language(ds).to_json())

View File

@@ -1,14 +0,0 @@
from dataclasses import dataclass, field
from typing import Optional
@dataclass
class PreprocessProjectArgs:
# add arguments in the following format
project_dir: Optional[str] = field(
metadata={"help": "Project directory."},
)
output_dir: Optional[str] = field(
metadata={"help": "Output save path directory."},
)

View File

@@ -1,27 +0,0 @@
def max_line_length(content):
return max([0] + [len(x) for x in content.splitlines()])
def avg_line_length(content):
lines = [len(x) for x in content.splitlines()]
total = sum(lines)
if len(lines) != 0:
return total / len(lines)
else:
return 0
def alphanum_fraction(content):
alphanum = [x for x in content if x.isalpha() or x.isnumeric()]
if len(content) != 0:
return len(alphanum) / len(content)
else:
return 0
def compute(content):
return dict(
max_line_length=max_line_length(content),
avg_line_length=avg_line_length(content),
alphanum_fraction=alphanum_fraction(content),
)

View File

@@ -1,28 +0,0 @@
{
"c": [".c", ".h"],
"csharp": [".cs"],
"cpp": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"],
"css": [".css"],
"dockerfile": ["Dockerfile"],
"go": [".go"],
"haskell": [".hs"],
"html": [".html"],
"java": [".java"],
"javascript": [".js"],
"julia": [".jl"],
"lua": [".lua"],
"makefile": ["Makefile"],
"markdown": [".md", ".markdown"],
"php": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"],
"perl": [".pl", ".pm", ".pod", ".perl"],
"powershell": [".ps1", ".psd1", ".psm1"],
"python": [".py"],
"ruby": [".rb"],
"rust": [".rs"],
"sql": [".sql"],
"scala": [".scala"],
"shellscript": [".sh", ".bash", ".command", ".zsh"],
"typescript": [".ts", ".tsx"],
"tex": [".tex"],
"vb": [".vb"]
}

View File

@@ -1,181 +0,0 @@
import configparser
from argparse import ArgumentParser
from os import makedirs
from pathlib import Path
import numpy as np
import torch
from transformers import PretrainedConfig
torch.set_printoptions(linewidth=130, sci_mode=False)
np.set_printoptions(linewidth=130, suppress=True)
# This converter is used to convert the huggingface gpt-j-6B model
# in https://huggingface.co/EleutherAI/gpt-j-6B/blob/main/pytorch_model.bin.
def savebin(param, save_path):
if isinstance(param, torch.Tensor):
param = param.cpu().float().numpy()
np.squeeze(param).astype(np.float16).tofile(save_path + ".bin")
def param2file(pt_param, layer_id, save_dir, dest_key):
base_n = save_dir + "/model.layers." + str(layer_id) + "."
save_path = base_n + dest_key
savebin(pt_param, save_path)
def param2distributed(
pt_param,
layer_id,
save_dir,
dest_key,
n_inference_gpus,
split_axis,
):
np_param = pt_param.cpu().float().numpy()
base_n = save_dir + "/model.layers." + str(layer_id) + "."
save_path = base_n + dest_key
split_param = np.split(np_param, n_inference_gpus, axis=split_axis)
for i, p in enumerate(split_param):
savebin(p, save_path + f".{i}")
def save(w, save_dir, n_inference_gpus, n_layers, layer_id):
makedirs(save_dir, exist_ok=True)
savebin(w["transformer.wte.weight"], save_dir + "/model.wte")
l = layer_id
print(f"Saving layer {l + 1} / {n_layers}")
base_k = "transformer.h." + str(l) + "."
param2file(w[base_k + "ln_1.bias"], l, save_dir, "input_layernorm.bias")
param2file(w[base_k + "ln_1.weight"], l, save_dir, "input_layernorm.weight")
param2distributed(
w[base_k + "mlp.fc_in.weight"].T,
l,
save_dir,
"mlp.dense_h_to_4h.weight",
n_inference_gpus,
split_axis=-1, # split along the fast (last) index
)
param2distributed(
w[base_k + "mlp.fc_in.bias"],
l,
save_dir,
"mlp.dense_h_to_4h.bias",
n_inference_gpus,
split_axis=-1, # split along the fast (last) index
)
param2distributed(
w[base_k + "mlp.fc_out.weight"].T,
l,
save_dir,
"mlp.dense_4h_to_h.weight",
n_inference_gpus,
split_axis=0, # split along the slow (first) index
)
param2file(w[base_k + "mlp.fc_out.bias"], l, save_dir, "mlp.dense_4h_to_h.bias")
param2distributed(
w[base_k + "attn.out_proj.weight"].T,
l,
save_dir,
"attention.dense.weight",
n_inference_gpus,
split_axis=0, # split along the slow (first) index
)
QKV_w = torch.stack(
[
w[base_k + "attn.q_proj.weight"],
w[base_k + "attn.k_proj.weight"],
w[base_k + "attn.v_proj.weight"],
]
) # [qkv, n_heads * dim_head, latent_space]
QKV_w = QKV_w.permute(2, 0, 1)
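# After the permute the shape is [latent_space, qkv, n_heads * dim_head]; splitting on the last axis shards the heads across GPUs.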
param2distributed(
QKV_w,
l,
save_dir,
"attention.query_key_value.weight",
n_inference_gpus,
split_axis=-1, # split along the fast (last) index
)
# Other unneeded per-layer params:
# attn.attention.masked_bias = torch.tensor(-1e9)
# attn.attention.bias = torch.tril(torch.ones(1, 1, 2048, 2048))
if __name__ == "__main__":
parser = ArgumentParser(
description="Convert GPT-J slim checkpoint to FasterTransformer",
)
parser.add_argument(
"--output-dir",
help="Folder where binary files are stored",
)
parser.add_argument(
"--ckpt-dir",
help="File of GPT-J huggingface checkpoint",
)
parser.add_argument(
"--n-inference-gpus",
help="Number of GPUs used for inference runtime",
default=1,
type=int,
)
args = parser.parse_args()
ckpt_file = args.ckpt_dir + "/pytorch_model.bin"
print(f"loading from {ckpt_file}")
checkpoint = torch.load(ckpt_file)
out_path = args.output_dir
output_dir = out_path + f"/{args.n_inference_gpus}-gpu/"
print(f"saving to {output_dir}")
config_file = args.ckpt_dir + "/config.json"
hf_config = PretrainedConfig.from_json_file(config_file).to_dict()
# NOTE: save parameters to config files (loaded by triton backends)
config = configparser.ConfigParser()
config["gptj"] = {}
try:
config["gptj"]["model_name"] = (
"gptj" if hf_config["_name_or_path"] == "" else hf_config["_name_or_path"]
)
config["gptj"]["head_num"] = str(hf_config["n_head"])
n_embd = hf_config["n_embd"]
config["gptj"]["size_per_head"] = str(n_embd // hf_config["n_head"])
config["gptj"]["inter_size"] = str(n_embd * 4)
config["gptj"]["num_layer"] = str(hf_config["n_layer"])
rotary_dim = (
n_embd // hf_config["n_head"]
if hf_config["rotary_dim"] is None
else hf_config["rotary_dim"]
)
config["gptj"]["rotary_embedding"] = str(hf_config["rotary_dim"])
config["gptj"]["vocab_size"] = str(hf_config["vocab_size"])
config["gptj"]["start_id"] = str(hf_config["bos_token_id"])
config["gptj"]["end_id"] = str(hf_config["eos_token_id"])
config["gptj"]["weight_data_type"] = "fp16"
Path(output_dir).mkdir(exist_ok=True, parents=True)
with open(output_dir + "/config.ini", "w") as configfile:
config.write(configfile)
except Exception as e:
print("Failed to save the config in config.ini.", e)
n_layers = hf_config["n_layer"]
for i in range(n_layers):
save(checkpoint, output_dir, args.n_inference_gpus, n_layers, i)
savebin(
checkpoint["transformer.ln_f.weight"],
output_dir + "/model.final_layernorm.weight",
)
savebin(
checkpoint["transformer.ln_f.bias"], output_dir + "/model.final_layernorm.bias"
)
savebin(checkpoint["lm_head.weight"], output_dir + "/model.lm_head.weight")
savebin(checkpoint["lm_head.bias"], output_dir + "/model.lm_head.bias")
print("done")

View File

@@ -1,256 +0,0 @@
import argparse
import configparser
import multiprocessing
import os
from pathlib import Path
import numpy as np
import torch
from transformers import GPTNeoXForCausalLM
def get_weight_data_type(data_type):
if data_type == "fp32":
return np.float32
elif data_type == "fp16":
return np.float16
else:
assert False, f"Invalid weight data type {data_type}"
def split_and_convert_process(saved_dir, factor, key, args, config, val):
if (
key.find("input_layernorm.weight") != -1
or key.find("input_layernorm.bias") != -1
or key.find("post_attention_layernorm.weight") != -1
or key.find("post_attention_layernorm.bias") != -1
or key.find("final_layernorm.weight") != -1
or key.find("final_layernorm.bias") != -1
):
saved_path = saved_dir + f"/model.{key}.bin"
val.tofile(saved_path)
elif (
key.find("attention.dense.bias") != -1
or key.find("mlp.dense_4h_to_h.bias") != -1
):
saved_path = saved_dir + f"/model.{key}.bin"
val = (val / factor) if factor > 1 else val
val.tofile(saved_path)
else:
if (
key.find("attention.dense.weight") != -1
or key.find("mlp.dense_4h_to_h.weight") != -1
):
split_vals = np.split(val, factor, axis=0)
elif (
key.find("mlp.dense_h_to_4h.weight") != -1
or key.find("mlp.dense_h_to_4h.bias") != -1
):
split_vals = np.split(val, factor, axis=-1)
elif key.find("attention.query_key_value.bias") != -1:
local_dim = val.shape[-1] // 3
n_head = config["num_attention_heads"]
val = val.reshape(n_head, 3, local_dim // n_head)
val = np.transpose(val, [1, 0, 2]).reshape(3, local_dim)
split_vals = np.split(val, factor, axis=-1)
elif key.find("attention.query_key_value.weight") != -1:
hidden_dim = val.shape[0]
local_dim = val.shape[-1] // 3
n_head = config["num_attention_heads"]
# Note that the HF qkv weight are stored as [hidden_size, num_heads, 3, head_hidden]
# FT needs the shape of [hidden_size, 3, num_heads, head_hidden]
val = val.reshape(hidden_dim, n_head, 3, local_dim // n_head)
val = np.transpose(val, [0, 2, 1, 3]).reshape(hidden_dim, 3, local_dim)
# print(np.mean(np.abs(val[:, 0, :])))
split_vals = np.split(val, factor, axis=-1)
else:
print("[ERROR] cannot find key '{}'".format(key))
return
for j in range(factor):
saved_path = saved_dir + f"/model.{key}.{j}.bin"
split_vals[j].tofile(saved_path)
def split_and_convert(args):
saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num
if not os.path.exists(saved_dir):
os.makedirs(saved_dir)
factor = args.infer_gpu_num
# load position_embedding from rank 0
# model = torch.load(ckpt_name)
model = GPTNeoXForCausalLM.from_pretrained(args.in_file)
hf_config = vars(model.config)
np_weight_data_type = get_weight_data_type(args.weight_data_type)
try:
model_name = args.model_name
n_heads = hf_config["num_attention_heads"]
head_size = hf_config["hidden_size"] // n_heads
rotary_dim = int(head_size * hf_config["rotary_pct"])
use_gptj_residual = int(hf_config["use_parallel_residual"])
config = configparser.ConfigParser()
config["gptneox"] = {}
config["gptneox"]["model_name"] = model_name
config["gptneox"]["head_num"] = str(n_heads)
config["gptneox"]["size_per_head"] = str(head_size)
config["gptneox"]["inter_size"] = str(hf_config["intermediate_size"])
config["gptneox"]["num_layer"] = str(hf_config["num_hidden_layers"])
config["gptneox"]["rotary_embedding"] = str(rotary_dim)
config["gptneox"]["vocab_size"] = str(hf_config["vocab_size"])
config["gptneox"]["start_id"] = str(hf_config["bos_token_id"])
config["gptneox"]["end_id"] = str(hf_config["eos_token_id"])
config["gptneox"]["use_gptj_residual"] = str(use_gptj_residual)
config["gptneox"]["weight_data_type"] = args.weight_data_type
with open((Path(saved_dir) / f"config.ini").as_posix(), "w") as configfile:
config.write(configfile)
except Exception as e:
print(f"Fail to save the config in config.ini.", e)
ft_model_name_pattern = [
"input_layernorm.bias",
"input_layernorm.weight",
"attention.query_key_value.bias",
"attention.query_key_value.weight",
"attention.dense.bias",
"attention.dense.weight",
"post_attention_layernorm.bias",
"post_attention_layernorm.weight",
"mlp.dense_h_to_4h.bias",
"mlp.dense_h_to_4h.weight",
"mlp.dense_4h_to_h.bias",
"mlp.dense_4h_to_h.weight",
]
huggingface_model_file_list = [
hf_file_name
for hf_file_name in os.listdir(args.in_file)
if hf_file_name.endswith(".bin")
]
if len(huggingface_model_file_list) > 1:
multiprocessing_context = multiprocessing.get_context()
pool_fn = multiprocessing_context.Pool
else:
torch.multiprocessing.set_start_method("spawn")
pool_fn = multiprocessing.Pool
pool = pool_fn(args.processes)
for name, param in model.named_parameters():
array = param.detach().cpu().numpy().astype(np_weight_data_type)
# print("input shape", name, array.shape)
if name.find("weight") == -1 and name.find("bias") == -1:
print("skipped", name)
continue
elif name == "gpt_neox.embed_in.weight":
array.tofile(saved_dir + "model.wte.bin")
elif name == "gpt_neox.final_layer_norm.bias":
array.tofile(saved_dir + "model.final_layernorm.bias.bin")
elif name == "gpt_neox.final_layer_norm.weight":
array.tofile(saved_dir + "model.final_layernorm.weight.bin")
elif name == "embed_out.weight":
array.tofile(saved_dir + "model.lm_head.weight.bin")
else:
processed = False
for i in range(len(ft_model_name_pattern)):
if name.find(ft_model_name_pattern[i]) != -1:
new_name = name.replace("gpt_neox.", "")
pool.starmap(
split_and_convert_process,
[
(
saved_dir,
factor,
new_name,
args,
vars(model.config),
array.T,
)
],
)
processed = True
break
if not processed:
print("Unused layer", name)
pool.close()
pool.join()
# Post-process biases if use_gptj_residual is True
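# With the parallel (GPT-J style) residual, the runtime applies one shared per-layer bias, so the attention and MLP output biases are pre-summed into a single file here.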
if use_gptj_residual:
for layer_idx in range(hf_config["num_hidden_layers"]):
attn_bias = np.fromfile(
saved_dir + f"/model.layers.{layer_idx}.attention.dense.bias.bin",
dtype=np_weight_data_type,
)
mlp_bias = np.fromfile(
saved_dir + f"/model.layers.{layer_idx}.mlp.dense_4h_to_h.bias.bin",
dtype=np_weight_data_type,
)
(attn_bias + mlp_bias).astype(np_weight_data_type).tofile(
saved_dir + f"/model.layers.{layer_idx}.mlp.attention.bias.sum.bin"
)
if __name__ == "__main__":
parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument(
"-saved_dir", "-o", type=str, help="file name of output file", required=True
)
parser.add_argument(
"-in_file",
"-i",
type=str,
help="file name of input checkpoint file",
required=True,
)
parser.add_argument(
"-infer_gpu_num",
"-i_g",
type=int,
help="How many gpus for inference",
required=True,
)
parser.add_argument(
"-processes",
"-p",
type=int,
help="How many processes to spawn for conversion (default: 4)",
default=4,
)
parser.add_argument(
"-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]
)
parser.add_argument(
"-model_name", "-m_n", type=str, help="model name", required=True
)
args = parser.parse_args()
print("\n=============== Argument ===============")
for key in vars(args):
print("{}: {}".format(key, vars(args)[key]))
print("========================================")
target_dir_path = os.path.join(args.saved_dir, "%d-gpu" % args.infer_gpu_num)
assert not os.path.exists(target_dir_path), (
"target path has exist, please remove %s first." % target_dir_path
)
split_and_convert(args)

View File

@@ -1,47 +0,0 @@
from dataclasses import dataclass, field
from huggingface_hub import snapshot_download
from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
@dataclass
class Arguments:
repo_id: str = field(
metadata={"help": "Huggingface model repository id, e.g TabbyML/NeoX-160M"}
)
prefer_local_files: bool = field(
metadata={
"help": "Whether prefer loading local files (skip remote version check if local files are valid)."
},
default=True,
)
def parse_args():
parser = HfArgumentParser(Arguments)
return parser.parse_args()
def preload(local_files_only=False):
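# Warm the local HuggingFace cache: tokenizer, model weights, and any Triton-format artifacts under triton/ in the repo.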
AutoTokenizer.from_pretrained(args.repo_id, local_files_only=local_files_only)
AutoModelForCausalLM.from_pretrained(
args.repo_id, local_files_only=local_files_only
)
snapshot_download(
repo_id=args.repo_id,
allow_patterns="triton/**/*",
local_files_only=local_files_only,
)
if __name__ == "__main__":
args = parse_args()
print(f"Loading {args.repo_id}, this will take a while...")
try:
preload(local_files_only=args.prefer_local_files)
except Exception as e:
if "offline" in str(e) or "local_files_only" in str(e):
preload(local_files_only=False)
else:
raise e
print(f"Loaded {args.repo_id} !")

View File

@@ -1,11 +0,0 @@
all:
CMD := poetry run python updater.py --data_dir=./testdata/repositories --config_file=testdata/config.toml
test:
# Cleanup
rm -rf testdata/repositories/quickjs
# Initialize
$(CMD)
# Updating
$(CMD)

View File

@@ -1,4 +0,0 @@
data_dir = "/repositories"
[repositories.quickjs]
url = "https://gitee.com/vsf-linux/quickjs.git"

View File

@@ -1 +0,0 @@
quickjs

View File

@@ -1,43 +0,0 @@
import os
import pathlib
import shutil
from dataclasses import dataclass, field
import toml
from git import Repo
from transformers import HfArgumentParser
@dataclass
class Arguments:
data_dir: str = field(metadata={"help": "Base dir for repositories"})
config_file: str = field(metadata={"help": "Configuration file for tabby updater"})
def parse_args():
parser = HfArgumentParser(Arguments)
return parser.parse_args()
if __name__ == "__main__":
args = parse_args()
config = toml.load(args.config_file)
repositories = config["projects"]
for x in pathlib.Path(args.data_dir).glob("*"):
if x.is_dir() and x.name not in repositories:
print("Remove unused dir:", x)
shutil.rmtree(str(x))
elif x.is_file():
print("Remove unused file:", x)
x.unlink()
for name, repo_config in repositories.items():
path = pathlib.Path(args.data_dir, name)
if path.is_dir():
repo = Repo(path)
else:
Repo.clone_from(repo_config["url"], path.absolute(), depth=1)
os.system(f"gitup {args.data_dir}")