diff --git a/tabby/tools/converter/huggingface_gptneox_convert.py b/tabby/tools/converter/huggingface_gptneox_convert.py index 9fb4091..636b669 100644 --- a/tabby/tools/converter/huggingface_gptneox_convert.py +++ b/tabby/tools/converter/huggingface_gptneox_convert.py @@ -2,7 +2,6 @@ import argparse import configparser import multiprocessing import os -import shutil from pathlib import Path import numpy as np @@ -24,10 +23,8 @@ def split_and_convert_process(saved_dir, factor, key, args, config, val): if ( key.find("input_layernorm.weight") != -1 or key.find("input_layernorm.bias") != -1 - or key.find("attention.dense.bias") != -1 or key.find("post_attention_layernorm.weight") != -1 or key.find("post_attention_layernorm.bias") != -1 - or key.find("mlp.dense_4h_to_h.bias") != -1 or key.find("final_layernorm.weight") != -1 or key.find("final_layernorm.bias") != -1 ): @@ -35,54 +32,53 @@ def split_and_convert_process(saved_dir, factor, key, args, config, val): val.tofile(saved_path) elif ( - key.find("attention.dense.weight") != -1 - or key.find("mlp.dense_4h_to_h.weight") != -1 + key.find("attention.dense.bias") != -1 + or key.find("mlp.dense_4h_to_h.bias") != -1 ): - split_vals = np.split(val, factor, axis=0) - for j in range(factor): - saved_path = saved_dir + f"/model.{key}.{j}.bin" - split_vals[j].tofile(saved_path) - - elif ( - key.find("mlp.dense_h_to_4h.weight") != -1 - or key.find("mlp.dense_h_to_4h.bias") != -1 - ): - - split_vals = np.split(val, factor, axis=-1) - for j in range(factor): - saved_path = saved_dir + f"/model.{key}.{j}.bin" - split_vals[j].tofile(saved_path) - - elif key.find("attention.query_key_value.bias") != -1: - local_dim = (int)(val.shape[-1] / 3) - n_head = config["num_attention_heads"] - - val = val.reshape(n_head, 3, local_dim // n_head) - val = np.transpose(val, [1, 0, 2]).reshape(3, local_dim) - split_vals = np.split(val, factor, axis=-1) - - for j in range(factor): - saved_path = saved_dir + f"/model.{key}.{j}.bin" - split_vals[j].tofile(saved_path) - - elif key.find("attention.query_key_value.weight") != -1: - hidden_dim = val.shape[0] - local_dim = (int)(val.shape[-1] / 3) - n_head = config["num_attention_heads"] - # Note that the HF qkv weight are stored as [hidden_size, num_heads, 3, head_hidden] - # FT needs the shape of [hidden_size, 3, num_heads, head_hidden] - val = val.reshape(hidden_dim, n_head, 3, local_dim // n_head) - val = np.transpose(val, [0, 2, 1, 3]).reshape(hidden_dim, 3, local_dim) - - # print(np.mean(np.abs(val[:, 0, :]))) - split_vals = np.split(val, factor, axis=-1) - - for j in range(factor): - saved_path = saved_dir + f"/model.{key}.{j}.bin" - split_vals[j].tofile(saved_path) + saved_path = saved_dir + f"/model.{key}.bin" + val = (val / factor) if factor > 1 else val + val.tofile(saved_path) else: - print("[ERROR] cannot find key '{}'".format(key)) + if ( + key.find("attention.dense.weight") != -1 + or key.find("mlp.dense_4h_to_h.weight") != -1 + ): + split_vals = np.split(val, factor, axis=0) + + elif ( + key.find("mlp.dense_h_to_4h.weight") != -1 + or key.find("mlp.dense_h_to_4h.bias") != -1 + ): + split_vals = np.split(val, factor, axis=-1) + + elif key.find("attention.query_key_value.bias") != -1: + local_dim = (int)(val.shape[-1] / 3) + n_head = config["num_attention_heads"] + + val = val.reshape(n_head, 3, local_dim // n_head) + val = np.transpose(val, [1, 0, 2]).reshape(3, local_dim) + split_vals = np.split(val, factor, axis=-1) + + elif key.find("attention.query_key_value.weight") != -1: + hidden_dim = val.shape[0] + local_dim = (int)(val.shape[-1] / 3) + n_head = config["num_attention_heads"] + # Note that the HF qkv weight are stored as [hidden_size, num_heads, 3, head_hidden] + # FT needs the shape of [hidden_size, 3, num_heads, head_hidden] + val = val.reshape(hidden_dim, n_head, 3, local_dim // n_head) + val = np.transpose(val, [0, 2, 1, 3]).reshape(hidden_dim, 3, local_dim) + + # print(np.mean(np.abs(val[:, 0, :]))) + split_vals = np.split(val, factor, axis=-1) + + else: + print("[ERROR] cannot find key '{}'".format(key)) + return + + for j in range(factor): + saved_path = saved_dir + f"/model.{key}.{j}.bin" + split_vals[j].tofile(saved_path) def split_and_convert(args): @@ -91,11 +87,7 @@ def split_and_convert(args): if os.path.exists(saved_dir) == False: os.makedirs(saved_dir) - t_gpu_num = args.trained_gpu_num - i_gpu_num = args.infer_gpu_num - assert i_gpu_num % t_gpu_num == 0 - - factor = (int)(i_gpu_num / t_gpu_num) + factor = args.infer_gpu_num # load position_embedding from rank 0 # model = torch.load(ckpt_name) @@ -145,8 +137,20 @@ def split_and_convert(args): "mlp.dense_4h_to_h.weight", ] - torch.multiprocessing.set_start_method("spawn") - pool = multiprocessing.Pool(args.processes) + huggingface_model_file_list = [ + hf_file_name + for hf_file_name in os.listdir(args.in_file) + if hf_file_name.endswith(".bin") + ] + if len(huggingface_model_file_list) > 1: + multiprocessing_context = multiprocessing.get_context() + pool_fn = multiprocessing_context.Pool + else: + torch.multiprocessing.set_start_method("spawn") + pool_fn = multiprocessing.Pool + + pool = pool_fn(args.processes) + for name, param in model.named_parameters(): array = param.detach().cpu().numpy().astype(np_weight_data_type) # print("input shape", name, array.shape) @@ -217,13 +221,6 @@ if __name__ == "__main__": help="file name of input checkpoint file", required=True, ) - parser.add_argument( - "-trained_gpu_num", - "-t_g", - type=int, - help="How many gpus for inference", - default=1, - ) parser.add_argument( "-infer_gpu_num", "-i_g", @@ -251,5 +248,9 @@ if __name__ == "__main__": print("{}: {}".format(key, vars(args)[key])) print("========================================") - shutil.rmtree(args.saved_dir, ignore_errors=True) + target_dir_path = os.path.join(args.saved_dir, "%d-gpu" % args.infer_gpu_num) + assert not os.path.exists(target_dir_path), ( + "target path has exist, please remove %s first." % target_dir_path + ) + split_and_convert(args)