Merge pull request #1 from TabbyML/support-gptneox-converter

Support triton fastertransformer converter with GptNeoX
2023-03-20 17:12:01 +08:00 · 2023-03-20 17:12:01 +08:00 · 8459a23a9c
parent 0cd5482eaf fe8e02aec6
commit 8459a23a9c
199 changed files with 938 additions and 474 deletions
--- a/converter/huggingface_gptj_convert.py
+++ b/converter/huggingface_gptj_convert.py
@ -1,253 +0,0 @@
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.  All rights reserved.
-# Modified by Brendan Dolan-Gavitt, 2022
-# Modified by Meng Zhang, 2023
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import configparser
-import multiprocessing
-import os
-import sys
-from pathlib import Path
-
-import numpy as np
-import torch
-from transformers import GPTJForCausalLM
-
-dir_path = os.path.dirname(os.path.realpath(__file__))
-sys.path.append(dir_path + "/../../../..")
-sys.path.append(dir_path)
-
-
-def get_weight_data_type(data_type):
-    if data_type == "fp32":
-        return np.float32
-    elif data_type == "fp16":
-        return np.float16
-    else:
-        assert False, f"Invalid weight data type {data_type}"
-
-
-def split_and_convert_process(i, saved_dir, factor, key, val):
-    if (
-        key.find("input_layernorm.weight") != -1
-        or key.find("input_layernorm.bias") != -1
-        or key.find("attention.dense.bias") != -1
-        or key.find("post_attention_layernorm.weight") != -1
-        or key.find("post_attention_layernorm.bias") != -1
-        or key.find("mlp.dense_4h_to_h.bias") != -1
-        or key.find("final_layernorm.weight") != -1
-        or key.find("final_layernorm.bias") != -1
-    ):
-
-        # shared weights, only need to convert the weights of rank 0
-        if i == 0:
-            saved_path = saved_dir + "/model." + key + ".bin"
-            val.tofile(saved_path)
-
-    elif (
-        key.find("attention.dense.weight") != -1
-        or key.find("mlp.dense_4h_to_h.weight") != -1
-    ):
-        split_vals = np.split(val, factor, axis=0)
-        for j in range(factor):
-            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
-            split_vals[j].tofile(saved_path)
-
-    elif (
-        key.find("mlp.dense_h_to_4h.weight") != -1
-        or key.find("mlp.dense_h_to_4h.bias") != -1
-    ):
-
-        split_vals = np.split(val, factor, axis=-1)
-        for j in range(factor):
-            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
-            split_vals[j].tofile(saved_path)
-
-    elif key.find("attention.query_key_value.weight") != -1:
-        split_vals = np.split(val, factor, axis=-1)
-
-        for j in range(factor):
-            saved_path = saved_dir + "/model." + key + ".%d.bin" % (i * factor + j)
-            split_vals[j].tofile(saved_path)
-
-    else:
-        print("[ERROR] cannot find key '{}'".format(key))
-
-
-def split_and_convert(args):
-    saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num
-
-    if os.path.exists(saved_dir) is False:
-        os.makedirs(saved_dir)
-
-    t_gpu_num = args.trained_gpu_num
-    i_gpu_num = args.infer_gpu_num
-    assert i_gpu_num % t_gpu_num == 0
-
-    factor = (int)(i_gpu_num / t_gpu_num)
-
-    model = GPTJForCausalLM.from_pretrained(args.in_file)
-
-    try:
-        config = configparser.ConfigParser()
-        config["gpt"] = {}
-        for key in vars(args):
-            config["gpt"][key] = f"{vars(args)[key]}"
-        for k, v in vars(model.config).items():
-            config["gpt"][k] = f"{v}"
-        config["gpt"]["weight_data_type"] = args.weight_data_type
-        with open((Path(saved_dir) / "config.ini").as_posix(), "w") as configfile:
-            config.write(configfile)
-    except Exception:
-        print("Fail to save the config in config.ini.")
-    np_weight_data_type = get_weight_data_type(args.weight_data_type)
-
-    huggingface_model_name_pattern = [
-        "ln_1.bias",
-        "ln_1.weight",
-        "attn.q_proj.weight",
-        "attn.out_proj.weight",
-        "mlp.fc_in.bias",
-        "mlp.fc_in.weight",
-        "mlp.fc_out.bias",
-        "mlp.fc_out.weight",
-    ]
-
-    ft_model_name_pattern = [
-        "input_layernorm.bias",
-        "input_layernorm.weight",
-        "attention.query_key_value.weight",
-        "attention.dense.weight",
-        "mlp.dense_h_to_4h.bias",
-        "mlp.dense_h_to_4h.weight",
-        "mlp.dense_4h_to_h.bias",
-        "mlp.dense_4h_to_h.weight",
-    ]
-
-    torch.multiprocessing.set_start_method("spawn")
-    pool = multiprocessing.Pool(args.processes)
-    for name, param in model.named_parameters():
-        if name.find("weight") == -1 and name.find("bias") == -1:
-            continue
-        print(name)
-        if name == "transformer.wte.weight":
-            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
-                saved_dir + "model.wte.bin"
-            )
-        elif name == "transformer.ln_f.bias":
-            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
-                saved_dir + "model.final_layernorm.bias.bin"
-            )
-        elif name == "transformer.ln_f.weight":
-            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
-                saved_dir + "model.final_layernorm.weight.bin"
-            )
-        elif name == "lm_head.weight":
-            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
-                saved_dir + "model.lm_head.weight.bin"
-            )
-        elif name == "lm_head.bias":
-            param.detach().cpu().numpy().astype(np_weight_data_type).tofile(
-                saved_dir + "model.lm_head.bias.bin"
-            )
-        else:
-            for i in range(len(huggingface_model_name_pattern)):
-                if name.find(huggingface_model_name_pattern[i]) != -1:
-                    # Special case for QKV weights
-                    if name.find("attn.q_proj.weight") != -1:
-                        layer = name.split(".")[2]
-                        base_k = f"transformer.h.{layer}."
-                        w = model.state_dict()
-                        QKV_w = torch.stack(
-                            [
-                                w[base_k + "attn.q_proj.weight"],
-                                w[base_k + "attn.k_proj.weight"],
-                                w[base_k + "attn.v_proj.weight"],
-                            ]
-                        )  # [qkv, n_heads * dim_head, latent_space]
-                        QKV_w = QKV_w.permute(2, 0, 1)
-                        weights = (
-                            QKV_w.detach().cpu().numpy().astype(np_weight_data_type)
-                        )
-                    else:
-                        weights = (
-                            param.detach().cpu().numpy().astype(np_weight_data_type)
-                        )
-
-                    # Some weights need to be transposed
-                    if (
-                        name.find("mlp.fc_in.weight") != -1
-                        or name.find("mlp.fc_out.weight") != -1
-                        or name.find("attn.out_proj.weight") != -1
-                    ):
-                        weights = weights.T
-
-                    new_name = name.replace("transformer.h.", "layers.").replace(
-                        huggingface_model_name_pattern[i], ft_model_name_pattern[i]
-                    )
-
-                    pool.starmap(
-                        split_and_convert_process,
-                        [(0, saved_dir, factor, new_name, weights)],
-                    )
-
-    pool.close()
-    pool.join()
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        "-saved_dir", "-o", type=str, help="file name of output file", required=True
-    )
-    parser.add_argument(
-        "-in_file", "-i", type=str, help="HF model name or directory", required=True
-    )
-    parser.add_argument(
-        "-trained_gpu_num",
-        "-t_g",
-        type=int,
-        help="How many gpus for training",
-        default=1,
-    )
-    parser.add_argument(
-        "-infer_gpu_num",
-        "-i_g",
-        type=int,
-        help="How many gpus for inference",
-        required=True,
-    )
-    parser.add_argument(
-        "-processes",
-        "-p",
-        type=int,
-        help="How many processes to spawn for conversion (default: 4)",
-        default=4,
-    )
-    parser.add_argument(
-        "-weight_data_type",
-        type=str,
-        default="fp32",
-        choices=["fp32", "fp16"],
-        help="output weight data type",
-    )
-
-    args = parser.parse_args()
-    print("\n=============== Argument ===============")
-    for key in vars(args):
-        print("{}: {}".format(key, vars(args)[key]))
-    print("========================================")
-
-    split_and_convert(args)
--- a/converter/huggingface_gptneox_convert.py
+++ b/converter/huggingface_gptneox_convert.py
@ -0,0 +1,257 @@
+import argparse
+import configparser
+import multiprocessing
+import os
+import shutil
+import sys
+from pathlib import Path
+
+import numpy as np
+import torch
+from transformers import GPTNeoXForCausalLM
+
+
+def get_weight_data_type(data_type):
+    """Map a weight precision name to the matching NumPy dtype.
+
+    Args:
+        data_type: Precision name, either "fp32" or "fp16".
+
+    Returns:
+        ``np.float32`` for "fp32" and ``np.float16`` for "fp16".
+
+    Raises:
+        ValueError: If ``data_type`` is any other value.
+    """
+    if data_type == "fp32":
+        return np.float32
+    if data_type == "fp16":
+        return np.float16
+    # Raise explicitly instead of `assert False`: asserts are stripped under
+    # `python -O`, which would make this function silently return None.
+    raise ValueError(f"Invalid weight data type {data_type}")
+
+
+def split_and_convert_process(saved_dir, factor, key, args, config, val):
+    """Write one converted tensor under ``saved_dir``, sharding it if needed.
+
+    The tensor kind is recognised by substring matches on ``key``:
+
+    * layernorm weights/biases and the ``attention.dense`` /
+      ``mlp.dense_4h_to_h`` biases are written whole to ``model.<key>.bin``;
+    * ``attention.dense.weight`` and ``mlp.dense_4h_to_h.weight`` are split
+      into ``factor`` pieces along axis 0;
+    * ``mlp.dense_h_to_4h`` weight/bias are split along the last axis;
+    * ``attention.query_key_value`` weight/bias are first regrouped so the
+      q/k/v axis precedes the head axis, then split along the last axis.
+
+    Sharded tensors are written to ``model.<key>.<shard>.bin``. A key that
+    matches none of the patterns is only reported on stdout.
+
+    Args:
+        saved_dir: Output directory (string); file names are appended to it.
+        factor: Number of shards to split sharded tensors into.
+        key: Tensor name used both for dispatch and in the file name.
+        args: Unused by this function.
+        config: Mapping that provides ``"num_attention_heads"``.
+        val: NumPy array holding the tensor to write.
+    """
+    prefix = saved_dir + f"/model.{key}"
+
+    def key_has(*patterns):
+        # True when any of the given substrings occurs in the tensor name.
+        return any(pattern in key for pattern in patterns)
+
+    def dump_shards(tensor, axis):
+        # Split into `factor` equal pieces and write one file per piece.
+        for shard_idx, shard in enumerate(np.split(tensor, factor, axis=axis)):
+            shard.tofile(prefix + f".{shard_idx}.bin")
+
+    if key_has(
+        "input_layernorm.weight",
+        "input_layernorm.bias",
+        "attention.dense.bias",
+        "post_attention_layernorm.weight",
+        "post_attention_layernorm.bias",
+        "mlp.dense_4h_to_h.bias",
+        "final_layernorm.weight",
+        "final_layernorm.bias",
+    ):
+        # Not sharded: a single file holds the whole tensor.
+        val.tofile(prefix + ".bin")
+        return
+
+    if key_has("attention.dense.weight", "mlp.dense_4h_to_h.weight"):
+        dump_shards(val, 0)
+        return
+
+    if key_has("mlp.dense_h_to_4h.weight", "mlp.dense_h_to_4h.bias"):
+        dump_shards(val, -1)
+        return
+
+    if key_has("attention.query_key_value.bias"):
+        per_proj_dim = int(val.shape[-1] / 3)
+        heads = config["num_attention_heads"]
+        # [num_heads, 3, head_hidden] -> [3, num_heads * head_hidden]
+        regrouped = val.reshape(heads, 3, per_proj_dim // heads)
+        regrouped = np.transpose(regrouped, [1, 0, 2]).reshape(3, per_proj_dim)
+        dump_shards(regrouped, -1)
+        return
+
+    if key_has("attention.query_key_value.weight"):
+        rows = val.shape[0]
+        per_proj_dim = int(val.shape[-1] / 3)
+        heads = config["num_attention_heads"]
+        # Note that the HF qkv weight are stored as
+        # [hidden_size, num_heads, 3, head_hidden];
+        # FT needs the shape of [hidden_size, 3, num_heads, head_hidden].
+        regrouped = val.reshape(rows, heads, 3, per_proj_dim // heads)
+        regrouped = np.transpose(regrouped, [0, 2, 1, 3]).reshape(
+            rows, 3, per_proj_dim
+        )
+        dump_shards(regrouped, -1)
+        return
+
+    print("[ERROR] cannot find key '{}'".format(key))
+
+
+def split_and_convert(args):
+    """Convert a Hugging Face GPT-NeoX checkpoint into per-tensor ``.bin`` files.
+
+    Loads the model named by ``args.in_file``, writes a ``config.ini`` with a
+    ``[gptneox]`` section, dumps every weight/bias parameter (sharding the
+    per-layer tensors via ``split_and_convert_process``) and, when the model
+    uses the parallel residual, writes the per-layer sum of the attention
+    output bias and the MLP output bias.
+
+    Args:
+        args: Parsed command-line namespace. Reads ``saved_dir``,
+            ``infer_gpu_num``, ``trained_gpu_num``, ``in_file``,
+            ``weight_data_type``, ``model_name`` and ``processes``.
+
+    Raises:
+        AssertionError: If ``infer_gpu_num`` is not a multiple of
+            ``trained_gpu_num``.
+        KeyError: If the model config lacks ``use_parallel_residual`` or
+            ``num_hidden_layers``.
+    """
+    saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num
+    os.makedirs(saved_dir, exist_ok=True)
+
+    t_gpu_num = args.trained_gpu_num
+    i_gpu_num = args.infer_gpu_num
+    assert i_gpu_num % t_gpu_num == 0
+
+    factor = i_gpu_num // t_gpu_num
+
+    model = GPTNeoXForCausalLM.from_pretrained(args.in_file)
+    hf_config = vars(model.config)
+
+    np_weight_data_type = get_weight_data_type(args.weight_data_type)
+
+    # Resolved outside the best-effort config block below: the bias
+    # post-processing at the end depends on these values, so they must exist
+    # even when writing config.ini fails.
+    use_gptj_residual = int(hf_config["use_parallel_residual"])
+    num_layers = hf_config["num_hidden_layers"]
+
+    try:
+        n_heads = hf_config["num_attention_heads"]
+        head_size = hf_config["hidden_size"] // n_heads
+        rotary_dim = int(head_size * hf_config["rotary_pct"])
+
+        config = configparser.ConfigParser()
+        config["gptneox"] = {
+            "model_name": args.model_name,
+            "head_num": str(n_heads),
+            "size_per_head": str(head_size),
+            "inter_size": str(hf_config["intermediate_size"]),
+            "num_layer": str(num_layers),
+            "rotary_embedding": str(rotary_dim),
+            "vocab_size": str(hf_config["vocab_size"]),
+            "start_id": str(hf_config["bos_token_id"]),
+            "end_id": str(hf_config["eos_token_id"]),
+            "use_gptj_residual": str(use_gptj_residual),
+            "weight_data_type": args.weight_data_type,
+        }
+
+        with open((Path(saved_dir) / "config.ini").as_posix(), "w") as configfile:
+            config.write(configfile)
+    except Exception as e:
+        # Deliberately best-effort: the weights are still converted even if
+        # the config file cannot be written.
+        print("Fail to save the config in config.ini.", e)
+
+    # Substrings identifying the per-layer tensors that are handed to
+    # split_and_convert_process.
+    ft_model_name_pattern = [
+        "input_layernorm.bias",
+        "input_layernorm.weight",
+        "attention.query_key_value.bias",
+        "attention.query_key_value.weight",
+        "attention.dense.bias",
+        "attention.dense.weight",
+        "post_attention_layernorm.bias",
+        "post_attention_layernorm.weight",
+        "mlp.dense_h_to_4h.bias",
+        "mlp.dense_h_to_4h.weight",
+        "mlp.dense_4h_to_h.bias",
+        "mlp.dense_4h_to_h.weight",
+    ]
+
+    # Tensors outside the layer stack are written whole under fixed names.
+    top_level_files = {
+        "gpt_neox.embed_in.weight": "model.wte.bin",
+        "gpt_neox.final_layer_norm.bias": "model.final_layernorm.bias.bin",
+        "gpt_neox.final_layer_norm.weight": "model.final_layernorm.weight.bin",
+        "embed_out.weight": "model.lm_head.weight.bin",
+    }
+
+    torch.multiprocessing.set_start_method("spawn")
+    pool = multiprocessing.Pool(args.processes)
+    try:
+        for name, param in model.named_parameters():
+            if "weight" not in name and "bias" not in name:
+                print("skipped", name)
+                continue
+
+            # Convert only after the skip check so skipped parameters cost
+            # nothing.
+            array = param.detach().cpu().numpy().astype(np_weight_data_type)
+
+            if name in top_level_files:
+                array.tofile(saved_dir + top_level_files[name])
+            elif any(pattern in name for pattern in ft_model_name_pattern):
+                new_name = name.replace("gpt_neox.", "")
+                # The worker receives the transposed array (a no-op for 1-D
+                # biases).
+                pool.starmap(
+                    split_and_convert_process,
+                    [(saved_dir, factor, new_name, args, hf_config, array.T)],
+                )
+            else:
+                print("Unused layer", name)
+    finally:
+        # Always release the worker processes, even if a conversion fails.
+        pool.close()
+        pool.join()
+
+    # Post-process biases if use_gptj_residual is True
+    if use_gptj_residual:
+        for layer_idx in range(num_layers):
+            layer_prefix = saved_dir + f"/model.layers.{layer_idx}"
+            # Read the biases back with the dtype they were written in;
+            # reading fp16 files as float32 would misinterpret the bytes.
+            attn_bias = np.fromfile(
+                layer_prefix + ".attention.dense.bias.bin",
+                dtype=np_weight_data_type,
+            )
+            mlp_bias = np.fromfile(
+                layer_prefix + ".mlp.dense_4h_to_h.bias.bin",
+                dtype=np_weight_data_type,
+            )
+
+            (attn_bias + mlp_bias).tofile(
+                layer_prefix + ".mlp.attention.bias.sum.bin"
+            )
+
+
+if __name__ == "__main__":
+    # Command-line entry point: parse the options, echo them, clear the
+    # output directory and run the conversion.
+    parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
+    parser.add_argument(
+        "-saved_dir",
+        "-o",
+        type=str,
+        help="output directory (a <infer_gpu_num>-gpu subdirectory is created in it)",
+        required=True,
+    )
+    parser.add_argument(
+        "-in_file",
+        "-i",
+        type=str,
+        help="HF model name or directory",
+        required=True,
+    )
+    parser.add_argument(
+        "-trained_gpu_num",
+        "-t_g",
+        type=int,
+        help="How many gpus for training",
+        default=1,
+    )
+    parser.add_argument(
+        "-infer_gpu_num",
+        "-i_g",
+        type=int,
+        help="How many gpus for inference",
+        required=True,
+    )
+    parser.add_argument(
+        "-processes",
+        "-p",
+        type=int,
+        help="How many processes to spawn for conversion (default: 4)",
+        default=4,
+    )
+    parser.add_argument(
+        "-weight_data_type",
+        type=str,
+        default="fp32",
+        choices=["fp32", "fp16"],
+        help="output weight data type",
+    )
+    parser.add_argument(
+        "-model_name", "-m_n", type=str, help="model name", required=True
+    )
+
+    args = parser.parse_args()
+    print("\n=============== Argument ===============")
+    for key, value in vars(args).items():
+        print("{}: {}".format(key, value))
+    print("========================================")
+
+    # NOTE: this removes the WHOLE output directory, including any other
+    # <N>-gpu conversions already stored there, before converting.
+    shutil.rmtree(args.saved_dir, ignore_errors=True)
+    split_and_convert(args)
--- a/converter/testdata/1-gpu/config.ini
+++ b/converter/testdata/1-gpu/config.ini
@ -1,86 +0,0 @@
-[gpt]
-saved_dir = out
-in_file = hf-internal-testing/tiny-random-gptj
-trained_gpu_num = 1
-infer_gpu_num = 1
-processes = 4
-weight_data_type = fp32
-vocab_size = 1000
-n_positions = 512
-n_embd = 32
-n_layer = 5
-n_head = 4
-n_inner = None
-rotary_dim = 4
-activation_function = gelu_new
-resid_pdrop = 0.0
-embd_pdrop = 0.0
-attn_pdrop = 0.0
-layer_norm_epsilon = 1e-05
-initializer_range = 0.02
-use_cache = True
-bos_token_id = 98
-eos_token_id = 98
-return_dict = True
-output_hidden_states = False
-output_attentions = False
-torchscript = False
-torch_dtype = None
-use_bfloat16 = False
-tf_legacy_loss = False
-pruned_heads = {}
-tie_word_embeddings = False
-is_encoder_decoder = False
-is_decoder = False
-cross_attention_hidden_size = None
-add_cross_attention = False
-tie_encoder_decoder = False
-max_length = 20
-min_length = 0
-do_sample = False
-early_stopping = False
-num_beams = 1
-num_beam_groups = 1
-diversity_penalty = 0.0
-temperature = 1.0
-top_k = 50
-top_p = 1.0
-typical_p = 1.0
-repetition_penalty = 1.0
-length_penalty = 1.0
-no_repeat_ngram_size = 0
-encoder_no_repeat_ngram_size = 0
-bad_words_ids = None
-num_return_sequences = 1
-chunk_size_feed_forward = 0
-output_scores = False
-return_dict_in_generate = False
-forced_bos_token_id = None
-forced_eos_token_id = None
-remove_invalid_values = False
-exponential_decay_length_penalty = None
-suppress_tokens = None
-begin_suppress_tokens = None
-architectures = None
-finetuning_task = None
-id2label = {0: 'LABEL_0', 1: 'LABEL_1'}
-label2id = {'LABEL_0': 0, 'LABEL_1': 1}
-tokenizer_class = None
-prefix = None
-pad_token_id = 98
-sep_token_id = None
-decoder_start_token_id = None
-task_specific_params = None
-problem_type = None
-_name_or_path = hf-internal-testing/tiny-random-gptj
-_commit_hash = b96595a4bcdeb272096214589efa0314259853a0
-transformers_version = 4.11.0.dev0
-attention_probs_dropout_prob = 0.0
-gradient_checkpointing = False
-hidden_act = gelu
-hidden_dropout_prob = 0.0
-intermediate_size = 37
-model_type = gptj
-n_ctx = 512
-scale_attn_weights = True
-type_vocab_size = 16
--- a/converter/testdata/1-gpu/model.final_layernorm.bias.bin
+++ b/converter/testdata/1-gpu/model.final_layernorm.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.final_layernorm.weight.bin
+++ b/converter/testdata/1-gpu/model.final_layernorm.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b638277a8690e175a9137feff1e43c067f9faf4e2f600caf468fb05b0403b717
-size 128
--- a/converter/testdata/1-gpu/model.layers.0.attention.dense.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.0.attention.dense.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:125bd07dc5e7a5c6444f7689ba78df0f4c7959b6dcfbaf7c89edd0634a147ea0
-size 4096
--- a/converter/testdata/1-gpu/model.layers.0.attention.query_key_value.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.0.attention.query_key_value.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:11fd52f2b94fad5fe54e2e03c5848050703ec4d798a43a4c1813f109a6703883
-size 12288
--- a/converter/testdata/1-gpu/model.layers.0.input_layernorm.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.0.input_layernorm.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.0.input_layernorm.weight.bin
+++ b/converter/testdata/1-gpu/model.layers.0.input_layernorm.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b638277a8690e175a9137feff1e43c067f9faf4e2f600caf468fb05b0403b717
-size 128
--- a/converter/testdata/1-gpu/model.layers.0.mlp.dense_4h_to_h.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.0.mlp.dense_4h_to_h.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.0.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.0.mlp.dense_4h_to_h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c7459667751237cef87ba33e5faf46e82ea33f7f53aaae7bf95a8667b6f9e639
-size 16384
--- a/converter/testdata/1-gpu/model.layers.0.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/testdata/1-gpu/model.layers.0.mlp.dense_h_to_4h.bias.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:076a27c79e5ace2a3d47f9dd2e83e4ff6ea8872b3c2218f66c92b89b55f36560
-size 512
--- a/converter/testdata/1-gpu/model.layers.0.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.0.mlp.dense_h_to_4h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:55de52e2d73e7e92962bc02673a7dce6123d77e486ac4f8ce7bead1a6727d227
-size 16384
--- a/converter/testdata/1-gpu/model.layers.1.attention.dense.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.1.attention.dense.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ec12f3d7c73cbc440b476ad66bb4a1ef43881c4d740e8114dc1578bb0197a17e
-size 4096
--- a/converter/testdata/1-gpu/model.layers.1.attention.query_key_value.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.1.attention.query_key_value.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:86d3c23c240260084ac27bd98d52524f0b3559d8106839d85bc927b44750bd81
-size 12288
--- a/converter/testdata/1-gpu/model.layers.1.input_layernorm.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.1.input_layernorm.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.1.input_layernorm.weight.bin
+++ b/converter/testdata/1-gpu/model.layers.1.input_layernorm.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b638277a8690e175a9137feff1e43c067f9faf4e2f600caf468fb05b0403b717
-size 128
--- a/converter/testdata/1-gpu/model.layers.1.mlp.dense_4h_to_h.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.1.mlp.dense_4h_to_h.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.1.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.1.mlp.dense_4h_to_h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:75338924b49fb652d556c260013520b58ca70c8bd782bf59732c5ca8d1de111d
-size 16384
--- a/converter/testdata/1-gpu/model.layers.1.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/testdata/1-gpu/model.layers.1.mlp.dense_h_to_4h.bias.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:076a27c79e5ace2a3d47f9dd2e83e4ff6ea8872b3c2218f66c92b89b55f36560
-size 512
--- a/converter/testdata/1-gpu/model.layers.1.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.1.mlp.dense_h_to_4h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:be97681ddf43d42ee25d0439c6958edac3549b0edda8a4e3e8bce4275916bb7c
-size 16384
--- a/converter/testdata/1-gpu/model.layers.2.attention.dense.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.2.attention.dense.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:52bfbb09f85dba050a997ddd1869ae74aa8e76a70a63fd4b7c047531128bec07
-size 4096
--- a/converter/testdata/1-gpu/model.layers.2.attention.query_key_value.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.2.attention.query_key_value.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:869b24c7a494832730ba41e0286966b23e700e4239d39198e78551a7ef5e8d7f
-size 12288
--- a/converter/testdata/1-gpu/model.layers.2.input_layernorm.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.2.input_layernorm.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.2.input_layernorm.weight.bin
+++ b/converter/testdata/1-gpu/model.layers.2.input_layernorm.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b638277a8690e175a9137feff1e43c067f9faf4e2f600caf468fb05b0403b717
-size 128
--- a/converter/testdata/1-gpu/model.layers.2.mlp.dense_4h_to_h.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.2.mlp.dense_4h_to_h.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.2.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.2.mlp.dense_4h_to_h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:029775363050d9e4c232301274a8085474ca7992a42cabc1112ff46938c5a178
-size 16384
--- a/converter/testdata/1-gpu/model.layers.2.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/testdata/1-gpu/model.layers.2.mlp.dense_h_to_4h.bias.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:076a27c79e5ace2a3d47f9dd2e83e4ff6ea8872b3c2218f66c92b89b55f36560
-size 512
--- a/converter/testdata/1-gpu/model.layers.2.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.2.mlp.dense_h_to_4h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:95beee5255e9d3d5d255ebc6d3f18465e1ab005b7e33e3d0d6495eb7a6178eed
-size 16384
--- a/converter/testdata/1-gpu/model.layers.3.attention.dense.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.3.attention.dense.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:7af1303e224fc1185c5d936ca8fb3ab336e46f1bd09c94f1e749936d0b023713
-size 4096
--- a/converter/testdata/1-gpu/model.layers.3.attention.query_key_value.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.3.attention.query_key_value.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:55961585790611913b727f6043af3589274c4b2350ea68bcacf25009e750cc37
-size 12288
--- a/converter/testdata/1-gpu/model.layers.3.input_layernorm.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.3.input_layernorm.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.3.input_layernorm.weight.bin
+++ b/converter/testdata/1-gpu/model.layers.3.input_layernorm.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b638277a8690e175a9137feff1e43c067f9faf4e2f600caf468fb05b0403b717
-size 128
--- a/converter/testdata/1-gpu/model.layers.3.mlp.dense_4h_to_h.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.3.mlp.dense_4h_to_h.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.3.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.3.mlp.dense_4h_to_h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:9f37f3e28a6e716d08d29d02126854bd6e6248763b9a30af2f2e1dcf7b8fd9a5
-size 16384
--- a/converter/testdata/1-gpu/model.layers.3.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/testdata/1-gpu/model.layers.3.mlp.dense_h_to_4h.bias.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:076a27c79e5ace2a3d47f9dd2e83e4ff6ea8872b3c2218f66c92b89b55f36560
-size 512
--- a/converter/testdata/1-gpu/model.layers.3.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.3.mlp.dense_h_to_4h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d903dfb50cce4eae673a2d96a7814353ac1436c0c4d65490bb3ed7d8f8cfc586
-size 16384
--- a/converter/testdata/1-gpu/model.layers.4.attention.dense.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.4.attention.dense.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb76390eda7af1d23d5b162446b4c5da0b791ec337d20d9a643b9716377ae514
-size 4096
--- a/converter/testdata/1-gpu/model.layers.4.attention.query_key_value.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.4.attention.query_key_value.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8f54ff65f519de2db6343ec257da92091162381082fbb5dba04df50d8e12a158
-size 12288
--- a/converter/testdata/1-gpu/model.layers.4.input_layernorm.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.4.input_layernorm.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.4.input_layernorm.weight.bin
+++ b/converter/testdata/1-gpu/model.layers.4.input_layernorm.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:b638277a8690e175a9137feff1e43c067f9faf4e2f600caf468fb05b0403b717
-size 128
--- a/converter/testdata/1-gpu/model.layers.4.mlp.dense_4h_to_h.bias.bin
+++ b/converter/testdata/1-gpu/model.layers.4.mlp.dense_4h_to_h.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:38723a2e5e8a17aa7950dc008209944e898f69a7bd10a23c839d341e935fd5ca
-size 128
--- a/converter/testdata/1-gpu/model.layers.4.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.4.mlp.dense_4h_to_h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:67e7225a2de28f7e21a72b7ea1cab517071744c53d13f4c6964cbc27f4d261e4
-size 16384
--- a/converter/testdata/1-gpu/model.layers.4.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/testdata/1-gpu/model.layers.4.mlp.dense_h_to_4h.bias.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:076a27c79e5ace2a3d47f9dd2e83e4ff6ea8872b3c2218f66c92b89b55f36560
-size 512
--- a/converter/testdata/1-gpu/model.layers.4.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/testdata/1-gpu/model.layers.4.mlp.dense_h_to_4h.weight.0.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:cb7582613a3b40b73ac166af2e6b9773511cf6fba7bec12f340119f588c8ea48
-size 16384
--- a/converter/testdata/1-gpu/model.lm_head.bias.bin
+++ b/converter/testdata/1-gpu/model.lm_head.bias.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:fc19b1997119425765295aeab72d76faa6927d4f83985d328c26f20468d6cc76
-size 4000
--- a/converter/testdata/1-gpu/model.lm_head.weight.bin
+++ b/converter/testdata/1-gpu/model.lm_head.weight.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:884fae8e52776e900d3800048825503048c95ab2850c38c1a595c6da962d3286
-size 128000
--- a/converter/testdata/1-gpu/model.wte.bin
+++ b/converter/testdata/1-gpu/model.wte.bin
@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d0d0fec061d1987d0f19acbd319c3471037691cebecd5aa2b00cedcdc2c6177c
-size 128000
--- a/converter/tests/gptneox/docker-compose.yaml
+++ b/converter/tests/gptneox/docker-compose.yaml
@ -0,0 +1,17 @@
+version: '3.3'
+services:
+  triton:
+    image: ghcr.io/tabbyml/fastertransformer_backend:main
+    command: mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model
+    shm_size: 1gb
+    ports:
+      - "8001:8001"
+    volumes:
+      - ./testdata:/model
+    deploy:
+      resources:
+        reservations:
+          devices:
+            - driver: nvidia
+              count: all
+              capabilities: [gpu]
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/config.ini
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/config.ini
@ -0,0 +1,12 @@
+[gptneox]
+model_name = gptneox
+head_num = 12
+size_per_head = 64
+inter_size = 3072
+num_layer = 12
+rotary_embedding = 64
+vocab_size = 50304
+start_id = 0
+end_id = 0
+use_gptj_residual = 0
+weight_data_type = fp32
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.final_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.final_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c8e8e6b531dabfeff987975cc6c937428139beb7f5699b8b938545716d8eeca7
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.final_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.final_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a20104b8a794c5dfa02c2847d25a2697ae60971ea296192dc88a9eaaa479f606
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.dense.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.dense.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3df9dbc92f1617b447db1aa855da24d07660a12c41b22dee002696813b3ebbba
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.dense.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.dense.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5eba16ee6fe2391cf2efcb7162e4662dd905a58cb8610ff32d07813e7575d6fc
+size 2359296
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.query_key_value.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.query_key_value.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:93fb9545a9c7ba88f8927bb1929a3d1d74962d18909ee21225e6b9756c77c332
+size 9216
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.query_key_value.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.attention.query_key_value.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:35a3c434d0af69813352edd06e29e671e8e160de1221a90abe6f308cd6b624ab
+size 7077888
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.input_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.input_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f324b52b78c5e245f0b68559ac1b40c7c54aa1d48005ca2d12887cb53dbbc9b2
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.input_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.input_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1c632d0534e7e7e4eab3d843a0a1f0d0f3843aa9b1aefd6b706ad71cf3de08ed
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_4h_to_h.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_4h_to_h.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7e2b220335cce206b3928380f069259c01e382937189f1d86d40641bea319a27
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_4h_to_h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2d59fe9d26e5a6b5bf67a2d631bf012cb67e30a8aee34d5a8b5589a1a3b767d0
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_h_to_4h.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1fb0620b893fc6c19a0d99f7b0b19fcbc81262b7de48532a0cfd74a6c2ceb11e
+size 12288
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.mlp.dense_h_to_4h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea4b2ed74e487155f748f4ad057070db5a1ef9bbe18f155caf05f1889a7db0e4
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.post_attention_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.post_attention_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0e949431800d44dc72725fb2a14c1b6257df33a087c89a5363504b2796f033ef
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.post_attention_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.0.post_attention_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e63fbed352f174592299da93e92c5652b45d4d439aea92867c30bc292d2d8891
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.dense.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.dense.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca3de5c5d037abd4ba67b56fca3fd35e11f2d3355313684066188424579295f0
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.dense.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.dense.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3bfab993d3877ea0dcdbb35f1bfbe371cf03dee47e23a124e1a3004c9abde182
+size 2359296
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.query_key_value.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.query_key_value.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ef2961811f9b09add00d1acf68bbb0d983b2a266415bcf4167c108cf0af2b162
+size 9216
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.query_key_value.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.attention.query_key_value.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:678d7bdfff7a580935c141a3c8d56121925ad5688ca4839150f7c38aa07b3816
+size 7077888
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.input_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.input_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:22ee83ada70710e37e74558e042db460e1264b2373ad2272891f8681dd32992f
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.input_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.input_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea7f1c9586a26d8b4c7db214c5e036691d24a6aaa0700434e34fb89789e8413d
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_4h_to_h.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_4h_to_h.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ea6d9660941b2258411fa5a95bc2efae46f9b37b2c2677dbfd15fc4bfa97b7d8
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_4h_to_h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:df394855b5b83e6cef0b009b66d75bc6ad9536f63ea81c118821269ddb389811
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_h_to_4h.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:970ccc870e06e091619acffcd71eec44e61ea4e38490094850dea049a55ee4d4
+size 12288
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.mlp.dense_h_to_4h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:669ab6458f06db05c0310bf04a4c6681b00faa5ad498d144552ddc8b0ebcad1c
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.post_attention_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.post_attention_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d77f19d5997955ea3d639595ca6d0a1d94f16eb0a54e6ed9256d151e84a32160
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.post_attention_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.1.post_attention_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a208323db32e45ec21cb6fa92754862a2d8db9dc32a148ede730567e1b53ea6d
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.dense.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.dense.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9a991d95e2e3a0c716990227c3fca9617a3762963404bd1bbb1649d6031476d7
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.dense.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.dense.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0948b6eb35b6e388e7ad4cdf315abdf943960d7952dfebbebb4897023cc9a278
+size 2359296
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.query_key_value.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.query_key_value.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4755cff515205a9072c5f00b4e592939e5359f7b535d447bc1f7da44830ce011
+size 9216
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.query_key_value.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.attention.query_key_value.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:4b52cdfe8cd07b6f604cd3d848c7ab44efaa5a260441b7da057c42488992b3d6
+size 7077888
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.input_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.input_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:21b95c5b59d5a99b5f833825f620ddde882036901fda433046d6047a2ad90b51
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.input_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.input_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e927894a15e7a284be51655cb46338d98d4ca77378d9f865dc00707ae62758b
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_4h_to_h.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_4h_to_h.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:dbf8db67c06bf5b4bb5060f3ace4e5a84ddcc07dea1efc631b93a1244916da4f
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_4h_to_h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7f1d8f3a25a663c5a16d2f390c6db233c808ba22a5617a057f6a0d3350d70260
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_h_to_4h.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:85917a5c6f37da43778c01ddd45b5a444e55265cb17c9156044545c8b378fbbb
+size 12288
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.mlp.dense_h_to_4h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:291116b1849ae751a730ebd574548936105cff112b22e1b21a3830640f2b3b6e
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.post_attention_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.post_attention_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:aedc1896553c1017fcbfa8672ba0ce8a26968f288009904e1e91966e6bb90518
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.post_attention_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.10.post_attention_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d0447f6a6765688512b646eccc2fbb742b273d7f6ba14568a813b1e6d13d4d2
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.dense.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.dense.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e9abd3af37bf3a976fa9a6c2fd499e1e852fa8e6f80af55e868dbcc31b92c6cc
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.dense.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.dense.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b249f47f438a6a13ac0b10f42308dc9c84cf53bab9300129140c56d44d8d4296
+size 2359296
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.query_key_value.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.query_key_value.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e13dee33e6ca1356ef5415df8a755832ca0a4d546256ac9beafc954011402756
+size 9216
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.query_key_value.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.attention.query_key_value.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0a14a2b3a7f763772f4401da40419fa7cc1caae2cd7bf416ce9b1ff78501954a
+size 7077888
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.input_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.input_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bf6231f0316632a83598eee3c9b2dedd42f6086f9aa9948ffb2c2575d42f855b
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.input_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.input_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:358702cdee17d0d06a44e5d4915f0cc4acd75bbe88e678d1d87e1034a823e24d
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_4h_to_h.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_4h_to_h.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f10daf518dd3b8456cbbfcf0e97df875ac9d818559e29ffbc04b6aae9b00933f
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_4h_to_h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_4h_to_h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ed18d4ed924f201ef49ba8c84699c81f9bfb6b470f3f58c3eda58e97a6acfb12
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_h_to_4h.bias.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_h_to_4h.bias.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f7240fb1faaa2309b82123f827715f6294b408e97d688d7fc514d7ce14910b8a
+size 12288
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_h_to_4h.weight.0.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.mlp.dense_h_to_4h.weight.0.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:511d7ea871bb241619c99331ac746dceb467be2eb690031bbaec19b9b747c337
+size 9437184
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.post_attention_layernorm.bias.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.post_attention_layernorm.bias.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d35391ad45043b104204e9f80c538bf30e17e538fc38b171b37d87f2e368b517
+size 3072
--- a/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.post_attention_layernorm.weight.bin
+++ b/converter/tests/gptneox/testdata/fastertransformer/1/1-gpu/model.layers.11.post_attention_layernorm.weight.bin
@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b0b7d344fd44e6b84f9e01c13493425a1d4d64f1b05360b2af3d62a624e0f56d
+size 3072
--- a/Show More
+++ b/Show More