feat: add model-converter [TAB-153] (#357)

* feat: add transformers-to-ctranslate * chore: resolve comments * chore: fix * chore: remove dotenv * chore: resolve comments * chore: lint * chore: change dir name
2023-08-17 22:29:20 +08:00 · 2023-08-17 22:29:20 +08:00 · b4381acfbf
parent 732d83feef
commit b4381acfbf
4 changed files with 136 additions and 0 deletions
--- a/experimental/model-converter/__init__.py
+++ b/experimental/model-converter/__init__.py
--- a/experimental/model-converter/args.py
+++ b/experimental/model-converter/args.py
@ -0,0 +1,27 @@
+import argparse
+
+
+def make_parser():
+    parser = argparse.ArgumentParser(
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter
+    )
+    parser.add_argument(
+        "--model",
+        required=True,
+        help=(
+            "Name of the pretrained model to download, "
+            "or path to a directory containing the pretrained model."
+        ),
+    )
+    parser.add_argument("--output_dir", required=True, help="Output model directory.")
+    parser.add_argument(
+        "--inference_mode",
+        required=True,
+        choices=["causallm", "seq2seq"],
+        help="Model inference mode. ",
+    )
+    parser.add_argument(
+        "--prompt_template", default=None, help="prompt template for fim"
+    )
+
+    return parser
--- a/experimental/model-converter/main.py
+++ b/experimental/model-converter/main.py
@ -0,0 +1,106 @@
+from args import make_parser
+import json
+import os
+import shutil
+
+from ctranslate2.converters.transformers import TransformersConverter
+from huggingface_hub import snapshot_download
+from transformers.convert_slow_tokenizers_checkpoints_to_fast import (
+    convert_slow_checkpoint_to_fast,
+)
+
+
+class InvalidConvertionException(Exception):
+    def __init__(self, *args: object) -> None:
+        super().__init__(*args)
+
+
+def convert_tokenizer():
+    if os.path.exists("./tokenizer.json"):
+        print("found tokenizer.json, skipping tokenizer conversion")
+        return
+
+    # Infer tokenizer name
+    if not os.path.isfile("tokenizer_config.json"):
+        raise InvalidConvertionException(
+            "cannot find tokenizer_config.json, unable to infer tokenizer name"
+        )
+
+    data = {}
+    with open("tokenizer_config.json", "r", encoding="utf-8") as f:
+        data = json.load(f)
+    tokenizer_name = data["tokenizer_class"]
+
+    convert_tmp_dir = "./convert_tmp"
+
+    # Start to convert
+    convert_slow_checkpoint_to_fast(
+        tokenizer_name=tokenizer_name,
+        checkpoint_name="./",
+        dump_path=convert_tmp_dir,
+        force_download=True,
+    )
+
+    # After successful conversion, copy file from ./convert_tmp to ./
+    for root, dirs, files in os.walk(convert_tmp_dir):
+        for f in files:
+            fpath = os.path.join(root, f)
+            shutil.copy2(fpath, "./")
+        for d in dirs:
+            dpath = os.path.join(root, d)
+            shutil.copy2(dpath, "./")
+    shutil.rmtree(convert_tmp_dir)
+
+
+def generate_tabby_json(args):
+    if os.path.exists("./tabby.json"):
+        print("found tabby.json, skipping tabby.json generation")
+        return
+
+    data = {}
+    data["auto_model"] = (
+        "AutoModelForCausalLM"
+        if args.inference_mode == "causallm"
+        else "AutoModelForSeq2SeqLM"
+    )
+    if args.prompt_template:
+        data["prompt_template"] = args.prompt_template
+    with open("tabby.json", "w", encoding="utf-8") as f:
+        json.dump(data, f, indent=4)
+
+
+def main():
+    # Set up args
+    parser = make_parser()
+
+    args = parser.parse_args()
+
+    # Check out model
+    model_path = snapshot_download(
+        repo_id=args.model,
+        cache_dir=args.output_dir,
+        force_download=False,
+    )
+
+    os.chdir(model_path)
+    convert_output_dir = os.path.join(model_path, "ctranslate2")
+
+    # Convert model into ctranslate
+    converter = TransformersConverter(
+        model_name_or_path=model_path,
+        load_as_float16=True,
+        trust_remote_code=True,
+    )
+    converter.convert(
+        output_dir=convert_output_dir, vmap=None, quantization="float16", force=True
+    )
+
+    # Convert model with fast tokenizer
+    convert_tokenizer()
+
+    # Generate tabby.json
+    generate_tabby_json(args)
+
+
+# Run the conversion only when this file is executed as a script.
+if __name__ == "__main__":
+    main()
--- a/experimental/model-converter/requirements.txt
+++ b/experimental/model-converter/requirements.txt
@ -0,0 +1,3 @@
+ctranslate2
+huggingface_hub
+transformers