From 0f72788d82dcd5694dbb532155e2d5285008e037 Mon Sep 17 00:00:00 2001
From: Meng Zhang
Date: Mon, 12 Jun 2023 16:14:12 -0700
Subject: [PATCH] refactor: remove unused code

---
 tabby/tools/analytic/main.sh                  |  74 -----
 tabby/tools/build_dataset/__main__.py         | 126 ---------
 tabby/tools/build_dataset/args.py             |  14 -
 tabby/tools/build_dataset/metrics.py          |  27 --
 ...gramming-languages-to-file-extensions.json |  28 --
 .../converter/huggingface_gptj_convert.py     | 181 -------------
 .../converter/huggingface_gptneox_convert.py  | 256 ------------------
 tabby/tools/download_models.py                |  47 ----
 tabby/tools/repository/Makefile               |  11 -
 tabby/tools/repository/testdata/config.toml   |   4 -
 .../testdata/repositories/.gitignore          |   1 -
 .../repository/testdata/repositories/.gitkeep |   0
 tabby/tools/repository/updater.py             |  43 ---
 13 files changed, 812 deletions(-)
 delete mode 100755 tabby/tools/analytic/main.sh
 delete mode 100644 tabby/tools/build_dataset/__main__.py
 delete mode 100644 tabby/tools/build_dataset/args.py
 delete mode 100644 tabby/tools/build_dataset/metrics.py
 delete mode 100644 tabby/tools/build_dataset/programming-languages-to-file-extensions.json
 delete mode 100644 tabby/tools/converter/huggingface_gptj_convert.py
 delete mode 100644 tabby/tools/converter/huggingface_gptneox_convert.py
 delete mode 100644 tabby/tools/download_models.py
 delete mode 100644 tabby/tools/repository/Makefile
 delete mode 100644 tabby/tools/repository/testdata/config.toml
 delete mode 100644 tabby/tools/repository/testdata/repositories/.gitignore
 delete mode 100644 tabby/tools/repository/testdata/repositories/.gitkeep
 delete mode 100644 tabby/tools/repository/updater.py

diff --git a/tabby/tools/analytic/main.sh b/tabby/tools/analytic/main.sh
deleted file mode 100755
index bb6bdd1..0000000
--- a/tabby/tools/analytic/main.sh
+++ /dev/null
@@ -1,74 +0,0 @@
-#!/bin/bash
-set -e
-
-DB_FILE=${DB_FILE:-"/data/logs/duckdb/duck.db"}
-LOGS_DIR=${LOGS_DIR:-"/data/logs"}
-TABBY_SERVER_LOGS="${LOGS_DIR}/events/tabby-server/*.json"
-
-# Init schema
-function init_scheme() {
-mkdir -p $(dirname $DB_FILE)
-cat << EOF | duckdb
-CREATE TABLE IF NOT EXISTS completion_events (...);
-EOF
-}
-
-function completion_events() {
-if ls ${TABBY_SERVER_LOGS} &> /dev/null; then
-
-cat << EOF | duckdb
-CREATE TEMP TABLE t AS
-SELECT lhs.*, rhs.view, rhs.select
-FROM events lhs
-LEFT JOIN (
-  SELECT
-    completion_id,
-    (SUM(IF(type == 'view', 1, 0)) > 0) AS view,
-    (SUM(IF(type == 'select', 1, 0)) > 0) AS select
-  FROM events
-  WHERE completion_id IS NOT NULL
-  GROUP BY 1
-) rhs ON (lhs.id = rhs.completion_id);
-
-INSERT INTO completion_events SELECT t.* FROM t LEFT JOIN completion_events rhs ON (t.id = rhs.id) WHERE rhs.id IS NULL;
-EOF
-
-else
-  echo "No files match ${TABBY_SERVER_LOGS}"
-fi
-}
-
-function duckdb() {
-  local SQL=$(tee)
-  cat << EOF | python3 -
-import sys
-import duckdb
-conn = duckdb.connect('$DB_FILE')
-print(conn.sql("""
-$SQL
-"""))
-EOF
-}
-
-init_scheme
-"$@"
diff --git a/tabby/tools/build_dataset/__main__.py b/tabby/tools/build_dataset/__main__.py
deleted file mode 100644
index e76cf5f..0000000
--- a/tabby/tools/build_dataset/__main__.py
+++ /dev/null
@@ -1,126 +0,0 @@
-import base64
-import glob
-import json
-import os
-
-import pandas as pd
-from datasets import Dataset
-from transformers import HfArgumentParser
-
-from . 
import metrics -from .args import PreprocessProjectArgs - - -def parse_args(): - parser = HfArgumentParser(PreprocessProjectArgs) - return parser.parse_args() - - -def read_languages_to_file_extensions(): - path = os.path.abspath(os.path.dirname(__file__)) - path = os.path.join(path, "programming-languages-to-file-extensions.json") - with open(path) as f: - return json.load(f) - - -def read_valid_extensions(): - content = read_languages_to_file_extensions() - extensions = [] - for k, exts in content.items(): - extensions += exts - return set(extensions) - - -def read_extension_to_language_mappings(): - content = read_languages_to_file_extensions() - mappings = dict() - for k, exts in content.items(): - for x in exts: - mappings[x] = k - return mappings - - -def dataset_iter(project_dir, files): - def gen(): - mappings = read_extension_to_language_mappings() - for x in files: - _, extname = os.path.splitext(x) - - with open(x) as f: - try: - content = f.read() - except UnicodeDecodeError: - print("Cannot decode unicode", x) - continue - - segments = x.removeprefix(project_dir).split(os.sep) - project = segments[1] - file = os.path.join(*segments[2:]) - yield dict( - id=to_id(project, file), - project=project, - file=file, - language=mappings[extname], - content=content, - **metrics.compute(content), - ) - - return gen - - -def count_by_language(dataset): - key = "language" - df = ( - pd.DataFrame(dataset[key], columns=[key]) - .groupby([key]) - .size() - .to_frame("count") - ) - return df - - -def to_id(*args): - token = ":".join(args) - return base64.urlsafe_b64encode(token.encode("utf-8")).decode("utf-8").rstrip("=") - - -def basic_filters(line_max=100, line_mean=100, alpha_frac=0.25): - def fn(example): - """Filter files based on line length and % alphanumeric characters""" - if example["max_line_length"] > line_max: - return False - elif example["avg_line_length"] > line_mean: - return False - elif example["alphanum_fraction"] < alpha_frac: - return False - return True - - return fn - - -if __name__ == "__main__": - valid_extensions = read_valid_extensions() - - def is_valid_file(x): - if not os.path.isfile(x): - return False - - _, extname = os.path.splitext(x) - if not extname in valid_extensions: - return False - - return True - - args = parse_args() - files = list( - filter(is_valid_file, glob.glob(args.project_dir + "/**/*", recursive=True)) - ) - - ds = Dataset.from_generator(dataset_iter(os.path.abspath(args.project_dir), files)) - ds = ds.filter(basic_filters()) - ds.save_to_disk(args.output_dir) - ds.to_json(os.path.join(args.output_dir, "dumps.json")) - - print("\n## Summary") - print("Number of source files", len(ds)) - print("Number of source files by languages", count_by_language(ds).to_json()) diff --git a/tabby/tools/build_dataset/args.py b/tabby/tools/build_dataset/args.py deleted file mode 100644 index 99c4af3..0000000 --- a/tabby/tools/build_dataset/args.py +++ /dev/null @@ -1,14 +0,0 @@ -from dataclasses import dataclass, field -from typing import Optional - - -@dataclass -class PreprocessProjectArgs: - # add arguments in the following format - project_dir: Optional[str] = field( - metadata={"help": "Project directory."}, - ) - - output_dir: Optional[str] = field( - metadata={"help": "Output save path directory."}, - ) diff --git a/tabby/tools/build_dataset/metrics.py b/tabby/tools/build_dataset/metrics.py deleted file mode 100644 index 8fd82ab..0000000 --- a/tabby/tools/build_dataset/metrics.py +++ /dev/null @@ -1,27 +0,0 @@ -def max_line_length(content): - return 
max([0] + [len(x) for x in content.splitlines()]) - - -def avg_line_length(content): - lines = [len(x) for x in content.splitlines()] - total = sum(lines) - if len(lines) != 0: - return total / len(lines) - else: - return 0 - - -def alphanum_fraction(content): - alphanum = [x for x in content if x.isalpha() or x.isnumeric()] - if len(content) != 0: - return len(alphanum) / len(content) - else: - return 0 - - -def compute(content): - return dict( - max_line_length=max_line_length(content), - avg_line_length=avg_line_length(content), - alphanum_fraction=alphanum_fraction(content), - ) diff --git a/tabby/tools/build_dataset/programming-languages-to-file-extensions.json b/tabby/tools/build_dataset/programming-languages-to-file-extensions.json deleted file mode 100644 index 721615f..0000000 --- a/tabby/tools/build_dataset/programming-languages-to-file-extensions.json +++ /dev/null @@ -1,28 +0,0 @@ -{ - "c": [".c", ".h"], - "csharp": [".cs"], - "cpp": [".cpp", ".hpp", ".c++", ".h++", ".cc", ".hh", ".C", ".H"], - "css": [".css"], - "dockerfile": ["Dockerfile"], - "go": [".go"], - "haskell": [".hs"], - "html": [".html"], - "java": [".java"], - "javascript": [".js"], - "julia": [".jl"], - "lua": [".lua"], - "makefile": ["Makefile"], - "markdown": [".md", ".markdown"], - "php": [".php", ".php3", ".php4", ".php5", ".phps", ".phpt"], - "perl": [".pl", ".pm", ".pod", ".perl"], - "powershell": [".ps1", ".psd1", ".psm1"], - "python": [".py"], - "ruby": [".rb"], - "rust": [".rs"], - "sql": [".sql"], - "scala": [".scala"], - "shellscript": [".sh", ".bash", ".command", ".zsh"], - "typescript": [".ts", ".tsx"], - "tex": [".tex"], - "vb": [".vb"] -} diff --git a/tabby/tools/converter/huggingface_gptj_convert.py b/tabby/tools/converter/huggingface_gptj_convert.py deleted file mode 100644 index ddd568e..0000000 --- a/tabby/tools/converter/huggingface_gptj_convert.py +++ /dev/null @@ -1,181 +0,0 @@ -import configparser -from argparse import ArgumentParser -from os import makedirs -from pathlib import Path - -import numpy as np -import torch -from transformers import PretrainedConfig - -torch.set_printoptions(linewidth=130, sci_mode=False) -np.set_printoptions(linewidth=130, suppress=True) - -# This converter is used to convert the huggingface gpt-j-6B model -# in https://huggingface.co/EleutherAI/gpt-j-6B/blob/main/pytorch_model.bin. - - -def savebin(param, save_path): - if isinstance(param, torch.Tensor): - param = param.cpu().float().numpy() - np.squeeze(param).astype(np.float16).tofile(save_path + ".bin") - - -def param2file(pt_param, layer_id, save_dir, dest_key): - base_n = save_dir + "/model.layers." + str(layer_id) + "." - save_path = base_n + dest_key - savebin(pt_param, save_path) - - -def param2distributed( - pt_param, - layer_id, - save_dir, - dest_key, - n_inference_gpus, - split_axis, -): - np_param = pt_param.cpu().float().numpy() - base_n = save_dir + "/model.layers." + str(layer_id) + "." - save_path = base_n + dest_key - split_param = np.split(np_param, n_inference_gpus, axis=split_axis) - for i, p in enumerate(split_param): - savebin(p, save_path + f".{i}") - - -def save(w, save_dir, n_inference_gpus, n_layers, layer_id): - makedirs(save_dir, exist_ok=True) - - savebin(w["transformer.wte.weight"], save_dir + "/model.wte") - l = layer_id - print(f"Saving layer {l + 1} / {n_layers}") - base_k = "transformer.h." + str(l) + "." 
-    param2file(w[base_k + "ln_1.bias"], l, save_dir, "input_layernorm.bias")
-    param2file(w[base_k + "ln_1.weight"], l, save_dir, "input_layernorm.weight")
-    param2distributed(
-        w[base_k + "mlp.fc_in.weight"].T,
-        l,
-        save_dir,
-        "mlp.dense_h_to_4h.weight",
-        n_inference_gpus,
-        split_axis=-1,  # split fast index
-    )
-    param2distributed(
-        w[base_k + "mlp.fc_in.bias"],
-        l,
-        save_dir,
-        "mlp.dense_h_to_4h.bias",
-        n_inference_gpus,
-        split_axis=-1,  # split fast index
-    )
-
-    param2distributed(
-        w[base_k + "mlp.fc_out.weight"].T,
-        l,
-        save_dir,
-        "mlp.dense_4h_to_h.weight",
-        n_inference_gpus,
-        split_axis=0,  # split slow index
-    )
-    param2file(w[base_k + "mlp.fc_out.bias"], l, save_dir, "mlp.dense_4h_to_h.bias")
-    param2distributed(
-        w[base_k + "attn.out_proj.weight"].T,
-        l,
-        save_dir,
-        "attention.dense.weight",
-        n_inference_gpus,
-        split_axis=0,  # split slow index
-    )
-    QKV_w = torch.stack(
-        [
-            w[base_k + "attn.q_proj.weight"],
-            w[base_k + "attn.k_proj.weight"],
-            w[base_k + "attn.v_proj.weight"],
-        ]
-    )  # [qkv, n_heads * dim_head, latent_space]
-    QKV_w = QKV_w.permute(2, 0, 1)
-    param2distributed(
-        QKV_w,
-        l,
-        save_dir,
-        "attention.query_key_value.weight",
-        n_inference_gpus,
-        split_axis=-1,  # split fast index
-    )
-    # Other unneeded per-layer params:
-    # attn.attention.masked_bias = torch.tensor(-1e9)
-    # attn.attention.bias = torch.tril(torch.ones(1, 1, 2048, 2048))
-
-
-if __name__ == "__main__":
-    parser = ArgumentParser(
-        description="Convert GPT-J slim checkpoint to FasterTransformer",
-    )
-    parser.add_argument(
-        "--output-dir",
-        help="Folder where binary files are stored",
-    )
-    parser.add_argument(
-        "--ckpt-dir",
-        help="Folder of the GPT-J huggingface checkpoint",
-    )
-    parser.add_argument(
-        "--n-inference-gpus",
-        help="Number of GPUs used for inference runtime",
-        default=1,
-        type=int,
-    )
-    args = parser.parse_args()
-
-    ckpt_file = args.ckpt_dir + "/pytorch_model.bin"
-    checkpoint = torch.load(ckpt_file)
-    print(f"loading from {ckpt_file}")
-
-    out_path = args.output_dir
-    output_dir = out_path + f"/{args.n_inference_gpus}-gpu/"
-    print(f"saving to {output_dir}")
-
-    config_file = args.ckpt_dir + "/config.json"
-    hf_config = PretrainedConfig.from_json_file(config_file).to_dict()
-
-    # NOTE: save parameters to config files (loaded by triton backends)
-    config = configparser.ConfigParser()
-    config["gptj"] = {}
-    try:
-        config["gptj"]["model_name"] = (
-            "gptj" if hf_config["_name_or_path"] == "" else hf_config["_name_or_path"]
-        )
-        config["gptj"]["head_num"] = str(hf_config["n_head"])
-        n_embd = hf_config["n_embd"]
-        config["gptj"]["size_per_head"] = str(n_embd // hf_config["n_head"])
-        config["gptj"]["inter_size"] = str(n_embd * 4)
-        config["gptj"]["num_layer"] = str(hf_config["n_layer"])
-        rotary_dim = (
-            n_embd // hf_config["n_head"]
-            if hf_config["rotary_dim"] is None
-            else hf_config["rotary_dim"]
-        )
-        config["gptj"]["rotary_embedding"] = str(rotary_dim)
-        config["gptj"]["vocab_size"] = str(hf_config["vocab_size"])
-        config["gptj"]["start_id"] = str(hf_config["bos_token_id"])
-        config["gptj"]["end_id"] = str(hf_config["eos_token_id"])
-        config["gptj"]["weight_data_type"] = "fp16"
-        Path(output_dir).mkdir(exist_ok=True, parents=True)
-        with open(output_dir + "/config.ini", "w") as configfile:
-            config.write(configfile)
-    except Exception:
-        print("Failed to save the config in config.ini.")
-
-    n_layers = hf_config["n_layer"]
-    for i in range(n_layers):
-        save(checkpoint, output_dir, args.n_inference_gpus, n_layers, i)
-    savebin(
checkpoint["transformer.ln_f.weight"], - output_dir + "/model.final_layernorm.weight", - ) - savebin( - checkpoint["transformer.ln_f.bias"], output_dir + "/model.final_layernorm.bias" - ) - savebin(checkpoint["lm_head.weight"], output_dir + "/model.lm_head.weight") - savebin(checkpoint["lm_head.bias"], output_dir + "/model.lm_head.bias") - - print("done") diff --git a/tabby/tools/converter/huggingface_gptneox_convert.py b/tabby/tools/converter/huggingface_gptneox_convert.py deleted file mode 100644 index 636b669..0000000 --- a/tabby/tools/converter/huggingface_gptneox_convert.py +++ /dev/null @@ -1,256 +0,0 @@ -import argparse -import configparser -import multiprocessing -import os -from pathlib import Path - -import numpy as np -import torch -from transformers import GPTNeoXForCausalLM - - -def get_weight_data_type(data_type): - if data_type == "fp32": - return np.float32 - elif data_type == "fp16": - return np.float16 - else: - assert False, f"Invalid weight data type {data_type}" - - -def split_and_convert_process(saved_dir, factor, key, args, config, val): - - if ( - key.find("input_layernorm.weight") != -1 - or key.find("input_layernorm.bias") != -1 - or key.find("post_attention_layernorm.weight") != -1 - or key.find("post_attention_layernorm.bias") != -1 - or key.find("final_layernorm.weight") != -1 - or key.find("final_layernorm.bias") != -1 - ): - saved_path = saved_dir + f"/model.{key}.bin" - val.tofile(saved_path) - - elif ( - key.find("attention.dense.bias") != -1 - or key.find("mlp.dense_4h_to_h.bias") != -1 - ): - saved_path = saved_dir + f"/model.{key}.bin" - val = (val / factor) if factor > 1 else val - val.tofile(saved_path) - - else: - if ( - key.find("attention.dense.weight") != -1 - or key.find("mlp.dense_4h_to_h.weight") != -1 - ): - split_vals = np.split(val, factor, axis=0) - - elif ( - key.find("mlp.dense_h_to_4h.weight") != -1 - or key.find("mlp.dense_h_to_4h.bias") != -1 - ): - split_vals = np.split(val, factor, axis=-1) - - elif key.find("attention.query_key_value.bias") != -1: - local_dim = (int)(val.shape[-1] / 3) - n_head = config["num_attention_heads"] - - val = val.reshape(n_head, 3, local_dim // n_head) - val = np.transpose(val, [1, 0, 2]).reshape(3, local_dim) - split_vals = np.split(val, factor, axis=-1) - - elif key.find("attention.query_key_value.weight") != -1: - hidden_dim = val.shape[0] - local_dim = (int)(val.shape[-1] / 3) - n_head = config["num_attention_heads"] - # Note that the HF qkv weight are stored as [hidden_size, num_heads, 3, head_hidden] - # FT needs the shape of [hidden_size, 3, num_heads, head_hidden] - val = val.reshape(hidden_dim, n_head, 3, local_dim // n_head) - val = np.transpose(val, [0, 2, 1, 3]).reshape(hidden_dim, 3, local_dim) - - # print(np.mean(np.abs(val[:, 0, :]))) - split_vals = np.split(val, factor, axis=-1) - - else: - print("[ERROR] cannot find key '{}'".format(key)) - return - - for j in range(factor): - saved_path = saved_dir + f"/model.{key}.{j}.bin" - split_vals[j].tofile(saved_path) - - -def split_and_convert(args): - saved_dir = args.saved_dir + "/%d-gpu/" % args.infer_gpu_num - - if os.path.exists(saved_dir) == False: - os.makedirs(saved_dir) - - factor = args.infer_gpu_num - - # load position_embedding from rank 0 - # model = torch.load(ckpt_name) - model = GPTNeoXForCausalLM.from_pretrained(args.in_file) - hf_config = vars(model.config) - - np_weight_data_type = get_weight_data_type(args.weight_data_type) - - try: - model_name = args.model_name - n_heads = hf_config["num_attention_heads"] - head_size = 
hf_config["hidden_size"] // n_heads - rotary_dim = int(head_size * hf_config["rotary_pct"]) - use_gptj_residual = int(hf_config["use_parallel_residual"]) - - config = configparser.ConfigParser() - config["gptneox"] = {} - config["gptneox"]["model_name"] = model_name - config["gptneox"]["head_num"] = str(n_heads) - config["gptneox"]["size_per_head"] = str(head_size) - config["gptneox"]["inter_size"] = str(hf_config["intermediate_size"]) - config["gptneox"]["num_layer"] = str(hf_config["num_hidden_layers"]) - config["gptneox"]["rotary_embedding"] = str(rotary_dim) - config["gptneox"]["vocab_size"] = str(hf_config["vocab_size"]) - config["gptneox"]["start_id"] = str(hf_config["bos_token_id"]) - config["gptneox"]["end_id"] = str(hf_config["eos_token_id"]) - config["gptneox"]["use_gptj_residual"] = str(use_gptj_residual) - config["gptneox"]["weight_data_type"] = args.weight_data_type - - with open((Path(saved_dir) / f"config.ini").as_posix(), "w") as configfile: - config.write(configfile) - except Exception as e: - print(f"Fail to save the config in config.ini.", e) - - ft_model_name_pattern = [ - "input_layernorm.bias", - "input_layernorm.weight", - "attention.query_key_value.bias", - "attention.query_key_value.weight", - "attention.dense.bias", - "attention.dense.weight", - "post_attention_layernorm.bias", - "post_attention_layernorm.weight", - "mlp.dense_h_to_4h.bias", - "mlp.dense_h_to_4h.weight", - "mlp.dense_4h_to_h.bias", - "mlp.dense_4h_to_h.weight", - ] - - huggingface_model_file_list = [ - hf_file_name - for hf_file_name in os.listdir(args.in_file) - if hf_file_name.endswith(".bin") - ] - if len(huggingface_model_file_list) > 1: - multiprocessing_context = multiprocessing.get_context() - pool_fn = multiprocessing_context.Pool - else: - torch.multiprocessing.set_start_method("spawn") - pool_fn = multiprocessing.Pool - - pool = pool_fn(args.processes) - - for name, param in model.named_parameters(): - array = param.detach().cpu().numpy().astype(np_weight_data_type) - # print("input shape", name, array.shape) - if name.find("weight") == -1 and name.find("bias") == -1: - print("skipped", name) - continue - elif name == "gpt_neox.embed_in.weight": - array.tofile(saved_dir + "model.wte.bin") - elif name == "gpt_neox.final_layer_norm.bias": - array.tofile(saved_dir + "model.final_layernorm.bias.bin") - elif name == "gpt_neox.final_layer_norm.weight": - array.tofile(saved_dir + "model.final_layernorm.weight.bin") - elif name == "embed_out.weight": - array.tofile(saved_dir + "model.lm_head.weight.bin") - else: - processed = False - for i in range(len(ft_model_name_pattern)): - if name.find(ft_model_name_pattern[i]) != -1: - new_name = name.replace("gpt_neox.", "") - pool.starmap( - split_and_convert_process, - [ - ( - saved_dir, - factor, - new_name, - args, - vars(model.config), - array.T, - ) - ], - ) - processed = True - break - - if not processed: - print("Unused layer", name) - - pool.close() - pool.join() - - # Post-process biases if use_gptj_residual is True - if use_gptj_residual: - for layer_idx in range(hf_config["num_hidden_layers"]): - attn_bias = np.fromfile( - saved_dir + f"/model.layers.{layer_idx}.attention.dense.bias.bin", - dtype=np_weight_data_type, - ) - mlp_bias = np.fromfile( - saved_dir + f"/model.layers.{layer_idx}.mlp.dense_4h_to_h.bias.bin", - dtype=np_weight_data_type, - ) - - (attn_bias + mlp_bias).astype(np_weight_data_type).tofile( - saved_dir + f"/model.layers.{layer_idx}.mlp.attention.bias.sum.bin" - ) - - -if __name__ == "__main__": - parser = 
argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter)
-    parser.add_argument(
-        "-saved_dir", "-o", type=str, help="directory name of the output files", required=True
-    )
-    parser.add_argument(
-        "-in_file",
-        "-i",
-        type=str,
-        help="directory name of the input (huggingface) checkpoint",
-        required=True,
-    )
-    parser.add_argument(
-        "-infer_gpu_num",
-        "-i_g",
-        type=int,
-        help="How many gpus for inference",
-        required=True,
-    )
-    parser.add_argument(
-        "-processes",
-        "-p",
-        type=int,
-        help="How many processes to spawn for conversion (default: 4)",
-        default=4,
-    )
-    parser.add_argument(
-        "-weight_data_type", type=str, default="fp32", choices=["fp32", "fp16"]
-    )
-    parser.add_argument(
-        "-model_name", "-m_n", type=str, help="model name", required=True
-    )
-
-    args = parser.parse_args()
-    print("\n=============== Argument ===============")
-    for key in vars(args):
-        print("{}: {}".format(key, vars(args)[key]))
-    print("========================================")
-
-    target_dir_path = os.path.join(args.saved_dir, "%d-gpu" % args.infer_gpu_num)
-    assert not os.path.exists(target_dir_path), (
-        "target path already exists, please remove %s first." % target_dir_path
-    )
-
-    split_and_convert(args)
diff --git a/tabby/tools/download_models.py b/tabby/tools/download_models.py
deleted file mode 100644
index 26a065b..0000000
--- a/tabby/tools/download_models.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from dataclasses import dataclass, field
-
-from huggingface_hub import snapshot_download
-from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
-
-
-@dataclass
-class Arguments:
-    repo_id: str = field(
-        metadata={"help": "Huggingface model repository id, e.g. TabbyML/NeoX-160M"}
-    )
-    prefer_local_files: bool = field(
-        metadata={
-            "help": "Whether to prefer loading local files (skip the remote version check if local files are valid)."
- }, - default=True, - ) - - -def parse_args(): - parser = HfArgumentParser(Arguments) - return parser.parse_args() - - -def preload(local_files_only=False): - AutoTokenizer.from_pretrained(args.repo_id, local_files_only=local_files_only) - AutoModelForCausalLM.from_pretrained( - args.repo_id, local_files_only=local_files_only - ) - snapshot_download( - repo_id=args.repo_id, - allow_patterns="triton/**/*", - local_files_only=local_files_only, - ) - - -if __name__ == "__main__": - args = parse_args() - print(f"Loading {args.repo_id}, this will take a while...") - try: - preload(local_files_only=args.prefer_local_files) - except Exception as e: - if "offline" in str(e) or "local_files_only" in str(e): - preload(local_files_only=False) - else: - raise e - print(f"Loaded {args.repo_id} !") diff --git a/tabby/tools/repository/Makefile b/tabby/tools/repository/Makefile deleted file mode 100644 index 94d910e..0000000 --- a/tabby/tools/repository/Makefile +++ /dev/null @@ -1,11 +0,0 @@ -all: - -CMD := poetry run python updater.py --data_dir=./testdata/repositories --config_file=testdata/config.toml - -test: - # Cleanup - rm -rf testdata/repositories/quickjs - # Initialize - $(CMD) - # Updating - $(CMD) diff --git a/tabby/tools/repository/testdata/config.toml b/tabby/tools/repository/testdata/config.toml deleted file mode 100644 index dc681fe..0000000 --- a/tabby/tools/repository/testdata/config.toml +++ /dev/null @@ -1,4 +0,0 @@ -data_dir = "/repositories" - -[repositories.quickjs] -url = "https://gitee.com/vsf-linux/quickjs.git" diff --git a/tabby/tools/repository/testdata/repositories/.gitignore b/tabby/tools/repository/testdata/repositories/.gitignore deleted file mode 100644 index 0d17186..0000000 --- a/tabby/tools/repository/testdata/repositories/.gitignore +++ /dev/null @@ -1 +0,0 @@ -quickjs diff --git a/tabby/tools/repository/testdata/repositories/.gitkeep b/tabby/tools/repository/testdata/repositories/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tabby/tools/repository/updater.py b/tabby/tools/repository/updater.py deleted file mode 100644 index 6060500..0000000 --- a/tabby/tools/repository/updater.py +++ /dev/null @@ -1,43 +0,0 @@ -import os -import pathlib -import shutil -from dataclasses import dataclass, field - -import toml -from git import Repo -from transformers import HfArgumentParser - - -@dataclass -class Arguments: - data_dir: str = field(metadata={"help": "Base dir for repositories"}) - config_file: str = field(metadata={"help": "Configuration file for tabby updater"}) - - -def parse_args(): - parser = HfArgumentParser(Arguments) - return parser.parse_args() - - -if __name__ == "__main__": - args = parse_args() - - config = toml.load(args.config_file) - repositories = config["projects"] - - for x in pathlib.Path(args.data_dir).glob("*"): - if x.is_dir() and x.name not in repositories: - print("Remove unused dir:", x) - shutil.rmtree(str(x)) - elif x.is_file(): - print("Remove unused file:", x) - x.unlink() - - for name, config in repositories.items(): - path = pathlib.Path(args.data_dir, name) - if path.is_dir(): - repo = Repo(path) - else: - Repo.clone_from(config["git_url"], path.absolute(), depth=1) - - os.system(f"gitup {args.data_dir}")