diff --git a/.dockerignore b/.dockerignore index d0b1763..c54e39e 100644 --- a/.dockerignore +++ b/.dockerignore @@ -1,4 +1,3 @@ -testdata deployment **/.git diff --git a/Makefile b/Makefile index 1427b0f..ed870c6 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,5 @@ POETRY_EXISTS := $(shell which poetry &> /dev/null) PRE_COMMIT_HOOK := .git/hooks/pre-commit -LOCAL_MODEL := testdata/tiny-70M/models/fastertransformer/1 pre-commit: poetry run pre-commit @@ -14,12 +13,6 @@ endif $(PRE_COMMIT_HOOK): poetry run pre-commit install --install-hooks -$(LOCAL_MODEL): - poetry run python -m tabby.tools.huggingface_gptneox_convert \ - -in_file EleutherAI/pythia-70m-deduped \ - -o $@ \ - -i_g 1 -m_n tiny-70M -p 1 -w fp16 - setup-development-environment: install-poetry $(PRE_COMMIT_HOOK) @@ -29,11 +22,11 @@ DEV_FLAGS := $(UP_FLAGS) --build up: docker-compose -f deployment/docker-compose.yml $(UP_FLAGS) -up-triton: $(LOCAL_MODEL) +up-triton: docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.triton.yml $(UP_FLAGS) dev: docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.dev.yml $(DEV_FLAGS) -dev-triton: $(LOCAL_MODEL) +dev-triton: docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.triton.yml -f deployment/docker-compose.dev.yml $(DEV_FLAGS) diff --git a/deployment/docker-compose.dev.yml b/deployment/docker-compose.dev.yml index b8752fa..3844d3e 100644 --- a/deployment/docker-compose.dev.yml +++ b/deployment/docker-compose.dev.yml @@ -1,11 +1,13 @@ version: '3.3' services: - server: + init: build: context: .. args: PYPI_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple + + server: command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000 --reload volumes: - ../:/app diff --git a/deployment/docker-compose.triton.yml b/deployment/docker-compose.triton.yml index a4eb427..1ee705d 100644 --- a/deployment/docker-compose.triton.yml +++ b/deployment/docker-compose.triton.yml @@ -5,23 +5,18 @@ services: image: tabbyml/tabby environment: - MODEL_BACKEND=triton - - TRITON_TOKENIZER_NAME=EleutherAI/pythia-70m-deduped - links: - - triton - - admin: - links: - - triton - - + depends_on: + triton: + condition: service_healthy triton: image: tabbyml/fastertransformer_backend container_name: tabby-triton - command: mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model + command: /scripts/triton.sh shm_size: 1gb volumes: - - ../testdata/tiny-70M/models:/model + - ./scripts:/scripts + - ./data/hf_cache:/root/.cache/huggingface deploy: resources: reservations: @@ -29,3 +24,13 @@ services: - driver: nvidia count: all capabilities: [gpu] + environment: + - MODEL_NAME=TabbyML/NeoX-70M + depends_on: + init: + condition: service_completed_successfully + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8002/metrics"] + interval: 2s + timeout: 2s + start_period: 2s diff --git a/deployment/docker-compose.yml b/deployment/docker-compose.yml index 9d849c9..44fa7b2 100644 --- a/deployment/docker-compose.yml +++ b/deployment/docker-compose.yml @@ -1,18 +1,28 @@ version: '3.3' services: + init: + image: tabbyml/tabby + container_name: tabby-init + command: python -m tabby.tools.model_preload --repo_id TabbyML/NeoX-70M --backend python + volumes: + - ./data/hf_cache:/root/.cache/huggingface + server: image: tabbyml/tabby container_name: tabby-server command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000 environment: - - PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped + - MODEL_NAME=TabbyML/NeoX-70M - EVENTS_LOG_DIR=/logs ports: - "5000:5000" volumes: - ./data/logs/tabby-server:/logs - ./data/hf_cache:/root/.cache/huggingface + depends_on: + init: + condition: service_completed_successfully admin: image: tabbyml/tabby @@ -20,9 +30,6 @@ services: command: streamlit run tabby/admin/Home.py ports: - "8501:8501" - links: - - server - - vector vector: image: timberio/vector:0.28.1-alpine diff --git a/deployment/scripts/triton.sh b/deployment/scripts/triton.sh new file mode 100755 index 0000000..c86f3eb --- /dev/null +++ b/deployment/scripts/triton.sh @@ -0,0 +1,18 @@ +#!/bin/bash +set -e + +# Get model dir. +MODEL_DIR=$(python3 < List[Choice]: input_ids = self.tokenizer.encode(request.prompt, return_tensors="pt") diff --git a/tabby/tools/model_preload.py b/tabby/tools/model_preload.py new file mode 100644 index 0000000..a010cf8 --- /dev/null +++ b/tabby/tools/model_preload.py @@ -0,0 +1,43 @@ +from dataclasses import dataclass, field + +from huggingface_hub import snapshot_download +from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser + + +@dataclass +class Arguments: + repo_id: str = field( + metadata={"help": "Huggingface model repository id, e.g TabbyML/NeoX-160M"} + ) + backend: str = "python" + prefer_local_files: bool = True + + +def parse_args(): + parser = HfArgumentParser(Arguments) + return parser.parse_args() + + +def preload(local_files_only=False): + AutoTokenizer.from_pretrained(args.repo_id, local_files_only=local_files_only) + AutoModelForCausalLM.from_pretrained( + args.repo_id, local_files_only=local_files_only + ) + snapshot_download( + repo_id=args.repo_id, + allow_patterns="triton/**/*", + local_files_only=local_files_only, + ) + + +if __name__ == "__main__": + args = parse_args() + print(f"Loading {args.repo_id} ...") + try: + preload(local_files_only=args.prefer_local_files) + except Exception as e: + if "offline" in str(e): + preload(local_files_only=False) + else: + raise e + print(f"Loaded {args.repo_id} !") diff --git a/testdata/tiny-70M/models/fastertransformer/.gitignore b/testdata/tiny-70M/models/fastertransformer/.gitignore deleted file mode 100644 index d00491f..0000000 --- a/testdata/tiny-70M/models/fastertransformer/.gitignore +++ /dev/null @@ -1 +0,0 @@ -1 diff --git a/testdata/tiny-70M/models/fastertransformer/config.pbtxt b/testdata/tiny-70M/models/fastertransformer/config.pbtxt deleted file mode 100644 index e6363b9..0000000 --- a/testdata/tiny-70M/models/fastertransformer/config.pbtxt +++ /dev/null @@ -1,208 +0,0 @@ -name: "fastertransformer" -backend: "fastertransformer" -default_model_filename: "tiny-70M" -max_batch_size: 1024 - -model_transaction_policy { - decoupled: False -} - -input [ - { - name: "input_ids" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "start_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "end_id" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "input_lengths" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - }, - { - name: "request_output_len" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "runtime_top_k" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "runtime_top_p" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_search_diversity_rate" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "temperature" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "len_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "repetition_penalty" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "random_seed" - data_type: TYPE_UINT64 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "is_return_log_probs" - data_type: TYPE_BOOL - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "beam_width" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "bad_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - }, - { - name: "stop_words_list" - data_type: TYPE_INT32 - dims: [ 2, -1 ] - optional: true - }, - { - name: "prompt_learning_task_name_ids" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "top_p_decay" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "top_p_min" - data_type: TYPE_FP32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - }, - { - name: "top_p_reset_ids" - data_type: TYPE_UINT32 - dims: [ 1 ] - reshape: { shape: [ ] } - optional: true - } -] -output [ - { - name: "output_ids" - data_type: TYPE_UINT32 - dims: [ -1, -1 ] - }, - { - name: "sequence_length" - data_type: TYPE_UINT32 - dims: [ -1 ] - }, - { - name: "cum_log_probs" - data_type: TYPE_FP32 - dims: [ -1 ] - }, - { - name: "output_log_probs" - data_type: TYPE_FP32 - dims: [ -1, -1 ] - } -] -instance_group [ - { - count: 1 - kind: KIND_CPU - } -] -parameters { - key: "tensor_para_size" - value: { - string_value: "1" - } -} -parameters { - key: "pipeline_para_size" - value: { - string_value: "1" - } -} -parameters { - key: "data_type" - value: { - string_value: "fp32" - } -} -parameters { - key: "model_type" - value: { - string_value: "GPT-NeoX" - } -} -parameters { - key: "model_checkpoint_path" - value: { - string_value: "/model/fastertransformer/1/1-gpu" - } -} -parameters { - key: "enable_custom_all_reduce" - value: { - string_value: "0" - } -}