use TabbyML/NeoX-70M for minimal e2e deployment (#10)

* use TabbyML/NeoX-70M for minimal e2e deployment

* Use python3 of triton image
Meng Zhang, 2023-03-25 17:39:40 +08:00, committed by GitHub
parent 55eefb75d4
commit b622bd6762
11 changed files with 102 additions and 239 deletions


@@ -1,4 +1,3 @@
-testdata
 deployment
 **/.git


@@ -1,6 +1,5 @@
 POETRY_EXISTS := $(shell which poetry &> /dev/null)
 PRE_COMMIT_HOOK := .git/hooks/pre-commit
-LOCAL_MODEL := testdata/tiny-70M/models/fastertransformer/1

 pre-commit:
 	poetry run pre-commit
@@ -14,12 +13,6 @@ endif

 $(PRE_COMMIT_HOOK):
 	poetry run pre-commit install --install-hooks

-$(LOCAL_MODEL):
-	poetry run python -m tabby.tools.huggingface_gptneox_convert \
-		-in_file EleutherAI/pythia-70m-deduped \
-		-o $@ \
-		-i_g 1 -m_n tiny-70M -p 1 -w fp16
-
 setup-development-environment: install-poetry $(PRE_COMMIT_HOOK)
@@ -29,11 +22,11 @@ DEV_FLAGS := $(UP_FLAGS) --build
 up:
 	docker-compose -f deployment/docker-compose.yml $(UP_FLAGS)

-up-triton: $(LOCAL_MODEL)
+up-triton:
 	docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.triton.yml $(UP_FLAGS)

 dev:
 	docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.dev.yml $(DEV_FLAGS)

-dev-triton: $(LOCAL_MODEL)
+dev-triton:
 	docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.triton.yml -f deployment/docker-compose.dev.yml $(DEV_FLAGS)


@@ -1,11 +1,13 @@
 version: '3.3'

 services:
-  server:
+  init:
     build:
       context: ..
       args:
         PYPI_INDEX_URL: https://pypi.tuna.tsinghua.edu.cn/simple
+
+  server:
     command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000 --reload
     volumes:
       - ../:/app


@@ -5,23 +5,18 @@ services:
     image: tabbyml/tabby
     environment:
       - MODEL_BACKEND=triton
-      - TRITON_TOKENIZER_NAME=EleutherAI/pythia-70m-deduped
-    links:
-      - triton
-
-  admin:
-    links:
-      - triton
+    depends_on:
+      triton:
+        condition: service_healthy

   triton:
     image: tabbyml/fastertransformer_backend
     container_name: tabby-triton
-    command: mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model
+    command: /scripts/triton.sh
     shm_size: 1gb
     volumes:
-      - ../testdata/tiny-70M/models:/model
+      - ./scripts:/scripts
+      - ./data/hf_cache:/root/.cache/huggingface
     deploy:
       resources:
         reservations:
@@ -29,3 +24,13 @@ services:
           devices:
             - driver: nvidia
               count: all
               capabilities: [gpu]
+    environment:
+      - MODEL_NAME=TabbyML/NeoX-70M
+    depends_on:
+      init:
+        condition: service_completed_successfully
+    healthcheck:
+      test: ["CMD", "curl", "-f", "http://localhost:8002/metrics"]
+      interval: 2s
+      timeout: 2s
+      start_period: 2s
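
Note: the healthcheck above is what gates server startup; Triton is treated as healthy once its metrics endpoint on port 8002 answers. A minimal standalone probe in the same spirit (stdlib only; host and port mirror the compose healthcheck above):

import urllib.request

def triton_ready(host="localhost", port=8002, timeout=2):
    # Mirrors the compose healthcheck: curl -f http://localhost:8002/metrics
    try:
        with urllib.request.urlopen(f"http://{host}:{port}/metrics", timeout=timeout) as resp:
            return resp.status == 200
    except OSError:
        return False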


@@ -1,18 +1,28 @@
 version: '3.3'

 services:
+  init:
+    image: tabbyml/tabby
+    container_name: tabby-init
+    command: python -m tabby.tools.model_preload --repo_id TabbyML/NeoX-70M --backend python
+    volumes:
+      - ./data/hf_cache:/root/.cache/huggingface
+
   server:
     image: tabbyml/tabby
     container_name: tabby-server
     command: uvicorn tabby.server:app --host 0.0.0.0 --port 5000
     environment:
-      - PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped
+      - MODEL_NAME=TabbyML/NeoX-70M
       - EVENTS_LOG_DIR=/logs
     ports:
       - "5000:5000"
     volumes:
       - ./data/logs/tabby-server:/logs
+      - ./data/hf_cache:/root/.cache/huggingface
+    depends_on:
+      init:
+        condition: service_completed_successfully

   admin:
     image: tabbyml/tabby
admin:
image: tabbyml/tabby
@@ -20,9 +30,6 @@ services:
     command: streamlit run tabby/admin/Home.py
     ports:
       - "8501:8501"
-    links:
-      - server
-      - vector

   vector:
     image: timberio/vector:0.28.1-alpine
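
Note: the init service introduced here establishes an ordering contract: it warms the shared ./data/hf_cache volume, and server starts only after init exits successfully. A rough sketch of the invariant this guarantees (a hypothetical standalone check; MODEL_NAME mirrors the server's environment variable):

import os

from transformers import AutoTokenizer

model_name = os.environ.get("MODEL_NAME", "TabbyML/NeoX-70M")

# With the cache warmed by the init container, an offline load must succeed;
# if this raises, the service_completed_successfully gate would have failed too.
AutoTokenizer.from_pretrained(model_name, local_files_only=True)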

deployment/scripts/triton.sh (new executable file, 18 additions)

@@ -0,0 +1,18 @@
+#!/bin/bash
+
+set -e
+
+# Get model dir.
+MODEL_DIR=$(python3 <<EOF
+from huggingface_hub import snapshot_download
+print(snapshot_download(repo_id='$MODEL_NAME', allow_patterns='triton/**/*', local_files_only=True))
+EOF
+)
+
+# Set model dir in triton config.
+sed -i 's@${MODEL_DIR}@'$MODEL_DIR'@g' $MODEL_DIR/triton/fastertransformer/config.pbtxt
+
+# Start triton server.
+mpirun -n 1 \
+  --allow-run-as-root /opt/tritonserver/bin/tritonserver \
+  --model-repository=$MODEL_DIR/triton
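
Note: for readers who prefer Python to shell, a sketch of the two steps the script performs: resolve the locally cached Triton snapshot, then substitute the ${MODEL_DIR} placeholder that the config.pbtxt shipped in the model repo is assumed to contain:

import os
from pathlib import Path

from huggingface_hub import snapshot_download

# Resolve the snapshot downloaded earlier by the init service; with
# local_files_only=True this fails fast instead of hitting the network.
model_dir = snapshot_download(
    repo_id=os.environ["MODEL_NAME"],
    allow_patterns="triton/**/*",
    local_files_only=True,
)

# Equivalent of the sed call: point Triton's config at the resolved path.
config = Path(model_dir, "triton", "fastertransformer", "config.pbtxt")
config.write_text(config.read_text().replace("${MODEL_DIR}", model_dir))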


@@ -16,16 +16,17 @@ app = FastAPI(
     docs_url="/",
 )

+MODEL_NAME = os.environ.get("MODEL_NAME")
 MODEL_BACKEND = os.environ.get("MODEL_BACKEND", "python")

 if MODEL_BACKEND == "triton":
     model_backend = TritonService(
-        tokenizer_name=os.environ.get("TRITON_TOKENIZER_NAME", None),
+        tokenizer_name=MODEL_NAME,
         host=os.environ.get("TRITON_HOST", "triton"),
         port=os.environ.get("TRITON_PORT", "8001"),
     )
 else:
-    model_backend = PythonModelService(os.environ["PYTHON_MODEL_NAME"])
+    model_backend = PythonModelService(MODEL_NAME)


 @app.post("/v1/completions")
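
Note: with both backends keyed off the single MODEL_NAME variable, smoke-testing the server is backend-agnostic. A hedged example request (stdlib only; it assumes CompletionRequest accepts a bare "prompt" field and prints whatever choices the backend returns):

import json
import urllib.request

req = urllib.request.Request(
    "http://localhost:5000/v1/completions",
    data=json.dumps({"prompt": "def fib(n):"}).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp))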


@@ -13,8 +13,12 @@ class PythonModelService:
         self,
         model_name,
     ):
-        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
-        self.model = AutoModelForCausalLM.from_pretrained(model_name)
+        self.tokenizer = AutoTokenizer.from_pretrained(
+            model_name, local_files_only=True
+        )
+        self.model = AutoModelForCausalLM.from_pretrained(
+            model_name, local_files_only=True
+        )

     def generate(self, request: CompletionRequest) -> List[Choice]:
         input_ids = self.tokenizer.encode(request.prompt, return_tensors="pt")
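
Note: the constructor change only affects loading; generation is untouched. A hedged sketch of the flow a completion takes through this backend (the decoding parameters are illustrative, not the service's actual settings):

from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("TabbyML/NeoX-70M", local_files_only=True)
model = AutoModelForCausalLM.from_pretrained("TabbyML/NeoX-70M", local_files_only=True)

input_ids = tokenizer.encode("def fib(n):", return_tensors="pt")
output_ids = model.generate(input_ids, max_new_tokens=32)

# Strip the prompt tokens and show only the completion.
print(tokenizer.decode(output_ids[0][input_ids.shape[1]:]))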


@@ -0,0 +1,43 @@
+from dataclasses import dataclass, field
+
+from huggingface_hub import snapshot_download
+from transformers import AutoModelForCausalLM, AutoTokenizer, HfArgumentParser
+
+
+@dataclass
+class Arguments:
+    repo_id: str = field(
+        metadata={"help": "Huggingface model repository id, e.g TabbyML/NeoX-160M"}
+    )
+    backend: str = "python"
+    prefer_local_files: bool = True
+
+
+def parse_args():
+    parser = HfArgumentParser(Arguments)
+    return parser.parse_args()
+
+
+def preload(local_files_only=False):
+    AutoTokenizer.from_pretrained(args.repo_id, local_files_only=local_files_only)
+    AutoModelForCausalLM.from_pretrained(
+        args.repo_id, local_files_only=local_files_only
+    )
+    snapshot_download(
+        repo_id=args.repo_id,
+        allow_patterns="triton/**/*",
+        local_files_only=local_files_only,
+    )
+
+
+if __name__ == "__main__":
+    args = parse_args()
+    print(f"Loading {args.repo_id} ...")
+    try:
+        preload(local_files_only=args.prefer_local_files)
+    except Exception as e:
+        if "offline" in str(e):
+            preload(local_files_only=False)
+        else:
+            raise e
+    print(f"Loaded {args.repo_id} !")


@@ -1 +0,0 @@
-1


@@ -1,208 +0,0 @@
-name: "fastertransformer"
-backend: "fastertransformer"
-default_model_filename: "tiny-70M"
-max_batch_size: 1024
-model_transaction_policy {
-  decoupled: False
-}
-input [
-  {
-    name: "input_ids"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
-  },
-  {
-    name: "start_id"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "end_id"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "input_lengths"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-  },
-  {
-    name: "request_output_len"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
-  },
-  {
-    name: "runtime_top_k"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "runtime_top_p"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "beam_search_diversity_rate"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "temperature"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "len_penalty"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "repetition_penalty"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "random_seed"
-    data_type: TYPE_UINT64
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "is_return_log_probs"
-    data_type: TYPE_BOOL
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "beam_width"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "bad_words_list"
-    data_type: TYPE_INT32
-    dims: [ 2, -1 ]
-    optional: true
-  },
-  {
-    name: "stop_words_list"
-    data_type: TYPE_INT32
-    dims: [ 2, -1 ]
-    optional: true
-  },
-  {
-    name: "prompt_learning_task_name_ids"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "top_p_decay"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "top_p_min"
-    data_type: TYPE_FP32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  },
-  {
-    name: "top_p_reset_ids"
-    data_type: TYPE_UINT32
-    dims: [ 1 ]
-    reshape: { shape: [ ] }
-    optional: true
-  }
-]
-output [
-  {
-    name: "output_ids"
-    data_type: TYPE_UINT32
-    dims: [ -1, -1 ]
-  },
-  {
-    name: "sequence_length"
-    data_type: TYPE_UINT32
-    dims: [ -1 ]
-  },
-  {
-    name: "cum_log_probs"
-    data_type: TYPE_FP32
-    dims: [ -1 ]
-  },
-  {
-    name: "output_log_probs"
-    data_type: TYPE_FP32
-    dims: [ -1, -1 ]
-  }
-]
-instance_group [
-  {
-    count: 1
-    kind: KIND_CPU
-  }
-]
-parameters {
-  key: "tensor_para_size"
-  value: {
-    string_value: "1"
-  }
-}
-parameters {
-  key: "pipeline_para_size"
-  value: {
-    string_value: "1"
-  }
-}
-parameters {
-  key: "data_type"
-  value: {
-    string_value: "fp32"
-  }
-}
-parameters {
-  key: "model_type"
-  value: {
-    string_value: "GPT-NeoX"
-  }
-}
-parameters {
-  key: "model_checkpoint_path"
-  value: {
-    string_value: "/model/fastertransformer/1/1-gpu"
-  }
-}
-parameters {
-  key: "enable_custom_all_reduce"
-  value: {
-    string_value: "0"
-  }
-}