Add python transformer backend for tabby (mainly used for local dev / test in non-cuda environment) (#6)
* Add python backend * Split docker-compose.triton.yml * update makefileadd-more-languages
parent
df149fad61
commit
a0b438da06
19
Makefile
19
Makefile
|
|
@ -1,8 +1,7 @@
|
||||||
POETRY_EXISTS := $(shell which poetry &> /dev/null)
|
POETRY_EXISTS := $(shell which poetry &> /dev/null)
|
||||||
|
PRE_COMMIT_HOOK := .git/hooks/pre-commit
|
||||||
LOCAL_MODEL := testdata/tiny-70M/models/fastertransformer/1
|
LOCAL_MODEL := testdata/tiny-70M/models/fastertransformer/1
|
||||||
|
|
||||||
all:
|
|
||||||
|
|
||||||
pre-commit:
|
pre-commit:
|
||||||
poetry run pre-commit
|
poetry run pre-commit
|
||||||
|
|
||||||
|
|
@ -10,6 +9,10 @@ install-poetry:
|
||||||
ifndef POETRY_EXISTS
|
ifndef POETRY_EXISTS
|
||||||
curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.0 python3 -
|
curl -sSL https://install.python-poetry.org | POETRY_VERSION=1.4.0 python3 -
|
||||||
endif
|
endif
|
||||||
|
poetry install
|
||||||
|
|
||||||
|
$(PRE_COMMIT_HOOK):
|
||||||
|
poetry run pre-commit install --install-hooks
|
||||||
|
|
||||||
$(LOCAL_MODEL):
|
$(LOCAL_MODEL):
|
||||||
poetry run python converter/huggingface_gptneox_convert.py \
|
poetry run python converter/huggingface_gptneox_convert.py \
|
||||||
|
|
@ -17,10 +20,16 @@ $(LOCAL_MODEL):
|
||||||
-o $@ \
|
-o $@ \
|
||||||
-i_g 1 -m_n tiny-70M -p 1 -w fp16
|
-i_g 1 -m_n tiny-70M -p 1 -w fp16
|
||||||
|
|
||||||
setup-development-environment: install-poetry $(LOCAL_MODEL)
|
setup-development-environment: install-poetry $(PRE_COMMIT_HOOK)
|
||||||
|
|
||||||
up: $(LOCAL_MODEL)
|
up:
|
||||||
docker-compose -f deployment/docker-compose.yml up
|
docker-compose -f deployment/docker-compose.yml up
|
||||||
|
|
||||||
dev: $(setup-development-environment) $(LOCAL_MODEL)
|
up-triton: $(LOCAL_MODEL)
|
||||||
|
docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.triton.yml up
|
||||||
|
|
||||||
|
dev:
|
||||||
docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.dev.yml up --build
|
docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.dev.yml up --build
|
||||||
|
|
||||||
|
dev-triton: $(LOCAL_MODEL)
|
||||||
|
docker-compose -f deployment/docker-compose.yml -f deployment/docker-compose.triton.yml -f deployment/docker-compose.dev.yml up --build
|
||||||
|
|
|
||||||
|
|
@ -23,4 +23,4 @@ Assuming Linux workstation with:
|
||||||
2. docker w/ gpu driver
|
2. docker w/ gpu driver
|
||||||
3. python 3.10
|
3. python 3.10
|
||||||
|
|
||||||
Use `make dev` to start local dev server.
|
Use `make setup-development-environment` to setup basic dev environment, and `make dev` to start local development server.
|
||||||
|
|
|
||||||
|
|
@ -1 +1,2 @@
|
||||||
logs
|
logs
|
||||||
|
hf_cache
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,33 @@
|
||||||
|
version: '3.3'
|
||||||
|
|
||||||
|
services:
|
||||||
|
server:
|
||||||
|
image: tabbyml/tabby
|
||||||
|
environment:
|
||||||
|
- MODEL_BACKEND=triton
|
||||||
|
- TRITON_TOKENIZER_NAME=/tokenizer
|
||||||
|
volumes:
|
||||||
|
- ../testdata/tiny-70M/tokenizer:/tokenizer
|
||||||
|
links:
|
||||||
|
- triton
|
||||||
|
|
||||||
|
admin:
|
||||||
|
links:
|
||||||
|
- triton
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
triton:
|
||||||
|
image: tabbyml/fastertransformer_backend
|
||||||
|
container_name: tabby-triton
|
||||||
|
command: mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model
|
||||||
|
shm_size: 1gb
|
||||||
|
volumes:
|
||||||
|
- ../testdata/tiny-70M/models:/model
|
||||||
|
deploy:
|
||||||
|
resources:
|
||||||
|
reservations:
|
||||||
|
devices:
|
||||||
|
- driver: nvidia
|
||||||
|
count: all
|
||||||
|
capabilities: [gpu]
|
||||||
|
|
@ -7,33 +7,16 @@ services:
|
||||||
working_dir: /app/server
|
working_dir: /app/server
|
||||||
command: uvicorn app:app --host 0.0.0.0 --port 5000
|
command: uvicorn app:app --host 0.0.0.0 --port 5000
|
||||||
environment:
|
environment:
|
||||||
- TOKENIZER_NAME=/tokenizer
|
- PYTHON_MODEL_NAME=EleutherAI/pythia-70m-deduped
|
||||||
- TRITON_HOST=triton
|
|
||||||
- EVENTS_LOG_DIR=/logs/tabby-server
|
- EVENTS_LOG_DIR=/logs/tabby-server
|
||||||
ports:
|
ports:
|
||||||
- "5000:5000"
|
- "5000:5000"
|
||||||
volumes:
|
volumes:
|
||||||
- ./logs:/logs
|
- ./logs:/logs
|
||||||
- ../testdata/tiny-70M/tokenizer:/tokenizer
|
- ./hf_cache:/root/.cache/huggingface
|
||||||
links:
|
links:
|
||||||
- triton
|
|
||||||
- vector
|
- vector
|
||||||
|
|
||||||
triton:
|
|
||||||
image: tabbyml/fastertransformer_backend
|
|
||||||
container_name: tabby-triton
|
|
||||||
command: mpirun -n 1 --allow-run-as-root /opt/tritonserver/bin/tritonserver --model-repository=/model
|
|
||||||
shm_size: 1gb
|
|
||||||
volumes:
|
|
||||||
- ../testdata/tiny-70M/models:/model
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: all
|
|
||||||
capabilities: [gpu]
|
|
||||||
|
|
||||||
admin:
|
admin:
|
||||||
image: tabbyml/tabby
|
image: tabbyml/tabby
|
||||||
container_name: tabby-admin
|
container_name: tabby-admin
|
||||||
|
|
@ -43,7 +26,6 @@ services:
|
||||||
- "8501:8501"
|
- "8501:8501"
|
||||||
links:
|
links:
|
||||||
- server
|
- server
|
||||||
- triton
|
|
||||||
- vector
|
- vector
|
||||||
|
|
||||||
vector:
|
vector:
|
||||||
|
|
|
||||||
|
|
@ -6,6 +6,7 @@ import uvicorn
|
||||||
from fastapi import FastAPI, Response
|
from fastapi import FastAPI, Response
|
||||||
from fastapi.responses import JSONResponse
|
from fastapi.responses import JSONResponse
|
||||||
from models import CompletionRequest, CompletionResponse
|
from models import CompletionRequest, CompletionResponse
|
||||||
|
from python import PythonModelService
|
||||||
from triton import TritonService
|
from triton import TritonService
|
||||||
|
|
||||||
app = FastAPI(
|
app = FastAPI(
|
||||||
|
|
@ -14,16 +15,21 @@ app = FastAPI(
|
||||||
docs_url="/",
|
docs_url="/",
|
||||||
)
|
)
|
||||||
|
|
||||||
triton = TritonService(
|
MODEL_BACKEND = os.environ.get("MODEL_BACKEND", "python")
|
||||||
tokenizer_name=os.environ.get("TOKENIZER_NAME", None),
|
|
||||||
host=os.environ.get("TRITON_HOST", "localhost"),
|
if MODEL_BACKEND == "triton":
|
||||||
port=os.environ.get("TRITON_PORT", "8001"),
|
model_backend = TritonService(
|
||||||
)
|
tokenizer_name=os.environ.get("TRITON_TOKENIZER_NAME", None),
|
||||||
|
host=os.environ.get("TRITON_HOST", "triton"),
|
||||||
|
port=os.environ.get("TRITON_PORT", "8001"),
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
model_backend = PythonModelService(os.environ["PYTHON_MODEL_NAME"])
|
||||||
|
|
||||||
|
|
||||||
@app.post("/v1/completions")
|
@app.post("/v1/completions")
|
||||||
async def completions(request: CompletionRequest) -> CompletionResponse:
|
async def completions(request: CompletionRequest) -> CompletionResponse:
|
||||||
response = triton(request)
|
response = model_backend(request)
|
||||||
events.log_completions(request, response)
|
events.log_completions(request, response)
|
||||||
return response
|
return response
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,35 @@
|
||||||
|
import random
|
||||||
|
import string
|
||||||
|
import time
|
||||||
|
from typing import List
|
||||||
|
|
||||||
|
from models import Choice, CompletionRequest, CompletionResponse
|
||||||
|
from transformers import AutoModelForCausalLM, AutoTokenizer
|
||||||
|
|
||||||
|
|
||||||
|
class PythonModelService:
|
||||||
|
def __init__(
|
||||||
|
self,
|
||||||
|
model_name,
|
||||||
|
):
|
||||||
|
self.tokenizer = AutoTokenizer.from_pretrained(model_name)
|
||||||
|
self.model = AutoModelForCausalLM.from_pretrained(model_name)
|
||||||
|
|
||||||
|
def generate(self, request: CompletionRequest) -> List[Choice]:
|
||||||
|
input_ids = self.tokenizer.encode(request.prompt, return_tensors="pt")
|
||||||
|
res = self.model.generate(input_ids, max_length=64)
|
||||||
|
output_ids = res[0][len(input_ids[0]) :]
|
||||||
|
text = self.tokenizer.decode(output_ids)
|
||||||
|
return [Choice(index=0, text=text)]
|
||||||
|
|
||||||
|
def __call__(self, request: CompletionRequest) -> CompletionResponse:
|
||||||
|
choices = self.generate(request)
|
||||||
|
return CompletionResponse(
|
||||||
|
id=random_completion_id(), created=int(time.time()), choices=choices
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
def random_completion_id():
|
||||||
|
return "cmpl-" + "".join(
|
||||||
|
random.choice(string.ascii_letters + string.digits) for _ in range(29)
|
||||||
|
)
|
||||||
Loading…
Reference in New Issue