From 71815bef8fa7b864a81db7ec75a91ceb5ea34190 Mon Sep 17 00:00:00 2001 From: Bryan <15900473+costanzo@users.noreply.github.com> Date: Sat, 11 Nov 2023 09:45:26 +0800 Subject: [PATCH] docs: fix and add additional information in the Modal installation page (#748) * Add additional information in modal installation docs * docs: update tabby version to 0.5.5 update Modal installation script --- website/docs/installation/modal/app.py | 2 +- website/docs/installation/modal/index.md | 48 ++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/website/docs/installation/modal/app.py b/website/docs/installation/modal/app.py index 879585b..e6c4cdd 100644 --- a/website/docs/installation/modal/app.py +++ b/website/docs/installation/modal/app.py @@ -4,7 +4,7 @@ modal serve app.py from modal import Image, Stub, asgi_app, gpu -IMAGE_NAME = "tabbyml/tabby:0.4.0" +IMAGE_NAME = "tabbyml/tabby:0.5.5" MODEL_ID = "TabbyML/StarCoder-1B" GPU_CONFIG = gpu.T4() diff --git a/website/docs/installation/modal/index.md b/website/docs/installation/modal/index.md index 5e50627..78f6f3b 100644 --- a/website/docs/installation/modal/index.md +++ b/website/docs/installation/modal/index.md @@ -7,16 +7,27 @@ First we import the components we need from `modal`. ```python -from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method +from modal import Image, Stub, asgi_app, gpu ``` Next, we set the base docker image version, which model to serve, taking care to specify the GPU configuration required to fit the model into VRAM. ```python +IMAGE_NAME = "tabbyml/tabby:0.5.5" MODEL_ID = "TabbyML/StarCoder-1B" GPU_CONFIG = gpu.T4() ``` +Currently supported GPUs in Modal: + +- `T4`: Low-cost GPU option, providing 16GiB of GPU memory. +- `L4`: Mid-tier GPU option, providing 24GiB of GPU memory. +- `A100`: The most powerful GPU available in the cloud. Available in 40GiB and 80GiB GPU memory configurations. 
+- `A10G`: A10G GPUs deliver up to 3.3x better ML training performance, 3x better ML inference performance, and 3x better graphics performance, in comparison to NVIDIA T4 GPUs. +- `Any`: Selects any one of the GPU classes available within Modal, according to availability. + +For detailed usage, please check the official [Modal GPU reference](https://modal.com/docs/reference/modal.gpu). + ## Define the container image We want to create a Modal image which has the Tabby model cache pre-populated. The benefit of this is that the container no longer has to re-download the model - instead, it will take advantage of Modal’s internal filesystem for faster cold starts. @@ -40,7 +51,7 @@ def download_model(): ### Image definition -We’ll start from a image by tabby, and override the default ENTRYPOINT for Modal to run its own which enables seamless serverless deployments. +We’ll start from an image by tabby, and override the default ENTRYPOINT for Modal to run its own which enables seamless serverless deployments. Next we run the download step to pre-populate the image with our model weights. @@ -49,7 +60,7 @@ Finally, we install the `asgi-proxy-lib` to interface with modal's asgi webserve ```python image = ( Image.from_registry( - "tabbyml/tabby:0.3.1", + IMAGE_NAME, add_python="3.11", ) .dockerfile_commands("ENTRYPOINT []") @@ -68,6 +79,7 @@ The endpoint function is represented with Modal's `@stub.function`. Here, we: 4. Keep idle containers for 2 minutes before spinning them down. ```python +stub = Stub("tabby-server-" + MODEL_ID.split("/")[-1], image=image) @stub.function( gpu=GPU_CONFIG, allow_concurrent_inputs=10, @@ -118,6 +130,36 @@ def app(): Once we deploy this model with `modal serve app.py`, it will output the url of the web endpoint, in a form of `https://--tabby-server-starcoder-1b-app-dev.modal.run`. +To test whether the server is working, you can send a POST request to the web endpoint.
+ +```shell +curl --location 'https://--tabby-server-starcoder-1b-app-dev.modal.run/v1/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "language": "python", + "segments": { + "prefix": "def fib(n):\n ", + "suffix": "\n return fib(n - 1) + fib(n - 2)" + } +}' +``` + +If you get a JSON response like the following, the app server is up and running — have fun! + +```json +{ + "id": "cmpl-4196b0c7-f417-4c48-9329-4a56aa86baea", + "choices": [ + { + "index": 0, + "text": "if n == 0:\n return 0\n elif n == 1:\n return 1\n else:" + } + ] +} +``` + + + ![App Running](./app-running.png) Now it can be used as tabby server url in tabby editor extensions!