From 71815bef8fa7b864a81db7ec75a91ceb5ea34190 Mon Sep 17 00:00:00 2001 From: Bryan <15900473+costanzo@users.noreply.github.com> Date: Sat, 11 Nov 2023 09:45:26 +0800 Subject: [PATCH] docs: fix and add additional information in the Modal installation page (#748) * Add additional information in modal installation docs * docs: update tabby version to 0.5.5 update Modal installation script --- website/docs/installation/modal/app.py | 2 +- website/docs/installation/modal/index.md | 48 ++++++++++++++++++++++-- 2 files changed, 46 insertions(+), 4 deletions(-) diff --git a/website/docs/installation/modal/app.py b/website/docs/installation/modal/app.py index 879585b..e6c4cdd 100644 --- a/website/docs/installation/modal/app.py +++ b/website/docs/installation/modal/app.py @@ -4,7 +4,7 @@ modal serve app.py from modal import Image, Stub, asgi_app, gpu -IMAGE_NAME = "tabbyml/tabby:0.4.0" +IMAGE_NAME = "tabbyml/tabby:0.5.5" MODEL_ID = "TabbyML/StarCoder-1B" GPU_CONFIG = gpu.T4() diff --git a/website/docs/installation/modal/index.md b/website/docs/installation/modal/index.md index 5e50627..78f6f3b 100644 --- a/website/docs/installation/modal/index.md +++ b/website/docs/installation/modal/index.md @@ -7,16 +7,27 @@ First we import the components we need from `modal`. ```python -from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method +from modal import Image, Stub, asgi_app, gpu ``` Next, we set the base docker image version, which model to serve, taking care to specify the GPU configuration required to fit the model into VRAM. ```python +IMAGE_NAME = "tabbyml/tabby:0.5.5" MODEL_ID = "TabbyML/StarCoder-1B" GPU_CONFIG = gpu.T4() ``` +Currently supported GPUs in Modal: + +- `T4`: Low-cost GPU option, providing 16GiB of GPU memory. +- `L4`: Mid-tier GPU option, providing 24GiB of GPU memory. +- `A100`: The most powerful GPU available in the cloud. Available in 40GiB and 80GiB GPU memory configurations. 
+- `A10G`: A10G GPUs deliver up to 3.3x better ML training performance, 3x better ML inference performance, and 3x better graphics performance, in comparison to NVIDIA T4 GPUs. +- `Any`: Selects any one of the GPU classes available within Modal, according to availability. + +For detailed usage, please check the official [Modal GPU reference](https://modal.com/docs/reference/modal.gpu). + ## Define the container image We want to create a Modal image which has the Tabby model cache pre-populated. The benefit of this is that the container no longer has to re-download the model - instead, it will take advantage of Modal’s internal filesystem for faster cold starts. @@ -40,7 +51,7 @@ def download_model(): ### Image definition -We’ll start from a image by tabby, and override the default ENTRYPOINT for Modal to run its own which enables seamless serverless deployments. +We’ll start from an image by tabby, and override the default ENTRYPOINT for Modal to run its own which enables seamless serverless deployments. Next we run the download step to pre-populate the image with our model weights. @@ -49,7 +60,7 @@ Finally, we install the `asgi-proxy-lib` to interface with modal's asgi webserve ```python image = ( Image.from_registry( - "tabbyml/tabby:0.3.1", + IMAGE_NAME, add_python="3.11", ) .dockerfile_commands("ENTRYPOINT []") @@ -68,6 +79,7 @@ The endpoint function is represented with Modal's `@stub.function`. Here, we: 4. Keep idle containers for 2 minutes before spinning them down. ```python +stub = Stub("tabby-server-" + MODEL_ID.split("/")[-1], image=image) @stub.function( gpu=GPU_CONFIG, allow_concurrent_inputs=10, @@ -118,6 +130,36 @@ def app(): Once we deploy this model with `modal serve app.py`, it will output the url of the web endpoint, in a form of `https://--tabby-server-starcoder-1b-app-dev.modal.run`. +To test whether the server is working, you can send a POST request to the web endpoint.
+ +```shell +curl --location 'https://--tabby-server-starcoder-1b-app-dev.modal.run/v1/completions' \ +--header 'Content-Type: application/json' \ +--data '{ + "language": "python", + "segments": { + "prefix": "def fib(n):\n ", + "suffix": "\n return fib(n - 1) + fib(n - 2)" + } +}' +``` + +If you get a JSON response like the following, the app server is up and running — have fun! + +```json +{ + "id": "cmpl-4196b0c7-f417-4c48-9329-4a56aa86baea", + "choices": [ + { + "index": 0, + "text": "if n == 0:\n return 0\n elif n == 1:\n return 1\n else:" + } + ] +} +``` + + + ![App Running](./app-running.png) Now it can be used as tabby server url in tabby editor extensions!