fix(docs): add installation guide on modal (#608)
* fix(docs): add installation guide on modal * remove modal-deployr0.4
parent
c03be1596d
commit
912f8aab3d
|
|
@ -1,11 +1,4 @@
|
||||||
"""
|
from modal import Image, Stub, asgi_app, gpu
|
||||||
modal serve app.py
|
|
||||||
"""
|
|
||||||
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
import modal
|
|
||||||
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
|
|
||||||
|
|
||||||
IMAGE_NAME = "tabbyml/tabby:0.3.1"
|
IMAGE_NAME = "tabbyml/tabby:0.3.1"
|
||||||
MODEL_ID = "TabbyML/StarCoder-1B"
|
MODEL_ID = "TabbyML/StarCoder-1B"
|
||||||
|
|
@ -27,7 +20,7 @@ def download_model():
|
||||||
|
|
||||||
image = (
|
image = (
|
||||||
Image.from_registry(
|
Image.from_registry(
|
||||||
"tabbyml/tabby:0.3.1",
|
IMAGE_NAME,
|
||||||
add_python="3.11",
|
add_python="3.11",
|
||||||
)
|
)
|
||||||
.dockerfile_commands("ENTRYPOINT []")
|
.dockerfile_commands("ENTRYPOINT []")
|
||||||
|
|
@ -65,7 +58,7 @@ def app():
|
||||||
)
|
)
|
||||||
|
|
||||||
# Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
|
# Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
|
||||||
def webserver_ready():
|
def tabby_ready():
|
||||||
try:
|
try:
|
||||||
socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
|
socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
|
||||||
return True
|
return True
|
||||||
|
|
@ -77,7 +70,7 @@ def app():
|
||||||
raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
|
raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
|
||||||
return False
|
return False
|
||||||
|
|
||||||
while not webserver_ready():
|
while not tabby_ready():
|
||||||
time.sleep(1.0)
|
time.sleep(1.0)
|
||||||
|
|
||||||
print("Tabby server ready!")
|
print("Tabby server ready!")
|
||||||
|
|
@ -0,0 +1,124 @@
|
||||||
|
# Modal
|
||||||
|
|
||||||
|
Modal is a serverless GPU provider. With Modal, your Tabby instance runs on demand: when the Tabby server receives no requests for a certain amount of time, Modal puts the container to sleep, thereby saving GPU costs.
|
||||||
|
|
||||||
|
## Setup
|
||||||
|
|
||||||
|
First we import the components we need from `modal`.
|
||||||
|
|
||||||
|
```python
|
||||||
|
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
|
||||||
|
```
|
||||||
|
|
||||||
|
Next, we set the base Docker image version and the model to serve, taking care to specify the GPU configuration required to fit the model into VRAM.
|
||||||
|
|
||||||
|
```python
|
||||||
|
MODEL_ID = "TabbyML/StarCoder-1B"
|
||||||
|
GPU_CONFIG = gpu.T4()
|
||||||
|
```
|
||||||
|
|
||||||
|
## Define the container image
|
||||||
|
|
||||||
|
We want to create a Modal image which has the Tabby model cache pre-populated. The benefit of this is that the container no longer has to re-download the model - instead, it will take advantage of Modal’s internal filesystem for faster cold starts.
|
||||||
|
|
||||||
|
### Download the weights
|
||||||
|
|
||||||
|
```python
|
||||||
|
def download_model():
|
||||||
|
import subprocess
|
||||||
|
|
||||||
|
subprocess.run(
|
||||||
|
[
|
||||||
|
"/opt/tabby/bin/tabby",
|
||||||
|
"download",
|
||||||
|
"--model",
|
||||||
|
MODEL_ID,
|
||||||
|
]
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
|
||||||
|
### Image definition
|
||||||
|
|
||||||
|
We’ll start from an image published by Tabby and override its default ENTRYPOINT so that Modal can run its own, which enables seamless serverless deployments.
|
||||||
|
|
||||||
|
Next we run the download step to pre-populate the image with our model weights.
|
||||||
|
|
||||||
|
Finally, we install `asgi-proxy-lib` to interface with Modal's ASGI webserver over localhost.
|
||||||
|
|
||||||
|
```python
|
||||||
|
image = (
|
||||||
|
Image.from_registry(
|
||||||
|
"tabbyml/tabby:0.3.1",
|
||||||
|
add_python="3.11",
|
||||||
|
)
|
||||||
|
.dockerfile_commands("ENTRYPOINT []")
|
||||||
|
.run_function(download_model)
|
||||||
|
.pip_install("asgi-proxy-lib")
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
### The app function
|
||||||
|
|
||||||
|
The endpoint function is declared with Modal's `@stub.function` decorator. Here, we:
|
||||||
|
|
||||||
|
1. Launch the Tabby process and wait for it to be ready to accept requests.
|
||||||
|
2. Create an ASGI proxy to tunnel requests from the Modal web endpoint to the local Tabby server.
|
||||||
|
3. Specify that each container is allowed to handle up to 10 requests simultaneously.
|
||||||
|
4. Keep idle containers for 2 minutes before spinning them down.
|
||||||
|
|
||||||
|
```python
|
||||||
|
@stub.function(
|
||||||
|
gpu=GPU_CONFIG,
|
||||||
|
allow_concurrent_inputs=10,
|
||||||
|
container_idle_timeout=120,
|
||||||
|
timeout=360,
|
||||||
|
)
|
||||||
|
@asgi_app()
|
||||||
|
def app():
|
||||||
|
import socket
|
||||||
|
import subprocess
|
||||||
|
import time
|
||||||
|
from asgi_proxy import asgi_proxy
|
||||||
|
|
||||||
|
launcher = subprocess.Popen(
|
||||||
|
[
|
||||||
|
"/opt/tabby/bin/tabby",
|
||||||
|
"serve",
|
||||||
|
"--model",
|
||||||
|
MODEL_ID,
|
||||||
|
"--port",
|
||||||
|
"8000",
|
||||||
|
"--device",
|
||||||
|
"cuda",
|
||||||
|
]
|
||||||
|
)
|
||||||
|
|
||||||
|
# Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
|
||||||
|
def tabby_ready():
|
||||||
|
try:
|
||||||
|
socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
|
||||||
|
return True
|
||||||
|
except (socket.timeout, ConnectionRefusedError):
|
||||||
|
# Check if launcher webserving process has exited.
|
||||||
|
# If so, a connection can never be made.
|
||||||
|
retcode = launcher.poll()
|
||||||
|
if retcode is not None:
|
||||||
|
raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
|
||||||
|
return False
|
||||||
|
|
||||||
|
while not tabby_ready():
|
||||||
|
time.sleep(1.0)
|
||||||
|
|
||||||
|
print("Tabby server ready!")
|
||||||
|
return asgi_proxy("http://localhost:8000")
|
||||||
|
```
|
||||||
|
|
||||||
|
### Serve the app
|
||||||
|
|
||||||
|
Once we deploy this model with `modal serve app.py`, it will output the URL of the web endpoint, in the form `https://<USERNAME>--tabby-server-starcoder-1b-app-dev.modal.run`. This URL can be used as the Tabby server URL in Tabby editor extensions!
|
||||||
|
|
||||||
|
See [app.py](https://github.com/TabbyML/tabby/tree/main/website/docs/modal/app.py) for a complete example.
|
||||||
|
|
||||||
|
## Feedback and support
|
||||||
|
If you have suggestions for improvement or need specific support, please join the [Tabby Slack community](https://join.slack.com/t/tabbycommunity/shared_invite/zt-1xeiddizp-bciR2RtFTaJ37RBxr8VxpA) or reach out on [Tabby’s GitHub repository](https://github.com/TabbyML/tabby).
|
||||||
|
|
@ -25,6 +25,7 @@
|
||||||
"postcss": "^8.4.24",
|
"postcss": "^8.4.24",
|
||||||
"posthog-docusaurus": "^2.0.0",
|
"posthog-docusaurus": "^2.0.0",
|
||||||
"prism-react-renderer": "^1.3.5",
|
"prism-react-renderer": "^1.3.5",
|
||||||
|
"raw-loader": "^4.0.2",
|
||||||
"react": "^17.0.2",
|
"react": "^17.0.2",
|
||||||
"react-dom": "^17.0.2",
|
"react-dom": "^17.0.2",
|
||||||
"tailwindcss": "^3.3.2",
|
"tailwindcss": "^3.3.2",
|
||||||
|
|
|
||||||
|
|
@ -6893,6 +6893,14 @@ raw-body@2.5.1:
|
||||||
iconv-lite "0.4.24"
|
iconv-lite "0.4.24"
|
||||||
unpipe "1.0.0"
|
unpipe "1.0.0"
|
||||||
|
|
||||||
|
raw-loader@^4.0.2:
|
||||||
|
version "4.0.2"
|
||||||
|
resolved "https://registry.yarnpkg.com/raw-loader/-/raw-loader-4.0.2.tgz#1aac6b7d1ad1501e66efdac1522c73e59a584eb6"
|
||||||
|
integrity sha512-ZnScIV3ag9A4wPX/ZayxL/jZH+euYb6FcUinPcgiQW0+UBtEv0O6Q3lGd3cqJ+GHH+rksEv3Pj99oxJ3u3VIKA==
|
||||||
|
dependencies:
|
||||||
|
loader-utils "^2.0.0"
|
||||||
|
schema-utils "^3.0.0"
|
||||||
|
|
||||||
rc@1.2.8, rc@^1.2.8:
|
rc@1.2.8, rc@^1.2.8:
|
||||||
version "1.2.8"
|
version "1.2.8"
|
||||||
resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed"
|
resolved "https://registry.yarnpkg.com/rc/-/rc-1.2.8.tgz#cd924bf5200a075b83c188cd6b9e211b7fc0d3ed"
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue