feat(loadtest): add loadtest tools (#906)
* add loadtest tools * update * [autofix.ci] apply automated fixes * add readme * cleanup legacy loadtest --------- Co-authored-by: autofix-ci[bot] <114827586+autofix-ci[bot]@users.noreply.github.com>add-prompt-lookup
parent
b47abc661d
commit
edd33a326d
7
Makefile
7
Makefile
|
|
@ -1,10 +1,3 @@
|
||||||
loadtest:
|
|
||||||
ifdef TABBY_API_HOST
|
|
||||||
k6 run tests/*.loadtest.js
|
|
||||||
else
|
|
||||||
$(error TABBY_API_HOST is undefined)
|
|
||||||
endif
|
|
||||||
|
|
||||||
fix:
|
fix:
|
||||||
cargo machete --fix || true
|
cargo machete --fix || true
|
||||||
cargo +nightly fmt
|
cargo +nightly fmt
|
||||||
|
|
|
||||||
|
|
@ -0,0 +1,6 @@
|
||||||
|
# Run loadtest with tabby on modal GPUs
|
||||||
|
|
||||||
|
Steps:
|
||||||
|
1. Set `TABBY_API_HOST` in `run.sh` to your Modal deployment URL.
|
||||||
|
2. Add the models you want to benchmark at the end of `run.sh`.
|
||||||
|
3. Run `run.sh`; the results are appended to `record.csv`.
|
||||||
|
|
@ -0,0 +1,48 @@
|
||||||
|
import http from "k6/http";
|
||||||
|
import { check, sleep } from "k6";
|
||||||
|
import { textSummary } from "https://jslib.k6.io/k6-utils/1.4.0/index.js";
|
||||||
|
|
||||||
|
// Number of virtual users to hold during the test, read from the PARALLELISM
// environment variable. It is parsed as base-10 and validated up front so that
// a missing or malformed value fails fast instead of propagating NaN into the
// stage targets and the QPS figure.
const PARALLELISM = Number.parseInt(__ENV.PARALLELISM, 10);
if (!Number.isInteger(PARALLELISM) || PARALLELISM < 1) {
  throw new Error(
    `PARALLELISM must be a positive integer, got: ${__ENV.PARALLELISM}`,
  );
}
|
||||||
|
|
||||||
|
// k6 run configuration.
export const options = {
  // Two stages with the same target: a one-second stage that brings the run up
  // to PARALLELISM virtual users, then a thirty-second stage that holds it.
  stages: ["1s", "30s"].map((duration) => ({ duration, target: PARALLELISM })),
  // Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
  thresholds: {
    http_req_failed: ["rate<0.001"],
    http_req_duration: ["med<1800", "avg<1800", "p(90)<2500", "p(95)<3000"],
  },
};
|
||||||
|
|
||||||
|
// One virtual-user iteration: request a single completion from the server at
// TABBY_API_HOST, record whether it answered with HTTP 200, then sleep(0.5)
// before the next iteration.
export default () => {
  const url = `${__ENV.TABBY_API_HOST}/v1/completions`;
  const body = JSON.stringify({
    language: "python",
    segments: {
      prefix: "def binarySearch(arr, left, right, x):\n mid = (left +",
    },
  });
  const params = { headers: { "Content-Type": "application/json" } };

  const res = http.post(url, body, params);
  check(res, { success: (r) => r.status === 200 });
  sleep(0.5);
};
|
||||||
|
|
||||||
|
// k6 end-of-run hook. Produces a single comma-separated line
// "qps,avg,med,p90,p95" under the key "metrics.txt".
//
// Each http_req_duration statistic is divided by 1000 (k6 reports this metric
// in milliseconds, so the results are seconds) and rounded to two decimals.
// QPS is estimated as PARALLELISM divided by the average latency.
export function handleSummary(data) {
  const {
    avg,
    med,
    "p(90)": p90,
    "p(95)": p95,
  } = data.metrics.http_req_duration.values;

  const [avgLatency, medLatency, p90Latency, p95Latency] = [
    avg,
    med,
    p90,
    p95,
  ].map((value) => value / 1000);
  const qps = PARALLELISM / avgLatency;

  const row = [qps, avgLatency, medLatency, p90Latency, p95Latency]
    .map((value) => rounded(value))
    .join(",");

  return { "metrics.txt": row };
}
|
||||||
|
|
||||||
|
// Rounds x to two decimal places by scaling to hundredths, applying
// Math.round, and scaling back.
function rounded(x) {
  const hundredths = Math.round(x * 100);
  return hundredths / 100;
}
|
||||||
|
|
@ -0,0 +1,59 @@
|
||||||
|
GPU,Model,Parallelism,QPS,Latency (Avg)," Latency (Med)"," Latency (p90)"," Latency (p95)"," Passed"
|
||||||
|
T4,TabbyML/StarCoder-1B,7,4.14,1.69,"1.58","2","2.05","SUCCESS"
|
||||||
|
T4,TabbyML/StarCoder-1B,10,4.85,2.06,"1.98","2.49","2.63","FAILED"
|
||||||
|
T4,TabbyML/StarCoder-1B,8,4.22,1.9,"1.85","2.29","2.38","FAILED"
|
||||||
|
A10G,TabbyML/StarCoder-1B,17,12.01,1.42,"1.35","1.66","1.9","SUCCESS"
|
||||||
|
A10G,TabbyML/StarCoder-1B,25,14.61,1.71,"1.68","1.97","2.07","SUCCESS"
|
||||||
|
A10G,TabbyML/StarCoder-1B,29,15.94,1.82,"1.79","2.11","2.17","FAILED"
|
||||||
|
A10G,TabbyML/StarCoder-1B,27,14.89,1.81,"1.79","2.03","2.37","FAILED"
|
||||||
|
A10G,TabbyML/StarCoder-1B,26,14.79,1.76,"1.72","2.08","2.22","SUCCESS"
|
||||||
|
A100,TabbyML/StarCoder-1B,33,13.16,2.51,"1.57","2.58","11.52","FAILED"
|
||||||
|
A100,TabbyML/StarCoder-1B,17,12.94,1.31,"1.29","1.46","1.53","SUCCESS"
|
||||||
|
A100,TabbyML/StarCoder-1B,25,16.98,1.47,"1.43","1.69","1.86","SUCCESS"
|
||||||
|
A100,TabbyML/StarCoder-1B,29,10.46,2.77,"2.8","3.11","3.18","FAILED"
|
||||||
|
A100,TabbyML/StarCoder-1B,27,13.58,1.99,"1.96","2.37","2.42","FAILED"
|
||||||
|
A100,TabbyML/StarCoder-1B,26,16.52,1.57,"1.47","1.96","2.19","SUCCESS"
|
||||||
|
T4,TabbyML/DeepseekCoder-1.3B,7,3.82,1.83,"1.86","1.94","1.96","FAILED"
|
||||||
|
T4,TabbyML/DeepseekCoder-1.3B,4,3.05,1.31,"1.32","1.45","1.49","SUCCESS"
|
||||||
|
T4,TabbyML/DeepseekCoder-1.3B,5,3.18,1.57,"1.54","1.92","1.97","SUCCESS"
|
||||||
|
T4,TabbyML/DeepseekCoder-1.3B,6,3.33,1.8,"1.84","1.92","1.95","FAILED"
|
||||||
|
A10G,TabbyML/DeepseekCoder-1.3B,17,11.66,1.46,1.46,1.65,1.77,SUCCESS
|
||||||
|
A10G,TabbyML/DeepseekCoder-1.3B,21,6.83,3.07,1.63,14.32,14.56,FAILED
|
||||||
|
A10G,TabbyML/DeepseekCoder-1.3B,19,12.63,1.5,1.5,1.69,1.78,SUCCESS
|
||||||
|
A10G,TabbyML/DeepseekCoder-1.3B,20,12.88,1.55,1.53,1.79,1.92,SUCCESS
|
||||||
|
A100,TabbyML/DeepseekCoder-1.3B,33,20.78,1.59,1.55,1.84,1.9,SUCCESS
|
||||||
|
A100,TabbyML/DeepseekCoder-1.3B,49,18.2,2.69,2.62,3.15,3.38,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-1.3B,41,16.44,2.49,2.36,2.9,3.49,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-1.3B,37,21.15,1.75,1.67,2.07,2.18,SUCCESS
|
||||||
|
A100,TabbyML/DeepseekCoder-1.3B,39,14.78,2.64,2.6,3.02,3.13,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-1.3B,38,20.88,1.82,1.76,2.08,2.19,FAILED
|
||||||
|
T4,TabbyML/StarCoder-3B,7,1.89,3.7,3.68,3.89,3.96,FAILED
|
||||||
|
T4,TabbyML/StarCoder-3B,4,1.53,2.62,2.62,2.75,2.78,FAILED
|
||||||
|
T4,TabbyML/StarCoder-3B,2,0.86,2.32,2.34,2.43,2.47,FAILED
|
||||||
|
A10G,TabbyML/StarCoder-3B,17,5.42,3.14,3.16,3.47,3.84,FAILED
|
||||||
|
A10G,TabbyML/StarCoder-3B,9,4.31,2.09,2.05,2.39,2.69,FAILED
|
||||||
|
A10G,TabbyML/StarCoder-3B,5,2.85,1.75,1.73,1.95,2.28,SUCCESS
|
||||||
|
A10G,TabbyML/StarCoder-3B,7,3.61,1.94,1.94,2.18,2.29,FAILED
|
||||||
|
A10G,TabbyML/StarCoder-3B,6,3.17,1.89,1.91,2.04,2.06,FAILED
|
||||||
|
A100,TabbyML/StarCoder-3B,33,8.12,4.07,4.12,4.53,4.61,FAILED
|
||||||
|
A100,TabbyML/StarCoder-3B,17,7.9,2.15,2.1,2.34,2.81,FAILED
|
||||||
|
A100,TabbyML/StarCoder-3B,9,4.47,2.01,1.99,2.19,2.26,FAILED
|
||||||
|
A100,TabbyML/StarCoder-3B,5,3.21,1.56,1.56,1.68,1.72,SUCCESS
|
||||||
|
A100,TabbyML/StarCoder-3B,7,4.29,1.63,1.61,1.78,1.8,SUCCESS
|
||||||
|
A100,TabbyML/StarCoder-3B,8,4.63,1.73,1.73,1.92,2.03,SUCCESS
|
||||||
|
A10G,TabbyML/DeepseekCoder-6.7B,5,1.3,3.85,3.83,4.25,4.31,FAILED
|
||||||
|
A10G,TabbyML/DeepseekCoder-6.7B,3,1.14,2.63,2.6,2.81,2.86,FAILED
|
||||||
|
A10G,TabbyML/DeepseekCoder-6.7B,2,0.83,2.4,2.4,2.48,2.5,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-6.7B,9,3.14,2.87,2.85,3.08,3.13,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-6.7B,5,2.08,2.4,2.46,2.58,2.63,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-6.7B,3,1.32,2.27,2.3,2.54,2.69,FAILED
|
||||||
|
A100,TabbyML/DeepseekCoder-6.7B,2,1.2,1.67,1.66,1.84,1.93,SUCCESS
|
||||||
|
A100,TabbyML/CodeLlama-7B,9,3.69,2.44,2.45,2.59,2.63,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-7B,5,2.14,2.34,2.31,2.61,3.26,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-7B,3,1.52,1.97,2.02,2.3,2.37,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-7B,5,2.37,2.11,2.13,2.24,2.26,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-7B,3,1.59,1.89,1.95,2.04,2.07,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-7B,2,1.45,1.38,1.39,1.54,1.56,SUCCESS
|
||||||
|
A100,TabbyML/CodeLlama-13B,5,1.21,4.14,4.15,4.38,4.5,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-13B,3,0.89,3.36,3.4,3.71,3.73,FAILED
|
||||||
|
A100,TabbyML/CodeLlama-13B,2,0.73,2.75,2.73,2.92,3.06,FAILED
|
||||||
|
|
|
@ -0,0 +1,107 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Append one result row to record.csv in the current directory.
#
# The row is "<GPU_CONFIG>,<MODEL_ID>,<PARALLELISM>,<$1>": the first three
# fields come from the environment variables exported by loadtest, and $1 is
# the already comma-joined remainder of the row (metrics plus status).
record() {
  # printf with quoted expansions writes each value verbatim; an unquoted echo
  # would word-split and glob-expand them.
  printf '%s,%s,%s,%s\n' "$GPU_CONFIG" "$MODEL_ID" "$PARALLELISM" "$1" >> record.csv
}
|
||||||
|
|
||||||
|
# Stop every deployed Modal app whose `modal app list` row mentions
# tabby-server-loadtest. Does nothing when no such row exists.
cleanup() {
  # The second whitespace-separated field of each matching row is taken as the
  # app id.
  MODAL_APP_ID=$(modal app list | grep tabby-server-loadtest | grep deployed | awk '{print $2}')

  local app_id
  # Intentionally unquoted: one id per matching row, split on whitespace. An
  # empty result yields zero iterations, so `modal app stop` is only ever
  # called with a real id.
  for app_id in $MODAL_APP_ID; do
    modal app stop "$app_id"
  done
}
|
||||||
|
|
||||||
|
# Deploy the Tabby server on Modal with the given configuration, run the k6
# load test against it, append the outcome to record.csv and tear the app down.
#
# Arguments:
#   $1  GPU type (exported as GPU_CONFIG)
#   $2  model id (exported as MODEL_ID)
#   $3  number of concurrent virtual users (exported as PARALLELISM)
# Returns:
#   1 when the deployment fails or the server does not report healthy in time;
#   otherwise the exit status of `k6 run` (0 when the run passed).
loadtest() {
  export GPU_CONFIG=$1
  export MODEL_ID=$2
  export PARALLELISM=$3

  # Progress output goes to stderr throughout.
  if ! modal deploy server.py >&2; then
    >&2 echo "modal deploy failed"
    return 1
  fi

  export TABBY_API_HOST=https://wsxiaoys--tabby-server-loadtest-app.modal.run

  # wait for warmup
  >&2 echo "Waiting for warmup..."

  local n=0
  while [[ "$(curl -s -o /dev/null -w '%{http_code}' "$TABBY_API_HOST/v1/health")" != "200" ]]; do
    if [ "$n" -ge 5 ]; then
      # error after 5 retries; stop the app so it is not left deployed.
      >&2 echo "Server did not become healthy"
      cleanup
      return 1
    fi

    sleep 10
    n=$((n+1))
  done

  >&2 echo "Start load testing..."

  >&2 k6 run loadtest.js
  SUCCESS=$?

  # metrics.txt is the file named by handleSummary in loadtest.js. If k6 died
  # before writing it, record an empty metrics field rather than erroring on
  # cat/rm.
  METRICS=""
  if [ -f metrics.txt ]; then
    METRICS=$(cat metrics.txt)
    rm -f metrics.txt
  fi

  if [ "$SUCCESS" -ne 0 ]; then
    record "$METRICS,FAILED"
  else
    record "$METRICS,SUCCESS"
  fi

  cleanup

  return "$SUCCESS"
}
|
||||||
|
|
||||||
|
# Binary-search an integer range, narrowing towards the largest candidate that
# a predicate command accepts.
#
# Arguments:
#   $1  lower bound of the range
#   $2  upper bound of the range
#   $3  command invoked as `<command> <candidate>`; exit status 0 means the
#       candidate passed
#
# The function prints nothing and does not report the final bound; any result
# is whatever the predicate itself records. All working variables are local so
# they cannot clash with names used elsewhere in the script.
function dichotomic_search {
  local min=$1
  local max=$2
  local command=$3
  local current

  while (( min < max )); do
    # Midpoint rounded up, so that `min=current` always makes progress.
    current=$(( (min + max + 1) / 2 ))

    # $command stays unquoted so that it may carry leading arguments.
    if $command "$current"; then
      min=$current
    else
      max=$((current - 1))
    fi
  done
}
|
||||||
|
|
||||||
|
# Run loadtest on a T4 GPU for the model named by $MODEL_ID.
#   $1  parallelism to test
# Returns the exit status of loadtest.
test_t4() {
  # Quoted so that an empty value keeps its argument position.
  loadtest T4 "$MODEL_ID" "$1"
}
|
||||||
|
|
||||||
|
# Run loadtest on an A10G GPU for the model named by $MODEL_ID.
#   $1  parallelism to test
# Returns the exit status of loadtest.
test_a10g() {
  # Quoted so that an empty value keeps its argument position.
  loadtest A10G "$MODEL_ID" "$1"
}
|
||||||
|
|
||||||
|
# Run loadtest on an A100 GPU for the model named by $MODEL_ID.
#   $1  parallelism to test
# Returns the exit status of loadtest.
test_a100() {
  # Quoted so that an empty value keeps its argument position.
  loadtest A100 "$MODEL_ID" "$1"
}
|
||||||
|
|
||||||
|
# Benchmark one model on T4, A10G and A100 in turn (judging by the name, meant
# for models in the 1B-3B range).
#   $1  model id, exported as MODEL_ID for the per-GPU test functions
test_1b3b_model() {
  MODEL_ID="$1"
  export MODEL_ID

  # Each entry is "<upper parallelism bound> <per-GPU test function>"; the
  # lower bound of every search is 1.
  local pair
  for pair in "12 test_t4" "32 test_a10g" "64 test_a100"; do
    dichotomic_search 1 "${pair% *}" "${pair#* }"
  done
}
|
||||||
|
|
||||||
|
# Benchmark one model on A100 only, searching parallelism between 1 and 8
# (judging by the name, meant for 7B-class models).
#   $1  model id, exported as MODEL_ID for test_a100
test_7b_model() {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 8 test_a100
}
|
||||||
|
|
||||||
|
# Benchmark one model on A100 only, searching parallelism between 1 and 8
# (judging by the name, meant for 13B-class models).
#   $1  model id, exported as MODEL_ID for test_a100
test_13b_model() {
  MODEL_ID="$1"
  export MODEL_ID

  dichotomic_search 1 8 test_a100
}
|
||||||
|
|
||||||
|
# test_7b_model TabbyML/CodeLlama-7B
|
||||||
|
test_13b_model TabbyML/CodeLlama-13B
|
||||||
|
|
@ -0,0 +1,97 @@
|
||||||
|
"""Usage:
|
||||||
|
modal serve server.py
|
||||||
|
"""
|
||||||
|
|
||||||
|
import os
|
||||||
|
from modal import Image, Stub, asgi_app
|
||||||
|
|
||||||
|
GPU_CONFIG = os.environ.get("GPU_CONFIG", "T4")
|
||||||
|
IMAGE_NAME = "tabbyml/tabby:0.6.0"
|
||||||
|
MODEL_ID = os.environ.get("MODEL_ID", "TabbyML/StarCoder-1B")
|
||||||
|
PARALLELISM = os.environ.get("PARALLELISM", "4")
|
||||||
|
|
||||||
|
|
||||||
|
def download_model():
    """Download the model named by the ``MODEL_ID`` environment variable.

    Runs ``/opt/tabby/bin/tabby download --model <MODEL_ID>`` and waits for it
    to finish.

    Raises:
        RuntimeError: if ``MODEL_ID`` is unset or empty.
        subprocess.CalledProcessError: if the download command exits non-zero.
    """
    import os
    import subprocess

    model_id = os.environ.get("MODEL_ID")
    if not model_id:
        raise RuntimeError("MODEL_ID environment variable is not set")

    # check=True turns a failed download into an error instead of letting the
    # caller carry on without the model.
    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            model_id,
        ],
        check=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
image = (
|
||||||
|
Image.from_registry(
|
||||||
|
IMAGE_NAME,
|
||||||
|
add_python="3.11",
|
||||||
|
)
|
||||||
|
.env({"MODEL_ID": MODEL_ID})
|
||||||
|
.dockerfile_commands("ENTRYPOINT []")
|
||||||
|
.run_function(download_model)
|
||||||
|
.pip_install("asgi-proxy-lib")
|
||||||
|
.env({"PARALLELISM": PARALLELISM})
|
||||||
|
)
|
||||||
|
|
||||||
|
stub = Stub("tabby-server-loadtest", image=image)
|
||||||
|
|
||||||
|
|
||||||
|
@stub.function(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=int(PARALLELISM),
    container_idle_timeout=120,
    timeout=360,
)
@asgi_app()
def app():
    """Start ``tabby serve`` as a subprocess and return an ASGI proxy to it.

    The model id and parallelism are read from the ``MODEL_ID`` and
    ``PARALLELISM`` environment variables. The function blocks until
    127.0.0.1:8000 accepts TCP connections.

    Raises:
        RuntimeError: if the tabby process exits before the port opens.
    """
    import os
    import socket
    import subprocess
    import time

    from asgi_proxy import asgi_proxy

    command = [
        "/opt/tabby/bin/tabby",
        "serve",
        "--model",
        os.environ.get("MODEL_ID"),
        "--port",
        "8000",
        "--device",
        "cuda",
        "--parallelism",
        os.environ.get("PARALLELISM"),
    ]
    # Child environment: a copy of ours with usage collection switched off.
    child_env = {**os.environ, "TABBY_DISABLE_USAGE_COLLECTION": "1"}
    launcher = subprocess.Popen(command, env=child_env)

    # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
    while True:
        try:
            socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
            break
        except (socket.timeout, ConnectionRefusedError):
            # If the launcher has already exited, a connection can never be
            # made, so give up instead of polling forever.
            retcode = launcher.poll()
            if retcode is not None:
                raise RuntimeError(f"launcher exited unexpectedly with code {retcode}")
        time.sleep(1.0)

    print("Tabby server ready!")
    return asgi_proxy("http://localhost:8000")
|
||||||
|
|
@ -1,30 +0,0 @@
|
||||||
import http from "k6/http";
|
|
||||||
import { check, group, sleep, abortTest } from "k6";
|
|
||||||
|
|
||||||
export const options = {
|
|
||||||
stages: [
|
|
||||||
{ duration: "5s", target: 8 },
|
|
||||||
{ duration: "20s", target: 8 },
|
|
||||||
{ duration: "5s", target: 0 },
|
|
||||||
],
|
|
||||||
// Below thresholds are tested against TabbyML/StarCoder-1B served by NVIDIA T4 GPU.
|
|
||||||
thresholds: {
|
|
||||||
http_req_failed: ['rate<0.01'], // http errors should be less than 1%
|
|
||||||
http_req_duration: ["med<1800", "avg<1800", "p(95)<2000"],
|
|
||||||
},
|
|
||||||
};
|
|
||||||
|
|
||||||
export default () => {
|
|
||||||
const payload = JSON.stringify({
|
|
||||||
language: "python",
|
|
||||||
segments: {
|
|
||||||
prefix: "def binarySearch(arr, left, right, x):\n mid = (left +"
|
|
||||||
},
|
|
||||||
});
|
|
||||||
const headers = { "Content-Type": "application/json" };
|
|
||||||
const res = http.post(`${__ENV.TABBY_API_HOST}/v1/completions`, payload, {
|
|
||||||
headers,
|
|
||||||
});
|
|
||||||
check(res, { success: (r) => r.status === 200 });
|
|
||||||
sleep(0.5);
|
|
||||||
};
|
|
||||||
Loading…
Reference in New Issue