feat(experimental): use modal for easier evaluation. (#594)
parent
0dc7e98232
commit
257dba64a5
|
|
@ -2,4 +2,5 @@
|
||||||
node_modules
|
node_modules
|
||||||
.idea/
|
.idea/
|
||||||
.DS_Store
|
.DS_Store
|
||||||
.vscode/
|
.vscode/
|
||||||
|
__pycache__
|
||||||
|
|
|
||||||
|
|
@ -1,4 +0,0 @@
|
||||||
__pycache__
|
|
||||||
.ipynb_checkpoints
|
|
||||||
reports.*
|
|
||||||
tabby
|
|
||||||
|
|
@ -1,7 +0,0 @@
|
||||||
# Eval
|
|
||||||
|
|
||||||
## Local
|
|
||||||
`./eval.sh`
|
|
||||||
|
|
||||||
## Skypilot
|
|
||||||
`./eval_sky.sh`
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
[[repositories]]
|
|
||||||
git_url = "https://github.com/huggingface/text-generation-inference"
|
|
||||||
|
|
@ -1,11 +0,0 @@
|
||||||
version: '3.5'
|
|
||||||
services:
|
|
||||||
tabby:
|
|
||||||
command: serve --model TabbyML/SantaCoder-1B --device cuda
|
|
||||||
deploy:
|
|
||||||
resources:
|
|
||||||
reservations:
|
|
||||||
devices:
|
|
||||||
- driver: nvidia
|
|
||||||
count: 1
|
|
||||||
capabilities: [gpu]
|
|
||||||
|
|
@ -1,21 +0,0 @@
|
||||||
version: '3.5'
|
|
||||||
|
|
||||||
services:
|
|
||||||
scheduler:
|
|
||||||
image: tabbyml/tabby
|
|
||||||
platform: linux/amd64
|
|
||||||
command: scheduler --now
|
|
||||||
volumes:
|
|
||||||
- "$PWD/tabby:/data"
|
|
||||||
|
|
||||||
tabby:
|
|
||||||
depends_on:
|
|
||||||
- scheduler
|
|
||||||
restart: always
|
|
||||||
image: tabbyml/tabby
|
|
||||||
platform: linux/amd64
|
|
||||||
command: serve --model TabbyML/T5P-220M
|
|
||||||
ports:
|
|
||||||
- "8080:8080"
|
|
||||||
volumes:
|
|
||||||
- "$PWD/tabby:/data"
|
|
||||||
|
|
@ -1,24 +0,0 @@
|
||||||
#!/bin/bash
# Local evaluation driver: bring the tabby stack up with docker-compose, wait
# until the health endpoint answers, run main.py over the dataset and write
# the result to reports.jsonl.
set -ex

# Copy config.toml into ./tabby, the directory the compose file mounts at /data.
mkdir -p tabby
cp config.toml tabby/

# Start from a clean slate in case a previous run left containers behind.
docker-compose down

# With `set -e` any failing step aborts the script; tear the stack down on
# every exit path so a failed run does not leave containers running.
trap 'docker-compose down' EXIT

# Add the CUDA override only when an NVIDIA GPU is usable on this host.
if nvidia-smi; then
  docker-compose -f docker-compose.yaml -f docker-compose.cuda.yaml up -d
else
  docker-compose up -d
fi

while ! curl -X POST http://localhost:8080/v1/health; do
  echo "server not ready, waiting..."
  sleep 5
done

# The glob is quoted on purpose: main.py expands the pattern itself.
python main.py "./tabby/dataset/*.jsonl" "${MAX_RECORDS:-3}" > reports.jsonl

echo done
|
|
||||||
|
|
@ -1,10 +0,0 @@
|
||||||
#!/bin/bash
# Run the evaluation on a SkyPilot cluster and fetch the report.
set -ex

# Cluster name, task file and env override shared by `sky exec` and
# `sky launch`. An array keeps each argument intact without relying on
# unquoted word splitting.
ARGS=(tabby-eval skypilot.yaml --env MAX_RECORDS=300)

# Try the existing cluster first; launch it when `sky exec` fails.
if ! sky exec "${ARGS[@]}"; then
  sky launch -c "${ARGS[@]}"
fi

scp tabby-eval:~/sky_workdir/reports.jsonl ./
|
|
||||||
|
|
@ -1,90 +0,0 @@
|
||||||
import sys
|
|
||||||
import argparse
|
|
||||||
import pandas as pd
|
|
||||||
import logging
|
|
||||||
|
|
||||||
from tabby_client import Client
|
|
||||||
from tabby_client.api.v1 import health
|
|
||||||
from tabby_client.api.v1 import completion
|
|
||||||
|
|
||||||
from tabby_client.models import CompletionRequest, CompletionRequest, Segments, Choice
|
|
||||||
|
|
||||||
import processing
|
|
||||||
import editdistance
|
|
||||||
import random
|
|
||||||
|
|
||||||
|
|
||||||
def valid_item(item: "processing.Item"):
    """Return True when ``item.body`` spans at most 10 lines.

    Args:
        item: Candidate item; only its ``body`` attribute is read.

    Returns:
        ``False`` for bodies longer than 10 lines, ``True`` otherwise
        (including an empty body).
    """
    return len(item.body.splitlines()) <= 10
|
|
||||||
|
|
||||||
|
|
||||||
def scorer(label, prediction):
    """Score ``prediction`` against ``label`` in the range [0.0, 1.0].

    The score is ``1 - edit_distance / len(label)``, clamped at 0.0 so that a
    prediction much longer than the label cannot go negative.

    Args:
        label: Expected text.
        prediction: Text produced by the model.

    Returns:
        1.0 for an exact match, decreasing towards 0.0 as the edit distance
        grows. An empty label scores 1.0 only against an empty prediction.
    """
    if not label:
        # Normalising by len(label) would divide by zero; an empty label can
        # only be matched exactly by an empty prediction.
        return 1.0 if not prediction else 0.0

    distance = editdistance.eval(label, prediction)
    return max(0.0, 1.0 - distance / len(label))
|
|
||||||
|
|
||||||
|
|
||||||
def run_eval(args):
    """Yield one scored record per evaluated dataset item.

    Items are read from ``args.filepattern``, filtered with ``valid_item`` and,
    when there are more than ``args.max_records``, down-sampled with a fixed
    seed. Each item's prefix is sent to the completion endpoint and the first
    choice is scored against the item's body.

    Args:
        args: Namespace providing ``filepattern`` and ``max_records``.

    Yields:
        dict with keys ``prompt``, ``prediction``, ``label``, ``block_score``
        and ``line_score``. Nothing is yielded when the health check fails.
    """
    api = "http://localhost:8080"
    client = Client(base_url=api, timeout=50)
    try:
        health.sync(client=client)
    except Exception:
        # Report on the logging stream rather than stdout: the script prints
        # the JSONL report to stdout, so a message there would end up inside
        # the redirected report file.
        logging.error(
            "Tabby Server is not ready, please check if '%s' is correct.", api
        )
        return

    items = [
        x for x in processing.items_from_filepattern(args.filepattern) if valid_item(x)
    ]
    if len(items) > args.max_records:
        # A dedicated, fixed-seed generator gives the same sample as seeding
        # the module-level RNG, without clobbering global random state.
        items = random.Random(0xBADBEEF).sample(items, args.max_records)

    for item in items:
        request = CompletionRequest(
            language=item.language, segments=Segments(prefix=item.prefix)
        )

        resp = completion.sync(client=client, json_body=request)
        label = item.body
        prediction = resp.choices[0].text

        block_score = scorer(label, prediction)

        label_lines = label.splitlines()
        prediction_lines = prediction.splitlines()

        if label_lines and prediction_lines:
            line_score = scorer(label_lines[0], prediction_lines[0])
        else:
            # No first line to compare on one side. Always assign a value so
            # the record never reuses the previous item's score or hits an
            # unbound name on the first iteration.
            line_score = 0.0

        yield dict(
            prompt=item.prefix,
            prediction=prediction,
            label=label,
            block_score=block_score,
            line_score=line_score,
        )
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Logs go to stderr so that stdout carries only the JSONL report.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="SxS eval for tabby",
        epilog="Example usage: python main.py ./tabby/dataset/data.jsonl 5 > output.jsonl",
    )
    parser.add_argument("filepattern", type=str, help="File pattern to dataset.")
    parser.add_argument(
        "max_records", type=int, help="Max number of records to be evaluated."
    )
    args = parser.parse_args()
    logging.info("args %s", args)

    # Collect every record yielded by run_eval and emit one JSON object per line.
    df = pd.DataFrame(run_eval(args))
    print(df.to_json(orient="records", lines=True))
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -1,88 +0,0 @@
|
||||||
from typing import Iterator
|
|
||||||
|
|
||||||
import glob
|
|
||||||
import json
|
|
||||||
from dataclasses import dataclass
|
|
||||||
|
|
||||||
|
|
||||||
@dataclass
class Item:
    """One evaluation sample: a tagged span of a source file plus its context."""

    # Document-level fields, copied from the dataset record.
    git_url: str
    filepath: str
    language: str

    # Tag-level fields: the tag's name, the text of the tagged span, and the
    # text surrounding that span.
    name: str
    body: str
    prefix: str
    suffix: str
|
|
||||||
|
|
||||||
|
|
||||||
def iter_items(doc) -> "Iterator[Item]":
    """Yield one ``Item`` per tag of ``doc``, skipping low-quality documents.

    A document is skipped entirely when its ``max_line_length`` exceeds 500,
    its ``avg_line_length`` is outside [10, 200], or its ``alphanum_fraction``
    is below 0.25.

    Args:
        doc: Mapping with the keys ``max_line_length``, ``avg_line_length``,
            ``alphanum_fraction``, ``tags``, ``content``, ``git_url``,
            ``filepath`` and ``language``. Each tag provides ``name_range``
            and ``range`` mappings with ``start`` / ``end`` offsets.

    Yields:
        An ``Item`` for every entry in ``doc["tags"]``.
    """
    if doc["max_line_length"] > 500:
        return

    if doc["avg_line_length"] < 10 or doc["avg_line_length"] > 200:
        return

    if doc["alphanum_fraction"] < 0.25:
        return

    for tag in doc["tags"]:
        content = doc["content"]
        name = get_content(content, tag["name_range"])
        body = get_content(content, tag["range"])

        # Context on either side of the tagged span.
        prefix = get_prefix(content, tag["range"]["start"])
        suffix = get_suffix(content, tag["range"]["end"])

        yield Item(
            name=name,
            body=body,
            prefix=prefix,
            suffix=suffix,
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )
|
|
||||||
|
|
||||||
|
|
||||||
def iter_docs(filepattern: str):
    """Yield every JSON document from the JSONL files matching ``filepattern``.

    Files are visited in sorted path order. ``glob`` makes no ordering promise,
    and a stable order is what lets a fixed-seed sample of the resulting items
    be reproducible across machines.

    Args:
        filepattern: Glob pattern selecting the JSONL files to read.

    Yields:
        The decoded object of each non-blank line.
    """
    for filepath in sorted(glob.glob(filepattern)):
        # Read as UTF-8 explicitly instead of the platform default encoding.
        with open(filepath, encoding="utf-8") as f:
            for line in f:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                if line.strip():
                    yield json.loads(line)
|
|
||||||
|
|
||||||
|
|
||||||
def get_content(content: str, range: dict):
    """Return the slice of ``content`` described by ``range``.

    Args:
        content: Full text to slice.
        range: Mapping with ``start`` (inclusive) and ``end`` (exclusive)
            offsets into ``content``.

    Returns:
        ``content[range["start"]:range["end"]]``.
    """
    return content[range["start"] : range["end"]]
|
|
||||||
|
|
||||||
|
|
||||||
def get_prefix(content: str, start: int, max=20):
    """Return the text immediately before ``start``, limited to ``max`` newlines.

    The scan walks backwards from ``start - 1``, counting newline characters.
    Once ``max`` newlines have been seen, the prefix begins right after the
    last one counted. When the beginning of ``content`` is reached first, the
    whole of ``content[:start]`` is returned.

    Args:
        content: Full text.
        start: Offset at which the prefix ends (exclusive).
        max: Number of newlines after which the scan stops.

    Returns:
        The prefix text, possibly empty.
    """
    num_lines = 0
    # Scan down to index 0 inclusive so that a prefix reaching the start of
    # the content keeps its first characters.
    for idx in range(start - 1, -1, -1):
        if content[idx] == "\n":
            num_lines += 1

        if num_lines == max:
            return content[idx + 1 : start]

    return content[:start]
|
|
||||||
|
|
||||||
|
|
||||||
def get_suffix(content: str, end: int, max=20):
    """Return the text immediately after ``end``, limited to ``max`` newlines.

    The scan walks forward from ``end``, counting newline characters. Once
    ``max`` newlines have been seen, the suffix stops just before the last one
    counted. When the end of ``content`` is reached first, the whole of
    ``content[end:]`` is returned.

    Args:
        content: Full text.
        end: Offset at which the suffix starts (inclusive).
        max: Number of newlines after which the scan stops.

    Returns:
        The suffix text, possibly empty.
    """
    num_lines = 0
    for idx in range(end, len(content)):
        if content[idx] == "\n":
            num_lines += 1

        if num_lines == max:
            # Slice up to the stopping index itself. Subtracting one here
            # would drop the last character before the cut, and for idx == 0
            # would wrap around to a negative index.
            return content[end:idx]

    return content[end:]
|
|
||||||
|
|
||||||
|
|
||||||
def items_from_filepattern(filepattern: str):
    """Yield every item produced by ``iter_items`` for each document matching ``filepattern``.

    Args:
        filepattern: Glob pattern passed through to ``iter_docs``.

    Yields:
        Items in document order, then tag order within each document.
    """
    for doc in iter_docs(filepattern):
        yield from iter_items(doc)
|
|
||||||
|
|
@ -1,2 +0,0 @@
|
||||||
git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client
|
|
||||||
editdistance
|
|
||||||
File diff suppressed because one or more lines are too long
|
|
@ -1,25 +0,0 @@
|
||||||
resources:
|
|
||||||
accelerators: T4:1
|
|
||||||
|
|
||||||
# tabby base dir
|
|
||||||
workdir: ./
|
|
||||||
|
|
||||||
setup: |
|
|
||||||
set -ex
|
|
||||||
|
|
||||||
# On some cloud providers, docker-compose is not installed by default.
|
|
||||||
sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
|
|
||||||
sudo chmod a+x /usr/local/bin/docker-compose
|
|
||||||
|
|
||||||
# Install tabby python client.
|
|
||||||
pip install -r requirements.txt
|
|
||||||
|
|
||||||
# On certain cloud providers (e.g lambda cloud), the default user is not added to docker group, so we need sudo here
|
|
||||||
sudo docker-compose pull
|
|
||||||
|
|
||||||
# Add current user to docker group, it won't take effect immediately as skypilot job is started by a long-running daemon.
|
|
||||||
sudo usermod -aG docker $USER
|
|
||||||
|
|
||||||
|
|
||||||
run: |
|
|
||||||
./eval.sh
|
|
||||||
|
|
@ -0,0 +1,107 @@
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
|
||||||
|
|
||||||
|
# GPU type requested for the model container.
GPU_CONFIG = gpu.T4()

# Model downloaded at image build time and served at container start.
MODEL_ID = "TabbyML/StarCoder-1B"

# Arguments appended to the tabby binary when the server is launched.
LAUNCH_FLAGS = [
    "serve",
    "--model",
    MODEL_ID,
    "--port",
    "8000",
    "--device",
    "cuda",
]
|
||||||
|
|
||||||
|
def download_model():
    """Download ``MODEL_ID`` with the tabby binary.

    Raises:
        subprocess.CalledProcessError: If the download command exits with a
            non-zero status.
    """
    import subprocess

    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            MODEL_ID,
        ],
        # Fail loudly: without this a failed download would go unnoticed and
        # the step would appear to succeed without the model.
        check=True,
    )
|
||||||
|
|
||||||
|
|
||||||
|
# Container image: the published tabby image with Python added, its
# ENTRYPOINT cleared, the model downloaded via download_model, and the tabby
# Python client installed.
image = (
    Image.from_registry("tabbyml/tabby:0.3.0", add_python="3.11")
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .pip_install(
        "git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client"
    )
)

# App name is derived from the model name, e.g. "tabby-StarCoder-1B".
stub = Stub("tabby-" + MODEL_ID.split("/")[-1], image=image)
|
||||||
|
|
||||||
|
|
||||||
|
@stub.cls(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=60 * 10,
    timeout=360,
)
class Model:
    """Runs a tabby server as a subprocess and proxies completion requests to it."""

    def __enter__(self):
        """Launch the tabby server and block until it accepts connections.

        Raises:
            RuntimeError: If the server process exits before it becomes
                reachable.
        """
        import socket
        import subprocess
        import time

        from tabby_client import Client

        self.launcher = subprocess.Popen(["/opt/tabby/bin/tabby"] + LAUNCH_FLAGS)
        self.client = Client("http://127.0.0.1:8000")

        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
        def webserver_ready():
            try:
                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
            except (socket.timeout, ConnectionRefusedError):
                # Check if launcher webserving process has exited.
                # If so, a connection can never be made.
                retcode = self.launcher.poll()
                if retcode is not None:
                    raise RuntimeError(
                        f"launcher exited unexpectedly with code {retcode}"
                    )
                return False
            return True

        while not webserver_ready():
            time.sleep(1.0)

        print("Tabby server ready!")

    def __exit__(self, _exc_type, _exc_value, _traceback):
        """Ask the tabby server process to terminate."""
        self.launcher.terminate()

    @method()
    async def complete(self, language: str, prompt: str):
        """Request a completion from the local tabby server.

        Args:
            language: Language identifier sent with the request.
            prompt: Text to complete.

        Returns:
            The text of the first choice in the response.
        """
        from tabby_client.api.v1 import completion
        from tabby_client.models import CompletionRequest, CompletionResponse

        request = CompletionRequest(language=language, prompt=prompt)
        resp: CompletionResponse = await completion.asyncio(
            client=self.client, json_body=request
        )
        return resp.choices[0].text
|
||||||
|
|
||||||
|
|
||||||
|
@stub.local_entrypoint()
def main():
    """Complete every sample in ./sample.jsonl and write results to ./output.jsonl.

    Each input line is a JSON object with ``crossfile_context.text``,
    ``prompt`` and ``groundtruth``. Each output line is a JSON object with
    ``prompt``, ``label`` and ``prediction``.
    """
    import json

    model = Model()
    # Open the input first so a missing sample file does not truncate an
    # existing output file.
    with open("./sample.jsonl") as fin, open("./output.jsonl", "w") as fout:
        for line in fin:
            if not line.strip():
                # Skip blank lines such as a trailing newline.
                continue

            x = json.loads(line)
            prompt = x['crossfile_context']['text'] + x['prompt']
            label = x['groundtruth']
            prediction = model.complete.remote("python", prompt)

            # One JSON object per line. json.dump alone writes no separator,
            # which would run all records together on a single line.
            record = dict(prompt=prompt, label=label, prediction=prediction)
            fout.write(json.dumps(record) + "\n")
|
||||||
|
|
@ -1,23 +0,0 @@
|
||||||
import streamlit as st
|
|
||||||
import pandas as pd
|
|
||||||
import altair as alt
|
|
||||||
|
|
||||||
# Streamlit page rendering reports.jsonl: one three-column row per record.
# NOTE(review): the nesting below was reconstructed from statement order
# because the indentation was lost; confirm against the rendered page.
st.set_page_config(layout="wide")

df = pd.read_json("reports.jsonl", lines=True)

for _, v in df.iterrows():
    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("prompt")
        st.code(v.prompt)
    with col2:
        st.write("prediction")
        st.code(v.prediction)
        st.write("label")
        st.code(v.label)
    with col3:
        # NOTE(review): these sub-columns are created but never entered, so
        # both metrics render directly in col3 - confirm this is intended.
        col1, col2 = st.columns(2)
        st.metric("Line score", v.line_score)
        st.metric("Block score", v.block_score)
    st.divider()
|
|
||||||
Loading…
Reference in New Issue