feat(experimental): use modal for easier evaluation. (#594)

r0.4
Meng Zhang 2023-10-18 23:33:27 -07:00 committed by GitHub
parent 0dc7e98232
commit 257dba64a5
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
16 changed files with 210 additions and 308 deletions

1
.gitignore vendored
View File

@ -3,3 +3,4 @@ node_modules
.idea/
.DS_Store
.vscode/
__pycache__

View File

@ -1,4 +0,0 @@
# Python bytecode caches.
__pycache__
# Jupyter autosave checkpoints.
.ipynb_checkpoints
# Generated eval reports (reports.jsonl etc.).
reports.*
# Local tabby data dir created by eval.sh.
tabby

View File

@ -1,7 +0,0 @@
# Eval

## Local

Run the evaluation against a locally started tabby server:

`./eval.sh`

## Skypilot

Run the evaluation on a cloud GPU provisioned through SkyPilot:

`./eval_sky.sh`

View File

@ -1,2 +0,0 @@
# Repository indexed by the tabby scheduler to build the eval dataset.
[[repositories]]
git_url = "https://github.com/huggingface/text-generation-inference"

View File

@ -1,11 +0,0 @@
version: '3.5'
services:
  tabby:
    # GPU override for the base compose file: serve on CUDA.
    # NOTE(review): this override also switches the model (SantaCoder-1B vs
    # the base file's T5P-220M) — confirm that is intended.
    command: serve --model TabbyML/SantaCoder-1B --device cuda
    deploy:
      resources:
        reservations:
          devices:
            # Reserve one NVIDIA GPU for the tabby service.
            - driver: nvidia
              count: 1
              capabilities: [gpu]

View File

@ -1,21 +0,0 @@
version: '3.5'
services:
  # Runs `tabby scheduler --now` once to prepare /data before serving.
  scheduler:
    image: tabbyml/tabby
    platform: linux/amd64
    command: scheduler --now
    volumes:
      - "$PWD/tabby:/data"
  tabby:
    # Serving container; restarts until the scheduler has produced its data.
    depends_on:
      - scheduler
    restart: always
    image: tabbyml/tabby
    platform: linux/amd64
    command: serve --model TabbyML/T5P-220M
    ports:
      - "8080:8080"
    volumes:
      # Same host dir as the scheduler so the server sees its output.
      - "$PWD/tabby:/data"

View File

@ -1,24 +0,0 @@
#!/bin/bash
# Run the eval pipeline locally: start a tabby server via docker-compose,
# wait until it is healthy, score the dataset, then shut everything down.
set -ex

mkdir -p tabby
cp config.toml tabby/

# Start from a clean slate in case a previous stack is still running.
docker-compose down

# Use the CUDA override when a GPU is visible to the host.
if nvidia-smi; then
  docker-compose -f docker-compose.yaml -f docker-compose.cuda.yaml up -d
else
  docker-compose up -d
fi

# Poll the health endpoint until the server accepts requests.
# FIX: -f makes curl exit non-zero on HTTP error statuses, so a 5xx from a
# half-started server keeps us waiting instead of passing; -s hides the
# progress meter in the loop.
while ! curl -sf -X POST http://localhost:8080/v1/health; do
  echo "server not ready, waiting..."
  sleep 5
done

# MAX_RECORDS defaults to 3 for quick local smoke runs.
python main.py "./tabby/dataset/*.jsonl" ${MAX_RECORDS:-3} > reports.jsonl

docker-compose down
echo done

View File

@ -1,10 +0,0 @@
#!/bin/bash
# Run the eval on a cloud GPU via SkyPilot, then copy the report back.
set -ex

# Cluster name, task yaml and env override shared by `sky exec` and
# `sky launch`. Intentionally left unquoted below so the string splits
# into separate CLI arguments.
ARGS="tabby-eval skypilot.yaml --env MAX_RECORDS=300"

# Reuse the existing cluster when possible; otherwise launch a new one.
if ! sky exec $ARGS; then
  sky launch -c $ARGS
fi

# Fetch the generated report from the cluster's working directory.
scp tabby-eval:~/sky_workdir/reports.jsonl ./

View File

@ -1,90 +0,0 @@
import sys
import argparse
import pandas as pd
import logging
from tabby_client import Client
from tabby_client.api.v1 import health
from tabby_client.api.v1 import completion
from tabby_client.models import CompletionRequest, CompletionRequest, Segments, Choice
import processing
import editdistance
import random
def valid_item(item: processing.Item):
    """Keep only items whose declaration body spans at most 10 lines."""
    return len(item.body.splitlines()) <= 10
def scorer(label, prediction):
    """Return an edit-distance similarity score in [0.0, 1.0].

    1.0 means the prediction matches the label exactly; 0.0 means the edit
    distance is at least as large as the label itself.

    FIX: guard against an empty label, which previously raised
    ZeroDivisionError in the normalization step.
    """
    if not label:
        # An empty label is a perfect match only for an empty prediction.
        return 1.0 if not prediction else 0.0
    distance = editdistance.eval(label, prediction)
    return max(0.0, 1.0 - distance / len(label))
def run_eval(args):
    """Yield one eval record per sampled dataset item.

    Each record carries the prompt sent to the server, the model prediction,
    the ground-truth label, and block/line edit-distance scores.

    Returns early (yielding nothing) when the local tabby server does not
    answer the health check.
    """
    api = "http://localhost:8080"
    client = Client(base_url=api, timeout=50)
    try:
        health.sync(client=client)
    # FIX: was a bare `except:`, which also swallowed KeyboardInterrupt.
    except Exception:
        print(f"Tabby Server is not ready, please check if '{api}' is correct.")
        return

    items = [
        x for x in processing.items_from_filepattern(args.filepattern) if valid_item(x)
    ]
    if len(items) > args.max_records:
        # Fixed seed so repeated runs sample the same subset.
        random.seed(0xBADBEEF)
        items = random.sample(items, args.max_records)

    for item in items:
        # NOTE: the redundant per-item valid_item() re-check was removed —
        # items are already filtered above.
        request = CompletionRequest(
            language=item.language, segments=Segments(prefix=item.prefix)
        )
        # FIX: dropped the `CompletionResponse` annotation; that name was
        # never imported in this module.
        resp = completion.sync(client=client, json_body=request)

        label = item.body
        prediction = resp.choices[0].text

        block_score = scorer(label, prediction)

        label_lines = label.splitlines()
        prediction_lines = prediction.splitlines()
        # FIX: default to 0.0 so line_score is always bound; previously it
        # was only assigned when both sides had at least one line.
        line_score = 0.0
        if label_lines and prediction_lines:
            line_score = scorer(label_lines[0], prediction_lines[0])

        yield dict(
            prompt=item.prefix,
            prediction=prediction,
            label=label,
            block_score=block_score,
            line_score=line_score,
        )
if __name__ == "__main__":
    # Log to stderr so stdout stays clean for the JSONL report.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    arg_parser = argparse.ArgumentParser(
        description="SxS eval for tabby",
        epilog="Example usage: python main.py ./tabby/dataset/data.jsonl 5 > output.jsonl",
    )
    arg_parser.add_argument("filepattern", type=str, help="File pattern to dataset.")
    arg_parser.add_argument(
        "max_records", type=int, help="Max number of records to be evaluated."
    )
    cli_args = arg_parser.parse_args()
    logging.info("args %s", cli_args)

    # Materialize the generator into a DataFrame and emit JSON Lines on stdout.
    report = pd.DataFrame(run_eval(cli_args))
    print(report.to_json(orient="records", lines=True))

File diff suppressed because one or more lines are too long

View File

@ -1,88 +0,0 @@
from typing import Iterator
import glob
import json
from dataclasses import dataclass
@dataclass
class Item:
    """One eval example extracted from a tabby scheduler dataset document."""

    # Source repository URL the snippet came from.
    git_url: str
    # Path of the file within that repository.
    filepath: str
    # Programming language of the snippet, as tagged in the dataset.
    language: str
    # Name of the tagged declaration (from the tag's name_range).
    name: str
    # Full text of the tagged declaration; used as the eval label.
    body: str
    # Context immediately preceding the declaration (see get_prefix).
    prefix: str
    # Context immediately following the declaration (see get_suffix).
    suffix: str
def iter_items(doc) -> Iterator[Item]:
    """Yield one Item per tagged declaration in a single dataset document.

    Documents failing the cheap line-statistics filters (very long lines,
    abnormal average line length, low alphanumeric fraction — presumably
    minified/generated content) are skipped entirely.
    """
    if doc["max_line_length"] > 500:
        return
    if doc["avg_line_length"] < 10 or doc["avg_line_length"] > 200:
        return
    if doc["alphanum_fraction"] < 0.25:
        return

    # FIX: hoisted out of the loop — the content is identical for every tag.
    content = doc["content"]
    for tag in doc["tags"]:
        yield Item(
            name=get_content(content, tag["name_range"]),
            body=get_content(content, tag["range"]),
            prefix=get_prefix(content, tag["range"]["start"]),
            suffix=get_suffix(content, tag["range"]["end"]),
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )
def iter_docs(filepattern: str):
    """Stream decoded JSON records from every file matching *filepattern*."""
    for path in glob.glob(filepattern):
        with open(path) as fp:
            yield from (json.loads(raw_line) for raw_line in fp)
def get_content(content: str, span: dict):
    """Return the substring of *content* covered by *span*.

    *span* is a dict with integer "start" and "end" character offsets
    (end-exclusive). Parameter renamed from `range`, which shadowed the
    builtin; all in-file callers pass it positionally.
    """
    return content[span["start"] : span["end"]]
def get_prefix(content: str, start: int, max=20):
    """Return up to *max* lines of context immediately preceding *start*.

    Scans backwards from *start*; when the *max*-th newline is found the
    prefix begins just after it, otherwise the prefix extends to the start
    of *content*.

    FIX: the original scanned `range(start - 1, 0, -1)` (never reaching
    index 0) and returned `content[prefix_start + 1 : start]` with
    prefix_start left at the last scanned index, silently dropping the
    leading characters whenever fewer than *max* newlines precede *start*.
    """
    num_lines = 0
    prefix_start = -1  # sentinel: no boundary newline found -> slice from 0
    for i in range(start - 1, -1, -1):
        if content[i] == "\n":
            num_lines += 1
            if num_lines == max:
                prefix_start = i
                break
    return content[prefix_start + 1 : start]
def get_suffix(content: str, end: int, max=20):
    """Return up to *max* lines of context immediately following *end*.

    Scans forward from *end*; when the *max*-th newline is found the suffix
    stops just before it (symmetric with get_prefix), otherwise it extends
    to the end of *content*.

    FIX: the original returned `content[end : suffix_end - 1]`, dropping the
    character before the stopping newline — and, when fewer than *max*
    newlines follow *end*, dropping the final two characters of *content*.
    """
    num_lines = 0
    suffix_end = len(content)  # default: no boundary newline -> slice to EOF
    for i in range(end, len(content)):
        if content[i] == "\n":
            num_lines += 1
            if num_lines == max:
                suffix_end = i
                break
    return content[end:suffix_end]
def items_from_filepattern(filepattern: str):
    """Flatten every dataset file matching *filepattern* into a stream of Items."""
    for document in iter_docs(filepattern):
        for item in iter_items(document):
            yield item

View File

@ -1,2 +0,0 @@
# Tabby API client, installed straight from the monorepo subdirectory.
git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client
# Edit-distance scoring used by main.py.
editdistance

File diff suppressed because one or more lines are too long

View File

@ -1,25 +0,0 @@
resources:
  # Request a single NVIDIA T4 for the serving container.
  accelerators: T4:1

# tabby base dir
workdir: ./

setup: |
  set -ex
  # On some cloud providers, docker-compose is not installed by default.
  sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
  sudo chmod a+x /usr/local/bin/docker-compose
  # Install tabby python client.
  pip install -r requirements.txt
  # On certain cloud providers (e.g lambda cloud), the default user is not added to docker group, so we need sudo here
  sudo docker-compose pull
  # Add current user to docker group, it won't take effect immediately as skypilot job is started by a long-running daemon.
  sudo usermod -aG docker $USER

run: |
  ./eval.sh

107
experimental/eval/tabby.py Normal file
View File

@ -0,0 +1,107 @@
from pathlib import Path
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
# GPU type requested from Modal for the serving container.
GPU_CONFIG = gpu.T4()
# Model served by tabby and baked into the image by download_model().
MODEL_ID = "TabbyML/StarCoder-1B"
# CLI arguments for the tabby binary: serve MODEL_ID on port 8000 using CUDA.
LAUNCH_FLAGS = [
    "serve",
    "--model",
    MODEL_ID,
    "--port",
    "8000",
    "--device",
    "cuda"
]
def download_model():
    """Download MODEL_ID's weights via the tabby CLI.

    Intended to run at image build time (see the run_function call on the
    image) so serving containers start with the weights already present.

    FIX: pass check=True so a failed download aborts instead of silently
    producing an image with missing weights.
    """
    import subprocess

    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            MODEL_ID,
        ],
        check=True,
    )
# Container image: official tabby release plus a Python runtime for the
# tabby_client package.
image = (
    Image.from_registry("tabbyml/tabby:0.3.0", add_python="3.11")
    # Clear the upstream ENTRYPOINT so Modal controls process startup.
    .dockerfile_commands("ENTRYPOINT []")
    # Bake the model weights into the image at build time.
    .run_function(download_model)
    .pip_install("git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client")
)

# One Modal stub per model, e.g. "tabby-StarCoder-1B".
stub = Stub("tabby-" + MODEL_ID.split("/")[-1], image=image)
@stub.cls(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=60 * 10,
    timeout=360,
)
class Model:
    """Modal container wrapping a tabby server subprocess plus its client."""

    def __enter__(self):
        # Start the tabby server once per container; __exit__ tears it down.
        import socket
        import subprocess
        import time

        from tabby_client import Client

        self.launcher = subprocess.Popen(
            ["/opt/tabby/bin/tabby"] + LAUNCH_FLAGS
        )
        self.client = Client("http://127.0.0.1:8000")

        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
        def webserver_ready():
            try:
                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
                return True
            except (socket.timeout, ConnectionRefusedError):
                # Check if launcher webserving process has exited.
                # If so, a connection can never be made.
                retcode = self.launcher.poll()
                if retcode is not None:
                    raise RuntimeError(
                        f"launcher exited unexpectedly with code {retcode}"
                    )
                return False

        while not webserver_ready():
            time.sleep(1.0)

        print("Tabby server ready!")

    def __exit__(self, _exc_type, _exc_value, _traceback):
        # Stop the tabby server when the container winds down.
        self.launcher.terminate()

    @method()
    async def complete(self, language: str, prompt: str):
        """Request a completion for a pre-built raw *prompt* and return its text.

        NOTE(review): the request sends the raw `prompt` field; `Segments` is
        imported but unused here, while main.py builds requests with
        segments=Segments(prefix=...) — confirm raw-prompt mode is intended.
        """
        from tabby_client.api.v1 import completion
        from tabby_client.models import CompletionRequest, CompletionResponse, Segments

        request = CompletionRequest(
            language=language, prompt=prompt
        )
        resp: CompletionResponse = await completion.asyncio(client=self.client, json_body=request)
        return resp.choices[0].text
@stub.local_entrypoint()
def main():
    """Run ./sample.jsonl through the remote model; write ./output.jsonl.

    Each input line is a JSON object with `crossfile_context`, `prompt` and
    `groundtruth` fields; each output line is a {prompt, label, prediction}
    record.
    """
    import json

    model = Model()
    with open("./sample.jsonl") as fin, open("./output.jsonl", "w") as fout:
        for line in fin:
            x = json.loads(line)
            # Prompt = cross-file context followed by the in-file prompt.
            prompt = x['crossfile_context']['text'] + x['prompt']
            label = x['groundtruth']
            prediction = model.complete.remote("python", prompt)
            # FIX: terminate each record with a newline so the output is valid
            # JSON Lines — json.dump does not append one, so the original
            # concatenated every record onto a single line.
            fout.write(json.dumps(dict(prompt=prompt, label=label, prediction=prediction)))
            fout.write("\n")

View File

@ -1,23 +0,0 @@
"""Streamlit dashboard for browsing eval reports record by record."""
import streamlit as st
import pandas as pd
# FIX: removed `import altair as alt` — it was never used.

st.set_page_config(layout="wide")

df = pd.read_json("reports.jsonl", lines=True)

for _, v in df.iterrows():
    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("prompt")
        st.code(v.prompt)
    with col2:
        st.write("prediction")
        st.code(v.prediction)
        st.write("label")
        st.code(v.label)
    with col3:
        # NOTE(review): these sub-columns are created but the metrics below
        # are not placed inside them — confirm whether `with` blocks were
        # intended here.
        col1, col2 = st.columns(2)
        st.metric("Line score", v.line_score)
        st.metric("Block score", v.block_score)
    st.divider()