feat(experimental): use modal for easier evaluation. (#594)
parent
0dc7e98232
commit
257dba64a5
|
|
@ -3,3 +3,4 @@ node_modules
|
|||
.idea/
|
||||
.DS_Store
|
||||
.vscode/
|
||||
__pycache__
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
__pycache__
|
||||
.ipynb_checkpoints
|
||||
reports.*
|
||||
tabby
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
# Eval
|
||||
|
||||
## Local
|
||||
`./eval.sh`
|
||||
|
||||
## Skypilot
|
||||
`./eval_sky.sh`
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
[[repositories]]
|
||||
git_url = "https://github.com/huggingface/text-generation-inference"
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
version: '3.5'
|
||||
services:
|
||||
tabby:
|
||||
command: serve --model TabbyML/SantaCoder-1B --device cuda
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
version: '3.5'
|
||||
|
||||
services:
|
||||
scheduler:
|
||||
image: tabbyml/tabby
|
||||
platform: linux/amd64
|
||||
command: scheduler --now
|
||||
volumes:
|
||||
- "$PWD/tabby:/data"
|
||||
|
||||
tabby:
|
||||
depends_on:
|
||||
- scheduler
|
||||
restart: always
|
||||
image: tabbyml/tabby
|
||||
platform: linux/amd64
|
||||
command: serve --model TabbyML/T5P-220M
|
||||
ports:
|
||||
- "8080:8080"
|
||||
volumes:
|
||||
- "$PWD/tabby:/data"
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
#!/bin/bash
# Bring up a local tabby stack with docker-compose, wait for it to become
# healthy, run the eval, and tear the stack back down.
set -ex

# Stage the tabby data dir (mounted into the containers by docker-compose).
mkdir -p tabby
cp config.toml tabby/

docker-compose down

# Layer the CUDA override file only when a GPU is actually present.
if nvidia-smi; then
docker-compose -f docker-compose.yaml -f docker-compose.cuda.yaml up -d
else
docker-compose up -d
fi

# Block until the server's health endpoint responds.
while ! curl -X POST http://localhost:8080/v1/health; do
echo "server not ready, waiting..."
sleep 5
done

# MAX_RECORDS can be overridden from the environment; defaults to 3.
python main.py "./tabby/dataset/*.jsonl" ${MAX_RECORDS:-3} > reports.jsonl

docker-compose down

echo done
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
#!/bin/bash
# Run the eval on a Skypilot-managed cloud VM and copy the report back.
set -ex

# Intentionally unquoted below: word-splits into "<cluster> <yaml> --env ...".
ARGS="tabby-eval skypilot.yaml --env MAX_RECORDS=300"

# Reuse an existing cluster via `sky exec`; launch a fresh one if that fails.
if ! sky exec $ARGS; then
sky launch -c $ARGS
fi

# Pull the generated report from the remote workdir.
scp tabby-eval:~/sky_workdir/reports.jsonl ./
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
import sys
import argparse
import pandas as pd
import logging

from tabby_client import Client
from tabby_client.api.v1 import health
from tabby_client.api.v1 import completion

# BUG FIX: CompletionRequest was imported twice; CompletionResponse (referenced
# in run_eval's annotation) was never imported.
from tabby_client.models import CompletionRequest, CompletionResponse, Segments, Choice

import processing
import editdistance
import random
|
||||
|
||||
|
||||
def valid_item(item: processing.Item):
    """Filter predicate: keep only items whose body spans at most 10 lines.

    Longer bodies are dropped so the evaluation stays focused on short,
    comparable completions.
    """
    return len(item.body.splitlines()) <= 10
|
||||
|
||||
|
||||
def scorer(label, prediction):
    """Normalized edit-distance similarity between *label* and *prediction*.

    Returns a score in [0, 1]: 1.0 for an exact match, falling toward 0.0 as
    the edit distance approaches (or exceeds) the label's length.
    """
    if not label:
        # BUG FIX: an empty label previously raised ZeroDivisionError below.
        # Treat empty-vs-empty as a perfect match, anything else as a miss.
        return 1.0 if not prediction else 0.0
    distance = editdistance.eval(label, prediction)
    return max(0.0, 1.0 - distance / len(label))
|
||||
|
||||
|
||||
def run_eval(args):
    """Sample dataset items and stream one evaluation record per item.

    Connects to a locally running Tabby server, requests a completion for each
    sampled item's prefix, and yields a dict with the prompt, prediction,
    ground-truth label, and edit-distance scores.

    Args:
        args: parsed CLI namespace with ``filepattern`` (glob of .jsonl files)
            and ``max_records`` (cap on the number of evaluated items).

    Yields:
        dict with keys prompt / prediction / label / block_score / line_score.
    """
    api = "http://localhost:8080"
    client = Client(base_url=api, timeout=50)
    try:
        health.sync(client=client)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    except Exception:
        print(f"Tabby Server is not ready, please check if '{api}' is correct.")
        return

    items = [
        x for x in processing.items_from_filepattern(args.filepattern) if valid_item(x)
    ]
    if len(items) > args.max_records:
        # Fixed seed keeps the sampled subset reproducible across runs.
        random.seed(0xBADBEEF)
        items = random.sample(items, args.max_records)

    for item in items:
        # NOTE: the redundant per-item valid_item() re-check was removed —
        # `items` is already filtered above.
        request = CompletionRequest(
            language=item.language, segments=Segments(prefix=item.prefix)
        )

        resp = completion.sync(client=client, json_body=request)
        label = item.body
        prediction = resp.choices[0].text

        block_score = scorer(label, prediction)

        label_lines = label.splitlines()
        prediction_lines = prediction.splitlines()

        # BUG FIX: line_score was only assigned when both sides had at least
        # one line, so an empty prediction raised NameError on the first item
        # (or silently reused the previous iteration's stale value).
        if label_lines and prediction_lines:
            line_score = scorer(label_lines[0], prediction_lines[0])
        else:
            line_score = 0.0

        yield dict(
            prompt=item.prefix,
            prediction=prediction,
            label=label,
            block_score=block_score,
            line_score=line_score,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Log to stderr so stdout stays clean for the JSONL report printed below.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="SxS eval for tabby",
        epilog="Example usage: python main.py ./tabby/dataset/data.jsonl 5 > output.jsonl",
    )
    parser.add_argument("filepattern", type=str, help="File pattern to dataset.")
    parser.add_argument(
        "max_records", type=int, help="Max number of records to be evaluated."
    )
    args = parser.parse_args()
    logging.info("args %s", args)
    # Materialize the run_eval generator into a DataFrame and emit JSON Lines
    # on stdout (one record per line), matching the epilog's usage example.
    df = pd.DataFrame(run_eval(args))
    print(df.to_json(orient="records", lines=True))
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -1,88 +0,0 @@
|
|||
from typing import Iterator
|
||||
|
||||
import glob
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class Item:
    """One extracted completion example: a named code body plus its context."""

    # Source repository URL the document came from.
    git_url: str
    # Path of the source file within that repository.
    filepath: str
    # Programming language of the source file.
    language: str

    # Identifier text of the tagged definition (from the tag's name_range).
    name: str
    # Full text covered by the tag's range — the ground-truth completion.
    body: str
    # Text immediately preceding the body (bounded window, see get_prefix).
    prefix: str
    # Text immediately following the body (bounded window, see get_suffix).
    suffix: str
|
||||
|
||||
|
||||
def iter_items(doc) -> Iterator[Item]:
    """Yield one Item per tag in *doc*, skipping low-quality documents.

    A document is skipped entirely when any line is longer than 500 chars,
    its average line length falls outside [10, 200], or less than 25% of its
    characters are alphanumeric.
    """
    avg_len = doc["avg_line_length"]
    if (
        doc["max_line_length"] > 500
        or avg_len < 10
        or avg_len > 200
        or doc["alphanum_fraction"] < 0.25
    ):
        return

    content = doc["content"]
    for tag in doc["tags"]:
        span = tag["range"]
        yield Item(
            name=get_content(content, tag["name_range"]),
            body=get_content(content, span),
            prefix=get_prefix(content, span["start"]),
            suffix=get_suffix(content, span["end"]),
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )
|
||||
|
||||
|
||||
def iter_docs(filepattern: str):
    """Yield one parsed JSON object per line across all files matching *filepattern*."""
    for path in glob.glob(filepattern):
        with open(path) as fh:
            yield from (json.loads(raw) for raw in fh)
|
||||
|
||||
|
||||
def get_content(content: str, range: dict):
    """Return the substring of *content* covered by a {"start", "end"} range dict."""
    lo, hi = range["start"], range["end"]
    return content[lo:hi]
|
||||
|
||||
|
||||
def get_prefix(content: str, start: int, max=20):
    """Return the text immediately preceding *start*, bounded to *max* newlines.

    Scans backwards from ``start - 1``; when the *max*-th newline is found the
    prefix begins just after it, otherwise the prefix begins at the start of
    *content*.

    BUG FIX: the backward scan previously stopped at index 1 (``range(start-1,
    0, -1)``) and then sliced from the loop variable's last value, silently
    dropping the first characters of the file whenever fewer than *max*
    newlines preceded *start*.
    """
    num_lines = 0
    # -1 sentinel => no cut point found, take from the very beginning.
    cut = -1
    for i in range(start - 1, -1, -1):
        if content[i] == "\n":
            num_lines += 1

            if num_lines == max:
                cut = i
                break

    return content[cut + 1 : start]
|
||||
|
||||
|
||||
def get_suffix(content: str, end: int, max=20):
    """Return the text immediately following *end*, bounded to *max* newlines.

    Scans forward from ``end``; when the *max*-th newline is found the suffix
    stops just before it (mirroring get_prefix, which starts just after its
    boundary newline), otherwise the suffix runs to the end of *content*.

    BUG FIX: the previous ``content[end : suffix_end - 1]`` slice dropped one
    character before the boundary newline, and dropped the final two
    characters of *content* when fewer than *max* newlines followed *end*.
    """
    num_lines = 0
    # len(content) sentinel => no cut point found, take through the end.
    cut = len(content)
    for i in range(end, len(content)):
        if content[i] == "\n":
            num_lines += 1

            if num_lines == max:
                cut = i
                break

    return content[end:cut]
|
||||
|
||||
|
||||
def items_from_filepattern(filepattern: str):
    """Flatten every document matching *filepattern* into a stream of Items."""
    return (item for doc in iter_docs(filepattern) for item in iter_items(doc))
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client
|
||||
editdistance
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -1,25 +0,0 @@
|
|||
resources:
|
||||
accelerators: T4:1
|
||||
|
||||
# tabby base dir
|
||||
workdir: ./
|
||||
|
||||
setup: |
|
||||
set -ex
|
||||
|
||||
# On some cloud providers, docker-compose is not installed by default.
|
||||
sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
|
||||
sudo chmod a+x /usr/local/bin/docker-compose
|
||||
|
||||
# Install tabby python client.
|
||||
pip install -r requirements.txt
|
||||
|
||||
# On certain cloud providers (e.g lambda cloud), the default user is not added to docker group, so we need sudo here
|
||||
sudo docker-compose pull
|
||||
|
||||
# Add current user to docker group, it won't take effect immediately as skypilot job is started by a long-running daemon.
|
||||
sudo usermod -aG docker $USER
|
||||
|
||||
|
||||
run: |
|
||||
./eval.sh
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
from pathlib import Path
|
||||
|
||||
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
|
||||
|
||||
# Single T4 GPU per container; passed to @stub.cls below.
GPU_CONFIG = gpu.T4()
# Model served by the tabby binary and pre-downloaded into the image.
MODEL_ID = "TabbyML/StarCoder-1B"
# CLI arguments appended to /opt/tabby/bin/tabby when the container starts.
LAUNCH_FLAGS = [
    "serve",
    "--model",
    MODEL_ID,
    "--port",
    "8000",
    "--device",
    "cuda"
]
|
||||
|
||||
def download_model():
    """Pre-download MODEL_ID's weights via the tabby CLI.

    Invoked through ``Image.run_function`` so the weights are baked into the
    container image instead of being fetched at every container start.
    """
    import subprocess

    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            MODEL_ID,
        ],
        # BUG FIX: without check=True a failed download exited silently and
        # the build produced an image with no model weights.
        check=True,
    )
|
||||
|
||||
|
||||
# Container image: official tabby release with Python added, the image's
# ENTRYPOINT cleared (modal supplies its own command), the model weights
# pre-downloaded at build time, and the generated tabby python client installed.
image = (
    Image.from_registry("tabbyml/tabby:0.3.0", add_python="3.11")
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .pip_install("git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client")
)

# App name is derived from the model, e.g. "tabby-StarCoder-1B".
stub = Stub("tabby-" + MODEL_ID.split("/")[-1], image=image)
|
||||
|
||||
|
||||
@stub.cls(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=60 * 10,
    timeout=360,
)
class Model:
    """Modal container class that runs a tabby server and proxies completions.

    __enter__ launches the tabby server as a subprocess and blocks until its
    HTTP port accepts connections; complete() forwards a single completion
    request to it via the generated tabby client.
    """

    def __enter__(self):
        import socket
        import subprocess
        import time

        from tabby_client import Client

        # Launch the tabby server in the background; LAUNCH_FLAGS binds it
        # to port 8000 on CUDA.
        self.launcher = subprocess.Popen(
            ["/opt/tabby/bin/tabby"] + LAUNCH_FLAGS
        )
        self.client = Client("http://127.0.0.1:8000")

        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
        def webserver_ready():
            try:
                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
                return True
            except (socket.timeout, ConnectionRefusedError):
                # Check if launcher webserving process has exited.
                # If so, a connection can never be made.
                retcode = self.launcher.poll()
                if retcode is not None:
                    raise RuntimeError(
                        f"launcher exited unexpectedly with code {retcode}"
                    )
                return False

        while not webserver_ready():
            time.sleep(1.0)

        print("Tabby server ready!")

    def __exit__(self, _exc_type, _exc_value, _traceback):
        # Stop the background tabby server when the container winds down.
        self.launcher.terminate()

    @method()
    async def complete(self, language: str, prompt: str):
        """Request one completion for *prompt* and return the first choice's text."""
        from tabby_client.api.v1 import completion
        from tabby_client.models import CompletionRequest, CompletionResponse, Segments

        # NOTE(review): Segments is imported but unused here, and the sibling
        # eval script builds requests as segments=Segments(prefix=...) —
        # confirm CompletionRequest actually accepts a bare `prompt` kwarg.
        request = CompletionRequest(
            language=language, prompt=prompt
        )
        resp: CompletionResponse = await completion.asyncio(client=self.client, json_body=request)
        return resp.choices[0].text
|
||||
|
||||
|
||||
@stub.local_entrypoint()
def main():
    """Run every sample prompt through the deployed model, writing JSONL results.

    Reads ./sample.jsonl (one JSON object per line with `crossfile_context`,
    `prompt`, and `groundtruth` fields) and writes one result object per line
    to ./output.jsonl.
    """
    import json

    model = Model()
    with open("./output.jsonl", "w") as fout:
        with open("./sample.jsonl") as fin:
            for line in fin:
                x = json.loads(line)
                prompt = x['crossfile_context']['text'] + x['prompt']
                label = x['groundtruth']
                prediction = model.complete.remote("python", prompt)

                json.dump(dict(prompt=prompt, label=label, prediction=prediction), fout)
                # BUG FIX: json.dump writes no trailing newline, so records ran
                # together on one line and output.jsonl was not valid JSONL.
                fout.write("\n")
|
|
@ -1,23 +0,0 @@
|
|||
# Streamlit side-by-side viewer for the eval output: one three-column row per
# record — prompt | prediction+label | scores.
import streamlit as st
import pandas as pd
import altair as alt

st.set_page_config(layout="wide")

# One row per eval record produced by main.py (JSON Lines).
df = pd.read_json("reports.jsonl", lines=True)

for _, v in df.iterrows():
    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("prompt")
        st.code(v.prompt)
    with col2:
        st.write("prediction")
        st.code(v.prediction)
        st.write("label")
        st.code(v.label)
    with col3:
        # NOTE(review): this inner st.columns(2) shadows the outer col1/col2
        # and its result is never used — the metrics below render directly in
        # col3. Left in place since removing it would change the layout.
        col1, col2 = st.columns(2)
        st.metric("Line score", v.line_score)
        st.metric("Block score", v.block_score)
    st.divider()
||||
Loading…
Reference in New Issue