feat(experimental): use modal for easier evaluation. (#594)

2023-10-18 23:33:27 -07:00 · 2023-10-18 23:33:27 -07:00 · 257dba64a5
parent 0dc7e98232
commit 257dba64a5
16 changed files with 210 additions and 308 deletions
--- a/.gitignore
+++ b/.gitignore
@ -2,4 +2,5 @@
 node_modules
 .idea/
 .DS_Store
-.vscode/
+.vscode/
 __pycache__
--- a/experimental/eval/.gitignore
+++ b/experimental/eval/.gitignore
@ -1,4 +0,0 @@
 __pycache__
 .ipynb_checkpoints
 reports.*
 tabby
--- a/experimental/eval/README.md
+++ b/experimental/eval/README.md
@ -1,7 +0,0 @@
 # Eval
 ## Local
 `./eval.sh`
 ## Skypilot
 `./eval_sky.sh`
--- a/experimental/eval/config.toml
+++ b/experimental/eval/config.toml
@ -1,2 +0,0 @@
 [[repositories]]
 git_url = "https://github.com/huggingface/text-generation-inference"
--- a/experimental/eval/docker-compose.cuda.yaml
+++ b/experimental/eval/docker-compose.cuda.yaml
@ -1,11 +0,0 @@
 version: '3.5'
 services:
  tabby:
    command: serve --model TabbyML/SantaCoder-1B --device cuda
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: 1
              capabilities: [gpu]
--- a/experimental/eval/docker-compose.yaml
+++ b/experimental/eval/docker-compose.yaml
@ -1,21 +0,0 @@
 version: '3.5'
 services:
  scheduler:
    image: tabbyml/tabby
    platform: linux/amd64
    command: scheduler --now
    volumes:
      - "$PWD/tabby:/data"
  tabby:
    depends_on:
      - scheduler
    restart: always
    image: tabbyml/tabby
    platform: linux/amd64
    command: serve --model TabbyML/T5P-220M
    ports:
      - "8080:8080"
    volumes:
      - "$PWD/tabby:/data"
--- a/experimental/eval/eval.sh
+++ b/experimental/eval/eval.sh
@ -1,24 +0,0 @@
 #!/bin/bash
 # Local evaluation driver: starts the docker-compose services, waits for the
 # server's health endpoint, runs main.py against the dataset files, and
 # writes the results to reports.jsonl.
 # -e: abort on the first failing command; -x: echo each command as it runs.
 set -ex
 # Working directory mounted into the containers as /data (see docker-compose.yaml).
 mkdir -p tabby
 cp config.toml tabby/
 # Stop any services left over from a previous run.
 docker-compose down
 # If nvidia-smi succeeds, layer the CUDA override file on top of the base compose file.
 if nvidia-smi; then
  docker-compose -f docker-compose.yaml -f docker-compose.cuda.yaml up -d
 else
  docker-compose up -d
 fi
 # Poll the health endpoint every 5 seconds until curl succeeds.
 while ! curl -X POST http://localhost:8080/v1/health; do
  echo "server not ready, waiting..."
  sleep 5
 done
 # MAX_RECORDS defaults to 3 when unset; stdout of main.py is the report.
 python main.py "./tabby/dataset/*.jsonl" ${MAX_RECORDS:-3} > reports.jsonl
 docker-compose down
 echo done
--- a/experimental/eval/eval_sky.sh
+++ b/experimental/eval/eval_sky.sh
@ -1,10 +0,0 @@
 #!/bin/bash
 # Remote evaluation driver: runs the skypilot.yaml task on the "tabby-eval"
 # cluster with MAX_RECORDS=300, then copies reports.jsonl back locally.
 # -e: abort on the first failing command; -x: echo each command as it runs.
 set -ex
 # Cluster name, task file and environment override shared by both sky commands.
 ARGS="tabby-eval skypilot.yaml --env MAX_RECORDS=300"
 # Try `sky exec` first; if it fails, fall back to `sky launch -c`
 # (presumably because the cluster does not exist yet -- confirm against SkyPilot docs).
 if ! sky exec $ARGS; then
  sky launch -c $ARGS
 fi
 # Fetch the report produced on the remote host.
 scp tabby-eval:~/sky_workdir/reports.jsonl ./
--- a/experimental/eval/main.py
+++ b/experimental/eval/main.py
@ -1,90 +0,0 @@
 import sys
 import argparse
 import pandas as pd
 import logging
 from tabby_client import Client
 from tabby_client.api.v1 import health
 from tabby_client.api.v1 import completion
 from tabby_client.models import CompletionRequest, CompletionRequest, Segments, Choice
 import processing
 import editdistance
 import random
 def valid_item(item: processing.Item):
    """Return True when the item's body spans at most 10 lines."""
    # Bodies longer than 10 lines are excluded from evaluation.
    body_lines = item.body.splitlines()
    return len(body_lines) <= 10
 def scorer(label, prediction):
    """Score ``prediction`` against ``label`` as a normalized edit similarity.

    The score is ``1 - edit_distance / len(label)``, clamped below at 0.0, so
    an exact match scores 1.0.

    An empty label cannot be normalized by its length; in that case the score
    is 1.0 when the prediction is also empty and 0.0 otherwise.
    """
    if not label:
        # Avoid ZeroDivisionError: len(label) is the normalizer below.
        return 1.0 if not prediction else 0.0
    distance = editdistance.eval(label, prediction)
    return max(0.0, 1.0 - distance / len(label))
 def run_eval(args):
    """Yield one score record per evaluated dataset item.

    Reads items matching ``args.filepattern``, keeps those accepted by
    ``valid_item``, and, when more than ``args.max_records`` remain, draws a
    fixed-seed random sample of that size. Each item's prefix is sent to the
    completion endpoint at http://localhost:8080 and the first choice is
    scored against the item's body.

    Yields dicts with keys ``prompt``, ``prediction``, ``label``,
    ``block_score`` (whole body) and ``line_score`` (first line only).
    Yields nothing if the health check fails.
    """
    api = "http://localhost:8080"
    client = Client(base_url=api, timeout=50)
    try:
        health.sync(client=client)
    except Exception:
        # Report through logging rather than print(): stdout carries the
        # JSON-lines report when the script's output is redirected to a file.
        logging.error(
            "Tabby Server is not ready, please check if '%s' is correct.", api
        )
        return
    items = [
        x for x in processing.items_from_filepattern(args.filepattern) if valid_item(x)
    ]
    if len(items) > args.max_records:
        # Fixed seed keeps the sampled subset reproducible across runs.
        random.seed(0xBADBEEF)
        items = random.sample(items, args.max_records)
    for item in items:
        request = CompletionRequest(
            language=item.language, segments=Segments(prefix=item.prefix)
        )
        resp = completion.sync(client=client, json_body=request)
        label = item.body
        prediction = resp.choices[0].text
        block_score = scorer(label, prediction)
        label_lines = label.splitlines()
        prediction_lines = prediction.splitlines()
        # Default for the case where either side has no lines; without it the
        # name would be unbound or carry over the previous item's score.
        line_score = 0.0
        if label_lines and prediction_lines:
            line_score = scorer(label_lines[0], prediction_lines[0])
        yield dict(
            prompt=item.prefix,
            prediction=prediction,
            label=label,
            block_score=block_score,
            line_score=line_score,
        )
 if __name__ == "__main__":
    # Log to stderr; the report itself is printed to stdout below.
    logging.basicConfig(level=logging.INFO, stream=sys.stderr)
    cli = argparse.ArgumentParser(
        epilog="Example usage: python main.py ./tabby/dataset/data.jsonl 5 > output.jsonl",
        description="SxS eval for tabby",
    )
    cli.add_argument("filepattern", help="File pattern to dataset.", type=str)
    cli.add_argument(
        "max_records", help="Max number of records to be evaluated.", type=int
    )
    options = cli.parse_args()
    logging.info("args %s", options)
    # Collect every yielded record into a frame and emit it as JSON lines.
    report = pd.DataFrame(run_eval(options))
    print(report.to_json(orient="records", lines=True))
--- a/experimental/eval/output.jsonl
+++ b/experimental/eval/output.jsonl
--- a/experimental/eval/processing.py
+++ b/experimental/eval/processing.py
@ -1,88 +0,0 @@
 from typing import Iterator
 import glob
 import json
 from dataclasses import dataclass
@dataclass
 class Item:
    """One evaluation sample built from a tagged span of a dataset document."""
    # Copied from the source document's "git_url" field.
    git_url: str
    # Copied from the source document's "filepath" field.
    filepath: str
    # Copied from the source document's "language" field.
    language: str
    # Document content sliced by the tag's "name_range".
    name: str
    # Document content sliced by the tag's "range"; used as the label.
    body: str
    # Document content preceding the tag's range start (see get_prefix).
    prefix: str
    # Document content following the tag's range end (see get_suffix).
    suffix: str
 def iter_items(doc) -> Iterator[Item]:
    """Yield an Item for each tag of ``doc``, unless the document is filtered out.

    A document yields nothing when its max line length exceeds 500, its
    average line length is below 10 or above 200, or its alphanumeric
    fraction is below 0.25.
    """
    longest_line = doc["max_line_length"]
    if longest_line > 500:
        return
    mean_line = doc["avg_line_length"]
    if mean_line < 10 or mean_line > 200:
        return
    alnum_ratio = doc["alphanum_fraction"]
    if alnum_ratio < 0.25:
        return
    for tag in doc["tags"]:
        text = doc["content"]
        # Same evaluation order as the field list: name, body, prefix, suffix.
        yield Item(
            name=get_content(text, tag["name_range"]),
            body=get_content(text, tag["range"]),
            prefix=get_prefix(text, tag["range"]["start"]),
            suffix=get_suffix(text, tag["range"]["end"]),
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )
 def iter_docs(filepattern: str):
    """Yield the JSON-decoded value of every line in every file matching ``filepattern``."""
    for path in glob.glob(filepattern):
        with open(path) as handle:
            # Each line of the file is one JSON document.
            yield from map(json.loads, handle)
 def get_content(content: str, range: dict):
    """Return the slice of ``content`` from ``range["start"]`` up to ``range["end"]``."""
    begin, stop = range["start"], range["end"]
    return content[begin:stop]
 def get_prefix(content: str, start: int, max=20):
    """Return the text immediately before ``start``, limited by newline count.

    Scans backwards from ``start - 1``. The scan stops at the ``max``-th
    newline encountered and the text after that newline (up to ``start``) is
    returned. If the beginning of ``content`` is reached first, everything
    from index 0 up to ``start`` is returned.
    """
    num_lines = 0
    # Scan down to and including index 0 so the first character is inspected too.
    for idx in range(start - 1, -1, -1):
        if content[idx] == "\n":
            num_lines += 1
        if num_lines >= max:
            return content[idx + 1 : start]
    # Fewer than ``max`` newlines precede ``start``: keep the whole head.
    return content[:start]
 def get_suffix(content: str, end: int, max=20):
    """Return the text immediately after ``end``, limited by newline count.

    Scans forwards from ``end``. The scan stops at the ``max``-th newline
    encountered and the text from ``end`` up to (not including) that newline
    is returned. If the end of ``content`` is reached first, everything from
    ``end`` onwards is returned.
    """
    num_lines = 0
    for idx in range(end, len(content)):
        if content[idx] == "\n":
            num_lines += 1
        if num_lines >= max:
            # Slice end is exclusive, so ``idx`` already drops the newline.
            return content[end:idx]
    # Fewer than ``max`` newlines follow ``end``: keep the whole tail.
    return content[end:]
 def items_from_filepattern(filepattern: str):
    """Yield every Item produced from the documents in files matching ``filepattern``."""
    documents = iter_docs(filepattern)
    for record in documents:
        for entry in iter_items(record):
            yield entry
--- a/experimental/eval/requirements.txt
+++ b/experimental/eval/requirements.txt
@ -1,2 +0,0 @@
 git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client
 editdistance
--- a/experimental/eval/sample.jsonl
+++ b/experimental/eval/sample.jsonl
--- a/experimental/eval/skypilot.yaml
+++ b/experimental/eval/skypilot.yaml
@ -1,25 +0,0 @@
 resources:
  accelerators: T4:1
 # tabby base dir
 workdir: ./
 setup: |
  set -ex
  # On some cloud providers, docker-compose is not installed by default.
  sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
  sudo chmod a+x /usr/local/bin/docker-compose
  # Install tabby python client.
  pip install -r requirements.txt
  # On certain cloud providers (e.g. Lambda Cloud), the default user is not added to the docker group, so we need sudo here.
  sudo docker-compose pull
  # Add the current user to the docker group. This won't take effect immediately, because the SkyPilot job is started by a long-running daemon.
  sudo usermod -aG docker $USER
 run: |
  ./eval.sh
--- a/experimental/eval/tabby.py
+++ b/experimental/eval/tabby.py
@ -0,0 +1,107 @@
 from pathlib import Path
 from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
 # GPU configuration passed to the Model class decorator.
 GPU_CONFIG = gpu.T4()
 # Model identifier used for both the download step and the serve command.
 MODEL_ID = "TabbyML/StarCoder-1B"
 # Arguments appended to the tabby binary when the server is launched.
 LAUNCH_FLAGS = ["serve", "--model", MODEL_ID, "--port", "8000", "--device", "cuda"]
 def download_model():
    """Download the weights for ``MODEL_ID`` using the tabby binary.

    Raises:
        subprocess.CalledProcessError: if the download command exits with a
            non-zero status, so a failed download is not silently ignored.
    """
    import subprocess
    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            MODEL_ID,
        ],
        # Fail loudly instead of continuing without the model files.
        check=True,
    )
 # Container image definition; the chained steps are applied in the order written.
 image = (
        # Start from the tabby 0.3.0 registry image, adding Python 3.11.
        Image.from_registry("tabbyml/tabby:0.3.0", add_python="3.11")
        # Clear the base image's entrypoint.
        .dockerfile_commands("ENTRYPOINT []")
        # Run download_model as an image step (presumably at build time -- confirm against Modal docs).
        .run_function(download_model)
        # Install the tabby Python client used by Model.
        .pip_install("git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client")
 )
 # App stub named "tabby-" plus the part of MODEL_ID after the last "/".
 stub = Stub("tabby-" + MODEL_ID.split("/")[-1], image=image)
@stub.cls(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=60 * 10,
    timeout=360,
 )
 class Model:
    """Runs a tabby server as a subprocess and forwards completion requests to it."""
    def __enter__(self):
        """Start the tabby server and block until its port accepts connections.

        Raises:
            RuntimeError: if the server process exits before it becomes reachable.
        """
        import socket
        import subprocess
        import time
        from tabby_client import Client
        # Launch the server process with the module-level LAUNCH_FLAGS.
        self.launcher = subprocess.Popen(
            ["/opt/tabby/bin/tabby"] + LAUNCH_FLAGS
        )
        # Client pointed at the local server; used by complete().
        self.client = Client("http://127.0.0.1:8000")
        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
        def webserver_ready():
            # Returns True once a TCP connection succeeds, False to keep polling.
            try:
                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
                return True
            except (socket.timeout, ConnectionRefusedError):
                # Check if launcher webserving process has exited.
                # If so, a connection can never be made.
                retcode = self.launcher.poll()
                if retcode is not None:
                    raise RuntimeError(
                        f"launcher exited unexpectedly with code {retcode}"
                    )
                return False
        # Retry once per second until the connection attempt succeeds.
        while not webserver_ready():
            time.sleep(1.0)
        print("Tabby server ready!")
    def __exit__(self, _exc_type, _exc_value, _traceback):
        """Send a terminate signal to the server process; does not wait for it to exit."""
        self.launcher.terminate()
    @method()
    async def complete(self, language: str, prompt: str):
        """Request a completion for ``prompt`` and return the first choice's text.

        Args:
            language: value for the request's ``language`` field.
            prompt: value for the request's ``prompt`` field.
        """
        from tabby_client.api.v1 import completion
        from tabby_client.models import CompletionRequest, CompletionResponse, Segments
        request = CompletionRequest(
            language=language, prompt=prompt 
        )
        resp: CompletionResponse = await completion.asyncio(client=self.client, json_body=request)
        # Only the first returned choice is used.
        return resp.choices[0].text
@stub.local_entrypoint()
def main():
    """Run every sample in ./sample.jsonl through the model and write ./output.jsonl.

    Each non-blank input line is a JSON object providing
    ``crossfile_context.text``, ``prompt`` and ``groundtruth``. The prompt sent
    to the model is the cross-file context followed by the sample prompt. One
    JSON object with ``prompt``, ``label`` and ``prediction`` is written per
    line of the output file.
    """
    import json
    model = Model()
    # Output is opened first, matching the original open order.
    with open("./output.jsonl", "w") as fout, open("./sample.jsonl") as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            x = json.loads(line)
            prompt = x['crossfile_context']['text'] + x['prompt']
            label = x['groundtruth']
            prediction = model.complete.remote("python", prompt)
            json.dump(dict(prompt=prompt, label=label, prediction=prediction), fout)
            # json.dump emits no newline; JSON Lines needs one record per line.
            fout.write("\n")
--- a/experimental/eval/visualize.py
+++ b/experimental/eval/visualize.py
@ -1,23 +0,0 @@
 import streamlit as st
 import pandas as pd
 import altair as alt
 st.set_page_config(layout="wide")
 df = pd.read_json("reports.jsonl", lines=True)
 for _, v in df.iterrows():
    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("prompt")
        st.code(v.prompt)
    with col2:
        st.write("prediction")
        st.code(v.prediction)
        st.write("label")
        st.code(v.label)
    with col3:
        col1, col2 = st.columns(2)
        st.metric("Line score", v.line_score)
        st.metric("Block score", v.block_score)
    st.divider()
		`@ -1,2 +0,0 @@`
			`[[repositories]]`
			`git_url = "https://github.com/huggingface/text-generation-inference"`
		`@ -1,2 +0,0 @@`
			`git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client`
			`editdistance`