feat(experimental): use modal for easier evaluation. (#594)
parent
0dc7e98232
commit
257dba64a5
|
|
@ -3,3 +3,4 @@ node_modules
|
|||
.idea/
|
||||
.DS_Store
|
||||
.vscode/
|
||||
__pycache__
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
__pycache__
|
||||
.ipynb_checkpoints
|
||||
reports.*
|
||||
tabby
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
# Eval
|
||||
|
||||
## Local
|
||||
`./eval.sh`
|
||||
|
||||
## Skypilot
|
||||
`./eval_sky.sh`
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
[[repositories]]
|
||||
git_url = "https://github.com/huggingface/text-generation-inference"
|
||||
|
|
@ -1,11 +0,0 @@
|
|||
version: '3.5'
|
||||
services:
|
||||
tabby:
|
||||
command: serve --model TabbyML/SantaCoder-1B --device cuda
|
||||
deploy:
|
||||
resources:
|
||||
reservations:
|
||||
devices:
|
||||
- driver: nvidia
|
||||
count: 1
|
||||
capabilities: [gpu]
|
||||
|
|
@ -1,21 +0,0 @@
|
|||
version: '3.5'
|
||||
|
||||
services:
|
||||
scheduler:
|
||||
image: tabbyml/tabby
|
||||
platform: linux/amd64
|
||||
command: scheduler --now
|
||||
volumes:
|
||||
- "$PWD/tabby:/data"
|
||||
|
||||
tabby:
|
||||
depends_on:
|
||||
- scheduler
|
||||
restart: always
|
||||
image: tabbyml/tabby
|
||||
platform: linux/amd64
|
||||
command: serve --model TabbyML/T5P-220M
|
||||
ports:
|
||||
- "8080:8080"
|
||||
volumes:
|
||||
- "$PWD/tabby:/data"
|
||||
|
|
@ -1,24 +0,0 @@
|
|||
#!/bin/bash
# Bring up a local tabby stack with docker-compose, wait for it to become
# healthy, run the eval, and tear the stack back down.
set -ex

# Stage the tabby data dir (mounted into the containers by docker-compose).
mkdir -p tabby
cp config.toml tabby/

docker-compose down

# Layer the CUDA override file only when a GPU is actually present.
if nvidia-smi; then
docker-compose -f docker-compose.yaml -f docker-compose.cuda.yaml up -d
else
docker-compose up -d
fi

# Block until the server's health endpoint responds.
while ! curl -X POST http://localhost:8080/v1/health; do
echo "server not ready, waiting..."
sleep 5
done

# MAX_RECORDS can be overridden from the environment; defaults to 3.
python main.py "./tabby/dataset/*.jsonl" ${MAX_RECORDS:-3} > reports.jsonl

docker-compose down

echo done
|
||||
|
|
@ -1,10 +0,0 @@
|
|||
#!/bin/bash
# Run the eval on a Skypilot-managed cloud VM and copy the report back.
set -ex

# Intentionally unquoted below: word-splits into "<cluster> <yaml> --env ...".
ARGS="tabby-eval skypilot.yaml --env MAX_RECORDS=300"

# Reuse an existing cluster via `sky exec`; launch a fresh one if that fails.
if ! sky exec $ARGS; then
sky launch -c $ARGS
fi

# Pull the generated report from the remote workdir.
scp tabby-eval:~/sky_workdir/reports.jsonl ./
|
||||
|
|
@ -1,90 +0,0 @@
|
|||
import sys
import argparse
import pandas as pd
import logging

from tabby_client import Client
from tabby_client.api.v1 import health
from tabby_client.api.v1 import completion

# BUG FIX: CompletionRequest was imported twice; CompletionResponse (referenced
# in run_eval's annotation) was never imported.
from tabby_client.models import CompletionRequest, CompletionResponse, Segments, Choice

import processing
import editdistance
import random
|
||||
|
||||
|
||||
def valid_item(item: processing.Item):
    """Filter predicate: keep only items whose body spans at most 10 lines.

    Longer bodies are dropped so the evaluation stays focused on short,
    comparable completions.
    """
    return len(item.body.splitlines()) <= 10
|
||||
|
||||
|
||||
def scorer(label, prediction):
    """Normalized edit-distance similarity between *label* and *prediction*.

    Returns a score in [0, 1]: 1.0 for an exact match, falling toward 0.0 as
    the edit distance approaches (or exceeds) the label's length.
    """
    if not label:
        # BUG FIX: an empty label previously raised ZeroDivisionError below.
        # Treat empty-vs-empty as a perfect match, anything else as a miss.
        return 1.0 if not prediction else 0.0
    distance = editdistance.eval(label, prediction)
    return max(0.0, 1.0 - distance / len(label))
|
||||
|
||||
|
||||
def run_eval(args):
    """Sample dataset items and stream one evaluation record per item.

    Connects to a locally running Tabby server, requests a completion for each
    sampled item's prefix, and yields a dict with the prompt, prediction,
    ground-truth label, and edit-distance scores.

    Args:
        args: parsed CLI namespace with ``filepattern`` (glob of .jsonl files)
            and ``max_records`` (cap on the number of evaluated items).

    Yields:
        dict with keys prompt / prediction / label / block_score / line_score.
    """
    api = "http://localhost:8080"
    client = Client(base_url=api, timeout=50)
    try:
        health.sync(client=client)
    # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
    except Exception:
        print(f"Tabby Server is not ready, please check if '{api}' is correct.")
        return

    items = [
        x for x in processing.items_from_filepattern(args.filepattern) if valid_item(x)
    ]
    if len(items) > args.max_records:
        # Fixed seed keeps the sampled subset reproducible across runs.
        random.seed(0xBADBEEF)
        items = random.sample(items, args.max_records)

    for item in items:
        # NOTE: the redundant per-item valid_item() re-check was removed —
        # `items` is already filtered above.
        request = CompletionRequest(
            language=item.language, segments=Segments(prefix=item.prefix)
        )

        resp = completion.sync(client=client, json_body=request)
        label = item.body
        prediction = resp.choices[0].text

        block_score = scorer(label, prediction)

        label_lines = label.splitlines()
        prediction_lines = prediction.splitlines()

        # BUG FIX: line_score was only assigned when both sides had at least
        # one line, so an empty prediction raised NameError on the first item
        # (or silently reused the previous iteration's stale value).
        if label_lines and prediction_lines:
            line_score = scorer(label_lines[0], prediction_lines[0])
        else:
            line_score = 0.0

        yield dict(
            prompt=item.prefix,
            prediction=prediction,
            label=label,
            block_score=block_score,
            line_score=line_score,
        )
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Log to stderr so stdout stays clean for the JSONL report printed below.
    logging.basicConfig(stream=sys.stderr, level=logging.INFO)

    parser = argparse.ArgumentParser(
        description="SxS eval for tabby",
        epilog="Example usage: python main.py ./tabby/dataset/data.jsonl 5 > output.jsonl",
    )
    parser.add_argument("filepattern", type=str, help="File pattern to dataset.")
    parser.add_argument(
        "max_records", type=int, help="Max number of records to be evaluated."
    )
    args = parser.parse_args()
    logging.info("args %s", args)
    # Materialize the run_eval generator into a DataFrame and emit JSON Lines
    # on stdout (one record per line), matching the epilog's usage example.
    df = pd.DataFrame(run_eval(args))
    print(df.to_json(orient="records", lines=True))
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -1,88 +0,0 @@
|
|||
from typing import Iterator
|
||||
|
||||
import glob
|
||||
import json
|
||||
from dataclasses import dataclass
|
||||
|
||||
|
||||
@dataclass
class Item:
    """One extracted completion example: a named code body plus its context."""

    # Source repository URL the document came from.
    git_url: str
    # Path of the source file within that repository.
    filepath: str
    # Programming language of the source file.
    language: str

    # Identifier text of the tagged definition (from the tag's name_range).
    name: str
    # Full text covered by the tag's range — the ground-truth completion.
    body: str
    # Text immediately preceding the body (bounded window, see get_prefix).
    prefix: str
    # Text immediately following the body (bounded window, see get_suffix).
    suffix: str
|
||||
|
||||
|
||||
def iter_items(doc) -> Iterator[Item]:
    """Yield one Item per tag in *doc*, skipping low-quality documents.

    A document is skipped entirely when any line is longer than 500 chars,
    its average line length falls outside [10, 200], or less than 25% of its
    characters are alphanumeric.
    """
    avg_len = doc["avg_line_length"]
    if (
        doc["max_line_length"] > 500
        or avg_len < 10
        or avg_len > 200
        or doc["alphanum_fraction"] < 0.25
    ):
        return

    content = doc["content"]
    for tag in doc["tags"]:
        span = tag["range"]
        yield Item(
            name=get_content(content, tag["name_range"]),
            body=get_content(content, span),
            prefix=get_prefix(content, span["start"]),
            suffix=get_suffix(content, span["end"]),
            git_url=doc["git_url"],
            filepath=doc["filepath"],
            language=doc["language"],
        )
|
||||
|
||||
|
||||
def iter_docs(filepattern: str):
    """Yield one parsed JSON object per line across all files matching *filepattern*."""
    for path in glob.glob(filepattern):
        with open(path) as fh:
            yield from (json.loads(raw) for raw in fh)
|
||||
|
||||
|
||||
def get_content(content: str, range: dict):
    """Return the substring of *content* covered by a {"start", "end"} range dict."""
    lo, hi = range["start"], range["end"]
    return content[lo:hi]
|
||||
|
||||
|
||||
def get_prefix(content: str, start: int, max=20):
    """Return the text immediately preceding *start*, bounded to *max* newlines.

    Scans backwards from ``start - 1``; when the *max*-th newline is found the
    prefix begins just after it, otherwise the prefix begins at the start of
    *content*.

    BUG FIX: the backward scan previously stopped at index 1 (``range(start-1,
    0, -1)``) and then sliced from the loop variable's last value, silently
    dropping the first characters of the file whenever fewer than *max*
    newlines preceded *start*.
    """
    num_lines = 0
    # -1 sentinel => no cut point found, take from the very beginning.
    cut = -1
    for i in range(start - 1, -1, -1):
        if content[i] == "\n":
            num_lines += 1

            if num_lines == max:
                cut = i
                break

    return content[cut + 1 : start]
|
||||
|
||||
|
||||
def get_suffix(content: str, end: int, max=20):
    """Return the text immediately following *end*, bounded to *max* newlines.

    Scans forward from ``end``; when the *max*-th newline is found the suffix
    stops just before it (mirroring get_prefix, which starts just after its
    boundary newline), otherwise the suffix runs to the end of *content*.

    BUG FIX: the previous ``content[end : suffix_end - 1]`` slice dropped one
    character before the boundary newline, and dropped the final two
    characters of *content* when fewer than *max* newlines followed *end*.
    """
    num_lines = 0
    # len(content) sentinel => no cut point found, take through the end.
    cut = len(content)
    for i in range(end, len(content)):
        if content[i] == "\n":
            num_lines += 1

            if num_lines == max:
                cut = i
                break

    return content[end:cut]
|
||||
|
||||
|
||||
def items_from_filepattern(filepattern: str):
    """Flatten every document matching *filepattern* into a stream of Items."""
    return (item for doc in iter_docs(filepattern) for item in iter_items(doc))
|
||||
|
|
@ -1,2 +0,0 @@
|
|||
git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client
|
||||
editdistance
|
||||
File diff suppressed because one or more lines are too long
|
|
@ -1,25 +0,0 @@
|
|||
resources:
|
||||
accelerators: T4:1
|
||||
|
||||
# tabby base dir
|
||||
workdir: ./
|
||||
|
||||
setup: |
|
||||
set -ex
|
||||
|
||||
# On some cloud providers, docker-compose is not installed by default.
|
||||
sudo curl -L https://github.com/docker/compose/releases/download/v2.17.2/docker-compose-linux-x86_64 -o /usr/local/bin/docker-compose
|
||||
sudo chmod a+x /usr/local/bin/docker-compose
|
||||
|
||||
# Install tabby python client.
|
||||
pip install -r requirements.txt
|
||||
|
||||
# On certain cloud providers (e.g lambda cloud), the default user is not added to docker group, so we need sudo here
|
||||
sudo docker-compose pull
|
||||
|
||||
# Add current user to docker group, it won't take effect immediately as skypilot job is started by a long-running daemon.
|
||||
sudo usermod -aG docker $USER
|
||||
|
||||
|
||||
run: |
|
||||
./eval.sh
|
||||
|
|
@ -0,0 +1,107 @@
|
|||
from pathlib import Path
|
||||
|
||||
from modal import Image, Mount, Secret, Stub, asgi_app, gpu, method
|
||||
|
||||
# Single T4 GPU per container; passed to @stub.cls below.
GPU_CONFIG = gpu.T4()
# Model served by the tabby binary and pre-downloaded into the image.
MODEL_ID = "TabbyML/StarCoder-1B"
# CLI arguments appended to /opt/tabby/bin/tabby when the container starts.
LAUNCH_FLAGS = [
    "serve",
    "--model",
    MODEL_ID,
    "--port",
    "8000",
    "--device",
    "cuda"
]
|
||||
|
||||
def download_model():
    """Pre-download MODEL_ID's weights via the tabby CLI.

    Invoked through ``Image.run_function`` so the weights are baked into the
    container image instead of being fetched at every container start.
    """
    import subprocess

    subprocess.run(
        [
            "/opt/tabby/bin/tabby",
            "download",
            "--model",
            MODEL_ID,
        ],
        # BUG FIX: without check=True a failed download exited silently and
        # the build produced an image with no model weights.
        check=True,
    )
|
||||
|
||||
|
||||
# Container image: official tabby release with Python added, the image's
# ENTRYPOINT cleared (modal supplies its own command), the model weights
# pre-downloaded at build time, and the generated tabby python client installed.
image = (
    Image.from_registry("tabbyml/tabby:0.3.0", add_python="3.11")
    .dockerfile_commands("ENTRYPOINT []")
    .run_function(download_model)
    .pip_install("git+https://github.com/TabbyML/tabby.git#egg=tabby-python-client&subdirectory=clients/tabby-python-client")
)

# App name is derived from the model, e.g. "tabby-StarCoder-1B".
stub = Stub("tabby-" + MODEL_ID.split("/")[-1], image=image)
|
||||
|
||||
|
||||
@stub.cls(
    gpu=GPU_CONFIG,
    allow_concurrent_inputs=10,
    container_idle_timeout=60 * 10,
    timeout=360,
)
class Model:
    """Modal container class that runs a tabby server and proxies completions.

    __enter__ launches the tabby server as a subprocess and blocks until its
    HTTP port accepts connections; complete() forwards a single completion
    request to it via the generated tabby client.
    """

    def __enter__(self):
        import socket
        import subprocess
        import time

        from tabby_client import Client

        # Launch the tabby server in the background; LAUNCH_FLAGS binds it
        # to port 8000 on CUDA.
        self.launcher = subprocess.Popen(
            ["/opt/tabby/bin/tabby"] + LAUNCH_FLAGS
        )
        self.client = Client("http://127.0.0.1:8000")

        # Poll until webserver at 127.0.0.1:8000 accepts connections before running inputs.
        def webserver_ready():
            try:
                socket.create_connection(("127.0.0.1", 8000), timeout=1).close()
                return True
            except (socket.timeout, ConnectionRefusedError):
                # Check if launcher webserving process has exited.
                # If so, a connection can never be made.
                retcode = self.launcher.poll()
                if retcode is not None:
                    raise RuntimeError(
                        f"launcher exited unexpectedly with code {retcode}"
                    )
                return False

        while not webserver_ready():
            time.sleep(1.0)

        print("Tabby server ready!")

    def __exit__(self, _exc_type, _exc_value, _traceback):
        # Stop the background tabby server when the container winds down.
        self.launcher.terminate()

    @method()
    async def complete(self, language: str, prompt: str):
        """Request one completion for *prompt* and return the first choice's text."""
        from tabby_client.api.v1 import completion
        from tabby_client.models import CompletionRequest, CompletionResponse, Segments

        # NOTE(review): Segments is imported but unused here, and the sibling
        # eval script builds requests as segments=Segments(prefix=...) —
        # confirm CompletionRequest actually accepts a bare `prompt` kwarg.
        request = CompletionRequest(
            language=language, prompt=prompt
        )
        resp: CompletionResponse = await completion.asyncio(client=self.client, json_body=request)
        return resp.choices[0].text
|
||||
|
||||
|
||||
@stub.local_entrypoint()
def main():
    """Run every sample prompt through the deployed model, writing JSONL results.

    Reads ./sample.jsonl (one JSON object per line with `crossfile_context`,
    `prompt`, and `groundtruth` fields) and writes one result object per line
    to ./output.jsonl.
    """
    import json

    model = Model()
    with open("./output.jsonl", "w") as fout:
        with open("./sample.jsonl") as fin:
            for line in fin:
                x = json.loads(line)
                prompt = x['crossfile_context']['text'] + x['prompt']
                label = x['groundtruth']
                prediction = model.complete.remote("python", prompt)

                json.dump(dict(prompt=prompt, label=label, prediction=prediction), fout)
                # BUG FIX: json.dump writes no trailing newline, so records ran
                # together on one line and output.jsonl was not valid JSONL.
                fout.write("\n")
|
|
@ -1,23 +0,0 @@
|
|||
# Streamlit side-by-side viewer for the eval output: one three-column row per
# record — prompt | prediction+label | scores.
import streamlit as st
import pandas as pd
import altair as alt

st.set_page_config(layout="wide")

# One row per eval record produced by main.py (JSON Lines).
df = pd.read_json("reports.jsonl", lines=True)

for _, v in df.iterrows():
    col1, col2, col3 = st.columns(3)
    with col1:
        st.write("prompt")
        st.code(v.prompt)
    with col2:
        st.write("prediction")
        st.code(v.prediction)
        st.write("label")
        st.code(v.label)
    with col3:
        # NOTE(review): this inner st.columns(2) shadows the outer col1/col2
        # and its result is never used — the metrics below render directly in
        # col3. Left in place since removing it would change the layout.
        col1, col2 = st.columns(2)
        st.metric("Line score", v.line_score)
        st.metric("Block score", v.block_score)
    st.divider()
||||
Loading…
Reference in New Issue