# tabby/deployment/docker-compose.triton.yml

# Compose file format version declared for this file.
# NOTE(review): the conditional `depends_on` entries and the healthcheck
# `start_period` used in this file are newer than the 3.3 schema, so this
# presumably relies on Compose tooling that validates against the unified
# Compose Specification — confirm the minimum supported docker-compose release
# before changing or removing this value.
version: '3.3'

services:
  # Tabby server container, pointed at the Triton model backend.
  server:
    image: tabbyml/tabby
    # Hold startup until the `triton` service's healthcheck reports healthy.
    depends_on:
      triton:
        condition: service_healthy
    environment:
      MODEL_BACKEND: triton

  # Model-serving container built from the tabbyml/fastertransformer_backend
  # image; the `server` service waits on this container's healthcheck.
  triton:
    container_name: tabby-triton
    image: tabbyml/fastertransformer_backend
    # The start script comes from the ./scripts bind mount declared below.
    command: /scripts/triton.sh
    environment:
      MODEL_NAME: TabbyML/NeoX-70M
    shm_size: 1gb
    volumes:
      - './scripts:/scripts'
      # Host directory mapped onto the Hugging Face cache path in the
      # container (presumably so downloads survive restarts — verify).
      - './data/hf_cache:/root/.cache/huggingface'
    # Start only after the `init` service has exited successfully. `init` is
    # not defined in this file; presumably it comes from another Compose file
    # that is merged with this one — verify.
    depends_on:
      init:
        condition: service_completed_successfully
    # Reserve every NVIDIA GPU on the host for this container.
    deploy:
      resources:
        reservations:
          devices:
            - capabilities:
                - gpu
              count: all
              driver: nvidia
    # Probe the metrics endpoint on port 8002 every 2s; `curl -f` fails on an
    # HTTP error status. Probes that fail during the 120s start period are
    # not counted against the container.
    healthcheck:
      test:
        - CMD
        - curl
        - '-f'
        - 'http://localhost:8002/metrics'
      interval: 2s
      timeout: 2s
      start_period: 120s