feat: switch cpu backend to llama.cpp (#638)

* feat: switch cpu backend to llama.cpp
* feat: switch cpu serving to ggml
* fix Cargo.toml
* use optional dependency
* fix compilation
* update ci target
parent 21ec60eddf
commit 1a4c2aa71f
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh

       - name: Bulid release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }}
+        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby

       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
@@ -3,6 +3,8 @@
 ## Features

 ## Fixes and Improvements

+* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
+* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637

 # v0.4.0
@@ -16,5 +16,5 @@ class TextInferenceEngine {
   virtual uint32_t eos_token() const = 0;
 };

-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path);
 }  // namespace
@@ -114,11 +114,11 @@ struct BackendInitializer {
 };
 }  // namespace

-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path) {
   static BackendInitializer initializer;

   llama_model_params model_params = llama_model_default_params();
-  model_params.n_gpu_layers = 1;
+  model_params.n_gpu_layers = use_gpu ? 1 : 0;
   llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);

   if (!model) {
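In llama.cpp, `n_gpu_layers` sets how many transformer layers are offloaded to the GPU backend: `0` keeps inference entirely on the CPU, while a non-zero value enables GPU execution (on Metal at this point in llama.cpp's history, offloading a single layer was enough to route computation through the GPU, which is why the boolean maps to `1` rather than a real layer count).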
@@ -15,7 +15,7 @@ mod ffi {

         type TextInferenceEngine;

-        fn create_engine(model_path: &str) -> UniquePtr<TextInferenceEngine>;
+        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;

         fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
         fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
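cxx generates the glue for this bridge, so the Rust signature must stay in lockstep with the C++ declaration from the earlier header hunk: `bool` crosses the boundary as C++ `bool`, `&str` as `rust::Str`, and `UniquePtr<TextInferenceEngine>` as `std::unique_ptr<TextInferenceEngine>`.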
@@ -32,6 +32,7 @@ unsafe impl Sync for ffi::TextInferenceEngine {}
 pub struct LlamaEngineOptions {
     model_path: String,
     tokenizer_path: String,
+    use_gpu: bool,
 }

 pub struct LlamaEngine {
@@ -42,7 +43,7 @@ pub struct LlamaEngine {

 impl LlamaEngine {
     pub fn create(options: LlamaEngineOptions) -> Self {
-        let engine = create_engine(&options.model_path);
+        let engine = create_engine(options.use_gpu, &options.model_path);
         if engine.is_null() {
             panic!("Unable to load model: {}", options.model_path);
         }
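Putting the pieces together, a caller builds the options and hands them to `LlamaEngine::create`. A minimal sketch, assuming `LlamaEngineOptionsBuilder` (used in a later hunk) is the derive_builder-style companion of `LlamaEngineOptions`; the paths below are placeholders:

```rust
use llama_cpp_bindings::{LlamaEngine, LlamaEngineOptionsBuilder};

fn main() {
    let options = LlamaEngineOptionsBuilder::default()
        .model_path("path/to/ggml-model".into()) // placeholder path
        .tokenizer_path("path/to/tokenizer.json".into()) // placeholder path
        .use_gpu(false) // false => n_gpu_layers = 0, pure CPU inference
        .build()
        .unwrap();

    // Panics with "Unable to load model: ..." when llama.cpp cannot load the file.
    let _engine = LlamaEngine::create(options);
}
```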
@@ -4,7 +4,6 @@ version = "0.5.0-dev"
 edition = "2021"

 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }
@@ -43,9 +42,8 @@ minijinja = { version = "1.0.8", features = ["loader"] }
 textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
-
-[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies]
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }

 [dependencies.uuid]
 version = "1.3.3"
@@ -57,6 +55,7 @@ features = [

 [features]
 link_shared = ["ctranslate2-bindings/link_shared"]
+link_cuda_static = ["ctranslate2-bindings"]

 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
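With `ctranslate2-bindings` now optional, a default build (the CI step above: `cargo build --no-default-features --release --package tabby`) links only the llama.cpp backend; enabling `link_shared` or `link_cuda_static` pulls the CUDA-capable ctranslate2 backend back in, and the `cfg(feature = ...)` gates in the following hunks key off exactly these two features.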
@@ -1,6 +1,5 @@
 use std::path::Path;

-use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
 use serde::Deserialize;
 use tabby_common::path::ModelDir;
 use tabby_inference::TextGeneration;
@@ -39,33 +38,36 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }

-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
-    metadata: &Metadata,
+    _metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    create_ctranslate2_engine(args, model_dir, metadata)
+    create_ggml_engine(&args.device, model_dir)
 }

-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal {
-        create_ctranslate2_engine(args, model_dir, metadata)
+    if args.device.use_ggml_backend() {
+        create_ggml_engine(&args.device, model_dir)
     } else {
-        create_llama_engine(model_dir)
+        create_ctranslate2_engine(args, model_dir, metadata)
     }
 }

+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_ctranslate2_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
+    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
+
     let device = format!("{}", args.device);
     let options = CTranslate2EngineOptionsBuilder::default()
         .model_path(model_dir.ctranslate2_dir())
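The dispatch now keys off Cargo features instead of the compile target: `create_local_engine` is defined twice and exactly one definition survives `cfg` evaluation. A minimal sketch of that pattern, using a hypothetical `gpu` feature rather than the PR's real ones:

```rust
// Hypothetical feature name, for illustration only.
#[cfg(feature = "gpu")]
fn backend_name() -> &'static str {
    "ctranslate2" // compiled only when the feature is enabled
}

#[cfg(not(feature = "gpu"))]
fn backend_name() -> &'static str {
    "ggml" // compiled otherwise; the two definitions never coexist
}

fn main() {
    println!("selected backend: {}", backend_name());
}
```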
@@ -78,11 +80,11 @@ fn create_ctranslate2_engine(
     Box::new(CTranslate2Engine::create(options))
 }

-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn create_llama_engine(model_dir: &ModelDir) -> Box<dyn TextGeneration> {
+fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_file())
         .tokenizer_path(model_dir.tokenizer_file())
+        .use_gpu(device.ggml_use_gpu())
         .build()
         .unwrap();
@@ -99,6 +101,7 @@ fn get_model_dir(model: &str) -> ModelDir {

 #[derive(Deserialize)]
 struct Metadata {
+    #[allow(dead_code)]
     auto_model: String,
     prompt_template: Option<String>,
     chat_template: Option<String>,
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,

     #[strum(serialize = "cuda")]
+    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
     Cuda,

     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
@@ -85,6 +85,28 @@ pub enum Device {
     ExperimentalHttp,
 }

+impl Device {
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Metal || *self == Device::Cpu
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Cpu
+    }
+
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn ggml_use_gpu(&self) -> bool {
+        *self == Device::Metal
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn ggml_use_gpu(&self) -> bool {
+        false
+    }
+}
+
 #[derive(Args)]
 pub struct ServeArgs {
     /// Model id for `/completions` API endpoint.
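These helpers fold the compile target and the requested device into one backend decision. A standalone sketch of the resulting truth table, where the `is_apple_silicon` parameter stands in for the `cfg(all(target_os = "macos", target_arch = "aarch64"))` switch in the real code:

```rust
#[derive(PartialEq)]
#[allow(dead_code)]
enum Device {
    Cpu,
    Cuda,
    Metal,
}

// Which backend serves the device: ggml (llama.cpp) or ctranslate2.
fn use_ggml_backend(device: &Device, is_apple_silicon: bool) -> bool {
    match device {
        Device::Cpu => true,               // cpu now always goes through llama.cpp
        Device::Metal => is_apple_silicon, // metal only exists on aarch64 macOS
        Device::Cuda => false,             // cuda stays on ctranslate2
    }
}

// Whether the ggml engine offloads to the GPU (i.e. n_gpu_layers = 1).
fn ggml_use_gpu(device: &Device, is_apple_silicon: bool) -> bool {
    is_apple_silicon && *device == Device::Metal
}

fn main() {
    assert!(use_ggml_backend(&Device::Cpu, false));
    assert!(!ggml_use_gpu(&Device::Cpu, true));
    assert!(ggml_use_gpu(&Device::Metal, true));
}
```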
@@ -115,16 +137,6 @@ pub struct ServeArgs {
     compute_type: Option<String>,
 }

-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-fn should_download_ggml_files(_device: &Device) -> bool {
-    false
-}
-
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal
-}
-
 pub async fn main(config: &Config, args: &ServeArgs) {
     valid_args(args);
@@ -275,7 +287,7 @@ fn start_heartbeat(args: &ServeArgs) {
 async fn download_model(model: &str, device: &Device) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if should_download_ggml_files(device) {
+    let download_result = if device.use_ggml_backend() {
         downloader.download_ggml_files().await
     } else {
         downloader.download_ctranslate2_files().await
@@ -17,7 +17,6 @@ We recommend using
 | [TabbyML/StarCoder-7B](https://huggingface.co/TabbyML/StarCoder-7B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-3B](https://huggingface.co/TabbyML/StarCoder-3B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-1B](https://huggingface.co/TabbyML/StarCoder-1B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
-| [TabbyML/J-350M](https://huggingface.co/TabbyML/J-350M) | [BSD-3](https://opensource.org/license/bsd-3-clause/) | ❌ | ❌ |

 ## Chat models (`--chat-model`)