feat: upgrade llama.cpp (#645)

* feat: upgrade llama.cpp
* update download files
* update changelog
* Update CHANGELOG.md
* Update CHANGELOG.md
2023-10-27 12:18:46 -07:00 · 2023-10-27 12:18:46 -07:00 · f37840566b
parent 89d1765422
commit f37840566b
6 changed files with 14 additions and 18 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,12 @@
 # v0.5.0 [Unreleased]

+## Notice
+* The llama.cpp backend (CPU, Metal) now requires re-downloading the gguf model due to upstream format changes: https://github.com/TabbyML/tabby/pull/645 https://github.com/ggerganov/llama.cpp/pull/3252
+
 ## Features

 ## Fixes and Improvements
+
 * Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
 * add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637

--- a/crates/llama-cpp-bindings/llama.cpp
+++ b/crates/llama-cpp-bindings/llama.cpp
@@ -1 +1 @@
-Subproject commit 6ed7dce31afdf4d5a11ed8bfd0f993dcb8df39c0
+Subproject commit 5cc49e631f0902f33b10b7703b4d174fd635ccd9
--- a/crates/llama-cpp-bindings/src/engine.cc
+++ b/crates/llama-cpp-bindings/src/engine.cc
@@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
  TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
    model_(std::move(model)),
    ctx_(std::move(ctx)) {
-      batch_ = llama_batch_init(N_BATCH, 0);
  }

  void start(rust::Slice<const uint32_t> input_token_ids) override {
@@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
  }

  uint32_t eos_token() const override {
-    return llama_token_eos(ctx_.get());
+    return llama_token_eos(llama_get_model(ctx_.get()));
  }

 private:
  uint32_t sample() const {
    auto* ctx = ctx_.get();

-    auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
+    auto logits = llama_get_logits_ith(ctx, 0);
    auto n_vocab = llama_n_vocab(llama_get_model(ctx));

    // Greedy sampling (always select the highest logit).
@@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
      n_past_ = 0;
    }

-    batch_.n_tokens = size;
-    for (size_t i = 0; i < size; ++i) {
-      batch_.token[i] = data[i];
-      batch_.pos[i] = n_past_ + i;
-      batch_.seq_id[i] = 0;
-      batch_.logits[i] = false;
-    }
-    batch_.logits[size - 1] = true;
-
    auto* ctx = ctx_.get();
    llama_kv_cache_tokens_rm(ctx, n_past_, -1);
-    if (llama_decode(ctx, batch_)) {
+    if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
      throw std::runtime_error("Failed to eval");
    }

@@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
  size_t n_past_;
  owned<llama_model> model_;
  owned<llama_context> ctx_;
-
-  llama_batch batch_;
 };

 static int g_llama_cpp_log_level = 0;
--- a/crates/tabby-common/src/path.rs
+++ b/crates/tabby-common/src/path.rs
@@ -89,4 +89,8 @@ impl ModelDir {
    pub fn ggml_q8_0_file(&self) -> String {
        self.path_string("ggml/q8_0.gguf")
    }
+
+    pub fn ggml_q8_0_v2_file(&self) -> String {
+        self.path_string("ggml/q8_0.v2.gguf")
+    }
 }
--- a/crates/tabby-download/src/lib.rs
+++ b/crates/tabby-download/src/lib.rs
@@ -48,7 +48,7 @@ impl Downloader {
        let files = vec![
            ("tabby.json", true),
            ("tokenizer.json", true),
-            ("ggml/q8_0.gguf", true),
+            ("ggml/q8_0.v2.gguf", true),
        ];
        self.download_files(&files).await
    }
--- a/crates/tabby/src/serve/engine.rs
+++ b/crates/tabby/src/serve/engine.rs
@@ -82,7 +82,7 @@ fn create_ctranslate2_engine(

 fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
    let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
-        .model_path(model_dir.ggml_q8_0_file())
+        .model_path(model_dir.ggml_q8_0_v2_file())
        .tokenizer_path(model_dir.tokenizer_file())
        .use_gpu(device.ggml_use_gpu())
        .build()