feat: upgrade llama.cpp (#645)
* feat: upgrade llama.cpp * update download files * update changelog * Update CHANGELOG.md * Update CHANGELOG.md
release-notes-05
parent
89d1765422
commit
f37840566b
|
|
@ -1,8 +1,12 @@
|
||||||
# v0.5.0 [Unreleased]
|
# v0.5.0 [Unreleased]
|
||||||
|
|
||||||
|
## Notice
|
||||||
|
* The llama.cpp backend (CPU, Metal) now requires re-downloading the GGUF model because of upstream format changes: https://github.com/TabbyML/tabby/pull/645 https://github.com/ggerganov/llama.cpp/pull/3252
|
||||||
|
|
||||||
## Features
|
## Features
|
||||||
|
|
||||||
## Fixes and Improvements
|
## Fixes and Improvements
|
||||||
|
|
||||||
* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
|
* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
|
||||||
* Add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637
|
* Add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1 +1 @@
|
||||||
Subproject commit 6ed7dce31afdf4d5a11ed8bfd0f993dcb8df39c0
|
Subproject commit 5cc49e631f0902f33b10b7703b4d174fd635ccd9
|
||||||
|
|
@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
||||||
TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
|
TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
|
||||||
model_(std::move(model)),
|
model_(std::move(model)),
|
||||||
ctx_(std::move(ctx)) {
|
ctx_(std::move(ctx)) {
|
||||||
batch_ = llama_batch_init(N_BATCH, 0);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
void start(rust::Slice<const uint32_t> input_token_ids) override {
|
void start(rust::Slice<const uint32_t> input_token_ids) override {
|
||||||
|
|
@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t eos_token() const override {
|
uint32_t eos_token() const override {
|
||||||
return llama_token_eos(ctx_.get());
|
return llama_token_eos(llama_get_model(ctx_.get()));
|
||||||
}
|
}
|
||||||
|
|
||||||
private:
|
private:
|
||||||
uint32_t sample() const {
|
uint32_t sample() const {
|
||||||
auto* ctx = ctx_.get();
|
auto* ctx = ctx_.get();
|
||||||
|
|
||||||
auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
|
auto logits = llama_get_logits_ith(ctx, 0);
|
||||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||||
|
|
||||||
// Greedy sampling (always select the highest logit).
|
// Greedy sampling (always select the highest logit).
|
||||||
|
|
@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
||||||
n_past_ = 0;
|
n_past_ = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
batch_.n_tokens = size;
|
|
||||||
for (size_t i = 0; i < size; ++i) {
|
|
||||||
batch_.token[i] = data[i];
|
|
||||||
batch_.pos[i] = n_past_ + i;
|
|
||||||
batch_.seq_id[i] = 0;
|
|
||||||
batch_.logits[i] = false;
|
|
||||||
}
|
|
||||||
batch_.logits[size - 1] = true;
|
|
||||||
|
|
||||||
auto* ctx = ctx_.get();
|
auto* ctx = ctx_.get();
|
||||||
llama_kv_cache_tokens_rm(ctx, n_past_, -1);
|
llama_kv_cache_tokens_rm(ctx, n_past_, -1);
|
||||||
if (llama_decode(ctx, batch_)) {
|
if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
|
||||||
throw std::runtime_error("Failed to eval");
|
throw std::runtime_error("Failed to eval");
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
||||||
size_t n_past_;
|
size_t n_past_;
|
||||||
owned<llama_model> model_;
|
owned<llama_model> model_;
|
||||||
owned<llama_context> ctx_;
|
owned<llama_context> ctx_;
|
||||||
|
|
||||||
llama_batch batch_;
|
|
||||||
};
|
};
|
||||||
|
|
||||||
static int g_llama_cpp_log_level = 0;
|
static int g_llama_cpp_log_level = 0;
|
||||||
|
|
|
||||||
|
|
@ -89,4 +89,8 @@ impl ModelDir {
|
||||||
pub fn ggml_q8_0_file(&self) -> String {
|
pub fn ggml_q8_0_file(&self) -> String {
|
||||||
self.path_string("ggml/q8_0.gguf")
|
self.path_string("ggml/q8_0.gguf")
|
||||||
}
|
}
|
||||||
|
|
||||||
|
pub fn ggml_q8_0_v2_file(&self) -> String {
|
||||||
|
self.path_string("ggml/q8_0.v2.gguf")
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -48,7 +48,7 @@ impl Downloader {
|
||||||
let files = vec![
|
let files = vec![
|
||||||
("tabby.json", true),
|
("tabby.json", true),
|
||||||
("tokenizer.json", true),
|
("tokenizer.json", true),
|
||||||
("ggml/q8_0.gguf", true),
|
("ggml/q8_0.v2.gguf", true),
|
||||||
];
|
];
|
||||||
self.download_files(&files).await
|
self.download_files(&files).await
|
||||||
}
|
}
|
||||||
|
|
|
||||||
|
|
@ -82,7 +82,7 @@ fn create_ctranslate2_engine(
|
||||||
|
|
||||||
fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
|
fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
|
||||||
let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
|
let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
|
||||||
.model_path(model_dir.ggml_q8_0_file())
|
.model_path(model_dir.ggml_q8_0_v2_file())
|
||||||
.tokenizer_path(model_dir.tokenizer_file())
|
.tokenizer_path(model_dir.tokenizer_file())
|
||||||
.use_gpu(device.ggml_use_gpu())
|
.use_gpu(device.ggml_use_gpu())
|
||||||
.build()
|
.build()
|
||||||
|
|
|
||||||
Loading…
Reference in New Issue