feat: upgrade llama.cpp (#645)
* feat: upgrade llama.cpp * update download files * update changelog * Update CHANGELOG.md * Update CHANGELOG.md
release-notes-05
parent
89d1765422
commit
f37840566b
|
|
@ -1,8 +1,12 @@
|
|||
# v0.5.0 [Unreleased]
|
||||
|
||||
## Notice
|
||||
* The llama.cpp backend (CPU, Metal) now requires re-downloading GGUF models due to upstream format changes: https://github.com/TabbyML/tabby/pull/645 https://github.com/ggerganov/llama.cpp/pull/3252
|
||||
|
||||
## Features
|
||||
|
||||
## Fixes and Improvements
|
||||
|
||||
* Switch CPU backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
|
||||
* Add `server.completion_timeout` to control the timeout of the code completion interface: https://github.com/TabbyML/tabby/pull/637
|
||||
|
||||
|
|
|
|||
|
|
@ -1 +1 @@
|
|||
Subproject commit 6ed7dce31afdf4d5a11ed8bfd0f993dcb8df39c0
|
||||
Subproject commit 5cc49e631f0902f33b10b7703b4d174fd635ccd9
|
||||
|
|
@ -21,7 +21,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
|||
TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
|
||||
model_(std::move(model)),
|
||||
ctx_(std::move(ctx)) {
|
||||
batch_ = llama_batch_init(N_BATCH, 0);
|
||||
}
|
||||
|
||||
void start(rust::Slice<const uint32_t> input_token_ids) override {
|
||||
|
|
@ -46,14 +45,14 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
|||
}
|
||||
|
||||
uint32_t eos_token() const override {
|
||||
return llama_token_eos(ctx_.get());
|
||||
return llama_token_eos(llama_get_model(ctx_.get()));
|
||||
}
|
||||
|
||||
private:
|
||||
uint32_t sample() const {
|
||||
auto* ctx = ctx_.get();
|
||||
|
||||
auto logits = llama_get_logits_ith(ctx, batch_.n_tokens - 1);
|
||||
auto logits = llama_get_logits_ith(ctx, 0);
|
||||
auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
||||
|
||||
// Greedy sampling (always select the highest logit).
|
||||
|
|
@ -65,18 +64,9 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
|||
n_past_ = 0;
|
||||
}
|
||||
|
||||
batch_.n_tokens = size;
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
batch_.token[i] = data[i];
|
||||
batch_.pos[i] = n_past_ + i;
|
||||
batch_.seq_id[i] = 0;
|
||||
batch_.logits[i] = false;
|
||||
}
|
||||
batch_.logits[size - 1] = true;
|
||||
|
||||
auto* ctx = ctx_.get();
|
||||
llama_kv_cache_tokens_rm(ctx, n_past_, -1);
|
||||
if (llama_decode(ctx, batch_)) {
|
||||
if (llama_decode(ctx, llama_batch_get_one(data, size, n_past_, 0))) {
|
||||
throw std::runtime_error("Failed to eval");
|
||||
}
|
||||
|
||||
|
|
@ -86,8 +76,6 @@ class TextInferenceEngineImpl : public TextInferenceEngine {
|
|||
size_t n_past_;
|
||||
owned<llama_model> model_;
|
||||
owned<llama_context> ctx_;
|
||||
|
||||
llama_batch batch_;
|
||||
};
|
||||
|
||||
static int g_llama_cpp_log_level = 0;
|
||||
|
|
|
|||
|
|
@ -89,4 +89,8 @@ impl ModelDir {
|
|||
/// Returns the result of `path_string` for the relative path `ggml/q8_0.gguf`.
///
/// NOTE(review): presumably this resolves to the q8_0 GGUF model file inside
/// this model directory — confirm against `path_string`.
pub fn ggml_q8_0_file(&self) -> String {
|
||||
self.path_string("ggml/q8_0.gguf")
|
||||
}
|
||||
|
||||
/// Returns the result of `path_string` for the relative path `ggml/q8_0.v2.gguf`.
///
/// NOTE(review): the `.v2` name presumably marks the model file re-exported
/// for the newer upstream GGUF format — confirm against the downloader's file list.
pub fn ggml_q8_0_v2_file(&self) -> String {
|
||||
self.path_string("ggml/q8_0.v2.gguf")
|
||||
}
|
||||
}
|
||||
|
|
|
|||
|
|
@ -48,7 +48,7 @@ impl Downloader {
|
|||
let files = vec![
|
||||
("tabby.json", true),
|
||||
("tokenizer.json", true),
|
||||
("ggml/q8_0.gguf", true),
|
||||
("ggml/q8_0.v2.gguf", true),
|
||||
];
|
||||
self.download_files(&files).await
|
||||
}
|
||||
|
|
|
|||
|
|
@ -82,7 +82,7 @@ fn create_ctranslate2_engine(
|
|||
|
||||
fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
|
||||
let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
|
||||
.model_path(model_dir.ggml_q8_0_file())
|
||||
.model_path(model_dir.ggml_q8_0_v2_file())
|
||||
.tokenizer_path(model_dir.tokenizer_file())
|
||||
.use_gpu(device.ggml_use_gpu())
|
||||
.build()
|
||||
|
|
|
|||
Loading…
Reference in New Issue