tabby/crates/llama-cpp-bindings/src/engine.cc

#include "engine.h"

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <exception>
#include <functional>
#include <iterator>
#include <memory>
#include <string>
#include <utility>
#include <vector>

#include <ggml.h>
#include <llama.h>

namespace llama {
// Out-of-line definition of the TextInferenceEngine destructor; it performs
// no work of its own.
TextInferenceEngine::~TextInferenceEngine() = default;

namespace {
// Upper bound on the number of tokens handed to a single evaluation call when
// the prompt is fed in chunks; the same value is used for the context's
// n_batch setting. Nothing in this file modifies it, so it is a compile-time
// constant (the enclosing anonymous namespace already gives it internal
// linkage).
constexpr size_t N_BATCH = 512;

// Owning pointer whose deleter is a type-erased callable, so any function
// taking a `Resource*` can be supplied as the release routine at construction.
template <typename Resource>
using owned = std::unique_ptr<Resource, std::function<void(Resource*)>>;

// TextInferenceEngine implementation backed by a llama.cpp model and context.
//
// Usage is start() with the prompt tokens, then repeated step() calls to
// obtain one generated token at a time, then end() to print timing data.
// Token selection is greedy (highest logit).
class TextInferenceEngineImpl : public TextInferenceEngine {
 public:
  // Takes ownership of both handles. model_ is declared before ctx_, so on
  // destruction the context is released first and the model afterwards.
  TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :
    model_(std::move(model)),
    ctx_(std::move(ctx)) {
  }

  // Resets the timing counters and evaluates the prompt in chunks of at most
  // N_BATCH tokens. The first chunk is evaluated with a past-token count of 0;
  // later chunks continue from the context's current KV-cache token count.
  // An empty prompt evaluates nothing.
  void start(rust::Slice<const uint32_t> input_token_ids) const override {
    llama_reset_timings(ctx_.get());
    std::vector<llama_token> tokens_list(input_token_ids.begin(), input_token_ids.end());

    for (size_t i = 0; i < tokens_list.size(); i += N_BATCH) {
      const size_t size = std::min(N_BATCH, tokens_list.size() - i);
      // Stop at the first failed chunk: feeding later chunks would place them
      // at the wrong position because the failed chunk is missing. eval() has
      // already reported the error on stderr.
      if (!eval(tokens_list.data() + i, size, /* reset = */ i == 0)) {
        break;
      }
    }
  }

  // Picks the next token greedily, feeds it back into the context and returns
  // its id.
  uint32_t step() const override {
    // Deliberately non-const: eval() needs a mutable pointer, and obtaining
    // one via const_cast from a truly const object is undefined behaviour as
    // soon as anything writes through it.
    llama_token id = sample();
    eval(&id, 1, /* reset = */ false);
    return id;
  }

  // Prints the timing information collected by the context.
  void end() const override {
    llama_print_timings(ctx_.get());
  }

  // Returns the token id reported by llama_token_eos for this context.
  uint32_t eos_token() const override {
    return llama_token_eos(ctx_.get());
  }

 private:
  // Returns the index of the largest value among the first n_vocab logits of
  // the context (first occurrence on ties, per std::max_element).
  uint32_t sample() const {
    auto* ctx = ctx_.get();

    const float* logits = llama_get_logits(ctx);
    const auto n_vocab = llama_n_vocab(llama_get_model(ctx));

    // Greedy sampling (always select the highest logit).
    return static_cast<uint32_t>(
        std::distance(logits, std::max_element(logits, logits + n_vocab)));
  }

  // Evaluates `size` tokens starting at `data`. When `reset` is true the
  // past-token count passed to llama_eval is 0; otherwise it is the context's
  // current KV-cache token count. Returns false (after logging to stderr) if
  // llama_eval reports a non-zero status, true otherwise.
  bool eval(llama_token* data, size_t size, bool reset) const {
    auto* ctx = ctx_.get();
    const int n_past = reset ? 0 : llama_get_kv_cache_token_count(ctx);
    if (llama_eval(ctx, data, static_cast<int>(size), n_past)) {
      fprintf(stderr, "%s : failed to eval\n", __func__);
      return false;
    }

    return true;
  }

  owned<llama_model> model_;
  owned<llama_context> ctx_;
};

// Threshold used by llama_log_callback: a message is emitted only when its
// level compares less than this value. The default of 0 suppresses every
// message whose level value is non-negative.
static int g_llama_cpp_log_level = 0;

// Log sink handed to llama_log_set. Forwards `text` to stderr (flushing
// immediately) when `level` is below the configured threshold; the user-data
// pointer is unused.
static void llama_log_callback(ggml_log_level level, const char * text, void * /* user_data */) {
  if (level >= g_llama_cpp_log_level) {
    return;
  }

  fputs(text, stderr);
  fflush(stderr);
}

// Sets up the llama.cpp backend on construction and frees it on destruction.
// create_engine keeps a single function-local static instance, so the setup
// runs once on first use.
struct BackendInitializer {
  // Reads the optional LLAMA_CPP_LOG_LEVEL environment variable into the log
  // threshold, installs the log callback and initialises the backend.
  BackendInitializer() {
    if (const char* level = std::getenv("LLAMA_CPP_LOG_LEVEL")) {
      // std::stoi throws std::invalid_argument / std::out_of_range on text
      // that is not a valid int. A malformed environment variable must not
      // abort engine creation, so keep the current threshold and warn instead.
      try {
        g_llama_cpp_log_level = std::stoi(level);
      } catch (const std::exception&) {
        fprintf(stderr,
                "%s: warning: ignoring invalid LLAMA_CPP_LOG_LEVEL '%s'\n",
                __func__, level);
      }
    }
    llama_log_set(llama_log_callback, nullptr);
    llama_backend_init(false);
  }

  // The destructor releases the backend, so copies must never exist.
  BackendInitializer(const BackendInitializer&) = delete;
  BackendInitializer& operator=(const BackendInitializer&) = delete;

  ~BackendInitializer() {
    llama_backend_free();
  }
};
} // namespace

// Loads the model at `model_path`, creates a context for it and wraps both in
// a TextInferenceEngine.
//
// The llama.cpp backend is initialised on the first call through a
// function-local static. Returns nullptr (after logging to stderr) when either
// the model or the context cannot be created; nothing is leaked in that case
// because both handles are owned from the moment they are obtained.
std::shared_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
  static BackendInitializer initializer;

  llama_model_params model_params = llama_model_default_params();
  model_params.n_gpu_layers = 1;
  const std::string path(model_path);
  owned<llama_model> model(
      llama_load_model_from_file(path.c_str(), model_params),
      llama_free_model);

  if (!model) {
    fprintf(stderr , "%s: error: unable to load model\n" , __func__);
    return nullptr;
  }

  llama_context_params ctx_params = llama_context_default_params();
  ctx_params.n_ctx = 2048;
  ctx_params.n_batch = N_BATCH;
  owned<llama_context> ctx(
      llama_new_context_with_model(model.get(), ctx_params),
      llama_free);

  // Without this check a null context would be stored in the engine and
  // dereferenced by the first start()/step() call.
  if (!ctx) {
    fprintf(stderr , "%s: error: unable to create context\n" , __func__);
    return nullptr;
  }

  return std::make_shared<TextInferenceEngineImpl>(std::move(model), std::move(ctx));
}

}  // namespace llama
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`#include "engine.h"`

			`#include <functional>`
			`#include <vector>`

			`#include <ggml.h>`
			`#include <llama.h>`

			`namespace llama {`
			`TextInferenceEngine::~TextInferenceEngine() {}`

			`namespace {`
feat: implement input truncation for llama-cpp-bindings (#416) * feat: implement input truncation for llama-cpp-bindings * set max input length to 1024 * fix: batching tokens with n_batches * fix batching 2023-09-08 16:20:51 +00:00			`static size_t N_BATCH = 512;`

feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`template<class T>`
			`using owned = std::unique_ptr<T, std::function<void(T*)>>;`

			`class TextInferenceEngineImpl : public TextInferenceEngine {`
			`public:`
			`TextInferenceEngineImpl(owned<llama_model> model, owned<llama_context> ctx) :`
			`model_(std::move(model)),`
			`ctx_(std::move(ctx)) {`
			`}`

fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`void start(rust::Slice<const uint32_t> input_token_ids) const override {`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`auto* ctx = ctx_.get();`
feat: tune llama metal backend performance (#393) * feat: support eos based stop * feat: print performance stats after each inference * update llama.cpp * update commits 2023-09-05 02:14:29 +00:00			`llama_reset_timings(ctx);`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`std::vector<llama_token> tokens_list(input_token_ids.begin(), input_token_ids.end());`
feat: implement input truncation for llama-cpp-bindings (#416) * feat: implement input truncation for llama-cpp-bindings * set max input length to 1024 * fix: batching tokens with n_batches * fix batching 2023-09-08 16:20:51 +00:00
			`for (size_t i = 0; i < tokens_list.size(); i += N_BATCH) {`
			`const size_t size = std::min(N_BATCH, tokens_list.size() - i);`
			`eval(tokens_list.data() + i, size, /* reset = */ i == 0);`
			`}`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`

fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`uint32_t step() const override {`
			`const llama_token id = sample();`
feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`eval(const_cast<llama_token>(&id), 1, / reset = */ false);`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`return id;`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`

feat: tune llama metal backend performance (#393) * feat: support eos based stop * feat: print performance stats after each inference * update llama.cpp * update commits 2023-09-05 02:14:29 +00:00			`void end() const override {`
			`llama_print_timings(ctx_.get());`
			`}`

			`uint32_t eos_token() const override {`
			`return llama_token_eos(ctx_.get());`
			`}`

feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`private:`
			`uint32_t sample() const {`
			`auto* ctx = ctx_.get();`

			`auto logits = llama_get_logits(ctx);`
feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`auto n_vocab = llama_n_vocab(llama_get_model(ctx));`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00
			`// Greedy sampling (always select the highest logit).`
			`return std::distance(logits, std::max_element(logits, logits + n_vocab));`
			`}`

feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`bool eval(llama_token* data, size_t size, bool reset) const {`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`auto* ctx = ctx_.get();`
			`if (llama_eval(`
			`ctx,`
feat: implement input truncation for llama-cpp-bindings (#416) * feat: implement input truncation for llama-cpp-bindings * set max input length to 1024 * fix: batching tokens with n_batches * fix batching 2023-09-08 16:20:51 +00:00			`data,`
			`size,`
feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`reset ? 0 : llama_get_kv_cache_token_count(ctx))) {`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`fprintf(stderr, "%s : failed to eval\n", __func__);`
			`return false;`
			`}`

			`return true;`
			`}`

			`owned<llama_model> model_;`
			`owned<llama_context> ctx_;`
			`};`

feat: add LLAMA_CPP_LOG_LEVEL to control log level of llama.cpp (#436) 2023-09-12 14:41:39 +00:00			`static int g_llama_cpp_log_level = 0;`
feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`static void llama_log_callback(ggml_log_level level, const char * text, void * user_data) {`
feat: add LLAMA_CPP_LOG_LEVEL to control log level of llama.cpp (#436) 2023-09-12 14:41:39 +00:00			`(void)user_data;`
			`if (level < g_llama_cpp_log_level) {`
			`fputs(text, stderr);`
			`fflush(stderr);`
			`}`
			`}`

feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`struct BackendInitializer {`
			`BackendInitializer() {`
feat: add LLAMA_CPP_LOG_LEVEL to control log level of llama.cpp (#436) 2023-09-12 14:41:39 +00:00			`if (const char* level = std::getenv("LLAMA_CPP_LOG_LEVEL")) {`
			`g_llama_cpp_log_level = std::stoi(level);`
			`}`
			`llama_log_set(llama_log_callback, nullptr);`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`llama_backend_init(false);`
			`}`

			`~BackendInitializer() {`
			`llama_backend_free();`
			`}`
			`};`
			`} // namespace`

			`std::shared_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {`
			`static BackendInitializer initializer;`

feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`llama_model_params model_params = llama_model_default_params();`
			`model_params.n_gpu_layers = 1;`
			`llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00
			`if (!model) {`
			`fprintf(stderr , "%s: error: unable to load model\n" , __func__);`
			`return nullptr;`
			`}`

feat: update llama.cpp (#488) * feat: update llama.cpp * remove useless include 2023-09-28 23:59:59 +00:00			`llama_context_params ctx_params = llama_context_default_params();`
			`ctx_params.n_ctx = 2048;`
			`ctx_params.n_batch = N_BATCH;`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`llama_context* ctx = llama_new_context_with_model(model, ctx_params);`

			`return std::make_shared<TextInferenceEngineImpl>(`
			`owned<llama_model>(model, llama_free_model),`
			`owned<llama_context>(ctx, llama_free)`
			`);`
			`}`

			`} // namespace tabby`