tabby/crates/llama-cpp-bindings/src/lib.rs

use std::sync::Arc;

use async_stream::stream;
use async_trait::async_trait;
use derive_builder::Builder;
use ffi::create_engine;
use futures::{lock::Mutex, stream::BoxStream};
use tabby_inference::{decoding::DecodingFactory, helpers, TextGeneration, TextGenerationOptions};
use tokenizers::tokenizer::Tokenizer;

// cxx bridge to the C++ inference engine declared in `engine.h` (C++ namespace `llama`).
// Only the signatures are visible here; the notes below describe how this file uses each
// function, not what the C++ implementation guarantees.
#[cxx::bridge(namespace = "llama")]
mod ffi {
    unsafe extern "C++" {
        include!("llama-cpp-bindings/include/engine.h");

        // Opaque C++ type; Rust only ever holds it behind a `SharedPtr`.
        type TextInferenceEngine;

        // Builds an engine for the model file at `model_path`.
        // NOTE(review): a cxx `SharedPtr` may be null — confirm what the C++ side returns
        // when the model cannot be loaded.
        fn create_engine(model_path: &str) -> SharedPtr<TextInferenceEngine>;

        // Called once per generation with the prompt token ids, before any `step`.
        fn start(&self, input_token_ids: &[u32]);
        // Returns a token id; the caller treats it as the next generated token.
        fn step(&self) -> u32;
        // Called once after stepping has stopped.
        // NOTE(review): the exact effect is defined in C++ — verify against `engine.h`.
        fn end(&self);

        // Token id the caller compares against `step()` output to detect end of sequence.
        fn eos_token(&self) -> u32;
    }
}

// SAFETY: NOTE(review): nothing visible in this file proves the C++ `TextInferenceEngine`
// is safe to move or share across threads; these impls assert it on the C++ side's behalf.
// Within this file the engine is wrapped in a `Mutex` immediately after `create_engine`
// returns, and is only used while that lock is held — confirm the C++ implementation
// has no thread-affine state before relying on this.
unsafe impl Send for ffi::TextInferenceEngine {}
unsafe impl Sync for ffi::TextInferenceEngine {}

/// File locations needed to construct a [`LlamaEngine`].
///
/// The fields are private; `derive(Builder)` generates a builder type for populating them.
#[derive(Builder, Debug)]
pub struct LlamaEngineOptions {
    /// Path of the model file, passed as-is to the C++ `create_engine` binding.
    model_path: String,
    /// Path of the tokenizer file, loaded with `Tokenizer::from_file`.
    tokenizer_path: String,
}

/// Text-generation backend that drives the C++ llama engine through the `ffi` bridge.
pub struct LlamaEngine {
    /// Handle to the C++ engine. The async mutex is locked for the whole duration of a
    /// generation stream, so only one generation uses the engine at a time.
    engine: Mutex<cxx::SharedPtr<ffi::TextInferenceEngine>>,
    /// Tokenizer used to encode prompts; also handed (as a cloned `Arc`) to each
    /// incremental decoder.
    tokenizer: Arc<Tokenizer>,
    /// Creates the per-request incremental decoder that turns token ids into text.
    decoding_factory: DecodingFactory,
}

impl LlamaEngine {
    /// Builds a [`LlamaEngine`] from `options`.
    ///
    /// Creates the C++ engine for `options.model_path` first, then loads the tokenizer
    /// from `options.tokenizer_path`.
    ///
    /// # Panics
    ///
    /// Panics if the C++ side returns a null engine pointer, or if the tokenizer file
    /// cannot be loaded. Both messages name the offending path.
    pub fn create(options: LlamaEngineOptions) -> Self {
        let engine = create_engine(&options.model_path);
        // A null `SharedPtr` would otherwise go unnoticed until the first method call
        // made through it, where the deref panics without saying which model was at fault.
        assert!(
            !engine.is_null(),
            "failed to create llama engine for model `{}`",
            options.model_path
        );

        let tokenizer = Tokenizer::from_file(&options.tokenizer_path).unwrap_or_else(|err| {
            panic!(
                "failed to load tokenizer from `{}`: {err}",
                options.tokenizer_path
            )
        });

        LlamaEngine {
            engine: Mutex::new(engine),
            tokenizer: Arc::new(tokenizer),
            decoding_factory: DecodingFactory::default(),
        }
    }
}

#[async_trait]
impl TextGeneration for LlamaEngine {
    async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> String {
        let s = self.generate_stream(prompt, options).await;
        helpers::stream_to_string(s).await
    }

    async fn generate_stream(
        &self,
        prompt: &str,
        options: TextGenerationOptions,
    ) -> BoxStream<String> {
        let encoding = self.tokenizer.encode(prompt, true).unwrap();

        let s = stream! {
            let engine = self.engine.lock().await;
            let eos_token = engine.eos_token();

            let input_token_ids = truncate_tokens(encoding.get_ids(), options.max_input_length);
            engine.start(input_token_ids);
            let mut decoding = self.decoding_factory.create_incremental_decoding(self.tokenizer.clone(), input_token_ids, options.stop_words);
            let mut n_remains = options.max_decoding_length ;
            while n_remains > 0 {
                let next_token_id = engine.step();
                if next_token_id == eos_token {
                    break;
                }

                if let Some(new_text) = decoding.next_token(next_token_id) {
                    yield new_text;
                } else {
                    break;
                }

                n_remains -= 1;
            }

            engine.end();
        };

        Box::pin(s)
    }
}

/// Returns the trailing `max_length` elements of `tokens`.
///
/// When `tokens` already has `max_length` elements or fewer, the whole slice is returned.
fn truncate_tokens(tokens: &[u32], max_length: usize) -> &[u32] {
    // Saturates to 0 when `max_length` is at least the slice length, keeping every token.
    let first_kept = tokens.len().saturating_sub(max_length);
    &tokens[first_kept..]
}
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`use std::sync::Arc;`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`use async_stream::stream;`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`use async_trait::async_trait;`
			`use derive_builder::Builder;`
			`use ffi::create_engine;`
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`use futures::{lock::Mutex, stream::BoxStream};`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`use tabby_inference::{decoding::DecodingFactory, helpers, TextGeneration, TextGenerationOptions};`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`use tokenizers::tokenizer::Tokenizer;`

			`#[cxx::bridge(namespace = "llama")]`
			`mod ffi {`
			`unsafe extern "C++" {`
			`include!("llama-cpp-bindings/include/engine.h");`

			`type TextInferenceEngine;`

			`fn create_engine(model_path: &str) -> SharedPtr<TextInferenceEngine>;`

fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`fn start(&self, input_token_ids: &[u32]);`
			`fn step(&self) -> u32;`
feat: tune llama metal backend performance (#393) * feat: support eos based stop * feat: print performance stats after each inference * update llama.cpp * update commits 2023-09-05 02:14:29 +00:00			`fn end(&self);`

			`fn eos_token(&self) -> u32;`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`
			`}`

			`unsafe impl Send for ffi::TextInferenceEngine {}`
			`unsafe impl Sync for ffi::TextInferenceEngine {}`

			`#[derive(Builder, Debug)]`
			`pub struct LlamaEngineOptions {`
			`model_path: String,`
			`tokenizer_path: String,`
			`}`

			`pub struct LlamaEngine {`
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`engine: Mutex<cxx::SharedPtr<ffi::TextInferenceEngine>>,`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`tokenizer: Arc<Tokenizer>,`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`decoding_factory: DecodingFactory,`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`

			`impl LlamaEngine {`
			`pub fn create(options: LlamaEngineOptions) -> Self {`
			`LlamaEngine {`
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`engine: Mutex::new(create_engine(&options.model_path)),`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`tokenizer: Arc::new(Tokenizer::from_file(&options.tokenizer_path).unwrap()),`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`decoding_factory: DecodingFactory::default(),`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`
			`}`
			`}`

			`#[async_trait]`
			`impl TextGeneration for LlamaEngine {`
			`async fn generate(&self, prompt: &str, options: TextGenerationOptions) -> String {`
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`let s = self.generate_stream(prompt, options).await;`
			`helpers::stream_to_string(s).await`
			`}`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`async fn generate_stream(`
			`&self,`
			`prompt: &str,`
			`options: TextGenerationOptions,`
			`) -> BoxStream<String> {`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`let encoding = self.tokenizer.encode(prompt, true).unwrap();`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`let s = stream! {`
			`let engine = self.engine.lock().await;`
feat: tune llama metal backend performance (#393) * feat: support eos based stop * feat: print performance stats after each inference * update llama.cpp * update commits 2023-09-05 02:14:29 +00:00			`let eos_token = engine.eos_token();`

fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00			`let input_token_ids = truncate_tokens(encoding.get_ids(), options.max_input_length);`
			`engine.start(input_token_ids);`
			`let mut decoding = self.decoding_factory.create_incremental_decoding(self.tokenizer.clone(), input_token_ids, options.stop_words);`
			`let mut n_remains = options.max_decoding_length ;`
			`while n_remains > 0 {`
			`let next_token_id = engine.step();`
			`if next_token_id == eos_token {`
			`break;`
feat: support cancellation in llama backend [TAB-146] (#392) * feat: support cancellation in llama backend * fix lint 2023-09-03 02:15:54 +00:00			`}`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00
			`if let Some(new_text) = decoding.next_token(next_token_id) {`
			`yield new_text;`
			`} else {`
			`break;`
			`}`

			`n_remains -= 1;`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`
feat: support cancellation in llama backend [TAB-146] (#392) * feat: support cancellation in llama backend * fix lint 2023-09-03 02:15:54 +00:00
feat: tune llama metal backend performance (#393) * feat: support eos based stop * feat: print performance stats after each inference * update llama.cpp * update commits 2023-09-05 02:14:29 +00:00			`engine.end();`
feat: add /generate and /generate_streaming (#482) * feat: add generate_stream interface * extract engine::create_engine * feat add generate::generate * support streaming in llama.cpp * support streaming in ctranslate2 * update * fix formatting * refactor: extract helpers functions 2023-09-28 17:20:50 +00:00			`};`

			`Box::pin(s)`
feat: llama.cpp for metal support [TAB-146] (#391) * feat: init commit adding llama-cpp-bindings * add llama.cpp submodule * add LlamaEngine to hold llama context / llama model * add cxxbridge * add basic greedy sampling * move files * make compile success * connect TextGeneration with LlamaEngine * experimental support llama.cpp * add metal device * add Accelerate * fix namespace for llama-cpp-bindings * fix lint * move stepping logic to rust * add stop words package * use stop-words in ctranslate2-bindings * use raw string for regex * use Arc<Tokenizer> for sharing tokenizers * refactor: remove useless stop_words_encoding_offset * switch to tokenizers 0.13.4-rc.3 * fix lints in cpp * simplify implementation of greedy decoding * feat: split metal feature for llama backend * add ci * update ci * build tabby bin in ci build 2023-09-03 01:59:07 +00:00			`}`
			`}`
fix: correct Decoding behavior in incremental manner (#491) * feat: implement IncrementalDecoding * refactor: use IncrementalDecoding for ctranslate2 * refactor: rename StopWords to DecodingFactory * refactor: move decoding logic to tabby-inference * feat: optimize decoding range * cleanup 2023-09-29 13:06:47 +00:00
			`fn truncate_tokens(tokens: &[u32], max_length: usize) -> &[u32] {`
			`if max_length < tokens.len() {`
			`let start = tokens.len() - max_length;`
			`&tokens[start..]`
			`} else {`
			`tokens`
			`}`
			`}`