feat: switch cpu backend to llama.cpp (#638)

* feat: switch cpu backend to llama.cpp
* feat: switch cpu serving to ggml
* fix Cargo.toml
* use optional dependency
* fix compilation
* update ci target
parent 21ec60eddf
commit 1a4c2aa71f
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh

       - name: Bulid release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }}
+        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby

       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
@@ -3,6 +3,8 @@
 ## Features

 ## Fixes and Improvements

+* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
+* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637

 # v0.4.0
@@ -16,5 +16,5 @@ class TextInferenceEngine {
   virtual uint32_t eos_token() const = 0;
 };

-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path);
 }  // namespace
@@ -114,11 +114,11 @@ struct BackendInitializer {
 };
 }  // namespace

-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path) {
   static BackendInitializer initializer;

   llama_model_params model_params = llama_model_default_params();
-  model_params.n_gpu_layers = 1;
+  model_params.n_gpu_layers = use_gpu ? 1 : 0;
   llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);

   if (!model) {
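In llama.cpp, `n_gpu_layers` sets how many transformer layers are offloaded to the GPU backend: `0` keeps inference entirely on the CPU, while a non-zero value enables GPU execution (on Metal at this point in llama.cpp's history, offloading a single layer was enough to route computation through the GPU, which is why the boolean maps to `1` rather than a real layer count).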
@@ -15,7 +15,7 @@ mod ffi {

         type TextInferenceEngine;

-        fn create_engine(model_path: &str) -> UniquePtr<TextInferenceEngine>;
+        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;

         fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
         fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
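cxx generates the glue for this bridge, so the Rust signature must stay in lockstep with the C++ declaration from the earlier header hunk: `bool` crosses the boundary as C++ `bool`, `&str` as `rust::Str`, and `UniquePtr<TextInferenceEngine>` as `std::unique_ptr<TextInferenceEngine>`.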
@@ -32,6 +32,7 @@ unsafe impl Sync for ffi::TextInferenceEngine {}
 pub struct LlamaEngineOptions {
     model_path: String,
     tokenizer_path: String,
+    use_gpu: bool,
 }

 pub struct LlamaEngine {
@@ -42,7 +43,7 @@ pub struct LlamaEngine {

 impl LlamaEngine {
     pub fn create(options: LlamaEngineOptions) -> Self {
-        let engine = create_engine(&options.model_path);
+        let engine = create_engine(options.use_gpu, &options.model_path);
         if engine.is_null() {
             panic!("Unable to load model: {}", options.model_path);
         }
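Putting the pieces together, a caller builds the options and hands them to `LlamaEngine::create`. A minimal sketch, assuming `LlamaEngineOptionsBuilder` (used in a later hunk) is the derive_builder-style companion of `LlamaEngineOptions`; the paths below are placeholders:

```rust
use llama_cpp_bindings::{LlamaEngine, LlamaEngineOptionsBuilder};

fn main() {
    let options = LlamaEngineOptionsBuilder::default()
        .model_path("path/to/ggml-model".into()) // placeholder path
        .tokenizer_path("path/to/tokenizer.json".into()) // placeholder path
        .use_gpu(false) // false => n_gpu_layers = 0, pure CPU inference
        .build()
        .unwrap();

    // Panics with "Unable to load model: ..." when llama.cpp cannot load the file.
    let _engine = LlamaEngine::create(options);
}
```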
@@ -4,7 +4,6 @@ version = "0.5.0-dev"
 edition = "2021"

 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }
@@ -43,9 +42,8 @@ minijinja = { version = "1.0.8", features = ["loader"] }
 textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true
-
-[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies]
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }

 [dependencies.uuid]
 version = "1.3.3"
@@ -57,6 +55,7 @@ features = [

 [features]
 link_shared = ["ctranslate2-bindings/link_shared"]
+link_cuda_static = ["ctranslate2-bindings"]

 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
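With `ctranslate2-bindings` now optional, a default build (the CI step above: `cargo build --no-default-features --release --package tabby`) links only the llama.cpp backend; enabling `link_shared` or `link_cuda_static` pulls the CUDA-capable ctranslate2 backend back in, and the `cfg(feature = ...)` gates in the following hunks key off exactly these two features.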
@@ -1,6 +1,5 @@
 use std::path::Path;

-use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
 use serde::Deserialize;
 use tabby_common::path::ModelDir;
 use tabby_inference::TextGeneration;
@@ -39,33 +38,36 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }

-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
-    metadata: &Metadata,
+    _metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    create_ctranslate2_engine(args, model_dir, metadata)
+    create_ggml_engine(&args.device, model_dir)
 }

-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal {
-        create_ctranslate2_engine(args, model_dir, metadata)
+    if args.device.use_ggml_backend() {
+        create_ggml_engine(&args.device, model_dir)
     } else {
-        create_llama_engine(model_dir)
+        create_ctranslate2_engine(args, model_dir, metadata)
     }
 }

+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_ctranslate2_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
+    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
+
     let device = format!("{}", args.device);
     let options = CTranslate2EngineOptionsBuilder::default()
         .model_path(model_dir.ctranslate2_dir())
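The dispatch now keys off Cargo features instead of the compile target: `create_local_engine` is defined twice and exactly one definition survives `cfg` evaluation. A minimal sketch of that pattern, using a hypothetical `gpu` feature rather than the PR's real ones:

```rust
// Hypothetical feature name, for illustration only.
#[cfg(feature = "gpu")]
fn backend_name() -> &'static str {
    "ctranslate2" // compiled only when the feature is enabled
}

#[cfg(not(feature = "gpu"))]
fn backend_name() -> &'static str {
    "ggml" // compiled otherwise; the two definitions never coexist
}

fn main() {
    println!("selected backend: {}", backend_name());
}
```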
@@ -78,11 +80,11 @@ fn create_ctranslate2_engine(
     Box::new(CTranslate2Engine::create(options))
 }

-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn create_llama_engine(model_dir: &ModelDir) -> Box<dyn TextGeneration> {
+fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_file())
         .tokenizer_path(model_dir.tokenizer_file())
+        .use_gpu(device.ggml_use_gpu())
         .build()
         .unwrap();
@@ -99,6 +101,7 @@ fn get_model_dir(model: &str) -> ModelDir {

 #[derive(Deserialize)]
 struct Metadata {
+    #[allow(dead_code)]
     auto_model: String,
     prompt_template: Option<String>,
     chat_template: Option<String>,
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,

     #[strum(serialize = "cuda")]
+    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
     Cuda,

     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
@@ -85,6 +85,28 @@ pub enum Device {
     ExperimentalHttp,
 }

+impl Device {
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Metal || *self == Device::Cpu
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Cpu
+    }
+
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn ggml_use_gpu(&self) -> bool {
+        *self == Device::Metal
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn ggml_use_gpu(&self) -> bool {
+        false
+    }
+}
+
 #[derive(Args)]
 pub struct ServeArgs {
     /// Model id for `/completions` API endpoint.
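These helpers fold the compile target and the requested device into one backend decision. A standalone sketch of the resulting truth table, where the `is_apple_silicon` parameter stands in for the `cfg(all(target_os = "macos", target_arch = "aarch64"))` switch in the real code:

```rust
#[derive(PartialEq)]
#[allow(dead_code)]
enum Device {
    Cpu,
    Cuda,
    Metal,
}

// Which backend serves the device: ggml (llama.cpp) or ctranslate2.
fn use_ggml_backend(device: &Device, is_apple_silicon: bool) -> bool {
    match device {
        Device::Cpu => true,               // cpu now always goes through llama.cpp
        Device::Metal => is_apple_silicon, // metal only exists on aarch64 macOS
        Device::Cuda => false,             // cuda stays on ctranslate2
    }
}

// Whether the ggml engine offloads to the GPU (i.e. n_gpu_layers = 1).
fn ggml_use_gpu(device: &Device, is_apple_silicon: bool) -> bool {
    is_apple_silicon && *device == Device::Metal
}

fn main() {
    assert!(use_ggml_backend(&Device::Cpu, false));
    assert!(!ggml_use_gpu(&Device::Cpu, true));
    assert!(ggml_use_gpu(&Device::Metal, true));
}
```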
@@ -115,16 +137,6 @@ pub struct ServeArgs {
     compute_type: Option<String>,
 }

-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-fn should_download_ggml_files(_device: &Device) -> bool {
-    false
-}
-
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal
-}
-
 pub async fn main(config: &Config, args: &ServeArgs) {
     valid_args(args);
@@ -275,7 +287,7 @@ fn start_heartbeat(args: &ServeArgs) {
 async fn download_model(model: &str, device: &Device) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if should_download_ggml_files(device) {
+    let download_result = if device.use_ggml_backend() {
         downloader.download_ggml_files().await
     } else {
         downloader.download_ctranslate2_files().await
@@ -17,7 +17,6 @@ We recommend using
 | [TabbyML/StarCoder-7B](https://huggingface.co/TabbyML/StarCoder-7B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-3B](https://huggingface.co/TabbyML/StarCoder-3B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-1B](https://huggingface.co/TabbyML/StarCoder-1B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
-| [TabbyML/J-350M](https://huggingface.co/TabbyML/J-350M) | [BSD-3](https://opensource.org/license/bsd-3-clause/) | ❌ | ❌ |

 ## Chat models (`--chat-model`)