feat: switch cpu backend to llama.cpp (#638)

* feat: switch cpu backend to llama.cpp
* feat: switch cpu serving to ggml
* fix cargo.toml
* use optional dependency
* fix compilation
* update ci target
parent 21ec60eddf
commit 1a4c2aa71f
```diff
@@ -111,7 +111,7 @@ jobs:
       - run: bash ./ci/prepare_build_environment.sh

       - name: Bulid release binary
-        run: cargo build --no-default-features --release --target ${{ matrix.target }}
+        run: cargo build --no-default-features --release --target ${{ matrix.target }} --package tabby

       - name: Rename release binary
         run: mv target/${{ matrix.target }}/release/tabby tabby_${{ matrix.target }}
```
```diff
@@ -3,6 +3,8 @@
 ## Features

 ## Fixes and Improvements
+* Switch cpu backend to llama.cpp: https://github.com/TabbyML/tabby/pull/638
+* add `server.completion_timeout` to control the code completion interface timeout: https://github.com/TabbyML/tabby/pull/637

 # v0.4.0
```
```diff
@@ -16,5 +16,5 @@ class TextInferenceEngine {
   virtual uint32_t eos_token() const = 0;
 };

-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path);
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path);
 }  // namespace
```
```diff
@@ -114,11 +114,11 @@ struct BackendInitializer {
 };
 }  // namespace

-std::unique_ptr<TextInferenceEngine> create_engine(rust::Str model_path) {
+std::unique_ptr<TextInferenceEngine> create_engine(bool use_gpu, rust::Str model_path) {
   static BackendInitializer initializer;

   llama_model_params model_params = llama_model_default_params();
-  model_params.n_gpu_layers = 1;
+  model_params.n_gpu_layers = use_gpu ? 1 : 0;
   llama_model* model = llama_load_model_from_file(std::string(model_path).c_str(), model_params);

   if (!model) {
```
```diff
@@ -15,7 +15,7 @@ mod ffi {

         type TextInferenceEngine;

-        fn create_engine(model_path: &str) -> UniquePtr<TextInferenceEngine>;
+        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;

         fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
         fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
```
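For orientation, these signatures sit inside a `cxx` bridge module. A minimal sketch of the surrounding shape, assuming the header include path and module layout (only the lines shown in the hunk are confirmed by this commit):

```rust
// Sketch only: the include path and module layout are assumptions;
// the commit itself confirms just the new `use_gpu` parameter.
#[cxx::bridge]
mod ffi {
    unsafe extern "C++" {
        include!("engine.h"); // assumed header name

        type TextInferenceEngine;

        // `use_gpu` is threaded through to llama.cpp on the C++ side,
        // where it decides whether any layers are offloaded (n_gpu_layers).
        fn create_engine(use_gpu: bool, model_path: &str) -> UniquePtr<TextInferenceEngine>;

        fn start(self: Pin<&mut TextInferenceEngine>, input_token_ids: &[u32]);
        fn step(self: Pin<&mut TextInferenceEngine>) -> Result<u32>;
    }
}
```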
```diff
@@ -32,6 +32,7 @@ unsafe impl Sync for ffi::TextInferenceEngine {}
 pub struct LlamaEngineOptions {
     model_path: String,
     tokenizer_path: String,
+    use_gpu: bool,
 }

 pub struct LlamaEngine {
```
```diff
@@ -42,7 +43,7 @@ pub struct LlamaEngine {

 impl LlamaEngine {
     pub fn create(options: LlamaEngineOptions) -> Self {
-        let engine = create_engine(&options.model_path);
+        let engine = create_engine(options.use_gpu, &options.model_path);
         if engine.is_null() {
             panic!("Unable to load model: {}", options.model_path);
         }
```
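Taken together with the options struct above, a call site would now look roughly like this (file paths are placeholders; `LlamaEngineOptionsBuilder` is the derived builder invoked later in this same diff):

```rust
use llama_cpp_bindings::{LlamaEngine, LlamaEngineOptionsBuilder};

fn main() {
    // Hypothetical file paths, for illustration only.
    let options = LlamaEngineOptionsBuilder::default()
        .model_path("/models/ggml/q8_0.v1.bin".to_string())
        .tokenizer_path("/models/tokenizer.json".to_string())
        .use_gpu(false) // CPU-only: no layers offloaded to a GPU
        .build()
        .unwrap();

    // Panics with "Unable to load model: ..." if llama.cpp cannot load the file.
    let _engine = LlamaEngine::create(options);
}
```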
```diff
@@ -4,7 +4,6 @@ version = "0.5.0-dev"
 edition = "2021"

 [dependencies]
-ctranslate2-bindings = { path = "../ctranslate2-bindings" }
 tabby-common = { path = "../tabby-common" }
 tabby-scheduler = { path = "../tabby-scheduler" }
 tabby-download = { path = "../tabby-download" }
```
```diff
@@ -43,9 +42,8 @@ minijinja = { version = "1.0.8", features = ["loader"] }
 textdistance = "1.0.2"
 regex.workspace = true
 thiserror.workspace = true

-[target.'cfg(all(target_os="macos", target_arch="aarch64"))'.dependencies]
 llama-cpp-bindings = { path = "../llama-cpp-bindings" }
+ctranslate2-bindings = { path = "../ctranslate2-bindings", optional = true }

 [dependencies.uuid]
 version = "1.3.3"
```
```diff
@@ -57,6 +55,7 @@ features = [

 [features]
 link_shared = ["ctranslate2-bindings/link_shared"]
+link_cuda_static = ["ctranslate2-bindings"]

 [build-dependencies]
 vergen = { version = "8.0.0", features = ["build", "git", "gitcl"] }
```
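With `ctranslate2-bindings` now optional, either `link_shared` or `link_cuda_static` pulls it back in, and the serving code gates on those features rather than on the target triple. A minimal sketch of the `cfg` pattern (the function is illustrative, not part of the commit; it mirrors the gates used on `create_local_engine` below):

```rust
// Illustrative only: shows how the new cargo features select a backend.
#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
fn default_backend() -> &'static str {
    "ctranslate2" // CUDA-capable build
}

#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
fn default_backend() -> &'static str {
    "llama.cpp" // default build: CPU everywhere, Metal on macOS/aarch64
}
```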
```diff
@@ -1,6 +1,5 @@
 use std::path::Path;

-use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};
 use serde::Deserialize;
 use tabby_common::path::ModelDir;
 use tabby_inference::TextGeneration;
```
```diff
@@ -39,33 +38,36 @@ pub struct EngineInfo {
     pub chat_template: Option<String>,
 }

-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+#[cfg(not(any(feature = "link_shared", feature = "link_cuda_static")))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
-    metadata: &Metadata,
+    _metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    create_ctranslate2_engine(args, model_dir, metadata)
+    create_ggml_engine(&args.device, model_dir)
 }

-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_local_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
-    if args.device != super::Device::Metal {
-        create_ctranslate2_engine(args, model_dir, metadata)
+    if args.device.use_ggml_backend() {
+        create_ggml_engine(&args.device, model_dir)
     } else {
-        create_llama_engine(model_dir)
+        create_ctranslate2_engine(args, model_dir, metadata)
     }
 }

+#[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
 fn create_ctranslate2_engine(
     args: &crate::serve::ServeArgs,
     model_dir: &ModelDir,
     metadata: &Metadata,
 ) -> Box<dyn TextGeneration> {
+    use ctranslate2_bindings::{CTranslate2Engine, CTranslate2EngineOptionsBuilder};

     let device = format!("{}", args.device);
     let options = CTranslate2EngineOptionsBuilder::default()
         .model_path(model_dir.ctranslate2_dir())
```
```diff
@@ -78,11 +80,11 @@ fn create_ctranslate2_engine(
     Box::new(CTranslate2Engine::create(options))
 }

-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn create_llama_engine(model_dir: &ModelDir) -> Box<dyn TextGeneration> {
+fn create_ggml_engine(device: &super::Device, model_dir: &ModelDir) -> Box<dyn TextGeneration> {
     let options = llama_cpp_bindings::LlamaEngineOptionsBuilder::default()
         .model_path(model_dir.ggml_q8_0_file())
         .tokenizer_path(model_dir.tokenizer_file())
+        .use_gpu(device.ggml_use_gpu())
         .build()
         .unwrap();
```
```diff
@@ -99,6 +101,7 @@ fn get_model_dir(model: &str) -> ModelDir {

 #[derive(Deserialize)]
 struct Metadata {
+    #[allow(dead_code)]
     auto_model: String,
     prompt_template: Option<String>,
     chat_template: Option<String>,
```
```diff
@@ -74,7 +74,7 @@ pub enum Device {
     #[strum(serialize = "cpu")]
     Cpu,

-    #[strum(serialize = "cuda")]
+    #[cfg(any(feature = "link_shared", feature = "link_cuda_static"))]
     Cuda,

     #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
```
```diff
@@ -85,6 +85,28 @@ pub enum Device {
     ExperimentalHttp,
 }

+impl Device {
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Metal || *self == Device::Cpu
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn use_ggml_backend(&self) -> bool {
+        *self == Device::Cpu
+    }
+
+    #[cfg(all(target_os = "macos", target_arch = "aarch64"))]
+    fn ggml_use_gpu(&self) -> bool {
+        *self == Device::Metal
+    }
+
+    #[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
+    fn ggml_use_gpu(&self) -> bool {
+        false
+    }
+}
+
 #[derive(Args)]
 pub struct ServeArgs {
     /// Model id for `/completions` API endpoint.
```
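These helpers encode a small decision table: `Cpu` always routes to the ggml (llama.cpp) backend but never to the GPU, while `Metal` (macOS/aarch64 only) does both. A sketch of tests that would pass on any platform, assuming `Device` derives `PartialEq` as the comparisons above already require:

```rust
#[cfg(test)]
mod device_tests {
    use super::Device;

    #[test]
    fn cpu_uses_ggml_but_never_gpu() {
        // After this commit, CPU inference always goes through llama.cpp...
        assert!(Device::Cpu.use_ggml_backend());
        // ...with zero layers offloaded to a GPU.
        assert!(!Device::Cpu.ggml_use_gpu());
    }
}
```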
```diff
@@ -115,16 +137,6 @@ pub struct ServeArgs {
     compute_type: Option<String>,
 }

-#[cfg(not(all(target_os = "macos", target_arch = "aarch64")))]
-fn should_download_ggml_files(_device: &Device) -> bool {
-    false
-}
-
-#[cfg(all(target_os = "macos", target_arch = "aarch64"))]
-fn should_download_ggml_files(device: &Device) -> bool {
-    *device == Device::Metal
-}
-
 pub async fn main(config: &Config, args: &ServeArgs) {
     valid_args(args);
```
```diff
@@ -275,7 +287,7 @@ fn start_heartbeat(args: &ServeArgs) {
 async fn download_model(model: &str, device: &Device) {
     let downloader = Downloader::new(model, /* prefer_local_file= */ true);
     let handler = |err| fatal!("Failed to fetch model '{}' due to '{}'", model, err,);
-    let download_result = if should_download_ggml_files(device) {
+    let download_result = if device.use_ggml_backend() {
         downloader.download_ggml_files().await
     } else {
         downloader.download_ctranslate2_files().await
```
```diff
@@ -17,7 +17,6 @@ We recommend using
 | [TabbyML/StarCoder-7B](https://huggingface.co/TabbyML/StarCoder-7B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-3B](https://huggingface.co/TabbyML/StarCoder-3B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
 | [TabbyML/StarCoder-1B](https://huggingface.co/TabbyML/StarCoder-1B) | [BigCode-OpenRAIL-M](https://huggingface.co/spaces/bigcode/bigcode-model-license-agreement) | ✅ | ✅ |
-| [TabbyML/J-350M](https://huggingface.co/TabbyML/J-350M) | [BSD-3](https://opensource.org/license/bsd-3-clause/) | ❌ | ❌ |

 ## Chat models (`--chat-model`)
```