2023-05-25 21:05:28 +00:00
|
|
|
#include "ctranslate2-bindings/include/ctranslate2.h"
|
|
|
|
|
|
|
|
|
|
#include "ctranslate2/translator.h"
|
2023-05-27 08:26:33 +00:00
|
|
|
#include "ctranslate2/generator.h"
|
2023-05-25 21:05:28 +00:00
|
|
|
|
|
|
|
|
namespace tabby {
|
|
|
|
|
// Out-of-line destructor definition: anchors the vtable for the abstract
// interface in this translation unit.
TextInferenceEngine::~TextInferenceEngine() = default;
|
|
|
|
|
|
2023-06-04 22:28:39 +00:00
|
|
|
template <class Model, class Child>
|
|
|
|
|
class TextInferenceEngineImpl : public TextInferenceEngine {
|
|
|
|
|
protected:
|
|
|
|
|
struct Options {
|
|
|
|
|
size_t max_decoding_length;
|
|
|
|
|
float sampling_temperature;
|
|
|
|
|
};
|
|
|
|
|
|
2023-05-25 21:05:28 +00:00
|
|
|
public:
|
2023-06-06 23:28:58 +00:00
|
|
|
rust::Vec<uint32_t> inference(
|
2023-06-04 22:28:39 +00:00
|
|
|
rust::Box<InferenceContext> context,
|
2023-06-06 23:28:58 +00:00
|
|
|
InferenceCallback callback,
|
2023-05-25 21:05:28 +00:00
|
|
|
rust::Slice<const rust::String> tokens,
|
|
|
|
|
size_t max_decoding_length,
|
2023-06-06 12:46:17 +00:00
|
|
|
float sampling_temperature
|
2023-05-25 21:05:28 +00:00
|
|
|
) const {
|
|
|
|
|
// Inference.
|
|
|
|
|
std::vector<std::string> input_tokens(tokens.begin(), tokens.end());
|
2023-06-06 23:28:58 +00:00
|
|
|
return process(
|
2023-06-06 12:46:17 +00:00
|
|
|
std::move(context),
|
2023-06-06 23:28:58 +00:00
|
|
|
std::move(callback),
|
2023-06-06 12:46:17 +00:00
|
|
|
input_tokens,
|
|
|
|
|
Options{max_decoding_length, sampling_temperature}
|
|
|
|
|
);
|
2023-05-25 21:05:28 +00:00
|
|
|
}
|
2023-05-27 08:26:33 +00:00
|
|
|
|
|
|
|
|
static std::unique_ptr<TextInferenceEngine> create(const ctranslate2::models::ModelLoader& loader) {
|
2023-06-04 22:28:39 +00:00
|
|
|
auto impl = std::make_unique<Child>();
|
|
|
|
|
impl->model_ = std::make_unique<Model>(loader);
|
2023-05-27 08:26:33 +00:00
|
|
|
return impl;
|
|
|
|
|
}
|
2023-05-25 21:05:28 +00:00
|
|
|
|
2023-06-04 22:28:39 +00:00
|
|
|
protected:
|
2023-06-06 23:28:58 +00:00
|
|
|
virtual rust::Vec<uint32_t> process(
|
2023-06-06 12:46:17 +00:00
|
|
|
rust::Box<InferenceContext> context,
|
2023-06-06 23:28:58 +00:00
|
|
|
InferenceCallback callback,
|
2023-06-06 12:46:17 +00:00
|
|
|
const std::vector<std::string>& tokens,
|
|
|
|
|
const Options& options) const = 0;
|
2023-06-04 22:28:39 +00:00
|
|
|
std::unique_ptr<Model> model_;
|
|
|
|
|
};
|
2023-05-27 08:26:33 +00:00
|
|
|
|
2023-06-04 22:28:39 +00:00
|
|
|
// Engine for encoder-decoder (seq2seq) models backed by ctranslate2::Translator.
class EncoderDecoderImpl : public TextInferenceEngineImpl<ctranslate2::Translator, EncoderDecoderImpl> {
 protected:
  // Runs greedy translation (beam_size = 1) over `tokens`, streaming each
  // generated token to `callback` and collecting the accepted ids.
  virtual rust::Vec<uint32_t> process(
    rust::Box<InferenceContext> context,
    InferenceCallback callback,
    const std::vector<std::string>& tokens,
    const Options& options) const override {
    ctranslate2::TranslationOptions x;
    x.max_decoding_length = options.max_decoding_length;
    x.sampling_temperature = options.sampling_temperature;
    x.beam_size = 1;

    rust::Vec<uint32_t> output_ids;
    x.callback = [&](ctranslate2::GenerationStepResult result) {
      // `callback` returning true requests cancellation of the generation.
      bool stop = callback(*context, result.step, result.token_id, result.token);
      // Keep the token unless the caller stopped mid-stream; the final token
      // is kept even on stop so the output ends on a complete step.
      if (!stop || result.is_last) {
        output_ids.push_back(result.token_id);
      }
      return stop;
    };
    // translate_batch blocks until decoding finishes; all output we need is
    // gathered through the step callback, so the returned TranslationResult
    // is intentionally discarded (previously stored in an unused local).
    model_->translate_batch({ tokens }, x);
    return output_ids;
  }
};
|
2023-05-27 08:26:33 +00:00
|
|
|
|
2023-06-04 22:28:39 +00:00
|
|
|
// Engine for decoder-only (causal LM) models backed by ctranslate2::Generator.
class DecoderImpl : public TextInferenceEngineImpl<ctranslate2::Generator, DecoderImpl> {
 protected:
  // Runs greedy generation (beam_size = 1) continuing from the prompt
  // `tokens`, streaming each new token to `callback` and collecting the
  // accepted ids. The prompt itself is excluded from the result.
  virtual rust::Vec<uint32_t> process(
    rust::Box<InferenceContext> context,
    InferenceCallback callback,
    const std::vector<std::string>& tokens,
    const Options& options) const override {
    ctranslate2::GenerationOptions x;
    x.include_prompt_in_result = false;
    x.max_length = options.max_decoding_length;
    x.sampling_temperature = options.sampling_temperature;
    x.beam_size = 1;

    rust::Vec<uint32_t> output_ids;
    x.callback = [&](ctranslate2::GenerationStepResult result) {
      // `callback` returning true requests cancellation of the generation.
      bool stop = callback(*context, result.step, result.token_id, result.token);
      // Keep the token unless the caller stopped mid-stream; the final token
      // is kept even on stop so the output ends on a complete step.
      if (!stop || result.is_last) {
        output_ids.push_back(result.token_id);
      }
      return stop;
    };
    // generate_batch_async returns futures: .get() must be called to block
    // until generation completes, but the GenerationResult itself is unused
    // (output is gathered via the callback), so it is discarded rather than
    // stored in a dead local as before.
    model_->generate_batch_async({ tokens }, x)[0].get();
    return output_ids;
  }
};
|
|
|
|
|
|
2023-06-04 06:23:31 +00:00
|
|
|
std::shared_ptr<TextInferenceEngine> create_engine(
|
2023-05-26 06:23:07 +00:00
|
|
|
rust::Str model_path,
|
2023-05-27 08:26:33 +00:00
|
|
|
rust::Str model_type,
|
2023-05-26 06:23:07 +00:00
|
|
|
rust::Str device,
|
2023-06-13 19:04:07 +00:00
|
|
|
rust::Str compute_type,
|
2023-05-26 06:23:07 +00:00
|
|
|
rust::Slice<const int32_t> device_indices,
|
|
|
|
|
size_t num_replicas_per_device
|
|
|
|
|
) {
|
2023-05-27 08:26:33 +00:00
|
|
|
std::string model_type_str(model_type);
|
|
|
|
|
std::string model_path_str(model_path);
|
|
|
|
|
ctranslate2::models::ModelLoader loader(model_path_str);
|
|
|
|
|
loader.device = ctranslate2::str_to_device(std::string(device));
|
|
|
|
|
loader.device_indices = std::vector<int>(device_indices.begin(), device_indices.end());
|
2023-05-26 06:23:07 +00:00
|
|
|
loader.num_replicas_per_device = num_replicas_per_device;
|
|
|
|
|
|
2023-06-13 19:04:07 +00:00
|
|
|
std::string compute_type_str(compute_type);
|
|
|
|
|
if (compute_type_str == "auto") {
|
|
|
|
|
if (loader.device == ctranslate2::Device::CPU) {
|
|
|
|
|
loader.compute_type = ctranslate2::ComputeType::INT8;
|
|
|
|
|
} else if (loader.device == ctranslate2::Device::CUDA) {
|
|
|
|
|
loader.compute_type = ctranslate2::ComputeType::INT8_FLOAT16;
|
|
|
|
|
}
|
|
|
|
|
} else {
|
|
|
|
|
loader.compute_type = ctranslate2::str_to_compute_type(compute_type_str);
|
2023-05-28 21:36:11 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (model_type_str == "AutoModelForCausalLM") {
|
2023-05-27 08:26:33 +00:00
|
|
|
return DecoderImpl::create(loader);
|
2023-05-28 21:36:11 +00:00
|
|
|
} else if (model_type_str == "AutoModelForSeq2SeqLM") {
|
2023-05-27 08:26:33 +00:00
|
|
|
return EncoderDecoderImpl::create(loader);
|
|
|
|
|
} else {
|
|
|
|
|
return nullptr;
|
|
|
|
|
}
|
2023-05-25 21:05:28 +00:00
|
|
|
}
|
|
|
|
|
} // namespace tabby
|