From 144dceae4198aa8dd55b3046493e893391817edb Mon Sep 17 00:00:00 2001 From: Meng Zhang Date: Fri, 20 Oct 2023 00:32:04 -0700 Subject: [PATCH] fix: cap parallelism to 4 for cuda to avoid oom --- crates/ctranslate2-bindings/src/ctranslate2.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/crates/ctranslate2-bindings/src/ctranslate2.cc b/crates/ctranslate2-bindings/src/ctranslate2.cc index 68c7829..b9675a8 100644 --- a/crates/ctranslate2-bindings/src/ctranslate2.cc +++ b/crates/ctranslate2-bindings/src/ctranslate2.cc @@ -116,8 +116,8 @@ std::shared_ptr create_engine( const size_t num_cpus = std::thread::hardware_concurrency(); if (loader.device == ctranslate2::Device::CUDA) { - // When device is cuda, set parallelism to be number of thread. - loader.num_replicas_per_device = num_cpus; + // When device is cuda, set parallelism to be number of thread, capped to 4 to avoid VRAM oom. + loader.num_replicas_per_device = std::min(num_cpus, 4); } else if (loader.device == ctranslate2::Device::CPU){ // When device is cpu, adjust the number based on threads per replica. // https://github.com/OpenNMT/CTranslate2/blob/master/src/utils.cc#L77