feat: add gpu info to health state [TAB-162] (#364)

* feat: add gpu info to health response

* chore: error handling

* chore: refactor cpu manager code

* chore: typo

* chore: fix context mutability

* chore: fix context mutability

* feat: add link to NVML lib

* chore: refactor

* lint

* chore: resolve comments

* chore: fix typo

* chore: fix

* chore: resolve comments

* chore: fix

* chore: resolve comments
release-0.0
vodkaslime 2023-08-21 18:06:38 +08:00 committed by GitHub
parent bdda8a534f
commit 2a91a21787
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
4 changed files with 145 additions and 15 deletions

105
Cargo.lock generated
View File

@ -431,7 +431,7 @@ dependencies = [
"anstyle",
"bitflags",
"clap_lex",
"strsim",
"strsim 0.10.0",
]
[[package]]
@ -671,14 +671,38 @@ dependencies = [
"syn 2.0.28",
]
[[package]]
name = "darling"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858"
dependencies = [
"darling_core 0.10.2",
"darling_macro 0.10.2",
]
[[package]]
name = "darling"
version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
dependencies = [
"darling_core",
"darling_macro",
"darling_core 0.14.4",
"darling_macro 0.14.4",
]
[[package]]
name = "darling_core"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
dependencies = [
"fnv",
"ident_case",
"proc-macro2",
"quote",
"strsim 0.9.3",
"syn 1.0.109",
]
[[package]]
@ -691,7 +715,18 @@ dependencies = [
"ident_case",
"proc-macro2",
"quote",
"strsim",
"strsim 0.10.0",
"syn 1.0.109",
]
[[package]]
name = "darling_macro"
version = "0.10.2"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
dependencies = [
"darling_core 0.10.2",
"quote",
"syn 1.0.109",
]
@ -701,7 +736,7 @@ version = "0.14.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
dependencies = [
"darling_core",
"darling_core 0.14.4",
"quote",
"syn 1.0.109",
]
@ -743,7 +778,7 @@ version = "0.12.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f"
dependencies = [
"darling",
"darling 0.14.4",
"proc-macro2",
"quote",
"syn 1.0.109",
@ -1487,6 +1522,16 @@ version = "0.2.144"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
[[package]]
name = "libloading"
version = "0.7.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f"
dependencies = [
"cfg-if",
"winapi",
]
[[package]]
name = "link-cplusplus"
version = "1.0.8"
@ -1808,6 +1853,29 @@ version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
[[package]]
name = "nvml-wrapper"
version = "0.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7cd21b9f5a1cce3c3515c9ffa85f5c7443e07162dae0ccf4339bb7ca38ad3454"
dependencies = [
"bitflags",
"libloading",
"nvml-wrapper-sys",
"static_assertions",
"thiserror",
"wrapcenum-derive",
]
[[package]]
name = "nvml-wrapper-sys"
version = "0.7.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "c961a2ea9e91c59a69b78e69090f6f5b867bb46c0c56de9482da232437c4987e"
dependencies = [
"libloading",
]
[[package]]
name = "object"
version = "0.30.3"
@ -2733,12 +2801,24 @@ version = "1.2.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
[[package]]
name = "static_assertions"
version = "1.1.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
[[package]]
name = "strfmt"
version = "0.2.4"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "7a8348af2d9fc3258c8733b8d9d8db2e56f54b2363a4b5b81585c7875ed65e65"
[[package]]
name = "strsim"
version = "0.9.3"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c"
[[package]]
name = "strsim"
version = "0.10.0"
@ -2828,6 +2908,7 @@ dependencies = [
"hyper",
"lazy_static",
"mime_guess",
"nvml-wrapper",
"opentelemetry",
"opentelemetry-otlp",
"rust-embed",
@ -4143,6 +4224,18 @@ dependencies = [
"winapi",
]
[[package]]
name = "wrapcenum-derive"
version = "0.4.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "6bcc065c85ad2c3bd12aa4118bf164835712e25080c392557801a13292c60aec"
dependencies = [
"darling 0.10.2",
"proc-macro2",
"quote",
"syn 1.0.109",
]
[[package]]
name = "xattr"
version = "0.2.3"

View File

@ -41,6 +41,11 @@ RUN apt-get update && \
apt-get clean && \
rm -rf /var/lib/apt/lists/*
# Create a symlink to the libnvidia-ml.so (NVML) library
# so that we can read GPU stats.
RUN ln -s /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \
/usr/lib/x86_64-linux-gnu/libnvidia-ml.so
COPY --from=builder /opt/tabby /opt/tabby
ENV TABBY_ROOT=/data

View File

@ -34,6 +34,7 @@ tracing-opentelemetry = "0.18.0"
tantivy = { workspace = true }
anyhow = { workspace = true }
sysinfo = "0.29.8"
nvml-wrapper = "0.9.0"
[dependencies.uuid]

View File

@ -1,6 +1,8 @@
use std::{env::consts::ARCH, sync::Arc};
use anyhow::Result;
use axum::{extract::State, Json};
use nvml_wrapper::Nvml;
use serde::{Deserialize, Serialize};
use sysinfo::{CpuExt, System, SystemExt};
use utoipa::ToSchema;
@ -13,19 +15,17 @@ pub struct HealthState {
arch: String,
cpu_info: String,
cpu_count: usize,
cuda_devices: Vec<String>,
version: Version,
}
impl HealthState {
pub fn new(args: &super::ServeArgs) -> Self {
let mut sys = System::new_all();
sys.refresh_cpu();
let cpus = sys.cpus();
let cpu_info = if !cpus.is_empty() {
let cpu = &cpus[0];
cpu.brand().to_string()
} else {
"unknown".to_string()
let (cpu_info, cpu_count) = read_cpu_info();
let cuda_devices = match read_cuda_devices() {
Ok(s) => s,
Err(_) => vec![],
};
Self {
@ -34,12 +34,43 @@ impl HealthState {
compute_type: args.compute_type.to_string(),
arch: ARCH.to_string(),
cpu_info,
cpu_count: cpus.len(),
cpu_count,
cuda_devices,
version: Version::new(),
}
}
}
/// Returns the brand string of the first reported CPU together with the
/// total number of CPUs.
///
/// The brand falls back to `"unknown"` when the system reports no CPUs.
fn read_cpu_info() -> (String, usize) {
    let mut sys = System::new_all();
    sys.refresh_cpu();

    let processors = sys.cpus();
    // Only the first entry's brand is reported.
    let brand = match processors.first() {
        Some(first) => first.brand().to_string(),
        None => "unknown".to_string(),
    };

    (brand, processors.len())
}
/// Collects the name of every device reported by NVML, in index order.
///
/// # Errors
///
/// Returns the first error raised while initializing NVML, querying the
/// device count, looking up a device by index, or reading its name.
fn read_cuda_devices() -> Result<Vec<String>> {
    // Nvml::init() returns an error on MacOS and in docker containers that
    // were started without --gpus. The caller maps that error to an empty
    // list, meaning the current runtime environment does not support the
    // cuda interface.
    let nvml = Nvml::init()?;
    let total = nvml.device_count()?;

    // Collecting into a Result stops at the first failing device.
    (0..total)
        .map(|index| -> Result<String> { Ok(nvml.device_by_index(index)?.name()?) })
        .collect()
}
#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
pub struct Version {
build_date: String,