feat: add gpu info to health state [TAB-162] (#364)
* feat: add gpu info to health response * chore: error handling * chore: refactor cpu manager code * chore: typo * chore: fix context mutability * chore: fix context mutability * feat: add link to NVML lib * chore: refactor * lint * chore: resolve comments * chore: fix typo * chore: fix * chore: resolve comments * chore: fix * chore: resolve comments
release-0.0
parent
bdda8a534f
commit
2a91a21787
|
|
@ -431,7 +431,7 @@ dependencies = [
|
|||
"anstyle",
|
||||
"bitflags",
|
||||
"clap_lex",
|
||||
"strsim",
|
||||
"strsim 0.10.0",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -671,14 +671,38 @@ dependencies = [
|
|||
"syn 2.0.28",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "0d706e75d87e35569db781a9b5e2416cff1236a47ed380831f959382ccd5f858"
|
||||
dependencies = [
|
||||
"darling_core 0.10.2",
|
||||
"darling_macro 0.10.2",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling"
|
||||
version = "0.14.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_macro",
|
||||
"darling_core 0.14.4",
|
||||
"darling_macro 0.14.4",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_core"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "f0c960ae2da4de88a91b2d920c2a7233b400bc33cb28453a2987822d8392519b"
|
||||
dependencies = [
|
||||
"fnv",
|
||||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim 0.9.3",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
|
|
@ -691,7 +715,18 @@ dependencies = [
|
|||
"ident_case",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"strsim",
|
||||
"strsim 0.10.0",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "darling_macro"
|
||||
version = "0.10.2"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "d9b5a2f4ac4969822c62224815d069952656cadc7084fdca9751e6d959189b72"
|
||||
dependencies = [
|
||||
"darling_core 0.10.2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
|
|
@ -701,7 +736,7 @@ version = "0.14.4"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
|
||||
dependencies = [
|
||||
"darling_core",
|
||||
"darling_core 0.14.4",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
|
@ -743,7 +778,7 @@ version = "0.12.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c11bdc11a0c47bc7d37d582b5285da6849c96681023680b906673c5707af7b0f"
|
||||
dependencies = [
|
||||
"darling",
|
||||
"darling 0.14.4",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
|
|
@ -1487,6 +1522,16 @@ version = "0.2.144"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "2b00cc1c228a6782d0f076e7b232802e0c5689d41bb5df366f2a6b6621cfdfe1"
|
||||
|
||||
[[package]]
|
||||
name = "libloading"
|
||||
version = "0.7.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "b67380fd3b2fbe7527a606e18729d21c6f3951633d0500574c4dc22d2d638b9f"
|
||||
dependencies = [
|
||||
"cfg-if",
|
||||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "link-cplusplus"
|
||||
version = "1.0.8"
|
||||
|
|
@ -1808,6 +1853,29 @@ version = "0.4.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
||||
|
||||
[[package]]
|
||||
name = "nvml-wrapper"
|
||||
version = "0.9.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7cd21b9f5a1cce3c3515c9ffa85f5c7443e07162dae0ccf4339bb7ca38ad3454"
|
||||
dependencies = [
|
||||
"bitflags",
|
||||
"libloading",
|
||||
"nvml-wrapper-sys",
|
||||
"static_assertions",
|
||||
"thiserror",
|
||||
"wrapcenum-derive",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "nvml-wrapper-sys"
|
||||
version = "0.7.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "c961a2ea9e91c59a69b78e69090f6f5b867bb46c0c56de9482da232437c4987e"
|
||||
dependencies = [
|
||||
"libloading",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "object"
|
||||
version = "0.30.3"
|
||||
|
|
@ -2733,12 +2801,24 @@ version = "1.2.0"
|
|||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a8f112729512f8e442d81f95a8a7ddf2b7c6b8a1a6f509a95864142b30cab2d3"
|
||||
|
||||
[[package]]
|
||||
name = "static_assertions"
|
||||
version = "1.1.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "a2eb9349b6444b326872e140eb1cf5e7c522154d69e7a0ffb0fb81c06b37543f"
|
||||
|
||||
[[package]]
|
||||
name = "strfmt"
|
||||
version = "0.2.4"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "7a8348af2d9fc3258c8733b8d9d8db2e56f54b2363a4b5b81585c7875ed65e65"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.9.3"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6446ced80d6c486436db5c078dde11a9f73d42b57fb273121e160b84f63d894c"
|
||||
|
||||
[[package]]
|
||||
name = "strsim"
|
||||
version = "0.10.0"
|
||||
|
|
@ -2828,6 +2908,7 @@ dependencies = [
|
|||
"hyper",
|
||||
"lazy_static",
|
||||
"mime_guess",
|
||||
"nvml-wrapper",
|
||||
"opentelemetry",
|
||||
"opentelemetry-otlp",
|
||||
"rust-embed",
|
||||
|
|
@ -4143,6 +4224,18 @@ dependencies = [
|
|||
"winapi",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "wrapcenum-derive"
|
||||
version = "0.4.0"
|
||||
source = "registry+https://github.com/rust-lang/crates.io-index"
|
||||
checksum = "6bcc065c85ad2c3bd12aa4118bf164835712e25080c392557801a13292c60aec"
|
||||
dependencies = [
|
||||
"darling 0.10.2",
|
||||
"proc-macro2",
|
||||
"quote",
|
||||
"syn 1.0.109",
|
||||
]
|
||||
|
||||
[[package]]
|
||||
name = "xattr"
|
||||
version = "0.2.3"
|
||||
|
|
|
|||
|
|
@ -41,6 +41,11 @@ RUN apt-get update && \
|
|||
apt-get clean && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create a symlink to the libnvidia-ml.so (NVML) library
|
||||
# so that we can read GPU stats.
|
||||
RUN ln -s /usr/lib/x86_64-linux-gnu/libnvidia-ml.so.1 \
|
||||
/usr/lib/x86_64-linux-gnu/libnvidia-ml.so
|
||||
|
||||
COPY --from=builder /opt/tabby /opt/tabby
|
||||
|
||||
ENV TABBY_ROOT=/data
|
||||
|
|
|
|||
|
|
@ -34,6 +34,7 @@ tracing-opentelemetry = "0.18.0"
|
|||
tantivy = { workspace = true }
|
||||
anyhow = { workspace = true }
|
||||
sysinfo = "0.29.8"
|
||||
nvml-wrapper = "0.9.0"
|
||||
|
||||
|
||||
[dependencies.uuid]
|
||||
|
|
|
|||
|
|
@ -1,6 +1,8 @@
|
|||
use std::{env::consts::ARCH, sync::Arc};
|
||||
|
||||
use anyhow::Result;
|
||||
use axum::{extract::State, Json};
|
||||
use nvml_wrapper::Nvml;
|
||||
use serde::{Deserialize, Serialize};
|
||||
use sysinfo::{CpuExt, System, SystemExt};
|
||||
use utoipa::ToSchema;
|
||||
|
|
@ -13,19 +15,17 @@ pub struct HealthState {
|
|||
arch: String,
|
||||
cpu_info: String,
|
||||
cpu_count: usize,
|
||||
cuda_devices: Vec<String>,
|
||||
version: Version,
|
||||
}
|
||||
|
||||
impl HealthState {
|
||||
pub fn new(args: &super::ServeArgs) -> Self {
|
||||
let mut sys = System::new_all();
|
||||
sys.refresh_cpu();
|
||||
let cpus = sys.cpus();
|
||||
let cpu_info = if !cpus.is_empty() {
|
||||
let cpu = &cpus[0];
|
||||
cpu.brand().to_string()
|
||||
} else {
|
||||
"unknown".to_string()
|
||||
let (cpu_info, cpu_count) = read_cpu_info();
|
||||
|
||||
let cuda_devices = match read_cuda_devices() {
|
||||
Ok(s) => s,
|
||||
Err(_) => vec![],
|
||||
};
|
||||
|
||||
Self {
|
||||
|
|
@ -34,12 +34,43 @@ impl HealthState {
|
|||
compute_type: args.compute_type.to_string(),
|
||||
arch: ARCH.to_string(),
|
||||
cpu_info,
|
||||
cpu_count: cpus.len(),
|
||||
cpu_count,
|
||||
cuda_devices,
|
||||
version: Version::new(),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
fn read_cpu_info() -> (String, usize) {
|
||||
let mut system = System::new_all();
|
||||
system.refresh_cpu();
|
||||
let cpus = system.cpus();
|
||||
let count = cpus.len();
|
||||
let info = if count > 0 {
|
||||
let cpu = &cpus[0];
|
||||
cpu.brand().to_string()
|
||||
} else {
|
||||
"unknown".to_string()
|
||||
};
|
||||
|
||||
(info, count)
|
||||
}
|
||||
|
||||
fn read_cuda_devices() -> Result<Vec<String>> {
|
||||
// In cases of MacOS or docker containers where --gpus are not specified,
|
||||
// the Nvml::init() would return an error. In these scenarios, we
|
||||
// assign cuda_devices to be empty, indicating that the current runtime
|
||||
// environment does not support cuda interface.
|
||||
let nvml = Nvml::init()?;
|
||||
let mut cuda_devices = vec![];
|
||||
let device_count = nvml.device_count()?;
|
||||
for i in 0..device_count {
|
||||
let name = nvml.device_by_index(i)?.name()?;
|
||||
cuda_devices.push(name);
|
||||
}
|
||||
Ok(cuda_devices)
|
||||
}
|
||||
|
||||
#[derive(Serialize, Deserialize, ToSchema, Clone, Debug)]
|
||||
pub struct Version {
|
||||
build_date: String,
|
||||
|
|
|
|||
Loading…
Reference in New Issue