Skip to content

Commit

Permalink
feat(api/device): add aggregated total NVLink throughput
Browse files Browse the repository at this point in the history
  • Loading branch information
XuehaiPan committed May 8, 2024
1 parent 52b44a7 commit 8fb9653
Show file tree
Hide file tree
Showing 2 changed files with 147 additions and 0 deletions.
16 changes: 16 additions & 0 deletions nvitop-exporter/nvitop_exporter/exporter.py
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,20 @@ def __init__( # pylint: disable=too-many-statements
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_nvlink_total_tx_throughput = Gauge(
name='gpu_nvlink_total_tx_throughput',
documentation='GPU total NVLink transmit throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_nvlink_total_rx_throughput = Gauge(
name='gpu_nvlink_total_rx_throughput',
documentation='GPU total NVLink receive throughput (MiB/s).',
unit='MiBps',
labelnames=['hostname', 'index', 'devicename', 'uuid'],
registry=self.registry,
)
self.gpu_nvlink_mean_tx_throughput = Gauge(
name='gpu_nvlink_mean_tx_throughput',
documentation='GPU mean NVLink transmit throughput (MiB/s).',
Expand Down Expand Up @@ -548,6 +562,8 @@ def update_device(self, device: Device) -> None: # pylint: disable=too-many-loc
(self.gpu_fan_speed, float(device.fan_speed())),
(self.gpu_pcie_tx_throughput, device.pcie_tx_throughput() / 1024.0),
(self.gpu_pcie_rx_throughput, device.pcie_rx_throughput() / 1024.0),
(self.gpu_nvlink_total_tx_throughput, device.nvlink_total_tx_throughput() / 1024.0),
(self.gpu_nvlink_total_rx_throughput, device.nvlink_total_rx_throughput() / 1024.0),
(self.gpu_nvlink_mean_tx_throughput, device.nvlink_mean_tx_throughput() / 1024.0),
(self.gpu_nvlink_mean_rx_throughput, device.nvlink_mean_rx_throughput() / 1024.0),
):
Expand Down
131 changes: 131 additions & 0 deletions nvitop/api/device.py
Original file line number Diff line number Diff line change
Expand Up @@ -1605,6 +1605,39 @@ def query_nvlink_throughput_counters() -> tuple[tuple[int | NaType, int]]:
for tx, rx in zip(throughputs[:nvlink_link_count], throughputs[nvlink_link_count:])
]

def nvlink_total_throughput(
    self,
    interval: float | None = None,
) -> ThroughputInfo:  # in KiB/s
    """Sum the per-link NVLink throughput over all NVLinks, in KiB/s.

    The per-link values come from :meth:`nvlink_throughput`, which derives them from data
    counters sampled between method calls, so the result is the throughput over that interval.
    For the first call, the function is blocking for 20ms to get the first data counters.

    Links whose value is not a valid integer are left out of the sum. A direction (tx or rx)
    that has no valid link at all is reported as :const:`nvitop.NA` rather than ``0``.

    Args:
        interval (Optional[float]):
            The interval in seconds between two calls to get the NVLink throughput. If
            ``interval`` is a positive number, compares throughput counters before and after
            the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
            throughput counters since the last call, returning immediately (non-blocking).

    Returns: ThroughputInfo(tx, rx)
        A named tuple with the total NVLink throughput for all NVLinks in KiB/s, the item could
        be :const:`nvitop.NA` when not applicable.
    """
    valid_tx: list[int] = []
    valid_rx: list[int] = []
    for link_tx, link_rx in self.nvlink_throughput(interval=interval):
        # Check transmit first, then receive, for every link; keep only usable integers.
        for value, bucket in ((link_tx, valid_tx), (link_rx, valid_rx)):
            if libnvml.nvmlCheckReturn(value, int):
                bucket.append(value)

    # An empty bucket means "nothing measurable", which is distinct from a measured zero.
    total_tx = sum(valid_tx) if valid_tx else NA
    total_rx = sum(valid_rx) if valid_rx else NA
    return ThroughputInfo(tx=total_tx, rx=total_rx)

def nvlink_mean_throughput(
self,
interval: float | None = None,
Expand Down Expand Up @@ -1684,6 +1717,29 @@ def nvlink_mean_tx_throughput(
"""
return self.nvlink_mean_throughput(interval=interval).tx

def nvlink_total_tx_throughput(
    self,
    interval: float | None = None,
) -> int | NaType:  # in KiB/s
    """Return the transmit side of the total NVLink throughput for all NVLinks, in KiB/s.

    This is the ``tx`` field of :meth:`nvlink_total_throughput`. The underlying value is
    derived from data counters sampled between method calls and thus is the NVLink throughput
    over that interval. For the first call, the function is blocking for 20ms to get the first
    data counters.

    Args:
        interval (Optional[float]):
            The interval in seconds between two calls to get the NVLink throughput. If
            ``interval`` is a positive number, compares throughput counters before and after
            the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
            throughput counters since the last call, returning immediately (non-blocking).

    Returns: Union[int, NaType]
        The total NVLink transmit data throughput for all NVLinks in KiB/s, or
        :const:`nvitop.NA` when not applicable.
    """
    total = self.nvlink_total_throughput(interval=interval)
    return total.tx

def nvlink_rx_throughput(
self,
interval: float | None = None,
Expand Down Expand Up @@ -1730,6 +1786,29 @@ def nvlink_mean_rx_throughput(
"""
return self.nvlink_mean_throughput(interval=interval).rx

def nvlink_total_rx_throughput(
    self,
    interval: float | None = None,
) -> int | NaType:  # in KiB/s
    """Return the receive side of the total NVLink throughput for all NVLinks, in KiB/s.

    This is the ``rx`` field of :meth:`nvlink_total_throughput`. The underlying value is
    derived from data counters sampled between method calls and thus is the NVLink throughput
    over that interval. For the first call, the function is blocking for 20ms to get the first
    data counters.

    Args:
        interval (Optional[float]):
            The interval in seconds between two calls to get the NVLink throughput. If
            ``interval`` is a positive number, compares throughput counters before and after
            the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
            throughput counters since the last call, returning immediately (non-blocking).

    Returns: Union[int, NaType]
        The total NVLink receive data throughput for all NVLinks in KiB/s, or
        :const:`nvitop.NA` when not applicable.
    """
    total = self.nvlink_total_throughput(interval=interval)
    return total.rx

def nvlink_tx_throughput_human(
self,
interval: float | None = None,
Expand Down Expand Up @@ -1782,6 +1861,32 @@ def nvlink_mean_tx_throughput_human(
return f'{bytes2human(mean_tx * 1024)}/s'
return NA

def nvlink_total_tx_throughput_human(
    self,
    interval: float | None = None,
) -> str | NaType:  # in human readable
    """Format the total NVLink transmit throughput for all NVLinks as a human readable string.

    The value is taken from :meth:`nvlink_total_tx_throughput` (KiB/s), converted to bytes, and
    rendered with ``bytes2human`` followed by a ``/s`` suffix. The underlying value is derived
    from data counters sampled between method calls and thus is the NVLink throughput over that
    interval. For the first call, the function is blocking for 20ms to get the first data
    counters.

    Args:
        interval (Optional[float]):
            The interval in seconds between two calls to get the NVLink throughput. If
            ``interval`` is a positive number, compares throughput counters before and after
            the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
            throughput counters since the last call, returning immediately (non-blocking).

    Returns: Union[str, NaType]
        The total NVLink transmit data throughput for all NVLinks in human readable format, or
        :const:`nvitop.NA` when not applicable.
    """
    kib_per_second = self.nvlink_total_tx_throughput(interval=interval)
    # Anything that is not a usable integer cannot be formatted; report it as N/A.
    if not libnvml.nvmlCheckReturn(kib_per_second, int):
        return NA
    bytes_per_second = kib_per_second * 1024  # KiB/s -> B/s
    return f'{bytes2human(bytes_per_second)}/s'

def nvlink_rx_throughput_human(
self,
interval: float | None = None,
Expand Down Expand Up @@ -1834,6 +1939,32 @@ def nvlink_mean_rx_throughput_human(
return f'{bytes2human(mean_rx * 1024)}/s'
return NA

def nvlink_total_rx_throughput_human(
    self,
    interval: float | None = None,
) -> str | NaType:  # in human readable
    """Format the total NVLink receive throughput for all NVLinks as a human readable string.

    The value is taken from :meth:`nvlink_total_rx_throughput` (KiB/s), converted to bytes, and
    rendered with ``bytes2human`` followed by a ``/s`` suffix. The underlying value is derived
    from data counters sampled between method calls and thus is the NVLink throughput over that
    interval. For the first call, the function is blocking for 20ms to get the first data
    counters.

    Args:
        interval (Optional[float]):
            The interval in seconds between two calls to get the NVLink throughput. If
            ``interval`` is a positive number, compares throughput counters before and after
            the interval (blocking). If ``interval`` is :const:`0.0` or :data:`None`, compares
            throughput counters since the last call, returning immediately (non-blocking).

    Returns: Union[str, NaType]
        The total NVLink receive data throughput for all NVLinks in human readable format, or
        :const:`nvitop.NA` when not applicable.
    """
    kib_per_second = self.nvlink_total_rx_throughput(interval=interval)
    # Anything that is not a usable integer cannot be formatted; report it as N/A.
    if not libnvml.nvmlCheckReturn(kib_per_second, int):
        return NA
    bytes_per_second = kib_per_second * 1024  # KiB/s -> B/s
    return f'{bytes2human(bytes_per_second)}/s'

def display_active(self) -> str | NaType:
"""A flag that indicates whether a display is initialized on the GPU's (e.g. memory is allocated on the device for display).
Expand Down

0 comments on commit 8fb9653

Please sign in to comment.