1
0
mirror of https://github.com/vladmandic/sdnext.git synced 2026-01-27 15:02:48 +03:00
Files
sdnext/modules/api/rocm_smi.py
Vladimir Mandic 41640773e5 fix global state tracking
Signed-off-by: Vladimir Mandic <mandic00@live.com>
2025-08-10 09:19:49 -04:00

115 lines
4.2 KiB
Python

import math
import json
import subprocess as sp
from enum import IntFlag
try:
from installer import log
except Exception:
import logging
log = logging.getLogger(__name__)
try:
from modules.rocm import version as rocm_version
except Exception:
rocm_version = "unknown"
# ThrottleStatus is from leuc/amdgpu_metrics.py
class ThrottleStatus(IntFlag):
    """Bit flags decoded from an AMD GPU throttle status bitmask.

    Bit positions mirror linux/drivers/gpu/drm/amd/pm/inc/amdgpu_smu.h.
    Iterating an instance, or converting it to ``str``, produces the names of
    the flags that are set, in declaration order (not the member objects).
    """

    # bits 0-7
    PPT0 = 1 << 0
    PPT1 = 1 << 1
    PPT2 = 1 << 2
    PPT3 = 1 << 3
    SPL = 1 << 4
    FPPT = 1 << 5
    SPPT = 1 << 6
    SPPT_APU = 1 << 7
    # bits 16-23
    TDC_GFX = 1 << 16
    TDC_SOC = 1 << 17
    TDC_MEM = 1 << 18
    TDC_VDD = 1 << 19
    TDC_CVIP = 1 << 20
    EDC_CPU = 1 << 21
    EDC_GFX = 1 << 22
    APCC = 1 << 23
    # bits 32-47
    TEMP_GPU = 1 << 32
    TEMP_CORE = 1 << 33
    TEMP_MEM = 1 << 34
    TEMP_EDGE = 1 << 35
    TEMP_HOTSPOT = 1 << 36
    TEMP_SOC = 1 << 37
    TEMP_VR_GFX = 1 << 38
    TEMP_VR_SOC = 1 << 39
    TEMP_VR_MEM0 = 1 << 40
    TEMP_VR_MEM1 = 1 << 41
    TEMP_LIQUID0 = 1 << 42
    TEMP_LIQUID1 = 1 << 43
    VRHOT0 = 1 << 44
    VRHOT1 = 1 << 45
    PROCHOT_CPU = 1 << 46
    PROCHOT_GFX = 1 << 47
    # bits 56-57
    PPM = 1 << 56
    FIT = 1 << 57

    def active(self):
        """Return a generator over the names of all flags set in this value."""
        mask = self.value
        return (
            name
            for name, flag in type(self).__members__.items()
            if (flag.value & mask) != 0
        )

    def __iter__(self):
        """Iterate over the names of the set flags (same as ``active()``)."""
        return self.active()

    def __str__(self):
        """Return the set flag names joined by ``', '``; empty string when none are set."""
        names = list(self.active())
        return ', '.join(names)
def _smi_float(value, default=0.0):
    """Convert a rocm-smi field to a finite float, or return *default* if it is not numeric (e.g. "N/A")."""
    try:
        number = float(value)
    except (TypeError, ValueError):
        return default
    return number if math.isfinite(number) else default


def _smi_pcie_gen(speed):
    """Derive the PCIe generation from a link speed expressed in units of 0.1 GT/s."""
    rate = _smi_float(speed)
    if rate <= 0:  # log2 is undefined here; report it instead of raising
        return 'unknown'
    return int(math.log2(rate / 10))


def _smi_max_clock(clock_range):
    """Return the upper bound of a "low - high" clock range with a trailing "Mhz" removed."""
    return str(clock_range).split(" - ")[-1].removesuffix("Mhz")


def _smi_throttle(status):
    """Render a raw throttle bitmask as ThrottleStatus names, or 'unknown' if it is not an integer."""
    try:
        return str(ThrottleStatus(int(status)))
    except (TypeError, ValueError):
        return 'unknown'


def get_rocm_smi():
    """Query ``rocm-smi -a --json`` and summarize every reported device.

    Returns:
        A list with one dict per device, each holding:
        ``'name'`` (device name), ``'data'`` (label -> human readable string)
        and ``'chart'`` (``[memory load, gpu load]`` as reported by rocm-smi).
        An empty list is returned, and the error is logged, when rocm-smi
        cannot be executed or its output cannot be parsed.

    Individual fields that are missing or non-numeric (such as "N/A") fall
    back to a default for that field only, so one bad value does not discard
    the data of every device.
    """
    try:
        smi = json.loads(sp.check_output(("rocm-smi", "-a", "--json")))
        # The "system" section is not a device; remove it before iterating.
        system = smi.pop("system", None)
        driver_version = system.get("Driver version", "unknown") if isinstance(system, dict) else "unknown"
        devices = []
        for card in smi.values():
            if not isinstance(card, dict):  # skip anything that is not a per-device record
                continue
            load = {
                'gpu': card.get('GPU use (%)', 'unknown'),
                'memory': card.get("GPU Memory Allocated (VRAM%)", "unknown"),
                'temp': card.get('Temperature (Sensor edge) (C)', 'unknown'),
                'temp_junction': card.get('Temperature (Sensor junction) (C)', 'unknown'),
                'temp_memory': card.get('Temperature (Sensor memory) (C)', 'unknown'),
                'fan': card.get('Fan speed (%)', 'unknown'),
            }
            agent = card.get("GFX Version", "unknown")
            vbios = card.get("VBIOS version", "unknown")
            pcie_gen = _smi_pcie_gen(card.get("pcie_link_speed (0.1 GT/s)", 10))
            pcie_width = card.get("pcie_link_width (Lanes)", "unknown")
            power_avg = round(_smi_float(card.get("Average Graphics Package Power (W)", 0)), 2)
            power_max = round(_smi_float(card.get("Max Graphics Package Power (W)", 0)), 2)
            gpu_clock = card.get("average_gfxclk_frequency (MHz)", 0)
            gpu_clock_max = _smi_max_clock(card.get("Valid sclk range", "0"))
            vram_clock = card.get("current_uclk (MHz)", 0)
            vram_clock_max = _smi_max_clock(card.get("Valid mclk range", "0"))
            vram_activity = card.get("GPU Memory Read/Write Activity (%)", "unknown")
            data = {
                "ROCm": f'version {rocm_version} agent {agent}',
                "Driver": driver_version,
                "Hardware": f'VBIOS {vbios}',
                "PCI link": f'Gen.{pcie_gen} x{pcie_width}',
                "Power": f'{power_avg} W / {power_max} W',
                "GPU clock": f'{gpu_clock} Mhz / {gpu_clock_max} Mhz',
                "VRAM clock": f'{vram_clock} Mhz / {vram_clock_max} Mhz',
                "VRAM usage": f'{load["memory"]}% Used | {vram_activity}% Activity',
                "GPU usage": f'GPU {load["gpu"]}% | Fan {load["fan"]}%',
                "GPU temp": f'Edge {load["temp"]}C | Junction {load["temp_junction"]}C | Memory {load["temp_memory"]}C',
                'Throttle reason': _smi_throttle(card.get("throttle_status", 0)),
            }
            devices.append({
                'name': card.get('Device Name', 'unknown'),
                'data': data,
                'chart': [load["memory"], load["gpu"]],
            })
        return devices
    except Exception as e:
        # Best-effort boundary: a missing binary or malformed JSON must not break the caller.
        log.error(f'ROCm SMI: {e}')
        return []
if __name__ == '__main__':
    # Manual smoke test: pretty-print each device record returned by get_rocm_smi().
    from rich import print as rprint
    for device_info in get_rocm_smi():
        rprint(device_info)