sdnext/modules/api/nvml.py

try:
    # Preferred path: reuse the project's installer helpers (package install + shared logger).
    from installer import install, log
except Exception:
    # Fallback when `installer` cannot be imported:
    # `install` becomes a no-op and `log` is a standard-library logger named after this module.
    def install(*args, **kwargs): # pylint: disable=unused-argument
        pass
    import logging
    log = logging.getLogger(__name__)


# True once NVML has been initialized (set by get_nvml() after nvmlInit(), or by the __main__ block below);
# while True, get_nvml() skips its install/init step.
nvml_initialized = False
# Set by warn_once() after the first error has been logged; get_nvml() returns [] while it is set.
warned = False


def warn_once(msg):
    """
    Log `msg` at error level the first time this is called.

    Every later call is a no-op: the module-level `warned` flag is set after
    the first message has been logged and is never reset here.
    """
    global warned # pylint: disable=global-statement
    if warned:
        return
    log.error(msg)
    warned = True

def get_reason(val):
    """
    Turn a throttle-reason bitmask into a human-readable summary.

    Every bit of `val` that has a known label contributes that label; labels
    are listed in ascending bit order and joined with ', '. Returns 'ok' when
    none of the known bits is set (unknown bits are ignored).
    """
    known_bits = (
        (1, 'gpu idle'),
        (2, 'applications clocks setting'),
        (4, 'sw power cap'),
        (8, 'hw slowdown'),
        (16, 'sync boost'),
        (32, 'sw thermal slowdown'),
        (64, 'hw thermal slowdown'),
        (128, 'hw power brake slowdown'),
        (256, 'display clock setting'),
    )
    active = []
    for bit, label in known_bits:
        if bit & val:
            active.append(label)
    if not active:
        return 'ok'
    return ', '.join(active)


def get_nvml():
    """
    Collect a status snapshot for every GPU reported by NVML.

    On the first call the `nvidia-ml-py` package is requested through
    `install` and NVML is initialized; later calls reuse that state
    (tracked by the module-level `nvml_initialized` flag).

    Returns:
        A list with one dict per device:
          - 'name':  device name, or '' if it cannot be read
          - 'data':  dict of label -> formatted text
          - 'chart': [memory utilization, gpu utilization]
        An empty list is returned once `warn_once` has fired, or when this
        call fails as a whole (import, init, device enumeration, utilization
        or memory queries); that failure is reported through `warn_once`.

    A single `data` row whose rendering raises (e.g. a metric the device does
    not support, or a zero total in a percentage) is shown as 'n/a' instead of
    discarding the whole snapshot and disabling all further calls.
    """
    global nvml_initialized # pylint: disable=global-statement
    if warned:
        return []
    try:
        from modules.memstats import ram_stats
        first_call = not nvml_initialized
        if first_call:
            install('nvidia-ml-py', quiet=True)
        import pynvml # pylint: disable=redefined-outer-name
        if first_call:
            pynvml.nvmlInit()
            log.debug('NVML initialized')
            nvml_initialized = True

        def field(label, render):
            # render one row; a failing row degrades to 'n/a' rather than aborting the call
            try:
                return render()
            except Exception as e:
                if first_call: # report only on the initializing call so repeated calls do not flood the log
                    log.debug(f'NVML: {label} unavailable: {e}')
                return 'n/a'

        def describe(dev, load, mem, ram):
            # rows are deferred behind lambdas so that each one can be rendered and guarded independently
            rows = {
                "CUDA": lambda: f'Version {pynvml.nvmlSystemGetCudaDriverVersion()} Compute {pynvml.nvmlDeviceGetCudaComputeCapability(dev)}',
                "Driver": lambda: pynvml.nvmlSystemGetDriverVersion(),
                "Hardware": lambda: f'VBIOS {pynvml.nvmlDeviceGetVbiosVersion(dev)} ROM {pynvml.nvmlDeviceGetInforomImageVersion(dev)}',
                "PCI link": lambda: f'Gen.{pynvml.nvmlDeviceGetCurrPcieLinkGeneration(dev)} x{pynvml.nvmlDeviceGetCurrPcieLinkWidth(dev)}',
                "Power": lambda: f'{round(pynvml.nvmlDeviceGetPowerUsage(dev)/1000, 2)} W / {round(pynvml.nvmlDeviceGetEnforcedPowerLimit(dev)/1000, 2)} W',
                # clock ids 0 / 1 / 2 are the ones shown as GPU / SM / VRAM clocks: current value first, maximum second
                "GPU clock": lambda: f'{pynvml.nvmlDeviceGetClockInfo(dev, 0)} Mhz / {pynvml.nvmlDeviceGetMaxClockInfo(dev, 0)} Mhz',
                "SM clock": lambda: f'{pynvml.nvmlDeviceGetClockInfo(dev, 1)} Mhz / {pynvml.nvmlDeviceGetMaxClockInfo(dev, 1)} Mhz',
                "VRAM clock": lambda: f'{pynvml.nvmlDeviceGetClockInfo(dev, 2)} Mhz / {pynvml.nvmlDeviceGetMaxClockInfo(dev, 2)} Mhz',
                "VRAM usage": lambda: f'{round(100 * mem.used / mem.total)}% | {round(mem.used / 1024 / 1024)} MB used | {round(mem.free / 1024 / 1024)} MB free | {round(mem.total / 1024 / 1024)} MB total',
                "RAM usage": lambda: f'{round(100 * ram["used"] / ram["total"])}% | {round(1024 * ram["used"])} MB used | {round(1024 * ram["free"])} MB free | {round(1024 * ram["total"])} MB total',
                "System load": lambda: f'GPU {load.gpu}% | VRAM {load.memory}% | Temp {pynvml.nvmlDeviceGetTemperature(dev, 0)}C | Fan {pynvml.nvmlDeviceGetFanSpeed(dev)}%',
                'State': lambda: get_reason(pynvml.nvmlDeviceGetCurrentClocksThrottleReasons(dev)),
            }
            return {label: field(label, render) for label, render in rows.items()}

        ram = ram_stats() # host RAM does not depend on the device, so query it once per call
        devices = []
        for i in range(pynvml.nvmlDeviceGetCount()):
            dev = pynvml.nvmlDeviceGetHandleByIndex(i)
            try:
                name = pynvml.nvmlDeviceGetName(dev)
            except Exception:
                name = ''
            # utilization and memory info feed both the rows and the chart, so a failure here still aborts the call
            load = pynvml.nvmlDeviceGetUtilizationRates(dev)
            mem = pynvml.nvmlDeviceGetMemoryInfo(dev)
            devices.append({
                'name': name,
                'data': describe(dev, load, mem, ram),
                'chart': [load.memory, load.gpu],
            })
        return devices
    except Exception as e:
        warn_once(f'NVML: {e}')
        return []


if __name__ == '__main__':
    # Manual smoke test when the file is run as a script.
    # NVML is initialized here directly, and the flag is set first so that
    # get_nvml() skips its own install/init step and only imports pynvml.
    nvml_initialized = True
    import pynvml # pylint: disable=redefined-outer-name
    pynvml.nvmlInit()
    from rich import print as rprint
    # pretty-print the snapshot of each detected device
    for gpu in get_nvml():
        rprint(gpu)