import re
import sys
import os
import psutil
import torch
from modules import shared, errors

fail_once = False
ram = {}
gpu = {}
mem = {}
process = None
docker_limit = None
runpod_limit = None


def gb(val: float):
    """Convert bytes to gigabytes, rounded to two decimals."""
    return round(val / 1024 / 1024 / 1024, 2)


def get_docker_limit():
    """Read the cgroup v1 memory limit, caching the result; fall back to unlimited."""
    global docker_limit # pylint: disable=global-statement
    if docker_limit is not None:
        return docker_limit
    try:
        with open('/sys/fs/cgroup/memory/memory.limit_in_bytes', 'r', encoding='utf8') as f:
            docker_limit = float(f.read())
    except Exception:
        docker_limit = sys.float_info.max
    if docker_limit == 0:
        docker_limit = sys.float_info.max
    return docker_limit


def get_runpod_limit():
    """Read the RunPod memory limit from the environment, caching the result; fall back to unlimited."""
    global runpod_limit # pylint: disable=global-statement
    if runpod_limit is not None:
        return runpod_limit
    runpod_limit = float(os.environ.get('RUNPOD_MEM_GB', 0)) * 1024 * 1024 * 1024
    if runpod_limit == 0:
        runpod_limit = sys.float_info.max
    return runpod_limit


def ram_stats():
    """Populate and return the `ram` dict with process and system memory usage in GB."""
    global process, fail_once # pylint: disable=global-statement
    try:
        if process is None:
            process = psutil.Process(os.getpid())
        res = process.memory_info()
        if 'total' not in ram: # total is computed once and capped by container limits
            process = psutil.Process(os.getpid())
            ram_total = 100 * res.rss / process.memory_percent()
            ram_total = min(ram_total, get_docker_limit(), get_runpod_limit())
            ram['total'] = gb(ram_total)
        ram['rss'] = gb(res.rss)
    except Exception as e:
        ram['total'] = 0
        ram['rss'] = 0
        ram['error'] = str(e)
        if not fail_once:
            shared.log.error(f'RAM stats: {e}')
            errors.display(e, 'RAM stats')
            fail_once = True
    try:
        vmem = psutil.virtual_memory()
        ram['used'] = gb(vmem.used) if hasattr(vmem, 'used') else 0
        ram['free'] = gb(vmem.free) if hasattr(vmem, 'free') else 0
        ram['avail'] = gb(vmem.available) if hasattr(vmem, 'available') else 0
        ram['buffers'] = gb(vmem.buffers) if hasattr(vmem, 'buffers') else 0
        ram['cached'] = gb(vmem.cached) if hasattr(vmem, 'cached') else 0
    except Exception as e:
        ram['used'] = 0
        ram['free'] = 0
        ram['avail'] = 0
        ram['buffers'] = 0
        ram['cached'] = 0
        ram['error'] = str(e)
        if not fail_once:
            shared.log.error(f'RAM stats: {e}')
            errors.display(e, 'RAM stats')
            fail_once = True
    return ram


def gpu_stats():
    """Populate and return the `gpu` dict with CUDA memory usage and allocator counters."""
    global fail_once # pylint: disable=global-statement
    try:
        free, total = torch.cuda.mem_get_info()
        gpu['used'] = gb(total - free)
        gpu['total'] = gb(total)
        stats = dict(torch.cuda.memory_stats())
        if stats.get('num_ooms', 0) > 0:
            shared.state.oom = True
        gpu['active'] = gb(stats.get('active_bytes.all.current', 0))
        gpu['peak'] = gb(stats.get('active_bytes.all.peak', 0))
        gpu['retries'] = stats.get('num_alloc_retries', 0)
        gpu['oom'] = stats.get('num_ooms', 0)
    except Exception as e:
        gpu['total'] = 0
        gpu['used'] = 0
        gpu['error'] = str(e)
        if not fail_once:
            shared.log.warning(f'GPU stats: {e}')
            # errors.display(e, 'GPU stats')
            fail_once = True
    return gpu


def memory_stats():
    """Return combined RAM/GPU stats plus the current job name."""
    mem['ram'] = ram_stats()
    mem['gpu'] = gpu_stats()
    mem['job'] = shared.state.job
    try:
        # portion of active allocations exceeding reported device usage
        mem['gpu']['swap'] = round(mem['gpu']['active'] - mem['gpu']['used']) if mem['gpu']['active'] > mem['gpu']['used'] else 0
    except Exception:
        mem['gpu']['swap'] = 0
    return mem


def reset_stats():
    """Reset CUDA allocator peak counters so subsequent readings start fresh."""
    try:
        torch.cuda.reset_peak_memory_stats()
    except Exception:
        pass


class Object:
    """Lightweight descriptor of an arbitrary object: name, type, size and refcount."""
    pattern = r"'(.*?)'"

    def __init__(self, name, obj):
        self.id = id(obj)
        self.name = name
        self.fn = sys._getframe(2).f_code.co_name # name of the caller two frames up
        self.refcount = sys.getrefcount(obj)
        if torch.is_tensor(obj):
            self.type = obj.dtype
            self.size = obj.element_size() * obj.nelement()
        else:
            self.type = re.findall(self.pattern, str(type(obj)))[0]
            self.size = sys.getsizeof(obj)

    def __str__(self):
        return f'{self.fn}.{self.name} type={self.type} size={self.size} ref={self.refcount}'


def get_objects(gcl={}, threshold: int = 0):
    """Trace-log objects from a globals/locals dict that are at least `threshold` bytes, largest first."""
    objects = []
    seen = []
    for name, obj in gcl.items():
        if id(obj) in seen:
            continue
        seen.append(id(obj))
        if name == '__name__':
            name = obj
        elif name.startswith('__'):
            continue
        try:
            o = Object(name, obj)
            if o.size >= threshold:
                objects.append(o)
        except Exception:
            pass
    objects = sorted(objects, key=lambda x: x.size, reverse=True)
    for obj in objects:
        shared.log.trace(obj)
    return objects
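
# Illustrative usage sketch (an assumption, not part of the original module): it presumes
# this file is importable as `modules.memstats` inside the webui runtime, so that
# `modules.shared` and `modules.errors` resolve and a CUDA device is available.
#
#     from modules import memstats
#
#     stats = memstats.memory_stats()
#     print(f"RAM {stats['ram']['used']}/{stats['ram']['total']} GB, "
#           f"GPU {stats['gpu']['used']}/{stats['gpu']['total']} GB")
#
#     # trace-log all locals larger than 1 MB, largest first
#     memstats.get_objects(gcl=locals(), threshold=1024 * 1024)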