diff --git a/machinestate.py b/machinestate.py index cf29899..03f1117 100755 --- a/machinestate.py +++ b/machinestate.py @@ -137,6 +137,8 @@ VEOS_BASE = "/opt/nec/ve/bin" # The NvidiaInfo class requires this path if nvidia-smi is not in $PATH NVIDIA_PATH = "/opt/nvidia/bin" +#The RocmInfo class requires this path to call rocm-smi if not in $PATH +ROCM_PATH = "/opt/rocm/bin" # The OpenCLInfo class requires this path if clinfo is not in $PATH CLINFO_PATH = "/usr/bin" @@ -1227,7 +1229,8 @@ def __init__(self, nvidia_path=NVIDIA_PATH, modulecmd=MODULECMD_PATH, vecmd_path=VEOS_BASE, - clinfo_path=CLINFO_PATH): + clinfo_path=CLINFO_PATH, + rocm_path=ROCM_PATH): super(MachineState, self).__init__(extended=extended, anonymous=anonymous) self.loglevel = loglevel self.dmifile = dmifile @@ -1238,6 +1241,7 @@ def __init__(self, self.modulecmd = modulecmd self.vecmd_path = vecmd_path self.clinfo_path = clinfo_path + self.rocm_path = rocm_path ostype = get_ostype() if ostype == "Linux": self.classlist = [ @@ -1288,6 +1292,8 @@ def __init__(self, self.classargs.append({"executable" : executable}) self.classlist.append(OpenCLInfo) self.classargs.append({"clinfo_path" : clinfo_path}) + self.classlist.append(RocmInfo) + self.classargs.append({"rocm_path" : rocm_path}) if likwid_enable: if likwid_path is None or not pexists(likwid_path): path = which("likwid-topology") @@ -3196,6 +3202,106 @@ def __init__(self, vecmd_path="", extended=False, anonymous=False): self.subargs = {"vecmd_path" : vecmd_path} ################################################################################ +# Infos about AMD ROCm devices +################################################################################ +class RocmInfoClass(InfoGroup): + '''Class for a single ROCm devices''' + def __init__(self, device, extended=False, anonymous=False, rocm_path=ROCM_PATH): + super(RocmInfoClass, self).__init__(name="Card{}".format(device), + extended=extended, + anonymous=anonymous) + self.device = device + self.rocm_path = rocm_path + rocmcmd = pjoin(self.rocm_path, "rocm-smi") + if not os.access(rocmcmd, os.X_OK) and which("rocm-smi"): + rocmcmd = which("rocm-smi") + rocm_args = "-d {} -a --json | python3 -m json.tool".format(self.device) + self.addc("Vendor", rocmcmd, rocm_args, r'^\s+"Card vendor\":\s+\"(.*)\"[,]?$', str) + self.addc("VBiosVersion", rocmcmd, rocm_args, r'^\s+"VBIOS version\":\s+\"(.*)\"[,]?$', str) + self.addc("SKU", rocmcmd, rocm_args, r'^\s+"Card SKU\":\s+\"(.*)\"[,]?$', str) + self.addc("Model", rocmcmd, rocm_args, r'^\s+"Card model\":\s+\"(.*)\"[,]?$', str) + self.addc("Series", rocmcmd, rocm_args, r'^\s+"Card series\":\s+\"(.*)\"[,]?$', str) + self.addc("ID", rocmcmd, rocm_args, r'^\s+"GPU ID\":\s+\"(.*)\"[,]?$', str) + self.addc("MemoryVendor", rocmcmd, rocm_args, r'^\s+"GPU memory vendor\":\s+\"(.*)\"[,]?$', str) + self.addc("MaxPackagePowerWatt", rocmcmd, rocm_args, r'^\s+"Max Graphics Package Power \(W\)\":\s+\"(.*)\"[,]?$', float) + self.addc("PerformanceLevel", rocmcmd, rocm_args, r'^\s+"Performance Level\":\s+\"(.*)\"[,]?$', str) + self.addc("SerialNumber", rocmcmd, rocm_args, r'^\s+"Serial Number\":\s+\"(.*)\"[,]?$', str) + self.addc("VoltageMilliVolt", rocmcmd, rocm_args, r'^\s+"Voltage \(mV\)\":\s+\"(.*)\"[,]?$', int) + self.addc("TemperatureEdge", rocmcmd, rocm_args, r'^\s+"Temperature \(Sensor edge\) \(C\)\":\s+\"(.*)\"[,]?$', int) + self.addc("TemperatureJunction", rocmcmd, rocm_args, r'^\s+"Temperature \(Sensor junction\) \(C\)\":\s+\"(.*)\"[,]?$', int) + self.addc("TemperatureMemory", rocmcmd, rocm_args, r'^\s+"Temperature \(Sensor memory\) \(C\)\":\s+\"(.*)\"[,]?$', int) + self.addc("FClk", rocmcmd, rocm_args, r'^\s+"fclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz) + self.addc("MClk", rocmcmd, rocm_args, r'^\s+"mclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz) + self.addc("SClk", rocmcmd, rocm_args, r'^\s+"sclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz) + self.addc("SocClk", rocmcmd, rocm_args, r'^\s+"socclk clock speed[:]?\":\s+\"\((.*)\)\"[,]?$', tohertz) + self.addc("PcieClk", rocmcmd, rocm_args, r'^\s+"pcie clock level\":\s+\"(.*)\"[,]?$', str) + self.required("Vendor", "VBiosVersion", "SKU", "Model", "Series", "PerformanceLevel", "MemoryVendor", "PcieClk") + if self.extended: + self.addc("DmcuFirmwareVersion", rocmcmd, rocm_args, r'^\s+"DMCU firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("CeFirmwareVersion", rocmcmd, rocm_args, r'^\s+"CE firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("AsdFirmwareVersion", rocmcmd, rocm_args, r'^\s+"ASD firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("McFirmwareVersion", rocmcmd, rocm_args, r'^\s+"MC firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("MeFirmwareVersion", rocmcmd, rocm_args, r'^\s+"ME firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("MecFirmwareVersion", rocmcmd, rocm_args, r'^\s+"MEC firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("Mec2FirmwareVersion", rocmcmd, rocm_args, r'^\s+"MEC2 firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("PfpFirmwareVersion", rocmcmd, rocm_args, r'^\s+"PFP firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("RlcFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("RlcSrlcFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC SRLC firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("RlcSrlgFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC SRLG firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("RlcSrlsFirmwareVersion", rocmcmd, rocm_args, r'^\s+"RLC SRLS firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("SdmaFirmwareVersion", rocmcmd, rocm_args, r'^\s+"SDMA firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("Sdma2FirmwareVersion", rocmcmd, rocm_args, r'^\s+"SDMA2 firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("SmcFirmwareVersion", rocmcmd, rocm_args, r'^\s+"SMC firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("SosFirmwareVersion", rocmcmd, rocm_args, r'^\s+"SOS firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("TaRasFirmwareVersion", rocmcmd, rocm_args, r'^\s+"TA RAS firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("TaXgmiFirmwareVersion", rocmcmd, rocm_args, r'^\s+"TA XGMI firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("UvdFirmwareVersion", rocmcmd, rocm_args, r'^\s+"UVD firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("VceFirmwareVersion", rocmcmd, rocm_args, r'^\s+"VCE firmware version\":\s+\"(.*)\"[,]?$', str) + self.addc("VcnFirmwareVersion", rocmcmd, rocm_args, r'^\s+"VCN firmware version\":\s+\"(.*)\"[,]?$', str) + +class RocmInfo(ListInfoGroup): + '''Class to spawn subclasses for ROCm devices''' + def __init__(self, extended=False, anonymous=False, rocm_path=ROCM_PATH): + super(RocmInfo, self).__init__(name="RocmInfo", + extended=extended, + anonymous=anonymous) + self.rocm_path = rocm_path + rocmsmi = pjoin(self.rocm_path, "rocm-smi") + if not os.access(rocmsmi, os.X_OK) and which("rocm-smi"): + rocmsmi = which("rocm-smi") + rocminfo = pjoin(self.rocm_path, "rocminfo") + if not os.access(rocminfo, os.X_OK) and which("rocminfo"): + rocminfo = which("rocminfo") + hipconfig = pjoin(self.rocm_path, "hipconfig") + if not os.access(hipconfig, os.X_OK) and which("hipconfig"): + hipconfig = which("hipconfig") + data = process_cmd((rocmsmi)) + if data is not None: + for l in data.split("\n"): + m = re.match(r"^(\d+)\s+[\d\.]+c.*", l) + if m: + dev = int(m.group(1)) + if dev not in devicelist: + self.userlist.append(dev) + self.subclass = RocmInfoClass + self.subargs = {"rocm_path" : self.rocm_path} + self.addc("DriverVersion", rocmsmi, "--showdriverversion", r"Driver version:\s+([\d\.]+)", str) + self.addc("RocmSmiVersion", rocmsmi, "--help", r"ROCM-SMI version: (\d+\.\d+\.\d+)", str) + self.addc("RocmKernelVersion", rocmsmi, "--help", r"Kernel version: (\d+\.\d+\.\d+)", str) + self.addc("RuntimeVersion", rocminfo, regex=r"Runtime Version:\s+(\d+\.\d+\.\d+)", parse=str) + self.addc("HipVersion", hipconfig, "--version") + self.addc("HipRuntime", hipconfig, "--runtime") + self.addc("HipCompiler", hipconfig, "--compiler") + self.addc("HipPlatform", hipconfig, "--platform") + self.addc("HipC++Options", hipconfig, "--cpp_config") + self.required("DriverVersion") + self.required("RuntimeVersion") + self.required("HipVersion") + self.required("HipRuntime") + self.required("HipCompiler") + self.required("HipPlatform") + +################################################################################ # Infos from clinfo (OpenCL devices and runtime) ################################################################################ class OpenCLInfoPlatformDeviceClass(InfoGroup): @@ -3441,6 +3547,7 @@ def read_config(config={"extended" : False, "anonymous" : False, "executable" : "modulecmd" : MODULECMD_PATH, "vecmd_path" : VEOS_BASE, "nvidia_path" : NVIDIA_PATH, + "rocm_path" : ROCM_PATH, "loglevel" : DEFAULT_LOGLEVEL, "clinfo_path" : CLINFO_PATH, "anonymous" : False,