From 92878be51838456b927ef01c4c854dc1b60503ff Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Thu, 11 Jan 2018 18:01:04 +0100 Subject: [PATCH 01/16] print spec control CPUID info Change-Id: Ib46ee52111b5539155983c2fb92d4457962dd668 --- cpucounters.cpp | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/cpucounters.cpp b/cpucounters.cpp index 1af07b72..89669fcc 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -471,6 +471,12 @@ bool PCM::detectModel() return false; } + pcm_cpuid(7, 0, cpuinfo); + + std::cout << "IBRS and IBPB supported : " << ((cpuinfo.reg.edx & (1 << 26)) ? "yes" : "no") << std::endl; + std::cout << "STIBP supported : " << ((cpuinfo.reg.edx & (1 << 27)) ? "yes" : "no") << std::endl; + std::cout << "Spec arch caps supported : " << ((cpuinfo.reg.edx & (1 << 29)) ? "yes" : "no") << std::endl; + return true; } From 07c85e9aa318ebe7e3d66338a00891e542bc2f91 Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Wed, 24 Jan 2018 09:18:11 +0100 Subject: [PATCH 02/16] print spec control MSRs Change-Id: I093f94bfd26e517ca124b6af051fa8e0045030f2 --- cpucounters.cpp | 26 ++++++++++++++++++++++++++ cpucounters.h | 1 + types.h | 3 +++ 3 files changed, 30 insertions(+) diff --git a/cpucounters.cpp b/cpucounters.cpp index 89669fcc..e05e962b 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -1566,6 +1566,8 @@ PCM::PCM() : if(!detectNominalFrequency()) return; + showSpecControlMSRs(); + initEnergyMonitoring(); initUncoreObjects(); @@ -1609,6 +1611,30 @@ void PCM::enableJKTWorkaround(bool enable) } } +void PCM::showSpecControlMSRs() +{ + PCM_CPUID_INFO cpuinfo; + pcm_cpuid(7, 0, cpuinfo); + + if (MSR.size()) + { + if ((cpuinfo.reg.edx & (1 << 26)) || (cpuinfo.reg.edx & (1 << 27))) + { + uint64 val64 = 0; + MSR[0]->read(MSR_IA32_SPEC_CTRL, &val64); + std::cout << "IBRS enabled in the kernel : " << ((val64 & 1) ? "yes" : "no") << std::endl; + std::cout << "STIBP enabled in the kernel : " << ((val64 & 2) ? "yes" : "no") << std::endl; + } + if (cpuinfo.reg.edx & (1 << 29)) + { + uint64 val64 = 0; + MSR[0]->read(MSR_IA32_ARCH_CAPABILITIES, &val64); + std::cout << "The processor is not susceptible to Rogue Data Cache Load: " << ((val64 & 1) ? "yes" : "no") << std::endl; + std::cout << "The processor supports enhanced IBRS : " << ((val64 & 2) ? "yes" : "no") << std::endl; + } + } +} + bool PCM::isCoreOnline(int32 os_core_id) const { return (topology[os_core_id].os_id != -1) && (topology[os_core_id].core_id != -1) && (topology[os_core_id].socket != -1); diff --git a/cpucounters.h b/cpucounters.h index dc0a9038..5b0f6a4f 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -541,6 +541,7 @@ class PCM_API PCM void printSystemTopology() const; bool initMSR(); bool detectNominalFrequency(); + void showSpecControlMSRs(); void initEnergyMonitoring(); void initUncoreObjects(); /*! diff --git a/types.h b/types.h index f09afcec..50ec8f03 100644 --- a/types.h +++ b/types.h @@ -882,6 +882,9 @@ struct IIOPMUCNTCTLRegister #define MSR_CORE_C6_RESIDENCY (0x3FD) #define MSR_CORE_C7_RESIDENCY (0x3FE) +#define MSR_IA32_SPEC_CTRL (0x48) +#define MSR_IA32_ARCH_CAPABILITIES (0x10A) + #ifdef _MSC_VER #include // data structure for converting two uint32s <-> uin64 From bcabb94b5755445d70d1891457f2d60ed2197b55 Mon Sep 17 00:00:00 2001 From: Roman Dementiev Date: Tue, 29 Jan 2019 16:20:08 +0100 Subject: [PATCH 03/16] use Linux uncore perf interface for Secure Boot Change-Id: Ib1666f19aef0020827c0f34c9a4b1ebff265215b --- cpucounters.cpp | 441 ++++++++++++++++++++++++++++++++++++++++++------ cpucounters.h | 20 +++ types.h | 4 + 3 files changed, 417 insertions(+), 48 deletions(-) diff --git a/cpucounters.cpp b/cpucounters.cpp index 091ab7ea..f0e2b7da 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -512,6 +512,7 @@ bool PCM::detectModel() bool PCM::QOSMetricAvailable() const { + if (isSecureBoot()) return false; // TODO: use perf rdt driver PCM_CPUID_INFO cpuinfo; pcm_cpuid(0x7,0,cpuinfo); return (cpuinfo.reg.ebx & (1<<12))?true:false; @@ -519,6 +520,7 @@ bool PCM::QOSMetricAvailable() const bool PCM::L3QOSMetricAvailable() const { + if (isSecureBoot()) return false; // TODO:: use perf rdt driver PCM_CPUID_INFO cpuinfo; pcm_cpuid(0xf,0,cpuinfo); return (cpuinfo.reg.edx & (1<<1))?true:false; @@ -729,18 +731,18 @@ void PCM::initCStateSupportTables() #ifdef __linux__ -std::string readSysFS(const char * path) +std::string readSysFS(const char * path, bool silent = false) { FILE * f = fopen(path, "r"); if (!f) { - std::cerr << "Can not open "<< path <<" file." << std::endl; + if (silent == false) std::cerr << "Can not open "<< path <<" file." << std::endl; return std::string(); } char buffer[1024]; if(NULL == fgets(buffer, 1024, f)) { - std::cerr << "Can not read "<< path << "." << std::endl; + if (silent == false) std::cerr << "Can not read "<< path << "." << std::endl; return std::string(); } fclose(f); @@ -1404,9 +1406,37 @@ void PCM::initUncoreObjects() #endif } } + + if (useLinuxPerfForUncore()) + { + initUncorePMUsPerf(); + } + else + { + initUncorePMUsDirect(); + } +} + +void PCM::initUncorePMUsDirect() +{ for (uint32 s = 0; s < (uint32)num_sockets; ++s) { auto & handle = MSR[socketRefCore[s]]; + uboxPMUs.push_back( + UncorePMU( + std::shared_ptr(), + std::make_shared(handle, UBOX_MSR_PMON_CTL0_ADDR), + std::make_shared(handle, UBOX_MSR_PMON_CTL1_ADDR), + std::shared_ptr(), + std::shared_ptr(), + std::make_shared(handle, UBOX_MSR_PMON_CTR0_ADDR), + std::make_shared(handle, UBOX_MSR_PMON_CTR1_ADDR), + std::shared_ptr(), + std::shared_ptr(), + std::make_shared(handle, UCLK_FIXED_CTL_ADDR), + std::make_shared(handle, UCLK_FIXED_CTR_ADDR) + ) + ); switch (cpu_model) { case IVYTOWN: @@ -1516,6 +1546,32 @@ void PCM::initUncoreObjects() } } +#ifdef PCM_USE_PERF +std::vector enumeratePerfPMUs(const std::string & type, int max_id); +void populatePerfPMUs(unsigned socket_, const std::vector & ids, std::vector & pmus, bool fixed); +#endif + +void PCM::initUncorePMUsPerf() +{ +#ifdef PCM_USE_PERF + iioPMUs.resize(num_sockets); + cboPMUs.resize(num_sockets); + for (uint32 s = 0; s < (uint32)num_sockets; ++s) + { + populatePerfPMUs(s, enumeratePerfPMUs("pcu", 100), pcuPMUs, false); + populatePerfPMUs(s, enumeratePerfPMUs("ubox", 100), uboxPMUs, true); + populatePerfPMUs(s, enumeratePerfPMUs("cbox", 100), cboPMUs[s], false); + populatePerfPMUs(s, enumeratePerfPMUs("cha", 200), cboPMUs[s], false); + std::vector iioPMUVector; + populatePerfPMUs(s, enumeratePerfPMUs("iio", 100), iioPMUVector, false); + for (size_t i = 0; i < iioPMUVector.size(); ++i) + { + iioPMUs[s][i] = iioPMUVector[i]; + } + } +#endif +} + #ifdef __linux__ bool isNMIWatchdogEnabled() @@ -1805,7 +1861,7 @@ bool PCM::good() } #ifdef PCM_USE_PERF -perf_event_attr PCM_init_perf_event_attr() +perf_event_attr PCM_init_perf_event_attr(bool group = true) { perf_event_attr e; bzero(&e,sizeof(perf_event_attr)); @@ -1814,7 +1870,7 @@ perf_event_attr PCM_init_perf_event_attr() e.config = -1; // must be set up later e.sample_period = 0; e.sample_type = 0; - e.read_format = PERF_FORMAT_GROUP; /* PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | + e.read_format = group ? PERF_FORMAT_GROUP : 0; /* PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING | PERF_FORMAT_ID | PERF_FORMAT_GROUP ; */ e.disabled = 0; e.inherit = 0; @@ -4308,6 +4364,60 @@ PciHandleType * ServerPCICFGUncore::createIntelPerfMonDevice(uint32 groupnr_, in return NULL; } +bool PCM::isSecureBoot() const +{ + static int flag = -1; + if (MSR.size() > 0 && flag == -1) + { + // std::cerr << "DEBUG: checking MSR in isSecureBoot" << std::endl; + uint64 val = 0; + if (MSR[0]->read(IA32_PERFEVTSEL0_ADDR, &val) != sizeof(val)) + { + flag = 0; // some problem with MSR read, not secure boot + } + // read works + if (MSR[0]->write(IA32_PERFEVTSEL0_ADDR, val) != sizeof(val)/* && errno == 1 */) // errno works only on windows + { // write does not work -> secure boot + flag = 1; + } + else + { + flag = 0; // can write MSR -> no secure boot + } + } + return flag == 1; +} + +bool PCM::useLinuxPerfForUncore() const +{ + static bool printed = false; + bool secureBoot = isSecureBoot(); +#ifdef PCM_USE_PERF + const char * perf_env = std::getenv("PCM_USE_UNCORE_PERF"); + if (perf_env != NULL && std::string(perf_env) == std::string("1")) + { + if (!printed) std::cout << "INFO: using Linux perf interface to program uncore PMUs because env variable PCM_USE_UNCORE_PERF=1" << std::endl; + printed = true; + return true; + } + if (secureBoot) + { + if (!printed) std::cout << "Secure Boot detected. Using Linux perf for uncore PMU programming." << std::endl; + printed = true; + return true; + } + else +#endif + { + if (secureBoot) + { + if (!printed) std::cerr << "ERROR: Secure Boot detected. Recompile PCM with -DPCM_USE_PERF or disable Secure Boot." << std::endl; + printed = true; + } + } + return false; +} + ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : iMCbus(-1) , UPIbus(-1) @@ -4316,8 +4426,26 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : , cpu_model(pcm->getOriginalCPUModel()) , qpi_speed(0) { - std::vector > > MCRegisterLocation; // MCRegisterLocation[controller]: (device, function) + initRegisterLocations(); + initBuses(socket_, pcm); + + if (pcm->useLinuxPerfForUncore()) + { + initPerf(socket_, pcm); + } + else + { + initDirect(socket_, pcm); + } + + std::cerr << "Socket " << socket_ << ": " << + getNumMC() << " memory controllers detected with total number of " << getNumMCChannels() << " channels. " << + getNumQPIPorts() << " QPI ports detected." << + " " << m2mPMUs.size() << " M2M (mesh to memory) blocks detected." << std::endl; +} +void ServerPCICFGUncore::initRegisterLocations() +{ #define PCM_PCICFG_MC_INIT(controller, channel, arch) \ MCRegisterLocation.resize(controller + 1); \ MCRegisterLocation[controller].resize(channel + 1); \ @@ -4328,14 +4456,10 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : XPIRegisterLocation.resize(port + 1); \ XPIRegisterLocation[port] = std::make_pair(arch##_QPI_PORT##port##_REGISTER_DEV_ADDR, arch##_QPI_PORT##port##_REGISTER_FUNC_ADDR); - std::vector > EDCRegisterLocation; // EDCRegisterLocation: (device, function) - #define PCM_PCICFG_EDC_INIT(controller, clock, arch) \ EDCRegisterLocation.resize(controller + 1); \ EDCRegisterLocation[controller] = std::make_pair(arch##_EDC##controller##_##clock##_REGISTER_DEV_ADDR, arch##_EDC##controller##_##clock##_REGISTER_FUNC_ADDR); - std::vector > M2MRegisterLocation; // M2MRegisterLocation: (device, function) - #define PCM_PCICFG_M2M_INIT(x, arch) \ M2MRegisterLocation.resize(x + 1); \ M2MRegisterLocation[x] = std::make_pair(arch##_M2M_##x##_REGISTER_DEV_ADDR, arch##_M2M_##x##_REGISTER_FUNC_ADDR); @@ -4418,7 +4542,10 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : #undef PCM_PCICFG_QPI_INIT #undef PCM_PCICFG_EDC_INIT #undef PCM_PCICFG_M2M_INIT +} +void ServerPCICFGUncore::initBuses(uint32 socket_, const PCM * pcm) +{ const uint32 total_sockets_ = pcm->getNumSockets(); if (M2MRegisterLocation.size()) @@ -4462,6 +4589,46 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : throw std::exception(); } + if (total_sockets_ == 1) { + /* + * For single socket systems, do not worry at all about QPI ports. This + * eliminates QPI LL programming error messages on single socket systems + * with BIOS that hides QPI performance counting PCI functions. It also + * eliminates register programming that is not needed since no QPI traffic + * is possible with single socket systems. + */ + return; + } + +#ifdef PCM_NOQPI + return; +#endif + + if(cpu_model == PCM::SKX) + { + initSocket2Bus(socket2UPIbus, XPIRegisterLocation[0].first, XPIRegisterLocation[0].second, UPI_DEV_IDS, (uint32)sizeof(UPI_DEV_IDS) / sizeof(UPI_DEV_IDS[0])); + if(total_sockets_ == socket2UPIbus.size()) + { + UPIbus = socket2UPIbus[socket_].second; + if(groupnr != socket2UPIbus[socket_].first) + { + UPIbus = -1; + std::cerr << "PCM error: mismatching PCICFG group number for UPI and IMC perfmon devices." << std::endl; + } + } + else + { + std::cerr << "PCM error: Did not find UPI perfmon device on every socket in a multisocket system." << std::endl; + } + } + else + { + UPIbus = iMCbus; + } +} + +void ServerPCICFGUncore::initDirect(uint32 socket_, const PCM * pcm) +{ { std::vector > imcHandles; @@ -4582,7 +4749,7 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : } } - if (total_sockets_ == 1) { + if (pcm->getNumSockets() == 1) { /* * For single socket systems, do not worry at all about QPI ports. This * eliminates QPI LL programming error messages on single socket systems @@ -4591,8 +4758,6 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : * is possible with single socket systems. */ xpiPMUs.clear(); - std::cerr << "On the socket detected " << getNumMC() << " memory controllers with total number of " << imcPMUs.size() << " channels. " << - m2mPMUs.size() << " M2M (mesh to memory) blocks detected."<< std::endl; return; } @@ -4601,32 +4766,9 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : std::cerr << getNumMC() <<" memory controllers detected with total number of "<< imcPMUs.size() <<" channels. " << m2mPMUs.size() << " M2M (mesh to memory) blocks detected."<< std::endl; return; -#else +#endif std::vector > qpiLLHandles; - - if(cpu_model == PCM::SKX) - { - initSocket2Bus(socket2UPIbus, XPIRegisterLocation[0].first, XPIRegisterLocation[0].second, UPI_DEV_IDS, (uint32)sizeof(UPI_DEV_IDS) / sizeof(UPI_DEV_IDS[0])); - if(total_sockets_ == socket2UPIbus.size()) - { - UPIbus = socket2UPIbus[socket_].second; - if(groupnr != socket2UPIbus[socket_].first) - { - UPIbus = -1; - std::cerr << "PCM error: mismatching PCICFG group number for UPI and IMC perfmon devices." << std::endl; - } - } - else - { - std::cerr << "PCM error: Did not find UPI perfmon device on every socket in a multisocket system." << std::endl; - } - } - else - { - UPIbus = iMCbus; - } - auto xPI = pcm->xPI(); try { @@ -4691,12 +4833,209 @@ ServerPCICFGUncore::ServerPCICFGUncore(uint32 socket_, const PCM * pcm) : ); } } +} + +#ifdef PCM_USE_PERF +class PerfVirtualDummyUnitControlRegister : public HWRegister +{ + uint64 lastValue; +public: + PerfVirtualDummyUnitControlRegister() : lastValue(0) {} + void operator = (uint64 val) override + { + lastValue = val; + } + operator uint64 () override + { + return lastValue; + } +}; + +class PerfVirtualFilterRegister : public HWRegister +{ + uint64 lastValue; + void printError() + { + static bool printed = false; + if (!printed) std::cerr << "ERROR: perf uncore interface does not support filter registers yet." << std::endl; + printed = true; + } +public: + PerfVirtualFilterRegister() : lastValue(0) {} + void operator = (uint64 val) override + { + printError(); + lastValue = val; + } + operator uint64 () override + { + printError(); + return lastValue; + } +}; + +class PerfVirtualControlRegister : public HWRegister +{ + friend class PerfVirtualCounterRegister; + int fd; + int socket; + int pmuID; + perf_event_attr event; + bool fixed; + void close() + { + if (fd >= 0) + { + ::close(fd); + fd = -1; + } + } +public: + PerfVirtualControlRegister(int socket_, int pmuID_, bool fixed_ = false) : + fd(-1), + socket(socket_), + pmuID(pmuID_), + fixed(fixed_) + { + event = PCM_init_perf_event_attr(false); + event.type = pmuID; + } + void operator = (uint64 val) override + { + close(); + event.config = fixed ? 0xff : val; + const auto core = PCM::getInstance()->socketRefCore[socket]; + if ((fd = syscall(SYS_perf_event_open, &event, -1, core, -1, 0)) <= 0) + { + std::cerr << "Linux Perf: Error on programming PMU " << pmuID << ": " << strerror(errno) << std::endl; + std::cerr << "config: 0x" << std::hex << event.config << " config1: 0x" << event.config1 << " config2: 0x" << event.config2 << std::dec << std::endl; + if (errno == 24) std::cerr << "try executing 'ulimit -n 10000' to increase the limit on the number of open files." << std::endl; + return; + } + } + operator uint64 () override + { + return event.config; + } + ~PerfVirtualControlRegister() + { + close(); + } + int getFD() const { return fd; } + int getPMUID() const { return pmuID; } +}; + +class PerfVirtualCounterRegister : public HWRegister +{ + std::shared_ptr controlReg; +public: + PerfVirtualCounterRegister(const std::shared_ptr & controlReg_) : controlReg(controlReg_) + { + } + void operator = (uint64 /* val */) override + { + // no-op + } + operator uint64 () override + { + uint64 result = 0; + if (controlReg.get() && (controlReg->getFD() >= 0)) + { + int status = ::read(controlReg->getFD(), &result, sizeof(result)); + if (status != sizeof(result)) + { + std::cerr << "PCM Error: failed to read from Linux perf handle " << controlReg->getFD() << " PMU " << controlReg->getPMUID() << std::endl; + } + } + return result; + } +}; + +std::vector enumeratePerfPMUs(const std::string & type, int max_id) +{ + auto getPerfPMUID = [](const std::string & type, int num) + { + int id = -1; + std::ostringstream pmuIDPath(std::ostringstream::out); + pmuIDPath << std::string("/sys/bus/event_source/devices/uncore_") << type; + if (num != -1) + { + pmuIDPath << "_" << num; + } + pmuIDPath << "/type"; + const std::string pmuIDStr = readSysFS(pmuIDPath.str().c_str(), true); + if (pmuIDStr.size()) + { + id = std::atoi(pmuIDStr.c_str()); + } + return id; + }; + std::vector ids; + for (int i = -1; i < max_id; ++i) + { + int pmuID = getPerfPMUID(type, i); + if (pmuID > 0) + { + // std::cout << "DEBUG: " << type << " pmu id "<< pmuID << " found" << std::endl; + ids.push_back(pmuID); + } + } + return ids; +} + +void populatePerfPMUs(unsigned socket_, const std::vector & ids, std::vector & pmus, bool fixed) +{ + for (const auto & id : ids) + { + std::shared_ptr controlReg0 = std::make_shared(socket_, id); + std::shared_ptr controlReg1 = std::make_shared(socket_, id); + std::shared_ptr controlReg2 = std::make_shared(socket_, id); + std::shared_ptr controlReg3 = std::make_shared(socket_, id); + std::shared_ptr counterReg0 = std::make_shared(controlReg0); + std::shared_ptr counterReg1 = std::make_shared(controlReg1); + std::shared_ptr counterReg2 = std::make_shared(controlReg2); + std::shared_ptr counterReg3 = std::make_shared(controlReg3); + std::shared_ptr fixedControlReg = std::make_shared(socket_, id, true); + std::shared_ptr fixedCounterReg = std::make_shared(fixedControlReg); + pmus.push_back( + UncorePMU( + std::make_shared(), + controlReg0, + controlReg1, + controlReg2, + controlReg3, + counterReg0, + counterReg1, + counterReg2, + counterReg3, + fixed ? fixedControlReg : std::shared_ptr(), + fixed ? fixedCounterReg : std::shared_ptr(), + std::make_shared(), + std::make_shared() + ) + ); + } +} +#endif + +void ServerPCICFGUncore::initPerf(uint32 socket_, const PCM * pcm) +{ +#ifdef PCM_USE_PERF + auto imcIDs = enumeratePerfPMUs("imc", 100); + auto m2mIDs = enumeratePerfPMUs("m2m", 100); + auto haIDs = enumeratePerfPMUs("ha", 100); + auto numMemControllers = std::max(m2mIDs.size(), haIDs.size()); + for (size_t i = 0; i < numMemControllers; ++i) + { + const int channelsPerController = imcIDs.size() / numMemControllers; + num_imc_channels.push_back(channelsPerController); + } + populatePerfPMUs(socket_, imcIDs, imcPMUs, true); + populatePerfPMUs(socket_, m2mIDs, m2mPMUs, false); + populatePerfPMUs(socket_, enumeratePerfPMUs("qpi", 100), xpiPMUs, false); + populatePerfPMUs(socket_, enumeratePerfPMUs("upi", 100), xpiPMUs, false); #endif - std::cerr << "Socket "<read(UCLK_FIXED_CTR_ADDR, &result); + if (socket_ < uboxPMUs.size()) + { + result = *uboxPMUs[socket_].fixedCounterValue; + } return result; } @@ -5830,9 +6175,9 @@ void PCM::programLLCReadMissLatencyEvents() const uint32 opCode = (SKX == cpu_model) ? 0x202 : 0x182; programCbo(events, opCode); - for (int32 i = 0; (i < num_sockets) && MSR.size(); ++i) + for (auto & pmu : uboxPMUs) { - MSR[socketRefCore[i]]->write(UCLK_FIXED_CTL_ADDR, UCLK_FIXED_CTL_EN); + *pmu.fixedCounterControl = UCLK_FIXED_CTL_EN; } } diff --git a/cpucounters.h b/cpucounters.h index 4f1ef4fd..38905b8f 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -291,6 +291,9 @@ class ServerPCICFGUncore std::vector qpi_speed; std::vector num_imc_channels; // number of memory channels in each memory controller std::vector > XPIRegisterLocation; // (device, function) + std::vector > > MCRegisterLocation; // MCRegisterLocation[controller]: (device, function) + std::vector > EDCRegisterLocation; // EDCRegisterLocation: (device, function) + std::vector > M2MRegisterLocation; // M2MRegisterLocation: (device, function) static PCM_Util::Mutex socket2busMutex; static std::vector > socket2iMCbus; @@ -313,6 +316,10 @@ class ServerPCICFGUncore void cleanupQPIHandles(); void cleanupPMUs(); void writeAllUnitControl(const uint32 value); + void initDirect(uint32 socket_, const PCM * pcm); + void initPerf(uint32 socket_, const PCM * pcm); + void initBuses(uint32 socket_, const PCM * pcm); + void initRegisterLocations(); public: //! \brief Initialize access data structures @@ -453,6 +460,8 @@ class SimpleCounterState typedef SimpleCounterState PCIeCounterState; typedef SimpleCounterState IIOCounterState; +class PerfVirtualControlRegister; + #ifndef HACK_TO_REMOVE_DUPLICATE_ERROR template class PCM_API std::allocator; template class PCM_API std::vector; @@ -472,6 +481,7 @@ class PCM_API PCM { friend class BasicCounterState; friend class UncoreCounterState; + friend class PerfVirtualControlRegister; PCM(); // forbidden to call directly because it is a singleton int32 cpu_family; @@ -514,6 +524,7 @@ class PCM_API PCM std::vector > server_pcicfg_uncore; std::vector pcuPMUs; std::vector > iioPMUs; + std::vector uboxPMUs; double joulesPerEnergyUnit; std::vector > energy_status; std::vector > dram_energy_status; @@ -819,7 +830,16 @@ class PCM_API PCM return (PCM::SKX == cpu_model) && (cpu_stepping > 4); } + void initUncorePMUsDirect(); + void initUncorePMUsPerf(); + public: + //! check if in secure boot mode + bool isSecureBoot() const; + + //! true if Linux perf for uncore PMU programming should AND can be used internally + bool useLinuxPerfForUncore() const; + /*! \brief checks if QOS monitoring support present diff --git a/types.h b/types.h index cd037b89..06cd8dfe 100644 --- a/types.h +++ b/types.h @@ -698,6 +698,10 @@ struct BecktonUncorePMUCNTCTLRegister #define UCLK_FIXED_CTR_ADDR (0x704) #define UCLK_FIXED_CTL_ADDR (0x703) +#define UBOX_MSR_PMON_CTL0_ADDR (0x705) +#define UBOX_MSR_PMON_CTL1_ADDR (0x706) +#define UBOX_MSR_PMON_CTR0_ADDR (0x709) +#define UBOX_MSR_PMON_CTR1_ADDR (0x70a) #define JKTIVT_PCU_MSR_PMON_CTR3_ADDR (0x0C39) #define JKTIVT_PCU_MSR_PMON_CTR2_ADDR (0x0C38) From 6106135afaa48df902835c9c555dbd7d97ce478b Mon Sep 17 00:00:00 2001 From: Roman Dementiev Date: Wed, 30 Jan 2019 08:58:50 +0100 Subject: [PATCH 04/16] enable Linux perf interface for daemon --- daemon/daemon/Debug/makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/daemon/daemon/Debug/makefile b/daemon/daemon/Debug/makefile index 29244068..311bc6b2 100644 --- a/daemon/daemon/Debug/makefile +++ b/daemon/daemon/Debug/makefile @@ -39,6 +39,11 @@ endif # All Target all: daemon +# rely on Linux perf support (user needs CAP_SYS_ADMIN privileges), comment out to disable +ifneq ($(wildcard /usr/include/linux/perf_event.h),) +CXXFLAGS += -DPCM_USE_PERF +endif + # Tool invocations daemon: pre-build $(OBJS) @echo 'Building target: $@' @@ -54,7 +59,7 @@ clean: pre-build: -@echo 'Build PCM' - -g++ -c ../../../*.cpp -std=c++0x + -g++ ${CXXFLAGS} -c ../../../*.cpp -std=c++0x -@echo ' ' .PHONY: all clean dependents From 2148bb28eacfa0a5e21a409d5cb3874c82773d63 Mon Sep 17 00:00:00 2001 From: Roman Dementiev Date: Wed, 30 Jan 2019 10:13:51 +0100 Subject: [PATCH 05/16] implement filter programming for uncore perf Change-Id: I46afe7e872e35e3b319b3f0879228db8e51ae9c4 --- cpucounters.cpp | 98 ++++++++++++++++++++++++++++--------------------- 1 file changed, 57 insertions(+), 41 deletions(-) diff --git a/cpucounters.cpp b/cpucounters.cpp index f0e2b7da..1e0f4d3d 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -1548,7 +1548,7 @@ void PCM::initUncorePMUsDirect() #ifdef PCM_USE_PERF std::vector enumeratePerfPMUs(const std::string & type, int max_id); -void populatePerfPMUs(unsigned socket_, const std::vector & ids, std::vector & pmus, bool fixed); +void populatePerfPMUs(unsigned socket_, const std::vector & ids, std::vector & pmus, bool fixed, bool filter0 = false, bool filter1 = false); #endif void PCM::initUncorePMUsPerf() @@ -1558,10 +1558,10 @@ void PCM::initUncorePMUsPerf() cboPMUs.resize(num_sockets); for (uint32 s = 0; s < (uint32)num_sockets; ++s) { - populatePerfPMUs(s, enumeratePerfPMUs("pcu", 100), pcuPMUs, false); + populatePerfPMUs(s, enumeratePerfPMUs("pcu", 100), pcuPMUs, false, true); populatePerfPMUs(s, enumeratePerfPMUs("ubox", 100), uboxPMUs, true); - populatePerfPMUs(s, enumeratePerfPMUs("cbox", 100), cboPMUs[s], false); - populatePerfPMUs(s, enumeratePerfPMUs("cha", 200), cboPMUs[s], false); + populatePerfPMUs(s, enumeratePerfPMUs("cbox", 100), cboPMUs[s], false, true, true); + populatePerfPMUs(s, enumeratePerfPMUs("cha", 200), cboPMUs[s], false, true, true); std::vector iioPMUVector; populatePerfPMUs(s, enumeratePerfPMUs("iio", 100), iioPMUVector, false); for (size_t i = 0; i < iioPMUVector.size(); ++i) @@ -4852,32 +4852,12 @@ class PerfVirtualDummyUnitControlRegister : public HWRegister } }; -class PerfVirtualFilterRegister : public HWRegister -{ - uint64 lastValue; - void printError() - { - static bool printed = false; - if (!printed) std::cerr << "ERROR: perf uncore interface does not support filter registers yet." << std::endl; - printed = true; - } -public: - PerfVirtualFilterRegister() : lastValue(0) {} - void operator = (uint64 val) override - { - printError(); - lastValue = val; - } - operator uint64 () override - { - printError(); - return lastValue; - } -}; +class PerfVirtualFilterRegister; class PerfVirtualControlRegister : public HWRegister { friend class PerfVirtualCounterRegister; + friend class PerfVirtualFilterRegister; int fd; int socket; int pmuID; @@ -4952,6 +4932,38 @@ class PerfVirtualCounterRegister : public HWRegister } }; +class PerfVirtualFilterRegister : public HWRegister +{ + uint64 lastValue; + std::array, 4> controlRegs; + int filterNr; +public: + PerfVirtualFilterRegister(std::array, 4> & controlRegs_, int filterNr_) : + lastValue(0), + controlRegs(controlRegs_), + filterNr(filterNr_) + { + } + void operator = (uint64 val) override + { + lastValue = val; + for (auto & ctl: controlRegs) + { + union { + uint64 config1; + uint32 config1HL[2]; + } cvt; + cvt.config1 = ctl->event.config1; + cvt.config1HL[filterNr] = val; + ctl->event.config1 = cvt.config1; + } + } + operator uint64 () override + { + return lastValue; + } +}; + std::vector enumeratePerfPMUs(const std::string & type, int max_id) { auto getPerfPMUID = [](const std::string & type, int num) @@ -4984,35 +4996,39 @@ std::vector enumeratePerfPMUs(const std::string & type, int max_id) return ids; } -void populatePerfPMUs(unsigned socket_, const std::vector & ids, std::vector & pmus, bool fixed) +void populatePerfPMUs(unsigned socket_, const std::vector & ids, std::vector & pmus, bool fixed, bool filter0, bool filter1) { for (const auto & id : ids) { - std::shared_ptr controlReg0 = std::make_shared(socket_, id); - std::shared_ptr controlReg1 = std::make_shared(socket_, id); - std::shared_ptr controlReg2 = std::make_shared(socket_, id); - std::shared_ptr controlReg3 = std::make_shared(socket_, id); - std::shared_ptr counterReg0 = std::make_shared(controlReg0); - std::shared_ptr counterReg1 = std::make_shared(controlReg1); - std::shared_ptr counterReg2 = std::make_shared(controlReg2); - std::shared_ptr counterReg3 = std::make_shared(controlReg3); + std::array, 4> controlRegs = { + std::make_shared(socket_, id), + std::make_shared(socket_, id), + std::make_shared(socket_, id), + std::make_shared(socket_, id) + }; + std::shared_ptr counterReg0 = std::make_shared(controlRegs[0]); + std::shared_ptr counterReg1 = std::make_shared(controlRegs[1]); + std::shared_ptr counterReg2 = std::make_shared(controlRegs[2]); + std::shared_ptr counterReg3 = std::make_shared(controlRegs[3]); std::shared_ptr fixedControlReg = std::make_shared(socket_, id, true); std::shared_ptr fixedCounterReg = std::make_shared(fixedControlReg); + std::shared_ptr filterReg0 = std::make_shared(controlRegs, 0); + std::shared_ptr filterReg1 = std::make_shared(controlRegs, 1); pmus.push_back( UncorePMU( std::make_shared(), - controlReg0, - controlReg1, - controlReg2, - controlReg3, + controlRegs[0], + controlRegs[1], + controlRegs[2], + controlRegs[3], counterReg0, counterReg1, counterReg2, counterReg3, fixed ? fixedControlReg : std::shared_ptr(), fixed ? fixedCounterReg : std::shared_ptr(), - std::make_shared(), - std::make_shared() + filter0 ? filterReg0 : std::shared_ptr(), + filter1 ? filterReg1 : std::shared_ptr() ) ); } From ce4bfc46f4d0a3246c9c9d7e095a31e96812d7fc Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Wed, 30 Jan 2019 10:46:12 +0100 Subject: [PATCH 06/16] update year Change-Id: I84a086a360d5cbe2bce89c0b4315618f01ff6f52 --- cpucounters.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpucounters.h b/cpucounters.h index 38905b8f..60cea958 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -1,5 +1,5 @@ /* -Copyright (c) 2009-2018, Intel Corporation +Copyright (c) 2009-2019, Intel Corporation All rights reserved. Redistribution and use in source and binary forms, with or without modification, are permitted provided that the following conditions are met: From 73d9d8453c4b5c7d7cc4572498c210ea4685c450 Mon Sep 17 00:00:00 2001 From: Roman Dementiev Date: Wed, 30 Jan 2019 13:40:21 +0100 Subject: [PATCH 07/16] disable and re-enable NMI watchdog when needed Change-Id: I460a85db7d48fc0f6759e95eeb00e52890b4d03d --- cpucounters.cpp | 80 ++++++++++++++++++++++++++++++++----------------- cpucounters.h | 2 ++ 2 files changed, 55 insertions(+), 27 deletions(-) diff --git a/cpucounters.cpp b/cpucounters.cpp index 1e0f4d3d..a056a783 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -736,19 +736,38 @@ std::string readSysFS(const char * path, bool silent = false) FILE * f = fopen(path, "r"); if (!f) { - if (silent == false) std::cerr << "Can not open "<< path <<" file." << std::endl; + if (silent == false) std::cerr << "ERROR: Can not open "<< path <<" file." << std::endl; return std::string(); } char buffer[1024]; if(NULL == fgets(buffer, 1024, f)) { - if (silent == false) std::cerr << "Can not read "<< path << "." << std::endl; + if (silent == false) std::cerr << "ERROR: Can not read from "<< path << "." << std::endl; + fclose(f); return std::string(); } fclose(f); return std::string(buffer); } +bool writeSysFS(const char * path, const std::string & value, bool silent = false) +{ + FILE * f = fopen(path, "w"); + if (!f) + { + if (silent == false) std::cerr << "ERROR: Can not open " << path << " file." << std::endl; + return false; + } + if (fputs(value.c_str(), f) < 0) + { + if (silent == false) std::cerr << "ERROR: Can not write to " << path << "." << std::endl; + fclose(f); + return false; + } + fclose(f); + return true; +} + int readMaxFromSysFS(const char * path) { std::string content = readSysFS(path); @@ -1574,35 +1593,30 @@ void PCM::initUncorePMUsPerf() #ifdef __linux__ +#define PCM_NMI_WATCHDOG_PATH "/proc/sys/kernel/nmi_watchdog" + bool isNMIWatchdogEnabled() { - FILE * f = fopen("/proc/sys/kernel/nmi_watchdog", "r"); - if (!f) + const auto watchdog = readSysFS(PCM_NMI_WATCHDOG_PATH); + if (watchdog.length() == 0) { return false; } - char buffer[1024]; - if(NULL == fgets(buffer, 1024, f)) - { - std::cerr << "Can not read /proc/sys/kernel/nmi_watchdog ." << std::endl; - fclose(f); - return true; - } - int enabled = 1; - pcm_sscanf(buffer) >> enabled; - fclose(f); - - if(enabled == 1) - { - std::cerr << "Error: NMI watchdog is enabled. This consumes one hw-PMU counter" << std::endl; - std::cerr << " to disable NMI watchdog please run under root: echo 0 > /proc/sys/kernel/nmi_watchdog"<< std::endl; - std::cerr << " or to disable it permanently: echo 'kernel.nmi_watchdog=0' >> /etc/sysctl.conf "<< std::endl; - } + return (std::atoi(watchdog.c_str()) == 1); +} - return (enabled == 1); +void disableNMIWatchdog() +{ + std::cout << "Disabling NMI watchdog since it consumes one hw-PMU counter." << std::endl; + writeSysFS(PCM_NMI_WATCHDOG_PATH, "0"); } +void enableNMIWatchdog() +{ + std::cout << " Re-enabling NMI watchdog." << std::endl; + writeSysFS(PCM_NMI_WATCHDOG_PATH, "1"); +} #endif class CoreTaskQueue @@ -1692,7 +1706,8 @@ PCM::PCM() : canUsePerf(false), outfile(NULL), backup_ofile(NULL), - run_state(1) + run_state(1), + needToRestoreNMIWatchdog(false) { #ifdef _MSC_VER TCHAR driverPath[1040]; // length for current directory + "\\msr.sys" @@ -1713,10 +1728,6 @@ PCM::PCM() : if(!checkModel()) return; -#ifdef __linux__ - if (isNMIWatchdogEnabled()) return; -#endif - initCStateSupportTables(); if(!discoverSystemTopology()) return; @@ -1729,6 +1740,14 @@ PCM::PCM() : if(!detectNominalFrequency()) return; +#ifdef __linux__ + if (isNMIWatchdogEnabled()) + { + disableNMIWatchdog(); + needToRestoreNMIWatchdog = true; + } +#endif + initEnergyMonitoring(); initUncoreObjects(); @@ -3087,6 +3106,13 @@ void PCM::cleanup() cleanupUncorePMUs(); freeRMID(); +#ifdef __linux__ + if (needToRestoreNMIWatchdog) + { + enableNMIWatchdog(); + needToRestoreNMIWatchdog = false; + } +#endif } // hle is only available when cpuid has this: diff --git a/cpucounters.h b/cpucounters.h index 60cea958..caa308b5 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -746,6 +746,8 @@ class PCM_API PCM std::streambuf * backup_ofile; // backup of original output = cout int run_state; // either running (1) or sleeping (0) + bool needToRestoreNMIWatchdog; + std::vector > lastProgrammedCustomCounters; uint32 checkCustomCoreProgramming(std::shared_ptr msr); void reservePMU(); From 8a9447de51b032daa2c6b42cf029cf2a3a74e868 Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Wed, 30 Jan 2019 20:31:31 +0100 Subject: [PATCH 08/16] fix compilation on OSX Change-Id: I380a4e22bc7a76562a0f2599c5ca8c884cce18be --- mmio.cpp | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/mmio.cpp b/mmio.cpp index c6bfce25..986a9f78 100644 --- a/mmio.cpp +++ b/mmio.cpp @@ -93,9 +93,12 @@ MMIORange::MMIORange(uint64 baseAddr_, uint64 /* size_ */, bool readonly_) : sta #include "PCIDriverInterface.h" -MMIORange::MMIORange(uint64 physical_address, uint64 size_, bool readonly_) : mmapAddr(NULL), readonly(readonly_) +MMIORange::MMIORange(uint64 physical_address, uint64 size_, bool readonly_) : + mmapAddr(NULL), + size(size_), + readonly(readonly_) { - if (size_ > 4096) + if (size > 4096) { std::cerr << "PCM Error: the driver does not support mapping of regions > 4KB" << std::endl; return; From 031f8641cb62c7f792a3eec4d2ce42ac576a813d Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Sat, 2 Feb 2019 18:18:15 +0100 Subject: [PATCH 09/16] fix uncore iMC latency computation previously for socket i the latency on memory controller channel i was measured (also running out of bounds if the number of sockets is larger than the number of channels). Change-Id: I5aa248188980f9ad9ae2e5dec3d6e64beaad5fb9 --- pcm-latency.cpp | 29 ++++++++++++++++++----------- 1 file changed, 18 insertions(+), 11 deletions(-) diff --git a/pcm-latency.cpp b/pcm-latency.cpp index 4c06b3a1..a91f4e8b 100644 --- a/pcm-latency.cpp +++ b/pcm-latency.cpp @@ -54,7 +54,7 @@ using namespace std; #define MAX_CORES 4096 EventSelectRegister regs[2]; -const uint8_t max_sockets = 8; +const uint8_t max_sockets = 64; struct socket_info_uncore { @@ -126,25 +126,32 @@ void store_latency_uncore(PCM *m, bool ddr, int delay_ms) { for (unsigned int i=0; igetNumSockets(); i++) { - uncore_event[ddr].skt[i].socket_id = i; + uncore_event[ddr].skt[i].socket_id = i; const double delay_seconds = double(delay_ms) / 1000.; DRAMSpeed = double(getDRAMClocks(0, BeforeState[i], AfterState[i]))/(double(1e9) * delay_seconds); - if (getMCCounter(i,1,BeforeState[i], AfterState[i]) == 0) + uncore_event[ddr].skt[i].rinsert = 0; + uncore_event[ddr].skt[i].roccupancy = 0; + uncore_event[ddr].skt[i].winsert = 0; + uncore_event[ddr].skt[i].woccupancy = 0; + for (size_t channel = 0; channel < m->getMCChannelsPerSocket(); ++channel) + { + uncore_event[ddr].skt[i].rinsert += (double)getMCCounter(channel, RPQ_INS, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].roccupancy += (double)getMCCounter(channel, RPQ_OCC, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].winsert += (double)getMCCounter(channel, WPQ_INS, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].woccupancy += (double)getMCCounter(channel, WPQ_OCC, BeforeState[i], AfterState[i]); + } + if (uncore_event[ddr].skt[i].rinsert == 0.) { uncore_event[ddr].skt[i].rlatency = 0; } else { - uncore_event[ddr].skt[i].rinsert = (double)getMCCounter(i,RPQ_INS,BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].roccupancy = (double)getMCCounter(i,RPQ_OCC,BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].rlatency = (double)getMCCounter(i,RPQ_OCC,BeforeState[i], AfterState[i])/getMCCounter(i,RPQ_INS,BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].rlatency = uncore_event[ddr].skt[i].roccupancy / uncore_event[ddr].skt[i].rinsert; } - if (getMCCounter(i,3,BeforeState[i], AfterState[i]) == 0) + if (uncore_event[ddr].skt[i].winsert == 0.) { uncore_event[ddr].skt[i].wlatency = 0; } else { - uncore_event[ddr].skt[i].wlatency = (double)getMCCounter(i,WPQ_OCC,BeforeState[i], AfterState[i])/getMCCounter(i,WPQ_INS,BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].winsert = (double)getMCCounter(i,WPQ_INS,BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].woccupancy = (double)getMCCounter(i,WPQ_OCC,BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].wlatency = uncore_event[ddr].skt[i].woccupancy / uncore_event[ddr].skt[i].winsert; } swap(BeforeState[i], AfterState[i]); @@ -309,7 +316,7 @@ void print_all_stats(PCM *m, bool enable_pmm, bool enable_verbose) tmp_core.push_back(tmp); } } - core_size_per_socket = tmp_core.size(); + core_size_per_socket = (unsigned int)tmp_core.size(); tmp_thread.push_back(tmp_core); } sk_th.push_back(tmp_thread); From 9491fe8c5c4e5da513827dafebd399e89b66d540 Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Sat, 2 Feb 2019 19:27:36 +0100 Subject: [PATCH 10/16] populate thread_id and tile_id on Windows and *BSD Change-Id: Iaa46272445563c0b1edf048b0c210c00a867568e --- cpucounters.cpp | 132 +++++++++++++++++++++--------------------------- 1 file changed, 57 insertions(+), 75 deletions(-) diff --git a/cpucounters.cpp b/cpucounters.cpp index a056a783..74b4dc4d 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -788,27 +788,56 @@ bool PCM::discoverSystemTopology() socketIdMap_type socketIdMap; PCM_CPUID_INFO cpuid_args; - pcm_cpuid(1, cpuid_args); - - int apic_ids_per_package = extract_bits_ui(cpuid_args.array[1], 16, 23); - int apic_ids_per_core; + // init constants for CPU topology leaf 0xB + // adapted from Topology Enumeration Reference code for Intel 64 Architecture + // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration + int wasCoreReported = 0, wasThreadReported = 0; + int subleaf = 0, levelType, levelShift; + //uint32 coreSelectMask = 0, smtSelectMask = 0; + uint32 smtMaskWidth = 0; + //uint32 pkgSelectMask = (-1), pkgSelectMaskShift = 0; + uint32 corePlusSMTMaskWidth = 0; + uint32 coreMaskWidth = 0; - if (apic_ids_per_package == 0) { - std::cout << "apic_ids_per_package == 0" << std::endl; - return false; + TemporalThreadAffinity aff0(0); + do + { + pcm_cpuid(0xb, subleaf, cpuid_args); + if (cpuid_args.array[1] == 0) + { // if EBX ==0 then this subleaf is not valid, we can exit the loop + break; + } + levelType = extract_bits_ui(cpuid_args.array[2], 8, 15); + levelShift = extract_bits_ui(cpuid_args.array[0], 0, 4); + switch (levelType) + { + case 1: //level type is SMT, so levelShift is the SMT_Mask_Width + smtMaskWidth = levelShift; + wasThreadReported = 1; + break; + case 2: //level type is Core, so levelShift is the CorePlusSMT_Mask_Width + corePlusSMTMaskWidth = levelShift; + wasCoreReported = 1; + break; + default: + break; + } + subleaf++; + } while (1); } - pcm_cpuid(0xb, 0x0, cpuid_args); - - if (extract_bits_ui(cpuid_args.array[2], 8, 15) == 0x1) - apic_ids_per_core = extract_bits_ui(cpuid_args.array[1], 0, 15); + if (wasThreadReported && wasCoreReported) + { + coreMaskWidth = corePlusSMTMaskWidth - smtMaskWidth; + } + else if (!wasCoreReported && wasThreadReported) + { + coreMaskWidth = smtMaskWidth; + } else - apic_ids_per_core = 1; - - if (apic_ids_per_core == 0) { - std::cout << "apic_ids_per_core == 0" << std::endl; + std::cerr << "ERROR: Major problem? No leaf 0 under cpuid function 11." << std::endl; return false; } @@ -832,6 +861,14 @@ bool PCM::discoverSystemTopology() << " [the most significant bit = " << l2CacheMaskShift << "]" << std::endl; #endif + auto populateEntry = [&smtMaskWidth, &coreMaskWidth, &l2CacheMaskShift](TopologyEntry & entry, const int apic_id) + { + entry.thread_id = extract_bits_ui(apic_id, 0, smtMaskWidth - 1); + entry.core_id = extract_bits_ui(apic_id, smtMaskWidth, smtMaskWidth + coreMaskWidth - 1); + entry.socket = extract_bits_ui(apic_id, smtMaskWidth + coreMaskWidth, 31); + entry.tile_id = extract_bits_ui(apic_id, l2CacheMaskShift, 31); + }; + #ifdef _MSC_VER // version for Windows 7 and later version @@ -891,8 +928,8 @@ bool PCM::discoverSystemTopology() TopologyEntry entry; entry.os_id = i; - entry.socket = apic_id / apic_ids_per_package; - entry.core_id = (apic_id % apic_ids_per_package) / apic_ids_per_core; + + populateEntry(entry, apic_id); topology.push_back(entry); socketIdMap[entry.socket] = 0; @@ -906,58 +943,6 @@ bool PCM::discoverSystemTopology() TopologyEntry entry; #ifdef __linux__ - // init constants for CPU topology leaf 0xB - // adapted from Topology Enumeration Reference code for Intel 64 Architecture - // https://software.intel.com/en-us/articles/intel-64-architecture-processor-topology-enumeration - int wasCoreReported = 0, wasThreadReported = 0; - int subleaf = 0, levelType, levelShift; - //uint32 coreSelectMask = 0, smtSelectMask = 0; - uint32 smtMaskWidth = 0; - //uint32 pkgSelectMask = (-1), pkgSelectMaskShift = 0; - uint32 corePlusSMTMaskWidth = 0; - uint32 coreMaskWidth = 0; - - // This code needs to run affinitized to a single core, how do we make sure of that? - do - { - pcm_cpuid(0xb, subleaf, cpuid_args); - if (cpuid_args.array[1] == 0) - { // if EBX ==0 then this subleaf is not valid, we can exit the loop - break; - } - levelType = extract_bits_ui(cpuid_args.array[2], 8, 15); - levelShift = extract_bits_ui(cpuid_args.array[0], 0, 4); - switch (levelType) - { - case 1: //level type is SMT, so levelShift is the SMT_Mask_Width - smtMaskWidth = levelShift; - wasThreadReported = 1; - break; - case 2: //level type is Core, so levelShift is the CorePlusSMT_Mask_Width - corePlusSMTMaskWidth = levelShift; - wasCoreReported = 1; - break; - default: - break; - } - subleaf++; - } while (1); - - if(wasThreadReported && wasCoreReported) - { - coreMaskWidth = corePlusSMTMaskWidth - smtMaskWidth; - } - else if (!wasCoreReported && wasThreadReported) - { - coreMaskWidth = smtMaskWidth; - } - else - { - std::cerr << "ERROR: Major problem? No leaf 0 under cpuid function 11." << std::endl; - return false; - } - - num_cores = readMaxFromSysFS("/sys/devices/system/cpu/present"); if(num_cores == -1) { @@ -989,10 +974,7 @@ bool PCM::discoverSystemTopology() pcm_cpuid(0xb, 0x0, cpuid_args); int apic_id = cpuid_args.array[3]; - entry.thread_id = extract_bits_ui(apic_id, 0, smtMaskWidth-1); - entry.core_id = extract_bits_ui(apic_id, smtMaskWidth, smtMaskWidth+coreMaskWidth-1); - entry.socket = extract_bits_ui(apic_id, smtMaskWidth+coreMaskWidth, 31); - entry.tile_id = extract_bits_ui(apic_id, l2CacheMaskShift, 31); + populateEntry(entry, apic_id); topology[entry.os_id] = entry; socketIdMap[entry.socket] = 0; @@ -1090,8 +1072,8 @@ bool PCM::discoverSystemTopology() apic_id = cpuid_args_freebsd.data[3]; entry.os_id = i; - entry.socket = apic_id / apic_ids_per_package; - entry.core_id = (apic_id % apic_ids_per_package) / apic_ids_per_core; + + populateEntry(entry, apic_id); if (entry.socket == 0 && entry.core_id == 0) ++threads_per_core; From 6a0d26545f50ab78efd6d9975115812692ff81eb Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Sat, 2 Feb 2019 19:33:25 +0100 Subject: [PATCH 11/16] enable pcm-latency for SKL client Change-Id: If2108cdda3aee7cf22e3187736f3057cdd13c44f --- cpucounters.h | 5 +++-- pcm-latency.cpp | 23 +++++++++++++---------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/cpucounters.h b/cpucounters.h index caa308b5..d1bd7936 100644 --- a/cpucounters.h +++ b/cpucounters.h @@ -1572,8 +1572,9 @@ class PCM_API PCM return ( cpu_model == PCM::HASWELLX || cpu_model == PCM::BDX - || cpu_model == PCM::SKX - ); + || cpu_model == PCM::SKX + || cpu_model == PCM::SKL + ); } bool PMMTrafficMetricsAvailable() const diff --git a/pcm-latency.cpp b/pcm-latency.cpp index a91f4e8b..2156fa08 100644 --- a/pcm-latency.cpp +++ b/pcm-latency.cpp @@ -135,10 +135,10 @@ void store_latency_uncore(PCM *m, bool ddr, int delay_ms) uncore_event[ddr].skt[i].woccupancy = 0; for (size_t channel = 0; channel < m->getMCChannelsPerSocket(); ++channel) { - uncore_event[ddr].skt[i].rinsert += (double)getMCCounter(channel, RPQ_INS, BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].roccupancy += (double)getMCCounter(channel, RPQ_OCC, BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].winsert += (double)getMCCounter(channel, WPQ_INS, BeforeState[i], AfterState[i]); - uncore_event[ddr].skt[i].woccupancy += (double)getMCCounter(channel, WPQ_OCC, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].rinsert += (double)getMCCounter((uint32)channel, RPQ_INS, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].roccupancy += (double)getMCCounter((uint32)channel, RPQ_OCC, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].winsert += (double)getMCCounter((uint32)channel, WPQ_INS, BeforeState[i], AfterState[i]); + uncore_event[ddr].skt[i].woccupancy += (double)getMCCounter((uint32)channel, WPQ_OCC, BeforeState[i], AfterState[i]); } if (uncore_event[ddr].skt[i].rinsert == 0.) { @@ -324,9 +324,12 @@ void print_all_stats(PCM *m, bool enable_pmm, bool enable_verbose) print_core_stats(m, core_size_per_socket, sk_th); - print_ddr(m, enable_pmm); - if (enable_verbose) - print_verbose(m, enable_pmm); + if (m->hasPCICFGUncore()) + { + print_ddr(m, enable_pmm); + if (enable_verbose) + print_verbose(m, enable_pmm); + } } EventSelectRegister build_core_register(uint64 reg_used, uint64 value, uint64 usr, uint64 os, uint64 enable, uint64 umask, uint64 event_select, uint64 edge) @@ -386,14 +389,14 @@ void build_registers(PCM *m, PCM::ExtendedCustomCoreEventDescription conf, bool //Check if Online Cores = Available Cores. This version only supports available cores = online cores if (m->getNumCores() != m->getNumOnlineCores()) { - cout << "Number of online cores should be equal to number of available cores" << endl; + cout << "Number of online cores should be equal to number of available cores" << endl; exit(EXIT_FAILURE); } -//Check for Maximum Custom Core Events + //Check for Maximum Custom Core Events if (m->getMaxCustomCoreEvents() < 2) { - cout << "System should support a minimum of 2 Custom Core Events to run pcm-latency" << endl; + cout << "System should support a minimum of 2 Custom Core Events to run pcm-latency" << endl; exit(EXIT_FAILURE); } //Creating conf From f5061dc18eaf9f09718cd476af6702ef97354010 Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Sun, 3 Feb 2019 17:44:24 +0100 Subject: [PATCH 12/16] pcm-latency: implement average socket L1 cache miss latency metric Change-Id: I66885a0b08bf0a1e1fa3d0727f802bd6995999a7 --- pcm-latency.cpp | 37 +++++++++++++++++++++++++++++++------ 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/pcm-latency.cpp b/pcm-latency.cpp index 2156fa08..5c2af6c7 100644 --- a/pcm-latency.cpp +++ b/pcm-latency.cpp @@ -75,6 +75,7 @@ struct core_info double latency; double occ_rd; double insert_rd; + core_info() : core_id(0), socket(0), thread(0), latency(0.), occ_rd(0.), insert_rd(0.) {} }; struct socket_info_pci @@ -93,6 +94,7 @@ struct res_core { string name; vector core; + vector socket; } core_latency[10]; double DRAMSpeed; @@ -107,7 +109,6 @@ std::vector DummySocketStates; void collect_beforestate_uncore(PCM *m) { - for (unsigned int i=0; igetNumSockets(); i++) { BeforeState[i] = m->getServerUncorePowerState(i); @@ -176,10 +177,15 @@ void store_latency_core(PCM *m) { core_event[k].core.resize(MAX_CORES); } - + for (auto & s : core_latency[L1].socket) + { + s.occ_rd = 0; + s.insert_rd = 0; + } for (unsigned int i=0; igetNumCores(); i++) { - double frequency = (((double)getCycles(BeforeState_core[i], AfterState_core[i])/(double)getRefCycles(BeforeState_core[i], AfterState_core[i])) * (double)m->getNominalFrequency())/1000000000; + const double frequency = (((double)getCycles(BeforeState_core[i], AfterState_core[i]) / + (double)getRefCycles(BeforeState_core[i], AfterState_core[i])) * (double)m->getNominalFrequency()) / 1000000000; for(int j=0; j<2; j++)// 2 events { core_event[j].core[i].core_id = i; @@ -190,6 +196,13 @@ void store_latency_core(PCM *m) core_latency[L1].core[i].latency = ((core_event[FB_OCC_RD].core[i].latency/core_event[FB_INS_RD].core[i].latency)+extra_clocks_for_L1_miss)/frequency; core_latency[L1].core[i].occ_rd = (core_event[FB_OCC_RD].core[i].latency); core_latency[L1].core[i].insert_rd = (core_event[FB_INS_RD].core[i].latency); + const auto s = m->getSocketId(i); + core_latency[L1].socket[s].occ_rd += (core_latency[L1].core[i].occ_rd + extra_clocks_for_L1_miss * core_latency[L1].core[i].insert_rd) / frequency; + core_latency[L1].socket[s].insert_rd += core_latency[L1].core[i].insert_rd; + } + for (auto & s : core_latency[L1].socket) + { + s.latency = s.occ_rd / s.insert_rd; } swap(BeforeState_core, AfterState_core); swap(SysBeforeState, SysAfterState); @@ -267,8 +280,12 @@ void print_ddr(PCM *m, int ddr_ip) void print_core_stats(PCM *m, unsigned int core_size_per_socket, vector>> &sk_th) { - cout <getNumSockets(); sid++) { for (unsigned int tid=0; tid< m->getThreadsPerCore(); tid++) @@ -277,7 +294,7 @@ void print_core_stats(PCM *m, unsigned int core_size_per_socket, vectorgetNumSockets(); ++s) + { + cout << "Socket" << s << ": " << core_latency[L1].socket[s].latency << endl; + } } void print_all_stats(PCM *m, bool enable_pmm, bool enable_verbose) @@ -415,6 +439,7 @@ void build_registers(PCM *m, PCM::ExtendedCustomCoreEventDescription conf, bool { uncore_event[i].skt.resize(m->getNumSockets()); core_latency[i].core.resize(m->getNumCores()); + core_latency[i].socket.resize(m->getNumSockets()); } //Program Core and Uncore From 3189b9980120c2bbda1511a9eee754dc50ff994d Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Mon, 4 Feb 2019 10:32:34 +0100 Subject: [PATCH 13/16] improve robustness for ServerUncorePowerState Change-Id: I9605b0747e2a8e2f8ba732a68c5b8d5be5f792b7 --- cpucounters.cpp | 22 +++++++++++++++------- cpucounters.h | 43 +++++++++++++++++++++++-------------------- pcm-memory.cpp | 6 +++--- 3 files changed, 41 insertions(+), 30 deletions(-) diff --git a/cpucounters.cpp b/cpucounters.cpp index 74b4dc4d..52b80778 100644 --- a/cpucounters.cpp +++ b/cpucounters.cpp @@ -4137,26 +4137,34 @@ ServerUncorePowerState PCM::getServerUncorePowerState(uint32 socket) server_pcicfg_uncore[socket]->freezeCounters(); for(uint32 port=0;port < (uint32)server_pcicfg_uncore[socket]->getNumQPIPorts();++port) { + assert(port < result.QPIClocks.size()); result.QPIClocks[port] = server_pcicfg_uncore[socket]->getQPIClocks(port); + assert(port < result.QPIL0pTxCycles.size()); result.QPIL0pTxCycles[port] = server_pcicfg_uncore[socket]->getQPIL0pTxCycles(port); + assert(port < result.QPIL1Cycles.size()); result.QPIL1Cycles[port] = server_pcicfg_uncore[socket]->getQPIL1Cycles(port); } for (uint32 channel = 0; channel < (uint32)server_pcicfg_uncore[socket]->getNumMCChannels(); ++channel) { + assert(channel < result.DRAMClocks.size()); result.DRAMClocks[channel] = server_pcicfg_uncore[socket]->getDRAMClocks(channel); - for(uint32 cnt=0;cnt<4;++cnt) - result.MCCounter[channel][cnt] = server_pcicfg_uncore[socket]->getMCCounter(channel,cnt); + assert(channel < result.MCCounter.size()); + for (uint32 cnt = 0; cnt < ServerUncorePowerState::maxCounters; ++cnt) + result.MCCounter[channel][cnt] = server_pcicfg_uncore[socket]->getMCCounter(channel, cnt); } for (uint32 channel = 0; channel < (uint32)server_pcicfg_uncore[socket]->getNumEDCChannels(); ++channel) { + assert(channel < result.MCDRAMClocks.size()); result.MCDRAMClocks[channel] = server_pcicfg_uncore[socket]->getMCDRAMClocks(channel); - for(uint32 cnt=0;cnt<4;++cnt) - result.EDCCounter[channel][cnt] = server_pcicfg_uncore[socket]->getEDCCounter(channel,cnt); + assert(channel < result.EDCCounter.size()); + for (uint32 cnt = 0; cnt < ServerUncorePowerState::maxCounters; ++cnt) + result.EDCCounter[channel][cnt] = server_pcicfg_uncore[socket]->getEDCCounter(channel, cnt); } for (uint32 controller = 0; controller < (uint32)server_pcicfg_uncore[socket]->getNumMC(); ++controller) { - for(uint32 cnt=0;cnt<4;++cnt) - result.M2MCounter[controller][cnt] = server_pcicfg_uncore[socket]->getM2MCounter(controller,cnt); + assert(controller < result.M2MCounter.size()); + for (uint32 cnt = 0; cnt < ServerUncorePowerState::maxCounters; ++cnt) + result.M2MCounter[controller][cnt] = server_pcicfg_uncore[socket]->getM2MCounter(controller, cnt); } server_pcicfg_uncore[socket]->unfreezeCounters(); } @@ -4164,7 +4172,7 @@ ServerUncorePowerState PCM::getServerUncorePowerState(uint32 socket) { uint32 refCore = socketRefCore[socket]; TemporalThreadAffinity tempThreadAffinity(refCore); - for (int i = 0; i < 4 && socket < pcuPMUs.size(); ++i) + for (int i = 0; i < ServerUncorePowerState::maxCounters && socket < pcuPMUs.size(); ++i) result.PCUCounter[i] = *pcuPMUs[socket].counterValue[i]; // std::cout<< "values read: " << result.PCUCounter[0]<<" "< +#include #include #include #include @@ -2215,13 +2216,21 @@ class UncoreCounterState //! class ServerUncorePowerState : public UncoreCounterState { - uint64 QPIClocks[3], QPIL0pTxCycles[3], QPIL1Cycles[3]; - uint64 DRAMClocks[8]; - uint64 MCDRAMClocks[16]; - uint64 MCCounter[8][4]; // channel X counter - uint64 M2MCounter[2][4]; // M2M/iMC boxes x counter - uint64 EDCCounter[8][4]; // EDC controller X counter - uint64 PCUCounter[4]; +public: + enum { + maxControllers = 2, + maxChannels = 8, + maxXPILinks = 3, + maxCounters = 4 + }; +private: + std::array QPIClocks, QPIL0pTxCycles, QPIL1Cycles; + std::array DRAMClocks; + std::array MCDRAMClocks; + std::array, maxChannels> MCCounter; // channel X counter + std::array, maxControllers> M2MCounter; // M2M/iMC boxes x counter + std::array, maxChannels> EDCCounter; // EDC controller X counter + std::array PCUCounter; int32 PackageThermalHeadroom; uint64 InvariantTSC; // invariant time stamp counter friend class PCM; @@ -2254,22 +2263,16 @@ class ServerUncorePowerState : public UncoreCounterState //! Returns current thermal headroom below TjMax int32 getPackageThermalHeadroom() const { return PackageThermalHeadroom; } ServerUncorePowerState() : + QPIClocks{}, QPIL0pTxCycles{}, QPIL1Cycles{}, + DRAMClocks{}, + MCDRAMClocks{}, + MCCounter{}, + M2MCounter{}, + EDCCounter{}, + PCUCounter{}, PackageThermalHeadroom(0), InvariantTSC(0) { - memset(&(QPIClocks[0]), 0, 3 * sizeof(uint64)); - memset(&(QPIL0pTxCycles[0]), 0, 3 * sizeof(uint64)); - memset(&(QPIL1Cycles[0]), 0, 3 * sizeof(uint64)); - memset(&(DRAMClocks[0]), 0, 8 * sizeof(uint64)); - memset(&(MCDRAMClocks[0]), 0, 16 * sizeof(uint64)); - memset(&(PCUCounter[0]), 0, 4 * sizeof(uint64)); - for (int i = 0; i < 8; ++i) { - memset(&(MCCounter[i][0]), 0, 4 * sizeof(uint64)); - memset(&(EDCCounter[i][0]), 0, 4 * sizeof(uint64)); - } - for (int i = 0; i < 2; ++i) { - memset(&(M2MCounter[i][0]), 0, 4 * sizeof(uint64)); - } } }; diff --git a/pcm-memory.cpp b/pcm-memory.cpp index f212221e..da749063 100644 --- a/pcm-memory.cpp +++ b/pcm-memory.cpp @@ -58,9 +58,9 @@ using namespace std; const uint32 max_sockets = 256; -const uint32 max_imc_channels = 8; -const uint32 max_edc_channels = 8; -const uint32 max_imc_controllers = 2; +const uint32 max_imc_channels = ServerUncorePowerState::maxChannels; +const uint32 max_edc_channels = ServerUncorePowerState::maxChannels; +const uint32 max_imc_controllers = ServerUncorePowerState::maxControllers; typedef struct memdata { float iMC_Rd_socket_chan[max_sockets][max_imc_channels]; From 79f5910bb2ee0de8b1eeea31980db99f4dae517f Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Mon, 4 Feb 2019 17:07:18 +0100 Subject: [PATCH 14/16] pcm-daemon: resolve the warning mentioned in https://github.com/opcm/pcm/pull/93 the original fix caused segm fault because it tried to allocate a too big SharedPCMState object on the stack and resulted in stack overflow Change-Id: I207fa5887ff9aef67a6ebed3589e2880cf68f5ab --- daemon/daemon/daemon.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/daemon/daemon/daemon.cpp b/daemon/daemon/daemon.cpp index 2958d953..3face086 100644 --- a/daemon/daemon/daemon.cpp +++ b/daemon/daemon/daemon.cpp @@ -374,7 +374,7 @@ namespace PCMDaemon { } //Clear out shared memory - std::memset(sharedPCMState_, 0, sizeof(SharedPCMState)); + sharedPCMState_ = new (sharedPCMState_) SharedPCMState(); // use placement new operator } gid_t Daemon::resolveGroupName(const std::string& groupName) From 0be9e60df22e998dd0b81154df56fc125467a939 Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Wed, 6 Feb 2019 14:56:12 +0100 Subject: [PATCH 15/16] pcm-daemon: implement lastUpdateTscBegin and lastUpdateTscEnd Change-Id: I6c43605e56800d9ab03ac83a0887261a2b4e4ab0 --- daemon/client/client.cpp | 8 ++++---- daemon/client/main.cpp | 2 +- daemon/daemon/common.h | 13 +++++++++---- daemon/daemon/daemon.cpp | 7 ++++--- 4 files changed, 18 insertions(+), 12 deletions(-) diff --git a/daemon/client/client.cpp b/daemon/client/client.cpp index 2c158c83..31c7bb30 100644 --- a/daemon/client/client.cpp +++ b/daemon/client/client.cpp @@ -55,7 +55,7 @@ namespace PCMDaemon { //Set last updated timestamp to avoid a detected change //when the client starts - lastUpdatedClientTsc_ = sharedPCMState_->lastUpdateTsc; + lastUpdatedClientTsc_ = sharedPCMState_->lastUpdateTscEnd; } PCMDaemon::SharedPCMState& Client::read() @@ -75,7 +75,7 @@ namespace PCMDaemon { // Check client version matches daemon version if(strlen(sharedPCMState_->version) > 0 && strcmp(sharedPCMState_->version, VERSION) != 0) { - std::cout << sharedPCMState_->lastUpdateTsc << " " << lastUpdatedClientTsc_ << std::endl; + std::cout << sharedPCMState_->lastUpdateTscEnd << " " << lastUpdatedClientTsc_ << std::endl; std::stringstream ss; ss << "Out of date PCM daemon client. Client version: " << VERSION << " Daemon version: " << sharedPCMState_->version; @@ -85,7 +85,7 @@ namespace PCMDaemon { if(countersHaveUpdated()) { //There is new data - lastUpdatedClientTsc_ = sharedPCMState_->lastUpdateTsc; + lastUpdatedClientTsc_ = sharedPCMState_->lastUpdateTscEnd; return *sharedPCMState_; } @@ -99,7 +99,7 @@ namespace PCMDaemon { bool Client::countersHaveUpdated() { - return lastUpdatedClientTsc_ != sharedPCMState_->lastUpdateTsc; + return lastUpdatedClientTsc_ != sharedPCMState_->lastUpdateTscEnd; } void Client::setupSharedMemory() diff --git a/daemon/client/main.cpp b/daemon/client/main.cpp index 4abb6c72..37a05e72 100644 --- a/daemon/client/main.cpp +++ b/daemon/client/main.cpp @@ -43,7 +43,7 @@ int main(int argc, char *argv[]) // Display internal metrics printTitle("Last updated TSC"); - std::cout << state.lastUpdateTsc << std::endl; + std::cout << state.lastUpdateTscEnd << std::endl; printTitle("Timestamp"); std::cout << state.timestamp << std::endl; diff --git a/daemon/daemon/common.h b/daemon/daemon/common.h index cfe16e21..b8b40e48 100644 --- a/daemon/daemon/common.h +++ b/daemon/daemon/common.h @@ -19,7 +19,7 @@ #include static const char DEFAULT_SHM_ID_LOCATION[] = "/tmp/opcm-daemon-shm-id"; -static const char VERSION[] = "1.0.4"; +static const char VERSION[] = "1.0.5"; #define MAX_CPU_CORES 4096 #define MAX_SOCKETS 256 @@ -231,16 +231,21 @@ namespace PCMDaemon { struct SharedPCMState { char version[VERSION_SIZE]; - uint64 lastUpdateTsc; + uint64 lastUpdateTscBegin; uint64 timestamp; uint64 cyclesToGetPCMState; uint32 pollMs; SharedPCMCounters pcm; + uint64 lastUpdateTscEnd; public: SharedPCMState() : - lastUpdateTsc(0), - pollMs(-1) { + lastUpdateTscBegin(0), + timestamp(0), + cyclesToGetPCMState(0), + pollMs(-1), + lastUpdateTscEnd(0) + { memset(this->version, '\0', sizeof(char)*VERSION_SIZE); } } ALIGN(ALIGNMENT); diff --git a/daemon/daemon/daemon.cpp b/daemon/daemon/daemon.cpp index 3face086..424bb9a3 100644 --- a/daemon/daemon/daemon.cpp +++ b/daemon/daemon/daemon.cpp @@ -395,7 +395,7 @@ namespace PCMDaemon { memcpy (sharedPCMState_->version, VERSION, sizeof(VERSION)); sharedPCMState_->version[sizeof(VERSION)] = '\0'; - uint64 rdtscNow = RDTSC(); + sharedPCMState_->lastUpdateTscBegin = RDTSC(); updatePCMState(&systemStatesAfter_, &socketStatesAfter_, &coreStatesAfter_); @@ -415,12 +415,13 @@ namespace PCMDaemon { getPCMQPI(); } - sharedPCMState_->cyclesToGetPCMState = RDTSC() - rdtscNow; + const auto lastUpdateTscEnd = RDTSC(); + sharedPCMState_->cyclesToGetPCMState = lastUpdateTscEnd - sharedPCMState_->lastUpdateTscBegin; sharedPCMState_->timestamp = getTimestamp(); // As the client polls this timestamp (lastUpdateTsc) // All the data has to be in shm before - sharedPCMState_->lastUpdateTsc = rdtscNow; + sharedPCMState_->lastUpdateTscEnd = lastUpdateTscEnd; if(mode_ == Mode::DIFFERENCE) { swapPCMBeforeAfterState(); From 990bb8d79a8de73dcb5c57ba6b00e48391dc9540 Mon Sep 17 00:00:00 2001 From: "Dementiev, Roman" Date: Thu, 7 Feb 2019 09:58:24 +0100 Subject: [PATCH 16/16] pcm-sensor: don't normalize core cache misses addresses https://github.com/opcm/pcm/issues/115 Change-Id: Ie5f4fff96d6ae6ad638d23fd0d3d988c50b51652 --- pcm-sensor.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pcm-sensor.cpp b/pcm-sensor.cpp index 7b7765c1..3f2fe97a 100644 --- a/pcm-sensor.cpp +++ b/pcm-sensor.cpp @@ -589,8 +589,8 @@ int main() OUTPUT_CORE_METRIC("/IPC", (counters.get(i))) OUTPUT_CORE_METRIC("/L2CacheHitRatio", (counters.get(i))) OUTPUT_CORE_METRIC("/L3CacheHitRatio", (counters.get(i))) - OUTPUT_CORE_METRIC("/L2CacheMisses", (counters.get(i) / 1000000)) - OUTPUT_CORE_METRIC("/L3CacheMisses", (counters.get(i) / 1000000)) + OUTPUT_CORE_METRIC("/L2CacheMisses", (counters.get(i))) + OUTPUT_CORE_METRIC("/L3CacheMisses", (counters.get(i))) OUTPUT_CORE_METRIC("/L3Occupancy", (counters.get(i))) OUTPUT_CORE_METRIC("/LocalMemoryBandwidth", (counters.get(i))) OUTPUT_CORE_METRIC("/RemoteMemoryBandwidth", (counters.get(i)))