From 9ca193e2a0a509c08d7aa5ecd1a57d1c6df97cf0 Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Fri, 19 Jun 2020 12:46:01 +0200 Subject: [PATCH 1/6] make scratch use caching alloc --- EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h | 9 ++++++--- EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu | 6 +++--- .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc | 13 +++++++++---- 3 files changed, 18 insertions(+), 10 deletions(-) diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h index 606053edb6801..08fcf6fcdefd9 100644 --- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h +++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h @@ -6,6 +6,7 @@ #include "CUDADataFormats/HcalDigi/interface/DigiCollection.h" #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "ElectronicsMappingGPU.h" @@ -53,17 +54,19 @@ namespace hcal { struct ScratchDataGPU { // depends on tHE number of output collections // that is a statically known predefined number!!! - uint32_t *pChannelsCounters = nullptr; + cms::cuda::device::unique_ptr pChannelsCounters; + /* void allocate(ConfigurationParameters const &) { cudaCheck(cudaMalloc((void **)&pChannelsCounters, sizeof(uint32_t) * numOutputCollections)); - } + }*/ + /* void deallocate(ConfigurationParameters const &) { if (pChannelsCounters) { cudaCheck(cudaFree(pChannelsCounters)); } - } + }*/ }; struct OutputDataGPU { diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu index 1589ec6cb1661..9bc5b9975f3e7 100644 --- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu +++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu @@ -574,7 +574,7 @@ namespace hcal { nfedsWithData * sizeof(uint32_t), cudaMemcpyHostToDevice, cudaStream)); - cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters, 0, sizeof(uint32_t) * numOutputCollections, cudaStream)); + cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream)); cudaCheck(cudaMemcpyAsync( inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); @@ -591,7 +591,7 @@ namespace hcal { outputGPU.digisF5HB.npresamples, outputGPU.digisF3HB.data, outputGPU.digisF3HB.ids, - scratchGPU.pChannelsCounters, + scratchGPU.pChannelsCounters.get(), config.nsamplesF01HE, config.nsamplesF5HB, config.nsamplesF3HB, @@ -599,7 +599,7 @@ namespace hcal { cudaCheck(cudaGetLastError()); cudaCheck(cudaMemcpyAsync(outputCPU.nchannels.data(), - scratchGPU.pChannelsCounters, + scratchGPU.pChannelsCounters.get(), sizeof(uint32_t) * numOutputCollections, cudaMemcpyDeviceToHost, cudaStream)); diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc index 0e3a1a0d3b1e3..bc952fc936ba7 100644 --- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc +++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc @@ -12,6 +12,7 @@ #include "HeterogeneousCore/CUDACore/interface/ScopedContext.h" #include "HeterogeneousCore/CUDAServices/interface/CUDAService.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" #include "DeclsForKernels.h" #include "DecodeGPU.h" @@ -45,7 +46,6 @@ class HcalRawToDigiGPU : public edm::stream::EDProducer { hcal::raw::InputDataCPU inputCPU_; hcal::raw::InputDataGPU inputGPU_; hcal::raw::OutputDataGPU outputGPU_; - hcal::raw::ScratchDataGPU scratchGPU_; hcal::raw::OutputDataCPU outputCPU_; }; @@ -93,7 +93,6 @@ HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps) inputGPU_.allocate(); outputGPU_.allocate(config_); - scratchGPU_.allocate(config_); } } @@ -103,7 +102,6 @@ HcalRawToDigiGPU::~HcalRawToDigiGPU() { if (cs and cs->enabled()) { inputGPU_.deallocate(); outputGPU_.deallocate(config_); - scratchGPU_.deallocate(config_); } } @@ -125,6 +123,13 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event, edm::Handle rawDataHandle; event.getByToken(rawDataToken_, rawDataHandle); + // scratch + hcal::raw::ScratchDataGPU scratchGPU = { + cms::cuda::make_device_unique( + hcal::raw::numOutputCollections, + ctx.stream()) + }; + // iterate over feds // TODO: another idea // - loop over all feds to unpack and enqueue cuda memcpy @@ -159,7 +164,7 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event, hcal::raw::entryPoint(inputCPU_, inputGPU_, outputGPU_, - scratchGPU_, + scratchGPU, outputCPU_, conditions, config_, From c7f93923ad372f06f1fe2b61be28a245a91904a7 Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Fri, 19 Jun 2020 13:25:42 +0200 Subject: [PATCH 2/6] use caching allocator for input cpu --- .../HcalRawToDigi/plugins/DeclsForKernels.h | 20 +++++-------- .../HcalRawToDigi/plugins/DecodeGPU.cu | 6 ++-- .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc | 29 +++++++++++++------ 3 files changed, 30 insertions(+), 25 deletions(-) diff --git a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h index 08fcf6fcdefd9..591bdc40a48ed 100644 --- a/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h +++ b/EventFilter/HcalRawToDigi/plugins/DeclsForKernels.h @@ -7,6 +7,7 @@ #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" #include "HeterogeneousCore/CUDAUtilities/interface/cudaCheck.h" #include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" +#include "HeterogeneousCore/CUDAUtilities/interface/host_unique_ptr.h" #include "ElectronicsMappingGPU.h" @@ -34,6 +35,11 @@ namespace hcal { }; struct InputDataCPU { + cms::cuda::host::unique_ptr data; + cms::cuda::host::unique_ptr offsets; + cms::cuda::host::unique_ptr feds; + + /* std::vector> data; std::vector> offsets; std::vector> feds; @@ -42,7 +48,7 @@ namespace hcal { data.resize(utca_nfeds_max * sizeof(unsigned char) * nbytes_per_fed_max); offsets.resize(utca_nfeds_max, 0); feds.resize(utca_nfeds_max, 0); - } + }*/ }; struct OutputDataCPU { @@ -55,18 +61,6 @@ namespace hcal { // depends on tHE number of output collections // that is a statically known predefined number!!! cms::cuda::device::unique_ptr pChannelsCounters; - - /* - void allocate(ConfigurationParameters const &) { - cudaCheck(cudaMalloc((void **)&pChannelsCounters, sizeof(uint32_t) * numOutputCollections)); - }*/ - - /* - void deallocate(ConfigurationParameters const &) { - if (pChannelsCounters) { - cudaCheck(cudaFree(pChannelsCounters)); - } - }*/ }; struct OutputDataGPU { diff --git a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu index 9bc5b9975f3e7..35cb128ba3d15 100644 --- a/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu +++ b/EventFilter/HcalRawToDigi/plugins/DecodeGPU.cu @@ -568,15 +568,15 @@ namespace hcal { uint32_t const nbytesTotal) { // transfer cudaCheck(cudaMemcpyAsync( - inputGPU.data, inputCPU.data.data(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream)); + inputGPU.data, inputCPU.data.get(), nbytesTotal * sizeof(unsigned char), cudaMemcpyHostToDevice, cudaStream)); cudaCheck(cudaMemcpyAsync(inputGPU.offsets, - inputCPU.offsets.data(), + inputCPU.offsets.get(), nfedsWithData * sizeof(uint32_t), cudaMemcpyHostToDevice, cudaStream)); cudaCheck(cudaMemsetAsync(scratchGPU.pChannelsCounters.get(), 0, sizeof(uint32_t) * numOutputCollections, cudaStream)); cudaCheck(cudaMemcpyAsync( - inputGPU.feds, inputCPU.feds.data(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); + inputGPU.feds, inputCPU.feds.get(), nfedsWithData * sizeof(int), cudaMemcpyHostToDevice, cudaStream)); // 12 is the max number of modules per crate kernel_rawdecode_test<32><<>>(inputGPU.data, diff --git a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc index bc952fc936ba7..ce5e85659ebdc 100644 --- a/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc +++ b/EventFilter/HcalRawToDigi/plugins/HcalRawToDigiGPU.cc @@ -43,7 +43,6 @@ class HcalRawToDigiGPU : public edm::stream::EDProducer { hcal::raw::ConfigurationParameters config_; // FIXME move this to use raii - hcal::raw::InputDataCPU inputCPU_; hcal::raw::InputDataGPU inputGPU_; hcal::raw::OutputDataGPU outputGPU_; hcal::raw::OutputDataCPU outputCPU_; @@ -88,7 +87,6 @@ HcalRawToDigiGPU::HcalRawToDigiGPU(const edm::ParameterSet& ps) // reserve memory and call CUDA API functions only if CUDA is available edm::Service cs; if (cs and cs->enabled()) { - inputCPU_.allocate(); outputCPU_.allocate(); inputGPU_.allocate(); @@ -125,9 +123,22 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event, // scratch hcal::raw::ScratchDataGPU scratchGPU = { - cms::cuda::make_device_unique( - hcal::raw::numOutputCollections, - ctx.stream()) + cms::cuda::make_device_unique( + hcal::raw::numOutputCollections, + ctx.stream()) + }; + + // input cpu data + hcal::raw::InputDataCPU inputCPU = { + cms::cuda::make_host_unique( + hcal::raw::utca_nfeds_max * hcal::raw::nbytes_per_fed_max, + ctx.stream()), + cms::cuda::make_host_unique( + hcal::raw::utca_nfeds_max, + ctx.stream()), + cms::cuda::make_host_unique( + hcal::raw::utca_nfeds_max, + ctx.stream()) }; // iterate over feds @@ -151,17 +162,17 @@ void HcalRawToDigiGPU::acquire(edm::Event const& event, #endif // copy raw data into plain buffer - std::memcpy(inputCPU_.data.data() + currentCummOffset, data.data(), nbytes); + std::memcpy(inputCPU.data.get() + currentCummOffset, data.data(), nbytes); // set the offset in bytes from the start - inputCPU_.offsets[counter] = currentCummOffset; - inputCPU_.feds[counter] = fed; + inputCPU.offsets[counter] = currentCummOffset; + inputCPU.feds[counter] = fed; // this is the current offset into the vector currentCummOffset += nbytes; ++counter; } - hcal::raw::entryPoint(inputCPU_, + hcal::raw::entryPoint(inputCPU, inputGPU_, outputGPU_, scratchGPU, From a84d73103d874cf0f72c7c243d8e186fe7f802c7 Mon Sep 17 00:00:00 2001 From: Viktor Khristenko Date: Thu, 2 Jul 2020 17:36:23 +0200 Subject: [PATCH 3/6] using allocators for EventHFilter/HcalRawToDigi --- CUDADataFormats/HcalCommon/interface/Common.h | 16 ++ .../HcalCommon/src/classes_def.xml | 2 + CUDADataFormats/HcalDigi/src/classes_def.xml | 12 ++ .../HcalRecHitSoA/src/classes_def.xml | 3 + .../HcalRawToDigi/plugins/DeclsForKernels.h | 128 ++++++---------- .../HcalRawToDigi/plugins/DecodeGPU.cu | 28 ++-- .../plugins/HcalCPUDigisProducer.cc | 89 ++--------- .../plugins/HcalDigisProducerGPU.cc | 144 +++++++----------- .../HcalRawToDigi/plugins/HcalRawToDigiGPU.cc | 61 ++++---- 9 files changed, 192 insertions(+), 291 deletions(-) diff --git a/CUDADataFormats/HcalCommon/interface/Common.h b/CUDADataFormats/HcalCommon/interface/Common.h index 1a2592889a6c7..53d49d06a89d1 100644 --- a/CUDADataFormats/HcalCommon/interface/Common.h +++ b/CUDADataFormats/HcalCommon/interface/Common.h @@ -4,6 +4,7 @@ #include #include "HeterogeneousCore/CUDAUtilities/interface/HostAllocator.h" +#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h" namespace hcal { namespace common { @@ -13,6 +14,7 @@ namespace hcal { struct Vec {}; struct Ptr {}; + struct DevPtr {}; } // namespace tags @@ -24,6 +26,11 @@ namespace hcal { uint32_t size; }; + template<> + struct AddSize { + uint32_t size; + }; + struct ViewStoragePolicy { using TagType = tags::Ptr; @@ -33,6 +40,15 @@ namespace hcal { }; }; + struct DevStoragePolicy { + using TagType = tags::DevPtr; + + template + struct StorageSelector { + using type = cms::cuda::device::unique_ptr; + }; + }; + template