Skip to content

Commit

Permalink
Merge pull request #284 from chillenzer/multithreaded-init
Browse files Browse the repository at this point in the history
Multithreaded init
  • Loading branch information
chillenzer authored Feb 6, 2025
2 parents a1fb8ba + 5a11919 commit f6b570b
Show file tree
Hide file tree
Showing 3 changed files with 83 additions and 27 deletions.
69 changes: 53 additions & 16 deletions include/mallocMC/creationPolicies/FlatterScatter.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,16 @@

#include <alpaka/core/Common.hpp>
#include <alpaka/core/Positioning.hpp>
#include <alpaka/extent/Traits.hpp>
#include <alpaka/idx/Accessors.hpp>
#include <alpaka/idx/MapIdx.hpp>
#include <alpaka/kernel/Traits.hpp>
#include <alpaka/mem/fence/Traits.hpp>
#include <alpaka/mem/view/Traits.hpp>
#include <alpaka/mem/view/ViewPlainPtr.hpp>
#include <alpaka/vec/Vec.hpp>
#include <alpaka/workdiv/Traits.hpp>
#include <alpaka/workdiv/WorkDivHelpers.hpp>
#include <alpaka/workdiv/WorkDivMembers.hpp>

#include <sys/types.h>
Expand Down Expand Up @@ -86,14 +90,38 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc
MyAccessBlock* accessBlocks{};
uint32_t volatile block = 0U;

ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init() -> void
ALPAKA_FN_INLINE ALPAKA_FN_ACC static auto init(auto const& acc, void* accessBlocksPointer, auto heapSize)
-> void
{
for(uint32_t i = 0; i < numBlocks(); ++i)
auto threadsInGrid = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
auto numThreads = threadsInGrid.prod();
auto const [idx] = alpaka::mapIdx<1U>(alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc), threadsInGrid);
auto* accessBlocks = static_cast<MyAccessBlock*>(accessBlocksPointer);

for(uint32_t i = idx; i < numBlocks(heapSize) * MyAccessBlock::numPages(); i += numThreads)
{
accessBlocks[i].init();
auto blockIdx = i / MyAccessBlock::numPages();
auto pageIdx = i % MyAccessBlock::numPages();

accessBlocks[blockIdx].init(acc, pageIdx);
}
}

/**
* @brief Initialise the heap described by this instance.
*
* Forwards the `accessBlocks` pointer and `heapSize` stored in this object to the three-argument `init` overload.
*
* @param acc Accelerator object, handed through unchanged.
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init(auto const& acc) -> void
{
    auto* const firstAccessBlock = this->accessBlocks;
    init(acc, firstAccessBlock, this->heapSize);
}

/**
* @brief Number of access blocks that fit into a heap of the given size.
*
* The result is the quotient of `heapSize` and `T_HeapConfig::accessblocksize`, converted to `uint32_t`.
*
* @param heapSize Size of the heap to compute the block count for.
* @return Number of access blocks in a heap of that size.
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC static constexpr auto numBlocks(auto heapSize) -> uint32_t
{
    auto const wholeBlocks = heapSize / T_HeapConfig::accessblocksize;
    // Spell out the conversion to the return type that the return statement would otherwise do implicitly.
    return static_cast<uint32_t>(wholeBlocks);
}

/**
* @brief Number of access blocks in the heap. This is a runtime quantity because it depends on the given heap
* size.
Expand All @@ -102,7 +130,7 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto numBlocks() const -> uint32_t
{
return heapSize / T_HeapConfig::accessblocksize;
return numBlocks(heapSize);
}

/**
Expand Down Expand Up @@ -307,15 +335,22 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc
{
template<typename T_HeapConfig, typename T_HashConfig, typename T_AlignmentPolicy>
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto operator()(
auto const& /*unused*/,
auto const& acc,
Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>* m_heap,
void* m_heapmem,
size_t const m_memsize) const
{
m_heap->accessBlocks
= static_cast<Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>::MyAccessBlock*>(m_heapmem);
m_heap->heapSize = m_memsize;
m_heap->init();
auto const idx = alpaka::mapIdx<1U>(
alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc),
alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc));
if(idx == 0)
{
m_heap->accessBlocks
= static_cast<Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>::MyAccessBlock*>(m_heapmem);
m_heap->heapSize = m_memsize;
}
// We can't rely on thread 0 to finish the above before we start, so we use the static version:
Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>::init(acc, m_heapmem, m_memsize);
}
};

Expand Down Expand Up @@ -374,13 +409,15 @@ namespace mallocMC::CreationPolicies
template<typename TAcc>
static void initHeap([[maybe_unused]] auto& dev, auto& queue, auto* heap, void* pool, size_t memsize)
{
using Dim = typename alpaka::trait::DimType<TAcc>::type;
using Idx = typename alpaka::trait::IdxType<TAcc>::type;
using VecType = alpaka::Vec<Dim, Idx>;

auto workDivSingleThread
= alpaka::WorkDivMembers<Dim, Idx>{VecType::ones(), VecType::ones(), VecType::ones()};
alpaka::exec<TAcc>(queue, workDivSingleThread, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize);
using MyHeap = FlatterScatterAlloc::Heap<T_HeapConfig, T_HashConfig, T_AlignmentPolicy>;
auto numBlocks = MyHeap::numBlocks(memsize);
auto numPagesPerBlock = MyHeap::MyAccessBlock::numPages();

alpaka::KernelCfg<TAcc> const kernelCfg
= {numBlocks * numPagesPerBlock, 1U, false, alpaka::GridBlockExtentSubDivRestrictions::Unrestricted};
auto workDiv
= alpaka::getValidWorkDiv(kernelCfg, dev, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize);
alpaka::exec<TAcc>(queue, workDiv, FlatterScatterAlloc::InitKernel{}, heap, pool, memsize);
alpaka::wait(queue);
}

Expand Down
29 changes: 23 additions & 6 deletions include/mallocMC/creationPolicies/FlatterScatter/AccessBlock.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -109,19 +109,36 @@ namespace mallocMC::CreationPolicies::FlatterScatterAlloc

// This class is supposed to be reinterpreted on a piece of raw memory and not instantiated directly. We keep the
// constructor protected, so we can still test stuff in the future easily.
AccessBlock()
AccessBlock(auto const& acc)
{
init();
init(acc);
}

public:
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init() -> void
/**
* @brief Single-threaded initialisation loop. Used only for testing.
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init(auto const& acc) -> void
{
pageTable.cleanup();
for(uint32_t i = 0; i < numPages(); i++)
{
init(acc, i);
}
}

/**
* @brief Initialise the page given by its index. The call for page index 0 additionally cleans up the pageTable.
*/
ALPAKA_FN_INLINE ALPAKA_FN_ACC auto init(auto const& /*acc*/, auto const pageIdx) -> void
{
if(pageIdx == 0U)
{
pageTable.cleanup();
}
constexpr uint32_t dummyChunkSize = 1U;
for(auto& page : pages)
if(pageIdx < numPages())
{
MyPageInterpretation(page, dummyChunkSize).cleanupFull();
interpret(pageIdx, dummyChunkSize).cleanupFull();
}
}

Expand Down
12 changes: 7 additions & 5 deletions test/unit/source/AccessBlock.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,8 @@ struct TestableAccessBlock
: mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock<T_HeapConfig, T_AlignmentPolicy>
{
public:
TestableAccessBlock() = default;
explicit TestableAccessBlock(auto const& acc)
: mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock<T_HeapConfig, T_AlignmentPolicy>(acc) {};
using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock<T_HeapConfig, T_AlignmentPolicy>::blockSize;
using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock<T_HeapConfig, T_AlignmentPolicy>::pageSize;
using mallocMC::CreationPolicies::FlatterScatterAlloc::AccessBlock<T_HeapConfig, T_AlignmentPolicy>::wasteFactor;
Expand Down Expand Up @@ -116,7 +117,7 @@ TEMPLATE_LIST_TEST_CASE("AccessBlock", "", AccessBlocks)
constexpr auto const blockSize = AccessBlock::blockSize;
constexpr auto const pageSize = AccessBlock::pageSize;

AccessBlock accessBlock{};
AccessBlock accessBlock{accSerial};

SECTION("knows its number of pages.")
{
Expand Down Expand Up @@ -312,7 +313,8 @@ TEMPLATE_LIST_TEST_CASE("AccessBlock", "", AccessBlocks)
SECTION("with waste factor")
{
constexpr uint32_t const wastefactor = 3U;
TestableAccessBlock<HeapConfig<blockSize, pageSize, wastefactor>, AlignmentPolicy> wastedAccessBlock{};
TestableAccessBlock<HeapConfig<blockSize, pageSize, wastefactor>, AlignmentPolicy> wastedAccessBlock{
accSerial};
auto pointers = fillWith(wastedAccessBlock, chunkSize);

auto smallerChunkSize = chunkSize / (wastefactor - 1U);
Expand Down Expand Up @@ -374,7 +376,7 @@ TEMPLATE_LIST_TEST_CASE("AccessBlock", "", AccessBlocks)
TestableAccessBlock<
SelectivelyWastedHeapConfig<blockSize, pageSize, wastefactor, selectedNumBytes>,
AlignmentPolicy>
wastedAccessBlock{};
wastedAccessBlock{accSerial};
auto pointers = fillWith(wastedAccessBlock, chunkSize);

auto notSelectedNumBytes = chunkSize / (wastefactor - 1U);
Expand Down Expand Up @@ -596,7 +598,7 @@ TEST_CASE("AccessBlock (Regression)")
using AccessBlock
= TestableAccessBlock<HeapConfig<(pageSize + pageTableEntrySize), pageSize, wastefactor>, AlignmentPolicy>;

AccessBlock accessBlock{};
AccessBlock accessBlock{accSerial};

REQUIRE(accessBlock.getAvailableSlots(accSerial, chunkSizeOneMask) == numChunksOneMask);
REQUIRE(accessBlock.getAvailableSlots(accSerial, chunkSizeTwoMasks) == numChunksTwoMasks);
Expand Down

0 comments on commit f6b570b

Please sign in to comment.