Skip to content

Commit

Permalink
Fix round-off errors in find-peers (#479)
Browse files Browse the repository at this point in the history
integer based findPeers to guarantee commutatitivy
  • Loading branch information
sekelle authored Jan 27, 2025
1 parent 316fd29 commit 87cdae5
Show file tree
Hide file tree
Showing 9 changed files with 373 additions and 154 deletions.
12 changes: 9 additions & 3 deletions domain/include/cstone/domain/domain.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -214,8 +214,8 @@ class Domain
gatherArrays(sorter.gatherFunc(), sorter.getMap() + global_.numSendDown(), global_.numAssigned(), exchangeStart,
0, std::tie(h), util::reverse(scratch));

std::vector<int> peers = findPeersMac(myRank_, global_.assignment(), global_.octree(), box(), 1.0 / theta_);
float invThetaEff = invThetaMinMac(theta_);
std::vector<int> peers = findPeersMac(myRank_, global_.assignment(), global_.octree(), box(), invThetaEff);

if (firstCall_)
{
Expand All @@ -234,7 +234,13 @@ class Domain
halos_.discover(octreeView.prefixes, octreeView.childOffsets, octreeView.internalToLeaf, focusLeaves,
focusTree_.leafCountsAcc(), focusTree_.assignment(), {rawPtr(layoutAcc_), layoutAcc_.size()},
box(), rawPtr(h), haloSearchExt_, std::get<0>(scratch));
halos_.computeLayout(focusTree_.treeLeaves(), focusTree_.leafCounts(), focusTree_.assignment(), peers, layout_);
auto fail = halos_.computeLayout(focusTree_.treeLeaves(), focusTree_.leafCounts(), focusTree_.assignment(),
peers, layout_);
if (fail)
{
std::cout << "found halo outside peer area on rank " << myRank_ << std::endl;
MPI_Abort(MPI_COMM_WORLD, 67);
}

updateLayout(sorter, exchangeStart, keyView, particleKeys, std::tie(h),
std::tuple_cat(std::tie(x, y, z), particleProperties), scratch);
Expand Down Expand Up @@ -263,7 +269,7 @@ class Domain
gatherArrays(sorter.gatherFunc(), sorter.getMap() + global_.numSendDown(), global_.numAssigned(), exchangeStart,
0, std::tie(x, y, z, h, m), util::reverse(scratch));

float invThetaEff = invThetaVecMac(theta_);
float invThetaEff = invThetaMinToVec(theta_);
std::vector<int> peers = findPeersMac(myRank_, global_.assignment(), global_.octree(), box(), invThetaEff);

if (firstCall_)
Expand Down
7 changes: 3 additions & 4 deletions domain/include/cstone/sfc/box.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -82,14 +82,13 @@ HOST_DEVICE_FUN constexpr int pbcAdjust(int x)
}

//! @brief maps x into the range [-R/2+1: R/2+1] (-511 to 512 with R = 1024)
template<int R>
HOST_DEVICE_FUN constexpr int pbcDistance(int x)
HOST_DEVICE_FUN constexpr int pbcDistance(int x, int R)
{
// this version handles x outside -R, R
// int roundAwayFromZero = (x > 0) ? x + R/2 : x - R/2;
// return x -= R * (roundAwayFromZero / R);
assert(x >= -R);
assert(x <= R);
assert(R == 0 || x >= -R);
assert(R == 0 || x <= R);
int ret = (x <= -R / 2) ? x + R : x;
return (ret > R / 2) ? ret - R : ret;
}
Expand Down
54 changes: 34 additions & 20 deletions domain/include/cstone/traversal/boxoverlap.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,14 +47,12 @@ HOST_DEVICE_FUN constexpr bool overlapTwoRanges(T a, T b, T c, T d)

/*! @brief determine whether two ranges ab and cd overlap
*
* @tparam R periodic range
* @return true or false
* @param R periodic range
* @return true if ranges overlap, false otherwise
*
* Some restrictions apply, no input value can be further than R
* from the periodic range.
* Some restrictions apply, no input value can be further than R from the periodic range.
*/
template<int R>
HOST_DEVICE_FUN constexpr bool overlapRange(int a, int b, int c, int d)
HOST_DEVICE_FUN constexpr bool overlapRange(int a, int b, int c, int d, int R)
{
assert(a >= -R);
assert(a < R);
Expand All @@ -69,19 +67,38 @@ HOST_DEVICE_FUN constexpr bool overlapRange(int a, int b, int c, int d)
return overlapTwoRanges(a, b, c, d) || overlapTwoRanges(a + R, b + R, c, d) || overlapTwoRanges(a, b, c + R, d + R);
}

//! @brief compute minimal separation between ranges [a,b] and [c,d], R=0 means no periodicity
HOST_DEVICE_FUN inline int rangeSep(int a, int b, int c, int d, int R)
{
assert(a <= b && c <= d);
if (overlapTwoRanges(a, b, c, d)) { return 0; }
int d1 = pbcDistance(d - a, R);
int d2 = pbcDistance(c - b, R);
return stl::min(std::abs(d1), std::abs(d2));
}

//! @brief check whether two boxes overlap. takes PBC into account, boxes can wrap around
template<class KeyType>
HOST_DEVICE_FUN inline bool overlap(const IBox& a, const IBox& b)
HOST_DEVICE_FUN bool overlap(const IBox& a, const IBox& b)
{
constexpr int maxCoord = 1u << maxTreeLevel<KeyType>{};

bool xOverlap = overlapRange<maxCoord>(a.xmin(), a.xmax(), b.xmin(), b.xmax());
bool yOverlap = overlapRange<maxCoord>(a.ymin(), a.ymax(), b.ymin(), b.ymax());
bool zOverlap = overlapRange<maxCoord>(a.zmin(), a.zmax(), b.zmin(), b.zmax());
bool xOverlap = overlapRange(a.xmin(), a.xmax(), b.xmin(), b.xmax(), maxCoord);
bool yOverlap = overlapRange(a.ymin(), a.ymax(), b.ymin(), b.ymax(), maxCoord);
bool zOverlap = overlapRange(a.zmin(), a.zmax(), b.zmin(), b.zmax(), maxCoord);

return xOverlap && yOverlap && zOverlap;
}

//! @brief return separation between integer boxes a, b. @p pbc is the grid periodicity in each dimension
HOST_DEVICE_FUN inline Vec3<int> boxSeparation(IBox a, IBox b, Vec3<int> pbc)
{
int dx = rangeSep(a.xmin(), a.xmax(), b.xmin(), b.xmax(), pbc[0]);
int dy = rangeSep(a.ymin(), a.ymax(), b.ymin(), b.ymax(), pbc[1]);
int dz = rangeSep(a.zmin(), a.zmax(), b.zmin(), b.zmax(), pbc[2]);
return {dx, dy, dz};
}

/*! @brief Check whether a coordinate box is fully contained in a Morton code range
*
* @tparam KeyType 32- or 64-bit unsigned integer
Expand Down Expand Up @@ -118,22 +135,21 @@ containedIn(KeyType codeStart, KeyType codeEnd, const IBox& box)
/*! @brief determine whether a binary/octree node (prefix, prefixLength) is fully contained in an SFC range
*
* @tparam KeyType 32- or 64-bit unsigned integer
* @param prefix lowest SFC code of the tree node
* @param prefixLength range of the tree node in bits,
* corresponding SFC range is 2^(3*maxTreeLevel<KeyType>{} - prefixLength)
* @param nodeStart lowest SFC code of the tree node
* @param nodeEnd highest SFC key of the tree node,
* @param codeStart start of the SFC range
* @param codeEnd end of the SFC range
* @return
*/
template<class KeyType>
HOST_DEVICE_FUN inline std::enable_if_t<std::is_unsigned_v<KeyType>, bool>
HOST_DEVICE_FUN std::enable_if_t<std::is_unsigned_v<KeyType>, bool>
containedIn(KeyType nodeStart, KeyType nodeEnd, KeyType codeStart, KeyType codeEnd)
{
return !(nodeStart < codeStart || nodeEnd > codeEnd);
}

template<class KeyType>
HOST_DEVICE_FUN inline std::enable_if_t<std::is_unsigned_v<KeyType>, bool>
HOST_DEVICE_FUN std::enable_if_t<std::is_unsigned_v<KeyType>, bool>
containedIn(KeyType prefixBitKey, KeyType codeStart, KeyType codeEnd)
{
unsigned prefixLength = decodePrefixLength(prefixBitKey);
Expand All @@ -143,15 +159,13 @@ containedIn(KeyType prefixBitKey, KeyType codeStart, KeyType codeEnd)
}

template<class KeyType>
HOST_DEVICE_FUN inline int addDelta(int value, int delta, bool pbc)
HOST_DEVICE_FUN int addDelta(int value, int delta, bool pbc)
{
constexpr int maxCoordinate = (1u << maxTreeLevel<KeyType>{});

int temp = value + delta;
if (pbc)
return temp;
else
return stl::min(stl::max(0, temp), maxCoordinate);
if (pbc) { return temp; }
else { return stl::min(stl::max(0, temp), maxCoordinate); }
}

//! @brief create a box with specified radius around node delineated by codeStart/End
Expand Down
69 changes: 19 additions & 50 deletions domain/include/cstone/traversal/macs.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,8 @@

#pragma once

#include <vector>

#include "boxoverlap.hpp"
#include "cstone/primitives/math.hpp"
#include "cstone/traversal/traversal.hpp"
#include "cstone/tree/octree.hpp"

Expand All @@ -44,8 +43,8 @@ namespace cstone
//! @brief compute 1/theta + s for the minimum distance MAC
HOST_DEVICE_FUN inline float invThetaMinMac(float theta) { return 1.0f / theta + 0.5f; }

//! @brief compute 1/theta + sqrt(3) for the worst-case vector MAC
HOST_DEVICE_FUN inline float invThetaVecMac(float theta) { return 1.0f / theta + std::sqrt(3.0f); }
//! @brief cover worst-case vector mac with a min-Mac
HOST_DEVICE_FUN inline float invThetaMinToVec(float theta) { return 1.0f / theta + std::sqrt(3.0f) / 2; }

/*! @brief Compute square of the acceptance radius for the minimum distance MAC
*
Expand Down Expand Up @@ -140,56 +139,26 @@ evaluateMacPbc(Vec3<T> sourceCenter, T macSq, Vec3<T> targetCenter, Vec3<T> targ
return R2 < std::abs(macSq);
}

//! @brief commutative version of the min-distance mac, based on floating point math
/*! @brief integer based mutual min-mac
* @tparam T float or double
* @param a first integer cell box
* @param b second integer cell box
* @param ellipse grid anisotropy (max grid-step in any dim / grid-step in dim_i) divided by theta
* is equal to (L_max / L_x,y,z) * 1/theta if the number of grid points is equal in all dimensions
* @param pbc pbc yes/no per dimension
* @return true if MAC passed, i.e. true if cells are far
*/
template<class T>
HOST_DEVICE_FUN bool minMacMutual(const Vec3<T>& centerA,
const Vec3<T>& sizeA,
const Vec3<T>& centerB,
const Vec3<T>& sizeB,
const Box<T>& box,
float invTheta)
HOST_DEVICE_FUN bool minMacMutualInt(IBox a, IBox b, Vec3<T> ellipse, Vec3<int> pbc)
{
Vec3<T> dX = minDistance(centerA, sizeA, centerB, sizeB, box);
T l_max = std::max({a.xmax() - a.xmin(), a.ymax() - a.ymin(), a.zmax() - a.zmin(), b.xmax() - b.xmin(),
b.ymax() - b.ymin(), b.zmax() - b.zmin()});

T distSq = norm2(dX);
T sizeAB = 2 * stl::max(max(sizeA), max(sizeB));
// computing a-b separation in integers is key to avoiding a-b/b-a asymmetry due to round-off errors
Vec3<int> a_b = boxSeparation(a, b, pbc);

T mac = sizeAB * invTheta;

return distSq > (mac * mac);
}

/*! @brief commutative combination of min-distance and vector map
*
* @param invThetaEff 1/theta + s, effective inverse opening parameter
*
* This MAC doesn't pass any A-B pairs that would fail either the min-distance
* or vector MAC. Can be used instead of the vector mac when the mass center locations
* are not known.
*/
template<class T>
HOST_DEVICE_FUN bool minVecMacMutual(const Vec3<T>& centerA,
const Vec3<T>& sizeA,
const Vec3<T>& centerB,
const Vec3<T>& sizeB,
const Box<T>& box,
float invThetaEff)
{
bool passA;
{
// A = target, B = source
Vec3<T> dX = minDistance(centerB, centerA, sizeA, box);
T mac = max(sizeB) * 2 * invThetaEff;
passA = norm2(dX) > (mac * mac);
}
bool passB;
{
// B = target, A = source
Vec3<T> dX = minDistance(centerA, centerB, sizeB, box);
T mac = max(sizeA) * 2 * invThetaEff;
passB = norm2(dX) > (mac * mac);
}
return passA && passB;
Vec3<T> E{a_b[0] / ellipse[0], a_b[1] / ellipse[1], a_b[2] / ellipse[2]};
return norm2(E) > l_max * l_max;
}

//! @brief mark all nodes of @p octree (leaves and internal) that fail the evaluateMac w.r.t to @p target
Expand Down
32 changes: 18 additions & 14 deletions domain/include/cstone/traversal/peers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -69,19 +69,22 @@ std::vector<int> findPeersMac(int myRank,
KeyType domainStart = assignment[myRank];
KeyType domainEnd = assignment[myRank + 1];

auto crossFocusPairs =
[domainStart, domainEnd, invThetaEff, &tree = domainTree, &box](TreeNodeIndex a, TreeNodeIndex b)
int maxCoord = 1u << maxTreeLevel<KeyType>{};
float roundOff = 1 + 1e-6; // ensure that peers are picked up in case of a numerical tie
auto ellipse = Vec3<T>{box.ilx(), box.ily(), box.ilz()} * box.maxExtent() * invThetaEff * roundOff;
auto pbc_t = BoundaryType::periodic;
auto pbc = Vec3<int>{box.boundaryX() == pbc_t, box.boundaryY() == pbc_t, box.boundaryZ() == pbc_t} * maxCoord;

auto crossFocusPairs = [domainStart, domainEnd, ellipse, pbc, &tree = domainTree](TreeNodeIndex a, TreeNodeIndex b)
{
bool aFocusOverlap = overlapTwoRanges(domainStart, domainEnd, tree.codeStart(a), tree.codeEnd(a));
bool bInFocus = containedIn(tree.codeStart(b), tree.codeEnd(b), domainStart, domainEnd);
// node a has to overlap/be contained in the focus, while b must not be inside it
if (!aFocusOverlap || bInFocus) { return false; }

IBox aBox = sfcIBox(sfcKey(tree.codeStart(a)), tree.level(a));
IBox bBox = sfcIBox(sfcKey(tree.codeStart(b)), tree.level(b));
auto [aCenter, aSize] = centerAndSize<KeyType>(aBox, box);
auto [bCenter, bSize] = centerAndSize<KeyType>(bBox, box);
return !minVecMacMutual(aCenter, aSize, bCenter, bSize, box, invThetaEff);
IBox aBox = sfcIBox(sfcKey(tree.codeStart(a)), tree.level(a));
IBox bBox = sfcIBox(sfcKey(tree.codeStart(b)), tree.level(b));
return !minMacMutualInt(aBox, bBox, ellipse, pbc);
};

auto m2l = [](TreeNodeIndex, TreeNodeIndex) {};
Expand Down Expand Up @@ -130,26 +133,27 @@ std::vector<int> findPeersMacStt(int myRank,
TreeNodeIndex firstLeaf = findNodeAbove(leaves, octree.numLeafNodes(), domainStart);
TreeNodeIndex lastLeaf = findNodeAbove(leaves, octree.numLeafNodes(), domainEnd);

int maxCoord = 1u << maxTreeLevel<KeyType>{};
auto ellipse = Vec3<T>{box.ilx(), box.ily(), box.ilz()} * box.maxExtent() * invThetaEff;
auto pbc_t = BoundaryType::periodic;
auto pbc = Vec3<int>{box.boundaryX() == pbc_t, box.boundaryY() == pbc_t, box.boundaryZ() == pbc_t} * maxCoord;

std::vector<int> peers(assignment.numRanks());

#pragma omp parallel for
for (TreeNodeIndex i = firstLeaf; i < lastLeaf; ++i)
{
IBox target = sfcIBox(sfcKey(leaves[i]), sfcKey(leaves[i + 1]));
Vec3<T> targetCenter, targetSize;
std::tie(targetCenter, targetSize) = centerAndSize<KeyType>(target, box);

auto violatesMac =
[&targetCenter, &targetSize, &octree, &box, invThetaEff, domainStart, domainEnd](TreeNodeIndex idx)
auto violatesMac = [target, ellipse, pbc, &octree, domainStart, domainEnd](TreeNodeIndex idx)
{
KeyType nodeStart = octree.codeStart(idx);
KeyType nodeEnd = octree.codeEnd(idx);
// if the tree node with index idx is fully contained in the focus, we stop traversal
if (containedIn(nodeStart, nodeEnd, domainStart, domainEnd)) { return false; }

IBox sourceBox = sfcIBox(sfcKey(nodeStart), octree.level(idx));
auto [sourceCenter, sourceSize] = centerAndSize<KeyType>(sourceBox, box);
return !minVecMacMutual(targetCenter, targetSize, sourceCenter, sourceSize, box, invThetaEff);
IBox source = sfcIBox(sfcKey(nodeStart), octree.level(idx));
return !minMacMutualInt(target, source, ellipse, pbc);
};

auto markLeafIdx = [&octree, &peers, &assignment](TreeNodeIndex idx)
Expand Down
19 changes: 10 additions & 9 deletions domain/test/unit/sfc/box.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -48,15 +48,16 @@ TEST(SfcBox, pbcAdjust)

TEST(SfcBox, pbcDistance)
{
EXPECT_EQ(pbcDistance<1024>(-1024), 0);
EXPECT_EQ(pbcDistance<1024>(-513), 511);
EXPECT_EQ(pbcDistance<1024>(-512), 512);
EXPECT_EQ(pbcDistance<1024>(-1), -1);
EXPECT_EQ(pbcDistance<1024>(0), 0);
EXPECT_EQ(pbcDistance<1024>(1), 1);
EXPECT_EQ(pbcDistance<1024>(512), 512);
EXPECT_EQ(pbcDistance<1024>(513), -511);
EXPECT_EQ(pbcDistance<1024>(1024), 0);
int R = 1024;
EXPECT_EQ(pbcDistance(-1024, R), 0);
EXPECT_EQ(pbcDistance(-513, R), 511);
EXPECT_EQ(pbcDistance(-512, R), 512);
EXPECT_EQ(pbcDistance(-1, R), -1);
EXPECT_EQ(pbcDistance(0, R), 0);
EXPECT_EQ(pbcDistance(1, R), 1);
EXPECT_EQ(pbcDistance(512, R), 512);
EXPECT_EQ(pbcDistance(513, R), -511);
EXPECT_EQ(pbcDistance(1024, R), 0);
}

TEST(SfcBox, applyPbc)
Expand Down
Loading

0 comments on commit 87cdae5

Please sign in to comment.