diff --git a/.gitignore b/.gitignore index 5873071..aeee870 100644 --- a/.gitignore +++ b/.gitignore @@ -3,3 +3,11 @@ cmake-build-debug *.txt merged *.out +program* +*.zip +/build +/cmake-build-* +/CMakeFiles +Makefile +*.cmake +*.cbp \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..ed1cceb --- /dev/null +++ b/README.md @@ -0,0 +1,4 @@ +# seed-reversal-merged +A merged version of seed-reversal-gpu and seed-tester-native + +This repository is part of the effort to reverse the seed of the pack.png image. For more information on how this code works, please refer to the relevant section in https://bit.ly/packpng-details diff --git a/generator.cpp b/generator.cpp index 7a6f908..ea29969 100644 --- a/generator.cpp +++ b/generator.cpp @@ -1,82 +1,114 @@ -#include "generator.h" - -#define WATERFALL_Z 10 -#define TREE1_X -5 // Left tree on the image -#define TREE1_Z WATERFALL_Z - 8 -#define TREE1_HEIGHT 5 -#define TREE2_X -3 // right tree on the image -#define TREE2_Z WATERFALL_Z + 3 -#define TREE2_HEIGHT 5 - -bool generator::ChunkGenerator::isValidTreeSpot(int treeX, int treeZ, bool firstTreeFound, bool secondTreeFound, int waterfallX) -{ - if (treeZ >= WATERFALL_Z - 1 && treeZ <= WATERFALL_Z + 1 && treeX <= waterfallX - 3 && treeX >= waterfallX - 5) - return true; - if (!firstTreeFound) { - if (treeX >= waterfallX + TREE1_X - 1 && treeX <= waterfallX + TREE1_X + 1 && treeZ >= TREE1_Z - 1 && treeZ <= TREE1_Z + 1) - return true; - } - if (!secondTreeFound) { - if (treeX >= waterfallX + TREE2_X - 1 && treeX <= waterfallX + TREE2_X + 1 && treeZ >= TREE2_Z - 1 && treeZ <= TREE2_Z + 1) - return true; - } - return false; -} - -bool *generator::ChunkGenerator::generateLeafPattern(random_math::JavaRand& random) -{ - bool *out = new bool[16]; - for (int32_t i = 0; i < 16; i++) { - out[i] = random.nextInt(2) != 0; - } - return out; -} - -int32_t generator::ChunkGenerator::checkTrees(random_math::JavaRand& random, int32_t maxTreeCount, int waterfallX) -{ - bool treesFound[2] = {false, false}; - int8_t foundTreeCount = 0; - for (int i = 0; i <= maxTreeCount; ++i) { - int32_t treeX = random.nextInt(16); - int32_t treeZ = random.nextInt(16); - int32_t height = random.nextInt(3) + 4; - if (!treesFound[0] && treeX == waterfallX + TREE1_X && treeZ == TREE1_Z && height == TREE1_HEIGHT) { - delete[] generator::ChunkGenerator::generateLeafPattern(random); - foundTreeCount++; - treesFound[0] = true; - } else if (!treesFound[1] && treeX == waterfallX + TREE2_X && treeZ == TREE2_Z && height == TREE2_HEIGHT) { - bool *leafPattern = generateLeafPattern(random); - if (!leafPattern[0] && leafPattern[4]) { - foundTreeCount++; - treesFound[1] = true; - } else { - return -1; - } - delete[] leafPattern; - } else { - if (isValidTreeSpot(treeX, treeZ, treesFound[0], treesFound[1], waterfallX)) - return -1; - } - if (foundTreeCount == 2) - return i; - } - return -1; -} - -bool generator::ChunkGenerator::populate(int64_t chunkSeed, int waterfallX) -{ - random_math::JavaRand random(advance_3759.next(chunkSeed), false); - - int32_t maxBaseTreeCount = 12; - if (random.nextInt(10) == 0) - return false; - - int uTrees = ChunkGenerator::checkTrees(random, maxBaseTreeCount, waterfallX); - return uTrees != -1; -} - -void generator::ChunkGenerator::init() -{ - generator::ChunkGenerator::advance_3759 = random_math::JavaRand::lcg.combine(3759); -} -random_math::LCG generator::ChunkGenerator::advance_3759(1, 1, 1); +#include "generator.h" + +#define WATERFALL_Z 10 +#define TREE1_X -5 // Left tree on the image +#define TREE1_Z WATERFALL_Z - 8 +#define TREE1_HEIGHT 5 +#define TREE2_X -3 // right tree on the image +#define TREE2_Z WATERFALL_Z + 3 +#define TREE2_HEIGHT 5 + +bool generator::ChunkGenerator::isValidTreeSpot(int treeX, int treeZ, bool firstTreeFound, bool secondTreeFound, int waterfallX) +{ + if (treeZ >= WATERFALL_Z - 1 && treeZ <= WATERFALL_Z + 1 && treeX <= waterfallX - 3 && treeX >= waterfallX - 5) + return true; + if (!firstTreeFound) { + if (treeX >= waterfallX + TREE1_X - 1 && treeX <= waterfallX + TREE1_X + 1 && treeZ >= TREE1_Z - 1 && treeZ <= TREE1_Z + 1) + return true; + } + if (!secondTreeFound) { + if (treeX >= waterfallX + TREE2_X - 1 && treeX <= waterfallX + TREE2_X + 1 && treeZ >= TREE2_Z - 1 && treeZ <= TREE2_Z + 1) + return true; + } + return false; +} + +void generator::ChunkGenerator::generateLeafPattern(random_math::JavaRand& random, bool *out) +{ + out[0] = random.nextIntPow2Unchecked(2) != 0; + out[1] = random.nextIntPow2Unchecked(2) != 0; + out[2] = random.nextIntPow2Unchecked(2) != 0; + out[3] = random.nextIntPow2Unchecked(2) != 0; + out[4] = random.nextIntPow2Unchecked(2) != 0; + out[5] = random.nextIntPow2Unchecked(2) != 0; + out[6] = random.nextIntPow2Unchecked(2) != 0; + out[7] = random.nextIntPow2Unchecked(2) != 0; + out[8] = random.nextIntPow2Unchecked(2) != 0; + out[9] = random.nextIntPow2Unchecked(2) != 0; + out[10] = random.nextIntPow2Unchecked(2) != 0; + out[11] = random.nextIntPow2Unchecked(2) != 0; + out[12] = random.nextIntPow2Unchecked(2) != 0; + out[13] = random.nextIntPow2Unchecked(2) != 0; + out[14] = random.nextIntPow2Unchecked(2) != 0; + out[15] = random.nextIntPow2Unchecked(2) != 0; +} + +void generator::ChunkGenerator::ignoreLeafPattern(random_math::JavaRand& random) +{ + random.advance(advance_16); +} + +bool generator::ChunkGenerator::leafPatternNot0And4(random_math::JavaRand& random) +{ + bool _0 = random.nextIntPow2Unchecked(2) != 0; + random.advance(advance_3); + + bool _4 = random.nextIntPow2Unchecked(2) != 0; + random.advance(advance_11); + + + return !_0 && _4; +} + +int32_t generator::ChunkGenerator::checkTrees(random_math::JavaRand& random, int32_t maxTreeCount, int waterfallX) +{ + bool treesFound[2] = {false, false}; + int8_t foundTreeCount = 0; + for (int i = 0; i <= maxTreeCount; ++i) { + int32_t treeX = random.nextIntPow2Unchecked(16); + int32_t treeZ = random.nextIntPow2Unchecked(16); + int32_t height = random.nextInt(3) + 4; + if (!treesFound[0] && treeX == waterfallX + TREE1_X && treeZ == TREE1_Z && height == TREE1_HEIGHT) { + ignoreLeafPattern(random); + foundTreeCount++; + treesFound[0] = true; + } else if (!treesFound[1] && treeX == waterfallX + TREE2_X && treeZ == TREE2_Z && height == TREE2_HEIGHT) { + if (leafPatternNot0And4(random)) { + foundTreeCount++; + treesFound[1] = true; + } else { + return -1; + } + } else { + if (isValidTreeSpot(treeX, treeZ, treesFound[0], treesFound[1], waterfallX)) + return -1; + } + if (foundTreeCount == 2) + return i; + } + return -1; +} + +bool generator::ChunkGenerator::populate(int64_t chunkSeed, int waterfallX) +{ + random_math::JavaRand random(advance_3759.nextMaskableUnchecked(chunkSeed), false); + + int32_t maxBaseTreeCount = 12; + if (random.nextInt(10) == 0) + return false; + + int uTrees = ChunkGenerator::checkTrees(random, maxBaseTreeCount, waterfallX); + return uTrees != -1; +} + +void generator::ChunkGenerator::init() +{ + generator::ChunkGenerator::advance_3 = random_math::JavaRand::lcg.combine(3); + generator::ChunkGenerator::advance_11 = random_math::JavaRand::lcg.combine(11); + generator::ChunkGenerator::advance_16 = random_math::JavaRand::lcg.combine(16); + generator::ChunkGenerator::advance_3759 = random_math::JavaRand::lcg.combine(3759); +} +random_math::LCG generator::ChunkGenerator::advance_3(1, 1, 1); +random_math::LCG generator::ChunkGenerator::advance_11(1, 1, 1); +random_math::LCG generator::ChunkGenerator::advance_16(1, 1, 1); +random_math::LCG generator::ChunkGenerator::advance_3759(1, 1, 1); diff --git a/generator.h b/generator.h index 75ad605..6f4ab31 100644 --- a/generator.h +++ b/generator.h @@ -1,25 +1,30 @@ -#ifndef SEED_TESTER_GENERATOR_H -#define SEED_TESTER_GENERATOR_H - -#include "random.h" -#include - -namespace generator -{ - - class ChunkGenerator - { - private: - static bool isValidTreeSpot(int treeX, int treeZ, bool firstTreeFound, bool secondTreeFound, int waterfallX); - static bool *generateLeafPattern(random_math::JavaRand& random); - static int32_t checkTrees(random_math::JavaRand& random, int32_t maxTreeCount, int waterfallX); - public: - static void init(); - static random_math::LCG advance_3759; - static bool populate(int64_t chunkSeed, int waterfallX); - - }; -} - - -#endif //SEED_TESTER_GENERATOR_H +#ifndef SEED_TESTER_GENERATOR_H +#define SEED_TESTER_GENERATOR_H + +#include "random.h" +#include + +namespace generator +{ + + class ChunkGenerator + { + private: + static bool isValidTreeSpot(int treeX, int treeZ, bool firstTreeFound, bool secondTreeFound, int waterfallX); + static void generateLeafPattern(random_math::JavaRand& random, bool *out); + static void ignoreLeafPattern(random_math::JavaRand& random); + static bool leafPatternNot0And4(random_math::JavaRand& random); + static int32_t checkTrees(random_math::JavaRand& random, int32_t maxTreeCount, int waterfallX); + public: + static void init(); + static random_math::LCG advance_3; + static random_math::LCG advance_11; + static random_math::LCG advance_16; + static random_math::LCG advance_3759; + static bool populate(int64_t chunkSeed, int waterfallX); + + }; +} + + +#endif //SEED_TESTER_GENERATOR_H diff --git a/main.cu b/main.cu index 02a9040..312d3b7 100644 --- a/main.cu +++ b/main.cu @@ -1,9 +1,10 @@ +#pragma clang diagnostic push +#pragma ide diagnostic ignored "hicpp-signed-bitwise" //#define __JETBRAINS_IDE__ // IDE indexing #ifdef __JETBRAINS_IDE__ #define __host__ #define __device__ -#define __shared__ #define __constant__ #define __global__ #define __CUDACC__ @@ -15,84 +16,58 @@ #include <__clang_cuda_cmath.h> #endif +#ifdef __INTELLISENSE__ +#include +#include +#include +#define __CUDACC__ //fixes function defenition in ide +//void __syncthreads(); -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include +#endif -#define signed_seed_t int64_t -#define uint uint32_t -#define ulong uint64_t -// let's be EVIL (and make sure all includes come before this) -#define int int32_t +#include +#include +#include +#include +#include +#include +#include -#undef JRAND_DOUBLE +#include "generator.h" #define RANDOM_MULTIPLIER_LONG 0x5DEECE66DULL -#ifdef JRAND_DOUBLE -#define Random double -#define RANDOM_MULTIPLIER 0x5DEECE66Dp-48 -#define RANDOM_ADDEND 0xBp-48 -#define RANDOM_SCALE 0x1p-48 - -inline uint __host__ __device__ random_next(Random *random, int bits) { - *random = trunc((*random * RANDOM_MULTIPLIER + RANDOM_ADDEND) * RANDOM_SCALE); - return (uint)((ulong)(*random / RANDOM_SCALE) >> (48 - bits)); -} - -#else - -#define Random ulong +#define Random uint64_t #define RANDOM_MULTIPLIER RANDOM_MULTIPLIER_LONG #define RANDOM_ADDEND 0xBULL #define RANDOM_MASK (1ULL << 48) - 1 -#define RANDOM_SCALE 1 - -#define FAST_NEXT_INT // Random::next(bits) -__host__ __device__ inline uint random_next(Random *random, int bits) { +__host__ __device__ inline uint32_t random_next(Random *random, int32_t bits) { *random = (*random * RANDOM_MULTIPLIER + RANDOM_ADDEND) & RANDOM_MASK; - return (uint)(*random >> (48 - bits)); + return (uint32_t)(*random >> (48 - bits)); } -#endif // ~JRAND_DOUBLE - -// new Random(seed) -#define get_random(seed) ((Random)((seed ^ RANDOM_MULTIPLIER_LONG) & RANDOM_MASK)) -#define get_random_unseeded(state) ((Random) ((state) * RANDOM_SCALE)) // Random::nextInt(bound) -__host__ __device__ inline uint random_next_int(Random *random, uint bound) { - int r = random_next(random, 31); - int m = bound - 1; +__host__ __device__ inline uint32_t random_next_int(Random *random, uint32_t bound) { + int32_t r = random_next(random, 31); + int32_t m = bound - 1; if ((bound & m) == 0) { // Could probably use __mul64hi here - r = (uint)((bound * (ulong)r) >> 31); + r = (uint32_t)((bound * (uint64_t)r) >> 31); } else { -#ifdef FAST_NEXT_INT r %= bound; -#else - for (int u = r; - u - (r = u % bound) + m < 0; - u = random_next(random, 31)); -#endif } return r; } -__host__ __device__ inline int64_t random_next_long (Random *random) { - return (((int64_t)random_next(random, 32)) << 32) + random_next(random, 32); -} - #define CHECK_GPU_ERR(code) gpuAssert((code), __FILE__, __LINE__) -inline void gpuAssert(cudaError_t code, const char* file, int line) { +inline void gpuAssert(cudaError_t code, const char* file, int32_t line) { if (code != cudaSuccess) { fprintf(stderr, "GPUassert: %s (code %d) %s %d\n", cudaGetErrorString(code), code, file, line); exit(code); @@ -101,9 +76,6 @@ inline void gpuAssert(cudaError_t code, const char* file, int line) { // advance #define advance_rng(rand, multiplier, addend) ((rand) = ((rand) * (multiplier) + (addend)) & RANDOM_MASK) -#define advance_830(rand) advance_rng(rand, 0x859D39E832D9LL, 0xE3E2DF5E9196LL) -#define advance_774(rand) advance_rng(rand, 0xF8D900133F9LL, 0x5738CAC2F85ELL) -#define advance_387(rand) advance_rng(rand, 0x5FE2BCEF32B5LL, 0xB072B3BF0CBDLL) #define advance_16(rand) advance_rng(rand, 0x6DC260740241LL, 0xD0352014D90LL) #define advance_m1(rand) advance_rng(rand, 0xDFE05BCB1365LL, 0x615C0E462AA9LL) #define advance_m3759(rand) advance_rng(rand, 0x63A9985BE4ADLL, 0xA9AA8DA9BC9BLL) @@ -111,7 +83,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line) { #define WATERFALL_X 16 -#define WATERFALL_Y 76 +//#define WATERFALL_Y 76 #define WATERFALL_Z 10 #define TREE_X (WATERFALL_X - 5) @@ -119,7 +91,7 @@ inline void gpuAssert(cudaError_t code, const char* file, int line) { #define TREE_HEIGHT 5 #define OTHER_TREE_COUNT 1 -__device__ inline int getTreeHeight(int x, int z) { +__device__ inline int32_t getTreeHeight(int32_t x, int32_t z) { if (x == TREE_X && z == TREE_Z) return TREE_HEIGHT; @@ -132,13 +104,9 @@ __device__ inline int getTreeHeight(int x, int z) { #define MODULUS (1LL << 48) -#define SQUARE_SIDE (MODULUS / 16) #define X_TRANSLATE 0 -#define Z_TRANSLATE 11 #define L00 7847617LL #define L01 (-18218081LL) -#define L10 4824621LL -#define L11 24667315LL #define LI00 (24667315.0 / 16) #define LI01 (18218081.0 / 16) #define LI10 (-4824621.0 / 16) @@ -148,10 +116,8 @@ __device__ inline int getTreeHeight(int x, int z) { #define CONST_MIN4(a, b, c, d) CONST_MIN(CONST_MIN(a, b), CONST_MIN(c, d)) #define CONST_MAX(a, b) ((a) > (b) ? (a) : (b)) #define CONST_MAX4(a, b, c, d) CONST_MAX(CONST_MAX(a, b), CONST_MAX(c, d)) -#define CONST_FLOOR(x) ((x) < (signed_seed_t) (x) ? (signed_seed_t) (x) - 1 : (signed_seed_t) (x)) -#define CONST_CEIL(x) ((x) == (signed_seed_t) (x) ? (signed_seed_t) (x) : CONST_FLOOR((x) + 1)) -#define CONST_LOWER(x, m, c) ((m) < 0 ? ((x) + 1 - (double) (c) / MODULUS) * (m) : ((x) - (double) (c) / MODULUS) * (m)) -#define CONST_UPPER(x, m, c) ((m) < 0 ? ((x) - (double) (c) / MODULUS) * (m) : ((x) + 1 - (double) (c) / MODULUS) * (m)) +#define CONST_FLOOR(x) ((x) < (int64_t) (x) ? (int64_t) (x) - 1 : (int64_t) (x)) +#define CONST_CEIL(x) ((x) == (int64_t) (x) ? (int64_t) (x) : CONST_FLOOR((x) + 1)) // for a parallelogram ABCD https://media.discordapp.net/attachments/668607204009574411/671018577561649163/unknown.png #define B_X LI00 @@ -172,28 +138,28 @@ __device__ inline int getTreeHeight(int x, int z) { #define MAX_TREE_ATTEMPTS 12 #define MAX_TREE_SEARCH_BACK (3 * MAX_TREE_ATTEMPTS - 3 + 16 * OTHER_TREE_COUNT) -__constant__ ulong search_back_multipliers[MAX_TREE_SEARCH_BACK + 1]; -__constant__ ulong search_back_addends[MAX_TREE_SEARCH_BACK + 1]; -int search_back_count; +__constant__ uint64_t search_back_multipliers[MAX_TREE_SEARCH_BACK + 1]; +__constant__ uint64_t search_back_addends[MAX_TREE_SEARCH_BACK + 1]; +int32_t search_back_count; -#define WORK_UNIT_SIZE (1LL << 23) +#define WORK_UNIT_SIZE (1LL << 25) #define BLOCK_SIZE 256 -__global__ void doPreWork(ulong offset, Random* starts, int* num_starts) { +__global__ void doPreWork(uint64_t offset, Random* starts, int* num_starts) { // lattice tree position - ulong global_id = blockIdx.x * blockDim.x + threadIdx.x; + uint64_t global_id = blockIdx.x * blockDim.x + threadIdx.x; - signed_seed_t lattice_x = (signed_seed_t) ((offset + global_id) % SIZE_X) + LOWER_X; - signed_seed_t lattice_z = (signed_seed_t) ((offset + global_id) / SIZE_X) + LOWER_Z; + int64_t lattice_x = (int64_t) ((offset + global_id) % SIZE_X) + LOWER_X; + int64_t lattice_z = (int64_t) ((offset + global_id) / SIZE_X) + LOWER_Z; lattice_z += (B_X * lattice_z < B_Z * lattice_x) * SIZE_Z; if (D_X * lattice_z > D_Z * lattice_x) { lattice_x += B_X; lattice_z += B_Z; } - lattice_x += (signed_seed_t) (TREE_X * LI00 + TREE_Z * LI01); - lattice_z += (signed_seed_t) (TREE_X * LI10 + TREE_Z * LI11); + lattice_x += (int64_t) (TREE_X * LI00 + TREE_Z * LI01); + lattice_z += (int64_t) (TREE_X * LI10 + TREE_Z * LI11); - Random rand = (Random) ((lattice_x * L00 + lattice_z * L01 + X_TRANSLATE) % MODULUS); + auto rand = (Random)((lattice_x * L00 + lattice_z * L01 + X_TRANSLATE) % MODULUS); advance_m1(rand); Random tree_start = rand; @@ -201,19 +167,19 @@ __global__ void doPreWork(ulong offset, Random* starts, int* num_starts) { bool res = random_next(&rand, 4) == TREE_X; res &= random_next(&rand, 4) == TREE_Z; - res &= random_next_int(&rand, 3) == (ulong) (TREE_HEIGHT - 4); + res &= random_next_int(&rand, 3) == (uint64_t) (TREE_HEIGHT - 4); - if(res) { + if (res) { int index = atomicAdd(num_starts, 1); starts[index] = tree_start; } } -__global__ void doWork(int* num_starts, Random* tree_starts, int* num_seeds, ulong* seeds, int gpu_search_back_count) { - for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < *num_starts; i += blockDim.x * gridDim.x) { +__global__ void doWork(const int32_t* num_starts, const Random* tree_starts, int32_t* num_seeds, uint64_t* seeds, int32_t gpu_search_back_count) { + for (int32_t i = blockIdx.x * blockDim.x + threadIdx.x; i < *num_starts; i += blockDim.x * gridDim.x) { Random tree_start = tree_starts[i]; - for (int treeBackCalls = 0; treeBackCalls <= gpu_search_back_count; treeBackCalls++) { + for (int32_t treeBackCalls = 0; treeBackCalls <= gpu_search_back_count; treeBackCalls++) { Random start = (tree_start * search_back_multipliers[treeBackCalls] + search_back_addends[treeBackCalls]) & RANDOM_MASK; Random rand = start; @@ -222,18 +188,18 @@ __global__ void doWork(int* num_starts, Random* tree_starts, int* num_seeds, ulo if (random_next_int(&rand, 10) == 0) continue; - int generated_tree[16]; + int32_t generated_tree[16]; memset(generated_tree, 0x00, sizeof(generated_tree)); - int treesMatched = 0; - for (int treeAttempt = 0; treeAttempt <= MAX_TREE_ATTEMPTS; treeAttempt++) { - int treeX = random_next(&rand, 4); - int treeZ = random_next(&rand, 4); - int wantedTreeHeight = getTreeHeight(treeX, treeZ); - int treeHeight = random_next_int(&rand, 3) + 4; + int32_t treesMatched = 0; + for (int32_t treeAttempt = 0; treeAttempt <= MAX_TREE_ATTEMPTS; treeAttempt++) { + int32_t treeX = random_next(&rand, 4); + int32_t treeZ = random_next(&rand, 4); + int32_t wantedTreeHeight = getTreeHeight(treeX, treeZ); + int32_t treeHeight = random_next_int(&rand, 3) + 4; - int& boolpack = generated_tree[treeX]; - const int mask = 1 << (treeZ % 16); + int32_t& boolpack = generated_tree[treeX]; + const int32_t mask = 1 << (treeZ % 16); if (treeHeight == wantedTreeHeight && !(boolpack & mask)) { treesMatched++; @@ -248,7 +214,7 @@ __global__ void doWork(int* num_starts, Random* tree_starts, int* num_seeds, ulo Random start_chunk_rand = start; advance_m3759(start_chunk_rand); - int index = atomicAdd(num_seeds, 1); + int32_t index = atomicAdd(num_seeds, 1); seeds[index] = start_chunk_rand; } @@ -258,16 +224,14 @@ __global__ void doWork(int* num_starts, Random* tree_starts, int* num_seeds, ulo } struct GPU_Node { - int GPU; - int* num_seeds; - ulong* seeds; - int* num_tree_starts; + int32_t* num_seeds; + uint64_t* seeds; + int32_t* num_tree_starts; Random* tree_starts; }; -void setup_gpu_node(GPU_Node* node, int gpu) { +void setup_gpu_node(GPU_Node* node, int32_t gpu) { CHECK_GPU_ERR(cudaSetDevice(gpu)); - node->GPU = gpu; CHECK_GPU_ERR(cudaMallocManaged(&node->num_seeds, sizeof(*node->num_seeds))); CHECK_GPU_ERR(cudaMallocManaged(&node->seeds, (sizeof(Random)*WORK_UNIT_SIZE))); CHECK_GPU_ERR(cudaMallocManaged(&node->num_tree_starts, sizeof(*node->num_tree_starts))); @@ -283,25 +247,25 @@ void calculate_search_backs() { bool allow_search_back[MAX_TREE_SEARCH_BACK + 1]; memset(allow_search_back, false, sizeof(allow_search_back)); - for (int i = 0; i <= MAX_TREE_ATTEMPTS - OTHER_TREE_COUNT - 1; i++) { + for (int32_t i = 0; i <= MAX_TREE_ATTEMPTS - OTHER_TREE_COUNT - 1; i++) { allow_search_back[i * 3] = true; } - for (int tree = 0; tree < OTHER_TREE_COUNT; tree++) { - for (int i = 0; i <= MAX_TREE_SEARCH_BACK - 19; i++) { + for (int32_t tree = 0; tree < OTHER_TREE_COUNT; tree++) { + for (int32_t i = 0; i <= MAX_TREE_SEARCH_BACK - 19; i++) { if (allow_search_back[i]) allow_search_back[i + 19] = true; } } search_back_count = 0; - ulong multiplier = 1; - ulong addend = 0; - ulong multipliers[MAX_TREE_SEARCH_BACK + 1]; - ulong addends[MAX_TREE_SEARCH_BACK + 1]; - for (int i = 0; i <= MAX_TREE_SEARCH_BACK; i++) { + uint64_t multiplier = 1; + uint64_t addend = 0; + uint64_t multipliers[MAX_TREE_SEARCH_BACK + 1]; + uint64_t addends[MAX_TREE_SEARCH_BACK + 1]; + for (int32_t i = 0; i <= MAX_TREE_SEARCH_BACK; i++) { if (allow_search_back[i]) { - int index = search_back_count++; + int32_t index = search_back_count++; multipliers[index] = multiplier; addends[index] = addend; } @@ -309,79 +273,50 @@ void calculate_search_backs() { addend = (0xDFE05BCB1365LL * addend + 0x615C0E462AA9LL) & RANDOM_MASK; } - for (int gpu = 0; gpu < GPU_COUNT; gpu++) { + for (int32_t gpu = 0; gpu < GPU_COUNT; gpu++) { CHECK_GPU_ERR(cudaSetDevice(gpu)); CHECK_GPU_ERR(cudaMemcpyToSymbol(search_back_multipliers, &multipliers, search_back_count * sizeof(*multipliers))); CHECK_GPU_ERR(cudaMemcpyToSymbol(search_back_addends, &addends, search_back_count * sizeof(*addends))); } } -#undef int -#include "generator.h" - #ifndef OFFSET #define OFFSET 0 #endif -void run(bool &running, int &offset, int &workSize, bool &finished, long long *&tempStorage, int &count, const std::string out_file_name) { - FILE* out_file = fopen(out_file_name.c_str(), "w"); - while (running) { - if (workSize != 0 && !finished) { - for (int j = 0; j < workSize; ++j) { - if (generator::ChunkGenerator::populate(tempStorage[j + offset], X_TRANSLATE + 16)) { - fprintf(out_file, "%lld\n", tempStorage[j]); - count++; - } - } - finished = true; - } - } - fflush(out_file); - fclose(out_file); - -} +struct Thread { + std::thread thread; + std::vector threadBuffer; +}; int main(int argc, char *argv[]) { -#define int int32_t random_math::JavaRand::init(); generator::ChunkGenerator::init(); - GPU_Node *nodes = (GPU_Node*)malloc(sizeof(GPU_Node) * GPU_COUNT); - printf("Searching %ld total seeds...\n", TOTAL_WORK_SIZE); + auto *nodes = (GPU_Node*)malloc(sizeof(GPU_Node) * GPU_COUNT); + std::cout << "Searching " << TOTAL_WORK_SIZE << " total seeds...\n"; calculate_search_backs(); - for(int i = 0; i < GPU_COUNT; i++) { - setup_gpu_node(&nodes[i],i); - } + FILE* out_file = fopen("chunk_seeds.txt", "w"); - int threadCount = std::thread::hardware_concurrency(); - threadCount = threadCount-4; - std::thread threads[threadCount]; - long long *tempStorage = NULL; - bool running = true; - bool finishedArr[threadCount]; - int offsetArr[threadCount]; - int workSizeArr[threadCount]; - - int counts[threadCount]; - printf("Using %d threads for cpu work\n",threadCount); - for (int i = 0; i < threadCount; ++i) { - counts[i] = 0; - finishedArr[i] = true; - offsetArr[i] = 0; - workSizeArr[i] = 0; - std::ostringstream oss; - oss << "chunk_seeds" << i << ".txt"; - threads[i] = std::thread(run, std::ref(running), std::ref(offsetArr[i]), std::ref(workSizeArr[i]), std::ref(finishedArr[i]), std::ref(tempStorage), std::ref(counts[i]), oss.str()); + for (int32_t i = 0; i < GPU_COUNT; i++) { + setup_gpu_node(&nodes[i], i); } - ulong count = 0; - clock_t lastIteration = clock(); - clock_t startTime = clock(); - ulong arraySize = 0; - for (ulong offset = OFFSET; offset < TOTAL_WORK_SIZE;) { - for(int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { + std::vector threads(std::thread::hardware_concurrency() - 4); + + std::atomic count(0); + auto lastIteration = std::chrono::system_clock::now(); + auto startTime = std::chrono::system_clock::now(); + long long* tempStorage = nullptr; + uint64_t arraySize = 0; + + std::cout << "Using " << threads.size() << " threads for cpu work\n"; + + for (uint64_t offset = OFFSET; offset < TOTAL_WORK_SIZE;) { + + for (int32_t gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { CHECK_GPU_ERR(cudaSetDevice(gpu_index)); *nodes[gpu_index].num_tree_starts = 0; @@ -389,75 +324,84 @@ int main(int argc, char *argv[]) { offset += WORK_UNIT_SIZE; } - for(int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { + for (int32_t gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { CHECK_GPU_ERR(cudaSetDevice(gpu_index)); CHECK_GPU_ERR(cudaDeviceSynchronize()); } - for(int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { + for (int32_t gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { CHECK_GPU_ERR(cudaSetDevice(gpu_index)); *nodes[gpu_index].num_seeds = 0; doWork<<>>(nodes[gpu_index].num_tree_starts, nodes[gpu_index].tree_starts, nodes[gpu_index].num_seeds, nodes[gpu_index].seeds, search_back_count); } + static auto threadFunc = [&](Thread &myThread, size_t start, size_t end) { + for (int32_t j = start; j < end; ++j) { + if (generator::ChunkGenerator::populate(tempStorage[j], X_TRANSLATE + 16)) { + myThread.threadBuffer.push_back(tempStorage[j]); + count++; + } + } + }; + + + int32_t chunkSize = arraySize / threads.size(); + for(size_t i = 0; i < threads.size(); i++) + threads[i].thread = std::thread(threadFunc, std::ref(threads[i]), i * chunkSize, (i == (threads.size() - 1)) ? arraySize : ((i + 1) * chunkSize)); + + for(Thread& x : threads) { + x.thread.join(); + + for(const long long &val: x.threadBuffer) + fprintf(out_file, "%lld\n", val); + x.threadBuffer.clear(); + } + + fflush(out_file); free(tempStorage); - tempStorage = (long long*)malloc(sizeof(long long)); + tempStorage = (long long*)malloc(sizeof(long long)); arraySize = 0; - for(int gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { + for (int32_t gpu_index = 0; gpu_index < GPU_COUNT; gpu_index++) { CHECK_GPU_ERR(cudaSetDevice(gpu_index)); CHECK_GPU_ERR(cudaDeviceSynchronize()); tempStorage = (long long*) realloc(tempStorage, (*nodes[gpu_index].num_seeds + arraySize) * sizeof(long long)); - for (int i = 0, e = *nodes[gpu_index].num_seeds; i < e; i++) { - tempStorage[arraySize + i] = nodes[gpu_index].seeds[i]; + for (int32_t i = 0, e = *nodes[gpu_index].num_seeds; i < e; i++) { + tempStorage[arraySize+i]=nodes[gpu_index].seeds[i]; } arraySize += *nodes[gpu_index].num_seeds; } - int workSize = arraySize / threadCount; - - if (workSize != 0) { - for (int k = 0; arraySize != 0 && k < threadCount; ++k) { - offsetArr[k] = k * workSize; - workSizeArr[k] = k != threadCount - 1 ? workSize : arraySize % workSize; - finishedArr[k] = workSizeArr[k] == 0; - } + auto iterFinish = std::chrono::system_clock::now(); + std::chrono::duration iterationTime = iterFinish - lastIteration; + std::chrono::duration elapsedTime = iterFinish - startTime; + lastIteration = iterFinish; + uint64_t numSearched = offset + WORK_UNIT_SIZE * GPU_COUNT - OFFSET; + double speed = numSearched / elapsedTime.count() / 1000000; + double progress = (double)numSearched / (double)TOTAL_WORK_SIZE * 100.0; + double estimatedTime = (double)(TOTAL_WORK_SIZE - numSearched) / speed / 1000000; + uint64_t curCount = count; + char suffix = 's'; + if (estimatedTime >= 3600) { + suffix = 'h'; + estimatedTime /= 3600.0; + } else if (estimatedTime >= 60) { + suffix = 'm'; + estimatedTime /= 60.0; } - - if (workSize != 0) { - - for (int j = 0; j < threadCount; ++j) { - count += counts[j]; - } - double iterationTime = (double)(clock() - lastIteration) / CLOCKS_PER_SEC; - double timeElapsed = (double)(clock() - startTime) / CLOCKS_PER_SEC; - lastIteration = clock(); - ulong numSearched = offset + WORK_UNIT_SIZE * GPU_COUNT - OFFSET; - double speed = numSearched / timeElapsed / 1000000; - double progress = (double)numSearched / (double)TOTAL_WORK_SIZE * 100.0; - double estimatedTime = (double)(TOTAL_WORK_SIZE - numSearched) / speed / 1000000; - char suffix = 's'; - if (estimatedTime >= 3600) { - suffix = 'h'; - estimatedTime /= 3600.0; - } else if (estimatedTime >= 60) { - suffix = 'm'; - estimatedTime /= 60.0; - } - if (progress >= 100.0) { - estimatedTime = 0.0; - suffix = 's'; - } - printf("Searched: %ld seeds. Found: %ld matches. Uptime: %.1fs. Speed: %.2fm seeds/s. Completion: %.3f%%. ETA: %.1f%c.\n", numSearched, count, timeElapsed, speed, progress, estimatedTime, suffix); - count = 0; + if (progress >= 100.0) { + estimatedTime = 0.0; + suffix = 's'; } + std::cout << "Searched: " << std::setw(13) << numSearched << " seeds. Found: " << std::setw(13) << count.load() << " matches. Uptime: " << + std::fixed << std::setprecision(1) << elapsedTime.count() << "s. Speed: " << std::fixed << + std::setprecision(2) << speed << "m seeds/s. Completion: " << std::setprecision(2) << progress << + "%. ETA: " << std::fixed << std::setprecision(2) << estimatedTime << suffix << ".\n"; } // Last batch to do - FILE* out_file = fopen("chunk_seeds_last.txt", "w"); - - for (int j = 0; j < arraySize; ++j) { + for (int32_t j = 0; j < arraySize; ++j) { if (generator::ChunkGenerator::populate(tempStorage[j], X_TRANSLATE + 16)) { fprintf(out_file, "%lld\n", tempStorage[j]); count++; @@ -467,5 +411,4 @@ int main(int argc, char *argv[]) { fflush(out_file); free(tempStorage); fclose(out_file); - } diff --git a/random.cpp b/random.cpp index 5b0feb0..f1d9ceb 100644 --- a/random.cpp +++ b/random.cpp @@ -1,91 +1,109 @@ -#include -#include "random.h" - -random_math::LCG::LCG(int64_t multiplier, int64_t addend, int64_t modulo, bool maskable) - : multiplier(multiplier), addend(addend), modulo(modulo), maskable(maskable) -{} - -int64_t random_math::LCG::next(int64_t seed) -{ - if (this->maskable) - return (seed * this->multiplier + this->addend) & (this->modulo - 1); - return (seed * this->multiplier + this->addend) % this->modulo; -} - -random_math::LCG random_math::LCG::combine(int64_t steps) -{ - int64_t mul = 1; - int64_t add = 0; - - int64_t intermediateMultiplier = this->multiplier; - int64_t intermediateAddend = this->addend; - - for (ulong i = steps; i != 0; i >>= 1) { - if ((i & 1) != 0) { - mul *= intermediateMultiplier; - add = intermediateMultiplier * add + intermediateAddend; - } - - intermediateAddend = (intermediateMultiplier + 1) * intermediateAddend; - intermediateMultiplier *= intermediateMultiplier; - } - - if (maskable) { - mul &= this->modulo - 1; - add &= this->modulo - 1; - } else { - mul %= this->modulo; - add %= this->modulo; - } - - return LCG(mul, add, this->modulo); -} - -random_math::LCG random_math::JavaRand::lcg = LCG(0x5DEECE66DULL, 0xBULL, 1ULL << 48); - -void random_math::JavaRand::init() -{ - random_math::JavaRand::lcg = LCG(0x5DEECE66DULL, 0xBULL, 1ULL << 48); -} - -random_math::JavaRand::JavaRand(long seed, bool scramble) - : seed(0) -{ - this->setSeed(seed, scramble); -} - -uint64_t random_math::JavaRand::getSeed() -{ - return this->seed; -} - -void random_math::JavaRand::setSeed(int64_t seed, bool scramble) -{ - this->seed = seed ^ (scramble ? lcg.multiplier : 0ULL); - this->seed &= lcg.modulo - 1; -} - -uint32_t random_math::JavaRand::next(int32_t bits) -{ - this->seed = lcg.next(this->seed); - return (uint32_t) (((uint64_t)this->seed) >> (48 - bits)); -} - -uint32_t random_math::JavaRand::nextInt(int32_t bound) -{ - if (bound <= 0) { - throw std::invalid_argument("Bound must be positive"); - } - - if ((bound & -bound) == bound) { - return (int32_t) ((bound * (int64_t) this->next(31)) >> 31); - } - - int32_t bits, value; - - do { - bits = this->next(31); - value = bits % bound; - } while (bits - value + (bound - 1) < 0); - return value; -} +#include +#include "random.h" + +random_math::LCG::LCG(int64_t multiplier, int64_t addend, int64_t modulo, bool maskable) + : multiplier(multiplier), addend(addend), modulo(modulo), maskable(maskable) +{} + +int64_t random_math::LCG::next(int64_t seed) +{ + if (this->maskable) + return (seed * this->multiplier + this->addend) & (this->modulo - 1); + return (seed * this->multiplier + this->addend) % this->modulo; +} + +int64_t random_math::LCG::nextMaskableUnchecked(int64_t seed) +{ + return (seed * this->multiplier + this->addend) & (this->modulo - 1); +} + +random_math::LCG random_math::LCG::combine(int64_t steps) +{ + int64_t mul = 1; + int64_t add = 0; + + int64_t intermediateMultiplier = this->multiplier; + int64_t intermediateAddend = this->addend; + + for (ulong i = steps; i != 0; i >>= 1) { + if ((i & 1) != 0) { + mul *= intermediateMultiplier; + add = intermediateMultiplier * add + intermediateAddend; + } + + intermediateAddend = (intermediateMultiplier + 1) * intermediateAddend; + intermediateMultiplier *= intermediateMultiplier; + } + + if (maskable) { + mul &= this->modulo - 1; + add &= this->modulo - 1; + } else { + mul %= this->modulo; + add %= this->modulo; + } + + return LCG(mul, add, this->modulo); +} + +random_math::LCG random_math::JavaRand::lcg = LCG(0x5DEECE66DULL, 0xBULL, 1ULL << 48); + +void random_math::JavaRand::init() +{ + random_math::JavaRand::lcg = LCG(0x5DEECE66DULL, 0xBULL, 1ULL << 48); +} + +random_math::JavaRand::JavaRand(long seed, bool scramble) + : seed(0) +{ + this->setSeed(seed, scramble); +} + +uint64_t random_math::JavaRand::getSeed() +{ + return this->seed; +} + +void random_math::JavaRand::advance(random_math::LCG lcg) +{ + this->seed = lcg.next(this->seed); +} + +void random_math::JavaRand::setSeed(int64_t seed, bool scramble) +{ + this->seed = seed ^ (scramble ? lcg.multiplier : 0ULL); + this->seed &= lcg.modulo - 1; +} + +void random_math::JavaRand::ignoreNext() { + this->seed = lcg.nextMaskableUnchecked(this->seed); +} + +uint32_t random_math::JavaRand::next(int32_t bits) +{ + this->seed = lcg.nextMaskableUnchecked(this->seed); + return (uint32_t) (((uint64_t)this->seed) >> (48 - bits)); +} + +uint32_t random_math::JavaRand::nextInt(int32_t bound) +{ + if (bound <= 0) { + abort(); + } + + if ((bound & -bound) == bound) { + return (int32_t) ((bound * (int64_t) this->next(31)) >> 31); + } + + int32_t bits, value; + + do { + bits = this->next(31); + value = bits % bound; + } while (bits - value + (bound - 1) < 0); + return value; +} + +uint32_t random_math::JavaRand::nextIntPow2Unchecked(int32_t bound) { + return (int32_t) ((bound * (int64_t) this->next(31)) >> 31); +} diff --git a/random.h b/random.h index 07787bb..6a8c1c8 100644 --- a/random.h +++ b/random.h @@ -1,46 +1,51 @@ -#ifndef SEED_TESTER_RANDOM_H -#define SEED_TESTER_RANDOM_H - -#include -#include - -namespace random_math -{ - - class LCG { - public: - int64_t multiplier; - int64_t addend; - int64_t modulo; - bool maskable; - - LCG(int64_t multiplier, int64_t addend, int64_t modulo, bool maskable = true); - - int64_t next(int64_t seed); - - LCG combine(int64_t steps); - - }; - - class JavaRand { - private: - uint64_t seed; - public: - static LCG lcg; - static void init(); - - explicit JavaRand(long seed, bool scramble = true); - uint64_t getSeed(); - void setSeed(int64_t seed, bool scramble = true); - uint32_t next(int32_t bits); - uint32_t nextInt(int32_t bound); - }; - -} - - - - - - -#endif //SEED_TESTER_RANDOM_H +#ifndef SEED_TESTER_RANDOM_H +#define SEED_TESTER_RANDOM_H +#define ulong uint64_t +#include +#include + +namespace random_math +{ + + class LCG { + public: + int64_t multiplier; + int64_t addend; + int64_t modulo; + bool maskable; + + LCG(int64_t multiplier, int64_t addend, int64_t modulo, bool maskable = true); + + int64_t next(int64_t seed); + int64_t nextMaskableUnchecked(int64_t seed); + + LCG combine(int64_t steps); + + }; + + class JavaRand { + private: + uint64_t seed; + public: + static LCG lcg; + static void init(); + + explicit JavaRand(long seed, bool scramble = true); + uint64_t getSeed(); + void setSeed(int64_t seed, bool scramble = true); + void advance(LCG lcg); + void ignoreNext(); + uint32_t next(int32_t bits); + uint32_t nextInt(int32_t bound); + /* bound must be a positive power of 2 */ + uint32_t nextIntPow2Unchecked(int32_t bound); + }; + +} + + + + + + +#endif //SEED_TESTER_RANDOM_H