diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..567609b
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1 @@
+build/
diff --git a/PLA-SeedFinder/CMakeLists.txt b/PLA-SeedFinder/CMakeLists.txt
index ca9a256..9c04d27 100644
--- a/PLA-SeedFinder/CMakeLists.txt
+++ b/PLA-SeedFinder/CMakeLists.txt
@@ -16,31 +16,31 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON)
 set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
 #set(CMAKE_VERBOSE_MAKEFILE ON)
 
-set(CMAKE_AUTOMOC ON)
-set(CMAKE_AUTORCC ON)
-set(CMAKE_AUTOUIC ON)
-
-add_custom_target(build-time-make-directory ALL
-    COMMAND ${CMAKE_COMMAND} -E make_directory Assembly/)
-
 #Find threads library
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
+#Detect the CPU family: x86 (32- or 64-bit) gets the SSE/AVX kernels, anything else gets the Default + aarch64 source list.
+if (CMAKE_SYSTEM_PROCESSOR MATCHES "(x86)|(X86)|(amd64)|(AMD64)|(i[3-6]86)")
+    set (X86 TRUE)
+else ()
+    set (X86 FALSE)
+endif ()
 
 #add current directory to find tesseractPA.lib
 #link_directories(${CMAKE_CURRENT_LIST_DIR})
 
-file(GLOB MAIN_SOURCES
+if (X86)
+    file(GLOB MAIN_SOURCES
     Source/Compiler.h
-    Source/CpuId.cpp
+    Source/CpuFeatures.cpp
     Source/CpuFeatures.h
     Source/DynamicParallelizer.cpp
     Source/DynamicParallelizer.h
     Source/Kernels/SeedScan_Default.cpp
-    Source/Kernels/SeedScan_x64_AVX2.cpp
-    Source/Kernels/SeedScan_x64_AVX512.cpp
-    Source/Kernels/SeedScan_x64_SSE41.cpp
+    Source/Kernels/SeedScan_x86_AVX2.cpp
+    Source/Kernels/SeedScan_x86_AVX512.cpp
+    Source/Kernels/SeedScan_x86_SSE41.cpp
     Source/Kernels/XoroShiro1_Default.h
     Source/Kernels/XoroShiro2_SSE2.h
     Source/Kernels/XoroShiro4_AVX2.h
@@ -54,8 +54,25 @@ file(GLOB MAIN_SOURCES
     Source/SeedScan.h
     Source/Tools.cpp
     Source/Tools.h
-)
-add_executable(PLA-SeedFinder WIN32 ${MAIN_SOURCES})
+    )
+else ()
+    file(GLOB MAIN_SOURCES
+        Source/DynamicParallelizer.cpp
+        Source/Kernels/SeedScan_Default.cpp
+        Source/Kernels/SeedScan_aarch64.cpp
+        Source/Main.cpp
+        Source/PLA-SeedFinder.cpp
+        Source/ReportCandidates.cpp
+        Source/SeedScan.cpp
+        Source/Tools.cpp
+    )
+endif ()
+
+if (WIN32)
+    add_executable(PLA-SeedFinder WIN32 ${MAIN_SOURCES})
+else ()
+    add_executable(PLA-SeedFinder ${MAIN_SOURCES})
+endif ()
 set_target_properties(PLA-SeedFinder PROPERTIES LINKER_LANGUAGE CXX)
 target_link_libraries(PLA-SeedFinder)
 target_link_libraries(PLA-SeedFinder Threads::Threads)
@@ -68,6 +85,9 @@ target_include_directories(PLA-SeedFinder PRIVATE Source/)
 
 #enable MP with MSVC (Build with Multiple Processes)
 if (MSVC)
+    add_custom_target(build-time-make-directory ALL
+        COMMAND ${CMAKE_COMMAND} -E make_directory Assembly/)
+
     target_compile_options(PLA-SeedFinder PRIVATE /FAs /FaAssembly/ /MP /W4)
     target_compile_options(PLA-SeedFinder PRIVATE /wd5054) # Deprecated enum arithemtic
     target_compile_options(PLA-SeedFinder PRIVATE /wd4505) # unreferenced local function has been removed
@@ -82,50 +102,49 @@ if (MSVC)
     target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_17_Skylake)
 else()
-    target_compile_options(PLA-SeedFinder PRIVATE -msse4.2)
-
-    target_compile_options(PLA-SeedFinder PRIVATE -Wall -Wpedantic -DPA_STATIC)
-
-    set(ARCH_FLAGS_09_Nehalem -march=nehalem)
-    set(ARCH_FLAGS_13_Haswell -march=haswell)
-    set(ARCH_FLAGS_17_Skylake -march=skylake-avx512)
-
-    # Run-time ISA dispatching
-    target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_08_Nehalem)
-    target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_13_Haswell)
-    target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_17_Skylake)
-endif()
-
-
-
-# Run-time CPU dispatching.
-if (ARCH_FLAGS_09_Nehalem)
-SET_SOURCE_FILES_PROPERTIES(
-    Source/Kernels/SeedScan_x64_SSE41.cpp
-    PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_09_Nehalem}
-)
-endif()
-if (ARCH_FLAGS_13_Haswell)
-SET_SOURCE_FILES_PROPERTIES(
-    Source/Kernels/SeedScan_x64_AVX2.cpp
-    PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_13_Haswell}
-)
+    target_compile_options(PLA-SeedFinder PRIVATE -Wall -Wpedantic -O2 -DPA_STATIC) # PA_STATIC keeps PA_EXPORT off the dllimport branch on non-MSVC Windows toolchains
+    if (X86)
+        target_compile_options(PLA-SeedFinder PRIVATE -msse4.2)
+
+        set(ARCH_FLAGS_09_Nehalem -march=nehalem)
+        set(ARCH_FLAGS_13_Haswell -march=haswell)
+        set(ARCH_FLAGS_17_Skylake -march=skylake-avx512)
+
+        # Run-time ISA dispatching
+        target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_08_Nehalem)
+        target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_13_Haswell)
+        target_compile_definitions(PLA-SeedFinder PRIVATE PA_AutoDispatch_17_Skylake)
+    endif ()
 endif()
-if (ARCH_FLAGS_17_Skylake)
-SET_SOURCE_FILES_PROPERTIES(
-    Source/Kernels/SeedScan_x64_AVX512.cpp
-    PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_17_Skylake}
-)
-endif()
-
-
-
-
-#copy needed dlls
-#file(COPY *.dll DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
-file(GLOB MY_DLLS
-    "*.dll"
-)
-file(COPY ${MY_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+if(X86)
+    # Run-time CPU dispatching: each kernel file is compiled with its own -march flag (set only on the non-MSVC path above).
+    if (ARCH_FLAGS_09_Nehalem)
+        set_source_files_properties(
+            Source/Kernels/SeedScan_x86_SSE41.cpp
+            PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_09_Nehalem}
+        )
+    endif()
+    if (ARCH_FLAGS_13_Haswell)
+        set_source_files_properties(
+            Source/Kernels/SeedScan_x86_AVX2.cpp
+            PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_13_Haswell}
+        )
+    endif()
+    if (ARCH_FLAGS_17_Skylake)
+        set_source_files_properties(
+            Source/Kernels/SeedScan_x86_AVX512.cpp
+            PROPERTIES COMPILE_FLAGS ${ARCH_FLAGS_17_Skylake}
+        )
+    endif()
+endif()
+
+if (WIN32)
+    #copy needed dlls
+    #file(COPY *.dll DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+    file(GLOB MY_DLLS
+        "*.dll"
+    )
+    file(COPY ${MY_DLLS} DESTINATION ${CMAKE_CURRENT_BINARY_DIR})
+endif()
diff --git a/PLA-SeedFinder/Source/Kernels/SeedScan_aarch64.cpp b/PLA-SeedFinder/Source/Kernels/SeedScan_aarch64.cpp
new file mode 100644
index 0000000..db60a3a
--- /dev/null
+++ b/PLA-SeedFinder/Source/Kernels/SeedScan_aarch64.cpp
@@ -0,0 +1,142 @@
+#include <cstddef>
+#include <cstdint>
+#include "XoroShiro_aarch64.h"
+
+namespace PokemonAutomation{
+
+#if defined __aarch64__ && defined __APPLE__
+
+//  NEON seed-scan kernels built on the Apple <simd/simd.h> vector types.
+//
+//  Every kernel tests the `iterations` consecutive candidates
+//      start_seed + k * 0x100000000,   k = 0 .. iterations - 1,
+//  one candidate per SIMD lane. The dispatcher in SeedScan.cpp depends on exactly
+//  this range: it advances start_seed by block * 0x100000000 after each call.
+//
+//  Preconditions (not checked here): `iterations` is a non-zero multiple of the
+//  lane count and, for the "common" kernels, rolls >= 1. The do/while counters
+//  below would wrap around on zero.
+
+//  4 lanes. Advances each generator once, then returns true as soon as one of the
+//  following `rolls` outputs (low 32 bits) equals desired_pid in any lane.
+bool seed_scan_common_unroll4_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
+    simd_ulong4 t = simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid);
+    //  Lane i starts at start_seed + i * 2^32; every pass advances all lanes by 4 * 2^32.
+    simd_ulong4 seed = simd_make_ulong4(start_seed, start_seed+0x100000000, start_seed+0x200000000, start_seed+0x300000000);
+    simd_ulong4 delta = simd_make_ulong4(0x400000000, 0x400000000, 0x400000000, 0x400000000);
+    simd_ulong4 pid;
+    iterations /= 4;
+
+    do {
+        XoroShiro4 rng(seed);
+        rng.next();
+        size_t lc = rolls;
+        do {
+            rng.next();
+            pid = rng.get_masked();
+            //  The XOR is zero only in a matching lane, so a zero minimum means a hit.
+            if (simd_reduce_min(pid ^ t) == 0) {
+                return true;
+            }
+        } while (--lc);
+        seed += delta;
+    } while (--iterations);
+    return false;
+}
+
+//  4 lanes. Tests rolls + 1 successive outputs, starting with the one produced by
+//  the first next(). Returns true if any lane yields 0xffffffff, or equals
+//  desired_pid once both sides are masked with 0xefffffff (bit 28 is ignored).
+bool seed_scan_thorough_unroll4_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
+    desired_pid &= 0xefffffff;
+    simd_ulong4 t = simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid);
+    //  Lane i starts at start_seed + i * 2^32; every pass advances all lanes by 4 * 2^32.
+    simd_ulong4 seed = simd_make_ulong4(start_seed, start_seed+0x100000000, start_seed+0x200000000, start_seed+0x300000000);
+    simd_ulong4 delta = simd_make_ulong4(0x400000000, 0x400000000, 0x400000000, 0x400000000);
+    simd_ulong4 mask = simd_make_ulong4(0xefffffff, 0xefffffff, 0xefffffff, 0xefffffff);
+    simd_ulong4 pid;
+    iterations /= 4;
+
+    do {
+        XoroShiro4 rng(seed);
+        size_t lc = rolls + 1;
+        do {
+            rng.next();
+            pid = rng.get_masked();
+            if (simd_reduce_max(pid) == UINT32_MAX) {
+                return true;
+            }
+            pid &= mask;
+            if (simd_reduce_min(pid ^ t) == 0) {
+                return true;
+            }
+        } while (--lc);
+        seed += delta;
+    } while (--iterations);
+    return false;
+}
+
+//  8-lane variant of seed_scan_common_unroll4_NEON.
+bool seed_scan_common_unroll8_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
+    simd_ulong8 t = simd_make_ulong8(simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid),
+                                     simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid));
+    //  Lane i starts at start_seed + i * 2^32; every pass advances all lanes by 8 * 2^32.
+    simd_ulong8 seed = simd_make_ulong8(simd_make_ulong4(start_seed, start_seed+0x100000000, start_seed+0x200000000, start_seed+0x300000000),
+                                        simd_make_ulong4(start_seed+0x400000000, start_seed+0x500000000, start_seed+0x600000000, start_seed+0x700000000));
+    simd_ulong8 delta = simd_make_ulong8(simd_make_ulong4(0x800000000, 0x800000000, 0x800000000, 0x800000000),
+                                         simd_make_ulong4(0x800000000, 0x800000000, 0x800000000, 0x800000000));
+    simd_ulong8 pid;
+    iterations /= 8;
+
+    do {
+        XoroShiro8 rng(seed);
+        rng.next();
+        size_t lc = rolls;
+        do {
+            rng.next();
+            pid = rng.get_masked();
+            if (simd_reduce_min(pid ^ t) == 0) {
+                return true;
+            }
+        } while (--lc);
+        seed += delta;
+    } while (--iterations);
+    return false;
+}
+
+//  8-lane variant of seed_scan_thorough_unroll4_NEON.
+bool seed_scan_thorough_unroll8_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations){
+    desired_pid &= 0xefffffff;
+    simd_ulong8 t = simd_make_ulong8(simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid),
+                                     simd_make_ulong4((uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid, (uint64_t)desired_pid));
+    //  Lane i starts at start_seed + i * 2^32; every pass advances all lanes by 8 * 2^32.
+    simd_ulong8 seed = simd_make_ulong8(simd_make_ulong4(start_seed, start_seed+0x100000000, start_seed+0x200000000, start_seed+0x300000000),
+                                        simd_make_ulong4(start_seed+0x400000000, start_seed+0x500000000, start_seed+0x600000000, start_seed+0x700000000));
+    simd_ulong8 delta = simd_make_ulong8(simd_make_ulong4(0x800000000, 0x800000000, 0x800000000, 0x800000000),
+                                         simd_make_ulong4(0x800000000, 0x800000000, 0x800000000, 0x800000000));
+    simd_ulong8 u1 = simd_make_ulong8(simd_make_ulong4(0xefffffff, 0xefffffff, 0xefffffff, 0xefffffff),
+                                      simd_make_ulong4(0xefffffff, 0xefffffff, 0xefffffff, 0xefffffff));
+    simd_ulong8 pid;
+    iterations /= 8;
+
+    do {
+        XoroShiro8 rng(seed);
+        size_t lc = rolls + 1;
+        do {
+            rng.next();
+            pid = rng.get_masked();
+            if (simd_reduce_max(pid) == UINT32_MAX) {
+                return true;
+            }
+            pid &= u1;
+            if (simd_reduce_min(pid ^ t) == 0) {
+                return true;
+            }
+        } while (--lc);
+        seed += delta;
+    } while (--iterations);
+    return false;
+}
+
+#endif
+}
diff --git a/PLA-SeedFinder/Source/Kernels/XoroShiro1_Default.h b/PLA-SeedFinder/Source/Kernels/XoroShiro1_Default.h
index 50cb6b8..7952ae0 100644
--- a/PLA-SeedFinder/Source/Kernels/XoroShiro1_Default.h
+++ b/PLA-SeedFinder/Source/Kernels/XoroShiro1_Default.h
@@ -18,11 +18,7 @@ namespace PokemonAutomation{
 
 class XoroShiroX1_Default{
 public:
-    PA_FORCE_INLINE XoroShiroX1_Default(uint64_t seed){
-        state[0] = seed;
-        state[1] = 0x82A2B175229D6A5B;
-    }
-
+    PA_FORCE_INLINE XoroShiroX1_Default(uint64_t seed) : state {seed, 0x82A2B175229D6A5B} {}
     PA_FORCE_INLINE uint64_t get_int64(){
         return state[0] + state[1];
     }
diff --git a/PLA-SeedFinder/Source/Kernels/XoroShiro_aarch64.h
b/PLA-SeedFinder/Source/Kernels/XoroShiro_aarch64.h
new file mode 100644
index 0000000..16dee6d
--- /dev/null
+++ b/PLA-SeedFinder/Source/Kernels/XoroShiro_aarch64.h
@@ -0,0 +1,69 @@
+#pragma once
+
+#if defined __aarch64__ && defined __APPLE__
+#include <simd/simd.h>
+#include "Compiler.h"
+
+//  Initial value of the second state word of every generator below.
+#define MAGIC_NUMBER 0x82A2B175229D6A5B
+
+namespace PokemonAutomation{
+
+static const simd_ulong4 magic_ulong4 = simd_make_ulong4(MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER, MAGIC_NUMBER);
+//  Per-lane masks that keep only the low 32 bits of each 64-bit lane.
+static const simd_ulong4 mask_ulong4 = simd_make_ulong4(UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX);
+static const simd_ulong8 mask_ulong8 = simd_make_ulong8(simd_make_ulong4(UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX),
+                                                        simd_make_ulong4(UINT32_MAX, UINT32_MAX, UINT32_MAX, UINT32_MAX));
+
+//  Four xoroshiro128+ generators stepped in lockstep, one per 64-bit lane.
+class XoroShiro4{
+public:
+    //  Each lane starts from the pair (its seed, MAGIC_NUMBER).
+    PA_FORCE_INLINE XoroShiro4(simd_ulong4 seed)
+        : state {seed, magic_ulong4} {}
+    //  64-bit output of every lane: the sum of its two state words.
+    PA_FORCE_INLINE simd_ulong4 get(){ return state[0] + state[1]; }
+    //  Same as get(), with each lane reduced to its low 32 bits.
+    PA_FORCE_INLINE simd_ulong4 get_masked(){ return (state[0] + state[1]) & mask_ulong4; }
+    //  Advances every lane by one step.
+    PA_FORCE_INLINE void next(){
+        simd_ulong4 s0 = state[0];
+        simd_ulong4 s1 = state[1];
+        s1 = s1 ^ s0;
+        s0 = (s0 << 24) | (s0 >> 40);       //  rotate left by 24
+        state[1] = (s1 << 37) | (s1 >> 27); //  rotate left by 37
+        s1 = s1 ^ (s1 << 16);
+        state[0] = s0 ^ s1;
+    }
+
+private:
+    simd_ulong4 state[2];
+};
+
+
+//  Eight-lane counterpart of XoroShiro4: same interface, on simd_ulong8.
+class XoroShiro8{
+public:
+    PA_FORCE_INLINE XoroShiro8(simd_ulong8 seed)
+        : state {seed, simd_make_ulong8(magic_ulong4, magic_ulong4)} {}
+    PA_FORCE_INLINE simd_ulong8 get(){ return state[0] + state[1]; }
+    PA_FORCE_INLINE simd_ulong8 get_masked(){ return (state[0] + state[1]) & mask_ulong8; }
+    PA_FORCE_INLINE void next(){
+        simd_ulong8 s0 = state[0];
+        simd_ulong8 s1 = state[1];
+        s1 = s1 ^ s0;
+        s0 = (s0 << 24) | (s0 >> 40);       //  rotate left by 24
+        state[1] = (s1 << 37) | (s1 >> 27); //  rotate left by 37
+        s1 = s1 ^ (s1 << 16);
+        state[0] = s0 ^ s1;
+    }
+
+private:
+    simd_ulong8 state[2];
+};
+
+
+}
+
+#undef MAGIC_NUMBER
+#endif
diff --git a/PLA-SeedFinder/Source/Main.cpp b/PLA-SeedFinder/Source/Main.cpp
index
70f39c7..8cec426 100644
--- a/PLA-SeedFinder/Source/Main.cpp
+++ b/PLA-SeedFinder/Source/Main.cpp
@@ -50,7 +50,6 @@ void test(){
 }
 #endif
 
-#include
 
 int main(){
 
diff --git a/PLA-SeedFinder/Source/PLA-SeedFinder.cpp b/PLA-SeedFinder/Source/PLA-SeedFinder.cpp
index b6ad35f..64d3cf8 100644
--- a/PLA-SeedFinder/Source/PLA-SeedFinder.cpp
+++ b/PLA-SeedFinder/Source/PLA-SeedFinder.cpp
@@ -56,8 +56,6 @@ int32_t pa_PLA_find_seeds_threads(
         << stats.ivs[4] << " "
         << stats.ivs[5] << std::endl;
 
-    EcPidMatchReporter reporter(stats);
-
     std::cout << std::endl;
     print_isa();
     std::cout << "Threads: " << threads << std::endl;
diff --git a/PLA-SeedFinder/Source/PLA-SeedFinder.h b/PLA-SeedFinder/Source/PLA-SeedFinder.h
index b172778..037094a 100644
--- a/PLA-SeedFinder/Source/PLA-SeedFinder.h
+++ b/PLA-SeedFinder/Source/PLA-SeedFinder.h
@@ -6,6 +6,7 @@
 
 #include
 
+#if defined(_WIN32)  // the __declspec import/export decoration is only used on Windows
 #if defined _WINDLL
 #define PA_EXPORT __declspec(dllexport)
 #elif defined PA_STATIC
@@ -13,6 +14,9 @@
 #else
 #define PA_EXPORT __declspec(dllimport)
 #endif
+#else  // not Windows: PA_EXPORT expands to nothing
+#define PA_EXPORT
+#endif  // _WIN32
 
 #ifdef __cplusplus
 extern "C" {
diff --git a/PLA-SeedFinder/Source/SeedScan.cpp b/PLA-SeedFinder/Source/SeedScan.cpp
index d394332..2978c38 100644
--- a/PLA-SeedFinder/Source/SeedScan.cpp
+++ b/PLA-SeedFinder/Source/SeedScan.cpp
@@ -30,9 +30,19 @@ bool seed_scan_thorough_unroll4_SSE41(size_t rolls, uint32_t desired_pid, uint64
 bool seed_scan_thorough_unroll8_AVX2(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations);
 bool seed_scan_thorough_unroll16_AVX512(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations);
 
+bool seed_scan_common_unroll4_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations);  // NEON kernels, defined in Kernels/SeedScan_aarch64.cpp
+bool seed_scan_thorough_unroll4_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations);
+bool seed_scan_common_unroll8_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations);
+bool seed_scan_thorough_unroll8_NEON(size_t rolls, uint32_t desired_pid, uint64_t start_seed, uint64_t iterations);
 
 
 void print_isa(){
+#ifdef __aarch64__  // aarch64 build: the CPU_CAPABILITY checks in the #else branch are compiled out
+    #if defined __APPLE__  // together with __aarch64__, the same condition that guards the NEON dispatch in seed_scan_common / seed_scan_thorough
+    cout << "Instruction Set: NEON" << endl;
+    return;
+    #endif
+#else  // not aarch64: report the x86 instruction set
 #if !_MSC_VER || _WIN64
     if (CPU_CAPABILITY.OS_AVX512 && CPU_CAPABILITY.HW_AVX512_DQ){
         cout << "Instruction Set: AVX512" << endl;
@@ -47,6 +57,7 @@ void print_isa(){
         cout << "Instruction Set: SSE4.1" << endl;
         return;
     }
+#endif  // __aarch64__
     cout << "Instruction Set: Default" << endl;
 }
 
@@ -86,7 +97,14 @@ bool seed_scan_common(size_t rolls, uint32_t desired_pid, uint64_t start_seed, u
         iterations -= block;
     }
 #endif
-
+#if defined __aarch64__ && defined __APPLE__
+    uint64_t block = iterations / 8 * 8;  // round down to a multiple of 8 for the unroll8 kernel
+    if (block > 0 && seed_scan_common_unroll8_NEON(rolls, desired_pid, start_seed, block)){
+        return true;
+    }
+    start_seed += block * 0x100000000;  // step past the candidates handed to the kernel
+    iterations -= block;  // the remainder (fewer than 8) falls through to the Default kernel
+#endif
     if (iterations > 0){
         return seed_scan_common_Default(rolls, desired_pid, start_seed, iterations);
     }
@@ -126,6 +144,14 @@ bool seed_scan_thorough(size_t rolls, uint32_t desired_pid, uint64_t start_seed,
         iterations -= block;
     }
 #endif
+#if defined __aarch64__ && defined __APPLE__
+    uint64_t block = iterations / 8 * 8;  // round down to a multiple of 8 for the unroll8 kernel
+    if (block > 0 && seed_scan_thorough_unroll8_NEON(rolls, desired_pid, start_seed, block)){
+        return true;
+    }
+    start_seed += block * 0x100000000;  // step past the candidates handed to the kernel
+    iterations -= block;  // the remainder (fewer than 8) falls through to the Default kernel
+#endif
     if (iterations > 0){
         return seed_scan_thorough_Default(rolls, desired_pid, start_seed, iterations);