Skip to content
This repository has been archived by the owner on Oct 7, 2024. It is now read-only.

Fix and improve the OpenCL implementation #13

Merged
merged 1 commit into from
Nov 6, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion cmake/Hunter/config.cmake
Original file line number Diff line number Diff line change
@@ -1,2 +1,7 @@
hunter_config(CURL VERSION ${HUNTER_CURL_VERSION} CMAKE_ARGS HTTP_ONLY=ON CMAKE_USE_OPENSSL=OFF CMAKE_USE_LIBSSH2=OFF CURL_CA_PATH=none)
hunter_config(Boost VERSION 1.66.0)
hunter_config(Boost VERSION 1.70.0-p0)

hunter_config(OpenCL VERSION
URL https://github.com/KhronosGroup/OpenCL-SDK/releases/download/v2023.04.17/OpenCL-SDK-v2023.04.17-Source.tar.gz
SHA1 aca203982e9f1cdbe71ed93ae7e0c217b1d93a37
)
4,903 changes: 3,511 additions & 1,392 deletions libethash-cl/CL/cl2.hpp → libethash-cl/CL/opencl.hpp

Large diffs are not rendered by default.

37 changes: 19 additions & 18 deletions libethash-cl/CLMiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -695,6 +695,16 @@ bool CLMiner::initDevice()
<< m_settings.globalWorkSize / m_settings.localWorkSize;
}

#ifndef __clang__
// Nvidia
if (!m_deviceDescriptor.clNvCompute.empty())
{
m_computeCapability =
m_deviceDescriptor.clNvComputeMajor * 10 + m_deviceDescriptor.clNvComputeMinor;
int maxregs = m_computeCapability >= 35 ? 72 : 63;
sprintf(m_options, "-cl-nv-maxrregcount=%d", maxregs);
}
#endif

return true;

Expand Down Expand Up @@ -726,18 +736,6 @@ bool CLMiner::initEpoch_internal()
try
{
char options[256] = {0};
#ifndef __clang__

// Nvidia
if (!m_deviceDescriptor.clNvCompute.empty())
{
m_computeCapability =
m_deviceDescriptor.clNvComputeMajor * 10 + m_deviceDescriptor.clNvComputeMinor;
int maxregs = m_computeCapability >= 35 ? 72 : 63;
sprintf(m_options, "-cl-nv-maxrregcount=%d", maxregs);
}

#endif

m_dagItems = m_epochContext.dagNumItems;

Expand Down Expand Up @@ -836,9 +834,15 @@ bool CLMiner::initEpoch_internal()
// GPU DAG buffer to kernel
m_searchKernel.setArg(2, *m_dag);

m_dagKernel.setArg(1, *m_light);
m_dagKernel.setArg(2, *m_dag);
uint32_t light_words4[4];
ProgPow::calculate_fast_mod_data(m_epochContext.lightNumItems, light_words4[0], light_words4[1], light_words4[2]);
light_words4[3] = m_epochContext.lightNumItems;

m_dagKernel.setArg(1, m_light[0]);
m_dagKernel.setArg(2, m_dag[0]);
m_dagKernel.setArg(3, -1);
m_dagKernel.setArg(4, (uint32_t)(m_epochContext.dagSize / sizeof(ethash_hash512)));
m_dagKernel.setArg(5, light_words4);

const uint32_t workItems = m_dagItems * 2; // GPU computes partial 512-bit DAG items.

Expand Down Expand Up @@ -889,13 +893,10 @@ void CLMiner::asyncCompile()

void CLMiner::compileKernel(uint64_t period_seed, cl::Program& program, cl::Kernel& searchKernel)
{
std::string code = ProgPow::getKern(period_seed, ProgPow::KERNEL_CL);
code += string(CLMiner_kernel);
std::string code = ProgPow::getKern(CLMiner_kernel, period_seed, ProgPow::KERNEL_CL);

addDefinition(code, "GROUP_SIZE", m_settings.localWorkSize);
addDefinition(code, "ACCESSES", 64);
addDefinition(code, "LIGHT_WORDS", m_epochContext.lightNumItems);
addDefinition(code, "PROGPOW_DAG_BYTES", m_epochContext.dagSize);
addDefinition(code, "PROGPOW_DAG_ELEMENTS", m_epochContext.dagNumItems / 2);

addDefinition(code, "MAX_OUTPUTS", c_maxSearchResults);
Expand Down
2 changes: 1 addition & 1 deletion libethash-cl/CLMiner.h
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
#define CL_HPP_CL_1_2_DEFAULT_BUILD true
#define CL_HPP_TARGET_OPENCL_VERSION 120
#define CL_HPP_MINIMUM_OPENCL_VERSION 120
#include "CL/cl2.hpp"
#include "CL/opencl.hpp"
#pragma GCC diagnostic pop

// macOS OpenCL fix:
Expand Down
57 changes: 31 additions & 26 deletions libethash-cl/CLMiner_kernel.cl
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
PROGPOW_REPLACE_HEADER

#define OPENCL_PLATFORM_UNKNOWN 0
#define OPENCL_PLATFORM_NVIDIA 1
#define OPENCL_PLATFORM_AMD 2
Expand All @@ -17,6 +19,8 @@

#define HASHES_PER_GROUP (GROUP_SIZE / PROGPOW_LANES)

#define FNV_PRIME 0x1000193

typedef struct
{
uint32_t uint32s[32 / sizeof(uint32_t)];
Expand Down Expand Up @@ -210,9 +214,10 @@ ethash_search(__global struct SearchResults* restrict g_output, __constant hash3
// initialize mix for all lanes
fill_mix(hash_seed, lane_id, mix);

#pragma unroll 1
for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
progPowLoop(l, mix, g_dag, c_dag, share[0].uint64s, hack_false);
#pragma unroll 2
for (uint32_t loop = 0; loop < PROGPOW_CNT_DAG; loop++) {
PROGPOW_REPLACE_MATH
}

// Reduce mix data to a per-lane 32-bit digest
uint32_t mix_hash = 0x811c9dc5;
Expand Down Expand Up @@ -256,15 +261,9 @@ ethash_search(__global struct SearchResults* restrict g_output, __constant hash3
//


#ifndef LIGHT_WORDS
#define LIGHT_WORDS 262139
#endif

#define ETHASH_DATASET_PARENTS 256
#define NODE_WORDS (64 / 4)

#define FNV_PRIME 0x01000193

__constant uint2 const Keccak_f1600_RC[24] = {
(uint2)(0x00000001, 0x00000000),
(uint2)(0x00008082, 0x00000000),
Expand Down Expand Up @@ -509,27 +508,33 @@ static void SHA3_512(uint2* s, uint isolate)
keccak_f1600_no_absorb(s, 8, isolate);
}

static uint fast_mod(uint a, uint4 d)
{
const ulong t = a;
const uint q = ((t + d.y) * d.x) >> d.z;
return a - q * d.w;
}

__kernel void ethash_calculate_dag_item(
uint start, __global hash64_t const* g_light, __global hash64_t* g_dag, uint isolate)
uint start, __global hash64_t const* g_light, __global hash64_t* g_dag, uint isolate, uint dag_words, uint4 light_words)
{
uint const node_index = start + get_global_id(0);
if (node_index * sizeof(hash64_t) >= PROGPOW_DAG_BYTES)
return;
if (node_index >= dag_words)
return;

hash200_t dag_node;
copy(dag_node.uint4s, g_light[node_index % LIGHT_WORDS].uint4s, 4);
dag_node.words[0] ^= node_index;
SHA3_512(dag_node.uint2s, isolate);
hash200_t dag_node;
copy(dag_node.uint4s, g_light[fast_mod(node_index, light_words)].uint4s, 4);
dag_node.words[0] ^= node_index;
SHA3_512(dag_node.uint2s, isolate);

for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i)
{
uint parent_index = fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]) % LIGHT_WORDS;
for (uint i = 0; i != ETHASH_DATASET_PARENTS; ++i)
{
uint parent_index = fast_mod(fnv(node_index ^ i, dag_node.words[i % NODE_WORDS]), light_words);

for (uint w = 0; w != 4; ++w)
{
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
}
}
SHA3_512(dag_node.uint2s, isolate);
copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
for (uint w = 0; w != 4; ++w)
dag_node.uint4s[w] = fnv4(dag_node.uint4s[w], g_light[parent_index].uint4s[w]);
}

SHA3_512(dag_node.uint2s, isolate);
copy(g_dag[node_index].uint4s, dag_node.uint4s, 4);
}
3 changes: 1 addition & 2 deletions libethash-cuda/CUDAMiner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -363,8 +363,7 @@ void CUDAMiner::compileKernel(uint64_t period_seed, uint64_t dag_elms, CUfunctio

const char* name = "progpow_search";

std::string text = ProgPow::getKern(period_seed, ProgPow::KERNEL_CUDA);
text += std::string(CUDAMiner_kernel);
std::string text = ProgPow::getKern(CUDAMiner_kernel, period_seed, ProgPow::KERNEL_CUDA);

std::string tmpDir;
#ifdef _WIN32
Expand Down
9 changes: 6 additions & 3 deletions libethash-cuda/CUDAMiner_kernel.cu
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
PROGPOW_REPLACE_HEADER

#ifndef MAX_SEARCH_RESULTS
#define MAX_SEARCH_RESULTS 4U
#endif
Expand Down Expand Up @@ -181,9 +183,10 @@ progpow_search(
// initialize mix for all lanes
fill_mix(hash_seed, lane_id, mix);

#pragma unroll 1
for (uint32_t l = 0; l < PROGPOW_CNT_DAG; l++)
progPowLoop(l, mix, g_dag, c_dag, hack_false);
#pragma unroll 2
for (uint32_t loop = 0; loop < PROGPOW_CNT_DAG; loop++) {
PROGPOW_REPLACE_MATH
}


// Reduce mix data to a per-lane 32-bit digest
Expand Down
78 changes: 35 additions & 43 deletions libprogpow/ProgPow.cpp
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
#include "ProgPow.h"

#include <sstream>
#include <regex>

#define rnd() (kiss99(rnd_state))
#define mix_src() ("mix[" + std::to_string(rnd() % PROGPOW_REGS) + "]")
Expand All @@ -14,7 +15,7 @@ inline void swap(uint32_t& a, uint32_t& b)
b = t;
}

std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
std::string ProgPow::getKern(std::string kernel_code, uint64_t prog_seed, kernel_t kern)
{
std::stringstream ret;

Expand Down Expand Up @@ -99,45 +100,16 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
{
ret << "typedef struct __align__(16) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
ret << "\n";
ret << "// Inner loop for prog_seed " << prog_seed << "\n";
ret << "__device__ __forceinline__ void progPowLoop(const uint32_t loop,\n";
ret << " uint32_t mix[PROGPOW_REGS],\n";
ret << " const dag_t *g_dag,\n";
ret << " const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
ret << " const bool hack_false)\n";
}
else
{
ret << "typedef struct __attribute__ ((aligned (16))) {uint32_t s[PROGPOW_DAG_LOADS];} dag_t;\n";
ret << "\n";
ret << "// Inner loop for prog_seed " << prog_seed << "\n";
ret << "inline void progPowLoop(const uint32_t loop,\n";
ret << " volatile uint32_t mix_arg[PROGPOW_REGS],\n";
ret << " __global const dag_t *g_dag,\n";
ret << " __local const uint32_t c_dag[PROGPOW_CACHE_WORDS],\n";
ret << " __local uint64_t share[GROUP_SHARE],\n";
ret << " const bool hack_false)\n";
}
ret << "{\n";
std::string kernel = std::regex_replace(kernel_code, std::regex("PROGPOW_REPLACE_HEADER"), ret.str());
ret.str(std::string());

ret << "dag_t data_dag;\n";
ret << "uint32_t offset, data;\n";
// Work around AMD OpenCL compiler bug
// See https://github.com/gangnamtestnet/ethcoreminer/issues/16
if (kern == KERNEL_CL)
{
ret << "uint32_t mix[PROGPOW_REGS];\n";
ret << "for(uint32_t i=0; i<PROGPOW_REGS; i++)\n";
ret << " mix[i] = mix_arg[i];\n";
}

if (kern == KERNEL_CUDA)
ret << "const uint32_t lane_id = threadIdx.x & (PROGPOW_LANES-1);\n";
else
{
ret << "const uint32_t lane_id = get_local_id(0) & (PROGPOW_LANES-1);\n";
ret << "const uint32_t group_id = get_local_id(0) / PROGPOW_LANES;\n";
}

// Global memory access
// lanes access sequential locations
Expand All @@ -149,13 +121,14 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
else
{
ret << "if(lane_id == (loop % PROGPOW_LANES))\n";
ret << " share[group_id] = mix[0];\n";
ret << " share[0].uint32s[group_id] = mix[0];\n";
ret << "barrier(CLK_LOCAL_MEM_FENCE);\n";
ret << "offset = share[group_id];\n";
ret << "offset = share[0].uint32s[group_id];\n";
}
ret << "offset %= PROGPOW_DAG_ELEMENTS;\n";
ret << "offset = offset * PROGPOW_LANES + (lane_id ^ loop) % PROGPOW_LANES;\n";
ret << "data_dag = g_dag[offset];\n";
ret << "dag_t data_dag = g_dag[offset];\n";

ret << "// hack to prevent compiler from reordering LD and usage\n";
if (kern == KERNEL_CUDA)
ret << "if (hack_false) __threadfence_block();\n";
Expand Down Expand Up @@ -208,16 +181,10 @@ std::string ProgPow::getKern(uint64_t prog_seed, kernel_t kern)
uint32_t r = rnd();
ret << merge(dest, "data_dag.s["+std::to_string(i)+"]", r);
}
// Work around AMD OpenCL compiler bug
if (kern == KERNEL_CL)
{
ret << "for(uint32_t i=0; i<PROGPOW_REGS; i++)\n";
ret << " mix_arg[i] = mix[i];\n";
}
ret << "}\n";
ret << "\n";

return ret.str();
kernel = std::regex_replace(kernel, std::regex("PROGPOW_REPLACE_MATH"), ret.str());
return kernel;
}

// Merge new data from b into the value in a
Expand Down Expand Up @@ -291,3 +258,28 @@ uint32_t ProgPow::kiss99(kiss99_t &st)
st.jcong = 69069 * st.jcong + 1234567;
return ((MWC^st.jcong) + st.jsr);
}

void ProgPow::calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift)
{
if ((divisor & (divisor - 1)) == 0) {
reciprocal = 1;
increment = 0;
shift = 31U - clz(divisor);
}
else {
shift = 63U - clz(divisor);
const uint64_t N = 1ULL << shift;
const uint64_t q = N / divisor;
const uint64_t r = N - q * divisor;
if (r * 2 < divisor)
{
reciprocal = static_cast<uint32_t>(q);
increment = 1;
}
else
{
reciprocal = static_cast<uint32_t>(q + 1);
increment = 0;
}
}
}
14 changes: 13 additions & 1 deletion libprogpow/ProgPow.h
Original file line number Diff line number Diff line change
Expand Up @@ -29,8 +29,9 @@ class ProgPow
KERNEL_CL
} kernel_t;

static std::string getKern(uint64_t seed, kernel_t kern);

static std::string getKern(std::string kernel_code, uint64_t seed, kernel_t kern);
static void calculate_fast_mod_data(uint32_t divisor, uint32_t& reciprocal, uint32_t& increment, uint32_t& shift);
private:
static std::string math(std::string d, std::string a, std::string b, uint32_t r);
static std::string merge(std::string a, std::string b, uint32_t r);
Expand All @@ -43,4 +44,15 @@ class ProgPow
uint32_t z, w, jsr, jcong;
} kiss99_t;
static uint32_t kiss99(kiss99_t &st);

static uint32_t clz(uint32_t a)
{
#ifdef _MSC_VER
unsigned long index;
_BitScanReverse(&index, a);
return 31 - index;
#else
return __builtin_clz(a);
#endif
}
};