From baf06ad80d507d7edbe028c205a4b6883e0d981d Mon Sep 17 00:00:00 2001 From: Ilya Grebnov Date: Fri, 24 Mar 2023 20:34:54 -0700 Subject: [PATCH] bsc 3.3.2 --- CHANGES | 3 + README | 2 +- VERSION | 2 +- bsc.cpp | 2 +- libbsc/bwt/libcubwt/CHANGES | 3 + libbsc/bwt/libcubwt/VERSION | 2 +- libbsc/bwt/libcubwt/libcubwt.cu | 1415 +++++++++++++++++------------- libbsc/bwt/libcubwt/libcubwt.cuh | 48 +- libbsc/libbsc.h | 4 +- 9 files changed, 842 insertions(+), 639 deletions(-) diff --git a/CHANGES b/CHANGES index 1331eb9..110f228 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +Changes in 3.3.2 (March, 24 2023) +- Reduced memory usage and improved performance of GPU accelerated forward BWT. + Changes in 3.3.1 (February, 15 2023) - Added ability to specify block size in bytes as oppose to megabytes. diff --git a/README b/README index 5c55b3f..03e5a06 100644 --- a/README +++ b/README @@ -54,7 +54,7 @@ Compression and decompression requirements are the same and in bytes, can be estimated as 16Mb + 5 x block size x number of blocks processed in parallel. GPU memory usage for NVIDIA CUDA technology is different from CPU memory usage -and can be estimated as 20 x block size for ST and 32 x block size for BWT. +and can be estimated as 20 x block size for ST and 21 x block size for BWT. NVIDIA GPU acceleration: diff --git a/VERSION b/VERSION index 712bd5a..5436ea0 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3.1 \ No newline at end of file +3.3.2 \ No newline at end of file diff --git a/bsc.cpp b/bsc.cpp index ff39d60..108a1c9 100644 --- a/bsc.cpp +++ b/bsc.cpp @@ -869,7 +869,7 @@ void ProcessCommandline(int argc, char * argv[]) int main(int argc, char * argv[]) { - fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.1. 15 February 2023.\n"); + fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.2. 24 March 2023.\n"); fprintf(stdout, "Copyright (c) 2009-2023 Ilya Grebnov .\n\n"); #if defined(_OPENMP) && defined(__INTEL_COMPILER) diff --git a/libbsc/bwt/libcubwt/CHANGES b/libbsc/bwt/libcubwt/CHANGES index d00efa8..446d5d8 100644 --- a/libbsc/bwt/libcubwt/CHANGES +++ b/libbsc/bwt/libcubwt/CHANGES @@ -1,3 +1,6 @@ +Changes in 1.5.0 (March 24, 2023) +- Reduced memory usage and improved performance. + Changes in 1.0.0 (February 10, 2023) - Initial public release of the libcubwt. diff --git a/libbsc/bwt/libcubwt/VERSION b/libbsc/bwt/libcubwt/VERSION index afaf360..3e1ad72 100644 --- a/libbsc/bwt/libcubwt/VERSION +++ b/libbsc/bwt/libcubwt/VERSION @@ -1 +1 @@ -1.0.0 \ No newline at end of file +1.5.0 \ No newline at end of file diff --git a/libbsc/bwt/libcubwt/libcubwt.cu b/libbsc/bwt/libcubwt/libcubwt.cu index 9440f9a..d52b3d1 100644 --- a/libbsc/bwt/libcubwt/libcubwt.cu +++ b/libbsc/bwt/libcubwt/libcubwt.cu @@ -1,7 +1,7 @@ /*-- This file is a part of libcubwt, a library for CUDA accelerated -suffix array and burrows wheeler transform construction. +burrows wheeler transform construction. Copyright (c) 2022-2023 Ilya Grebnov @@ -52,27 +52,24 @@ Please see the file LICENSE for full copyright and license details. #endif #if CUDA_DEVICE_ARCH == 750 - #define CUDA_SM_THREADS 1024 + #define CUDA_SM_THREADS (1024) #elif CUDA_DEVICE_ARCH == 860 || CUDA_DEVICE_ARCH == 870 || CUDA_DEVICE_ARCH == 890 - #define CUDA_SM_THREADS 1536 + #define CUDA_SM_THREADS (1536) #else - #define CUDA_SM_THREADS 2048 + #define CUDA_SM_THREADS (2048) +#endif + +#if CUDA_DEVICE_ARCH == 860 || CUDA_DEVICE_ARCH == 870 || CUDA_DEVICE_ARCH == 890 + #define CUDA_BLOCK_THREADS (768) +#else + #define CUDA_BLOCK_THREADS (512) #endif #define CUDA_WARP_THREADS (32) -#define CUDA_BLOCK_THREADS (512) -#define CUDA_DEVICE_PADDING (16 * CUDA_BLOCK_THREADS) +#define CUDA_DEVICE_PADDING (12 * 768) typedef struct LIBCUBWT_DEVICE_STORAGE { - void * device_alloc1; - void * device_alloc2; - void * device_alloc3; - void * device_alloc4; - void * device_alloc5; - void * device_alloc6; - void * device_alloc7; - void * device_rsort_temp_storage; size_t device_rsort_temp_storage_size; @@ -88,27 +85,29 @@ typedef struct LIBCUBWT_DEVICE_STORAGE uint32_t * device_keys; uint32_t * device_offsets; - uint4 * device_descriptors_large; - uint4 * device_descriptors_copy; - uint2 * device_descriptors_small; - uint32_t * device_temp_SA; uint32_t * device_temp_ISA; uint32_t * device_temp_keys; + uint64_t * device_SA_temp_SA; uint64_t * device_keys_temp_keys; uint64_t * device_offsets_ISA; + uint4 * device_descriptors_large; + uint4 * device_descriptors_copy; + uint2 * device_descriptors_small; + + void * device_storage; + int32_t device_L2_cache_bits; + void * host_pinned_storage; size_t host_pinned_storage_size; int64_t max_length; uint32_t num_unsorted_segments; uint32_t num_unsorted_suffixes; - bool is_partial_suffix_array; - - int32_t device_L2_cache_bits; - + + uint32_t cuda_block_threads; cudaStream_t cuda_stream; } LIBCUBWT_DEVICE_STORAGE; @@ -139,7 +138,7 @@ static cudaError_t libcubwt_cuda_safe_call(const char * filename, int32_t line, template __device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) { -#if CUDA_DEVICE_ARCH >= 800 +#if CUDA_DEVICE_ARCH >= 800 && !defined(__CUDA__) return __reduce_add_sync((uint32_t)-1, value); #else @@ -156,7 +155,7 @@ __device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) template __device__ __forceinline__ T libcubwt_warp_reduce_max(T value) { -#if CUDA_DEVICE_ARCH >= 800 +#if CUDA_DEVICE_ARCH >= 800 && !defined(__CUDA__) return __reduce_max_sync((uint32_t)-1, value); #else @@ -232,44 +231,6 @@ static void libcubwt_scatter_values_uint32_kernel(const uint32_t * RESTRICT devi } } -template -__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) -static void libcubwt_scatter_values_uint8_kernel(const uint32_t * RESTRICT device_idx, const uint8_t * RESTRICT device_src, uint8_t * RESTRICT device_dst, uint32_t m) -{ - const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 4; - - device_idx += block_index; device_src += block_index; m -= block_index; - - if (m >= CUDA_BLOCK_THREADS * 4) - { - const uint4 indexes = __ldg((uint4 *)(device_idx + threadIdx.x * 4)); - - if (source_is_aligned) - { - const uchar4 values = __ldg((uchar4 *)(device_src + threadIdx.x * 4)); - - device_dst[indexes.x] = values.x; - device_dst[indexes.y] = values.y; - device_dst[indexes.z] = values.z; - device_dst[indexes.w] = values.w; - } - else - { - device_dst[indexes.x] = __ldg(device_src + threadIdx.x * 4 + 0); - device_dst[indexes.y] = __ldg(device_src + threadIdx.x * 4 + 1); - device_dst[indexes.z] = __ldg(device_src + threadIdx.x * 4 + 2); - device_dst[indexes.w] = __ldg(device_src + threadIdx.x * 4 + 3); - } - } - else - { - for (uint32_t thread_index = threadIdx.x; thread_index < m; thread_index += CUDA_BLOCK_THREADS) - { - device_dst[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index); - } - } -} - __global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) static void libcubwt_permute_block_values_uint32_kernel(const uint32_t * RESTRICT device_idx, const uint32_t * RESTRICT device_src, uint32_t * RESTRICT device_dst, uint32_t n) { @@ -338,82 +299,26 @@ static void libcubwt_permute_block_values_uint32_kernel(const uint32_t * RESTRIC } } -template __global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) -static void libcubwt_permute_block_values_uint8_kernel(const uint32_t * RESTRICT device_idx, const uint8_t * RESTRICT device_src, uint8_t * RESTRICT device_dst, uint32_t n) +static void libcubwt_scatter_values_uint64_kernel(const uint32_t * RESTRICT device_idx, const uint64_t * RESTRICT device_src, uint64_t * RESTRICT device_dst, uint32_t m) { - __shared__ __align__(32) uint8_t cache[64 * CUDA_BLOCK_THREADS]; - - const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 64; + const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 2; - device_idx += block_index; device_src += block_index; device_dst += block_index; n -= block_index; + device_idx += block_index; device_src += block_index; m -= block_index; - if (n >= CUDA_BLOCK_THREADS * 64) + if (m >= CUDA_BLOCK_THREADS * 2) { - { - const uint32_t * RESTRICT thread_idx = device_idx + threadIdx.x * 4; - const uint8_t * RESTRICT thread_src = device_src + threadIdx.x * 4; - uint8_t * RESTRICT thread_cache = cache - block_index; - - #pragma unroll - for (uint32_t round = 0; round < 16; round += 1) - { - const uint4 indexes = __ldg((uint4 *)(thread_idx)); - - if (source_is_aligned) - { - const uchar4 values = __ldg((uchar4 *)(thread_src)); - - thread_cache[indexes.x] = values.x; - thread_cache[indexes.y] = values.y; - thread_cache[indexes.z] = values.z; - thread_cache[indexes.w] = values.w; - } - else - { - thread_cache[indexes.x] = __ldg(thread_src + 0); - thread_cache[indexes.y] = __ldg(thread_src + 1); - thread_cache[indexes.z] = __ldg(thread_src + 2); - thread_cache[indexes.w] = __ldg(thread_src + 3); - } - - thread_idx += 4 * CUDA_BLOCK_THREADS; thread_src += 4 * CUDA_BLOCK_THREADS; - } - } - - __syncthreads(); + const uint2 indexes = __ldg((uint2 *)(device_idx + threadIdx.x * 2)); + const ulonglong2 values = __ldg((ulonglong2 *)(device_src + threadIdx.x * 2)); - { - const uint8_t * RESTRICT thread_cache = cache + threadIdx.x * 4; - uint8_t * RESTRICT thread_dst = device_dst + threadIdx.x * 4; - - #pragma unroll - for (uint32_t round = 0; round < 16; round += 1) - { - *(uchar4 *)(thread_dst) = *(uchar4 *)(thread_cache); - - thread_cache += 4 * CUDA_BLOCK_THREADS; thread_dst += 4 * CUDA_BLOCK_THREADS; - } - } + device_dst[indexes.x] = values.x; + device_dst[indexes.y] = values.y; } else { + for (uint32_t thread_index = threadIdx.x; thread_index < m; thread_index += CUDA_BLOCK_THREADS) { - uint8_t * RESTRICT thread_cache = cache - block_index; - - for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS) - { - thread_cache[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index); - } - } - - __syncthreads(); - - { - for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS) - { - device_dst[thread_index] = cache[thread_index]; - } + device_dst[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index); } } } @@ -441,9 +346,9 @@ static cudaError_t libcubwt_gather_scatter_values_uint32(LIBCUBWT_DEVICE_STORAGE if (status == cudaSuccess) { - int64_t n_gather_scatter_blocks = (m + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4); + int64_t n_gather_scatter_blocks = (m + storage->cuda_block_threads * 4 - 1) / (storage->cuda_block_threads * 4); - libcubwt_gather_values_uint32_kernel<<<(uint32_t)n_gather_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_src_index_value.Current(), device_src, db_src_index_value.Current(), (uint32_t)m); + libcubwt_gather_values_uint32_kernel<<<(uint32_t)n_gather_scatter_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(db_src_index_value.Current(), device_src, db_src_index_value.Current(), (uint32_t)m); if (sort_start_bit < sort_end_bit) { @@ -457,7 +362,7 @@ static cudaError_t libcubwt_gather_scatter_values_uint32(LIBCUBWT_DEVICE_STORAGE if (status == cudaSuccess) { - libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_gather_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_dst_index.Current(), db_src_index_value.Current(), device_dst, (uint32_t)m); + libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_gather_scatter_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(db_dst_index.Current(), db_src_index_value.Current(), device_dst, (uint32_t)m); } } @@ -487,9 +392,9 @@ static cudaError_t libcubwt_scatter_values_uint32(LIBCUBWT_DEVICE_STORAGE * stor if (status == cudaSuccess) { - int64_t n_scatter_blocks = (m + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4); + int64_t n_scatter_blocks = (m + storage->cuda_block_threads * 4 - 1) / (storage->cuda_block_threads * 4); - libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)m); + libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_scatter_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)m); } return status; @@ -518,32 +423,29 @@ static cudaError_t libcubwt_permute_values_uint32(LIBCUBWT_DEVICE_STORAGE * stor if (status == cudaSuccess) { - if (((int64_t)1 << sort_start_bit) <= CUDA_BLOCK_THREADS * 16) + if (((storage->cuda_block_threads * 16) % ((int64_t)1 << sort_start_bit)) == 0) { - int64_t n_permute_blocks = (n + CUDA_BLOCK_THREADS * 16 - 1) / (CUDA_BLOCK_THREADS * 16); + int64_t n_permute_blocks = (n + storage->cuda_block_threads * 16 - 1) / (storage->cuda_block_threads * 16); - libcubwt_permute_block_values_uint32_kernel<<<(uint32_t)n_permute_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); + libcubwt_permute_block_values_uint32_kernel<<<(uint32_t)n_permute_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); } else { - int64_t n_scatter_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4); + int64_t n_scatter_blocks = (n + storage->cuda_block_threads * 4 - 1) / (storage->cuda_block_threads * 4); - libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); + libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_scatter_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); } } return status; } -static cudaError_t libcubwt_permute_values_uint8(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * device_idx, uint8_t * device_src, uint8_t * device_dst, int64_t n, uint32_t * device_temp1, uint8_t * device_temp2) +static cudaError_t libcubwt_scatter_values_uint64(LIBCUBWT_DEVICE_STORAGE * storage, cub::DoubleBuffer & db_index, cub::DoubleBuffer & db_value, int64_t m, int64_t n, int64_t k = 0) { cudaError_t status = cudaSuccess; - cub::DoubleBuffer db_index(device_idx, device_temp1); - cub::DoubleBuffer db_value(device_src, device_temp2); - int32_t sort_end_bit = 0; while ((n - 1) >= ((int64_t)1 << sort_end_bit)) { sort_end_bit += 1; } - int32_t sort_aligned_bits = (sort_end_bit > storage->device_L2_cache_bits) ? (sort_end_bit - storage->device_L2_cache_bits + 7) & (-8) : 0; + int32_t sort_aligned_bits = (sort_end_bit > storage->device_L2_cache_bits - 3) ? (sort_end_bit - storage->device_L2_cache_bits + 3 + 7) & (-8) : 0; int32_t sort_start_bit = std::max(0, sort_end_bit - sort_aligned_bits); if (sort_start_bit < sort_end_bit) @@ -551,39 +453,19 @@ static cudaError_t libcubwt_permute_values_uint8(LIBCUBWT_DEVICE_STORAGE * stora status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs( storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size, db_index, db_value, - (uint32_t)n, + (uint32_t)m, sort_start_bit, sort_end_bit, storage->cuda_stream)); } if (status == cudaSuccess) { - if (((int64_t)1 << sort_start_bit) <= CUDA_BLOCK_THREADS * 64) - { - int64_t n_permute_blocks = (n + CUDA_BLOCK_THREADS * 64 - 1) / (CUDA_BLOCK_THREADS * 64); + int64_t n_scatter_blocks = (m + storage->cuda_block_threads * 2 - 1) / (storage->cuda_block_threads * 2); - if (((uintptr_t)db_value.Current()) % 4 == 0) - { - libcubwt_permute_block_values_uint8_kernel<<<(uint32_t)n_permute_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); - } - else - { - libcubwt_permute_block_values_uint8_kernel<<<(uint32_t)n_permute_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); - } - } - else - { - int64_t n_scatter_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4); + libcubwt_scatter_values_uint64_kernel<<<(uint32_t)n_scatter_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), db_value.Alternate() - k, (uint32_t)m); - if (((uintptr_t)db_value.Current()) % 4 == 0) - { - libcubwt_scatter_values_uint8_kernel<<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); - } - else - { - libcubwt_scatter_values_uint8_kernel<<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n); - } - } + db_index.selector ^= 1; + db_value.selector ^= 1; } return status; @@ -593,117 +475,144 @@ template __global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) static void libcubwt_initialize_device_arrays_kernel(const uint8_t * RESTRICT device_T, uint32_t * RESTRICT device_SA, uint64_t * RESTRICT device_keys) { - const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 8; + __shared__ __align__(32) uint4 prefixes[4 * CUDA_BLOCK_THREADS]; { - uint32_t thread_index = block_index + threadIdx.x * 4; device_SA += thread_index; - ((uint4 *)device_SA)[0] = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); + device_T += blockIdx.x * CUDA_BLOCK_THREADS * 12 + threadIdx.x * 16; + if (threadIdx.x < (12 * CUDA_BLOCK_THREADS + 8 + 15) / 16) { prefixes[threadIdx.x] = __ldg((uint4 *)device_T); } - thread_index += CUDA_BLOCK_THREADS * 4; device_SA += CUDA_BLOCK_THREADS * 4; - ((uint4 *)device_SA)[0] = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); + __syncthreads(); } - __shared__ __align__(32) uint4 prefixes[4 * CUDA_BLOCK_THREADS]; - { - device_T += block_index + threadIdx.x * 8; + uint32_t * RESTRICT thread_cache = ((uint32_t *)prefixes) + threadIdx.x * 3; + uint4 * RESTRICT thread_prefixes = ((uint4 * )prefixes) + threadIdx.x * 4; - const uint2 current = __ldg((uint2 *)(device_T + 0)); - const uint2 next = __ldg((uint2 *)(device_T + 8)); + const uint32_t b0 = thread_cache[0]; + const uint32_t b1 = thread_cache[1]; + const uint32_t b2 = thread_cache[2]; + const uint32_t b3 = thread_cache[3]; + const uint32_t b4 = thread_cache[4]; - { - uint4 * RESTRICT thread_prefixes = prefixes + threadIdx.x * 4; - - thread_prefixes[0] = make_uint4 - ( - __byte_perm(current.y, next.x, 0x0123) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x0123), - __byte_perm(current.y, next.x, 0x1234) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x1234) - ); - - thread_prefixes[1] = make_uint4 - ( - __byte_perm(current.y, next.x, 0x2345) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x2345), - __byte_perm(current.y, next.x, 0x3456) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x3456) - ); - - thread_prefixes[2] = make_uint4 - ( - __byte_perm(next.x, next.y, 0x0123) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x0123), - __byte_perm(next.x, next.y, 0x1234) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x1234) - ); - - thread_prefixes[3] = make_uint4 - ( - __byte_perm(next.x, next.y, 0x2345) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x2345), - __byte_perm(next.x, next.y, 0x3456) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x3456) - ); - } - } + __syncthreads(); + + thread_prefixes[0] = make_uint4 + ( + __byte_perm(b1, b2, 0x1234) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b0, b1, 0x1234), + __byte_perm(b1, b2, 0x2345) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b0, b1, 0x2345) + ); + + thread_prefixes[1] = make_uint4 + ( + __byte_perm(b2, b3, 0x0123) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b1, b2, 0x0123), + __byte_perm(b2, b3, 0x1234) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b1, b2, 0x1234) + ); - __syncwarp(); + thread_prefixes[2] = make_uint4 + ( + __byte_perm(b2, b3, 0x3456) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b1, b2, 0x3456), + __byte_perm(b3, b4, 0x0123) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b2, b3, 0x0123) + ); + + thread_prefixes[3] = make_uint4 + ( + __byte_perm(b3, b4, 0x2345) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b2, b3, 0x2345), + __byte_perm(b3, b4, 0x3456) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(b2, b3, 0x3456) + ); + + __syncwarp(); + } { - device_keys += block_index; + const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 8; - uint4 * RESTRICT thread_prefixes = (uint4 *)prefixes + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS); - uint4 * RESTRICT thread_keys = (uint4 *)device_keys + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS); + { + uint32_t thread_index = block_index + threadIdx.x * 4; device_SA += thread_index; + ((uint4 *)device_SA)[0] = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); - thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS; - thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS; - thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS; - thread_keys[0] = thread_prefixes[0]; + thread_index += CUDA_BLOCK_THREADS * 4; device_SA += CUDA_BLOCK_THREADS * 4; + ((uint4 *)device_SA)[0] = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); + } + + { + device_keys += block_index; + + uint4 * RESTRICT thread_prefixes = (uint4 *)prefixes + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS); + uint4 * RESTRICT thread_keys = (uint4 *)device_keys + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS); + + thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS; + thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS; + thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS; + thread_keys[0] = thread_prefixes[0]; + } } } __global__ __launch_bounds__(CUDA_BLOCK_THREADS, 1) -static void libcubwt_set_sentinel_keys_kernel(uint64_t * RESTRICT device_keys_end, uint64_t s0, uint64_t s1, uint64_t s2, uint64_t s3, uint64_t s4, uint64_t s5, uint64_t s6, uint64_t s7) +static void libcubwt_set_sentinel_values_kernel(uint8_t * RESTRICT device_T_end, uint64_t * RESTRICT device_keys_end, uint64_t k0, uint64_t k1, uint64_t k2, uint64_t k3, uint64_t k4, uint64_t k5, uint64_t k6, uint64_t k7) { - device_keys_end[-8] = s0; - device_keys_end[-7] = s1; - device_keys_end[-6] = s2; - device_keys_end[-5] = s3; - device_keys_end[-4] = s4; - device_keys_end[-3] = s5; - device_keys_end[-2] = s6; - device_keys_end[-1] = s7; + device_T_end[0] = 0; + device_T_end[1] = 0; + device_T_end[2] = 0; + + device_keys_end[-8] = k0; + device_keys_end[-7] = k1; + device_keys_end[-6] = k2; + device_keys_end[-5] = k3; + device_keys_end[-4] = k4; + device_keys_end[-3] = k5; + device_keys_end[-2] = k6; + device_keys_end[-1] = k7; } -static cudaError_t libcubwt_initialize_device_arrays(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, int64_t n) +static cudaError_t libcubwt_initialize_device_arrays(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, int64_t reduced_n, int64_t expanded_n, int64_t input_n) { cudaError_t status = cudaSuccess; - if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->device_T, T, (size_t)n, cudaMemcpyHostToDevice, storage->cuda_stream))) == cudaSuccess) + if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->device_T, T, (size_t)input_n, cudaMemcpyHostToDevice, storage->cuda_stream))) == cudaSuccess) { - int64_t n_initialize_blocks = 1 + (n / (CUDA_BLOCK_THREADS * 8)); + int64_t n_initialize_blocks = 1 + (expanded_n / (storage->cuda_block_threads * 12)); - bool extra_sentinel_bits = (T[n - 2] == 0) && (T[n - 1] == 0); + bool extra_sentinel_bits = (expanded_n - input_n >= 2) || (T[input_n - 1] == 0); if (extra_sentinel_bits) { - libcubwt_initialize_device_arrays_kernel<<<(uint32_t)n_initialize_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_T, storage->device_SA, storage->device_keys_temp_keys); + libcubwt_initialize_device_arrays_kernel<<<(uint32_t)n_initialize_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(storage->device_T, storage->device_SA, storage->device_keys_temp_keys); } else { - libcubwt_initialize_device_arrays_kernel<<<(uint32_t)n_initialize_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_T, storage->device_SA, storage->device_keys_temp_keys); + libcubwt_initialize_device_arrays_kernel<<<(uint32_t)n_initialize_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(storage->device_T, storage->device_SA, storage->device_keys_temp_keys); } { - uint64_t a = T[n - 8], b = T[n - 7], c = T[n - 6], d = T[n - 5], e = T[n - 4], f = T[n - 3], g = T[n - 2], h = T[n - 1]; - - uint64_t s0 = (a << 56) | (b << 48) | (c << 40) | (d << 32) | (e << 24) | (f << 16) | (g << 8) | (h << 0) | (extra_sentinel_bits ? 7 : 0); - uint64_t s1 = (b << 56) | (c << 48) | (d << 40) | (e << 32) | (f << 24) | (g << 16) | (h << 8) | (extra_sentinel_bits ? 6 : 0); - uint64_t s2 = (c << 56) | (d << 48) | (e << 40) | (f << 32) | (g << 24) | (h << 16) | (extra_sentinel_bits ? 5 : 0); - uint64_t s3 = (d << 56) | (e << 48) | (f << 40) | (g << 32) | (h << 24) | (extra_sentinel_bits ? 4 : 0); - uint64_t s4 = (e << 56) | (f << 48) | (g << 40) | (h << 32) | (extra_sentinel_bits ? 3 : 0); - uint64_t s5 = (f << 56) | (g << 48) | (h << 40) | (extra_sentinel_bits ? 2 : 0); - uint64_t s6 = (g << 56) | (h << 48) | (extra_sentinel_bits ? 1 : 0); - uint64_t s7 = (h << 56); - - libcubwt_set_sentinel_keys_kernel<<<1, 1, 0, storage->cuda_stream>>>(storage->device_keys_temp_keys + n, s0, s1, s2, s3, s4, s5, s6, s7); + uint64_t c0 = (expanded_n - 11 < input_n) ? T[expanded_n - 11] : (uint64_t)0; + uint64_t c1 = (expanded_n - 10 < input_n) ? T[expanded_n - 10] : (uint64_t)0; + uint64_t c2 = (expanded_n - 9 < input_n) ? T[expanded_n - 9] : (uint64_t)0; + uint64_t c3 = (expanded_n - 8 < input_n) ? T[expanded_n - 8] : (uint64_t)0; + uint64_t c4 = (expanded_n - 7 < input_n) ? T[expanded_n - 7] : (uint64_t)0; + uint64_t c5 = (expanded_n - 6 < input_n) ? T[expanded_n - 6] : (uint64_t)0; + uint64_t c6 = (expanded_n - 5 < input_n) ? T[expanded_n - 5] : (uint64_t)0; + uint64_t c7 = (expanded_n - 4 < input_n) ? T[expanded_n - 4] : (uint64_t)0; + uint64_t c8 = (expanded_n - 3 < input_n) ? T[expanded_n - 3] : (uint64_t)0; + uint64_t c9 = (expanded_n - 2 < input_n) ? T[expanded_n - 2] : (uint64_t)0; + uint64_t ca = (expanded_n - 1 < input_n) ? T[expanded_n - 1] : (uint64_t)0; + + uint64_t k0 = (c0 << 56) | (c1 << 48) | (c2 << 40) | (c3 << 32) | (c4 << 24) | (c5 << 16) | (c6 << 8) | (c7 << 0) | (extra_sentinel_bits ? 7 : 1); + uint64_t k1 = (c1 << 56) | (c2 << 48) | (c3 << 40) | (c4 << 32) | (c5 << 24) | (c6 << 16) | (c7 << 8) | (c8 << 0) | (extra_sentinel_bits ? 7 : 1); + + uint64_t k2 = (c3 << 56) | (c4 << 48) | (c5 << 40) | (c6 << 32) | (c7 << 24) | (c8 << 16) | (c9 << 8) | (ca << 0) | (extra_sentinel_bits ? 7 : 0); + uint64_t k3 = (c4 << 56) | (c5 << 48) | (c6 << 40) | (c7 << 32) | (c8 << 24) | (c9 << 16) | (ca << 8) | (extra_sentinel_bits ? 6 : 0); + + uint64_t k4 = (c6 << 56) | (c7 << 48) | (c8 << 40) | (c9 << 32) | (ca << 24) | (extra_sentinel_bits ? 4 : 0); + uint64_t k5 = (c7 << 56) | (c8 << 48) | (c9 << 40) | (ca << 32) | (extra_sentinel_bits ? 3 : 0); + + uint64_t k6 = (c9 << 56) | (ca << 48) | (extra_sentinel_bits ? 1 : 0); + uint64_t k7 = (ca << 56); + + libcubwt_set_sentinel_values_kernel<<<1, 1, 0, storage->cuda_stream>>>(storage->device_T + input_n, storage->device_keys_temp_keys + reduced_n, k0, k1, k2, k3, k4, k5, k6, k7); } storage->num_unsorted_segments = (uint32_t)1; - storage->num_unsorted_suffixes = (uint32_t)n; - storage->is_partial_suffix_array = false; + storage->num_unsorted_suffixes = (uint32_t)reduced_n; } return status; @@ -738,7 +647,7 @@ static cudaError_t libcubwt_sort_suffixes_by_prefix(LIBCUBWT_DEVICE_STORAGE * st } __global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) -static void libcubwt_rank_and_segment_suffixes_initialization_kernel(uint64_t * RESTRICT device_keys, uint8_t * RESTRICT device_heads, uint4 * RESTRICT device_descriptors_large, uint2 * RESTRICT device_descriptors_small, uint32_t n) +static void libcubwt_rank_and_segment_suffixes_initialization_kernel(uint32_t * RESTRICT device_SA, uint64_t * RESTRICT device_keys, uint8_t * RESTRICT device_heads, uint4 * RESTRICT device_descriptors_large, uint2 * RESTRICT device_descriptors_small, uint32_t n) { const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS + threadIdx.x; @@ -757,21 +666,28 @@ static void libcubwt_rank_and_segment_suffixes_initialization_kernel(uint64_t * } { - device_keys += threadIdx.x; device_heads += threadIdx.x; uint64_t key = (threadIdx.x % 2 == 0) ? 0 : (uint64_t)-1; + device_SA += threadIdx.x; device_keys += threadIdx.x; device_heads += threadIdx.x; + if (threadIdx.x < 2) { device_keys [-2] = key; device_heads[-2] = 1; } - device_keys += n; device_heads += n; + device_SA += n; device_keys += n; device_heads += n; + + device_SA [0 * CUDA_BLOCK_THREADS] = n + threadIdx.x + 0 * CUDA_BLOCK_THREADS; + device_SA [1 * CUDA_BLOCK_THREADS] = n + threadIdx.x + 1 * CUDA_BLOCK_THREADS; + device_SA [2 * CUDA_BLOCK_THREADS] = n + threadIdx.x + 2 * CUDA_BLOCK_THREADS; + device_SA [3 * CUDA_BLOCK_THREADS] = n + threadIdx.x + 3 * CUDA_BLOCK_THREADS; device_keys [0 * CUDA_BLOCK_THREADS] = key; device_keys [1 * CUDA_BLOCK_THREADS] = key; device_keys [2 * CUDA_BLOCK_THREADS] = key; device_keys [3 * CUDA_BLOCK_THREADS] = key; + device_heads[0 * CUDA_BLOCK_THREADS] = 1; device_heads[1 * CUDA_BLOCK_THREADS] = 1; device_heads[2 * CUDA_BLOCK_THREADS] = 1; @@ -797,8 +713,6 @@ static void libcubwt_rank_and_segment_suffixes_initiatory_kernel( uint32_t thread_exclusive_suffix_rank; uint32_t thread_suffix_rank[4]; - uchar4 thread_segment_end_flag; - uint32_t thread_exclusive_segment_index; uint32_t thread_segment_index[4]; @@ -836,15 +750,10 @@ static void libcubwt_rank_and_segment_suffixes_initiatory_kernel( thread_suffix_rank[2] = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_suffix_rank[1]; thread_suffix_rank[3] = (thread_new_heads.w != 0) ? (thread_index + 3) : thread_suffix_rank[2]; - thread_segment_end_flag.x = (thread_new_heads.x != 0) && (key_a.x == key_a.y); - thread_segment_end_flag.y = (thread_new_heads.y != 0) && (thread_new_heads.x == 0); - thread_segment_end_flag.z = (thread_new_heads.z != 0) && (thread_new_heads.y == 0); - thread_segment_end_flag.w = (thread_new_heads.w != 0) && (thread_new_heads.z == 0); - - thread_segment_index[0] = (uint32_t)thread_segment_end_flag.x; - thread_segment_index[1] = thread_segment_index[0] + (uint32_t)thread_segment_end_flag.y; - thread_segment_index[2] = thread_segment_index[1] + (uint32_t)thread_segment_end_flag.z; - thread_segment_index[3] = thread_segment_index[2] + (uint32_t)thread_segment_end_flag.w; + thread_segment_index[0] = ((thread_new_heads.x != 0) && (key_a.x == key_a.y)); + thread_segment_index[1] = thread_segment_index[0] + ((thread_new_heads.y != 0) && (thread_new_heads.x == 0)); + thread_segment_index[2] = thread_segment_index[1] + ((thread_new_heads.z != 0) && (thread_new_heads.y == 0)); + thread_segment_index[3] = thread_segment_index[2] + ((thread_new_heads.w != 0) && (thread_new_heads.z == 0)); } } @@ -854,7 +763,7 @@ static void libcubwt_rank_and_segment_suffixes_initiatory_kernel( typedef cub::WarpScan WarpScan; - __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS]; + __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_WARP_THREADS]; WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_rank[3] , thread_inclusive_suffix_rank , thread_exclusive_suffix_rank , (uint32_t)0, cub::Max()); WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_segment_index[3], thread_inclusive_segment_index, thread_exclusive_segment_index, (uint32_t)0, cub::Sum()); @@ -877,7 +786,7 @@ static void libcubwt_rank_and_segment_suffixes_initiatory_kernel( uint32_t warp_inclusive_segment_index; { - typedef cub::WarpScan WarpScan; + typedef cub::WarpScan WarpScan; __shared__ typename WarpScan::TempStorage warp_scan_storage; @@ -969,10 +878,10 @@ static void libcubwt_rank_and_segment_suffixes_initiatory_kernel( const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; - if (thread_segment_end_flag.x != 0) { device_offsets_begin[thread_segment_index[0]] = thread_exclusive_suffix_rank; device_offsets_end[thread_segment_index[0]] = thread_index + 0; } - if (thread_segment_end_flag.y != 0) { device_offsets_begin[thread_segment_index[1]] = thread_suffix_rank[0]; device_offsets_end[thread_segment_index[1]] = thread_index + 1; } - if (thread_segment_end_flag.z != 0) { device_offsets_begin[thread_segment_index[2]] = thread_suffix_rank[1]; device_offsets_end[thread_segment_index[2]] = thread_index + 2; } - if (thread_segment_end_flag.w != 0) { device_offsets_begin[thread_segment_index[3]] = thread_suffix_rank[2]; device_offsets_end[thread_segment_index[3]] = thread_index + 3; } + if (thread_exclusive_segment_index != thread_segment_index[0]) { device_offsets_begin[thread_segment_index[0]] = thread_exclusive_suffix_rank; device_offsets_end[thread_segment_index[0]] = thread_index + 0; } + if (thread_segment_index[0] != thread_segment_index[1]) { device_offsets_begin[thread_segment_index[1]] = thread_suffix_rank[0]; device_offsets_end[thread_segment_index[1]] = thread_index + 1; } + if (thread_segment_index[1] != thread_segment_index[2]) { device_offsets_begin[thread_segment_index[2]] = thread_suffix_rank[1]; device_offsets_end[thread_segment_index[2]] = thread_index + 2; } + if (thread_segment_index[2] != thread_segment_index[3]) { device_offsets_begin[thread_segment_index[3]] = thread_suffix_rank[2]; device_offsets_end[thread_segment_index[3]] = thread_index + 3; } if (scatter_ranks_directly) { @@ -1007,19 +916,15 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( __shared__ __align__(32) uint4 warp_state1[1 + CUDA_WARP_THREADS]; __shared__ __align__(32) uint32_t warp_state2[1 + CUDA_WARP_THREADS]; + uchar4 thread_old_heads; uint32_t thread_exclusive_suffix_old_rank; - uint32_t thread_suffix_old_rank[4]; + uchar4 thread_new_heads; uint32_t thread_exclusive_suffix_new_rank; - uint32_t thread_suffix_new_rank[4]; - - uchar4 thread_segment_end_flag; uint32_t thread_exclusive_segment_index; uint32_t thread_segment_index[4]; - uchar4 thread_updated_suffix_flag; - uint32_t thread_exclusive_suffix_index; uint32_t thread_suffix_index[4]; @@ -1031,9 +936,9 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( uint2 key_a = __ldg((uint2 *)(device_keys - 2)); uint4 key_b = __ldg((uint4 *)(device_keys)); - uchar4 thread_old_heads = *(uchar4 *)(device_heads); + thread_old_heads = *(uchar4 *)(device_heads); - uchar4 thread_new_heads = make_uchar4( + thread_new_heads = make_uchar4( (key_a.y != key_b.x) ? (uint8_t)1 : (uint8_t)thread_old_heads.x, (key_b.x != key_b.y) ? (uint8_t)1 : (uint8_t)thread_old_heads.y, (key_b.y != key_b.z) ? (uint8_t)1 : (uint8_t)thread_old_heads.z, @@ -1041,25 +946,20 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( *(uchar4 *)(device_heads) = thread_new_heads; - thread_suffix_old_rank[0] = (thread_old_heads.x != 0) ? (thread_index + 0) : 0; - thread_suffix_old_rank[1] = (thread_old_heads.y != 0) ? (thread_index + 1) : thread_suffix_old_rank[0]; - thread_suffix_old_rank[2] = (thread_old_heads.z != 0) ? (thread_index + 2) : thread_suffix_old_rank[1]; - thread_suffix_old_rank[3] = (thread_old_heads.w != 0) ? (thread_index + 3) : thread_suffix_old_rank[2]; - - thread_suffix_new_rank[0] = (thread_new_heads.x != 0) ? (thread_index + 0) : 0; - thread_suffix_new_rank[1] = (thread_new_heads.y != 0) ? (thread_index + 1) : thread_suffix_new_rank[0]; - thread_suffix_new_rank[2] = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_suffix_new_rank[1]; - thread_suffix_new_rank[3] = (thread_new_heads.w != 0) ? (thread_index + 3) : thread_suffix_new_rank[2]; - - thread_segment_end_flag.x = (thread_new_heads.x != 0) && (key_a.x == key_a.y) && (device_heads[-1] == 0); - thread_segment_end_flag.y = (thread_new_heads.y != 0) && (thread_new_heads.x == 0); - thread_segment_end_flag.z = (thread_new_heads.z != 0) && (thread_new_heads.y == 0); - thread_segment_end_flag.w = (thread_new_heads.w != 0) && (thread_new_heads.z == 0); - - thread_segment_index[0] = (uint32_t)thread_segment_end_flag.x; - thread_segment_index[1] = thread_segment_index[0] + (uint32_t)thread_segment_end_flag.y; - thread_segment_index[2] = thread_segment_index[1] + (uint32_t)thread_segment_end_flag.z; - thread_segment_index[3] = thread_segment_index[2] + (uint32_t)thread_segment_end_flag.w; + thread_exclusive_suffix_old_rank = (thread_old_heads.x != 0) ? (thread_index + 0) : 0; + thread_exclusive_suffix_old_rank = (thread_old_heads.y != 0) ? (thread_index + 1) : thread_exclusive_suffix_old_rank; + thread_exclusive_suffix_old_rank = (thread_old_heads.z != 0) ? (thread_index + 2) : thread_exclusive_suffix_old_rank; + thread_exclusive_suffix_old_rank = (thread_old_heads.w != 0) ? (thread_index + 3) : thread_exclusive_suffix_old_rank; + + thread_exclusive_suffix_new_rank = (thread_new_heads.x != 0) ? (thread_index + 0) : 0; + thread_exclusive_suffix_new_rank = (thread_new_heads.y != 0) ? (thread_index + 1) : thread_exclusive_suffix_new_rank; + thread_exclusive_suffix_new_rank = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_exclusive_suffix_new_rank; + thread_exclusive_suffix_new_rank = (thread_new_heads.w != 0) ? (thread_index + 3) : thread_exclusive_suffix_new_rank; + + thread_segment_index[0] = ((thread_new_heads.x != 0) && (key_a.x == key_a.y) && (device_heads[-1] == 0)); + thread_segment_index[1] = thread_segment_index[0] + ((thread_new_heads.y != 0) && (thread_new_heads.x == 0)); + thread_segment_index[2] = thread_segment_index[1] + ((thread_new_heads.z != 0) && (thread_new_heads.y == 0)); + thread_segment_index[3] = thread_segment_index[2] + ((thread_new_heads.w != 0) && (thread_new_heads.z == 0)); } { @@ -1069,11 +969,11 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( typedef cub::WarpScan WarpScan; - __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS]; + __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS]; - WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_old_rank[3], thread_inclusive_suffix_old_rank, thread_exclusive_suffix_old_rank, (uint32_t)0, cub::Max()); - WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_new_rank[3], thread_inclusive_suffix_new_rank, thread_exclusive_suffix_new_rank, (uint32_t)0, cub::Max()); - WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_segment_index[3] , thread_inclusive_segment_index , thread_exclusive_segment_index , (uint32_t)0, cub::Sum()); + WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_exclusive_suffix_old_rank, thread_inclusive_suffix_old_rank, thread_exclusive_suffix_old_rank, (uint32_t)0, cub::Max()); + WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_exclusive_suffix_new_rank, thread_inclusive_suffix_new_rank, thread_exclusive_suffix_new_rank, (uint32_t)0, cub::Max()); + WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_segment_index[3] , thread_inclusive_segment_index , thread_exclusive_segment_index , (uint32_t)0, cub::Sum()); if ((threadIdx.x % CUDA_WARP_THREADS) == (CUDA_WARP_THREADS - 1)) { @@ -1086,7 +986,6 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( { if (threadIdx.x < CUDA_WARP_THREADS) { - uint32_t block_exclusive_suffix_old_rank = __ldg((uint32_t *)(device_descriptors_copy + blockIdx.x - 1) + 2); uint32_t block_exclusive_suffix_new_rank = 0; uint32_t block_exclusive_segment_index = 0; @@ -1095,7 +994,7 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( uint32_t warp_inclusive_segment_index; { - typedef cub::WarpScan WarpScan; + typedef cub::WarpScan WarpScan; __shared__ typename WarpScan::TempStorage warp_scan_storage; @@ -1147,7 +1046,6 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( } while (full_aggregate_lane == -1); - warp_inclusive_suffix_old_rank = cub::Max()(warp_inclusive_suffix_old_rank, block_exclusive_suffix_old_rank); warp_inclusive_suffix_new_rank = cub::Max()(warp_inclusive_suffix_new_rank, block_exclusive_suffix_new_rank); warp_inclusive_segment_index = cub::Sum()(warp_inclusive_segment_index , block_exclusive_segment_index ); } @@ -1159,6 +1057,9 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( } { + uint32_t block_exclusive_suffix_old_rank = __ldg((uint32_t *)(device_descriptors_copy + blockIdx.x - 1) + 2); + warp_inclusive_suffix_old_rank = cub::Max()(warp_inclusive_suffix_old_rank, block_exclusive_suffix_old_rank); + if (threadIdx.x == 0) { warp_state1[0] = make_uint4(0, block_exclusive_suffix_old_rank, block_exclusive_suffix_new_rank, block_exclusive_segment_index); @@ -1172,33 +1073,36 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( } { + uint32_t thread_suffix_old_rank[4]; + uint32_t thread_suffix_new_rank[4]; + uint4 warp_exclusive_state = warp_state1[threadIdx.x / CUDA_WARP_THREADS]; thread_exclusive_suffix_old_rank = cub::Max()(thread_exclusive_suffix_old_rank, warp_exclusive_state.y); thread_exclusive_suffix_new_rank = cub::Max()(thread_exclusive_suffix_new_rank, warp_exclusive_state.z); thread_exclusive_segment_index = cub::Sum()(thread_exclusive_segment_index , warp_exclusive_state.w); - thread_suffix_old_rank[0] = cub::Max()(thread_suffix_old_rank[0], thread_exclusive_suffix_old_rank); - thread_suffix_old_rank[1] = cub::Max()(thread_suffix_old_rank[1], thread_exclusive_suffix_old_rank); - thread_suffix_old_rank[2] = cub::Max()(thread_suffix_old_rank[2], thread_exclusive_suffix_old_rank); - thread_suffix_old_rank[3] = cub::Max()(thread_suffix_old_rank[3], thread_exclusive_suffix_old_rank); + const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; + + thread_suffix_old_rank[0] = (thread_old_heads.x != 0) ? (thread_index + 0) : thread_exclusive_suffix_old_rank; + thread_suffix_old_rank[1] = (thread_old_heads.y != 0) ? (thread_index + 1) : thread_suffix_old_rank[0]; + thread_suffix_old_rank[2] = (thread_old_heads.z != 0) ? (thread_index + 2) : thread_suffix_old_rank[1]; + thread_suffix_old_rank[3] = (thread_old_heads.w != 0) ? (thread_index + 3) : thread_suffix_old_rank[2]; - thread_suffix_new_rank[0] = cub::Max()(thread_suffix_new_rank[0], thread_exclusive_suffix_new_rank); - thread_suffix_new_rank[1] = cub::Max()(thread_suffix_new_rank[1], thread_exclusive_suffix_new_rank); - thread_suffix_new_rank[2] = cub::Max()(thread_suffix_new_rank[2], thread_exclusive_suffix_new_rank); - thread_suffix_new_rank[3] = cub::Max()(thread_suffix_new_rank[3], thread_exclusive_suffix_new_rank); + thread_suffix_new_rank[0] = (thread_new_heads.x != 0) ? (thread_index + 0) : thread_exclusive_suffix_new_rank; + thread_suffix_new_rank[1] = (thread_new_heads.y != 0) ? (thread_index + 1) : thread_suffix_new_rank[0]; + thread_suffix_new_rank[2] = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_suffix_new_rank[1]; + thread_suffix_new_rank[3] = (thread_new_heads.w != 0) ? (thread_index + 3) : thread_suffix_new_rank[2]; thread_segment_index[0] = cub::Sum()(thread_segment_index[0], thread_exclusive_segment_index); thread_segment_index[1] = cub::Sum()(thread_segment_index[1], thread_exclusive_segment_index); thread_segment_index[2] = cub::Sum()(thread_segment_index[2], thread_exclusive_segment_index); thread_segment_index[3] = cub::Sum()(thread_segment_index[3], thread_exclusive_segment_index); - const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; - - if (thread_segment_end_flag.x != 0) { device_offsets_begin[thread_segment_index[0]] = thread_exclusive_suffix_new_rank; device_offsets_end[thread_segment_index[0]] = thread_index + 0; } - if (thread_segment_end_flag.y != 0) { device_offsets_begin[thread_segment_index[1]] = thread_suffix_new_rank[0]; device_offsets_end[thread_segment_index[1]] = thread_index + 1; } - if (thread_segment_end_flag.z != 0) { device_offsets_begin[thread_segment_index[2]] = thread_suffix_new_rank[1]; device_offsets_end[thread_segment_index[2]] = thread_index + 2; } - if (thread_segment_end_flag.w != 0) { device_offsets_begin[thread_segment_index[3]] = thread_suffix_new_rank[2]; device_offsets_end[thread_segment_index[3]] = thread_index + 3; } + if (thread_exclusive_segment_index != thread_segment_index[0]) { device_offsets_begin[thread_segment_index[0]] = thread_exclusive_suffix_new_rank; device_offsets_end[thread_segment_index[0]] = thread_index + 0; } + if (thread_segment_index[0] != thread_segment_index[1]) { device_offsets_begin[thread_segment_index[1]] = thread_suffix_new_rank[0]; device_offsets_end[thread_segment_index[1]] = thread_index + 1; } + if (thread_segment_index[1] != thread_segment_index[2]) { device_offsets_begin[thread_segment_index[2]] = thread_suffix_new_rank[1]; device_offsets_end[thread_segment_index[2]] = thread_index + 2; } + if (thread_segment_index[2] != thread_segment_index[3]) { device_offsets_begin[thread_segment_index[3]] = thread_suffix_new_rank[2]; device_offsets_end[thread_segment_index[3]] = thread_index + 3; } if (scatter_ranks_directly) { @@ -1211,15 +1115,10 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( } else { - thread_updated_suffix_flag.x = thread_suffix_old_rank[0] != thread_suffix_new_rank[0]; - thread_updated_suffix_flag.y = thread_suffix_old_rank[1] != thread_suffix_new_rank[1]; - thread_updated_suffix_flag.z = thread_suffix_old_rank[2] != thread_suffix_new_rank[2]; - thread_updated_suffix_flag.w = thread_suffix_old_rank[3] != thread_suffix_new_rank[3]; - - thread_suffix_index[0] = (uint32_t)thread_updated_suffix_flag.x; - thread_suffix_index[1] = thread_suffix_index[0] + (uint32_t)thread_updated_suffix_flag.y; - thread_suffix_index[2] = thread_suffix_index[1] + (uint32_t)thread_updated_suffix_flag.z; - thread_suffix_index[3] = thread_suffix_index[2] + (uint32_t)thread_updated_suffix_flag.w; + thread_suffix_index[0] = (thread_suffix_old_rank[0] != thread_suffix_new_rank[0]); + thread_suffix_index[1] = thread_suffix_index[0] + (thread_suffix_old_rank[1] != thread_suffix_new_rank[1]); + thread_suffix_index[2] = thread_suffix_index[1] + (thread_suffix_old_rank[2] != thread_suffix_new_rank[2]); + thread_suffix_index[3] = thread_suffix_index[2] + (thread_suffix_old_rank[3] != thread_suffix_new_rank[3]); } } @@ -1230,7 +1129,7 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( typedef cub::WarpScan WarpScan; - __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS]; + __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_WARP_THREADS]; WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_index[3], thread_inclusive_suffix_index, thread_exclusive_suffix_index, (uint32_t)0, cub::Sum()); @@ -1249,7 +1148,7 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( uint32_t warp_inclusive_suffix_index; { - typedef cub::WarpScan WarpScan; + typedef cub::WarpScan WarpScan; __shared__ typename WarpScan::TempStorage warp_scan_storage; @@ -1322,10 +1221,11 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( } { - if (thread_updated_suffix_flag.x + thread_updated_suffix_flag.y + thread_updated_suffix_flag.z + thread_updated_suffix_flag.w > 0) + if (thread_suffix_index[3] > 0) { + uint32_t thread_suffix_new_rank[4]; + uint32_t warp_exclusive_state = warp_state2[threadIdx.x / CUDA_WARP_THREADS]; - thread_exclusive_suffix_index = cub::Sum()(thread_exclusive_suffix_index, warp_exclusive_state); thread_suffix_index[0] = cub::Sum()(thread_suffix_index[0], thread_exclusive_suffix_index); @@ -1336,10 +1236,15 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; const uint4 indexes = __ldg((uint4 *)(device_SA + thread_index)); - if (thread_updated_suffix_flag.x != 0) { device_out_SA[thread_suffix_index[0]] = indexes.x; device_out_ISA[thread_suffix_index[0]] = thread_suffix_new_rank[0]; } - if (thread_updated_suffix_flag.y != 0) { device_out_SA[thread_suffix_index[1]] = indexes.y; device_out_ISA[thread_suffix_index[1]] = thread_suffix_new_rank[1]; } - if (thread_updated_suffix_flag.z != 0) { device_out_SA[thread_suffix_index[2]] = indexes.z; device_out_ISA[thread_suffix_index[2]] = thread_suffix_new_rank[2]; } - if (thread_updated_suffix_flag.w != 0) { device_out_SA[thread_suffix_index[3]] = indexes.w; device_out_ISA[thread_suffix_index[3]] = thread_suffix_new_rank[3]; } + thread_suffix_new_rank[0] = (thread_new_heads.x != 0) ? (thread_index + 0) : thread_exclusive_suffix_new_rank; + thread_suffix_new_rank[1] = (thread_new_heads.y != 0) ? (thread_index + 1) : thread_suffix_new_rank[0]; + thread_suffix_new_rank[2] = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_suffix_new_rank[1]; + thread_suffix_new_rank[3] = (thread_new_heads.w != 0) ? (thread_index + 3) : thread_suffix_new_rank[2]; + + if (thread_exclusive_suffix_index != thread_suffix_index[0]) { device_out_SA[thread_suffix_index[0]] = indexes.x; device_out_ISA[thread_suffix_index[0]] = thread_suffix_new_rank[0]; } + if (thread_suffix_index[0] != thread_suffix_index[1]) { device_out_SA[thread_suffix_index[1]] = indexes.y; device_out_ISA[thread_suffix_index[1]] = thread_suffix_new_rank[1]; } + if (thread_suffix_index[1] != thread_suffix_index[2]) { device_out_SA[thread_suffix_index[2]] = indexes.z; device_out_ISA[thread_suffix_index[2]] = thread_suffix_new_rank[2]; } + if (thread_suffix_index[2] != thread_suffix_index[3]) { device_out_SA[thread_suffix_index[3]] = indexes.w; device_out_ISA[thread_suffix_index[3]] = thread_suffix_new_rank[3]; } } } } @@ -1348,13 +1253,14 @@ static void libcubwt_rank_and_segment_suffixes_incremental_kernel( static cudaError_t libcubwt_rank_and_segment_suffixes(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n, int64_t iteration) { cudaError_t status = cudaSuccess; - int64_t n_segmentation_blocks = 1 + (n / (CUDA_BLOCK_THREADS * 4)); - int64_t n_initialization_blocks = (n_segmentation_blocks + CUDA_BLOCK_THREADS - 1) / CUDA_BLOCK_THREADS; + int64_t n_segmentation_blocks = 1 + (n / (storage->cuda_block_threads * 4)); + int64_t n_initialization_blocks = (n_segmentation_blocks + storage->cuda_block_threads - 1) / storage->cuda_block_threads; bool scatter_ranks_directly = (n <= ((int64_t)1 << (storage->device_L2_cache_bits - 3))); if (iteration == 0) { - libcubwt_rank_and_segment_suffixes_initialization_kernel<<<(uint32_t)n_initialization_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_initialization_kernel<<<(uint32_t)n_initialization_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + storage->device_SA, storage->device_keys_temp_keys, storage->device_heads, storage->device_descriptors_large, @@ -1363,22 +1269,22 @@ static cudaError_t libcubwt_rank_and_segment_suffixes(LIBCUBWT_DEVICE_STORAGE * if (scatter_ranks_directly) { - libcubwt_rank_and_segment_suffixes_initiatory_kernel<<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_initiatory_kernel<<<(uint32_t)n_segmentation_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_SA, storage->device_keys_temp_keys, storage->device_heads, storage->device_ISA, - storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1, + storage->device_offsets - 1, storage->device_offsets + (n / 2) - 1, storage->device_descriptors_large); } else { - libcubwt_rank_and_segment_suffixes_initiatory_kernel<<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_initiatory_kernel<<<(uint32_t)n_segmentation_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( NULL, storage->device_keys_temp_keys, storage->device_heads, storage->device_temp_ISA, - storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1, + storage->device_offsets - 1, storage->device_offsets + (n / 2) - 1, storage->device_descriptors_large); } @@ -1406,22 +1312,22 @@ static cudaError_t libcubwt_rank_and_segment_suffixes(LIBCUBWT_DEVICE_STORAGE * { if ((iteration % 2) == 0) { - libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_SA, storage->device_keys, storage->device_heads, NULL, storage->device_ISA, - storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1, + storage->device_offsets - 1, storage->device_offsets + (n / 2) - 1, storage->device_descriptors_large, storage->device_descriptors_copy); } else { - libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_SA, storage->device_keys, storage->device_heads, NULL, storage->device_ISA, - storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1, + storage->device_offsets - 1, storage->device_offsets + (n / 2) - 1, storage->device_descriptors_large, storage->device_descriptors_copy); } } @@ -1429,22 +1335,22 @@ static cudaError_t libcubwt_rank_and_segment_suffixes(LIBCUBWT_DEVICE_STORAGE * { if ((iteration % 2) == 0) { - libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_SA, storage->device_keys, storage->device_heads, storage->device_temp_SA - 1, storage->device_temp_ISA - 1, - storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1, + storage->device_offsets - 1, storage->device_offsets + (n / 2) - 1, storage->device_descriptors_large, storage->device_descriptors_copy); } else { - libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_rank_and_segment_suffixes_incremental_kernel<<<(uint32_t)n_segmentation_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_SA, storage->device_keys, storage->device_heads, storage->device_temp_SA - 1, storage->device_temp_ISA - 1, - storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1, + storage->device_offsets - 1, storage->device_offsets + (n / 2) - 1, storage->device_descriptors_large, storage->device_descriptors_copy); } } @@ -1483,8 +1389,6 @@ static void libcubwt_gather_unsorted_suffixes_kernel( { __shared__ __align__(32) uint32_t warp_state[1 + CUDA_WARP_THREADS]; - uchar4 thread_updated_suffix_flag; - uint32_t thread_exclusive_suffix_index; uint32_t thread_suffix_index[4]; @@ -1494,15 +1398,10 @@ static void libcubwt_gather_unsorted_suffixes_kernel( const uchar4 current_heads = __ldg((uchar4 *)(device_heads)); const uint8_t next_head = current_heads.w > 0 ? __ldg(device_heads + 4) : 0; - thread_updated_suffix_flag.x = (current_heads.x + current_heads.y < 2); - thread_updated_suffix_flag.y = (current_heads.y + current_heads.z < 2); - thread_updated_suffix_flag.z = (current_heads.z + current_heads.w < 2); - thread_updated_suffix_flag.w = (current_heads.w + next_head < 2); - - thread_suffix_index[0] = (uint32_t)thread_updated_suffix_flag.x; - thread_suffix_index[1] = thread_suffix_index[0] + (uint32_t)thread_updated_suffix_flag.y; - thread_suffix_index[2] = thread_suffix_index[1] + (uint32_t)thread_updated_suffix_flag.z; - thread_suffix_index[3] = thread_suffix_index[2] + (uint32_t)thread_updated_suffix_flag.w; + thread_suffix_index[0] = (current_heads.x + current_heads.y < 2); + thread_suffix_index[1] = thread_suffix_index[0] + (current_heads.y + current_heads.z < 2); + thread_suffix_index[2] = thread_suffix_index[1] + (current_heads.z + current_heads.w < 2); + thread_suffix_index[3] = thread_suffix_index[2] + (current_heads.w + next_head < 2); } { @@ -1510,7 +1409,7 @@ static void libcubwt_gather_unsorted_suffixes_kernel( typedef cub::WarpScan WarpScan; - __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS]; + __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_WARP_THREADS]; WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_index[3], thread_inclusive_suffix_index, thread_exclusive_suffix_index, (uint32_t)0, cub::Sum()); @@ -1529,7 +1428,7 @@ static void libcubwt_gather_unsorted_suffixes_kernel( uint32_t warp_inclusive_suffix_index; { - typedef cub::WarpScan WarpScan; + typedef cub::WarpScan WarpScan; __shared__ typename WarpScan::TempStorage warp_scan_storage; @@ -1600,7 +1499,7 @@ static void libcubwt_gather_unsorted_suffixes_kernel( } { - if (thread_updated_suffix_flag.x + thread_updated_suffix_flag.y + thread_updated_suffix_flag.z + thread_updated_suffix_flag.w > 0) + if (thread_suffix_index[3] > 0) { uint32_t warp_exclusive_state = warp_state[threadIdx.x / CUDA_WARP_THREADS]; @@ -1614,10 +1513,10 @@ static void libcubwt_gather_unsorted_suffixes_kernel( const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; const uint4 indexes = __ldg((uint4 *)(device_SA + thread_index)); - if (thread_updated_suffix_flag.x != 0) { device_out_keys[thread_suffix_index[0]] = thread_index + 0; device_out_SA[thread_suffix_index[0]] = indexes.x; } - if (thread_updated_suffix_flag.y != 0) { device_out_keys[thread_suffix_index[1]] = thread_index + 1; device_out_SA[thread_suffix_index[1]] = indexes.y; } - if (thread_updated_suffix_flag.z != 0) { device_out_keys[thread_suffix_index[2]] = thread_index + 2; device_out_SA[thread_suffix_index[2]] = indexes.z; } - if (thread_updated_suffix_flag.w != 0) { device_out_keys[thread_suffix_index[3]] = thread_index + 3; device_out_SA[thread_suffix_index[3]] = indexes.w; } + if (thread_exclusive_suffix_index != thread_suffix_index[0]) { device_out_keys[thread_suffix_index[0]] = thread_index + 0; device_out_SA[thread_suffix_index[0]] = indexes.x; } + if (thread_suffix_index[0] != thread_suffix_index[1]) { device_out_keys[thread_suffix_index[1]] = thread_index + 1; device_out_SA[thread_suffix_index[1]] = indexes.y; } + if (thread_suffix_index[1] != thread_suffix_index[2]) { device_out_keys[thread_suffix_index[2]] = thread_index + 2; device_out_SA[thread_suffix_index[2]] = indexes.z; } + if (thread_suffix_index[2] != thread_suffix_index[3]) { device_out_keys[thread_suffix_index[3]] = thread_index + 3; device_out_SA[thread_suffix_index[3]] = indexes.w; } } } } @@ -1649,18 +1548,18 @@ static void libcubwt_update_suffix_sorting_keys_kernel(const uint8_t * RESTRICT static cudaError_t libcubwt_update_suffix_sorting_keys(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n, int64_t iteration, int64_t depth) { cudaError_t status = cudaSuccess; - int64_t n_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4); + int64_t n_ranking_blocks = (n + storage->cuda_block_threads * 4 - 1) / (storage->cuda_block_threads * 4); bool gather_keys_directly = (n <= ((int64_t)1 << (storage->device_L2_cache_bits - 2))) || (n > ((int64_t)1 << (storage->device_L2_cache_bits - 2 + 8))); if (gather_keys_directly || (storage->num_unsorted_suffixes <= (n / 4))) { - libcubwt_update_suffix_sorting_keys_kernel<<<(uint32_t)n_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_heads, storage->device_SA, storage->device_ISA + depth, storage->device_keys); + libcubwt_update_suffix_sorting_keys_kernel<<<(uint32_t)n_ranking_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(storage->device_heads, storage->device_SA, storage->device_ISA + depth, storage->device_keys); } else { if ((iteration % 2) == 0) { - libcubwt_gather_unsorted_suffixes_kernel<<<(uint32_t)n_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_gather_unsorted_suffixes_kernel<<<(uint32_t)n_ranking_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_heads, storage->device_SA, storage->device_temp_keys - 1, storage->device_temp_SA - 1, @@ -1668,14 +1567,14 @@ static cudaError_t libcubwt_update_suffix_sorting_keys(LIBCUBWT_DEVICE_STORAGE * } else { - libcubwt_gather_unsorted_suffixes_kernel<<<(uint32_t)n_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>( + libcubwt_gather_unsorted_suffixes_kernel<<<(uint32_t)n_ranking_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( storage->device_heads, storage->device_SA, storage->device_temp_keys - 1, storage->device_temp_SA - 1, storage->device_descriptors_small); } - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->host_pinned_storage, &storage->device_descriptors_small[n_blocks - 1], sizeof(uint2), cudaMemcpyDeviceToHost, storage->cuda_stream)); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->host_pinned_storage, &storage->device_descriptors_small[n_ranking_blocks - 1], sizeof(uint2), cudaMemcpyDeviceToHost, storage->cuda_stream)); status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status); if (status == cudaSuccess) @@ -1700,145 +1599,557 @@ static cudaError_t libcubwt_sort_segmented_suffixes_by_rank(LIBCUBWT_DEVICE_STOR cudaError_t status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceSegmentedSort::SortPairs( storage->device_ssort_temp_storage, storage->device_ssort_temp_storage_size, d_keys, d_values, - (int)n, (int)storage->num_unsorted_segments, - storage->device_offsets, storage->device_offsets + (storage->max_length / 2), + (int)storage->num_unsorted_suffixes, (int)storage->num_unsorted_segments, + storage->device_offsets, storage->device_offsets + (n / 2), storage->cuda_stream)); if (d_keys.selector) { std::swap(storage->device_keys, storage->device_temp_keys); } - if (d_values.selector) { std::swap(storage->device_SA, storage->device_temp_SA); storage->is_partial_suffix_array = true; } + if (d_values.selector) { std::swap(storage->device_SA, storage->device_temp_SA); } return status; } -static cudaError_t libcubwt_compute_inverse_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, int64_t n) +template +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_prepare_mod12_suffixes_kernel(const uint8_t * RESTRICT device_T, const uint32_t * RESTRICT device_ISA, const uint64_t * RESTRICT device_suffixes, const uint32_t rm, const uint32_t rs) { - cudaError_t status = cudaSuccess; - - if ((status = libcubwt_initialize_device_arrays(storage, T, n)) == cudaSuccess && - (status = libcubwt_sort_suffixes_by_prefix(storage, n)) == cudaSuccess) + __shared__ union { - for (int64_t iteration = 0, depth = 7; true; iteration += 1, depth *= 2) + struct { - if ((status = libcubwt_rank_and_segment_suffixes(storage, n, iteration)) != cudaSuccess) - { - break; - } + __align__(32) uint32_t bytes[4 * CUDA_BLOCK_THREADS]; + __align__(32) uint4 ranks[3 * CUDA_BLOCK_THREADS]; + } stage1; - if (storage->num_unsorted_segments == 0) - { - break; - } + struct + { + __align__(32) uint4 suffixes[4 * CUDA_BLOCK_THREADS]; + } stage2; - if ((status = libcubwt_update_suffix_sorting_keys(storage, n, iteration, depth)) != cudaSuccess) - { - break; - } + } shared_storage; - if ((status = libcubwt_sort_segmented_suffixes_by_rank(storage, n)) != cudaSuccess) - { - break; - } + { + device_T += blockIdx.x * CUDA_BLOCK_THREADS * 12 + threadIdx.x * 16; + device_ISA += blockIdx.x * CUDA_BLOCK_THREADS * 8 + threadIdx.x * 4; + + uint4 * RESTRICT thread_bytes = (uint4 *)shared_storage.stage1.bytes + threadIdx.x; + uint4 * RESTRICT thread_ranks = (uint4 *)shared_storage.stage1.ranks + threadIdx.x; + + if (threadIdx.x < (12 * CUDA_BLOCK_THREADS + 4 + 15) / 16) { thread_bytes[0] = __ldg((uint4 *)device_T); } + + thread_ranks[0] = __ldg((uint4 *)device_ISA); thread_ranks += CUDA_BLOCK_THREADS; device_ISA += CUDA_BLOCK_THREADS * 4; + thread_ranks[0] = __ldg((uint4 *)device_ISA); thread_ranks += CUDA_BLOCK_THREADS; device_ISA += CUDA_BLOCK_THREADS * 4; + if (threadIdx.x == 0) { thread_ranks[0] = __ldg((uint4 *)device_ISA); } + } + + { + __syncthreads(); + + uint32_t bytes0 = shared_storage.stage1.bytes[threadIdx.x * 3 + 0]; + uint32_t bytes1 = shared_storage.stage1.bytes[threadIdx.x * 3 + 1]; + uint32_t bytes2 = shared_storage.stage1.bytes[threadIdx.x * 3 + 2]; + uint32_t bytes3 = shared_storage.stage1.bytes[threadIdx.x * 3 + 3]; + + uint4 ranks0 = shared_storage.stage1.ranks[threadIdx.x * 2 + 0]; + uint4 ranks1 = shared_storage.stage1.ranks[threadIdx.x * 2 + 1]; + uint4 ranks2 = shared_storage.stage1.ranks[threadIdx.x * 2 + 2]; + + __syncthreads(); + + uint32_t v4 = 0, v8 = 0; + + if (process_auxiliary_indexes) + { + const uint32_t i4 = blockIdx.x * CUDA_BLOCK_THREADS * 12 + threadIdx.x * 12 + 4 + rm + 1; + const uint32_t i8 = blockIdx.x * CUDA_BLOCK_THREADS * 12 + threadIdx.x * 12 + 8 + rm + 1; + + if ((i4 & rm) == 0) { v4 = (i4 >> rs) << 24; } + if ((i8 & rm) == 0) { v8 = (i8 >> rs) << 24; } } + + shared_storage.stage2.suffixes[threadIdx.x * 4 + 0] = make_uint4 + ( + ranks0.y, __byte_perm(bytes0, 0, 0x4021), + ranks0.z | (uint32_t)INT32_MIN, __byte_perm(bytes0, 0, 0x4132) + ); + + shared_storage.stage2.suffixes[threadIdx.x * 4 + 1] = make_uint4 + ( + ranks0.w, (__byte_perm(bytes0, bytes1, 0x0354) & 0xffffffu) | v4, + ranks1.x | (uint32_t)INT32_MIN, __byte_perm(bytes1, 0, 0x4021) + ); + + shared_storage.stage2.suffixes[threadIdx.x * 4 + 2] = make_uint4 + ( + ranks1.y, __byte_perm(bytes1, bytes2, 0x0243) & 0xffffffu, + ranks1.z | (uint32_t)INT32_MIN, (__byte_perm(bytes1, bytes2, 0x0354) & 0xffffffu) | v8 + ); + + shared_storage.stage2.suffixes[threadIdx.x * 4 + 3] = make_uint4 + ( + ranks1.w, __byte_perm(bytes2, 0, 0x4132), + ranks2.x | (uint32_t)INT32_MIN, __byte_perm(bytes2, bytes3, 0x0243) & 0xffffffu + ); + + __syncwarp(); } - return status; + { + device_suffixes += blockIdx.x * CUDA_BLOCK_THREADS * 8; + + uint4 * RESTRICT thread_src = shared_storage.stage2.suffixes + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS); + uint4 * RESTRICT thread_dst = (uint4 *)device_suffixes + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS); + + thread_dst[0] = thread_src[0]; thread_src += CUDA_WARP_THREADS; thread_dst += CUDA_WARP_THREADS; + thread_dst[0] = thread_src[0]; thread_src += CUDA_WARP_THREADS; thread_dst += CUDA_WARP_THREADS; + thread_dst[0] = thread_src[0]; thread_src += CUDA_WARP_THREADS; thread_dst += CUDA_WARP_THREADS; + thread_dst[0] = thread_src[0]; + } } -static cudaError_t libcubwt_copy_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * SA, int64_t n) +template +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_prepare_mod0_suffixes_kernel(const uint8_t * RESTRICT device_T, const uint32_t * RESTRICT device_ISA, const uint64_t * RESTRICT device_suffixes_lh, const uint32_t * RESTRICT device_suffixes_hh, const uint32_t rm, const uint32_t rs) { - return libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(SA, storage->device_SA, n * sizeof(uint32_t), cudaMemcpyDeviceToHost, storage->cuda_stream)); + __shared__ __align__(32) uint16_t bytes[3 * CUDA_BLOCK_THREADS + 8]; + + { + device_T += blockIdx.x * CUDA_BLOCK_THREADS * 6 + threadIdx.x * 16; + + uint4 * RESTRICT thread_bytes = (uint4 *)bytes + threadIdx.x; + + if (threadIdx.x <= (6 * CUDA_BLOCK_THREADS) / 16) { thread_bytes[0] = __ldg((uint4 *)(device_T - 16)); } + } + + { + device_ISA += blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; + device_suffixes_lh += blockIdx.x * CUDA_BLOCK_THREADS * 2 + threadIdx.x * 2; + device_suffixes_hh += blockIdx.x * CUDA_BLOCK_THREADS * 2 + threadIdx.x * 2; + + __syncthreads(); + + uint32_t bytes0 = bytes[threadIdx.x * 3 + 7 ]; + uint32_t bytes1 = bytes[threadIdx.x * 3 + 8 ]; + uint32_t bytes2 = bytes[threadIdx.x * 3 + 9 ]; + uint32_t bytes3 = bytes[threadIdx.x * 3 + 10]; + uint4 ranks = __ldg((uint4 *)(device_ISA)); + + uint32_t v0 = 0; + + if (process_auxiliary_indexes) + { + const uint32_t i0 = blockIdx.x * CUDA_BLOCK_THREADS * 6 + threadIdx.x * 6 + 0 + rm + 1; + + if ((i0 & rm) == 0) { v0 = (i0 >> rs) << 24; } + } + else if ((blockIdx.x | threadIdx.x) == 0) + { + v0 = 1u << 24; + } + + *(uint4 *)(device_suffixes_lh) = make_uint4 + ( + ranks.x, __byte_perm(bytes0, bytes1, 0x3154) | v0, + ranks.z, __byte_perm(bytes2, bytes3, 0x3041) + ); + + *(uint2 *)(device_suffixes_hh) = make_uint2(ranks.y | (uint32_t)INT32_MIN, ranks.w | (uint32_t)INT32_MIN); + } } -static cudaError_t libcubwt_copy_inverse_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * ISA, int64_t n) +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, 1) +static void libcubwt_set_sentinel_suffixes_kernel(uint64_t * RESTRICT device_mod0l_suffixes_end, uint32_t * RESTRICT device_mod0h_suffixes_end,uint64_t * RESTRICT device_mod12_suffixes_end) { - return libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(ISA, storage->device_ISA, n * sizeof(uint32_t), cudaMemcpyDeviceToHost, storage->cuda_stream)); + uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS + threadIdx.x; + + device_mod0l_suffixes_end += thread_index; + device_mod0h_suffixes_end += thread_index; + device_mod12_suffixes_end += thread_index; + + *(uint2 *)(device_mod0l_suffixes_end) = make_uint2(0x7fffffffu - 12 * CUDA_BLOCK_THREADS + 2 * thread_index + 0, 0x00ffffffu); + *(uint32_t *)(device_mod0h_suffixes_end) = (uint32_t)(0xffffffffu - 12 * CUDA_BLOCK_THREADS + 2 * thread_index + 0 ); + *(uint2 *)(device_mod12_suffixes_end) = make_uint2(0x7fffffffu - 12 * CUDA_BLOCK_THREADS + 2 * thread_index + 1, 0x00ffffffu); } -static cudaError_t libcubwt_synchronize_cuda_stream(LIBCUBWT_DEVICE_STORAGE * storage) +__device__ __forceinline__ +bool libcubwt_compare_suffixes_kernel(const uint2 mod0l_suffix, const uint32_t mod0h_suffix, const uint2 mod12_suffix) { - return libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream)); + uint32_t difference = __byte_perm(mod0l_suffix.y, 0, 0x4401) - __byte_perm(mod12_suffix.y, 0, 0x4401); + if (difference == 0) { difference = (((int32_t)mod12_suffix.x < 0) ? mod0h_suffix : mod0l_suffix.x) - mod12_suffix.x; } + + return (int32_t)difference <= 0; } -__global__ __launch_bounds__(CUDA_BLOCK_THREADS, 1) -static void libcubwt_extract_auxiliary_indexes_kernel(const uint32_t * RESTRICT device_ISA, uint32_t * RESTRICT device_offsets, uint32_t n, uint32_t r) +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_suffixes_merge_initialization_kernel( + const uint64_t * RESTRICT device_mod0l_suffixes, + const uint32_t * RESTRICT device_mod0h_suffixes, + const uint32_t num_mod0_suffixes, + + const uint64_t * RESTRICT device_mod12_suffixes, + const uint32_t num_mod12_suffixes, + + uint32_t * RESTRICT device_suffixes_merge_path, + uint32_t num_merging_blocks) { - for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS) + uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS + threadIdx.x; + + if (thread_index <= num_merging_blocks) { - device_offsets[thread_index] = __ldg(device_ISA + (thread_index * r)); + uint32_t diagonal = thread_index * CUDA_BLOCK_THREADS * 5; + uint32_t begin = (diagonal > num_mod12_suffixes) ? (diagonal - num_mod12_suffixes) : 0; + uint32_t end = (diagonal > num_mod0_suffixes ) ? (num_mod0_suffixes ) : diagonal; + + while (begin < end) + { + uint32_t pivot = begin + ((end - begin) >> 1); + + bool predicate = libcubwt_compare_suffixes_kernel( + __ldg((uint2 *)(device_mod0l_suffixes + pivot)), + __ldg((uint32_t *)(device_mod0h_suffixes + pivot)), + __ldg((uint2 *)(device_mod12_suffixes + diagonal - pivot - 1))); + + begin = predicate ? (pivot + 1) : begin; + end = predicate ? (end ) : pivot; + } + + __syncwarp(); + + device_suffixes_merge_path[thread_index] = begin; } } -static cudaError_t libcubwt_extract_auxiliary_indexes(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n, int64_t r, uint32_t * I) +template +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_merge_suffixes_kernel( + const uint64_t * RESTRICT device_mod0l_suffixes, + const uint32_t * RESTRICT device_mod0h_suffixes, + const uint64_t * RESTRICT device_mod12_suffixes, + const uint32_t * RESTRICT device_suffixes_merge_path, + + uint32_t * RESTRICT device_auxiliary_indexes, + uint8_t * RESTRICT device_L) { - cudaError_t status = cudaSuccess; - int64_t n_indexes = (n + r - 1) / r; - uint32_t * buffer = ((sizeof(uint32_t) * n_indexes) <= storage->host_pinned_storage_size) ? (uint32_t *)storage->host_pinned_storage : I; - - if (n_indexes > 1) + __shared__ union { - libcubwt_extract_auxiliary_indexes_kernel<<<1, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_ISA, storage->device_offsets, (uint32_t)n_indexes, (uint32_t)r); + struct + { + __align__(32) uint2 suffixes_l[CUDA_BLOCK_THREADS * 5 + 12]; + __align__(32) uint32_t suffixes_h[CUDA_BLOCK_THREADS * 5 + 12]; + } stage1; + + struct + { + __align__(32) uint8_t bwt[CUDA_BLOCK_THREADS * 5]; + } stage2; + + } shared_storage; + + uint32_t num_mod0_suffixes; + uint32_t num_mod12_suffixes; - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(buffer, storage->device_offsets, sizeof(uint32_t) * n_indexes, cudaMemcpyDeviceToHost, storage->cuda_stream), status); - } - else { - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(buffer, storage->device_ISA, sizeof(uint32_t) * n_indexes, cudaMemcpyDeviceToHost, storage->cuda_stream), status); + const uint32_t block_mod0_path_begin = (device_suffixes_merge_path + blockIdx.x)[0]; + const uint32_t block_mod0_path_end = (device_suffixes_merge_path + blockIdx.x)[1]; + + num_mod0_suffixes = block_mod0_path_end - block_mod0_path_begin + 6; + num_mod12_suffixes = CUDA_BLOCK_THREADS * 5 + 12 - num_mod0_suffixes; + + device_mod0l_suffixes += block_mod0_path_begin; + device_mod0h_suffixes += block_mod0_path_begin; + + device_mod12_suffixes += (blockIdx.x * CUDA_BLOCK_THREADS * 5 - block_mod0_path_begin); + device_mod12_suffixes -= num_mod0_suffixes; + + #pragma unroll + for (uint32_t thread_index = threadIdx.x; thread_index < CUDA_BLOCK_THREADS * 5 + 12; thread_index += CUDA_BLOCK_THREADS) + { + if (thread_index < num_mod0_suffixes) { shared_storage.stage1.suffixes_h[thread_index] = __ldg(device_mod0h_suffixes + thread_index); } + shared_storage.stage1.suffixes_l[thread_index] = __ldg((uint2 *)(thread_index < num_mod0_suffixes ? device_mod0l_suffixes : device_mod12_suffixes) + thread_index); + } + + __syncthreads(); } - if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status)) == cudaSuccess) { - if (I != buffer) { memcpy(I, buffer, sizeof(uint32_t) * n_indexes); } + uint32_t diagonal = threadIdx.x * 5; + uint32_t begin = (diagonal > num_mod12_suffixes) ? (diagonal - num_mod12_suffixes) : 0; + uint32_t end = (diagonal > num_mod0_suffixes ) ? (num_mod0_suffixes ) : diagonal; + + while (begin < end) + { + uint32_t pivot = (begin + end) >> 1; + + bool predicate = libcubwt_compare_suffixes_kernel( + shared_storage.stage1.suffixes_l[pivot], + shared_storage.stage1.suffixes_h[pivot], + shared_storage.stage1.suffixes_l[num_mod0_suffixes + diagonal - pivot - 1]); + + begin = predicate ? (pivot + 1) : begin; + end = predicate ? (end ) : pivot; + } + + __syncwarp(); + + uint32_t suffixes[5]; + + { + uint32_t mod0_index = begin; + uint32_t mod12_index = num_mod0_suffixes + diagonal - begin; + uint2 mod0l_suffix = shared_storage.stage1.suffixes_l[mod0_index]; + uint32_t mod0h_suffix = shared_storage.stage1.suffixes_h[mod0_index]; + uint2 mod12_suffix = shared_storage.stage1.suffixes_l[mod12_index]; + + #pragma unroll + for (uint32_t item = 0; item < 5; ++item) + { + bool predicate = libcubwt_compare_suffixes_kernel(mod0l_suffix, mod0h_suffix, mod12_suffix); + + suffixes[item] = predicate ? mod0l_suffix.y : mod12_suffix.y; + + if ( predicate) { mod0_index += 1; mod0l_suffix = shared_storage.stage1.suffixes_l[mod0_index]; mod0h_suffix = shared_storage.stage1.suffixes_h[mod0_index]; } + if (!predicate) { mod12_index += 1; mod12_suffix = shared_storage.stage1.suffixes_l[mod12_index]; } + } + + __syncthreads(); + } - for (int64_t index = 0; index < n_indexes; index += 1) { I[index] += 1; } + { + #pragma unroll + for (uint32_t item = 0; item < 5; ++item) + { + if (suffixes[item] >= 0x01000000u) + { + device_auxiliary_indexes[process_auxiliary_indexes ? suffixes[item] >> 24 : 1] = blockIdx.x * CUDA_BLOCK_THREADS * 5 + diagonal + item; + } + + shared_storage.stage2.bwt[diagonal + item] = (uint8_t)(suffixes[item] >> 16); + } + + __syncthreads(); + } } - return status; + { + device_L += blockIdx.x * CUDA_BLOCK_THREADS * 5 + threadIdx.x * 16; + if (threadIdx.x < (CUDA_BLOCK_THREADS * 5 / 16)) { ((uint4 *)device_L)[0] = ((uint4 *)shared_storage.stage2.bwt)[threadIdx.x]; } + } } -__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) -static void libcubwt_reinitialize_suffix_array_kernel(uint32_t * RESTRICT device_SA) +static cudaError_t libcubwt_compute_burrows_wheeler_transform(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, int64_t input_n, int64_t r, uint32_t * I) { - const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4; + cudaError_t status = cudaSuccess; - *(uint4 *)(device_SA + thread_index) = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); -} + int64_t reduced_n = (input_n / 3) * 2 + 2; + int64_t expanded_n = (reduced_n / 2) * 3 + 0; + int64_t num_indexes = (input_n + r - 1) / r; -static cudaError_t libcubwt_compute_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n) -{ - cudaError_t status = cudaSuccess; + if ((status = libcubwt_initialize_device_arrays(storage, T, reduced_n, expanded_n, input_n)) == cudaSuccess) + { + status = libcubwt_sort_suffixes_by_prefix(storage, reduced_n); + } + + if (status == cudaSuccess) + { + for (int64_t iteration = 0, depth = 4; true; iteration += 1, depth *= 2) + { + if ((status = libcubwt_rank_and_segment_suffixes(storage, reduced_n, iteration)) != cudaSuccess) + { + break; + } + + if (storage->num_unsorted_segments == 0) + { + break; + } - if (storage->is_partial_suffix_array) + if ((status = libcubwt_update_suffix_sorting_keys(storage, reduced_n, iteration, depth)) != cudaSuccess) + { + break; + } + + if ((status = libcubwt_sort_segmented_suffixes_by_rank(storage, reduced_n)) != cudaSuccess) + { + break; + } + } + } + + if (status == cudaSuccess) { - int64_t n_reinitialize_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4); + int64_t num_mod0_suffixes = (input_n / 3) * 1 + ((input_n % 3) != 0); + int64_t num_mod12_suffixes = (input_n / 3) * 2 + ((input_n % 3) == 2); + + if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->device_temp_ISA, storage->device_ISA, reduced_n * sizeof(uint32_t), cudaMemcpyDeviceToDevice, storage->cuda_stream))) == cudaSuccess) + { + cub::DoubleBuffer db_mod12_suffixes(storage->device_keys_temp_keys, storage->device_SA_temp_SA); + + if (status == cudaSuccess) + { + { + int64_t n_preparing_blocks = (num_mod12_suffixes + storage->cuda_block_threads * 8 - 1) / (storage->cuda_block_threads * 8); + + if (num_indexes > 1) + { + uint32_t rm = (uint32_t)(r - 1), rs = 0; while (rm >= ((uint32_t)1 << rs)) { rs += 1; } + + libcubwt_prepare_mod12_suffixes_kernel<<<(uint32_t)n_preparing_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + storage->device_T, storage->device_ISA, + db_mod12_suffixes.Current(), + rm, rs); + } + else + { + libcubwt_prepare_mod12_suffixes_kernel<<<(uint32_t)n_preparing_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + storage->device_T, storage->device_ISA, + db_mod12_suffixes.Current(), + 0, 0); + } + } + + { + cub::DoubleBuffer db_index(storage->device_ISA, storage->device_offsets); + status = libcubwt_scatter_values_uint64(storage, db_index, db_mod12_suffixes, num_mod12_suffixes, reduced_n, reduced_n - num_mod12_suffixes); + } + } + + cub::DoubleBuffer db_mod0h_suffixes(storage->device_ISA, storage->device_offsets); + cub::DoubleBuffer db_mod0l_suffixes = db_mod12_suffixes.Current() == storage->device_keys_temp_keys + ? cub::DoubleBuffer((uint64_t *)storage->device_SA, (uint64_t *)storage->device_temp_SA) + : cub::DoubleBuffer((uint64_t *)storage->device_keys, (uint64_t *)storage->device_temp_keys); + + if (status == cudaSuccess) + { + { + int64_t n_preparing_blocks = (num_mod0_suffixes + storage->cuda_block_threads * 2 - 1) / (storage->cuda_block_threads * 2); + + if (num_indexes > 1) + { + uint32_t rm = (uint32_t)(r - 1), rs = 0; while (rm >= ((uint32_t)1 << rs)) { rs += 1; } + + libcubwt_prepare_mod0_suffixes_kernel<<<(uint32_t)n_preparing_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + storage->device_T, storage->device_temp_ISA, + db_mod0l_suffixes.Current(), db_mod0h_suffixes.Current(), + rm, rs); + } + else + { + libcubwt_prepare_mod0_suffixes_kernel<<<(uint32_t)n_preparing_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + storage->device_T, storage->device_temp_ISA, + db_mod0l_suffixes.Current(), db_mod0h_suffixes.Current(), + 0, 0); + } + } + + if (reduced_n <= (1 << 24)) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs( + storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size, + db_mod0l_suffixes, db_mod0h_suffixes, + (uint32_t)num_mod0_suffixes, + 0, 24, + storage->cuda_stream)); + + if (status == cudaSuccess) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs( + storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size, + db_mod0l_suffixes, db_mod0h_suffixes, + (uint32_t)num_mod0_suffixes, + 32, 40, + storage->cuda_stream)); + } + } + else + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs( + storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size, + db_mod0l_suffixes, db_mod0h_suffixes, + (uint32_t)num_mod0_suffixes, + 0, 40, + storage->cuda_stream)); + } + } + + if (status == cudaSuccess) + { + int64_t n_merging_blocks = (input_n + storage->cuda_block_threads * 5 - 1) / (storage->cuda_block_threads * 5); + + { + libcubwt_set_sentinel_suffixes_kernel<<<6, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + db_mod0l_suffixes.Current() + num_mod0_suffixes, + db_mod0h_suffixes.Current() + num_mod0_suffixes, + db_mod12_suffixes.Current() + num_mod12_suffixes); + } + + { + int64_t n_merge_initialization_blocks = 1 + (n_merging_blocks / storage->cuda_block_threads); - libcubwt_reinitialize_suffix_array_kernel<<<(uint32_t)n_reinitialize_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_temp_SA); + libcubwt_suffixes_merge_initialization_kernel<<<(uint32_t)n_merge_initialization_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + db_mod0l_suffixes.Current(), db_mod0h_suffixes.Current(), (uint32_t)(num_mod0_suffixes + 6 * storage->cuda_block_threads), + db_mod12_suffixes.Current(), (uint32_t)(num_mod12_suffixes + 6 * storage->cuda_block_threads), + (uint32_t *)storage->device_descriptors_large, (uint32_t)n_merging_blocks); + } - status = libcubwt_permute_values_uint32(storage, storage->device_ISA, storage->device_temp_SA, storage->device_SA, n, storage->device_keys, storage->device_temp_keys); + { + if (num_indexes > 1) + { + libcubwt_merge_suffixes_kernel<<<(uint32_t)n_merging_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + db_mod0l_suffixes.Current(), db_mod0h_suffixes.Current(), db_mod12_suffixes.Current(), + (uint32_t *)storage->device_descriptors_large, + (uint32_t *)storage->device_descriptors_small - 1, + storage->device_T); + } + else + { + libcubwt_merge_suffixes_kernel<<<(uint32_t)n_merging_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>( + db_mod0l_suffixes.Current(), db_mod0h_suffixes.Current(), db_mod12_suffixes.Current(), + (uint32_t *)storage->device_descriptors_large, + (uint32_t *)storage->device_descriptors_small - 1, + storage->device_T); + } + } + } + + if (status == cudaSuccess) + { + uint32_t * buffer = ((sizeof(uint32_t) * num_indexes) <= storage->host_pinned_storage_size) ? (uint32_t *)storage->host_pinned_storage : I; + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(buffer, storage->device_descriptors_small, sizeof(uint32_t) * num_indexes, cudaMemcpyDeviceToHost, storage->cuda_stream), status); + + if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status)) == cudaSuccess) + { + if (I != buffer) { memcpy(I, buffer, sizeof(uint32_t) * num_indexes); } + + for (int64_t index = 0; index < num_indexes; index += 1) { I[index] += 1; } + } + } + } } return status; } -static cudaError_t libcubwt_compute_burrows_wheeler_transform(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, uint8_t * L, int64_t n, int64_t index) +static cudaError_t libcubwt_copy_burrows_wheeler_transform(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, uint8_t * L, int64_t input_n, int64_t index) { cudaError_t status = cudaSuccess; - if ((status = libcubwt_permute_values_uint8(storage, storage->device_ISA, storage->device_T - 1, storage->device_heads, n, storage->device_keys, (uint8_t *)storage->device_temp_keys)) == cudaSuccess) - { - L[0] = T[n - 1]; + L[0] = T[input_n - 1]; - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(L + 1, storage->device_heads, (size_t)(index - 1), cudaMemcpyDeviceToHost, storage->cuda_stream), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(L + index, storage->device_heads + index, (size_t)(n - index), cudaMemcpyDeviceToHost, storage->cuda_stream), status); - } + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(L + 1, storage->device_T, (size_t)(index - 1), cudaMemcpyDeviceToHost, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(L + index, storage->device_T + index, (size_t)(input_n - index), cudaMemcpyDeviceToHost, storage->cuda_stream), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status); return status; } int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_length) { - max_length = ((max_length + (int64_t)(CUDA_DEVICE_PADDING) - (int64_t)1) / (int64_t)(CUDA_DEVICE_PADDING)) * (int64_t)(CUDA_DEVICE_PADDING); + int64_t max_reduced_length = ((max_length / 3) * 2 + 2 + 1023) & (-1024); + int64_t max_expanded_length = ((max_reduced_length / 2) * 3 + 0 + 1023) & (-1024); - if ((device_storage == NULL) || (max_length >= UINT32_MAX)) + if ((device_storage == NULL) || (max_expanded_length >= INT32_MAX)) { return LIBCUBWT_BAD_PARAMETER; } @@ -1852,85 +2163,107 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len cudaError_t status = cudaSuccess; - int32_t cuda_device_ordinal; - int32_t cuda_device_L2_cache_size; - - libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaGetDevice(&cuda_device_ordinal), status); - libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&cuda_device_L2_cache_size, cudaDevAttrL2CacheSize, cuda_device_ordinal), status); - - if (status == cudaSuccess) { + int32_t cuda_device_ordinal; + int32_t cuda_device_L2_cache_size; + int32_t cuda_device_capability; + + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaGetDevice(&cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&cuda_device_L2_cache_size, cudaDevAttrL2CacheSize, cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::PtxVersion(cuda_device_capability, cuda_device_ordinal), status); + + if (status == cudaSuccess) { storage->device_L2_cache_bits = 0; while (cuda_device_L2_cache_size >>= 1) { storage->device_L2_cache_bits += 1; }; + + storage->cuda_block_threads = (cuda_device_capability == 860 || cuda_device_capability == 870 || cuda_device_capability == 890) ? 768 : 512; } + } + + if (status == cudaSuccess) + { + int64_t num_descriptors = ((max_reduced_length / (storage->cuda_block_threads * 4)) + 1024) & (-1024); { cub::DoubleBuffer uint8_db; cub::DoubleBuffer uint32_db; cub::DoubleBuffer uint64_db; - size_t temp_radix_sort_k32v8 = 0; - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k32v8, uint32_db, uint8_db, (uint32_t)max_length), status); + size_t temp_radix_segmented_sort_k32v32 = 0; + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceSegmentedSort::SortPairs(NULL, temp_radix_segmented_sort_k32v32, uint32_db, uint32_db, (int)max_reduced_length, (int)max_reduced_length / 2, uint32_db.Current(), uint32_db.Current()), status); size_t temp_radix_sort_k32v32 = 0; - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k32v32, uint32_db, uint32_db, (uint32_t)max_length), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k32v32, uint32_db, uint32_db, (uint32_t)max_reduced_length), status); size_t temp_radix_sort_k64v32 = 0; - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k64v32, uint64_db, uint32_db, (uint32_t)max_length), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k64v32, uint64_db, uint32_db, (uint32_t)max_reduced_length), status); + + size_t temp_radix_sort_k32v64 = 0; + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k32v64, uint32_db, uint64_db, (uint32_t)max_reduced_length), status); - storage->device_rsort_temp_storage_size = std::max(std::max(temp_radix_sort_k32v8, temp_radix_sort_k32v32), temp_radix_sort_k64v32); + storage->device_ssort_temp_storage_size = std::max(temp_radix_segmented_sort_k32v32, (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t)); + storage->device_rsort_temp_storage_size = std::max(std::max(temp_radix_sort_k32v32, temp_radix_sort_k64v32), temp_radix_sort_k32v64); + + storage->device_ssort_temp_storage_size = (storage->device_ssort_temp_storage_size + (size_t)1023) & (size_t)(-1024); + storage->device_rsort_temp_storage_size = (storage->device_rsort_temp_storage_size + (size_t)1023) & (size_t)(-1024); } + if (status == cudaSuccess) { - cub::DoubleBuffer uint32_db; + size_t device_storage_size = 0; - size_t temp_radix_segmented_sort_k32v32 = 0; - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceSegmentedSort::SortPairs(NULL, temp_radix_segmented_sort_k32v32, uint32_db, uint32_db, (int)max_length, (int)max_length / 2, uint32_db.Current(), uint32_db.Current()), status); + device_storage_size += storage->device_ssort_temp_storage_size; + device_storage_size += storage->device_rsort_temp_storage_size; - storage->device_ssort_temp_storage_size = std::max(temp_radix_segmented_sort_k32v32, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t)); - } + device_storage_size += (max_expanded_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); - int64_t n_descriptors = (max_length / (CUDA_BLOCK_THREADS * 4) + 4) & (-4); + device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); + device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc1, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t)), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc2, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t)), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc3, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t)), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc4, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t)), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_ssort_temp_storage, storage->device_ssort_temp_storage_size), status); + device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint2); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc5, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t)) , status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc6, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t)) , status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_storage, device_storage_size), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc7, (3 * n_descriptors + (int64_t)6 * CUDA_DEVICE_PADDING) * sizeof(uint4)) , status); - - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size = CUDA_DEVICE_PADDING * sizeof(uint32_t)), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamCreate(&storage->cuda_stream), status); + if (status == cudaSuccess) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size = 256 * sizeof(uint32_t)), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamCreate(&storage->cuda_stream), status); + } + } if (status == cudaSuccess) { - storage->max_length = max_length; + uint8_t * device_alloc = (uint8_t *)storage->device_storage; - storage->device_T = (uint8_t *)storage->device_alloc5 + CUDA_DEVICE_PADDING; - storage->device_heads = (uint8_t *)storage->device_alloc6 + CUDA_DEVICE_PADDING; + storage->device_ssort_temp_storage = (void *)device_alloc; device_alloc += storage->device_ssort_temp_storage_size; + storage->device_rsort_temp_storage = (void *)device_alloc; device_alloc += storage->device_rsort_temp_storage_size; - storage->device_SA = (uint32_t *)storage->device_alloc3 + CUDA_DEVICE_PADDING; - storage->device_temp_SA = (uint32_t *)storage->device_alloc4 + CUDA_DEVICE_PADDING; + storage->device_T = (uint8_t *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (max_expanded_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); - storage->device_temp_ISA = (uint32_t *)storage->device_ssort_temp_storage + CUDA_DEVICE_PADDING; + storage->device_heads = (uint8_t *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); + storage->device_SA_temp_SA = (uint64_t *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_keys_temp_keys = (uint64_t *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_offsets_ISA = (uint64_t *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - storage->device_keys = (uint32_t *)storage->device_alloc1 + CUDA_DEVICE_PADDING; - storage->device_offsets = (uint32_t *)storage->device_alloc2 + CUDA_DEVICE_PADDING; + storage->device_descriptors_large = (uint4 *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + storage->device_descriptors_copy = (uint4 *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + storage->device_descriptors_small = (uint2 *)(void *)device_alloc + CUDA_DEVICE_PADDING; device_alloc += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint2); - storage->device_temp_keys = (uint32_t *)storage->device_alloc1 + 3 * CUDA_DEVICE_PADDING + max_length; - storage->device_ISA = (uint32_t *)storage->device_alloc2 + 3 * CUDA_DEVICE_PADDING + max_length; + storage->device_temp_ISA = (uint32_t *)(void *)storage->device_ssort_temp_storage + CUDA_DEVICE_PADDING; - storage->device_keys_temp_keys = (uint64_t *)storage->device_alloc1 + CUDA_DEVICE_PADDING; - storage->device_offsets_ISA = (uint64_t *)storage->device_alloc2 + CUDA_DEVICE_PADDING; + storage->device_SA = (uint32_t *)(void *)(storage->device_SA_temp_SA - CUDA_DEVICE_PADDING) + 1 * CUDA_DEVICE_PADDING; + storage->device_keys = (uint32_t *)(void *)(storage->device_keys_temp_keys - CUDA_DEVICE_PADDING) + 1 * CUDA_DEVICE_PADDING; + storage->device_offsets = (uint32_t *)(void *)(storage->device_offsets_ISA - CUDA_DEVICE_PADDING) + 1 * CUDA_DEVICE_PADDING; - storage->device_descriptors_large = (uint4 *)storage->device_alloc7 + CUDA_DEVICE_PADDING; - storage->device_descriptors_copy = (uint4 *)storage->device_alloc7 + 3 * CUDA_DEVICE_PADDING + 1 * n_descriptors; - storage->device_descriptors_small = (uint2 *)storage->device_alloc7 + 10 * CUDA_DEVICE_PADDING + 4 * n_descriptors; + storage->device_temp_SA = (uint32_t *)(void *)(storage->device_SA_temp_SA - CUDA_DEVICE_PADDING) + 3 * CUDA_DEVICE_PADDING + max_reduced_length; + storage->device_temp_keys = (uint32_t *)(void *)(storage->device_keys_temp_keys - CUDA_DEVICE_PADDING) + 3 * CUDA_DEVICE_PADDING + max_reduced_length; + storage->device_ISA = (uint32_t *)(void *)(storage->device_offsets_ISA - CUDA_DEVICE_PADDING) + 3 * CUDA_DEVICE_PADDING + max_reduced_length; + + storage->max_length = max_length; *device_storage = storage; return LIBCUBWT_NO_ERROR; @@ -1952,17 +2285,12 @@ int64_t libcubwt_free_device_storage(void * device_storage) LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; if (storage != NULL) { - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamDestroy(storage->cuda_stream), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFreeHost((void *)storage->host_pinned_storage), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc7), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_rsort_temp_storage), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc6), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc5), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_ssort_temp_storage), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc4), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc3), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc2), status); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc1), status); + if (storage->device_storage != NULL) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamDestroy(storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFreeHost((void *)storage->host_pinned_storage), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_storage), status); + } free(storage); } @@ -1970,69 +2298,6 @@ int64_t libcubwt_free_device_storage(void * device_storage) return status != cudaSuccess ? libcubwt_get_error_code(status) : LIBCUBWT_NO_ERROR; } -int64_t libcubwt_sa(void * device_storage, const uint8_t * T, uint32_t * SA, int64_t n) -{ - LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; - - if ((storage == NULL) || (T == NULL) || (SA == NULL) || (n < 16) || (n > storage->max_length)) - { - return LIBCUBWT_BAD_PARAMETER; - } - - cudaError_t status; - if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && - (status = libcubwt_compute_suffix_array(storage, n)) == cudaSuccess && - (status = libcubwt_copy_suffix_array(storage, SA, n)) == cudaSuccess && - (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) - { - return LIBCUBWT_NO_ERROR; - } - - return libcubwt_get_error_code(status); -} - -int64_t libcubwt_isa(void * device_storage, const uint8_t * T, uint32_t * ISA, int64_t n) -{ - LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; - - if ((storage == NULL) || (T == NULL) || (ISA == NULL) || (n < 16) || (n > storage->max_length)) - { - return LIBCUBWT_BAD_PARAMETER; - } - - cudaError_t status; - if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && - (status = libcubwt_copy_inverse_suffix_array(storage, ISA, n)) == cudaSuccess && - (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) - { - return LIBCUBWT_NO_ERROR; - } - - return libcubwt_get_error_code(status); -} - -int64_t libcubwt_sa_isa(void * device_storage, const uint8_t * T, uint32_t * SA, uint32_t * ISA, int64_t n) -{ - LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; - - if ((storage == NULL) || (T == NULL) || (ISA == NULL) || (n < 16) || (n > storage->max_length)) - { - return LIBCUBWT_BAD_PARAMETER; - } - - cudaError_t status; - if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && - (status = libcubwt_copy_inverse_suffix_array(storage, ISA, n)) == cudaSuccess && - (status = libcubwt_compute_suffix_array(storage, n)) == cudaSuccess && - (status = libcubwt_copy_suffix_array(storage, SA, n)) == cudaSuccess && - (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) - { - return LIBCUBWT_NO_ERROR; - } - - return libcubwt_get_error_code(status); -} - int64_t libcubwt_bwt(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n) { LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; @@ -2043,32 +2308,8 @@ int64_t libcubwt_bwt(void * device_storage, const uint8_t * T, uint8_t * L, int6 } cudaError_t status; uint32_t index; - if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && - (status = libcubwt_extract_auxiliary_indexes(storage, n, n, &index)) == cudaSuccess && - (status = libcubwt_compute_burrows_wheeler_transform(storage, T, L, n, index)) == cudaSuccess && - (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) - { - return index; - } - - return libcubwt_get_error_code(status); -} - -int64_t libcubwt_bwt_isa(void * device_storage, const uint8_t * T, uint8_t * L, uint32_t * ISA, int64_t n) -{ - LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; - - if ((storage == NULL) || (T == NULL) || (L == NULL) || (n < 16) || (n > storage->max_length)) - { - return LIBCUBWT_BAD_PARAMETER; - } - - cudaError_t status; uint32_t index; - if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && - (status = libcubwt_copy_inverse_suffix_array(storage, ISA, n)) == cudaSuccess && - (status = libcubwt_extract_auxiliary_indexes(storage, n, n, &index)) == cudaSuccess && - (status = libcubwt_compute_burrows_wheeler_transform(storage, T, L, n, index)) == cudaSuccess && - (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + if ((status = libcubwt_compute_burrows_wheeler_transform(storage, T, n, n, &index)) == cudaSuccess && + (status = libcubwt_copy_burrows_wheeler_transform(storage, T, L, n, index)) == cudaSuccess) { return index; } @@ -2080,16 +2321,14 @@ int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, { LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; - if ((storage == NULL) || (T == NULL) || (L == NULL) || (n < 16) || (n > storage->max_length) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + if ((storage == NULL) || (T == NULL) || (L == NULL) || (n < 16) || (n > storage->max_length) || (r < 4) || ((r & (r - 1)) != 0) || ((n + r - 1) / r > 255) || (I == NULL)) { return LIBCUBWT_BAD_PARAMETER; } cudaError_t status; - if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && - (status = libcubwt_extract_auxiliary_indexes(storage, n, r, I)) == cudaSuccess && - (status = libcubwt_compute_burrows_wheeler_transform(storage, T, L, n, I[0])) == cudaSuccess && - (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + if ((status = libcubwt_compute_burrows_wheeler_transform(storage, T, n, r, I)) == cudaSuccess && + (status = libcubwt_copy_burrows_wheeler_transform(storage, T, L, n, I[0])) == cudaSuccess) { return LIBCUBWT_NO_ERROR; } diff --git a/libbsc/bwt/libcubwt/libcubwt.cuh b/libbsc/bwt/libcubwt/libcubwt.cuh index 6872c5a..f303803 100644 --- a/libbsc/bwt/libcubwt/libcubwt.cuh +++ b/libbsc/bwt/libcubwt/libcubwt.cuh @@ -1,7 +1,7 @@ /*-- This file is a part of libcubwt, a library for CUDA accelerated -suffix array and burrows wheeler transform construction. +burrows wheeler transform construction. Copyright (c) 2022-2023 Ilya Grebnov @@ -25,9 +25,9 @@ Please see the file LICENSE for full copyright and license details. #define LIBCUBWT_CUH 1 #define LIBCUBWT_VERSION_MAJOR 1 -#define LIBCUBWT_VERSION_MINOR 0 +#define LIBCUBWT_VERSION_MINOR 5 #define LIBCUBWT_VERSION_PATCH 0 -#define LIBCUBWT_VERSION_STRING "1.0.0" +#define LIBCUBWT_VERSION_STRING "1.5.0" #define LIBCUBWT_NO_ERROR 0 #define LIBCUBWT_BAD_PARAMETER -1 @@ -57,37 +57,6 @@ extern "C" { */ int64_t libcubwt_free_device_storage(void * device_storage); - /** - * Constructs the suffix array (SA) of a given string. - * @param device_storage The previously allocated storage on the CUDA device. - * @param T [0..n-1] The input string. - * @param SA [0..n-1] The output array of suffixes. - * @param n The length of the input string. - * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. - */ - int64_t libcubwt_sa(void * device_storage, const uint8_t * T, uint32_t * SA, int64_t n); - - /** - * Constructs the inverse suffix array (ISA) of a given string. - * @param device_storage The previously allocated storage on the CUDA device. - * @param T [0..n-1] The input string. - * @param ISA [0..n-1] The output inverse array of suffixes. - * @param n The length of the input string. - * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. - */ - int64_t libcubwt_isa(void * device_storage, const uint8_t * T, uint32_t * ISA, int64_t n); - - /** - * Constructs the suffix array (SA) and inverse suffix array (ISA) of a given string. - * @param device_storage The previously allocated storage on the CUDA device. - * @param T [0..n-1] The input string. - * @param SA [0..n-1] The output array of suffixes. - * @param ISA [0..n-1] The output inverse array of suffixes. - * @param n The length of the input string. - * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. - */ - int64_t libcubwt_sa_isa(void * device_storage, const uint8_t * T, uint32_t * SA, uint32_t * ISA, int64_t n); - /** * Constructs the Burrows-Wheeler Transform (BWT) of a given string. * @param device_storage The previously allocated storage on the CUDA device. @@ -98,17 +67,6 @@ extern "C" { */ int64_t libcubwt_bwt(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n); - /** - * Constructs the Burrows-Wheeler Transform (BWT) and inverse suffix array (ISA) of a given string. - * @param device_storage The previously allocated storage on the CUDA device. - * @param T [0..n-1] The input string. - * @param L [0..n-1] The output string (can be T). - * @param ISA [0..n-1] The output inverse array of suffixes. - * @param n The length of the input string. - * @return The primary index if no error occurred, libcubwt error code otherwise. - */ - int64_t libcubwt_bwt_isa(void * device_storage, const uint8_t * T, uint8_t * L, uint32_t * ISA, int64_t n); - /** * Constructs the Burrows-Wheeler Transform (BWT) of a given string with auxiliary indexes. * @param device_storage The previously allocated storage on the CUDA device. diff --git a/libbsc/libbsc.h b/libbsc/libbsc.h index 38e294f..8959c79 100644 --- a/libbsc/libbsc.h +++ b/libbsc/libbsc.h @@ -35,8 +35,8 @@ See also the bsc and libbsc web site: #define LIBBSC_VERSION_MAJOR 3 #define LIBBSC_VERSION_MINOR 3 -#define LIBBSC_VERSION_PATCH 1 -#define LIBBSC_VERSION_STRING "3.3.1" +#define LIBBSC_VERSION_PATCH 2 +#define LIBBSC_VERSION_STRING "3.3.2" #define LIBBSC_NO_ERROR 0 #define LIBBSC_BAD_PARAMETER -1