From 1259681434bf6c28a229fe69abc035d412447e09 Mon Sep 17 00:00:00 2001
From: Ilya Grebnov
Date: Sat, 11 Feb 2023 20:45:41 -0800
Subject: [PATCH] bsc 3.3.0

---
 CHANGES                          |    4 +
 README                           |    8 +-
 VERSION                          |    2 +-
 bsc.cpp                          |    4 +-
 libbsc/bwt/bwt.cpp               |  139 +-
 libbsc/bwt/bwt.h                 |    7 +
 libbsc/bwt/libcubwt/AUTHORS      |   10 +
 libbsc/bwt/libcubwt/CHANGES      |    3 +
 libbsc/bwt/libcubwt/LICENSE      |  201 +++
 libbsc/bwt/libcubwt/VERSION      |    1 +
 libbsc/bwt/libcubwt/libcubwt.cu  | 2098 ++++++++++++++++++++++++++++++
 libbsc/bwt/libcubwt/libcubwt.cuh |  128 ++
 libbsc/bwt/libsais/CHANGES       |   34 +
 libbsc/bwt/libsais/LICENSE       |  202 +++
 libbsc/libbsc.h                  |    7 +-
 libbsc/libbsc/libbsc.cpp         |    1 +
 libbsc/platform/platform.h       |   12 +-
 libbsc/st/st.cu                  |  196 +--
 makefile                         |    8 +-
 makefile.cuda                    |   29 +-
 20 files changed, 2978 insertions(+), 116 deletions(-)
 create mode 100644 libbsc/bwt/libcubwt/AUTHORS
 create mode 100644 libbsc/bwt/libcubwt/CHANGES
 create mode 100644 libbsc/bwt/libcubwt/LICENSE
 create mode 100644 libbsc/bwt/libcubwt/VERSION
 create mode 100644 libbsc/bwt/libcubwt/libcubwt.cu
 create mode 100644 libbsc/bwt/libcubwt/libcubwt.cuh
 create mode 100644 libbsc/bwt/libsais/CHANGES
 create mode 100644 libbsc/bwt/libsais/LICENSE

diff --git a/CHANGES b/CHANGES
index 9c2b048..1f9563f 100644
--- a/CHANGES
+++ b/CHANGES
@@ -1,3 +1,7 @@
+Changes in 3.3.0 (February 10, 2023)
+- Improved GPU acceleration performance of the forward ST algorithm.
+- Implemented GPU acceleration of the forward Burrows–Wheeler transform.
+
 Changes in 3.2.5 (November 23, 2022)
 - Fixed data corruption issue in LZP encoder.
 - Due to this fix, an upgrade to this version is strongly recommended.

diff --git a/README b/README
index 775a4ab..5c55b3f 100644
--- a/README
+++ b/README
@@ -10,7 +10,7 @@ block-sorting data compression algorithms.
 libbsc is a library based on bsc; it uses the same algorithms
 as bsc and enables you to compress memory blocks.
 
-Copyright (c) 2009-2022 Ilya Grebnov
+Copyright (c) 2009-2023 Ilya Grebnov
 
 See file AUTHORS for a full list of contributors.
 
@@ -21,7 +21,7 @@ See the bsc and libbsc web site:
 Software License:
 -----------------
 
-Copyright (c) 2009-2022 Ilya Grebnov
+Copyright (c) 2009-2023 Ilya Grebnov
 
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
@@ -54,14 +54,14 @@ Compression and decompression requirements are the same and, in bytes,
 can be estimated as 16Mb + 5 x block size x number of blocks processed
 in parallel.
 
 GPU memory usage for NVIDIA CUDA technology is different from CPU memory usage
-and can be estimated as 20 x block size.
+and can be estimated as 20 x block size for ST and 32 x block size for BWT.
 
 NVIDIA GPU acceleration:
 ------------------------
 
 1. libbsc uses NVIDIA CUDA technology, resulting in a performance boost on computers
-with NVIDIA GPU of compute capability 3.5 or higher. Lists of supported GPUs
+with NVIDIA GPU of compute capability 5.0 or higher. Lists of supported GPUs
 can be found on the NVIDIA website http://developer.nvidia.com/cuda-gpus.
 You also need to install the latest graphics drivers that support CUDA.
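+
+As a rough worked example (the numbers here are illustrative, not from the
+original README): a 25MB block with 4 blocks processed in parallel needs
+about 16Mb + 5 x 25MB x 4 = 516MB of CPU memory, while the GPU needs about
+20 x 25MB = 500MB for ST and 32 x 25MB = 800MB for BWT.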
diff --git a/VERSION b/VERSION
index 448ada3..0fa4ae4 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-3.2.5
\ No newline at end of file
+3.3.0
\ No newline at end of file
diff --git a/bsc.cpp b/bsc.cpp
index 250b6f0..2c698d7 100644
--- a/bsc.cpp
+++ b/bsc.cpp
@@ -851,8 +851,8 @@ void ProcessCommandline(int argc, char * argv[])
 
 int main(int argc, char * argv[])
 {
-    fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.2.5. 23 November 2022.\n");
-    fprintf(stdout, "Copyright (c) 2009-2022 Ilya Grebnov.\n\n");
+    fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.0. 10 February 2023.\n");
+    fprintf(stdout, "Copyright (c) 2009-2023 Ilya Grebnov.\n\n");
 
 #if defined(_OPENMP) && defined(__INTEL_COMPILER)
 
diff --git a/libbsc/bwt/bwt.cpp b/libbsc/bwt/bwt.cpp
index 94c5640..b4b44e9 100644
--- a/libbsc/bwt/bwt.cpp
+++ b/libbsc/bwt/bwt.cpp
@@ -38,14 +38,149 @@ See also the bsc and libbsc web site:
 #include "../platform/platform.h"
 #include "../libbsc.h"
 
+#include "libcubwt/libcubwt.cuh"
 #include "libsais/libsais.h"
 
+#if defined(LIBBSC_CUDA_SUPPORT) && defined(LIBBSC_OPENMP)
+
+omp_lock_t bwt_cuda_lock;
+void * bwt_cuda_device_storage = NULL;
+int bwt_cuda_device_storage_size = 0;
+
+int bsc_bwt_init(int features)
+{
+    if (features & LIBBSC_FEATURE_CUDA)
+    {
+        omp_init_lock(&bwt_cuda_lock);
+    }
+
+    return LIBBSC_NO_ERROR;
+}
+
+#else
+
+int bsc_bwt_init(int features)
+{
+    return LIBBSC_NO_ERROR;
+}
+
+#endif
+
+int bsc_bwt_gpu_encode(unsigned char * T, int n, unsigned char * num_indexes, int * indexes, int features)
+{
+    int index = -1;
+
+    if (features & LIBBSC_FEATURE_CUDA)
+    {
+#ifdef LIBBSC_CUDA_SUPPORT
+        if (num_indexes != NULL && indexes != NULL)
+        {
+            int I[256];
+
+            int mod = n / 8;
+            {
+                mod |= mod >> 1;  mod |= mod >> 2;
+                mod |= mod >> 4;  mod |= mod >> 8;
+                mod |= mod >> 16; mod >>= 1;
+            }
+
+#ifdef LIBBSC_OPENMP
+            omp_set_lock(&bwt_cuda_lock);
+
+            if (bwt_cuda_device_storage_size < n)
+            {
+                if (bwt_cuda_device_storage != NULL)
+                {
+                    libcubwt_free_device_storage(bwt_cuda_device_storage);
+
+                    bwt_cuda_device_storage      = NULL;
+                    bwt_cuda_device_storage_size = 0;
+                }
+
+                if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, n + (n / 32)) == LIBCUBWT_NO_ERROR)
+                {
+                    bwt_cuda_device_storage_size = n + (n / 32);
+                }
+            }
+
+            if (bwt_cuda_device_storage_size >= n)
+            {
+                index = (int)libcubwt_bwt_aux(bwt_cuda_device_storage, T, T, n, mod + 1, (unsigned int *)I);
+            }
+
+            omp_unset_lock(&bwt_cuda_lock);
+#else
+            void * bwt_cuda_device_storage = NULL;
+
+            if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, n) == LIBCUBWT_NO_ERROR)
+            {
+                index = (int)libcubwt_bwt_aux(bwt_cuda_device_storage, T, T, n, mod + 1, (unsigned int *)I);
+
+                libcubwt_free_device_storage(bwt_cuda_device_storage);
+            }
+#endif
+
+            if (index == 0)
+            {
+                num_indexes[0] = (unsigned char)((n - 1) / (mod + 1));
+                index = I[0]; for (int t = 0; t < num_indexes[0]; ++t) indexes[t] = I[t + 1] - 1;
+            }
+        }
+        else
+        {
+#ifdef LIBBSC_OPENMP
+            omp_set_lock(&bwt_cuda_lock);
+
+            if (bwt_cuda_device_storage_size < n)
+            {
+                if (bwt_cuda_device_storage != NULL)
+                {
+                    libcubwt_free_device_storage(bwt_cuda_device_storage);
+
+                    bwt_cuda_device_storage      = NULL;
+                    bwt_cuda_device_storage_size = 0;
+                }
+
+                if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, n + (n / 32)) == LIBCUBWT_NO_ERROR)
+                {
+                    bwt_cuda_device_storage_size = n + (n / 32);
+                }
+            }
+
+            if (bwt_cuda_device_storage_size >= n)
+            {
+                index = (int)libcubwt_bwt(bwt_cuda_device_storage, T, T, n);
+            }
+
+            omp_unset_lock(&bwt_cuda_lock);
+#else
+            void * bwt_cuda_device_storage = NULL;
+
+            if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, n) == LIBCUBWT_NO_ERROR)
+            {
+                index = (int)libcubwt_bwt(bwt_cuda_device_storage, T, T, n);
+
+                libcubwt_free_device_storage(bwt_cuda_device_storage);
+            }
+#endif
+        }
+#endif
+    }
+
+    return index;
+}
+
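+/*
+ * The bit cascade above rounds n / 8 down so that mod + 1 is the largest
+ * power of two not exceeding n / 8. Worked example (values illustrative):
+ * for n = 100, n / 8 = 12 (binary 1100); the or-shifts fill the low bits
+ * giving 15 (1111) and the final shift gives mod = 7, so libcubwt_bwt_aux
+ * records a rank every mod + 1 = 8 positions and num_indexes[0] becomes
+ * (n - 1) / 8 = 12 recovery points in indexes[].
+ */
+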
 int bsc_bwt_encode(unsigned char * T, int n, unsigned char * num_indexes, int * indexes, int features)
 {
-    if (int * RESTRICT A = (int *)bsc_malloc(n * sizeof(int)))
+    int index = bsc_bwt_gpu_encode(T, n, num_indexes, indexes, features);
+    if (index >= 0)
     {
-        int index;
+        return index;
+    }
 
+    if (int * RESTRICT A = (int *)bsc_malloc(n * sizeof(int)))
+    {
         if (num_indexes != NULL && indexes != NULL)
         {
             int I[256];
diff --git a/libbsc/bwt/bwt.h b/libbsc/bwt/bwt.h
index 6b9bb7b..cddbbaf 100644
--- a/libbsc/bwt/bwt.h
+++ b/libbsc/bwt/bwt.h
@@ -37,6 +37,13 @@ See also the bsc and libbsc web site:
 extern "C" {
 #endif
 
+    /**
+    * You should call this function before you call any of the other functions in bwt.
+    * @param features - the set of additional features.
+    * @return LIBBSC_NO_ERROR if no error occurred, error code otherwise.
+    */
+    int bsc_bwt_init(int features);
+
     /**
     * Constructs the Burrows-Wheeler transformed string of a given string.
     * @param T - the input/output string of n chars.
diff --git a/libbsc/bwt/libcubwt/AUTHORS b/libbsc/bwt/libcubwt/AUTHORS
new file mode 100644
index 0000000..56fe850
--- /dev/null
+++ b/libbsc/bwt/libcubwt/AUTHORS
@@ -0,0 +1,10 @@
+-- Authors of libcubwt
+
+   Ilya Grebnov
+
+-- This program is based on (at least) the work of
+
+   Leyuan Wang, Sean Baxter, John D. Owens, Yury Shukhrov,
+   Rory Mitchell, Jacopo Pantaleoni, Duane Merrill,
+   Georgy Evtushenko, Allison Vacanti, Robert Crovella,
+   Mark Harris.
diff --git a/libbsc/bwt/libcubwt/CHANGES b/libbsc/bwt/libcubwt/CHANGES
new file mode 100644
index 0000000..d00efa8
--- /dev/null
+++ b/libbsc/bwt/libcubwt/CHANGES
@@ -0,0 +1,3 @@
+Changes in 1.0.0 (February 10, 2023)
+- Initial public release of libcubwt.
+
diff --git a/libbsc/bwt/libcubwt/LICENSE b/libbsc/bwt/libcubwt/LICENSE
new file mode 100644
index 0000000..261eeb9
--- /dev/null
+++ b/libbsc/bwt/libcubwt/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+ + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/libbsc/bwt/libcubwt/VERSION b/libbsc/bwt/libcubwt/VERSION new file mode 100644 index 0000000..afaf360 --- /dev/null +++ b/libbsc/bwt/libcubwt/VERSION @@ -0,0 +1 @@ +1.0.0 \ No newline at end of file diff --git a/libbsc/bwt/libcubwt/libcubwt.cu b/libbsc/bwt/libcubwt/libcubwt.cu new file mode 100644 index 0000000..9440f9a --- /dev/null +++ b/libbsc/bwt/libcubwt/libcubwt.cu @@ -0,0 +1,2098 @@ +/*-- + +This file is a part of libcubwt, a library for CUDA accelerated +suffix array and burrows wheeler transform construction. + + Copyright (c) 2022-2023 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. 
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
+
+Please see the file LICENSE for full copyright and license details.
+
+--*/
+
+#include "libcubwt.cuh"
+
+#if defined(_MSC_VER) && defined(__INTELLISENSE__)
+    #define __launch_bounds__(block_size) /* */
+    #define __CUDACC__
+
+    #include <vector_functions.h>
+    #include <cuda_runtime.h>
+    #include <device_launch_parameters.h>
+#endif
+
+#include <stdio.h>
+#include <algorithm>
+
+#include <cub/cub.cuh>
+
+#if defined(__GNUC__) || defined(__clang__) || defined(__CUDACC__)
+    #define RESTRICT __restrict__
+#elif defined(_MSC_VER) || defined(__INTEL_COMPILER)
+    #define RESTRICT __restrict
+#else
+    #define RESTRICT /* */
+#endif
+
+#ifndef __CUDA_ARCH__
+    #define CUDA_DEVICE_ARCH 0
+#else
+    #define CUDA_DEVICE_ARCH __CUDA_ARCH__
+#endif
+
+#if CUDA_DEVICE_ARCH == 750
+    #define CUDA_SM_THREADS 1024
+#elif CUDA_DEVICE_ARCH == 860 || CUDA_DEVICE_ARCH == 870 || CUDA_DEVICE_ARCH == 890
+    #define CUDA_SM_THREADS 1536
+#else
+    #define CUDA_SM_THREADS 2048
+#endif
+
+#define CUDA_WARP_THREADS (32)
+#define CUDA_BLOCK_THREADS (512)
+#define CUDA_DEVICE_PADDING (16 * CUDA_BLOCK_THREADS)
+
+typedef struct LIBCUBWT_DEVICE_STORAGE
+{
+    void * device_alloc1;
+    void * device_alloc2;
+    void * device_alloc3;
+    void * device_alloc4;
+    void * device_alloc5;
+    void * device_alloc6;
+    void * device_alloc7;
+
+    void * device_rsort_temp_storage;
+    size_t device_rsort_temp_storage_size;
+
+    void * device_ssort_temp_storage;
+    size_t device_ssort_temp_storage_size;
+
+    uint8_t * device_T;
+    uint8_t * device_heads;
+
+    uint32_t * device_SA;
+    uint32_t * device_ISA;
+
+    uint32_t * device_keys;
+    uint32_t * device_offsets;
+
+    uint4 * device_descriptors_large;
+    uint4 * device_descriptors_copy;
+    uint2 * device_descriptors_small;
+
+    uint32_t * device_temp_SA;
+    uint32_t * device_temp_ISA;
+    uint32_t * device_temp_keys;
+
+    uint64_t * device_keys_temp_keys;
+    uint64_t * device_offsets_ISA;
+
+    void * host_pinned_storage;
+    size_t host_pinned_storage_size;
+
+    int64_t max_length;
+    uint32_t num_unsorted_segments;
+    uint32_t num_unsorted_suffixes;
+    bool is_partial_suffix_array;
+
+    int32_t device_L2_cache_bits;
+
+    cudaStream_t cuda_stream;
+} LIBCUBWT_DEVICE_STORAGE;
+
+static int64_t libcubwt_get_error_code(cudaError_t status)
+{
+    return
+        status == cudaErrorMemoryAllocation   ? LIBCUBWT_GPU_NOT_ENOUGH_MEMORY :
+        status == cudaErrorDevicesUnavailable ? LIBCUBWT_GPU_NOT_SUPPORTED :
+        status == cudaErrorNoDevice           ? LIBCUBWT_GPU_NOT_SUPPORTED :
+        LIBCUBWT_GPU_ERROR;
+}
+
+static cudaError_t libcubwt_cuda_safe_call(const char * filename, int32_t line, cudaError_t result, cudaError_t status = cudaSuccess)
+{
+#if !defined(NDEBUG)
+    if (result != cudaSuccess)
+    {
+        fprintf(stderr, "%s(%d): libcubwt_cuda_safe_call failed %d: '%s'.\n", filename, line, result, cudaGetErrorString(result));
+        fflush(stderr);
+    }
+#else
+    (void)(filename); (void)(line);
+#endif
+
+    return result != cudaSuccess ? result : status;
+}
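+
+/*
+ * The warp reductions below use an XOR-shuffle butterfly: with masks 16, 8,
+ * 4, 2 and 1 each lane exchanges its partial value with the lane whose index
+ * differs in that bit, so after log2(32) = 5 exchanges every lane holds the
+ * full sum or maximum. On sm_80 and later a single hardware instruction
+ * (__reduce_add_sync / __reduce_max_sync) replaces the loop.
+ */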
+
+template <typename T>
+__device__ __forceinline__ T libcubwt_warp_reduce_sum(T value)
+{
+#if CUDA_DEVICE_ARCH >= 800
+    return __reduce_add_sync((uint32_t)-1, value);
+#else
+
+    #pragma unroll
+    for (uint32_t mask = CUDA_WARP_THREADS / 2; mask > 0; mask >>= 1)
+    {
+        value = cub::Sum()(value, __shfl_xor_sync((uint32_t)-1, value, mask, CUDA_WARP_THREADS));
+    }
+
+    return value;
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ T libcubwt_warp_reduce_max(T value)
+{
+#if CUDA_DEVICE_ARCH >= 800
+    return __reduce_max_sync((uint32_t)-1, value);
+#else
+
+    #pragma unroll
+    for (uint32_t mask = CUDA_WARP_THREADS / 2; mask > 0; mask >>= 1)
+    {
+        value = cub::Max()(value, __shfl_xor_sync((uint32_t)-1, value, mask, CUDA_WARP_THREADS));
+    }
+
+    return value;
+#endif
+}
+
+template <typename T>
+__device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay)
+{
+#if CUDA_DEVICE_ARCH >= 700
+    __nanosleep(delay);
+#else
+    __threadfence_block(); (void)(delay);
+#endif
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_gather_values_uint32_kernel(const uint32_t * device_idx, const uint32_t * RESTRICT device_src, uint32_t * device_dst, uint32_t m)
+{
+    const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 4;
+
+    device_idx += block_index; device_dst += block_index; m -= block_index;
+
+    if (m >= CUDA_BLOCK_THREADS * 4)
+    {
+        const uint4 indexes = *(uint4 *)(device_idx + threadIdx.x * 4);
+
+        *(uint4 *)(device_dst + threadIdx.x * 4) = make_uint4(
+            __ldg(device_src + indexes.x),
+            __ldg(device_src + indexes.y),
+            __ldg(device_src + indexes.z),
+            __ldg(device_src + indexes.w));
+    }
+    else
+    {
+        for (uint32_t thread_index = threadIdx.x; thread_index < m; thread_index += CUDA_BLOCK_THREADS)
+        {
+            device_dst[thread_index] = __ldg(device_src + device_idx[thread_index]);
+        }
+    }
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_scatter_values_uint32_kernel(const uint32_t * RESTRICT device_idx, const uint32_t * RESTRICT device_src, uint32_t * RESTRICT device_dst, uint32_t m)
+{
+    const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 4;
+
+    device_idx += block_index; device_src += block_index; m -= block_index;
+
+    if (m >= CUDA_BLOCK_THREADS * 4)
+    {
+        const uint4 indexes = __ldg((uint4 *)(device_idx + threadIdx.x * 4));
+        const uint4 values  = __ldg((uint4 *)(device_src + threadIdx.x * 4));
+
+        device_dst[indexes.x] = values.x;
+        device_dst[indexes.y] = values.y;
+        device_dst[indexes.z] = values.z;
+        device_dst[indexes.w] = values.w;
+    }
+    else
+    {
+        for (uint32_t thread_index = threadIdx.x; thread_index < m; thread_index += CUDA_BLOCK_THREADS)
+        {
+            device_dst[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index);
+        }
+    }
+}
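+
+/*
+ * The uint8_t kernels below are templated on source_is_aligned because the
+ * uchar4 vector load __ldg((uchar4 *)p) is only valid when p is 4-byte
+ * aligned; callers such as libcubwt_permute_values_uint8 test the current
+ * buffer address at runtime and launch the vectorized or per-byte variant
+ * accordingly.
+ */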
+
+template <bool source_is_aligned>
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_scatter_values_uint8_kernel(const uint32_t * RESTRICT device_idx, const uint8_t * RESTRICT device_src, uint8_t * RESTRICT device_dst, uint32_t m)
+{
+    const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 4;
+
+    device_idx += block_index; device_src += block_index; m -= block_index;
+
+    if (m >= CUDA_BLOCK_THREADS * 4)
+    {
+        const uint4 indexes = __ldg((uint4 *)(device_idx + threadIdx.x * 4));
+
+        if (source_is_aligned)
+        {
+            const uchar4 values = __ldg((uchar4 *)(device_src + threadIdx.x * 4));
+
+            device_dst[indexes.x] = values.x;
+            device_dst[indexes.y] = values.y;
+            device_dst[indexes.z] = values.z;
+            device_dst[indexes.w] = values.w;
+        }
+        else
+        {
+            device_dst[indexes.x] = __ldg(device_src + threadIdx.x * 4 + 0);
+            device_dst[indexes.y] = __ldg(device_src + threadIdx.x * 4 + 1);
+            device_dst[indexes.z] = __ldg(device_src + threadIdx.x * 4 + 2);
+            device_dst[indexes.w] = __ldg(device_src + threadIdx.x * 4 + 3);
+        }
+    }
+    else
+    {
+        for (uint32_t thread_index = threadIdx.x; thread_index < m; thread_index += CUDA_BLOCK_THREADS)
+        {
+            device_dst[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index);
+        }
+    }
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_permute_block_values_uint32_kernel(const uint32_t * RESTRICT device_idx, const uint32_t * RESTRICT device_src, uint32_t * RESTRICT device_dst, uint32_t n)
+{
+    __shared__ __align__(32) uint32_t cache[16 * CUDA_BLOCK_THREADS];
+
+    const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 16;
+
+    device_idx += block_index; device_src += block_index; device_dst += block_index; n -= block_index;
+
+    if (n >= CUDA_BLOCK_THREADS * 16)
+    {
+        {
+            const uint32_t * RESTRICT thread_idx   = device_idx + threadIdx.x * 4;
+            const uint32_t * RESTRICT thread_src   = device_src + threadIdx.x * 4;
+            uint32_t * RESTRICT       thread_cache = cache - block_index;
+
+            #pragma unroll
+            for (uint32_t round = 0; round < 4; round += 1)
+            {
+                const uint4 indexes = __ldg((uint4 *)(thread_idx));
+                const uint4 values  = __ldg((uint4 *)(thread_src));
+
+                thread_cache[indexes.x] = values.x;
+                thread_cache[indexes.y] = values.y;
+                thread_cache[indexes.z] = values.z;
+                thread_cache[indexes.w] = values.w;
+
+                thread_idx += 4 * CUDA_BLOCK_THREADS; thread_src += 4 * CUDA_BLOCK_THREADS;
+            }
+        }
+
+        __syncthreads();
+
+        {
+            const uint32_t * RESTRICT thread_cache = cache + threadIdx.x * 4;
+            uint32_t * RESTRICT       thread_dst   = device_dst + threadIdx.x * 4;
+
+            #pragma unroll
+            for (uint32_t round = 0; round < 4; round += 1)
+            {
+                *(uint4 *)(thread_dst) = *(uint4 *)(thread_cache);
+
+                thread_cache += 4 * CUDA_BLOCK_THREADS; thread_dst += 4 * CUDA_BLOCK_THREADS;
+            }
+        }
+    }
+    else
+    {
+        {
+            uint32_t * RESTRICT thread_cache = cache - block_index;
+
+            for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS)
+            {
+                thread_cache[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index);
+            }
+        }
+
+        __syncthreads();
+
+        {
+            for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS)
+            {
+                device_dst[thread_index] = cache[thread_index];
+            }
+        }
+    }
+}
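+
+/*
+ * Both block-permute kernels (the uint32_t one above and the uint8_t one
+ * below) stage values through shared memory, indexing the cache as
+ * (destination index - block_index). This relies on the caller's partial
+ * radix sort having already grouped the indexes so that every destination
+ * inside a tile falls within that tile; the shared-memory pass then settles
+ * the remaining low bits and the writeback is fully coalesced.
+ */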
+
+template <bool source_is_aligned>
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_permute_block_values_uint8_kernel(const uint32_t * RESTRICT device_idx, const uint8_t * RESTRICT device_src, uint8_t * RESTRICT device_dst, uint32_t n)
+{
+    __shared__ __align__(32) uint8_t cache[64 * CUDA_BLOCK_THREADS];
+
+    const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 64;
+
+    device_idx += block_index; device_src += block_index; device_dst += block_index; n -= block_index;
+
+    if (n >= CUDA_BLOCK_THREADS * 64)
+    {
+        {
+            const uint32_t * RESTRICT thread_idx   = device_idx + threadIdx.x * 4;
+            const uint8_t * RESTRICT  thread_src   = device_src + threadIdx.x * 4;
+            uint8_t * RESTRICT        thread_cache = cache - block_index;
+
+            #pragma unroll
+            for (uint32_t round = 0; round < 16; round += 1)
+            {
+                const uint4 indexes = __ldg((uint4 *)(thread_idx));
+
+                if (source_is_aligned)
+                {
+                    const uchar4 values = __ldg((uchar4 *)(thread_src));
+
+                    thread_cache[indexes.x] = values.x;
+                    thread_cache[indexes.y] = values.y;
+                    thread_cache[indexes.z] = values.z;
+                    thread_cache[indexes.w] = values.w;
+                }
+                else
+                {
+                    thread_cache[indexes.x] = __ldg(thread_src + 0);
+                    thread_cache[indexes.y] = __ldg(thread_src + 1);
+                    thread_cache[indexes.z] = __ldg(thread_src + 2);
+                    thread_cache[indexes.w] = __ldg(thread_src + 3);
+                }
+
+                thread_idx += 4 * CUDA_BLOCK_THREADS; thread_src += 4 * CUDA_BLOCK_THREADS;
+            }
+        }
+
+        __syncthreads();
+
+        {
+            const uint8_t * RESTRICT thread_cache = cache + threadIdx.x * 4;
+            uint8_t * RESTRICT       thread_dst   = device_dst + threadIdx.x * 4;
+
+            #pragma unroll
+            for (uint32_t round = 0; round < 16; round += 1)
+            {
+                *(uchar4 *)(thread_dst) = *(uchar4 *)(thread_cache);
+
+                thread_cache += 4 * CUDA_BLOCK_THREADS; thread_dst += 4 * CUDA_BLOCK_THREADS;
+            }
+        }
+    }
+    else
+    {
+        {
+            uint8_t * RESTRICT thread_cache = cache - block_index;
+
+            for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS)
+            {
+                thread_cache[__ldg(device_idx + thread_index)] = __ldg(device_src + thread_index);
+            }
+        }
+
+        __syncthreads();
+
+        {
+            for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS)
+            {
+                device_dst[thread_index] = cache[thread_index];
+            }
+        }
+    }
+}
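+
+/*
+ * The host helpers below only radix-sort the upper key bits: sort_end_bit
+ * is ceil(log2(n)), and when the untouched low bits would still scatter
+ * past roughly a quarter of the L2 cache, sort_aligned_bits rounds the
+ * excess up to a whole 8-bit radix pass. Worked example (illustrative, and
+ * assuming device_L2_cache_bits stores log2 of the L2 size in bytes): for
+ * n = 2^28 and a 16MB L2 (24 bits), sort_aligned_bits =
+ * (28 - 24 + 2 + 7) & -8 = 8, so only bits [20, 28) are sorted and every
+ * subsequent scatter stays inside a 2^20-element (4MB) window.
+ */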
+
+static cudaError_t libcubwt_gather_scatter_values_uint32(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * device_src_idx, uint32_t * device_src, uint32_t * device_dst_idx, uint32_t * device_dst, int64_t m, int64_t n, uint32_t * device_temp1, uint32_t * device_temp2)
+{
+    cudaError_t status = cudaSuccess;
+
+    cub::DoubleBuffer<uint32_t> db_src_index_value(device_src_idx, device_temp1);
+    cub::DoubleBuffer<uint32_t> db_dst_index(device_dst_idx, device_temp2);
+
+    int32_t sort_end_bit      = 0; while ((n - 1) >= ((int64_t)1 << sort_end_bit)) { sort_end_bit += 1; }
+    int32_t sort_aligned_bits = (sort_end_bit > storage->device_L2_cache_bits - 2) ? (sort_end_bit - storage->device_L2_cache_bits + 2 + 7) & (-8) : 0;
+    int32_t sort_start_bit    = std::max(0, sort_end_bit - sort_aligned_bits);
+
+    if (sort_start_bit < sort_end_bit)
+    {
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(
+            storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size,
+            db_src_index_value, db_dst_index,
+            (uint32_t)m,
+            sort_start_bit, sort_end_bit,
+            storage->cuda_stream));
+    }
+
+    if (status == cudaSuccess)
+    {
+        int64_t n_gather_scatter_blocks = (m + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4);
+
+        libcubwt_gather_values_uint32_kernel<<<(uint32_t)n_gather_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_src_index_value.Current(), device_src, db_src_index_value.Current(), (uint32_t)m);
+
+        if (sort_start_bit < sort_end_bit)
+        {
+            status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(
+                storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size,
+                db_dst_index, db_src_index_value,
+                (uint32_t)m,
+                sort_start_bit, sort_end_bit,
+                storage->cuda_stream));
+        }
+
+        if (status == cudaSuccess)
+        {
+            libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_gather_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_dst_index.Current(), db_src_index_value.Current(), device_dst, (uint32_t)m);
+        }
+    }
+
+    return status;
+}
+
+static cudaError_t libcubwt_scatter_values_uint32(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * device_idx, uint32_t * device_src, uint32_t * device_dst, int64_t m, int64_t n, uint32_t * device_temp1, uint32_t * device_temp2)
+{
+    cudaError_t status = cudaSuccess;
+
+    cub::DoubleBuffer<uint32_t> db_index(device_idx, device_temp1);
+    cub::DoubleBuffer<uint32_t> db_value(device_src, device_temp2);
+
+    int32_t sort_end_bit      = 0; while ((n - 1) >= ((int64_t)1 << sort_end_bit)) { sort_end_bit += 1; }
+    int32_t sort_aligned_bits = (sort_end_bit > storage->device_L2_cache_bits - 2) ? (sort_end_bit - storage->device_L2_cache_bits + 2 + 7) & (-8) : 0;
+    int32_t sort_start_bit    = std::max(0, sort_end_bit - sort_aligned_bits);
+
+    if (sort_start_bit < sort_end_bit)
+    {
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(
+            storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size,
+            db_index, db_value,
+            (uint32_t)m,
+            sort_start_bit, sort_end_bit,
+            storage->cuda_stream));
+    }
+
+    if (status == cudaSuccess)
+    {
+        int64_t n_scatter_blocks = (m + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4);
+
+        libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)m);
+    }
+
+    return status;
+}
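+
+/*
+ * cub::DoubleBuffer ping-pongs between its two attached arrays: after each
+ * DeviceRadixSort::SortPairs call the buffer's Current() pointer names
+ * whichever array holds the sorted data, which is why the launches above
+ * and below always pass db_index.Current() / db_value.Current() rather
+ * than the original pointers.
+ */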
+
+static cudaError_t libcubwt_permute_values_uint32(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * device_idx, uint32_t * device_src, uint32_t * device_dst, int64_t n, uint32_t * device_temp1, uint32_t * device_temp2)
+{
+    cudaError_t status = cudaSuccess;
+
+    cub::DoubleBuffer<uint32_t> db_index(device_idx, device_temp1);
+    cub::DoubleBuffer<uint32_t> db_value(device_src, device_temp2);
+
+    int32_t sort_end_bit      = 0; while ((n - 1) >= ((int64_t)1 << sort_end_bit)) { sort_end_bit += 1; }
+    int32_t sort_aligned_bits = (sort_end_bit > storage->device_L2_cache_bits - 2) ? (sort_end_bit - storage->device_L2_cache_bits + 2 + 7) & (-8) : 0;
+    int32_t sort_start_bit    = std::max(0, sort_end_bit - sort_aligned_bits);
+
+    if (sort_start_bit < sort_end_bit)
+    {
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(
+            storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size,
+            db_index, db_value,
+            (uint32_t)n,
+            sort_start_bit, sort_end_bit,
+            storage->cuda_stream));
+    }
+
+    if (status == cudaSuccess)
+    {
+        if (((int64_t)1 << sort_start_bit) <= CUDA_BLOCK_THREADS * 16)
+        {
+            int64_t n_permute_blocks = (n + CUDA_BLOCK_THREADS * 16 - 1) / (CUDA_BLOCK_THREADS * 16);
+
+            libcubwt_permute_block_values_uint32_kernel<<<(uint32_t)n_permute_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n);
+        }
+        else
+        {
+            int64_t n_scatter_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4);
+
+            libcubwt_scatter_values_uint32_kernel<<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n);
+        }
+    }
+
+    return status;
+}
+
+static cudaError_t libcubwt_permute_values_uint8(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * device_idx, uint8_t * device_src, uint8_t * device_dst, int64_t n, uint32_t * device_temp1, uint8_t * device_temp2)
+{
+    cudaError_t status = cudaSuccess;
+
+    cub::DoubleBuffer<uint32_t> db_index(device_idx, device_temp1);
+    cub::DoubleBuffer<uint8_t> db_value(device_src, device_temp2);
+
+    int32_t sort_end_bit      = 0; while ((n - 1) >= ((int64_t)1 << sort_end_bit)) { sort_end_bit += 1; }
+    int32_t sort_aligned_bits = (sort_end_bit > storage->device_L2_cache_bits) ? (sort_end_bit - storage->device_L2_cache_bits + 7) & (-8) : 0;
+    int32_t sort_start_bit    = std::max(0, sort_end_bit - sort_aligned_bits);
+
+    if (sort_start_bit < sort_end_bit)
+    {
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(
+            storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size,
+            db_index, db_value,
+            (uint32_t)n,
+            sort_start_bit, sort_end_bit,
+            storage->cuda_stream));
+    }
+
+    if (status == cudaSuccess)
+    {
+        if (((int64_t)1 << sort_start_bit) <= CUDA_BLOCK_THREADS * 64)
+        {
+            int64_t n_permute_blocks = (n + CUDA_BLOCK_THREADS * 64 - 1) / (CUDA_BLOCK_THREADS * 64);
+
+            if (((uintptr_t)db_value.Current()) % 4 == 0)
+            {
+                libcubwt_permute_block_values_uint8_kernel<true><<<(uint32_t)n_permute_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n);
+            }
+            else
+            {
+                libcubwt_permute_block_values_uint8_kernel<false><<<(uint32_t)n_permute_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n);
+            }
+        }
+        else
+        {
+            int64_t n_scatter_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4);
+
+            if (((uintptr_t)db_value.Current()) % 4 == 0)
+            {
+                libcubwt_scatter_values_uint8_kernel<true><<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n);
+            }
+            else
+            {
+                libcubwt_scatter_values_uint8_kernel<false><<<(uint32_t)n_scatter_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(db_index.Current(), db_value.Current(), device_dst, (uint32_t)n);
+            }
+        }
+    }
+
+    return status;
+}
+
+template <bool extra_sentinel_bits>
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_initialize_device_arrays_kernel(const uint8_t * RESTRICT
device_T, uint32_t * RESTRICT device_SA, uint64_t * RESTRICT device_keys) +{ + const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 8; + + { + uint32_t thread_index = block_index + threadIdx.x * 4; device_SA += thread_index; + ((uint4 *)device_SA)[0] = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); + + thread_index += CUDA_BLOCK_THREADS * 4; device_SA += CUDA_BLOCK_THREADS * 4; + ((uint4 *)device_SA)[0] = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3); + } + + __shared__ __align__(32) uint4 prefixes[4 * CUDA_BLOCK_THREADS]; + + { + device_T += block_index + threadIdx.x * 8; + + const uint2 current = __ldg((uint2 *)(device_T + 0)); + const uint2 next = __ldg((uint2 *)(device_T + 8)); + + { + uint4 * RESTRICT thread_prefixes = prefixes + threadIdx.x * 4; + + thread_prefixes[0] = make_uint4 + ( + __byte_perm(current.y, next.x, 0x0123) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x0123), + __byte_perm(current.y, next.x, 0x1234) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x1234) + ); + + thread_prefixes[1] = make_uint4 + ( + __byte_perm(current.y, next.x, 0x2345) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x2345), + __byte_perm(current.y, next.x, 0x3456) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.x, current.y, 0x3456) + ); + + thread_prefixes[2] = make_uint4 + ( + __byte_perm(next.x, next.y, 0x0123) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x0123), + __byte_perm(next.x, next.y, 0x1234) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x1234) + ); + + thread_prefixes[3] = make_uint4 + ( + __byte_perm(next.x, next.y, 0x2345) | (extra_sentinel_bits ? (uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x2345), + __byte_perm(next.x, next.y, 0x3456) | (extra_sentinel_bits ? 
(uint32_t)7 : (uint32_t)1), __byte_perm(current.y, next.x, 0x3456)
+            );
+        }
+    }
+
+    __syncwarp();
+
+    {
+        device_keys += block_index;
+
+        uint4 * RESTRICT thread_prefixes = (uint4 *)prefixes    + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS);
+        uint4 * RESTRICT thread_keys     = (uint4 *)device_keys + ((threadIdx.x / CUDA_WARP_THREADS) * CUDA_WARP_THREADS * 4) + (threadIdx.x % CUDA_WARP_THREADS);
+
+        thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS;
+        thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS;
+        thread_keys[0] = thread_prefixes[0]; thread_keys += CUDA_WARP_THREADS; thread_prefixes += CUDA_WARP_THREADS;
+        thread_keys[0] = thread_prefixes[0];
+    }
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, 1)
+static void libcubwt_set_sentinel_keys_kernel(uint64_t * RESTRICT device_keys_end, uint64_t s0, uint64_t s1, uint64_t s2, uint64_t s3, uint64_t s4, uint64_t s5, uint64_t s6, uint64_t s7)
+{
+    device_keys_end[-8] = s0;
+    device_keys_end[-7] = s1;
+    device_keys_end[-6] = s2;
+    device_keys_end[-5] = s3;
+    device_keys_end[-4] = s4;
+    device_keys_end[-3] = s5;
+    device_keys_end[-2] = s6;
+    device_keys_end[-1] = s7;
+}
+
+static cudaError_t libcubwt_initialize_device_arrays(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, int64_t n)
+{
+    cudaError_t status = cudaSuccess;
+
+    if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->device_T, T, (size_t)n, cudaMemcpyHostToDevice, storage->cuda_stream))) == cudaSuccess)
+    {
+        int64_t n_initialize_blocks = 1 + (n / (CUDA_BLOCK_THREADS * 8));
+
+        bool extra_sentinel_bits = (T[n - 2] == 0) && (T[n - 1] == 0);
+        if (extra_sentinel_bits)
+        {
+            libcubwt_initialize_device_arrays_kernel<true><<<(uint32_t)n_initialize_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_T, storage->device_SA, storage->device_keys_temp_keys);
+        }
+        else
+        {
+            libcubwt_initialize_device_arrays_kernel<false><<<(uint32_t)n_initialize_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_T, storage->device_SA, storage->device_keys_temp_keys);
+        }
+
+        {
+            uint64_t a = T[n - 8], b = T[n - 7], c = T[n - 6], d = T[n - 5], e = T[n - 4], f = T[n - 3], g = T[n - 2], h = T[n - 1];
+
+            uint64_t s0 = (a << 56) | (b << 48) | (c << 40) | (d << 32) | (e << 24) | (f << 16) | (g << 8) | (h << 0) | (extra_sentinel_bits ? 7 : 0);
+            uint64_t s1 = (b << 56) | (c << 48) | (d << 40) | (e << 32) | (f << 24) | (g << 16) | (h << 8) | (extra_sentinel_bits ? 6 : 0);
+            uint64_t s2 = (c << 56) | (d << 48) | (e << 40) | (f << 32) | (g << 24) | (h << 16) | (extra_sentinel_bits ? 5 : 0);
+            uint64_t s3 = (d << 56) | (e << 48) | (f << 40) | (g << 32) | (h << 24) | (extra_sentinel_bits ? 4 : 0);
+            uint64_t s4 = (e << 56) | (f << 48) | (g << 40) | (h << 32) | (extra_sentinel_bits ? 3 : 0);
+            uint64_t s5 = (f << 56) | (g << 48) | (h << 40) | (extra_sentinel_bits ? 2 : 0);
+            uint64_t s6 = (g << 56) | (h << 48) | (extra_sentinel_bits ? 1 : 0);
+            uint64_t s7 = (h << 56);
+
+            libcubwt_set_sentinel_keys_kernel<<<1, 1, 0, storage->cuda_stream>>>(storage->device_keys_temp_keys + n, s0, s1, s2, s3, s4, s5, s6, s7);
+        }
+
+        storage->num_unsorted_segments   = (uint32_t)1;
+        storage->num_unsorted_suffixes   = (uint32_t)n;
+        storage->is_partial_suffix_array = false;
+    }
+
+    return status;
+}
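+
+/*
+ * Host-side sentinel keys: the last eight suffixes have no full 8-byte
+ * extension in the text, so their prefix keys are assembled on the CPU.
+ * For instance s1 packs the seven bytes T[n-7..n-1] into the high 56 bits
+ * and, when the text ends in two zero bytes, stores 6 in the low bits;
+ * these decreasing sentinel values appear intended to keep the tail keys
+ * distinct from each other and from in-text keys (which carry 7 or 1 in
+ * their low bits).
+ */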
+
+static cudaError_t libcubwt_sort_suffixes_by_prefix(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n)
+{
+    cub::DoubleBuffer<uint64_t> db_keys(storage->device_keys_temp_keys, storage->device_offsets_ISA);
+    cub::DoubleBuffer<uint32_t> db_SA(storage->device_SA, storage->device_temp_SA);
+
+    cudaError_t status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(
+        storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size,
+        db_keys, db_SA,
+        (uint32_t)n,
+        0, 64,
+        storage->cuda_stream));
+
+    if (db_keys.selector)
+    {
+        std::swap(storage->device_keys_temp_keys, storage->device_offsets_ISA);
+
+        std::swap(storage->device_keys, storage->device_offsets);
+        std::swap(storage->device_temp_keys, storage->device_ISA);
+    }
+
+    if (db_SA.selector)
+    {
+        std::swap(storage->device_SA, storage->device_temp_SA);
+    }
+
+    return status;
+}
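+
+/*
+ * After SortPairs, the DoubleBuffer selector records which of the two
+ * attached allocations received the sorted output, so the swaps above
+ * re-alias the storage fields (including the other views that share those
+ * allocations) and keep the "current" and "temp" pointer names truthful
+ * for later passes.
+ */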
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_rank_and_segment_suffixes_initialization_kernel(uint64_t * RESTRICT device_keys, uint8_t * RESTRICT device_heads, uint4 * RESTRICT device_descriptors_large, uint2 * RESTRICT device_descriptors_small, uint32_t n)
+{
+    const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS + threadIdx.x;
+
+    device_descriptors_large += thread_index;
+    device_descriptors_small += thread_index;
+
+    device_descriptors_large[0] = make_uint4(0, 0, 0, 0);
+    device_descriptors_small[0] = make_uint2(0, 0);
+
+    if (blockIdx.x == 0)
+    {
+        if (threadIdx.x < CUDA_WARP_THREADS)
+        {
+            device_descriptors_large[-CUDA_WARP_THREADS] = make_uint4((uint32_t)-1, 0, 0, 0);
+            device_descriptors_small[-CUDA_WARP_THREADS] = make_uint2((uint32_t)-1, 0);
+        }
+
+        {
+            device_keys += threadIdx.x; device_heads += threadIdx.x;
+
+            uint64_t key = (threadIdx.x % 2 == 0) ? 0 : (uint64_t)-1;
+
+            if (threadIdx.x < 2)
+            {
+                device_keys [-2] = key;
+                device_heads[-2] = 1;
+            }
+
+            device_keys += n; device_heads += n;
+
+            device_keys [0 * CUDA_BLOCK_THREADS] = key;
+            device_keys [1 * CUDA_BLOCK_THREADS] = key;
+            device_keys [2 * CUDA_BLOCK_THREADS] = key;
+            device_keys [3 * CUDA_BLOCK_THREADS] = key;
+            device_heads[0 * CUDA_BLOCK_THREADS] = 1;
+            device_heads[1 * CUDA_BLOCK_THREADS] = 1;
+            device_heads[2 * CUDA_BLOCK_THREADS] = 1;
+            device_heads[3 * CUDA_BLOCK_THREADS] = 1;
+        }
+    }
+}
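+
+/*
+ * The ranking kernels below compute block-wide prefix aggregates with a
+ * CUB-style decoupled look-back: each block publishes a descriptor of
+ * (status, unused, rank aggregate, segment-count aggregate), first as a
+ * partial and later as a full aggregate. A warp polls 32 trailing
+ * descriptors at a time, __ballot_sync finds the most recent full
+ * aggregate, and the lanes past it are combined with warp reductions, so
+ * a block normally waits on only a handful of predecessors.
+ */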
+
+template <bool scatter_ranks_directly>
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_rank_and_segment_suffixes_initiatory_kernel(
+    const uint32_t * RESTRICT device_SA,
+    const uint64_t * RESTRICT device_keys,
+    uint8_t * RESTRICT device_heads,
+    uint32_t * RESTRICT device_ISA,
+    uint32_t * RESTRICT device_offsets_begin,
+    uint32_t * RESTRICT device_offsets_end,
+    uint4 * RESTRICT device_descriptors
+)
+{
+    __shared__ __align__(32) uint2 warp_state[1 + CUDA_WARP_THREADS];
+
+    uint32_t thread_exclusive_suffix_rank;
+    uint32_t thread_suffix_rank[4];
+
+    uchar4 thread_segment_end_flag;
+
+    uint32_t thread_exclusive_segment_index;
+    uint32_t thread_segment_index[4];
+
+    {
+        __shared__ __align__(32) ulonglong2 cache[1 + 2 * CUDA_BLOCK_THREADS];
+
+        {
+            device_keys += blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 2;
+
+            if (threadIdx.x == 0) { cache[0] = __ldg((ulonglong2 *)(device_keys - 2)); }
+
+            cache[1 + threadIdx.x + 0 * CUDA_BLOCK_THREADS] = __ldg((ulonglong2 *)(device_keys + 0 * CUDA_BLOCK_THREADS));
+            cache[1 + threadIdx.x + 1 * CUDA_BLOCK_THREADS] = __ldg((ulonglong2 *)(device_keys + 2 * CUDA_BLOCK_THREADS));
+        }
+
+        __syncthreads();
+
+        {
+            const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 4;
+            const uint32_t thread_index = block_index + threadIdx.x * 4;
+
+            ulonglong2 key_a = cache[2 * threadIdx.x + 0];
+            ulonglong2 key_b = cache[2 * threadIdx.x + 1];
+            ulonglong2 key_c = cache[2 * threadIdx.x + 2];
+
+            uchar4 thread_new_heads = make_uchar4(
+                (key_a.y != key_b.x) ? (uint8_t)1 : (uint8_t)0,
+                (key_b.x != key_b.y) ? (uint8_t)1 : (uint8_t)0,
+                (key_b.y != key_c.x) ? (uint8_t)1 : (uint8_t)0,
+                (key_c.x != key_c.y) ? (uint8_t)1 : (uint8_t)0);
+
+            *(uchar4 *)(device_heads + thread_index) = thread_new_heads;
+
+            thread_suffix_rank[0] = (thread_new_heads.x != 0) ? (thread_index + 0) : 0;
+            thread_suffix_rank[1] = (thread_new_heads.y != 0) ? (thread_index + 1) : thread_suffix_rank[0];
+            thread_suffix_rank[2] = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_suffix_rank[1];
+            thread_suffix_rank[3] = (thread_new_heads.w != 0) ? (thread_index + 3) : thread_suffix_rank[2];
+
+            thread_segment_end_flag.x = (thread_new_heads.x != 0) && (key_a.x == key_a.y);
+            thread_segment_end_flag.y = (thread_new_heads.y != 0) && (thread_new_heads.x == 0);
+            thread_segment_end_flag.z = (thread_new_heads.z != 0) && (thread_new_heads.y == 0);
+            thread_segment_end_flag.w = (thread_new_heads.w != 0) && (thread_new_heads.z == 0);
+
+            thread_segment_index[0] = (uint32_t)thread_segment_end_flag.x;
+            thread_segment_index[1] = thread_segment_index[0] + (uint32_t)thread_segment_end_flag.y;
+            thread_segment_index[2] = thread_segment_index[1] + (uint32_t)thread_segment_end_flag.z;
+            thread_segment_index[3] = thread_segment_index[2] + (uint32_t)thread_segment_end_flag.w;
+        }
+    }
+
+    {
+        uint32_t thread_inclusive_suffix_rank;
+        uint32_t thread_inclusive_segment_index;
+
+        typedef cub::WarpScan<uint32_t> WarpScan;
+
+        __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS];
+
+        WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_rank[3] , thread_inclusive_suffix_rank , thread_exclusive_suffix_rank , (uint32_t)0, cub::Max());
+        WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_segment_index[3], thread_inclusive_segment_index, thread_exclusive_segment_index, (uint32_t)0, cub::Sum());
+
+        if ((threadIdx.x % CUDA_WARP_THREADS) == (CUDA_WARP_THREADS - 1))
+        {
+            warp_state[threadIdx.x / CUDA_WARP_THREADS] = make_uint2(thread_inclusive_suffix_rank, thread_inclusive_segment_index);
+        }
+
+        __syncthreads();
+    }
+
+    {
+        if (threadIdx.x < CUDA_WARP_THREADS)
+        {
+            uint32_t block_exclusive_suffix_rank = 0;
+            uint32_t block_exclusive_segment_index = 0;
+
+            uint32_t warp_inclusive_suffix_rank;
+            uint32_t warp_inclusive_segment_index;
+
+            {
+                typedef cub::WarpScan<uint32_t> WarpScan;
+
+                __shared__ typename WarpScan::TempStorage warp_scan_storage;
+
+                uint2 warp_inclusive_state = warp_state[threadIdx.x];
+
+                WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state.x, warp_inclusive_suffix_rank , cub::Max());
+                WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state.y, warp_inclusive_segment_index, cub::Sum());
+            }
+
+            {
+                const uint32_t descriptor_status_aggregate_not_ready     = 0;
+                const uint32_t descriptor_status_partial_aggregate_ready = 1;
+                const uint32_t descriptor_status_full_aggregate_ready    = 4;
+
+                if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1))
+                {
+                    cub::ThreadStore<cub::STORE_CG>(device_descriptors + blockIdx.x, make_uint4(descriptor_status_partial_aggregate_ready, 0, warp_inclusive_suffix_rank, warp_inclusive_segment_index));
+                }
+
+                {
+                    uint4 * RESTRICT descriptors_lookback = device_descriptors + blockIdx.x + threadIdx.x;
+
+                    int32_t full_aggregate_lane, delay = 8;
+                    do
+                    {
+                        descriptors_lookback -= CUDA_WARP_THREADS;
+
+                        uint4 block_descriptor;
+                        do
+                        {
+                            libcubwt_delay_or_prevent_hoisting(delay <<= 1);
+
+                            block_descriptor = cub::ThreadLoad<cub::LOAD_CG>(descriptors_lookback);
+                        } while (__any_sync((uint32_t)-1, block_descriptor.x == descriptor_status_aggregate_not_ready));
+
+                        delay = 0;
+
+                        {
+                            full_aggregate_lane = 31 - __clz((int32_t)__ballot_sync((uint32_t)-1, block_descriptor.x != descriptor_status_partial_aggregate_ready));
+                            block_descriptor.z = (((int32_t)threadIdx.x) >= full_aggregate_lane) ? block_descriptor.z : 0;
+                            block_descriptor.w = (((int32_t)threadIdx.x) >= full_aggregate_lane) ?
block_descriptor.w : 0;
+                        }
+
+                        {
+                            block_exclusive_suffix_rank   = cub::Max()(block_exclusive_suffix_rank  , libcubwt_warp_reduce_max(block_descriptor.z));
+                            block_exclusive_segment_index = cub::Sum()(block_exclusive_segment_index, libcubwt_warp_reduce_sum(block_descriptor.w));
+                        }
+
+                    } while (full_aggregate_lane == -1);
+
+                    warp_inclusive_suffix_rank   = cub::Max()(warp_inclusive_suffix_rank  , block_exclusive_suffix_rank );
+                    warp_inclusive_segment_index = cub::Sum()(warp_inclusive_segment_index, block_exclusive_segment_index);
+                }
+
+                if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1))
+                {
+                    cub::ThreadStore<cub::STORE_CG>(device_descriptors + blockIdx.x, make_uint4(descriptor_status_full_aggregate_ready, 0, warp_inclusive_suffix_rank, warp_inclusive_segment_index));
+                }
+            }
+
+            {
+                if (threadIdx.x == 0)
+                {
+                    warp_state[0] = make_uint2(block_exclusive_suffix_rank, block_exclusive_segment_index);
+                }
+
+                warp_state[1 + threadIdx.x] = make_uint2(warp_inclusive_suffix_rank, warp_inclusive_segment_index);
+            }
+        }
+
+        __syncthreads();
+    }
+
+    {
+        uint2 warp_exclusive_state = warp_state[threadIdx.x / CUDA_WARP_THREADS];
+
+        thread_exclusive_suffix_rank   = cub::Max()(thread_exclusive_suffix_rank  , warp_exclusive_state.x);
+        thread_exclusive_segment_index = cub::Sum()(thread_exclusive_segment_index, warp_exclusive_state.y);
+
+        thread_suffix_rank[0] = cub::Max()(thread_suffix_rank[0], thread_exclusive_suffix_rank);
+        thread_suffix_rank[1] = cub::Max()(thread_suffix_rank[1], thread_exclusive_suffix_rank);
+        thread_suffix_rank[2] = cub::Max()(thread_suffix_rank[2], thread_exclusive_suffix_rank);
+        thread_suffix_rank[3] = cub::Max()(thread_suffix_rank[3], thread_exclusive_suffix_rank);
+
+        thread_segment_index[0] = cub::Sum()(thread_segment_index[0], thread_exclusive_segment_index);
+        thread_segment_index[1] = cub::Sum()(thread_segment_index[1], thread_exclusive_segment_index);
+        thread_segment_index[2] = cub::Sum()(thread_segment_index[2], thread_exclusive_segment_index);
+        thread_segment_index[3] = cub::Sum()(thread_segment_index[3], thread_exclusive_segment_index);
+
+        const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+
+        if (thread_segment_end_flag.x != 0) { device_offsets_begin[thread_segment_index[0]] = thread_exclusive_suffix_rank; device_offsets_end[thread_segment_index[0]] = thread_index + 0; }
+        if (thread_segment_end_flag.y != 0) { device_offsets_begin[thread_segment_index[1]] = thread_suffix_rank[0];        device_offsets_end[thread_segment_index[1]] = thread_index + 1; }
+        if (thread_segment_end_flag.z != 0) { device_offsets_begin[thread_segment_index[2]] = thread_suffix_rank[1];        device_offsets_end[thread_segment_index[2]] = thread_index + 2; }
+        if (thread_segment_end_flag.w != 0) { device_offsets_begin[thread_segment_index[3]] = thread_suffix_rank[2];        device_offsets_end[thread_segment_index[3]] = thread_index + 3; }
+
+        if (scatter_ranks_directly)
+        {
+            const uint4 indexes = __ldg((uint4 *)(device_SA + thread_index));
+
+            device_ISA[indexes.x] = thread_suffix_rank[0];
+            device_ISA[indexes.y] = thread_suffix_rank[1];
+            device_ISA[indexes.z] = thread_suffix_rank[2];
+            device_ISA[indexes.w] = thread_suffix_rank[3];
+        }
+        else
+        {
+            *(uint4 *)(device_ISA + thread_index) = make_uint4(thread_suffix_rank[0], thread_suffix_rank[1], thread_suffix_rank[2], thread_suffix_rank[3]);
+        }
+    }
+}
+
+template <bool alternate_block_descriptor_statuses, bool scatter_ranks_directly>
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_rank_and_segment_suffixes_incremental_kernel(
+    const
uint32_t * RESTRICT device_SA, + const uint32_t * RESTRICT device_keys, + uint8_t * RESTRICT device_heads, + uint32_t * RESTRICT device_out_SA, + uint32_t * RESTRICT device_out_ISA, + uint32_t * RESTRICT device_offsets_begin, + uint32_t * RESTRICT device_offsets_end, + uint4 * RESTRICT device_descriptors, + const uint4 * RESTRICT device_descriptors_copy +) +{ + __shared__ __align__(32) uint4 warp_state1[1 + CUDA_WARP_THREADS]; + __shared__ __align__(32) uint32_t warp_state2[1 + CUDA_WARP_THREADS]; + + uint32_t thread_exclusive_suffix_old_rank; + uint32_t thread_suffix_old_rank[4]; + + uint32_t thread_exclusive_suffix_new_rank; + uint32_t thread_suffix_new_rank[4]; + + uchar4 thread_segment_end_flag; + + uint32_t thread_exclusive_segment_index; + uint32_t thread_segment_index[4]; + + uchar4 thread_updated_suffix_flag; + + uint32_t thread_exclusive_suffix_index; + uint32_t thread_suffix_index[4]; + + { + const uint32_t block_index = blockIdx.x * CUDA_BLOCK_THREADS * 4; + const uint32_t thread_index = block_index + threadIdx.x * 4; + + device_keys += thread_index; device_heads += thread_index; + + uint2 key_a = __ldg((uint2 *)(device_keys - 2)); + uint4 key_b = __ldg((uint4 *)(device_keys)); + uchar4 thread_old_heads = *(uchar4 *)(device_heads); + + uchar4 thread_new_heads = make_uchar4( + (key_a.y != key_b.x) ? (uint8_t)1 : (uint8_t)thread_old_heads.x, + (key_b.x != key_b.y) ? (uint8_t)1 : (uint8_t)thread_old_heads.y, + (key_b.y != key_b.z) ? (uint8_t)1 : (uint8_t)thread_old_heads.z, + (key_b.z != key_b.w) ? (uint8_t)1 : (uint8_t)thread_old_heads.w); + + *(uchar4 *)(device_heads) = thread_new_heads; + + thread_suffix_old_rank[0] = (thread_old_heads.x != 0) ? (thread_index + 0) : 0; + thread_suffix_old_rank[1] = (thread_old_heads.y != 0) ? (thread_index + 1) : thread_suffix_old_rank[0]; + thread_suffix_old_rank[2] = (thread_old_heads.z != 0) ? (thread_index + 2) : thread_suffix_old_rank[1]; + thread_suffix_old_rank[3] = (thread_old_heads.w != 0) ? (thread_index + 3) : thread_suffix_old_rank[2]; + + thread_suffix_new_rank[0] = (thread_new_heads.x != 0) ? (thread_index + 0) : 0; + thread_suffix_new_rank[1] = (thread_new_heads.y != 0) ? (thread_index + 1) : thread_suffix_new_rank[0]; + thread_suffix_new_rank[2] = (thread_new_heads.z != 0) ? (thread_index + 2) : thread_suffix_new_rank[1]; + thread_suffix_new_rank[3] = (thread_new_heads.w != 0) ? 
(thread_index + 3) : thread_suffix_new_rank[2]; + + thread_segment_end_flag.x = (thread_new_heads.x != 0) && (key_a.x == key_a.y) && (device_heads[-1] == 0); + thread_segment_end_flag.y = (thread_new_heads.y != 0) && (thread_new_heads.x == 0); + thread_segment_end_flag.z = (thread_new_heads.z != 0) && (thread_new_heads.y == 0); + thread_segment_end_flag.w = (thread_new_heads.w != 0) && (thread_new_heads.z == 0); + + thread_segment_index[0] = (uint32_t)thread_segment_end_flag.x; + thread_segment_index[1] = thread_segment_index[0] + (uint32_t)thread_segment_end_flag.y; + thread_segment_index[2] = thread_segment_index[1] + (uint32_t)thread_segment_end_flag.z; + thread_segment_index[3] = thread_segment_index[2] + (uint32_t)thread_segment_end_flag.w; + } + + { + uint32_t thread_inclusive_suffix_old_rank; + uint32_t thread_inclusive_suffix_new_rank; + uint32_t thread_inclusive_segment_index; + + typedef cub::WarpScan WarpScan; + + __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS]; + + WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_old_rank[3], thread_inclusive_suffix_old_rank, thread_exclusive_suffix_old_rank, (uint32_t)0, cub::Max()); + WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_new_rank[3], thread_inclusive_suffix_new_rank, thread_exclusive_suffix_new_rank, (uint32_t)0, cub::Max()); + WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_segment_index[3] , thread_inclusive_segment_index , thread_exclusive_segment_index , (uint32_t)0, cub::Sum()); + + if ((threadIdx.x % CUDA_WARP_THREADS) == (CUDA_WARP_THREADS - 1)) + { + warp_state1[threadIdx.x / CUDA_WARP_THREADS] = make_uint4(0, thread_inclusive_suffix_old_rank, thread_inclusive_suffix_new_rank, thread_inclusive_segment_index); + } + + __syncthreads(); + } + + { + if (threadIdx.x < CUDA_WARP_THREADS) + { + uint32_t block_exclusive_suffix_old_rank = __ldg((uint32_t *)(device_descriptors_copy + blockIdx.x - 1) + 2); + uint32_t block_exclusive_suffix_new_rank = 0; + uint32_t block_exclusive_segment_index = 0; + + uint32_t warp_inclusive_suffix_old_rank; + uint32_t warp_inclusive_suffix_new_rank; + uint32_t warp_inclusive_segment_index; + + { + typedef cub::WarpScan WarpScan; + + __shared__ typename WarpScan::TempStorage warp_scan_storage; + + uint4 warp_inclusive_state = warp_state1[threadIdx.x]; + + WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state.y, warp_inclusive_suffix_old_rank, cub::Max()); + WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state.z, warp_inclusive_suffix_new_rank, cub::Max()); + WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state.w, warp_inclusive_segment_index , cub::Sum()); + } + + { + const uint32_t descriptor_status_aggregate_not_ready = alternate_block_descriptor_statuses ? 4 : 0; + const uint32_t descriptor_status_partial_aggregate_ready = alternate_block_descriptor_statuses ? 3 : 1; + const uint32_t descriptor_status_full_aggregate_ready = scatter_ranks_directly ? (alternate_block_descriptor_statuses ? 
0 : 4) : 2; + + if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1)) + { + cub::ThreadStore(device_descriptors + blockIdx.x, make_uint4(descriptor_status_partial_aggregate_ready, 0, warp_inclusive_suffix_new_rank, warp_inclusive_segment_index)); + } + + { + uint4 * RESTRICT descriptors_lookback = device_descriptors + blockIdx.x + threadIdx.x; + + int32_t full_aggregate_lane, delay = 8; + do + { + descriptors_lookback -= CUDA_WARP_THREADS; + + uint4 block_descriptor; + do + { + libcubwt_delay_or_prevent_hoisting(delay <<= 1); + + block_descriptor = cub::ThreadLoad(descriptors_lookback); + } while (__any_sync((uint32_t)-1, block_descriptor.x == descriptor_status_aggregate_not_ready)); + + delay = 0; + + { + full_aggregate_lane = 31 - __clz((int32_t)__ballot_sync((uint32_t)-1, block_descriptor.x != descriptor_status_partial_aggregate_ready)); + block_descriptor.z = (((int32_t)threadIdx.x) >= full_aggregate_lane) ? block_descriptor.z : 0; + block_descriptor.w = (((int32_t)threadIdx.x) >= full_aggregate_lane) ? block_descriptor.w : 0; + } + + { + block_exclusive_suffix_new_rank = cub::Max()(block_exclusive_suffix_new_rank , libcubwt_warp_reduce_max(block_descriptor.z)); + block_exclusive_segment_index = cub::Sum()(block_exclusive_segment_index , libcubwt_warp_reduce_sum(block_descriptor.w)); + } + + } while (full_aggregate_lane == -1); + + warp_inclusive_suffix_old_rank = cub::Max()(warp_inclusive_suffix_old_rank, block_exclusive_suffix_old_rank); + warp_inclusive_suffix_new_rank = cub::Max()(warp_inclusive_suffix_new_rank, block_exclusive_suffix_new_rank); + warp_inclusive_segment_index = cub::Sum()(warp_inclusive_segment_index , block_exclusive_segment_index ); + } + + if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1)) + { + cub::ThreadStore(device_descriptors + blockIdx.x, make_uint4(descriptor_status_full_aggregate_ready, 0, warp_inclusive_suffix_new_rank, warp_inclusive_segment_index)); + } + } + + { + if (threadIdx.x == 0) + { + warp_state1[0] = make_uint4(0, block_exclusive_suffix_old_rank, block_exclusive_suffix_new_rank, block_exclusive_segment_index); + } + + warp_state1[1 + threadIdx.x] = make_uint4(0, warp_inclusive_suffix_old_rank, warp_inclusive_suffix_new_rank, warp_inclusive_segment_index); + } + } + + __syncthreads(); + } + + { + uint4 warp_exclusive_state = warp_state1[threadIdx.x / CUDA_WARP_THREADS]; + + thread_exclusive_suffix_old_rank = cub::Max()(thread_exclusive_suffix_old_rank, warp_exclusive_state.y); + thread_exclusive_suffix_new_rank = cub::Max()(thread_exclusive_suffix_new_rank, warp_exclusive_state.z); + thread_exclusive_segment_index = cub::Sum()(thread_exclusive_segment_index , warp_exclusive_state.w); + + thread_suffix_old_rank[0] = cub::Max()(thread_suffix_old_rank[0], thread_exclusive_suffix_old_rank); + thread_suffix_old_rank[1] = cub::Max()(thread_suffix_old_rank[1], thread_exclusive_suffix_old_rank); + thread_suffix_old_rank[2] = cub::Max()(thread_suffix_old_rank[2], thread_exclusive_suffix_old_rank); + thread_suffix_old_rank[3] = cub::Max()(thread_suffix_old_rank[3], thread_exclusive_suffix_old_rank); + + thread_suffix_new_rank[0] = cub::Max()(thread_suffix_new_rank[0], thread_exclusive_suffix_new_rank); + thread_suffix_new_rank[1] = cub::Max()(thread_suffix_new_rank[1], thread_exclusive_suffix_new_rank); + thread_suffix_new_rank[2] = cub::Max()(thread_suffix_new_rank[2], thread_exclusive_suffix_new_rank); + thread_suffix_new_rank[3] = cub::Max()(thread_suffix_new_rank[3], thread_exclusive_suffix_new_rank); + + 
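+        /* The decoupled look-back above produced block-wide exclusive prefixes, which have
+           just been folded into the per-suffix old and new ranks; the exclusive segment
+           prefix is applied the same way below so segment indexes become globally valid. */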
+        thread_segment_index[0] = cub::Sum()(thread_segment_index[0], thread_exclusive_segment_index);
+        thread_segment_index[1] = cub::Sum()(thread_segment_index[1], thread_exclusive_segment_index);
+        thread_segment_index[2] = cub::Sum()(thread_segment_index[2], thread_exclusive_segment_index);
+        thread_segment_index[3] = cub::Sum()(thread_segment_index[3], thread_exclusive_segment_index);
+
+        const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+
+        if (thread_segment_end_flag.x != 0) { device_offsets_begin[thread_segment_index[0]] = thread_exclusive_suffix_new_rank; device_offsets_end[thread_segment_index[0]] = thread_index + 0; }
+        if (thread_segment_end_flag.y != 0) { device_offsets_begin[thread_segment_index[1]] = thread_suffix_new_rank[0]; device_offsets_end[thread_segment_index[1]] = thread_index + 1; }
+        if (thread_segment_end_flag.z != 0) { device_offsets_begin[thread_segment_index[2]] = thread_suffix_new_rank[1]; device_offsets_end[thread_segment_index[2]] = thread_index + 2; }
+        if (thread_segment_end_flag.w != 0) { device_offsets_begin[thread_segment_index[3]] = thread_suffix_new_rank[2]; device_offsets_end[thread_segment_index[3]] = thread_index + 3; }
+
+        if (scatter_ranks_directly)
+        {
+            const uint4 indexes = __ldg((uint4 *)(device_SA + thread_index));
+
+            if (thread_suffix_old_rank[0] != thread_suffix_new_rank[0]) { device_out_ISA[indexes.x] = thread_suffix_new_rank[0]; }
+            if (thread_suffix_old_rank[1] != thread_suffix_new_rank[1]) { device_out_ISA[indexes.y] = thread_suffix_new_rank[1]; }
+            if (thread_suffix_old_rank[2] != thread_suffix_new_rank[2]) { device_out_ISA[indexes.z] = thread_suffix_new_rank[2]; }
+            if (thread_suffix_old_rank[3] != thread_suffix_new_rank[3]) { device_out_ISA[indexes.w] = thread_suffix_new_rank[3]; }
+        }
+        else
+        {
+            thread_updated_suffix_flag.x = thread_suffix_old_rank[0] != thread_suffix_new_rank[0];
+            thread_updated_suffix_flag.y = thread_suffix_old_rank[1] != thread_suffix_new_rank[1];
+            thread_updated_suffix_flag.z = thread_suffix_old_rank[2] != thread_suffix_new_rank[2];
+            thread_updated_suffix_flag.w = thread_suffix_old_rank[3] != thread_suffix_new_rank[3];
+
+            thread_suffix_index[0] = (uint32_t)thread_updated_suffix_flag.x;
+            thread_suffix_index[1] = thread_suffix_index[0] + (uint32_t)thread_updated_suffix_flag.y;
+            thread_suffix_index[2] = thread_suffix_index[1] + (uint32_t)thread_updated_suffix_flag.z;
+            thread_suffix_index[3] = thread_suffix_index[2] + (uint32_t)thread_updated_suffix_flag.w;
+        }
+    }
+
+    if (!scatter_ranks_directly)
+    {
+        {
+            uint32_t thread_inclusive_suffix_index;
+
+            typedef cub::WarpScan<uint32_t> WarpScan;
+
+            __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS];
+
+            WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_index[3], thread_inclusive_suffix_index, thread_exclusive_suffix_index, (uint32_t)0, cub::Sum());
+
+            if ((threadIdx.x % CUDA_WARP_THREADS) == (CUDA_WARP_THREADS - 1))
+            {
+                warp_state2[threadIdx.x / CUDA_WARP_THREADS] = thread_inclusive_suffix_index;
+            }
+
+            __syncthreads();
+        }
+
+        {
+            if (threadIdx.x < CUDA_WARP_THREADS)
+            {
+                uint32_t block_exclusive_suffix_index = 0;
+                uint32_t warp_inclusive_suffix_index;
+
+                {
+                    typedef cub::WarpScan<uint32_t> WarpScan;
+
+                    __shared__ typename WarpScan::TempStorage warp_scan_storage;
+
+                    uint32_t warp_inclusive_state = warp_state2[threadIdx.x];
+
+                    WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state, warp_inclusive_suffix_index, cub::Sum());
+                }
+
+                {
+                    const uint32_t descriptor_status_aggregate_not_ready     = alternate_block_descriptor_statuses ? 2 : 2;
+                    const uint32_t descriptor_status_partial_aggregate_ready = alternate_block_descriptor_statuses ? 1 : 3;
+                    const uint32_t descriptor_status_full_aggregate_ready    = alternate_block_descriptor_statuses ? 0 : 4;
+
+                    if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1))
+                    {
+                        cub::ThreadStore<cub::STORE_CG>((uint2 *)(device_descriptors + blockIdx.x), make_uint2(descriptor_status_partial_aggregate_ready, warp_inclusive_suffix_index));
+                    }
+
+                    {
+                        uint4 * RESTRICT descriptors_lookback = device_descriptors + blockIdx.x + threadIdx.x;
+
+                        int32_t full_aggregate_lane, delay = 8;
+                        do
+                        {
+                            descriptors_lookback -= CUDA_WARP_THREADS;
+
+                            uint2 block_descriptor;
+                            do
+                            {
+                                libcubwt_delay_or_prevent_hoisting(delay <<= 1);
+
+                                block_descriptor = cub::ThreadLoad<cub::LOAD_CG>((uint2 *)descriptors_lookback);
+                            } while (__any_sync((uint32_t)-1, alternate_block_descriptor_statuses
+                                ? ((int32_t )block_descriptor.x >= (int32_t )descriptor_status_aggregate_not_ready)
+                                : ((uint32_t)block_descriptor.x <= (uint32_t)descriptor_status_aggregate_not_ready)));
+
+                            delay = 0;
+
+                            {
+                                full_aggregate_lane = 31 - __clz((int32_t)__ballot_sync((uint32_t)-1, block_descriptor.x != descriptor_status_partial_aggregate_ready));
+                                block_descriptor.y  = (((int32_t)threadIdx.x) >= full_aggregate_lane) ? block_descriptor.y : 0;
+                            }
+
+                            {
+                                block_exclusive_suffix_index = cub::Sum()(block_exclusive_suffix_index, libcubwt_warp_reduce_sum(block_descriptor.y));
+                            }
+
+                        } while (full_aggregate_lane == -1);
+
+                        warp_inclusive_suffix_index = cub::Sum()(warp_inclusive_suffix_index, block_exclusive_suffix_index);
+                    }
+
+                    if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1))
+                    {
+                        cub::ThreadStore<cub::STORE_CG>((uint2 *)(device_descriptors + blockIdx.x), make_uint2(descriptor_status_full_aggregate_ready, warp_inclusive_suffix_index));
+                    }
+                }
+
+                {
+                    if (threadIdx.x == 0)
+                    {
+                        warp_state2[0] = block_exclusive_suffix_index;
+                    }
+
+                    warp_state2[1 + threadIdx.x] = warp_inclusive_suffix_index;
+                }
+            }
+
+            __syncthreads();
+        }
+
+        {
+            if (thread_updated_suffix_flag.x + thread_updated_suffix_flag.y + thread_updated_suffix_flag.z + thread_updated_suffix_flag.w > 0)
+            {
+                uint32_t warp_exclusive_state = warp_state2[threadIdx.x / CUDA_WARP_THREADS];
+
+                thread_exclusive_suffix_index = cub::Sum()(thread_exclusive_suffix_index, warp_exclusive_state);
+
+                thread_suffix_index[0] = cub::Sum()(thread_suffix_index[0], thread_exclusive_suffix_index);
+                thread_suffix_index[1] = cub::Sum()(thread_suffix_index[1], thread_exclusive_suffix_index);
+                thread_suffix_index[2] = cub::Sum()(thread_suffix_index[2], thread_exclusive_suffix_index);
+                thread_suffix_index[3] = cub::Sum()(thread_suffix_index[3], thread_exclusive_suffix_index);
+
+                const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+                const uint4 indexes = __ldg((uint4 *)(device_SA + thread_index));
+
+                if (thread_updated_suffix_flag.x != 0) { device_out_SA[thread_suffix_index[0]] = indexes.x; device_out_ISA[thread_suffix_index[0]] = thread_suffix_new_rank[0]; }
+                if (thread_updated_suffix_flag.y != 0) { device_out_SA[thread_suffix_index[1]] = indexes.y; device_out_ISA[thread_suffix_index[1]] = thread_suffix_new_rank[1]; }
+                if (thread_updated_suffix_flag.z != 0) { device_out_SA[thread_suffix_index[2]] = indexes.z; device_out_ISA[thread_suffix_index[2]] = thread_suffix_new_rank[2]; }
+                if (thread_updated_suffix_flag.w != 0) { device_out_SA[thread_suffix_index[3]] = indexes.w; device_out_ISA[thread_suffix_index[3]] = thread_suffix_new_rank[3]; }
+            }
+        }
+    }
+}
+
+static cudaError_t libcubwt_rank_and_segment_suffixes(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n, int64_t iteration)
+{
+    cudaError_t status = cudaSuccess;
+    int64_t n_segmentation_blocks   = 1 + (n / (CUDA_BLOCK_THREADS * 4));
+    int64_t n_initialization_blocks = (n_segmentation_blocks + CUDA_BLOCK_THREADS - 1) / CUDA_BLOCK_THREADS;
+    bool    scatter_ranks_directly  = (n <= ((int64_t)1 << (storage->device_L2_cache_bits - 3)));
+
+    if (iteration == 0)
+    {
+        libcubwt_rank_and_segment_suffixes_initialization_kernel<<<(uint32_t)n_initialization_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+            storage->device_keys_temp_keys,
+            storage->device_heads,
+            storage->device_descriptors_large,
+            storage->device_descriptors_small,
+            (uint32_t)n);
+
+        if (scatter_ranks_directly)
+        {
+            libcubwt_rank_and_segment_suffixes_initiatory_kernel<true><<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                storage->device_SA,
+                storage->device_keys_temp_keys,
+                storage->device_heads,
+                storage->device_ISA,
+                storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1,
+                storage->device_descriptors_large);
+        }
+        else
+        {
+            libcubwt_rank_and_segment_suffixes_initiatory_kernel<false><<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                NULL,
+                storage->device_keys_temp_keys,
+                storage->device_heads,
+                storage->device_temp_ISA,
+                storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1,
+                storage->device_descriptors_large);
+        }
+
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->host_pinned_storage, &storage->device_descriptors_large[n_segmentation_blocks - 1], sizeof(uint4), cudaMemcpyDeviceToHost, storage->cuda_stream));
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status);
+
+        if (status == cudaSuccess)
+        {
+            storage->num_unsorted_segments = ((uint4 *)storage->host_pinned_storage)->w;
+
+            if (!scatter_ranks_directly)
+            {
+                if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->device_temp_SA, storage->device_SA, n * sizeof(uint32_t), cudaMemcpyDeviceToDevice, storage->cuda_stream))) == cudaSuccess)
+                {
+                    status = libcubwt_permute_values_uint32(storage, storage->device_temp_SA, storage->device_temp_ISA, storage->device_ISA, n, storage->device_keys, storage->device_temp_keys);
+                }
+            }
+        }
+    }
+    else
+    {
+        if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->device_descriptors_copy - 1, storage->device_descriptors_large - 1, n_segmentation_blocks * sizeof(uint4), cudaMemcpyDeviceToDevice, storage->cuda_stream))) == cudaSuccess)
+        {
+            if (scatter_ranks_directly)
+            {
+                if ((iteration % 2) == 0)
+                {
+                    libcubwt_rank_and_segment_suffixes_incremental_kernel<false, true><<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                        storage->device_SA,
+                        storage->device_keys,
+                        storage->device_heads,
+                        NULL, storage->device_ISA,
+                        storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1,
+                        storage->device_descriptors_large, storage->device_descriptors_copy);
+                }
+                else
+                {
+                    libcubwt_rank_and_segment_suffixes_incremental_kernel<true, true><<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                        storage->device_SA,
+                        storage->device_keys,
+                        storage->device_heads,
+                        NULL, storage->device_ISA,
+                        storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1,
+                        storage->device_descriptors_large, storage->device_descriptors_copy);
+                }
+            }
+            else
+            {
+                if ((iteration % 2) == 0)
+                {
+                    libcubwt_rank_and_segment_suffixes_incremental_kernel<false, false><<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                        storage->device_SA,
+                        storage->device_keys,
+                        storage->device_heads,
+                        storage->device_temp_SA - 1, storage->device_temp_ISA - 1,
+                        storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1,
+                        storage->device_descriptors_large, storage->device_descriptors_copy);
+                }
+                else
+                {
+                    libcubwt_rank_and_segment_suffixes_incremental_kernel<true, false><<<(uint32_t)n_segmentation_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                        storage->device_SA,
+                        storage->device_keys,
+                        storage->device_heads,
+                        storage->device_temp_SA - 1, storage->device_temp_ISA - 1,
+                        storage->device_offsets - 1, storage->device_offsets + (storage->max_length / 2) - 1,
+                        storage->device_descriptors_large, storage->device_descriptors_copy);
+                }
+            }
+
+            status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->host_pinned_storage, &storage->device_descriptors_large[n_segmentation_blocks - 1], sizeof(uint4), cudaMemcpyDeviceToHost, storage->cuda_stream));
+            status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status);
+
+            if (status == cudaSuccess)
+            {
+                storage->num_unsorted_segments = ((uint4 *)storage->host_pinned_storage)->w;
+
+                if (!scatter_ranks_directly)
+                {
+                    uint32_t num_updated_suffixes = ((uint4 *)storage->host_pinned_storage)->y;
+
+                    if (num_updated_suffixes > 0)
+                    {
+                        status = libcubwt_scatter_values_uint32(storage, storage->device_temp_SA, storage->device_temp_ISA, storage->device_ISA, num_updated_suffixes, n, storage->device_keys, storage->device_temp_keys);
+                    }
+                }
+            }
+        }
+    }
+
+    return status;
+}
+
+template <bool alternate_block_descriptor_statuses>
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_gather_unsorted_suffixes_kernel(
+    const uint8_t * RESTRICT device_heads,
+    const uint32_t * RESTRICT device_SA,
+    uint32_t * RESTRICT device_out_keys,
+    uint32_t * RESTRICT device_out_SA,
+    uint2 * RESTRICT device_descriptors)
+{
+    __shared__ __align__(32) uint32_t warp_state[1 + CUDA_WARP_THREADS];
+
+    uchar4 thread_updated_suffix_flag;
+
+    uint32_t thread_exclusive_suffix_index;
+    uint32_t thread_suffix_index[4];
+
+    {
+        device_heads += blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+
+        const uchar4 current_heads = __ldg((uchar4 *)(device_heads));
+        const uint8_t next_head = current_heads.w > 0 ? __ldg(device_heads + 4) : 0;
+
+        thread_updated_suffix_flag.x = (current_heads.x + current_heads.y < 2);
+        thread_updated_suffix_flag.y = (current_heads.y + current_heads.z < 2);
+        thread_updated_suffix_flag.z = (current_heads.z + current_heads.w < 2);
+        thread_updated_suffix_flag.w = (current_heads.w + next_head < 2);
+
+        thread_suffix_index[0] = (uint32_t)thread_updated_suffix_flag.x;
+        thread_suffix_index[1] = thread_suffix_index[0] + (uint32_t)thread_updated_suffix_flag.y;
+        thread_suffix_index[2] = thread_suffix_index[1] + (uint32_t)thread_updated_suffix_flag.z;
+        thread_suffix_index[3] = thread_suffix_index[2] + (uint32_t)thread_updated_suffix_flag.w;
+    }
+
+    {
+        uint32_t thread_inclusive_suffix_index;
+
+        typedef cub::WarpScan<uint32_t> WarpScan;
+
+        __shared__ typename WarpScan::TempStorage warp_scan_storage[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS];
+
+        WarpScan(warp_scan_storage[threadIdx.x / CUDA_WARP_THREADS]).Scan(thread_suffix_index[3], thread_inclusive_suffix_index, thread_exclusive_suffix_index, (uint32_t)0, cub::Sum());
+
+        if ((threadIdx.x % CUDA_WARP_THREADS) == (CUDA_WARP_THREADS - 1))
+        {
+            warp_state[threadIdx.x / CUDA_WARP_THREADS] = thread_inclusive_suffix_index;
+        }
+
+        __syncthreads();
+    }
+
+    {
+        if (threadIdx.x < CUDA_WARP_THREADS)
+        {
+            uint32_t block_exclusive_suffix_index = 0;
+            uint32_t warp_inclusive_suffix_index;
+
+            {
+                typedef cub::WarpScan<uint32_t> WarpScan;
+
+                __shared__ typename WarpScan::TempStorage warp_scan_storage;
+
+                uint32_t warp_inclusive_state = warp_state[threadIdx.x];
+
+                WarpScan(warp_scan_storage).InclusiveScan(warp_inclusive_state, warp_inclusive_suffix_index, cub::Sum());
+            }
+
+            {
+                const uint32_t descriptor_status_aggregate_not_ready     = alternate_block_descriptor_statuses ? 2 : 0;
+                const uint32_t descriptor_status_partial_aggregate_ready = alternate_block_descriptor_statuses ? 1 : 1;
+                const uint32_t descriptor_status_full_aggregate_ready    = alternate_block_descriptor_statuses ? 0 : 2;
+
+                if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1))
+                {
+                    cub::ThreadStore<cub::STORE_CG>(device_descriptors + blockIdx.x, make_uint2(descriptor_status_partial_aggregate_ready, warp_inclusive_suffix_index));
+                }
+
+                {
+                    uint2 * RESTRICT descriptors_lookback = device_descriptors + blockIdx.x + threadIdx.x;
+
+                    int32_t full_aggregate_lane, delay = 8;
+                    do
+                    {
+                        descriptors_lookback -= CUDA_WARP_THREADS;
+
+                        uint2 block_descriptor;
+                        do
+                        {
+                            libcubwt_delay_or_prevent_hoisting(delay <<= 1);
+
+                            block_descriptor = cub::ThreadLoad<cub::LOAD_CG>(descriptors_lookback);
+                        } while (__any_sync((uint32_t)-1, block_descriptor.x == descriptor_status_aggregate_not_ready));
+
+                        delay = 0;
+
+                        {
+                            full_aggregate_lane = 31 - __clz((int32_t)__ballot_sync((uint32_t)-1, block_descriptor.x != descriptor_status_partial_aggregate_ready));
+                            block_descriptor.y  = (((int32_t)threadIdx.x) >= full_aggregate_lane) ? block_descriptor.y : 0;
+                        }
+
+                        {
+                            block_exclusive_suffix_index = cub::Sum()(block_exclusive_suffix_index, libcubwt_warp_reduce_sum(block_descriptor.y));
+                        }
+
+                    } while (full_aggregate_lane == -1);
+
+                    warp_inclusive_suffix_index = cub::Sum()(warp_inclusive_suffix_index, block_exclusive_suffix_index);
+                }
+
+                if (threadIdx.x == ((CUDA_BLOCK_THREADS / CUDA_WARP_THREADS) - 1))
+                {
+                    cub::ThreadStore<cub::STORE_CG>(device_descriptors + blockIdx.x, make_uint2(descriptor_status_full_aggregate_ready, warp_inclusive_suffix_index));
+                }
+            }
+
+            {
+                if (threadIdx.x == 0)
+                {
+                    warp_state[0] = block_exclusive_suffix_index;
+                }
+
+                warp_state[1 + threadIdx.x] = warp_inclusive_suffix_index;
+            }
+        }
+
+        __syncthreads();
+    }
+
+    {
+        if (thread_updated_suffix_flag.x + thread_updated_suffix_flag.y + thread_updated_suffix_flag.z + thread_updated_suffix_flag.w > 0)
+        {
+            uint32_t warp_exclusive_state = warp_state[threadIdx.x / CUDA_WARP_THREADS];
+
+            thread_exclusive_suffix_index = cub::Sum()(thread_exclusive_suffix_index, warp_exclusive_state);
+
+            thread_suffix_index[0] = cub::Sum()(thread_suffix_index[0], thread_exclusive_suffix_index);
+            thread_suffix_index[1] = cub::Sum()(thread_suffix_index[1], thread_exclusive_suffix_index);
+            thread_suffix_index[2] = cub::Sum()(thread_suffix_index[2], thread_exclusive_suffix_index);
+            thread_suffix_index[3] = cub::Sum()(thread_suffix_index[3], thread_exclusive_suffix_index);
+
+            const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+            const uint4 indexes = __ldg((uint4 *)(device_SA + thread_index));
+
+            if (thread_updated_suffix_flag.x != 0) { device_out_keys[thread_suffix_index[0]] = thread_index + 0; device_out_SA[thread_suffix_index[0]] = indexes.x; }
+            if (thread_updated_suffix_flag.y != 0) { device_out_keys[thread_suffix_index[1]] = thread_index + 1; device_out_SA[thread_suffix_index[1]] = indexes.y; }
+            if (thread_updated_suffix_flag.z != 0) { device_out_keys[thread_suffix_index[2]] = thread_index + 2; device_out_SA[thread_suffix_index[2]] = indexes.z; }
+            if (thread_updated_suffix_flag.w != 0) { device_out_keys[thread_suffix_index[3]] = thread_index + 3; device_out_SA[thread_suffix_index[3]] = indexes.w; }
+        }
+    }
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_update_suffix_sorting_keys_kernel(const uint8_t * RESTRICT device_heads, const uint32_t * RESTRICT device_SA, const uint32_t * RESTRICT device_ISA, uint32_t * RESTRICT device_keys)
+{
+    const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+
+    device_heads += thread_index;
+
+    const uchar4 current_heads = __ldg((uchar4 *)(device_heads));
+    const uint8_t next_head = current_heads.w > 0 ? __ldg(device_heads + 4) : 0;
+
+    if (current_heads.x + current_heads.y + current_heads.z + current_heads.w + next_head < 5)
+    {
+        device_SA += thread_index; device_keys += thread_index;
+
+        const uint4 current_SA = __ldg((uint4 *)(device_SA));
+
+        ((uint4 *)device_keys)[0] = make_uint4(
+            (current_heads.x + current_heads.y < 2) ? __ldg(device_ISA + current_SA.x) : (uint32_t)-1,
+            (current_heads.y + current_heads.z < 2) ? __ldg(device_ISA + current_SA.y) : (uint32_t)-2,
+            (current_heads.z + current_heads.w < 2) ? __ldg(device_ISA + current_SA.z) : (uint32_t)-3,
+            (current_heads.w + next_head < 2) ? __ldg(device_ISA + current_SA.w) : (uint32_t)-4);
+    }
+}
+
+static cudaError_t libcubwt_update_suffix_sorting_keys(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n, int64_t iteration, int64_t depth)
+{
+    cudaError_t status = cudaSuccess;
+    int64_t n_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4);
+    bool gather_keys_directly = (n <= ((int64_t)1 << (storage->device_L2_cache_bits - 2))) || (n > ((int64_t)1 << (storage->device_L2_cache_bits - 2 + 8)));
+
+    if (gather_keys_directly || (storage->num_unsorted_suffixes <= (n / 4)))
+    {
+        libcubwt_update_suffix_sorting_keys_kernel<<<(uint32_t)n_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_heads, storage->device_SA, storage->device_ISA + depth, storage->device_keys);
+    }
+    else
+    {
+        if ((iteration % 2) == 0)
+        {
+            libcubwt_gather_unsorted_suffixes_kernel<false><<<(uint32_t)n_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                storage->device_heads,
+                storage->device_SA,
+                storage->device_temp_keys - 1, storage->device_temp_SA - 1,
+                storage->device_descriptors_small);
+        }
+        else
+        {
+            libcubwt_gather_unsorted_suffixes_kernel<true><<<(uint32_t)n_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(
+                storage->device_heads,
+                storage->device_SA,
+                storage->device_temp_keys - 1, storage->device_temp_SA - 1,
+                storage->device_descriptors_small);
+        }
+
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(storage->host_pinned_storage, &storage->device_descriptors_small[n_blocks - 1], sizeof(uint2), cudaMemcpyDeviceToHost, storage->cuda_stream));
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status);
+
+        if (status == cudaSuccess)
+        {
+            storage->num_unsorted_suffixes = ((uint2 *)storage->host_pinned_storage)->y;
+
+            if (storage->num_unsorted_suffixes > 0)
+            {
+                status = libcubwt_gather_scatter_values_uint32(storage, storage->device_temp_SA, storage->device_ISA + depth, storage->device_temp_keys, storage->device_keys, storage->num_unsorted_suffixes, n, storage->device_temp_ISA, storage->device_keys);
+            }
+        }
+    }
+
+    return status;
+}
+
+static cudaError_t libcubwt_sort_segmented_suffixes_by_rank(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n)
+{
+    cub::DoubleBuffer<uint32_t> d_keys(storage->device_keys, storage->device_temp_keys);
+    cub::DoubleBuffer<uint32_t> d_values(storage->device_SA, storage->device_temp_SA);
+
+    cudaError_t status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceSegmentedSort::SortPairs(
+        storage->device_ssort_temp_storage, storage->device_ssort_temp_storage_size,
+        d_keys, d_values,
+        (int)n, (int)storage->num_unsorted_segments,
+        storage->device_offsets, storage->device_offsets + (storage->max_length / 2),
+        storage->cuda_stream));
+
+    if (d_keys.selector)   { std::swap(storage->device_keys, storage->device_temp_keys); }
+    if (d_values.selector) { std::swap(storage->device_SA, storage->device_temp_SA); storage->is_partial_suffix_array = true; }
+
+    return status;
+}
+
+static cudaError_t libcubwt_compute_inverse_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, int64_t n)
+{
+    cudaError_t status = cudaSuccess;
+
+    if ((status = libcubwt_initialize_device_arrays(storage, T, n)) == cudaSuccess &&
+        (status = libcubwt_sort_suffixes_by_prefix(storage, n)) == cudaSuccess)
+    {
+        for (int64_t iteration = 0, depth = 7; true; iteration += 1, depth *= 2)
+        {
+            if ((status = libcubwt_rank_and_segment_suffixes(storage, n, iteration)) != cudaSuccess)
+            {
+                break;
+            }
+
+            if (storage->num_unsorted_segments == 0)
+            {
+                break;
+            }
+
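+            /* Prefix-doubling step: suffixes in still-unsorted segments are re-keyed by the
+               rank of the suffix `depth` positions ahead (read through the ISA), then only
+               those segments are re-sorted; depth doubles each round, so the loop converges
+               after roughly log2(n) iterations once every segment holds a single suffix. */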
+            if ((status = libcubwt_update_suffix_sorting_keys(storage, n, iteration, depth)) != cudaSuccess)
+            {
+                break;
+            }
+
+            if ((status = libcubwt_sort_segmented_suffixes_by_rank(storage, n)) != cudaSuccess)
+            {
+                break;
+            }
+        }
+    }
+
+    return status;
+}
+
+static cudaError_t libcubwt_copy_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * SA, int64_t n)
+{
+    return libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(SA, storage->device_SA, n * sizeof(uint32_t), cudaMemcpyDeviceToHost, storage->cuda_stream));
+}
+
+static cudaError_t libcubwt_copy_inverse_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, uint32_t * ISA, int64_t n)
+{
+    return libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(ISA, storage->device_ISA, n * sizeof(uint32_t), cudaMemcpyDeviceToHost, storage->cuda_stream));
+}
+
+static cudaError_t libcubwt_synchronize_cuda_stream(LIBCUBWT_DEVICE_STORAGE * storage)
+{
+    return libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream));
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, 1)
+static void libcubwt_extract_auxiliary_indexes_kernel(const uint32_t * RESTRICT device_ISA, uint32_t * RESTRICT device_offsets, uint32_t n, uint32_t r)
+{
+    for (uint32_t thread_index = threadIdx.x; thread_index < n; thread_index += CUDA_BLOCK_THREADS)
+    {
+        device_offsets[thread_index] = __ldg(device_ISA + (thread_index * r));
+    }
+}
+
+static cudaError_t libcubwt_extract_auxiliary_indexes(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n, int64_t r, uint32_t * I)
+{
+    cudaError_t status = cudaSuccess;
+    int64_t n_indexes = (n + r - 1) / r;
+    uint32_t * buffer = ((sizeof(uint32_t) * n_indexes) <= storage->host_pinned_storage_size) ? (uint32_t *)storage->host_pinned_storage : I;
+
+    if (n_indexes > 1)
+    {
+        libcubwt_extract_auxiliary_indexes_kernel<<<1, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_ISA, storage->device_offsets, (uint32_t)n_indexes, (uint32_t)r);
+
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(buffer, storage->device_offsets, sizeof(uint32_t) * n_indexes, cudaMemcpyDeviceToHost, storage->cuda_stream), status);
+    }
+    else
+    {
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(buffer, storage->device_ISA, sizeof(uint32_t) * n_indexes, cudaMemcpyDeviceToHost, storage->cuda_stream), status);
+    }
+
+    if ((status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status)) == cudaSuccess)
+    {
+        if (I != buffer) { memcpy(I, buffer, sizeof(uint32_t) * n_indexes); }
+
+        for (int64_t index = 0; index < n_indexes; index += 1) { I[index] += 1; }
+    }
+
+    return status;
+}
+
+__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS)
+static void libcubwt_reinitialize_suffix_array_kernel(uint32_t * RESTRICT device_SA)
+{
+    const uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 4 + threadIdx.x * 4;
+
+    *(uint4 *)(device_SA + thread_index) = make_uint4(thread_index + 0, thread_index + 1, thread_index + 2, thread_index + 3);
+}
+
+static cudaError_t libcubwt_compute_suffix_array(LIBCUBWT_DEVICE_STORAGE * storage, int64_t n)
+{
+    cudaError_t status = cudaSuccess;
+
+    if (storage->is_partial_suffix_array)
+    {
+        int64_t n_reinitialize_blocks = (n + CUDA_BLOCK_THREADS * 4 - 1) / (CUDA_BLOCK_THREADS * 4);
+
+        libcubwt_reinitialize_suffix_array_kernel<<<(uint32_t)n_reinitialize_blocks, CUDA_BLOCK_THREADS, 0, storage->cuda_stream>>>(storage->device_temp_SA);
+
+        status = libcubwt_permute_values_uint32(storage,
            storage->device_ISA, storage->device_temp_SA, storage->device_SA, n, storage->device_keys, storage->device_temp_keys);
+    }
+
+    return status;
+}
+
+static cudaError_t libcubwt_compute_burrows_wheeler_transform(LIBCUBWT_DEVICE_STORAGE * storage, const uint8_t * T, uint8_t * L, int64_t n, int64_t index)
+{
+    cudaError_t status = cudaSuccess;
+
+    if ((status = libcubwt_permute_values_uint8(storage, storage->device_ISA, storage->device_T - 1, storage->device_heads, n, storage->device_keys, (uint8_t *)storage->device_temp_keys)) == cudaSuccess)
+    {
+        L[0] = T[n - 1];
+
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(L + 1, storage->device_heads, (size_t)(index - 1), cudaMemcpyDeviceToHost, storage->cuda_stream), status);
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(L + index, storage->device_heads + index, (size_t)(n - index), cudaMemcpyDeviceToHost, storage->cuda_stream), status);
+    }
+
+    return status;
+}
+
+int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_length)
+{
+    max_length = ((max_length + (int64_t)(CUDA_DEVICE_PADDING) - (int64_t)1) / (int64_t)(CUDA_DEVICE_PADDING)) * (int64_t)(CUDA_DEVICE_PADDING);
+
+    if ((device_storage == NULL) || (max_length >= UINT32_MAX))
+    {
+        return LIBCUBWT_BAD_PARAMETER;
+    }
+
+    *device_storage = NULL;
+
+    LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)malloc(sizeof(LIBCUBWT_DEVICE_STORAGE));
+    if (storage != NULL)
+    {
+        memset(storage, 0, sizeof(LIBCUBWT_DEVICE_STORAGE));
+
+        cudaError_t status = cudaSuccess;
+
+        int32_t cuda_device_ordinal;
+        int32_t cuda_device_L2_cache_size;
+
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaGetDevice(&cuda_device_ordinal), status);
+        status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&cuda_device_L2_cache_size, cudaDevAttrL2CacheSize, cuda_device_ordinal), status);
+
+        if (status == cudaSuccess)
+        {
+            {
+                storage->device_L2_cache_bits = 0; while (cuda_device_L2_cache_size >>= 1) { storage->device_L2_cache_bits += 1; };
+            }
+
+            {
+                cub::DoubleBuffer<uint8_t>  uint8_db;
+                cub::DoubleBuffer<uint32_t> uint32_db;
+                cub::DoubleBuffer<uint64_t> uint64_db;
+
+                size_t temp_radix_sort_k32v8 = 0;
+                status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k32v8, uint32_db, uint8_db, (uint32_t)max_length), status);
+
+                size_t temp_radix_sort_k32v32 = 0;
+                status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k32v32, uint32_db, uint32_db, (uint32_t)max_length), status);
+
+                size_t temp_radix_sort_k64v32 = 0;
+                status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(NULL, temp_radix_sort_k64v32, uint64_db, uint32_db, (uint32_t)max_length), status);
+
+                storage->device_rsort_temp_storage_size = std::max(std::max(temp_radix_sort_k32v8, temp_radix_sort_k32v32), temp_radix_sort_k64v32);
+            }
+
+            {
+                cub::DoubleBuffer<uint32_t> uint32_db;
+
+                size_t temp_radix_segmented_sort_k32v32 = 0;
+                status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceSegmentedSort::SortPairs(NULL, temp_radix_segmented_sort_k32v32, uint32_db, uint32_db, (int)max_length, (int)max_length / 2, uint32_db.Current(), uint32_db.Current()), status);
+
+                storage->device_ssort_temp_storage_size = std::max(temp_radix_segmented_sort_k32v32, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t));
+            }
+
+            int64_t n_descriptors = (max_length / (CUDA_BLOCK_THREADS * 4) + 4) & (-4);
+
+            status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void
**)&storage->device_alloc1, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t)), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc2, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t)), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc3, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t)), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc4, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint32_t)), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_ssort_temp_storage, storage->device_ssort_temp_storage_size), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc5, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t)) , status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc6, (max_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t)) , status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_rsort_temp_storage, storage->device_rsort_temp_storage_size), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_alloc7, (3 * n_descriptors + (int64_t)6 * CUDA_DEVICE_PADDING) * sizeof(uint4)) , status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size = CUDA_DEVICE_PADDING * sizeof(uint32_t)), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamCreate(&storage->cuda_stream), status); + + if (status == cudaSuccess) + { + storage->max_length = max_length; + + storage->device_T = (uint8_t *)storage->device_alloc5 + CUDA_DEVICE_PADDING; + storage->device_heads = (uint8_t *)storage->device_alloc6 + CUDA_DEVICE_PADDING; + + storage->device_SA = (uint32_t *)storage->device_alloc3 + CUDA_DEVICE_PADDING; + storage->device_temp_SA = (uint32_t *)storage->device_alloc4 + CUDA_DEVICE_PADDING; + + storage->device_temp_ISA = (uint32_t *)storage->device_ssort_temp_storage + CUDA_DEVICE_PADDING; + + storage->device_keys = (uint32_t *)storage->device_alloc1 + CUDA_DEVICE_PADDING; + storage->device_offsets = (uint32_t *)storage->device_alloc2 + CUDA_DEVICE_PADDING; + + storage->device_temp_keys = (uint32_t *)storage->device_alloc1 + 3 * CUDA_DEVICE_PADDING + max_length; + storage->device_ISA = (uint32_t *)storage->device_alloc2 + 3 * CUDA_DEVICE_PADDING + max_length; + + storage->device_keys_temp_keys = (uint64_t *)storage->device_alloc1 + CUDA_DEVICE_PADDING; + storage->device_offsets_ISA = (uint64_t *)storage->device_alloc2 + CUDA_DEVICE_PADDING; + + storage->device_descriptors_large = (uint4 *)storage->device_alloc7 + CUDA_DEVICE_PADDING; + storage->device_descriptors_copy = (uint4 *)storage->device_alloc7 + 3 * CUDA_DEVICE_PADDING + 1 * n_descriptors; + storage->device_descriptors_small = (uint2 *)storage->device_alloc7 + 10 * CUDA_DEVICE_PADDING + 4 * n_descriptors; + + *device_storage = storage; + return LIBCUBWT_NO_ERROR; + } + } + + libcubwt_free_device_storage(storage); + + return libcubwt_get_error_code(status); + } + + return LIBCUBWT_NOT_ENOUGH_MEMORY; +} + +int64_t libcubwt_free_device_storage(void * device_storage) +{ + cudaError_t status = cudaSuccess; + + LIBCUBWT_DEVICE_STORAGE * storage = 
(LIBCUBWT_DEVICE_STORAGE *)device_storage; + if (storage != NULL) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamDestroy(storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFreeHost((void *)storage->host_pinned_storage), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc7), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_rsort_temp_storage), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc6), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc5), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_ssort_temp_storage), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc4), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc3), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc2), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaFree((void *)storage->device_alloc1), status); + + free(storage); + } + + return status != cudaSuccess ? libcubwt_get_error_code(status) : LIBCUBWT_NO_ERROR; +} + +int64_t libcubwt_sa(void * device_storage, const uint8_t * T, uint32_t * SA, int64_t n) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + if ((storage == NULL) || (T == NULL) || (SA == NULL) || (n < 16) || (n > storage->max_length)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + cudaError_t status; + if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && + (status = libcubwt_compute_suffix_array(storage, n)) == cudaSuccess && + (status = libcubwt_copy_suffix_array(storage, SA, n)) == cudaSuccess && + (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + { + return LIBCUBWT_NO_ERROR; + } + + return libcubwt_get_error_code(status); +} + +int64_t libcubwt_isa(void * device_storage, const uint8_t * T, uint32_t * ISA, int64_t n) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + if ((storage == NULL) || (T == NULL) || (ISA == NULL) || (n < 16) || (n > storage->max_length)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + cudaError_t status; + if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && + (status = libcubwt_copy_inverse_suffix_array(storage, ISA, n)) == cudaSuccess && + (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + { + return LIBCUBWT_NO_ERROR; + } + + return libcubwt_get_error_code(status); +} + +int64_t libcubwt_sa_isa(void * device_storage, const uint8_t * T, uint32_t * SA, uint32_t * ISA, int64_t n) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + if ((storage == NULL) || (T == NULL) || (ISA == NULL) || (n < 16) || (n > storage->max_length)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + cudaError_t status; + if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && + (status = libcubwt_copy_inverse_suffix_array(storage, ISA, n)) == cudaSuccess && + (status = libcubwt_compute_suffix_array(storage, n)) == cudaSuccess && + (status = libcubwt_copy_suffix_array(storage, SA, n)) == cudaSuccess && + (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + { + return 
LIBCUBWT_NO_ERROR; + } + + return libcubwt_get_error_code(status); +} + +int64_t libcubwt_bwt(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + if ((storage == NULL) || (T == NULL) || (L == NULL) || (n < 16) || (n > storage->max_length)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + cudaError_t status; uint32_t index; + if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && + (status = libcubwt_extract_auxiliary_indexes(storage, n, n, &index)) == cudaSuccess && + (status = libcubwt_compute_burrows_wheeler_transform(storage, T, L, n, index)) == cudaSuccess && + (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + { + return index; + } + + return libcubwt_get_error_code(status); +} + +int64_t libcubwt_bwt_isa(void * device_storage, const uint8_t * T, uint8_t * L, uint32_t * ISA, int64_t n) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + if ((storage == NULL) || (T == NULL) || (L == NULL) || (n < 16) || (n > storage->max_length)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + cudaError_t status; uint32_t index; + if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && + (status = libcubwt_copy_inverse_suffix_array(storage, ISA, n)) == cudaSuccess && + (status = libcubwt_extract_auxiliary_indexes(storage, n, n, &index)) == cudaSuccess && + (status = libcubwt_compute_burrows_wheeler_transform(storage, T, L, n, index)) == cudaSuccess && + (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + { + return index; + } + + return libcubwt_get_error_code(status); +} + +int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n, int64_t r, uint32_t * I) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + if ((storage == NULL) || (T == NULL) || (L == NULL) || (n < 16) || (n > storage->max_length) || (r < 2) || ((r & (r - 1)) != 0) || (I == NULL)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + cudaError_t status; + if ((status = libcubwt_compute_inverse_suffix_array(storage, T, n)) == cudaSuccess && + (status = libcubwt_extract_auxiliary_indexes(storage, n, r, I)) == cudaSuccess && + (status = libcubwt_compute_burrows_wheeler_transform(storage, T, L, n, I[0])) == cudaSuccess && + (status = libcubwt_synchronize_cuda_stream(storage)) == cudaSuccess) + { + return LIBCUBWT_NO_ERROR; + } + + return libcubwt_get_error_code(status); +} diff --git a/libbsc/bwt/libcubwt/libcubwt.cuh b/libbsc/bwt/libcubwt/libcubwt.cuh new file mode 100644 index 0000000..6872c5a --- /dev/null +++ b/libbsc/bwt/libcubwt/libcubwt.cuh @@ -0,0 +1,128 @@ +/*-- + +This file is a part of libcubwt, a library for CUDA accelerated +suffix array and burrows wheeler transform construction. + + Copyright (c) 2022-2023 Ilya Grebnov + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. + +Please see the file LICENSE for full copyright and license details. 
+ +--*/ + +#ifndef LIBCUBWT_CUH +#define LIBCUBWT_CUH 1 + +#define LIBCUBWT_VERSION_MAJOR 1 +#define LIBCUBWT_VERSION_MINOR 0 +#define LIBCUBWT_VERSION_PATCH 0 +#define LIBCUBWT_VERSION_STRING "1.0.0" + +#define LIBCUBWT_NO_ERROR 0 +#define LIBCUBWT_BAD_PARAMETER -1 +#define LIBCUBWT_NOT_ENOUGH_MEMORY -2 + +#define LIBCUBWT_GPU_ERROR -7 +#define LIBCUBWT_GPU_NOT_SUPPORTED -8 +#define LIBCUBWT_GPU_NOT_ENOUGH_MEMORY -9 + +#ifdef __cplusplus +extern "C" { +#endif + + #include + + /** + * Allocates storage on the CUDA device that allows reusing allocated memory with each libcubwt operation. + * @param max_length The maximum length of string to support. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_length); + + /** + * Destroys the previously allocated storage on the CUDA device. + * @param device_storage The previously allocated storage on the CUDA device. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_free_device_storage(void * device_storage); + + /** + * Constructs the suffix array (SA) of a given string. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The output array of suffixes. + * @param n The length of the input string. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_sa(void * device_storage, const uint8_t * T, uint32_t * SA, int64_t n); + + /** + * Constructs the inverse suffix array (ISA) of a given string. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param ISA [0..n-1] The output inverse array of suffixes. + * @param n The length of the input string. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_isa(void * device_storage, const uint8_t * T, uint32_t * ISA, int64_t n); + + /** + * Constructs the suffix array (SA) and inverse suffix array (ISA) of a given string. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param SA [0..n-1] The output array of suffixes. + * @param ISA [0..n-1] The output inverse array of suffixes. + * @param n The length of the input string. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_sa_isa(void * device_storage, const uint8_t * T, uint32_t * SA, uint32_t * ISA, int64_t n); + + /** + * Constructs the Burrows-Wheeler Transform (BWT) of a given string. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param L [0..n-1] The output string (can be T). + * @param n The length of the input string. + * @return The primary index if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_bwt(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n); + + /** + * Constructs the Burrows-Wheeler Transform (BWT) and inverse suffix array (ISA) of a given string. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param L [0..n-1] The output string (can be T). + * @param ISA [0..n-1] The output inverse array of suffixes. + * @param n The length of the input string. 
+ * @return The primary index if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_bwt_isa(void * device_storage, const uint8_t * T, uint8_t * L, uint32_t * ISA, int64_t n); + + /** + * Constructs the Burrows-Wheeler Transform (BWT) of a given string with auxiliary indexes. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param L [0..n-1] The output string (can be T). + * @param n The length of the input string. + * @param r The sampling rate for auxiliary indexes (must be power of 2). + * @param I [0..(n-1)/r] The output auxiliary indexes. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. + */ + int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n, int64_t r, uint32_t * I); + +#ifdef __cplusplus +} +#endif + +#endif diff --git a/libbsc/bwt/libsais/CHANGES b/libbsc/bwt/libsais/CHANGES new file mode 100644 index 0000000..c41b978 --- /dev/null +++ b/libbsc/bwt/libsais/CHANGES @@ -0,0 +1,34 @@ +Changes in 2.7.1 (June 19, 2022) +- Improved cache coherence for ARMv8 architecture. + +Changes in 2.7.0 (April 12, 2022) +- Support for longest common prefix array (LCP) construction. + +Changes in 2.6.5 (January 1, 2022) +- Exposed functions to construct suffix array of a given integer array. +- Improved detection of various compiler intrinsics. +- Capped free space parameter to avoid crashing due to 32-bit integer overflow. + +Changes in 2.6.0 (October 21, 2021) +- libsais16 for 16-bit inputs. + +Changes in 2.5.0 (October 15, 2021) +- Support for optional symbol frequency tables. + +Changes in 2.4.0 (July 14, 2021) +- Reverse Burrows-Wheeler transform. + +Changes in 2.3.0 (June 23, 2021) +- Burrows-Wheeler transform with auxiliary indexes. + +Changes in 2.2.0 (April 27, 2021) +- libsais64 for inputs larger than 2GB. + +Changes in 2.1.0 (April 19, 2021) +- Additional OpenMP acceleration. + +Changes in 2.0.0 (April 4, 2021) +- OpenMP acceleration. + +Changes in 1.0.0 (February 23, 2021) +- Initial Release. diff --git a/libbsc/bwt/libsais/LICENSE b/libbsc/bwt/libsais/LICENSE new file mode 100644 index 0000000..d645695 --- /dev/null +++ b/libbsc/bwt/libsais/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. 
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner. For the purposes of this definition, "submitted"
+      means any form of electronic, verbal, or written communication sent
+      to the Licensor or its representatives, including but not limited to
+      communication on electronic mailing lists, source code control systems,
+      and issue tracking systems that are managed by, or on behalf of, the
+      Licensor for the purpose of discussing and improving the Work, but
+      excluding communication that is conspicuously marked or otherwise
+      designated in writing by the copyright owner as "Not a Contribution."
+
+      "Contributor" shall mean Licensor and any individual or Legal Entity
+      on behalf of whom a Contribution has been received by Licensor and
+      subsequently incorporated within the Work.
+
+   2. Grant of Copyright License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      copyright license to reproduce, prepare Derivative Works of,
+      publicly display, publicly perform, sublicense, and distribute the
+      Work and such Derivative Works in Source or Object form.
+
+   3. Grant of Patent License. Subject to the terms and conditions of
+      this License, each Contributor hereby grants to You a perpetual,
+      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
+      (except as stated in this section) patent license to make, have made,
+      use, offer to sell, sell, import, and otherwise transfer the Work,
+      where such license applies only to those patent claims licensable
+      by such Contributor that are necessarily infringed by their
+      Contribution(s) alone or by combination of their Contribution(s)
+      with the Work to which such Contribution(s) was submitted. If You
+      institute patent litigation against any entity (including a
+      cross-claim or counterclaim in a lawsuit) alleging that the Work
+      or a Contribution incorporated within the Work constitutes direct
+      or contributory patent infringement, then any patent licenses
+      granted to You under this License for that Work shall terminate
+      as of the date such litigation is filed.
+
+   4. Redistribution. You may reproduce and distribute copies of the
+      Work or Derivative Works thereof in any medium, with or without
+      modifications, and in Source or Object form, provided that You
+      meet the following conditions:
+
+      (a) You must give any other recipients of the Work or
+          Derivative Works a copy of this License; and
+
+      (b) You must cause any modified files to carry prominent notices
+          stating that You changed the files; and
+
+      (c) You must retain, in the Source form of any Derivative Works
+          that You distribute, all copyright, patent, trademark, and
+          attribution notices from the Source form of the Work,
+          excluding those notices that do not pertain to any part of
+          the Derivative Works; and
+
+      (d) If the Work includes a "NOTICE" text file as part of its
+          distribution, then any Derivative Works that You distribute must
+          include a readable copy of the attribution notices contained
+          within such NOTICE file, excluding those notices that do not
+          pertain to any part of the Derivative Works, in at least one
+          of the following places: within a NOTICE text file distributed
+          as part of the Derivative Works; within the Source form or
+          documentation, if provided along with the Derivative Works; or,
+          within a display generated by the Derivative Works, if and
+          wherever such third-party notices normally appear. The contents
+          of the NOTICE file are for informational purposes only and
+          do not modify the License. You may add Your own attribution
+          notices within Derivative Works that You distribute, alongside
+          or as an addendum to the NOTICE text from the Work, provided
+          that such additional attribution notices cannot be construed
+          as modifying the License.
+
+      You may add Your own copyright statement to Your modifications and
+      may provide additional or different license terms and conditions
+      for use, reproduction, or distribution of Your modifications, or
+      for any such Derivative Works as a whole, provided Your use,
+      reproduction, and distribution of the Work otherwise complies with
+      the conditions stated in this License.
+
+   5. Submission of Contributions. Unless You explicitly state otherwise,
+      any Contribution intentionally submitted for inclusion in the Work
+      by You to the Licensor shall be under the terms and conditions of
+      this License, without any additional terms or conditions.
+      Notwithstanding the above, nothing herein shall supersede or modify
+      the terms of any separate license agreement you may have executed
+      with Licensor regarding such Contributions.
+
+   6. Trademarks. This License does not grant permission to use the trade
+      names, trademarks, service marks, or product names of the Licensor,
+      except as required for reasonable and customary use in describing the
+      origin of the Work and reproducing the content of the NOTICE file.
+
+   7. Disclaimer of Warranty. Unless required by applicable law or
+      agreed to in writing, Licensor provides the Work (and each
+      Contributor provides its Contributions) on an "AS IS" BASIS,
+      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
+      implied, including, without limitation, any warranties or conditions
+      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
+      PARTICULAR PURPOSE. You are solely responsible for determining the
+      appropriateness of using or redistributing the Work and assume any
+      risks associated with Your exercise of permissions under this License.
+
+   8. Limitation of Liability. In no event and under no legal theory,
+      whether in tort (including negligence), contract, or otherwise,
+      unless required by applicable law (such as deliberate and grossly
+      negligent acts) or agreed to in writing, shall any Contributor be
+      liable to You for damages, including any direct, indirect, special,
+      incidental, or consequential damages of any character arising as a
+      result of this License or out of the use or inability to use the
+      Work (including but not limited to damages for loss of goodwill,
+      work stoppage, computer failure or malfunction, or any and all
+      other commercial damages or losses), even if such Contributor
+      has been advised of the possibility of such damages.
+
+   9. Accepting Warranty or Additional Liability. While redistributing
+      the Work or Derivative Works thereof, You may choose to offer,
+      and charge a fee for, acceptance of support, warranty, indemnity,
+      or other liability obligations and/or rights consistent with this
+      License. However, in accepting such obligations, You may act only
+      on Your own behalf and on Your sole responsibility, not on behalf
+      of any other Contributor, and only if You agree to indemnify,
+      defend, and hold each Contributor harmless for any liability
+      incurred by, or claims asserted against, such Contributor by reason
+      of your accepting any such warranty or additional liability.
+
+   END OF TERMS AND CONDITIONS
+
+   APPENDIX: How to apply the Apache License to your work.
+
+      To apply the Apache License to your work, attach the following
+      boilerplate notice, with the fields enclosed by brackets "[]"
+      replaced with your own identifying information. (Don't include
+      the brackets!) The text should be enclosed in the appropriate
+      comment syntax for the file format. We also recommend that a
+      file or class name and description of purpose be included on the
+      same "printed page" as the copyright notice for easier
+      identification within third-party archives.
+
+   Copyright [yyyy] [name of copyright owner]
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+       http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License.
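For orientation, the BWT entry points declared in libcubwt.cuh above take very little host code to drive. The sketch below is illustrative rather than part of the patch: the 15-byte input, the sampling rate r = 4, and the bare-bones error handling are assumptions of this example, and it relies on libcubwt_allocate_device_storage / libcubwt_free_device_storage from the same header. Compile with nvcc on a machine with a supported GPU.

    #include <stdint.h>
    #include <string.h>

    #include "libcubwt.cuh"

    int main(void)
    {
        uint8_t  T[15];                 /* input block, n = 15 (illustrative)      */
        uint8_t  L[15];                 /* BWT output; the API also allows L == T  */
        uint32_t I[4];                  /* (n - 1) / r + 1 = 4 auxiliary indexes   */
        void   * storage = NULL;

        memcpy(T, "mississippimiss", 15);

        /* Device storage is allocated once and can be reused for many blocks. */
        if (libcubwt_allocate_device_storage(&storage, 15) != LIBCUBWT_NO_ERROR) { return 1; }

        /* r = 4 satisfies the power-of-2 requirement documented above. */
        int64_t status = libcubwt_bwt_aux(storage, T, L, 15, 4, I);

        libcubwt_free_device_storage(storage);

        return status == LIBCUBWT_NO_ERROR ? 0 : 1;
    }

Since the storage allocation is the expensive step, it generally pays to allocate once for the largest block you expect and reuse that storage across blocks.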
diff --git a/libbsc/libbsc.h b/libbsc/libbsc.h
index b9881b5..29da2a9 100644
--- a/libbsc/libbsc.h
+++ b/libbsc/libbsc.h
@@ -33,6 +33,11 @@ See also the bsc and libbsc web site:
 #ifndef _LIBBSC_LIBBSC_H
 #define _LIBBSC_LIBBSC_H
 
+#define LIBBSC_VERSION_MAJOR 3
+#define LIBBSC_VERSION_MINOR 3
+#define LIBBSC_VERSION_PATCH 0
+#define LIBBSC_VERSION_STRING "3.3.0"
+
 #define LIBBSC_NO_ERROR 0
 #define LIBBSC_BAD_PARAMETER -1
 #define LIBBSC_NOT_ENOUGH_MEMORY -2
@@ -70,7 +75,7 @@ See also the bsc and libbsc web site:
 #define LIBBSC_FEATURE_LARGEPAGES 4
 #define LIBBSC_FEATURE_CUDA 8
 
-#define LIBBSC_DEFAULT_LZPHASHSIZE 16
+#define LIBBSC_DEFAULT_LZPHASHSIZE 15
 #define LIBBSC_DEFAULT_LZPMINLEN 128
 #define LIBBSC_DEFAULT_BLOCKSORTER LIBBSC_BLOCKSORTER_BWT
 #define LIBBSC_DEFAULT_CODER LIBBSC_CODER_QLFC_STATIC
diff --git a/libbsc/libbsc/libbsc.cpp b/libbsc/libbsc/libbsc.cpp
index 8b6f54a..763d042 100644
--- a/libbsc/libbsc/libbsc.cpp
+++ b/libbsc/libbsc/libbsc.cpp
@@ -49,6 +49,7 @@ int bsc_init_full(int features, void* (* malloc)(size_t size), void* (* zero_mal
 
     if (result == LIBBSC_NO_ERROR) result = bsc_platform_init(features, malloc, zero_malloc, free);
     if (result == LIBBSC_NO_ERROR) result = bsc_coder_init(features);
+    if (result == LIBBSC_NO_ERROR) result = bsc_bwt_init(features);
 
 #ifdef LIBBSC_SORT_TRANSFORM_SUPPORT
 
diff --git a/libbsc/platform/platform.h b/libbsc/platform/platform.h
index 7be61d9..23e2d98 100644
--- a/libbsc/platform/platform.h
+++ b/libbsc/platform/platform.h
@@ -181,31 +181,31 @@ extern "C" {
 
     /**
     * You should call this function before you call any of the other platform specific functions.
-    * @param malloc - function to use to allocate buffers 
+    * @param malloc - function to use to allocate buffers
     * @param zero_malloc - function to use to allocate zero-filled buffers
-    * @param free - function used to free buffers 
-    * @param features - the set of additional features. 
+    * @param free - function used to free buffers
+    * @param features - the set of additional features.
     * @return LIBBSC_NO_ERROR if no error occurred, error code otherwise.
     */
     int bsc_platform_init(int features, void* (* malloc)(size_t size), void* (* zero_malloc)(size_t size), void (* free)(void* address));
 
     /**
     * Allocates memory blocks.
-    * @param size - bytes to allocate. 
+    * @param size - bytes to allocate.
     * @return a pointer to allocated space or NULL if there is insufficient memory available.
     */
     void * bsc_malloc(size_t size);
 
     /**
     * Allocates memory blocks and initializes all its bits to zero.
-    * @param size - bytes to allocate. 
+    * @param size - bytes to allocate.
     * @return a pointer to allocated space or NULL if there is insufficient memory available.
     */
     void * bsc_zero_malloc(size_t size);
 
     /**
     * Deallocates or frees a memory block.
-    * @param address - previously allocated memory block to be freed. 
+    * @param address - previously allocated memory block to be freed.
     */
     void bsc_free(void * address);
 
diff --git a/libbsc/st/st.cu b/libbsc/st/st.cu
index c59df7e..531e82a 100644
--- a/libbsc/st/st.cu
+++ b/libbsc/st/st.cu
@@ -53,10 +53,17 @@ See also the bsc and libbsc web site:
 
 #ifdef LIBBSC_OPENMP
 
-omp_lock_t cuda_lock;
+omp_lock_t st_cuda_lock;
+cudaStream_t st_cuda_stream;
+
 int bsc_st_cuda_init(int features)
 {
-    omp_init_lock(&cuda_lock);
+    if (features & LIBBSC_FEATURE_CUDA)
+    {
+        omp_init_lock(&st_cuda_lock);
+        st_cuda_stream = NULL;
+    }
+
     return LIBBSC_NO_ERROR;
 }
 
@@ -155,19 +162,19 @@ void bsc_st8_encode_cuda_postsort(unsigned long long * RESTRICT K_device, unsign
     if (K_device[index] == lookup) { atomicMin(I_device, index); }
 }
 
-int bsc_st567_encode_cuda(unsigned char * T, unsigned char * T_device, int n, int num_blocks, int k)
+int bsc_st567_encode_cuda(unsigned char * T, unsigned char * T_device, int n, int num_blocks, int k, cudaStream_t st_cuda_stream)
 {
     int index = LIBBSC_GPU_NOT_ENOUGH_MEMORY;
     {
         unsigned long long * K_device = NULL;
         unsigned long long * K_device_sorted = NULL;
 
-        if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&K_device, 2 * (n + 2 * CUDA_DEVICE_PADDING) * sizeof(unsigned long long))) == cudaSuccess)
+        if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaMallocAsync((void **)&K_device, 2 * (n + 2 * CUDA_DEVICE_PADDING) * sizeof(unsigned long long), st_cuda_stream)) == cudaSuccess)
         {
             index = LIBBSC_GPU_ERROR;
 
             cudaError_t status = cudaSuccess;
-            bsc_st567_encode_cuda_presort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK>>>(T_device, K_device);
+            bsc_st567_encode_cuda_presort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK, 0, st_cuda_stream>>>(T_device, K_device);
 
             if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
             {
@@ -177,78 +184,81 @@ int bsc_st567_encode_cuda(unsigned char * T, unsigned char * T_device, int n, in
                 void * d_temp_storage = NULL; size_t temp_storage_bytes = 0;
 
-                status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, n, (7 - k) * 8, 56), status);
+                status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, n, (7 - k) * 8, 56, st_cuda_stream), status);
 
                 if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
                 {
-                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMalloc(&d_temp_storage, temp_storage_bytes), status);
+                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMallocAsync(&d_temp_storage, temp_storage_bytes, st_cuda_stream), status);
 
                     if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
                     {
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, n, (7 - k) * 8, 56), status);
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFree(d_temp_storage), status);
-                    }
-                }
+                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortKeys(d_temp_storage, temp_storage_bytes, d_keys, n, (7 - k) * 8, 56, st_cuda_stream), status);
 
-                K_device_sorted = d_keys.Current();
-            }
+                        if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                        {
+                            K_device_sorted = d_keys.Current();
 
-            if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-            {
-                unsigned long long lookup;
-                {
-                    unsigned int lo = (T[3    ] << 24) | (T[4] << 16) | (T[5] << 8) | T[6];
-                    unsigned int hi = (T[n - 1] << 24) | (T[0] << 16) | (T[1] << 8) | T[2];
+                            unsigned long long lookup;
+                            {
+                                unsigned int lo = (T[3    ] << 24) | (T[4] << 16) | (T[5] << 8) | T[6];
+                                unsigned int hi = (T[n - 1] << 24) | (T[0] << 16) | (T[1] << 8) | T[2];
 
-                    lookup = (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
+                                lookup = (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
 
-                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T_device - sizeof(int), &n, sizeof(int), cudaMemcpyHostToDevice), status);
-                }
+                                status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T_device - sizeof(int), &n, sizeof(int), cudaMemcpyHostToDevice, st_cuda_stream), status);
+                            }
 
-                if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                {
-                    bsc_st567_encode_cuda_postsort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK>>>(T_device, K_device_sorted, lookup, (int *)(T_device - sizeof(int)));
+                            if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                            {
+                                bsc_st567_encode_cuda_postsort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK, 0, st_cuda_stream>>>(T_device, K_device_sorted, lookup, (int *)(T_device - sizeof(int)));
 
-                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFree(K_device), status);
+                                if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                                {
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T_device + n, T_device - sizeof(int), sizeof(int), cudaMemcpyDeviceToDevice, st_cuda_stream), status);
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T, T_device, n + sizeof(int), cudaMemcpyDeviceToHost, st_cuda_stream), status);
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(st_cuda_stream), status);
+                                }
 
-                    if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                    {
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T_device + n, T_device - sizeof(int), sizeof(int), cudaMemcpyDeviceToDevice), status);
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T, T_device, n + sizeof(int), cudaMemcpyDeviceToHost), status);
-                    }
+                                status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFreeAsync(d_temp_storage, st_cuda_stream), status);
+                                status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFreeAsync(K_device, st_cuda_stream), status);
 
-                    if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                    {
-                        index = *(int *)(T + n);
-                    }
+                                if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                                {
+                                    index = *(int *)(T + n);
+                                }
 
-                    return index;
+                                return index;
+                            }
+                        }
+
+                        cudaFreeAsync(d_temp_storage, st_cuda_stream);
+                    }
                 }
             }
 
-            cudaFree(K_device);
+            cudaFreeAsync(K_device, st_cuda_stream);
         }
     }
 
     return index;
 }
 
-int bsc_st8_encode_cuda(unsigned char * T, unsigned char * T_device, int n, int num_blocks)
+int bsc_st8_encode_cuda(unsigned char * T, unsigned char * T_device, int n, int num_blocks, cudaStream_t st_cuda_stream)
 {
     int index = LIBBSC_GPU_NOT_ENOUGH_MEMORY;
     {
         unsigned char * V_device = NULL;
         unsigned char * V_device_sorted = NULL;
 
-        if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&V_device, 2 * (n + 2 * CUDA_DEVICE_PADDING) * sizeof(unsigned char))) == cudaSuccess)
+        if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaMallocAsync((void **)&V_device, 2 * (n + 2 * CUDA_DEVICE_PADDING) * sizeof(unsigned char), st_cuda_stream)) == cudaSuccess)
         {
             unsigned long long * K_device = NULL;
             unsigned long long * K_device_sorted = NULL;
 
-            if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&K_device, 2 * (n + 2 * CUDA_DEVICE_PADDING) * sizeof(unsigned long long))) == cudaSuccess)
+            if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaMallocAsync((void **)&K_device, 2 * (n + 2 * CUDA_DEVICE_PADDING) * sizeof(unsigned long long), st_cuda_stream)) == cudaSuccess)
             {
                 index = LIBBSC_GPU_ERROR;
 
                 cudaError_t status = cudaSuccess;
-                bsc_st8_encode_cuda_presort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK>>>(T_device, K_device, V_device);
+                bsc_st8_encode_cuda_presort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK, 0, st_cuda_stream>>>(T_device, K_device, V_device);
 
                 if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
                 {
@@ -260,58 +270,61 @@ int bsc_st8_encode_cuda(unsigned char * T, unsigned char * T_device, int n, int
                     void * d_temp_storage = NULL; size_t temp_storage_bytes = 0;
 
-                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, n), status);
+                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, n, 0, 64, st_cuda_stream), status);
 
                     if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
                     {
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMalloc(&d_temp_storage, temp_storage_bytes), status);
+                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMallocAsync(&d_temp_storage, temp_storage_bytes, st_cuda_stream), status);
 
                         if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
                         {
-                            status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, n), status);
-                            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFree(d_temp_storage), status);
-                        }
-                    }
+                            status = bsc_cuda_safe_call(__FILE__, __LINE__, cub::DeviceRadixSort::SortPairs(d_temp_storage, temp_storage_bytes, d_keys, d_values, n, 0, 64, st_cuda_stream), status);
 
-                    K_device_sorted = d_keys.Current();
-                    V_device_sorted = d_values.Current();
-                }
+                            if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                            {
+                                K_device_sorted = d_keys.Current();
+                                V_device_sorted = d_values.Current();
 
-                if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                {
-                    unsigned long long lookup;
-                    {
-                        unsigned int lo = (T[4] << 24) | (T[5] << 16) | (T[6] << 8) | T[7];
-                        unsigned int hi = (T[0] << 24) | (T[1] << 16) | (T[2] << 8) | T[3];
+                                unsigned long long lookup;
+                                {
+                                    unsigned int lo = (T[4] << 24) | (T[5] << 16) | (T[6] << 8) | T[7];
+                                    unsigned int hi = (T[0] << 24) | (T[1] << 16) | (T[2] << 8) | T[3];
 
-                        lookup = (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
+                                    lookup = (((unsigned long long)hi) << 32) | ((unsigned long long)lo);
 
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(V_device_sorted + ((n + sizeof(int) - 1) / sizeof(int)) * sizeof(int), &n, sizeof(int), cudaMemcpyHostToDevice), status);
-                    }
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(V_device_sorted + ((n + sizeof(int) - 1) / sizeof(int)) * sizeof(int), &n, sizeof(int), cudaMemcpyHostToDevice, st_cuda_stream), status);
+                                }
 
-                    if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                    {
-                        bsc_st8_encode_cuda_postsort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK>>>(K_device_sorted, lookup, (int *)(V_device_sorted + ((n + sizeof(int) - 1) / sizeof(int)) * sizeof(int)));
+                                if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                                {
+                                    bsc_st8_encode_cuda_postsort<<<num_blocks, CUDA_NUM_THREADS_IN_BLOCK, 0, st_cuda_stream>>>(K_device_sorted, lookup, (int *)(V_device_sorted + ((n + sizeof(int) - 1) / sizeof(int)) * sizeof(int)));
 
-                        if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                        {
-                            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T, V_device_sorted, n + 2 * sizeof(int), cudaMemcpyDeviceToHost), status);
-                        }
+                                    if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                                    {
+                                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T, V_device_sorted, n + 2 * sizeof(int), cudaMemcpyDeviceToHost, st_cuda_stream), status);
+                                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(st_cuda_stream), status);
+                                    }
 
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFree(K_device), status);
-                        status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFree(V_device), status);
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFreeAsync(d_temp_storage, st_cuda_stream), status);
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFreeAsync(K_device, st_cuda_stream), status);
+                                    status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaFreeAsync(V_device, st_cuda_stream), status);
 
-                        if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
-                        {
-                            index = *(int *)(T + ((n + sizeof(int) - 1) / sizeof(int)) * sizeof(int));
-                        }
+                                    if (bsc_cuda_safe_call(__FILE__, __LINE__, status) == cudaSuccess)
+                                    {
+                                        index = *(int *)(T + ((n + sizeof(int) - 1) / sizeof(int)) * sizeof(int));
+                                    }
 
-                        return index;
+                                    return index;
+                                }
+                            }
+
+                            cudaFreeAsync(d_temp_storage, st_cuda_stream);
+                        }
                     }
                 }
 
-                cudaFree(K_device);
+                cudaFreeAsync(K_device, st_cuda_stream);
             }
 
-            cudaFree(V_device);
+            cudaFreeAsync(V_device, st_cuda_stream);
         }
     }
 
@@ -340,33 +353,48 @@ int bsc_st_encode_cuda(unsigned char * T, int n, int k, int features)
     }
 
 #ifdef LIBBSC_OPENMP
-    omp_set_lock(&cuda_lock);
+    omp_set_lock(&st_cuda_lock);
+
+    #else
+    cudaStream_t st_cuda_stream = NULL;
 #endif
 
+    if (st_cuda_stream == NULL)
+    {
+        if (bsc_cuda_safe_call(__FILE__, __LINE__, cudaStreamCreate(&st_cuda_stream)) != cudaSuccess)
+        {
+            st_cuda_stream = NULL;
+        }
+    }
+
     int index = LIBBSC_GPU_NOT_ENOUGH_MEMORY;
     {
         unsigned char * T_device = NULL;
 
-        if (cudaMalloc((void **)&T_device, n + 2 * CUDA_DEVICE_PADDING) == cudaSuccess)
+        if (st_cuda_stream != NULL && cudaMallocAsync((void **)&T_device, n + 2 * CUDA_DEVICE_PADDING, st_cuda_stream) == cudaSuccess)
         {
             index = LIBBSC_GPU_ERROR;
 
             cudaError_t status = cudaSuccess;
-            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T_device + CUDA_DEVICE_PADDING    , T                              , n                  , cudaMemcpyHostToDevice  ), status);
-            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T_device + CUDA_DEVICE_PADDING + n, T_device + CUDA_DEVICE_PADDING, CUDA_DEVICE_PADDING, cudaMemcpyDeviceToDevice), status);
-            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpy(T_device                          , T_device + n                  , CUDA_DEVICE_PADDING, cudaMemcpyDeviceToDevice), status);
+            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T_device + CUDA_DEVICE_PADDING    , T                              , n                  , cudaMemcpyHostToDevice  , st_cuda_stream), status);
+            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T_device + CUDA_DEVICE_PADDING + n, T_device + CUDA_DEVICE_PADDING, CUDA_DEVICE_PADDING, cudaMemcpyDeviceToDevice, st_cuda_stream), status);
+            status = bsc_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(T_device                          , T_device + n                  , CUDA_DEVICE_PADDING, cudaMemcpyDeviceToDevice, st_cuda_stream), status);
 
             if (status == cudaSuccess)
             {
-                if (k >= 5 && k <= 7) index = bsc_st567_encode_cuda(T, T_device + CUDA_DEVICE_PADDING, n, num_blocks, k);
-                if (k == 8)           index = bsc_st8_encode_cuda  (T, T_device + CUDA_DEVICE_PADDING, n, num_blocks   );
+                if (k >= 5 && k <= 7) index = bsc_st567_encode_cuda(T, T_device + CUDA_DEVICE_PADDING, n, num_blocks, k, st_cuda_stream);
+                if (k == 8)           index = bsc_st8_encode_cuda  (T, T_device + CUDA_DEVICE_PADDING, n, num_blocks   , st_cuda_stream);
             }
 
-            cudaFree(T_device);
+            cudaFreeAsync(T_device, st_cuda_stream);
        }
     }
 
 #ifdef LIBBSC_OPENMP
-    omp_unset_lock(&cuda_lock);
+    omp_unset_lock(&st_cuda_lock);
+    #else
+    if (st_cuda_stream != NULL)
+    {
+        bsc_cuda_safe_call(__FILE__, __LINE__, cudaStreamDestroy(st_cuda_stream));
+    }
 #endif
 
     return index;
diff --git a/makefile b/makefile
index b9f15c0..23e3001 100644
--- a/makefile
+++ b/makefile
@@ -35,7 +35,6 @@ PREFIX = /usr
 OBJS = \
       adler32.o \
       libsais.o \
-      bwt.o \
       coder.o \
       qlfc.o \
       qlfc_model.o \
@@ -45,6 +44,7 @@ OBJS = \
       lzp.o \
       platform.o \
       st.o \
+      bwt.o \
 
 all: libbsc.a bsc
 
@@ -80,9 +80,6 @@ adler32.o: libbsc/adler32/adler32.cpp
 libsais.o: libbsc/bwt/libsais/libsais.c
	$(CC) $(CFLAGS) -c libbsc/bwt/libsais/libsais.c
 
-bwt.o: libbsc/bwt/bwt.cpp
-	$(CC) $(CFLAGS) -c libbsc/bwt/bwt.cpp
-
 coder.o: libbsc/coder/coder.cpp
	$(CC) $(CFLAGS) -c libbsc/coder/coder.cpp
 
@@ -109,3 +106,6 @@ platform.o: libbsc/platform/platform.cpp
 
 st.o: libbsc/st/st.cpp
	$(CC) $(CFLAGS) -c libbsc/st/st.cpp
+
+bwt.o: libbsc/bwt/bwt.cpp
+	$(CC) $(CFLAGS) -c libbsc/bwt/bwt.cpp
diff --git a/makefile.cuda b/makefile.cuda
index edd64cf..40c99c3 100644
--- a/makefile.cuda
+++ b/makefile.cuda
@@ -1,8 +1,9 @@
 SHELL = /bin/sh
 
 CUDA_PATH = /usr/local/cuda
-CUDA_INCLUDE_PATH = $(CUDA_PATH)/include
 CUDA_BIN_PATH = $(CUDA_PATH)/bin
+CUDA_LIB_PATH = $(CUDA_PATH)/lib64
+CUDA_INCLUDE_PATH = $(CUDA_PATH)/include
 
 NVCC = $(CUDA_BIN_PATH)/nvcc
 NVCCFLAGS = -m64 -O3 -Xcompiler=-fopenmp -arch=all -Wno-deprecated-gpu-targets -I $(CUDA_INCLUDE_PATH)
@@ -46,7 +47,6 @@ PREFIX = /usr
 OBJS = \
       adler32.o \
       libsais.o \
-      bwt.o \
       coder.o \
       qlfc.o \
       qlfc_model.o \
@@ -55,13 +55,15 @@ OBJS = \
       libbsc.o \
       lzp.o \
       platform.o \
-      st.cpu.o \
-      st.gpu.o \
+      st.o \
+      bwt.o \
+      st.cuda.o \
+      bwt.cuda.o \
 
 all: libbsc.a bsc
 
 bsc: libbsc.a bsc.cpp
-	$(CC) $(CFLAGS) bsc.cpp -o bsc -L. -lbsc -lcudart
+	$(CC) $(CFLAGS) bsc.cpp -o bsc -L. -L$(CUDA_LIB_PATH) -lbsc -lcudart
 
 libbsc.a: $(OBJS)
	rm -f libbsc.a
@@ -92,9 +94,6 @@ adler32.o: libbsc/adler32/adler32.cpp
 libsais.o: libbsc/bwt/libsais/libsais.c
	$(CC) $(CFLAGS) -c libbsc/bwt/libsais/libsais.c
 
-bwt.o: libbsc/bwt/bwt.cpp
-	$(CC) $(CFLAGS) -c libbsc/bwt/bwt.cpp
-
 coder.o: libbsc/coder/coder.cpp
	$(CC) $(CFLAGS) -c libbsc/coder/coder.cpp
 
@@ -119,8 +118,14 @@ lzp.o: libbsc/lzp/lzp.cpp
 platform.o: libbsc/platform/platform.cpp
	$(CC) $(CFLAGS) -c libbsc/platform/platform.cpp
 
-st.cpu.o: libbsc/st/st.cpp
-	$(CC) $(CFLAGS) -c libbsc/st/st.cpp -o st.cpu.o
+st.o: libbsc/st/st.cpp
+	$(CC) $(CFLAGS) -c libbsc/st/st.cpp
+
+bwt.o: libbsc/bwt/bwt.cpp
+	$(CC) $(CFLAGS) -c libbsc/bwt/bwt.cpp
+
+st.cuda.o: libbsc/st/st.cu
+	$(NVCC) $(NVCCFLAGS) $(NVCCEXTRAFLAGS) -c libbsc/st/st.cu -o st.cuda.o
 
-st.gpu.o: libbsc/st/st.cu
-	$(NVCC) $(NVCCFLAGS) $(NVCCEXTRAFLAGS) -c libbsc/st/st.cu -o st.gpu.o
+bwt.cuda.o: libbsc/bwt/libcubwt/libcubwt.cu
+	$(NVCC) $(NVCCFLAGS) $(NVCCEXTRAFLAGS) -c libbsc/bwt/libcubwt/libcubwt.cu -o bwt.cuda.o
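The st.cu rewrite above is, at bottom, a migration from the synchronous cudaMalloc / cudaMemcpy / cudaFree calls to their stream-ordered counterparts, with a single cudaStreamSynchronize once the host needs the result. The sketch below shows that pattern in isolation; it is not bsc code: the touch kernel, the 256-thread launch configuration, and the minimal error handling are placeholders, and cudaMallocAsync / cudaFreeAsync require CUDA 11.2 or newer.

    #include <cuda_runtime.h>

    __global__ void touch(unsigned char * p, int n)
    {
        int i = blockIdx.x * blockDim.x + threadIdx.x;
        if (i < n) { p[i] = (unsigned char)(p[i] + 1); }    /* placeholder for a real kernel */
    }

    int process_block(unsigned char * host, int n)
    {
        cudaStream_t    stream = NULL;
        unsigned char * device = NULL;

        if (cudaStreamCreate(&stream) != cudaSuccess) { return -1; }

        /* Allocation, copies, the kernel and the free are all ordered on */
        /* one stream; nothing below blocks the rest of the device.       */
        cudaMallocAsync((void **)&device, n, stream);
        cudaMemcpyAsync(device, host, n, cudaMemcpyHostToDevice, stream);
        touch<<<(n + 255) / 256, 256, 0, stream>>>(device, n);
        cudaMemcpyAsync(host, device, n, cudaMemcpyDeviceToHost, stream);
        cudaFreeAsync(device, stream);

        /* One synchronization point, once the host actually needs the result. */
        cudaError_t status = cudaStreamSynchronize(stream);

        cudaStreamDestroy(stream);
        return status == cudaSuccess ? 0 : -1;
    }

Because every call is queued on the same stream, cudaFreeAsync can be issued before the copies have completed: the release is deferred until the stream reaches that point, which is what allows the encoder to free its buffers without an extra synchronization.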