From cb2ceeb213e74c88447cc21795b97b8f33012afd Mon Sep 17 00:00:00 2001 From: Ilya Grebnov Date: Wed, 24 Jan 2024 20:37:07 -0800 Subject: [PATCH] bsc 3.3.4 --- CHANGES | 3 + README | 7 +- VERSION | 2 +- bsc.cpp | 6 +- libbsc/adler32/adler32.cpp | 2 +- libbsc/adler32/adler32.h | 2 +- libbsc/bwt/bwt.cpp | 54 +- libbsc/bwt/bwt.h | 2 +- libbsc/bwt/libcubwt/AUTHORS | 3 +- libbsc/bwt/libcubwt/CHANGES | 3 + libbsc/bwt/libcubwt/VERSION | 2 +- libbsc/bwt/libcubwt/libcubwt.cu | 829 ++++++++++++++++++++++++++++++- libbsc/bwt/libcubwt/libcubwt.cuh | 31 +- libbsc/coder/coder.cpp | 2 +- libbsc/coder/coder.h | 2 +- libbsc/coder/common/predictor.h | 2 +- libbsc/coder/common/rangecoder.h | 2 +- libbsc/coder/common/tables.h | 2 +- libbsc/coder/qlfc/qlfc.cpp | 2 +- libbsc/coder/qlfc/qlfc.h | 2 +- libbsc/coder/qlfc/qlfc_model.cpp | 2 +- libbsc/coder/qlfc/qlfc_model.h | 2 +- libbsc/filters.h | 2 +- libbsc/filters/detectors.cpp | 2 +- libbsc/filters/preprocessing.cpp | 2 +- libbsc/filters/tables.h | 2 +- libbsc/libbsc.h | 6 +- libbsc/libbsc/libbsc.cpp | 2 +- libbsc/lzp/lzp.cpp | 2 +- libbsc/lzp/lzp.h | 2 +- libbsc/platform/platform.cpp | 2 +- libbsc/platform/platform.h | 2 +- libbsc/st/st.cpp | 2 +- libbsc/st/st.cu | 2 +- libbsc/st/st.cuh | 2 +- libbsc/st/st.h | 2 +- 36 files changed, 931 insertions(+), 65 deletions(-) diff --git a/CHANGES b/CHANGES index 628721a..e2783b2 100644 --- a/CHANGES +++ b/CHANGES @@ -1,3 +1,6 @@ +Changes in 3.3.4 (January, 24 2024) +- Implemented GPU acceleration of reverse Burrows–Wheeler transform. + Changes in 3.3.3 (November, 26 2023) - Fixed out-of-bound memory access issue for large inputs. - Slightly improved compression performance. diff --git a/README b/README index 03e5a06..822c3b2 100644 --- a/README +++ b/README @@ -10,7 +10,7 @@ block-sorting data compression algorithms. libbsc is a library based on bsc, it uses the same algorithms as bsc and enables you to compress memory blocks. -Copyright (c) 2009-2023 Ilya Grebnov +Copyright (c) 2009-2024 Ilya Grebnov See file AUTHORS for a full list of contributors. @@ -21,7 +21,7 @@ See the bsc and libbsc web site: Software License: ----------------- -Copyright (c) 2009-2023 Ilya Grebnov +Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -54,7 +54,8 @@ Compression and decompression requirements are the same and in bytes, can be estimated as 16Mb + 5 x block size x number of blocks processed in parallel. GPU memory usage for NVIDIA CUDA technology is different from CPU memory usage -and can be estimated as 20 x block size for ST and 21 x block size for BWT. +and can be estimated as 20 x block size for ST, 21 x block size for forward BWT +and 7 x block size for inverse BWT. NVIDIA GPU acceleration: diff --git a/VERSION b/VERSION index 3f09e91..2c6109e 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -3.3.3 \ No newline at end of file +3.3.4 \ No newline at end of file diff --git a/bsc.cpp b/bsc.cpp index ee9396e..1925291 100644 --- a/bsc.cpp +++ b/bsc.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
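As a worked example of the memory estimates in the README change above (illustration only: the multipliers are the ones stated in the README; the 25 MiB single-block scenario is an assumed example):

    /* Worked illustration of the README memory estimates; the 25 MiB block
       size and single-block scenario are assumed examples. */
    #include <stdio.h>

    int main(void)
    {
        long long block  = 25LL << 20;  /* hypothetical block size: 25 MiB                 */
        long long blocks = 1;           /* blocks processed in parallel                    */

        long long cpu     = (16LL << 20) + 5 * block * blocks; /* 16Mb + 5 x block x blocks */
        long long gpu_st  = 20 * block; /* NVIDIA CUDA, ST                                 */
        long long gpu_fwd = 21 * block; /* NVIDIA CUDA, forward BWT                        */
        long long gpu_inv =  7 * block; /* NVIDIA CUDA, inverse BWT (new in 3.3.4)         */

        printf("CPU: %lld MiB\n", cpu >> 20);                        /* 141 MiB            */
        printf("GPU: ST %lld, forward BWT %lld, inverse BWT %lld MiB\n",
               gpu_st >> 20, gpu_fwd >> 20, gpu_inv >> 20);          /* 500, 525, 175 MiB  */
        return 0;
    }

At 7 x block size, the inverse BWT needs exactly one third of the forward transform's 21 x, which is what lets bsc_bwt_gpu_decode below size its device storage at roughly n / 3.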
@@ -869,8 +869,8 @@ void ProcessCommandline(int argc, char * argv[]) int main(int argc, char * argv[]) { - fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.3. 26 November 2023.\n"); - fprintf(stdout, "Copyright (c) 2009-2023 Ilya Grebnov .\n\n"); + fprintf(stdout, "This is bsc, Block Sorting Compressor. Version 3.3.4. 24 January 2024.\n"); + fprintf(stdout, "Copyright (c) 2009-2024 Ilya Grebnov .\n\n"); #if defined(_OPENMP) && defined(__INTEL_COMPILER) diff --git a/libbsc/adler32/adler32.cpp b/libbsc/adler32/adler32.cpp index ea0381a..b09b894 100644 --- a/libbsc/adler32/adler32.cpp +++ b/libbsc/adler32/adler32.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/adler32/adler32.h b/libbsc/adler32/adler32.h index f9458bf..3f24c97 100644 --- a/libbsc/adler32/adler32.h +++ b/libbsc/adler32/adler32.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/bwt/bwt.cpp b/libbsc/bwt/bwt.cpp index b4b44e9..e1e054e 100644 --- a/libbsc/bwt/bwt.cpp +++ b/libbsc/bwt/bwt.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
@@ -226,13 +226,63 @@ int bsc_bwt_encode(unsigned char * T, int n, unsigned char * num_indexes, int * return LIBBSC_NOT_ENOUGH_MEMORY; } +int bsc_bwt_gpu_decode(unsigned char * T, int n, int index, int features) +{ + int result = -1; + +#ifdef LIBBSC_CUDA_SUPPORT + if (features & LIBBSC_FEATURE_CUDA) + { + int storage_approx_length = (n / 3) | 0x1fffff; + +#ifdef LIBBSC_OPENMP + omp_set_lock(&bwt_cuda_lock); + + if (bwt_cuda_device_storage_size < storage_approx_length) + { + if (bwt_cuda_device_storage != NULL) + { + libcubwt_free_device_storage(bwt_cuda_device_storage); + + bwt_cuda_device_storage = NULL; + bwt_cuda_device_storage_size = 0; + } + + if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, storage_approx_length + (storage_approx_length / 32)) == LIBCUBWT_NO_ERROR) + { + bwt_cuda_device_storage_size = storage_approx_length + (storage_approx_length / 32); + } + } + + if (bwt_cuda_device_storage_size >= storage_approx_length) + { + result = (int)libcubwt_unbwt(bwt_cuda_device_storage, T, T, n, NULL, index); + } + + omp_unset_lock(&bwt_cuda_lock); +#else + void * bwt_cuda_device_storage = NULL; + + if (libcubwt_allocate_device_storage(&bwt_cuda_device_storage, storage_approx_length) == LIBCUBWT_NO_ERROR) + { + result = (int)libcubwt_unbwt(bwt_cuda_device_storage, T, T, n, NULL, index); + + libcubwt_free_device_storage(bwt_cuda_device_storage); + } +#endif + } +#endif + + return result; +} + int bsc_bwt_decode(unsigned char * T, int n, int index, unsigned char num_indexes, int * indexes, int features) { if ((T == NULL) || (n < 0) || (index <= 0) || (index > n)) { return LIBBSC_BAD_PARAMETER; } - if (n <= 1) + if (n <= 1 || bsc_bwt_gpu_decode(T, n, index, features) == 0) { return LIBBSC_NO_ERROR; } diff --git a/libbsc/bwt/bwt.h b/libbsc/bwt/bwt.h index cddbbaf..6a9f25f 100644 --- a/libbsc/bwt/bwt.h +++ b/libbsc/bwt/bwt.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/bwt/libcubwt/AUTHORS b/libbsc/bwt/libcubwt/AUTHORS index 56fe850..b4fb785 100644 --- a/libbsc/bwt/libcubwt/AUTHORS +++ b/libbsc/bwt/libcubwt/AUTHORS @@ -7,4 +7,5 @@ Leyuan Wang, Sean Baxter, John D. Owens, Yury Shukhrov, Rory Mitchell, Jacopo Pantaleoni, Duane Merrill, Georgy Evtushenko, Allison Vacanti, Robert Crovella, - Mark Harris. + Mark Harris, Vitaly Osipov, Andy Adinets, Elias Stehle, + Michael Maniscalco. diff --git a/libbsc/bwt/libcubwt/CHANGES b/libbsc/bwt/libcubwt/CHANGES index 446d5d8..91b895a 100644 --- a/libbsc/bwt/libcubwt/CHANGES +++ b/libbsc/bwt/libcubwt/CHANGES @@ -1,3 +1,6 @@ +Changes in 1.6.0 (January 24, 2024) +- Inverse Burrows-Wheeler transform. + Changes in 1.5.0 (March 24, 2023) - Reduced memory usage and improved performance. diff --git a/libbsc/bwt/libcubwt/VERSION b/libbsc/bwt/libcubwt/VERSION index 3e1ad72..ce6a70b 100644 --- a/libbsc/bwt/libcubwt/VERSION +++ b/libbsc/bwt/libcubwt/VERSION @@ -1 +1 @@ -1.5.0 \ No newline at end of file +1.6.0 \ No newline at end of file diff --git a/libbsc/bwt/libcubwt/libcubwt.cu b/libbsc/bwt/libcubwt/libcubwt.cu index d52b3d1..5bb96a6 100644 --- a/libbsc/bwt/libcubwt/libcubwt.cu +++ b/libbsc/bwt/libcubwt/libcubwt.cu @@ -1,9 +1,9 @@ /*-- This file is a part of libcubwt, a library for CUDA accelerated -burrows wheeler transform construction. 
+burrows wheeler transform construction and inversion. - Copyright (c) 2022-2023 Ilya Grebnov + Copyright (c) 2022-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -68,6 +68,19 @@ Please see the file LICENSE for full copyright and license details. #define CUDA_WARP_THREADS (32) #define CUDA_DEVICE_PADDING (12 * 768) +#define ALPHABET_SIZE (256) + +template +struct OffsetToPointerOperator +{ + BaseTypeT * d_base; + + __host__ __device__ __forceinline__ BaseTypeT * operator()(const OffsetTypeT& offset) const + { + return d_base + offset; + } +}; + typedef struct LIBCUBWT_DEVICE_STORAGE { void * device_rsort_temp_storage; @@ -98,7 +111,12 @@ typedef struct LIBCUBWT_DEVICE_STORAGE uint2 * device_descriptors_small; void * device_storage; + size_t device_storage_size; + int32_t device_L2_cache_bits; + int32_t device_multiprocessor_count; + int32_t device_multiprocessor_max_blocks; + int32_t device_multiprocessor_max_threads; void * host_pinned_storage; size_t host_pinned_storage_size; @@ -136,7 +154,7 @@ static cudaError_t libcubwt_cuda_safe_call(const char * filename, int32_t line, } template -__device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) +static __device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) { #if CUDA_DEVICE_ARCH >= 800 && !defined(__CUDA__) return __reduce_add_sync((uint32_t)-1, value); @@ -153,7 +171,7 @@ __device__ __forceinline__ T libcubwt_warp_reduce_sum(T value) } template -__device__ __forceinline__ T libcubwt_warp_reduce_max(T value) +static __device__ __forceinline__ T libcubwt_warp_reduce_max(T value) { #if CUDA_DEVICE_ARCH >= 800 && !defined(__CUDA__) return __reduce_max_sync((uint32_t)-1, value); @@ -170,7 +188,7 @@ __device__ __forceinline__ T libcubwt_warp_reduce_max(T value) } template -__device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay) +static __device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay) { #if CUDA_DEVICE_ARCH >= 700 __nanosleep(delay); @@ -179,6 +197,71 @@ __device__ __forceinline__ void libcubwt_delay_or_prevent_hoisting(T delay) #endif } +template +static __device__ __forceinline__ uint32_t libcubwt_match_any_sync(const uint32_t sync_mask, const uint32_t input) +{ + uint32_t peers_mask = sync_mask; + + #pragma unroll + for (uint32_t bit = 0; bit < INPUT_BITS; bit += 1) + { + uint32_t peers, bit_mask = 1 << bit; + + asm("{\n" + " .reg .pred p;\n" + " and.b32 %0, %1, %2;\n" + " setp.ne.u32 p, %0, 0;\n" + " vote.ballot.sync.b32 %0, p, %3;\n" + " @!p not.b32 %0, %0;\n" + " }" + : "=r"(peers) : "r"(input), "r"(bit_mask), "r"(sync_mask)); + + peers_mask &= peers; + } + + return peers_mask; +} + +static __device__ __forceinline__ uint32_t libcubwt_xxhash32_b32(uint32_t data, uint32_t seed) +{ + uint32_t x = (data * 0xc2b2ae3du) + seed + (uint32_t)sizeof(data) + 0x165667b1u; + + x = (x << 17) | (x >> 15); + x *= 0x27d4eb2fu; x ^= x >> 15; + x *= 0x85ebca77u; x ^= x >> 13; + x *= 0xc2b2ae3du; x ^= x >> 16; + + return x; +} + +static __device__ __forceinline__ uint4 libcubwt_shift_left_b128(uint4 v, uint32_t shift) +{ + if ((shift & 8) > 0) { v.w = v.y; v.z = v.x; v.y = 0 ; v.x = 0; } + if ((shift & 4) > 0) { v.w = v.z; v.z = v.y; v.y = v.x; v.x = 0; } + + shift = (shift << 3) & 0x18u; + v.w = __funnelshift_l(v.z, v.w, shift); + v.z = __funnelshift_l(v.y, v.z, shift); + v.y = __funnelshift_l(v.x, v.y, shift); + v.x = __funnelshift_l(0x0, v.x, shift); + + return v; +} + +static __device__ 
__forceinline__ uint4 libcubwt_shift_right_b128(uint4 v, uint32_t shift) +{ + if ((shift & 8) > 0) { v.x = v.z; v.y = v.w; v.z = 0 ; v.w = 0; } + if ((shift & 4) > 0) { v.x = v.y; v.y = v.z; v.z = v.w; v.w = 0; } + + shift = (shift << 3) & 0x18u; + v.x = __funnelshift_r(v.x, v.y, shift); + v.y = __funnelshift_r(v.y, v.z, shift); + v.z = __funnelshift_r(v.z, v.w, shift); + v.w = __funnelshift_r(v.w, 0x0, shift); + + return v; +} + __global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) static void libcubwt_gather_values_uint32_kernel(const uint32_t * device_idx, const uint32_t * RESTRICT device_src, uint32_t * device_dst, uint32_t m) { @@ -1901,7 +1984,7 @@ static void libcubwt_merge_suffixes_kernel( uint2 mod12_suffix = shared_storage.stage1.suffixes_l[mod12_index]; #pragma unroll - for (uint32_t item = 0; item < 5; ++item) + for (uint32_t item = 0; item < 5; item += 1) { bool predicate = libcubwt_compare_suffixes_kernel(mod0l_suffix, mod0h_suffix, mod12_suffix); @@ -1916,7 +1999,7 @@ static void libcubwt_merge_suffixes_kernel( { #pragma unroll - for (uint32_t item = 0; item < 5; ++item) + for (uint32_t item = 0; item < 5; item += 1) { if (suffixes[item] >= 0x01000000u) { @@ -2169,6 +2252,9 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len int32_t cuda_device_capability; libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaGetDevice(&cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&storage->device_multiprocessor_max_blocks, cudaDevAttrMaxBlocksPerMultiprocessor, cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&storage->device_multiprocessor_max_threads, cudaDevAttrMaxThreadsPerMultiProcessor, cuda_device_ordinal), status); + libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&storage->device_multiprocessor_count, cudaDevAttrMultiProcessorCount, cuda_device_ordinal), status); libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaDeviceGetAttribute(&cuda_device_L2_cache_size, cudaDevAttrL2CacheSize, cuda_device_ordinal), status); libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::PtxVersion(cuda_device_capability, cuda_device_ordinal), status); @@ -2176,7 +2262,7 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len { storage->device_L2_cache_bits = 0; while (cuda_device_L2_cache_size >>= 1) { storage->device_L2_cache_bits += 1; }; - storage->cuda_block_threads = (cuda_device_capability == 860 || cuda_device_capability == 870 || cuda_device_capability == 890) ? 768 : 512; + storage->cuda_block_threads = (cuda_device_capability == 860 || cuda_device_capability == 870 || cuda_device_capability == 890) ? 
768u : 512u; } } @@ -2210,27 +2296,29 @@ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_len if (status == cudaSuccess) { - size_t device_storage_size = 0; + storage->device_storage_size = 0; - device_storage_size += storage->device_ssort_temp_storage_size; - device_storage_size += storage->device_rsort_temp_storage_size; + storage->device_storage_size += storage->device_ssort_temp_storage_size; + storage->device_storage_size += storage->device_rsort_temp_storage_size; - device_storage_size += (max_expanded_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); + storage->device_storage_size += (max_expanded_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint8_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); + storage->device_storage_size += (max_reduced_length + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint64_t); - device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); - device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); - device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint2); + storage->device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + storage->device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint4); + storage->device_storage_size += (num_descriptors + (int64_t)2 * CUDA_DEVICE_PADDING) * sizeof(uint2); - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_storage, device_storage_size), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMalloc((void **)&storage->device_storage, storage->device_storage_size), status); if (status == cudaSuccess) { - status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size = 256 * sizeof(uint32_t)), status); + storage->host_pinned_storage_size = (storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count) * 6 * sizeof(uint32_t); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMallocHost((void **)&storage->host_pinned_storage, storage->host_pinned_storage_size), status); status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamCreate(&storage->cuda_stream), status); } } @@ -2335,3 +2423,702 @@ int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, return libcubwt_get_error_code(status); } + +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_compute_histogram_offsets(const uint8_t * RESTRICT device_L, const uint32_t n, uint32_t * RESTRICT device_offsets) +{ + __shared__ __align__(32) uint32_t histogram[ALPHABET_SIZE]; + + if (threadIdx.x < ALPHABET_SIZE) + { + histogram[threadIdx.x] = 0; 
+ } + + __syncthreads(); + +#if CUDA_DEVICE_ARCH < 800 + uint32_t byte0 = 0, byte1 = 0, count = 0; +#endif + + for (uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 16 + threadIdx.x * 16; thread_index < n; thread_index += gridDim.x * CUDA_BLOCK_THREADS * 16) + { + uint4 bytes = __ldg((uint4 *)(device_L + thread_index)); + +#if CUDA_DEVICE_ARCH < 800 + byte0 = __byte_perm(0, bytes.x, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.x, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.y, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.y, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.z, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.z, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.w, 0x0004); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0005); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.w, 0x0006); if (byte0 != byte1) { atomicAdd(&histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0007); if (byte1 != byte0) { atomicAdd(&histogram[byte0], count); count = 0; } count += 1; +#else + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.x, 0x0007)], 1); + + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.y, 0x0007)], 1); + + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.z, 0x0007)], 1); + + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0004)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0005)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0006)], 1); + atomicAdd(&histogram[__byte_perm(0, bytes.w, 0x0007)], 1); +#endif + } + +#if CUDA_DEVICE_ARCH < 800 + atomicAdd(&histogram[byte1], count); +#endif + + __syncthreads(); + + if (threadIdx.x < ALPHABET_SIZE) + { + uint32_t 
offset; + + { + typedef cub::BlockScan BlockScan; + __shared__ __align__(32) typename BlockScan::TempStorage temp_storage; + + BlockScan(temp_storage).ExclusiveSum(histogram[threadIdx.x], offset); + } + + if (blockIdx.x == 0) + { + offset += (threadIdx.x == 0 ? 0x80000001 : 0x80000000); + } + + if (offset > 0) + { + atomicAdd(&device_offsets[threadIdx.x], offset); + } + } +} + +__global__ __launch_bounds__(CUDA_BLOCK_THREADS, CUDA_SM_THREADS / CUDA_BLOCK_THREADS) +static void libcubwt_compute_LF_mapping(const uint8_t * RESTRICT device_L, const uint32_t primary_index, uint32_t * RESTRICT device_descriptors, uint32_t * RESTRICT device_LF) +{ + __shared__ __align__(32) uint32_t shared_histogram[CUDA_BLOCK_THREADS / CUDA_WARP_THREADS][ALPHABET_SIZE]; + __shared__ __align__(32) uint4 shared_bytes[CUDA_BLOCK_THREADS]; + + bool has_primary_index = false; + + { + uint32_t thread_index = blockIdx.x * CUDA_BLOCK_THREADS * 16 + threadIdx.x * 16; + uint4 bytes = __ldg((uint4 *)(device_L + thread_index)); shared_bytes[threadIdx.x] = bytes; + + uint32_t * RESTRICT warp_histogram = &shared_histogram[threadIdx.x / CUDA_WARP_THREADS][0]; + + #pragma unroll + for (uint32_t lane_index = 4 * (threadIdx.x % CUDA_WARP_THREADS), iteration = 0; iteration < ALPHABET_SIZE / (4 * CUDA_WARP_THREADS); lane_index += 4 * CUDA_WARP_THREADS, iteration += 1) + { + *(uint4 *)(warp_histogram + lane_index) = make_uint4(0, 0, 0, 0); + } + + __syncwarp(); + +#if CUDA_DEVICE_ARCH < 800 + uint32_t byte0 = 0, byte1 = 0, count = 0; + + byte0 = __byte_perm(0, bytes.x, 0x0004); count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.x, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.x, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.y, 0x0004); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.y, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.y, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.z, 0x0004); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.z, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.z, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + + byte0 = __byte_perm(0, bytes.w, 0x0004); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0005); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 1; + byte0 = __byte_perm(0, bytes.w, 0x0006); if (byte0 != byte1) { atomicAdd(&warp_histogram[byte1], count); count = 0; } count += 1; + byte1 = __byte_perm(0, bytes.w, 0x0007); if (byte1 != byte0) { atomicAdd(&warp_histogram[byte0], count); count = 0; } count += 
1; + + atomicAdd(&warp_histogram[byte1], count); +#else + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.x, 0x0007)], 1); + + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.y, 0x0007)], 1); + + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.z, 0x0007)], 1); + + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0004)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0005)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0006)], 1); + atomicAdd(&warp_histogram[__byte_perm(0, bytes.w, 0x0007)], 1); +#endif + + __syncwarp(); + + if ((primary_index - thread_index) < 16u) + { + has_primary_index = true; warp_histogram[0] -= 1; + } + } + + __syncthreads(); + + if (threadIdx.x < ALPHABET_SIZE) + { + uint32_t prefix_sum = 0; + uint32_t byte_count = 0; + + { + #pragma unroll + for (uint32_t warp_index = 0; warp_index < CUDA_BLOCK_THREADS / CUDA_WARP_THREADS; warp_index += 1) + { + byte_count += shared_histogram[warp_index][threadIdx.x]; + } + } + + { + { + cub::ThreadStore(device_descriptors + blockIdx.x * ALPHABET_SIZE + threadIdx.x, byte_count | 0x40000000u); + } + + { + uint32_t * RESTRICT descriptors_lookback = device_descriptors + blockIdx.x * ALPHABET_SIZE + threadIdx.x; + + uint32_t descriptor, delay = 8; + do + { + descriptors_lookback -= ALPHABET_SIZE; + + do + { + libcubwt_delay_or_prevent_hoisting(delay <<= 1); + + descriptor = cub::ThreadLoad(descriptors_lookback); + + } while (descriptor == 0); + + delay = 0; prefix_sum += descriptor & (~0xc0000000u); + + } while ((descriptor & 0x80000000u) == 0); + + __syncwarp(); + } + + { + cub::ThreadStore(device_descriptors + blockIdx.x * ALPHABET_SIZE + threadIdx.x, (prefix_sum + byte_count) | 0x80000000u); + } + } + + { + #pragma unroll + for (uint32_t warp_index = 0; warp_index < CUDA_BLOCK_THREADS / CUDA_WARP_THREADS; warp_index += 1) + { + byte_count = shared_histogram[warp_index][threadIdx.x]; + shared_histogram[warp_index][threadIdx.x] = prefix_sum; + + prefix_sum += byte_count; + } + } + } + + __syncthreads(); + + if (__any_sync((uint32_t)-1, has_primary_index)) + { + uint32_t thread_index = (threadIdx.x / CUDA_WARP_THREADS) * (CUDA_WARP_THREADS * 16) + (threadIdx.x % CUDA_WARP_THREADS); + + uint32_t * RESTRICT warp_histogram = &shared_histogram[threadIdx.x / CUDA_WARP_THREADS][0]; + uint8_t * RESTRICT thread_bytes = ((uint8_t *)shared_bytes) + thread_index; + + thread_index += blockIdx.x * CUDA_BLOCK_THREADS * 16; + + #pragma unroll + for (uint32_t byte_index = 0; byte_index < 16; byte_index += 1, thread_bytes += CUDA_WARP_THREADS, thread_index += CUDA_WARP_THREADS) + { + uint32_t byte = thread_bytes[0]; if (primary_index == thread_index) { byte = 256; } + + uint32_t peers_mask = libcubwt_match_any_sync<9>((uint32_t)-1, byte); + uint32_t peers_offset = __popc(peers_mask & cub::LaneMaskLt()); + uint32_t warp_offset = (byte < 256 ? 
warp_histogram[byte] : 0); + + device_LF[thread_index] = warp_offset + peers_offset; + + if (byte_index < 15) + { + __syncwarp(); + + if (byte < 256 && peers_offset == 0) { warp_histogram[byte] = warp_offset + __popc(peers_mask); } + + __syncwarp(); + } + } + } + else + { + uint32_t thread_index = (threadIdx.x / CUDA_WARP_THREADS) * (CUDA_WARP_THREADS * 16) + (threadIdx.x % CUDA_WARP_THREADS); + + uint32_t * RESTRICT warp_histogram = &shared_histogram[threadIdx.x / CUDA_WARP_THREADS][0]; + uint8_t * RESTRICT thread_bytes = ((uint8_t *)shared_bytes) + thread_index; + + thread_index += blockIdx.x * CUDA_BLOCK_THREADS * 16; + + #pragma unroll + for (uint32_t byte_index = 0; byte_index < 16; byte_index += 1, thread_bytes += CUDA_WARP_THREADS, thread_index += CUDA_WARP_THREADS) + { + uint32_t byte = thread_bytes[0]; + + uint32_t peers_mask = libcubwt_match_any_sync<8>((uint32_t)-1, byte); + uint32_t peers_offset = __popc(peers_mask & cub::LaneMaskLt()); + uint32_t warp_offset = warp_histogram[byte]; + + device_LF[thread_index] = warp_offset + peers_offset; + + if (byte_index < 15) + { + __syncwarp(); + + if (peers_offset == 0) { warp_histogram[byte] = warp_offset + __popc(peers_mask); } + + __syncwarp(); + } + } + } +} + +template +__global__ __launch_bounds__(cuda_block_threads) +static void libcubwt_mark_segments(const uint32_t segments_quotient, const uint32_t segments_remainder, uint32_t * RESTRICT device_LF, uint2 * RESTRICT device_segments_positions) +{ + uint32_t thread_index = blockIdx.x * cuda_block_threads + threadIdx.x; + uint32_t F_position = (thread_index * segments_quotient) + ((thread_index < segments_remainder) ? thread_index : segments_remainder); + + if (thread_index > 0) + { + F_position += libcubwt_xxhash32_b32(thread_index, segments_quotient ^ segments_remainder) % (segments_quotient + (thread_index < segments_remainder)); + } + + uint32_t L_position = device_LF[F_position]; device_LF[F_position] = thread_index | 0x80000000u; + + device_segments_positions[thread_index] = make_uint2(F_position, L_position); +} + +template +__global__ __launch_bounds__(cuda_block_threads) +static void libcubwt_survey_segments(const uint32_t * RESTRICT device_LF, const uint32_t * RESTRICT device_offsets, uint2 * RESTRICT device_segments_positions, uint2 * RESTRICT device_segments_surveys, uint8_t * RESTRICT device_staging_area, uint32_t staging_size) +{ + __shared__ __align__(32) uint32_t shared_offsets[ALPHABET_SIZE]; + + { + if (cuda_block_threads == 32) + { + ((uint4 *)shared_offsets)[threadIdx.x + 0] = ((uint4 *)device_offsets)[threadIdx.x + 0]; + ((uint4 *)shared_offsets)[threadIdx.x + 32] = ((uint4 *)device_offsets)[threadIdx.x + 32]; + } + else if (cuda_block_threads == 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + else if (threadIdx.x < 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + + __syncthreads(); + } + + uint32_t thread_index = blockIdx.x * cuda_block_threads + threadIdx.x; + uint32_t F_position = device_segments_positions[thread_index].x; + uint32_t L_position = device_segments_positions[thread_index].y; + uint32_t count = 0; + + const uint32_t offset64 = shared_offsets[64], offset128 = shared_offsets[128], offset192 = shared_offsets[192]; + uint8_t * RESTRICT thread_staging = device_staging_area + ((uint64_t)thread_index * staging_size); + + uint4 T = make_uint4(0, 0, 0, 0); + + do + { + F_position |= 0x80000000u; + + uint32_t next_position = cub::ThreadLoad(device_LF + 
L_position); + uint32_t val = (F_position >= offset128) ? 128 : 0; + uint32_t offsetX64 = (F_position >= offset128) ? offset192 : offset64; + + if (F_position >= offsetX64 ) { val += 64; } + if (F_position >= shared_offsets[val + 32]) { val += 32; } + if (F_position >= shared_offsets[val + 16]) { val += 16; } + if (F_position >= shared_offsets[val + 8]) { val += 8; } + if (F_position >= shared_offsets[val + 4]) { val += 4; } + if (F_position >= shared_offsets[val + 2]) { val += 2; } + if (F_position >= shared_offsets[val + 1]) { val += 1; } + + T.w = __byte_perm(T.z, T.w, 0x6543); T.z = __byte_perm(T.y, T.z, 0x6543); + T.y = __byte_perm(T.x, T.y, 0x6543); T.x = __byte_perm(val, T.x, 0x6540); + + F_position = L_position; L_position = next_position; count += 1; + + if ((staging_size != 0) && (count % 16 == 0)) + { + staging_size -= 16; *(uint4 *)(thread_staging + staging_size) = T; + + if (staging_size == 0) + { + device_segments_positions[thread_index] = make_uint2(F_position, L_position); + } + } + + } while ((int32_t)L_position >= 0); + + __syncwarp(); + + device_segments_surveys[thread_index] = make_uint2(L_position ^ 0x80000000u, count); + + if (staging_size != 0) + { + device_segments_positions[thread_index] = make_uint2(F_position, L_position); + + if (count % 16 != 0) + { + staging_size -= 16; *(uint4 *)(thread_staging + staging_size) = libcubwt_shift_left_b128(T, -count); + } + } +} + +template +__global__ __launch_bounds__(cuda_block_threads) +static void libcubwt_decode_segments_fallback(const uint32_t * RESTRICT device_LF, const uint32_t * RESTRICT device_offsets, const uint2 * RESTRICT device_segments_positions, const uint2 * RESTRICT device_segments_surveys, uint8_t * RESTRICT device_output_area) +{ + __shared__ __align__(32) uint32_t shared_offsets[ALPHABET_SIZE]; + + { + if (cuda_block_threads == 32) + { + ((uint4 *)shared_offsets)[threadIdx.x + 0] = ((uint4 *)device_offsets)[threadIdx.x + 0]; + ((uint4 *)shared_offsets)[threadIdx.x + 32] = ((uint4 *)device_offsets)[threadIdx.x + 32]; + } + else if (cuda_block_threads == 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + else if (threadIdx.x < 64) + { + ((uint4 *)shared_offsets)[threadIdx.x] = ((uint4 *)device_offsets)[threadIdx.x]; + } + + __syncthreads(); + } + + uint32_t thread_index = blockIdx.x * cuda_block_threads + threadIdx.x; + uint32_t F_position = device_segments_positions[thread_index].x; + uint32_t L_position = device_segments_positions[thread_index].y; + uint32_t offset = device_segments_surveys[thread_index].x; + + if ((int32_t)L_position < 0) + { + return; + } + + const uint32_t offset64 = shared_offsets[64], offset128 = shared_offsets[128], offset192 = shared_offsets[192]; + uint4 T = libcubwt_shift_right_b128((offset % 16 != 0) ? *(uint4 *)(device_output_area + (offset & (-16u))) : make_uint4(0, 0, 0, 0), offset); + + do + { + F_position |= 0x80000000u; + + uint32_t next_position = cub::ThreadLoad(device_LF + L_position); + uint32_t val = (F_position >= offset128) ? 128 : 0; + uint32_t offsetX64 = (F_position >= offset128) ? 
offset192 : offset64; + + if (F_position >= offsetX64 ) { val += 64; } + if (F_position >= shared_offsets[val + 32]) { val += 32; } + if (F_position >= shared_offsets[val + 16]) { val += 16; } + if (F_position >= shared_offsets[val + 8]) { val += 8; } + if (F_position >= shared_offsets[val + 4]) { val += 4; } + if (F_position >= shared_offsets[val + 2]) { val += 2; } + if (F_position >= shared_offsets[val + 1]) { val += 1; } + + T.w = __byte_perm(T.z, T.w, 0x6543); T.z = __byte_perm(T.y, T.z, 0x6543); + T.y = __byte_perm(T.x, T.y, 0x6543); T.x = __byte_perm(val, T.x, 0x6540); + + F_position = L_position; L_position = next_position; offset -= 1; + + if (offset % 16 == 0) + { + *(uint4 *)(device_output_area + offset) = T; + } + + } while ((int32_t)L_position >= 0); + + if (offset % 16 != 0) + { + uint4 P = *(uint4 *)(device_output_area + (offset & (-16u))); + uint4 M = make_uint4(~0u, ~0u, ~0u, ~0u); + + T = libcubwt_shift_left_b128(T, offset); + M = libcubwt_shift_left_b128(M, offset); + + T.x = (T.x & M.x) | (P.x & ~M.x); + T.y = (T.y & M.y) | (P.y & ~M.y); + T.z = (T.z & M.z) | (P.z & ~M.z); + T.w = (T.w & M.w) | (P.w & ~M.w); + + *(uint4 *)(device_output_area + (offset & (-16u))) = T; + } +} + +static bool libcubwt_rank_segments( + uint2 * RESTRICT segments_surveys, + uint64_t * RESTRICT memcpy_from, + uint32_t * RESTRICT memcpy_to, + uint32_t * RESTRICT memcpy_size, + const uint32_t n, + const uint32_t num_segments, + const uint32_t staging_size) +{ + uint32_t sum = n, max_count = 0; + + for (uint64_t p = 0, segment = num_segments; segment != 0; segment -= 2) + { + { + uint32_t next = segments_surveys[p].x; + uint32_t count = segments_surveys[p].y; + uint32_t size = count < staging_size ? count : staging_size; + + segments_surveys[p].x = sum - size; + + memcpy_from[segment - 1] = ((uint64_t)staging_size * (p + 1)) - size; + memcpy_to [segment - 1] = sum - size; + memcpy_size[segment - 1] = size; + + sum -= count; p = next; max_count = max_count < count ? count : max_count; + } + + { + uint32_t next = segments_surveys[p].x; + uint32_t count = segments_surveys[p].y; + uint32_t size = count < staging_size ? count : staging_size; + + segments_surveys[p].x = sum - size; + + memcpy_from[segment - 2] = ((uint64_t)staging_size * (p + 1)) - size; + memcpy_to [segment - 2] = sum - size; + memcpy_size[segment - 2] = size; + + sum -= count; p = next; max_count = max_count < count ? 
count : max_count; + } + } + + return max_count > staging_size; +} + +int64_t libcubwt_unbwt(void * device_storage, const uint8_t * T, uint8_t * U, int64_t n, const int32_t * freq, int32_t i) +{ + LIBCUBWT_DEVICE_STORAGE * storage = (LIBCUBWT_DEVICE_STORAGE *)device_storage; + + cudaError_t status = cudaSuccess; + int64_t num_descriptors = (n + 1 + storage->cuda_block_threads * 16 - 1) / (storage->cuda_block_threads * 16); + int64_t aligned_n = num_descriptors * (storage->cuda_block_threads * 16); + int64_t num_segments = std::min(storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count, ((int32_t)(n + 1) / 48) & (-128)); + + if ((storage == NULL) || (T == NULL) || (U == NULL) || (n < 128 * 1024) || (i <= 0) || (i > n) || (n > 5 * storage->max_length) || (aligned_n >= 1023 * 1024 * 1024)) + { + return LIBCUBWT_BAD_PARAMETER; + } + + size_t device_batched_memcpy_temp_storage_size; + + { + uint8_t * d_staging = NULL; + uint64_t * d_memcpy_from = NULL; + uint32_t * d_memcpy_to = NULL; + uint32_t * d_memcpy_size = NULL; + + OffsetToPointerOperator memcpy_from_operator { d_staging }; + OffsetToPointerOperator memcpy_to_operator { d_staging }; + + cub::TransformInputIterator, uint64_t *> memcpy_from_iterator (d_memcpy_from, memcpy_from_operator); + cub::TransformInputIterator, uint32_t *> memcpy_to_iterator (d_memcpy_to , memcpy_to_operator ); + + device_batched_memcpy_temp_storage_size = 0; + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceCopy::Batched( + NULL, device_batched_memcpy_temp_storage_size, + memcpy_from_iterator, + memcpy_to_iterator, + d_memcpy_size, + (uint32_t)num_segments)); + + if (status != cudaSuccess) { return libcubwt_get_error_code(status); } + + device_batched_memcpy_temp_storage_size = (device_batched_memcpy_temp_storage_size + (size_t)1023) & (size_t)(-1024); + } + + uint8_t * device_alloc = (uint8_t *)storage->device_storage; + uint8_t * device_L = (uint8_t *)device_alloc; device_alloc += aligned_n; + uint32_t * device_LF = (uint32_t *)device_alloc; device_alloc += aligned_n * sizeof(uint32_t); + uint32_t * device_offsets = (uint32_t *)device_alloc; device_alloc += ALPHABET_SIZE * sizeof(uint32_t); + + void * device_batched_memcpy_temp_storage = device_alloc; + uint2 * device_segments_positions = (uint2 * )(device_alloc + device_batched_memcpy_temp_storage_size); + uint2 * device_segments_surveys = (uint2 * )(device_segments_positions + num_segments); + uint64_t * device_batched_memcpy_from = (uint64_t *)(device_segments_surveys + num_segments); + uint32_t * device_batched_memcpy_to = (uint32_t *)(device_batched_memcpy_from + num_segments); + uint32_t * device_batched_memcpy_size = (uint32_t *)(device_batched_memcpy_to + num_segments); + + device_alloc += std::max(num_descriptors * ALPHABET_SIZE * sizeof(uint32_t), device_batched_memcpy_temp_storage_size + num_segments * (8 + 4) * sizeof(uint32_t)); + + if ((size_t)(device_alloc - (uint8_t *)storage->device_storage) > storage->device_storage_size) + { + return LIBCUBWT_NOT_ENOUGH_MEMORY; + } + + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_L , T , (size_t)(i ), cudaMemcpyHostToDevice, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_L + i + 1, T + i, (size_t)(n - i), cudaMemcpyHostToDevice, storage->cuda_stream), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemsetAsync(device_L + i + 0, 0 , 1 , storage->cuda_stream), status); + status = 
libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemsetAsync(device_L + n + 1, 255, (size_t)(aligned_n - n - 1), storage->cuda_stream), status); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemsetAsync(device_offsets, 0, (1 + num_descriptors) * ALPHABET_SIZE * sizeof(uint32_t), storage->cuda_stream), status); + } + + if (status == cudaSuccess) + { + if (freq != NULL) + { + uint32_t * offsets = (uint32_t *)storage->host_pinned_storage; + + for (uint32_t sum = 0x80000001, c = 0; c < ALPHABET_SIZE; c += 1) { offsets[c] = sum; sum += freq[c]; } + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_offsets, offsets, ALPHABET_SIZE * sizeof(uint32_t), cudaMemcpyHostToDevice, storage->cuda_stream), status); + } + else + { + int64_t n_histogram_blocks = std::min((uint32_t)num_descriptors, (storage->device_multiprocessor_max_threads / storage->cuda_block_threads) * storage->device_multiprocessor_count); + + libcubwt_compute_histogram_offsets<<<(uint32_t)n_histogram_blocks, storage->cuda_block_threads, 0, storage->cuda_stream>>>(device_L, (uint32_t)aligned_n, device_offsets); + } + + if (status == cudaSuccess) + { + libcubwt_compute_LF_mapping<<<(uint32_t)num_descriptors, storage->cuda_block_threads, 0, storage->cuda_stream>>>(device_L, (uint32_t)i, device_offsets + ALPHABET_SIZE, device_LF); + } + } + + if (status == cudaSuccess) + { + uint8_t * device_staging_area = (uint8_t *)(device_batched_memcpy_size + num_segments); + uint8_t * device_staging_end = (uint8_t *)storage->device_storage + storage->device_storage_size; + uint32_t staging_size = (uint32_t)(((device_staging_end - device_staging_area) / num_segments) & (-16)); + + { + if (32 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_mark_segments<32><<<(uint32_t)num_segments / 32, 32, 0, storage->cuda_stream>>>((uint32_t)((n + 1) / num_segments), (uint32_t)((n + 1) % num_segments), device_LF, device_segments_positions); + libcubwt_survey_segments<32><<<(uint32_t)num_segments / 32, 32, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_staging_area, staging_size); + } + else if (64 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_mark_segments<64><<<(uint32_t)num_segments / 64, 64, 0, storage->cuda_stream>>>((uint32_t)((n + 1) / num_segments), (uint32_t)((n + 1) % num_segments), device_LF, device_segments_positions); + libcubwt_survey_segments<64><<<(uint32_t)num_segments / 64, 64, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_staging_area, staging_size); + } + else + { + libcubwt_mark_segments<128><<<(uint32_t)num_segments / 128, 128, 0, storage->cuda_stream>>>((uint32_t)((n + 1) / num_segments), (uint32_t)((n + 1) % num_segments), device_LF, device_segments_positions); + libcubwt_survey_segments<128><<<(uint32_t)num_segments / 128, 128, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_staging_area, staging_size); + } + } + + { + uint2 * host_segments_surveys = (uint2 * )storage->host_pinned_storage; + uint64_t * host_batched_memcpy_from = (uint64_t *)(host_segments_surveys + 
num_segments); + uint32_t * host_batched_memcpy_to = (uint32_t *)(host_batched_memcpy_from + num_segments); + uint32_t * host_batched_memcpy_size = (uint32_t *)(host_batched_memcpy_to + num_segments); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(host_segments_surveys, device_segments_surveys, num_segments * 2 * sizeof(uint32_t), cudaMemcpyDeviceToHost, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status); + + if (status == cudaSuccess) + { + bool overflow = libcubwt_rank_segments(host_segments_surveys, host_batched_memcpy_from, host_batched_memcpy_to, host_batched_memcpy_size, (uint32_t)(n + 1), (uint32_t)num_segments, (uint32_t)staging_size); + + status = overflow + ? libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_segments_surveys, host_segments_surveys, num_segments * 6 * sizeof(uint32_t), cudaMemcpyHostToDevice, storage->cuda_stream), status) + : libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(device_batched_memcpy_from, host_batched_memcpy_from, num_segments * 4 * sizeof(uint32_t), cudaMemcpyHostToDevice, storage->cuda_stream), status); + + if (status == cudaSuccess) + { + OffsetToPointerOperator memcpy_from_operator { device_staging_area }; + OffsetToPointerOperator memcpy_to_operator { device_L }; + + cub::TransformInputIterator, uint64_t *> memcpy_from_iterator (device_batched_memcpy_from, memcpy_from_operator); + cub::TransformInputIterator, uint32_t *> memcpy_to_iterator (device_batched_memcpy_to , memcpy_to_operator ); + + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cub::DeviceCopy::Batched( + device_batched_memcpy_temp_storage, + device_batched_memcpy_temp_storage_size, + memcpy_from_iterator, + memcpy_to_iterator, + device_batched_memcpy_size, + (uint32_t)num_segments, + storage->cuda_stream)); + } + + if (status == cudaSuccess && overflow) + { + if (32 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_decode_segments_fallback<32><<<(uint32_t)num_segments / 32, 32, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_L); + } + else if (64 * storage->device_multiprocessor_max_blocks * storage->device_multiprocessor_count >= std::min((int32_t)num_segments, storage->device_multiprocessor_max_threads * storage->device_multiprocessor_count)) + { + libcubwt_decode_segments_fallback<64><<<(uint32_t)num_segments / 64, 64, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_L); + } + else + { + libcubwt_decode_segments_fallback<128><<<(uint32_t)num_segments / 128, 128, 0, storage->cuda_stream>>>(device_LF, device_offsets, device_segments_positions, device_segments_surveys, device_L); + } + } + + if (status == cudaSuccess) + { + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaMemcpyAsync(U, device_L, (size_t)n, cudaMemcpyDeviceToHost, storage->cuda_stream), status); + status = libcubwt_cuda_safe_call(__FILE__, __LINE__, cudaStreamSynchronize(storage->cuda_stream), status); + } + } + } + } + + return status != cudaSuccess ? 
libcubwt_get_error_code(status) : LIBCUBWT_NO_ERROR; +} diff --git a/libbsc/bwt/libcubwt/libcubwt.cuh b/libbsc/bwt/libcubwt/libcubwt.cuh index f303803..f969c3c 100644 --- a/libbsc/bwt/libcubwt/libcubwt.cuh +++ b/libbsc/bwt/libcubwt/libcubwt.cuh @@ -1,9 +1,9 @@ /*-- This file is a part of libcubwt, a library for CUDA accelerated -burrows wheeler transform construction. +burrows wheeler transform construction and inversion. - Copyright (c) 2022-2023 Ilya Grebnov + Copyright (c) 2022-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -25,9 +25,9 @@ Please see the file LICENSE for full copyright and license details. #define LIBCUBWT_CUH 1 #define LIBCUBWT_VERSION_MAJOR 1 -#define LIBCUBWT_VERSION_MINOR 5 +#define LIBCUBWT_VERSION_MINOR 6 #define LIBCUBWT_VERSION_PATCH 0 -#define LIBCUBWT_VERSION_STRING "1.5.0" +#define LIBCUBWT_VERSION_STRING "1.6.0" #define LIBCUBWT_NO_ERROR 0 #define LIBCUBWT_BAD_PARAMETER -1 @@ -45,7 +45,16 @@ extern "C" { /** * Allocates storage on the CUDA device that allows reusing allocated memory with each libcubwt operation. - * @param max_length The maximum length of string to support. + * @param device_storage A reference to the memory pointer where the allocated device storage will be saved. + * @param max_length This parameter controls the amount of allocated memory, ensuring that libcubwt operations + * can accommodate strings of lengths up to this value for both forward and reverse Burrows-Wheeler Transforms. + * The method currently allocates approximately 20.5 times the string length, which is the necessary amount of memory + * for the forward Burrows-Wheeler Transform of a string at maximum length. This allocation is also sufficient and + * optimal for the reverse Burrows-Wheeler Transform. However, if performance is less critical, or if device memory is limited, + * 'max_length' can be lowered. Allocating memory at approximately 6.8 times the string length should still yield about 90% + * of the optimal performance for the reverse Burrows-Wheeler Transform. This effectively means that reverse + * Burrows-Wheeler Transform operations can be performed with storage allocated using a 'max_length' parameter + * at a third of the input string's maximum length. * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. */ int64_t libcubwt_allocate_device_storage(void ** device_storage, int64_t max_length); @@ -79,6 +88,18 @@ extern "C" { */ int64_t libcubwt_bwt_aux(void * device_storage, const uint8_t * T, uint8_t * L, int64_t n, int64_t r, uint32_t * I); + /** + * Reconstructs the original string from a given burrows-wheeler transformed string (BWT) with primary index. + * @param device_storage The previously allocated storage on the CUDA device. + * @param T [0..n-1] The input string. + * @param U [0..n-1] The output string (can be T). + * @param n The length of the given string. + * @param freq [0..255] The input symbol frequency table (can be NULL). + * @param i The primary index. + * @return LIBCUBWT_NO_ERROR if no error occurred, libcubwt error code otherwise. 
+ */ + int64_t libcubwt_unbwt(void * device_storage, const uint8_t * T, uint8_t * U, int64_t n, const int32_t * freq, int32_t i); + #ifdef __cplusplus } #endif diff --git a/libbsc/coder/coder.cpp b/libbsc/coder/coder.cpp index 5fcd2c2..4a5ad3d 100644 --- a/libbsc/coder/coder.cpp +++ b/libbsc/coder/coder.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/coder.h b/libbsc/coder/coder.h index 97bc040..cd80697 100644 --- a/libbsc/coder/coder.h +++ b/libbsc/coder/coder.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/common/predictor.h b/libbsc/coder/common/predictor.h index 34ca792..09267f4 100644 --- a/libbsc/coder/common/predictor.h +++ b/libbsc/coder/common/predictor.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/common/rangecoder.h b/libbsc/coder/common/rangecoder.h index 5df8282..29180fb 100644 --- a/libbsc/coder/common/rangecoder.h +++ b/libbsc/coder/common/rangecoder.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/common/tables.h b/libbsc/coder/common/tables.h index a11bc9c..c017d92 100644 --- a/libbsc/coder/common/tables.h +++ b/libbsc/coder/common/tables.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc.cpp b/libbsc/coder/qlfc/qlfc.cpp index f4d3469..7ba5069 100644 --- a/libbsc/coder/qlfc/qlfc.cpp +++ b/libbsc/coder/qlfc/qlfc.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc.h b/libbsc/coder/qlfc/qlfc.h index b6f2714..ac0b2be 100644 --- a/libbsc/coder/qlfc/qlfc.h +++ b/libbsc/coder/qlfc/qlfc.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. 
- Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc_model.cpp b/libbsc/coder/qlfc/qlfc_model.cpp index 78fc09a..70c6665 100644 --- a/libbsc/coder/qlfc/qlfc_model.cpp +++ b/libbsc/coder/qlfc/qlfc_model.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/coder/qlfc/qlfc_model.h b/libbsc/coder/qlfc/qlfc_model.h index 110ae05..c154d7c 100644 --- a/libbsc/coder/qlfc/qlfc_model.h +++ b/libbsc/coder/qlfc/qlfc_model.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters.h b/libbsc/filters.h index bf732dc..cbc3b49 100644 --- a/libbsc/filters.h +++ b/libbsc/filters.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters/detectors.cpp b/libbsc/filters/detectors.cpp index 09c445c..895ffbd 100644 --- a/libbsc/filters/detectors.cpp +++ b/libbsc/filters/detectors.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters/preprocessing.cpp b/libbsc/filters/preprocessing.cpp index 7003ae5..56e5c5a 100644 --- a/libbsc/filters/preprocessing.cpp +++ b/libbsc/filters/preprocessing.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/filters/tables.h b/libbsc/filters/tables.h index ebf9f08..0780e87 100644 --- a/libbsc/filters/tables.h +++ b/libbsc/filters/tables.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/libbsc.h b/libbsc/libbsc.h index c14cb94..254e314 100644 --- a/libbsc/libbsc.h +++ b/libbsc/libbsc.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. 
- Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -35,8 +35,8 @@ See also the bsc and libbsc web site: #define LIBBSC_VERSION_MAJOR 3 #define LIBBSC_VERSION_MINOR 3 -#define LIBBSC_VERSION_PATCH 3 -#define LIBBSC_VERSION_STRING "3.3.3" +#define LIBBSC_VERSION_PATCH 4 +#define LIBBSC_VERSION_STRING "3.3.4" #define LIBBSC_NO_ERROR 0 #define LIBBSC_BAD_PARAMETER -1 diff --git a/libbsc/libbsc/libbsc.cpp b/libbsc/libbsc/libbsc.cpp index 763d042..7939b99 100644 --- a/libbsc/libbsc/libbsc.cpp +++ b/libbsc/libbsc/libbsc.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/lzp/lzp.cpp b/libbsc/lzp/lzp.cpp index 93f530c..69ab54c 100644 --- a/libbsc/lzp/lzp.cpp +++ b/libbsc/lzp/lzp.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/lzp/lzp.h b/libbsc/lzp/lzp.h index e4814a4..e8e3743 100644 --- a/libbsc/lzp/lzp.h +++ b/libbsc/lzp/lzp.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/platform/platform.cpp b/libbsc/platform/platform.cpp index 6a2f992..13fd565 100644 --- a/libbsc/platform/platform.cpp +++ b/libbsc/platform/platform.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/platform/platform.h b/libbsc/platform/platform.h index 23e2d98..e7a7d59 100644 --- a/libbsc/platform/platform.h +++ b/libbsc/platform/platform.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.cpp b/libbsc/st/st.cpp index 82123fe..5016c56 100644 --- a/libbsc/st/st.cpp +++ b/libbsc/st/st.cpp @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. 
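The new GPU inverse BWT is driven from bsc_bwt_gpu_decode in bwt.cpp above through the libcubwt_unbwt entry point documented in libcubwt.cuh. A minimal standalone sketch of that call sequence follows (the wrapper name invert_bwt_on_gpu is hypothetical; sizing follows the documented rule that inverse-only storage needs a max_length of roughly a third of the string length):

    /* Minimal sketch of the libcubwt inverse BWT API (the wrapper name
       invert_bwt_on_gpu is hypothetical). Decodes in place: U == T is
       allowed by libcubwt_unbwt. */
    #include <stdint.h>
    #include "libcubwt.cuh"

    int64_t invert_bwt_on_gpu(uint8_t * buffer, int64_t n, int32_t primary_index)
    {
        void * device_storage = NULL;

        /* Same sizing as bsc_bwt_gpu_decode: about n / 3 is sufficient for
           the inverse transform alone; note libcubwt_unbwt also rejects
           inputs shorter than 128 * 1024 bytes. */
        int64_t result = libcubwt_allocate_device_storage(&device_storage, (n / 3) | 0x1fffff);

        if (result == LIBCUBWT_NO_ERROR)
        {
            /* freq == NULL: the symbol histogram is computed on the GPU. */
            result = libcubwt_unbwt(device_storage, buffer, buffer, n, NULL, primary_index);

            libcubwt_free_device_storage(device_storage);
        }

        return result;
    }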
diff --git a/libbsc/st/st.cu b/libbsc/st/st.cu index 531e82a..0b54f25 100644 --- a/libbsc/st/st.cu +++ b/libbsc/st/st.cu @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.cuh b/libbsc/st/st.cuh index b3c0e62..70f1cb2 100644 --- a/libbsc/st/st.cuh +++ b/libbsc/st/st.cuh @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. diff --git a/libbsc/st/st.h b/libbsc/st/st.h index f405e4c..90cf6d1 100644 --- a/libbsc/st/st.h +++ b/libbsc/st/st.h @@ -8,7 +8,7 @@ This file is a part of bsc and/or libbsc, a program and a library for lossless, block-sorting data compression. - Copyright (c) 2009-2021 Ilya Grebnov + Copyright (c) 2009-2024 Ilya Grebnov Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License.
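For orientation, the inversion that libcubwt_compute_histogram_offsets, libcubwt_compute_LF_mapping and the segment kernels above carry out in parallel follows the classic LF-mapping recurrence. A single-threaded sketch (the routine name unbwt_reference is hypothetical; it uses the same sentinel-at-primary-index layout that libcubwt_unbwt stages into device_L):

    /* Single-threaded reference for inverse BWT (illustration only; the name
       unbwt_reference is hypothetical). T is the BWT of length n with primary
       index i, 1 <= i <= n; U receives the recovered text (U != T here). */
    #include <stdint.h>
    #include <stdlib.h>

    void unbwt_reference(const uint8_t * T, uint8_t * U, int32_t n, int32_t i)
    {
        uint32_t C[256] = { 0 };
        uint32_t * LF = (uint32_t *)malloc(((size_t)n + 1) * sizeof(uint32_t));

        /* Symbol histogram of the L column. */
        for (int32_t p = 0; p < n; p += 1) { C[T[p]] += 1; }

        /* Exclusive prefix sums; the sum starts at 1 because the implicit
           sentinel row sorts before every other row. */
        for (uint32_t c = 0, sum = 1; c < 256; c += 1)
        {
            uint32_t count = C[c]; C[c] = sum; sum += count;
        }

        /* LF links over n + 1 rows; row i holds the sentinel and links back
           to row 0, and rows after it are shifted by one position in T. */
        for (int32_t p = 0; p <= n; p += 1)
        {
            LF[p] = (p == i) ? 0 : C[T[p - (p > i)]]++;
        }

        /* Backward walk from row 0 emits the text right to left. */
        for (int32_t k = n - 1, q = 0; k >= 0; k -= 1)
        {
            U[k] = T[q - (q > i)]; q = LF[q];
        }

        free(LF);
    }

The GPU version replaces the single backward walk with num_segments independent walks whose starting rows are marked in device_LF by libcubwt_mark_segments, staging each segment's output and stitching the pieces back together with cub::DeviceCopy::Batched.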