Merge pull request #44 from jermp/partitioned_phf

Partitioned phf
jermp · Mar 21, 2024 · 8d234bf · 8d234bf
2 parents 5a13d6a + a48eb2d
commit 8d234bf
Show file tree

Hide file tree

Showing 24 changed files with 267 additions and 115 deletions.
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -23,6 +23,20 @@ if (UNIX AND (CMAKE_HOST_SYSTEM_PROCESSOR STREQUAL "x86_64"))
   set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mbmi2 -msse4.2") # for hardware popcount and pdep
 endif()
 
+if (SSHASH_USE_MAX_KMER_LENGTH_63)
+  MESSAGE(STATUS "SSHash uses a maximum kmer length of 63")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSSHASH_USE_MAX_KMER_LENGTH_63")
+else()
+  MESSAGE(STATUS "SSHash uses a maximum kmer length of 31")
+endif()
+
+if (SSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING)
+  MESSAGE(STATUS "SSHash maps {'A','a'}->0, {'C','c'}->1, {'G','g'}->2, and {'T','t'}->3")
+  set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DSSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING")
+else()
+  MESSAGE(STATUS "SSHash maps {'A','a'}->0, {'C','c'}->1, {'T','t'}->2, and {'G','g'}->3")
+endif()
+
 if (UNIX)
 
   MESSAGE(STATUS "Compiling with flags: -std=c++17 -O3 -ggdb -pthread -Wall -Wextra -Wno-missing-braces -Wno-unknown-attributes -Wno-unused-function")
@@ -42,6 +56,8 @@ if (UNIX)
 
 endif()
 
+include_directories(.) # all include paths relative to parent directory
+
 set(Z_LIB_SOURCES
   include/gz/zip_stream.cpp
 )
@@ -68,3 +84,6 @@ target_link_libraries(sshash
 
 # tests
 add_executable(test_alphabet test/test_alphabet.cpp)
+target_link_libraries(test_alphabet
+  sshash_static
+)
diff --git a/README.md b/README.md
@@ -11,7 +11,7 @@ This is a compressed dictionary data structure for k-mers
 The data structure is described in the following papers:
 
 * [Sparse and Skew Hashing of K-Mers](https://doi.org/10.1093/bioinformatics/btac245) [1]
-* [On Weighted K-Mers Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2) [2,3]
+* [On Weighted K-Mer Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2) [2,3]
 
 **Please, cite these papers if you use SSHash.**
 
@@ -41,7 +41,7 @@ If you are interested in a **membership-only** version of SSHash, have a look at
 #### Table of contents
 * [Compiling the Code](#compiling-the-code)
 * [Dependencies](#dependencies)
-* [Tools](#tools)
+* [Tools and Usage](#tools-and-usage)
 * [Examples](#Examples)
 * [Input Files](#input-files)
 * [Benchmarks](#benchmarks)
@@ -77,6 +77,41 @@ For a testing environment, use the following instead:
     cmake .. -D CMAKE_BUILD_TYPE=Debug -D SSHASH_USE_SANITIZERS=On
     make -j
 
+### Encoding of Nucleotides
+
+SSHash uses by default the following 2-bit encoding of nucleotides.
+
+	 A     65     01000.00.1 -> 00
+	 C     67     01000.01.1 -> 01
+	 G     71     01000.11.1 -> 11
+	 T     84     01010.10.0 -> 10
+
+	 a     97     01100.00.1 -> 00
+	 c     99     01100.01.1 -> 01
+	 g    103     01100.11.1 -> 11
+	 t    116     01110.10.0 -> 10
+
+If you want to use the "traditional" encoding
+
+	 A     65     01000001 -> 00
+	 C     67     01000011 -> 01
+	 G     71     01000111 -> 10
+	 T     84     01010100 -> 11
+
+	 a     97     01100001 -> 00
+	 c     99     01100011 -> 01
+	 g    103     01100111 -> 10
+	 t    116     01110100 -> 11
+
+for compatibility issues with other software, then
+compile SSHash with the flag `-DSSHASH_USE_TRADITIONAL_NUCLEOTIDE_ENCODING=On`.
+
+### K-mer Length
+
+By default, SSHash uses a maximum k-mer length of 31.
+If you want to support k-mer lengths up to (and including) 63,
+compile the library with the flag `-DSSHASH_USE_MAX_KMER_LENGTH_63=On`.
+
 Dependencies
 ------------
 
@@ -95,8 +130,8 @@ if you are on Linux/Ubuntu, or
 
 if you have a Mac.
 
-Tools
------
+Tools and Usage
+---------------
 
 There is one executable called `sshash` after the compilation, which can be used to run a tool.
 Run `./sshash` as follows to see a list of available tools.
@@ -114,20 +149,25 @@ Run `./sshash` as follows to see a list of available tools.
       permute                permute a weighted input file
       compute-statistics     compute index statistics
 
+For large-scale indexing, it could be necessary to increase the number of file descriptors that can be opened simultaneously:
+
+	ulimit -n 2048
+
 Examples
 --------
 
 For the examples, we are going to use some collections
-of *stitched unitigs* from the directory `../data/unitigs_stitched`.
-The value of k used during the formation of the unitigs
+of *stitched unitigs* from the directory `data/unitigs_stitched`.
+
+**Important note:** The value of k used during the formation of the unitigs
 is indicated in the name of each file and the dictionaries
-should be built with that value as well to ensure correctness.
+**must** be built with that value as well to ensure correctness.
 
 For example, `data/unitigs_stitched/ecoli4_k31_ust.fa.gz` indicates the value k = 31, whereas `data/unitigs_stitched/se.ust.k63.fa.gz` indicates the value k = 63.
 
 For all the examples below, we are going to use k = 31.
 
-(The subdirectory `../data/unitigs_stitched/with_weights` contains some files with k-mers' weights too.)
+(The directory `data/unitigs_stitched/with_weights` contains some files with k-mers' weights too.)
 
 In the section [Input Files](#input-files), we explain how
 such collections of stitched unitigs can be obtained from raw FASTA files.
@@ -340,6 +380,6 @@ Author
 References
 -----
 * [1] Giulio Ermanno Pibiri. [Sparse and Skew Hashing of K-Mers](https://doi.org/10.1093/bioinformatics/btac245). Bioinformatics. 2022.
-* [2] Giulio Ermanno Pibiri. [On Weighted K-Mers Dictionaries](https://drops.dagstuhl.de/opus/volltexte/2022/17043/). International Workshop on Algorithms in Bioinformatics (WABI). 2022.
-* [3] Giulio Ermanno Pibiri. [On Weighted K-Mers Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2). Algorithms for Molecular Biology (ALGOMB). 2023.
+* [2] Giulio Ermanno Pibiri. [On Weighted K-Mer Dictionaries](https://drops.dagstuhl.de/opus/volltexte/2022/17043/). International Workshop on Algorithms in Bioinformatics (WABI). 2022.
+* [3] Giulio Ermanno Pibiri. [On Weighted K-Mer Dictionaries](https://almob.biomedcentral.com/articles/10.1186/s13015-023-00226-2). Algorithms for Molecular Biology (ALGOMB). 2023.
 * [4] Schmidt, S., Khan, S., Alanko, J., Pibiri, G. E., and Tomescu, A. I. [Matchtigs: minimum plain text representation of k-mer sets](https://genomebiology.biomedcentral.com/articles/10.1186/s13059-023-02968-z). Genome Biology 24, 136. 2023.
diff --git a/external/pthash b/external/pthash
diff --git a/include/bit_vector_iterator.hpp b/include/bit_vector_iterator.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../external/pthash/include/encoders/bit_vector.hpp"
+#include "external/pthash/include/encoders/bit_vector.hpp"
 #include "util.hpp"
 
 namespace sshash {

diff --git a/include/buckets.hpp b/include/buckets.hpp
@@ -259,9 +259,9 @@ struct buckets {
 
 private:
     bool is_valid(lookup_result res) const {
-        return (res.contig_size != constants::invalid_uint32 and
+        return (res.contig_size != constants::invalid_uint64 and
                 res.kmer_id_in_contig < res.contig_size) and
-               (res.contig_id != constants::invalid_uint32 or res.contig_id < pieces.size());
+               (res.contig_id != constants::invalid_uint64 or res.contig_id < pieces.size());
     }
 };
 

diff --git a/include/builder/build.cpp b/include/builder/build.cpp
@@ -1,5 +1,5 @@
-#include "../dictionary.hpp"
-#include "../../external/pthash/external/essentials/include/essentials.hpp"
+#include "include/dictionary.hpp"
+#include "external/pthash/external/essentials/include/essentials.hpp"
 #include "util.hpp"
 
 /** build steps **/

diff --git a/include/builder/build_index.hpp b/include/builder/build_index.hpp
@@ -1,7 +1,7 @@
 #pragma once
 
 #include "file_merging_iterator.hpp"
-#include "../buckets_statistics.hpp"
+#include "include/buckets_statistics.hpp"
 
 namespace sshash {
 

diff --git a/include/builder/build_skew_index.hpp b/include/builder/build_skew_index.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../../external/pthash/include/pthash.hpp"
+#include "external/pthash/include/pthash.hpp"
 
 namespace sshash {
 
@@ -129,13 +129,11 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const&
         pthash::build_configuration mphf_config;
         mphf_config.c = build_config.c;
         mphf_config.alpha = 0.94;
-        mphf_config.seed = 1234567890;  // my favourite seed
+        mphf_config.seed = util::get_seed_for_hash_function(build_config);
         mphf_config.minimal_output = true;
         mphf_config.verbose_output = false;
-        mphf_config.num_threads = std::thread::hardware_concurrency() >= 8 ? 8 : 1;
-
-        std::cout << "building PTHash mphfs (with " << mphf_config.num_threads
-                  << " threads) and positions..." << std::endl;
+        mphf_config.num_threads = std::thread::hardware_concurrency();
+        mphf_config.num_partitions = 4 * mphf_config.num_threads;
 
         uint64_t partition_id = 0;
         uint64_t lower = 1ULL << min_log2_size;
@@ -160,9 +158,24 @@ void build_skew_index(skew_index& m_skew_index, parse_data& data, buckets const&
                 auto& mphf = m_skew_index.mphfs[partition_id];
                 assert(num_kmers_in_partition[partition_id] == keys_in_partition.size());
                 assert(num_kmers_in_partition[partition_id] == super_kmer_ids_in_partition.size());
+
+                if (keys_in_partition.size() / mphf_config.num_partitions <
+                    pthash::constants::min_partition_size) {
+                    mphf_config.num_partitions = std::max<uint64_t>(
+                        1, keys_in_partition.size() / (2 * pthash::constants::min_partition_size));
+                }
+
+                if (build_config.verbose) {
+                    std::cout << "  building minimizers MPHF with " << mphf_config.num_threads
+                              << " threads and " << mphf_config.num_partitions << " partitions..."
+                              << std::endl;
+                }
+
                 mphf.build_in_internal_memory(keys_in_partition.begin(), keys_in_partition.size(),
                                               mphf_config);
 
+                mphf_config.num_partitions = 4 * mphf_config.num_threads;  // restore default value
+
                 std::cout << "  built mphs[" << partition_id << "] for " << keys_in_partition.size()
                           << " keys; bits/key = "
                           << static_cast<double>(mphf.num_bits()) / mphf.num_keys() << std::endl;

diff --git a/include/builder/parse_file.hpp b/include/builder/parse_file.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../gz/zip_stream.hpp"
+#include "include/gz/zip_stream.hpp"
 
 namespace sshash {
 

diff --git a/include/builder/util.hpp b/include/builder/util.hpp
@@ -43,9 +43,7 @@ struct compact_string_pool {
             uint64_t prefix = 0;
             if (glue) {
                 prefix = k - 1;
-            } else {
-                /* otherwise, start a new piece */
-                check_contig_size();
+            } else { /* otherwise, start a new piece */
                 pieces.push_back(bvb_strings.size() / 2);
             }
             for (uint64_t i = prefix; i != size; ++i) {
@@ -58,7 +56,6 @@ struct compact_string_pool {
         void finalize() {
             /* So pieces will be of size p+1, where p is the number of DNA sequences
                in the input file. */
-            check_contig_size();
             pieces.push_back(bvb_strings.size() / 2);
             assert(pieces.front() == 0);
 
@@ -72,22 +69,6 @@ struct compact_string_pool {
         uint64_t num_super_kmers;
         std::vector<uint64_t> pieces;
         pthash::bit_vector_builder bvb_strings;
-
-    private:
-        void check_contig_size() const {
-            /* Support a max of 2^32-1 contigs, or "pieces", whose
-               max length must also be < 2^32. */
-            if (!pieces.empty()) {
-                uint64_t contig_length = bvb_strings.size() / 2 - pieces.back();
-                if (contig_length >= 1ULL << 32) {
-                    throw std::runtime_error("contig_length " + std::to_string(contig_length) +
-                                             " does not fit into 32 bits");
-                }
-                if (pieces.size() == 1ULL << 32) {
-                    throw std::runtime_error("num_contigs must be less than 2^32");
-                }
-            }
-        }
     };
 
     uint64_t num_bits() const { return strings.size(); }

diff --git a/include/constants.hpp b/include/constants.hpp
@@ -12,9 +12,7 @@ constexpr uint64_t max_k = sizeof(kmer_t) * 4 - 1;
 /* max *odd* size that can be packed into 64 bits */
 constexpr uint64_t max_m = 31;
 
-// constexpr kmer_t invalid_uint_kmer = kmer_t(-1);
 constexpr uint64_t invalid_uint64 = uint64_t(-1);
-constexpr uint32_t invalid_uint32 = uint32_t(-1);
 
 constexpr uint64_t seed = 1;
 constexpr double c = 3.0;  // for PTHash

diff --git a/include/cover/cover.hpp b/include/cover/cover.hpp
@@ -5,7 +5,7 @@
 #include <unordered_set>
 #include <vector>
 
-#include "../util.hpp"
+#include "include/util.hpp"
 #include "node.hpp"
 #include "even_frequency_weights.hpp"
 

diff --git a/include/cover/node.hpp b/include/cover/node.hpp
@@ -1,12 +1,16 @@
 #pragma once
 
-#include "../util.hpp"
+#include "include/util.hpp"
 
 #include <vector>
 #include <deque>
 
 namespace sshash {
 
+namespace constants {
+constexpr uint32_t invalid_uint32 = uint32_t(-1);
+}
+
 struct node {
     node()
         : id(constants::invalid_uint32)

diff --git a/include/cover/parse_file.hpp b/include/cover/parse_file.hpp
@@ -1,11 +1,11 @@
 #pragma once
 
-#include "../../external/pthash/include/pthash.hpp"
-#include "../../external/pthash/external/cmd_line_parser/include/parser.hpp"
-#include "../../external/pthash/external/essentials/include/essentials.hpp"
-#include "../gz/zip_stream.hpp"
-#include "../gz/zip_stream.cpp"
-#include "../builder/util.hpp"
+#include "external/pthash/include/pthash.hpp"
+#include "external/pthash/external/cmd_line_parser/include/parser.hpp"
+#include "external/pthash/external/essentials/include/essentials.hpp"
+#include "include/gz/zip_stream.hpp"
+#include "include/gz/zip_stream.cpp"
+#include "include/builder/util.hpp"
 
 #include "node.hpp"
 

diff --git a/include/ef_sequence.hpp b/include/ef_sequence.hpp
@@ -1,8 +1,8 @@
 #pragma once
 
-#include "../external/pthash/include/encoders/bit_vector.hpp"
-#include "../external/pthash/include/encoders/compact_vector.hpp"
-#include "../external/pthash/include/encoders/darray.hpp"
+#include "external/pthash/include/encoders/bit_vector.hpp"
+#include "external/pthash/include/encoders/compact_vector.hpp"
+#include "external/pthash/include/encoders/darray.hpp"
 
 namespace sshash {
 

diff --git a/include/hash_util.hpp b/include/hash_util.hpp
@@ -1,6 +1,6 @@
 #pragma once
 
-#include "../external/pthash/include/pthash.hpp"
+#include "external/pthash/include/pthash.hpp"
 #include "constants.hpp"
 
 namespace sshash {
@@ -53,16 +53,26 @@ typedef pthash::murmurhash2_128 minimizers_base_hasher_type;
 // typedef kmers_pthash_hasher_64 kmers_base_hasher_type;
 typedef kmers_pthash_hasher_128 kmers_base_hasher_type;
 
-typedef pthash::single_phf<minimizers_base_hasher_type,    // base hasher
-                           pthash::dictionary_dictionary,  // encoder type
-                           true                            // minimal output
-                           >
+// typedef pthash::single_phf<minimizers_base_hasher_type,    // base hasher
+//                            pthash::dictionary_dictionary,  // encoder type
+//                            true                            // minimal output
+//                            >
+//     minimizers_pthash_type;
+typedef pthash::partitioned_phf<minimizers_base_hasher_type,    // base hasher
+                                pthash::dictionary_dictionary,  // encoder type
+                                true                            // minimal output
+                                >
     minimizers_pthash_type;
 
-typedef pthash::single_phf<kmers_base_hasher_type,         // base hasher
-                           pthash::dictionary_dictionary,  // encoder type
-                           true                            // minimal output
-                           >
+// typedef pthash::single_phf<kmers_base_hasher_type,         // base hasher
+//                            pthash::dictionary_dictionary,  // encoder type
+//                            true                            // minimal output
+//                            >
+//     kmers_pthash_type;
+typedef pthash::partitioned_phf<kmers_base_hasher_type,         // base hasher
+                                pthash::dictionary_dictionary,  // encoder type
+                                true                            // minimal output
+                                >
     kmers_pthash_type;
 
 /* used to hash m-mers and determine the minimizer of a k-mer */
+1 −1		include/builders/external_memory_builder_partitioned_phf.hpp
+1 −1		include/builders/internal_memory_builder_partitioned_phf.hpp
+2 −0		include/utils/util.hpp