forked from Kingsford-Group/kbf
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmain.cpp
191 lines (176 loc) · 7.57 KB
/
main.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
/*************************************************************
* Copyright (c) David Pellow, Darya Filippova, Carl Kingsford
*************************************************************/
#include <algorithm>
#include <cassert>
#include <chrono>
#include <fstream>
#include <iostream>
#include <memory>
#include <random>
#include <string>
#include <unordered_set>
#include <vector>
//
#include "BaseBloomFilter.hpp"
#include "KBF1.hpp"
#include "KBF2.hpp"
#include "KBFSparse.hpp"
#include "KBFSparseRelaxed.hpp"
#include "KBFUtil.hpp"
//
#include "JellyfishUtil.h"
using namespace std;
////////////////////////////////////////////////////////////////////////////////
// query a set of test kmers and write out results
////////////////////////////////////////////////////////////////////////////////
void queryKmers(vector<kmer_t>& test_kmers, unordered_set<kmer_t>& true_kmers, BaseBloomFilter& sbf, const string& out_fname) {
vector<bool> states;
// time this part
auto start = std::chrono::system_clock::now();
for (auto qk : test_kmers) {
bool state = sbf.contains(qk);
states.push_back(state);
}
auto end = std::chrono::system_clock::now();
std::chrono::duration<double> elapsed_seconds = end - start;
// cerr << "query time: " << elapsed_seconds.count() << " s" << endl;
// end the timing here
// write states and true answers to file
ofstream f_out(out_fname);
f_out << "kmer\tBF_state\ttrue_state" << endl;
for (int i = 0; i < states.size(); i++)
f_out << test_kmers[i] << "\t" << states[i] << "\t" << (true_kmers.find(test_kmers[i]) != true_kmers.end()) << endl;
f_out.close();
}
////////////////////////////////////////////////////////////////////////////////
// Sample a subset of the kmers
////////////////////////////////////////////////////////////////////////////////
vector<kmer_t> sample_kmers(unordered_set<kmer_t>& kmer_set, int const set_size, const int K, bool TP = false) {
vector<kmer_t> query_kmers;
// store the kmer set in a vector so that can sample from it
vector<kmer_t> kmer_vec;
for (auto km : kmer_set) kmer_vec.push_back(km);
// set up a random number gen
std::random_device rd;
std::mt19937 gen(rd());
std::uniform_int_distribution<> sample_dis(0, kmer_set.size() - 1);
std::uniform_int_distribution<> shift_dis(0, K - 1);
std::uniform_int_distribution<> base_dis(0, 3);
const char base_table[4] = {'A', 'C', 'G', 'T'};
//sample the input kmers
int i = query_kmers.size();
while (i < set_size) {
auto r = sample_dis(gen);
assert(r < kmer_set.size());
kmer_t sample_kmer = kmer_vec[r];
// mutate the sampled kmer
if (!TP) {
string string_kmer = mer_binary_to_string(sample_kmer, K);
auto base = base_table[base_dis(gen)];
auto ind = shift_dis(gen);
while (string_kmer[ind] == base) {
base = base_table[base_dis(gen)];
}
string_kmer[ind] = base;
sample_kmer = mer_string_to_binary(string_kmer.c_str(), K);
}
query_kmers.push_back(sample_kmer);
i++;
}
return query_kmers;
}
/////////////////////////////////////////////////////////
// main
/////////////////////////////////////////////////////////
// Usage:
//./kbf <input fasta> <query fasta> <k> [outfile prefix = 'test'] [# queries = 1000000] [use all TP = false]
int main(int argc, char* argv[]) {
if (argc < 4) {
cerr << "\tMissing required arguments." << endl;
cerr << "\tUsage:" << endl;
cerr << "\tkbf <reads.fa> <k> <query.fa> [outfile prefix = 'test'] [# queries = 1M] [use all TP = 'false']" << endl;
exit(1);
}
string input_fasta = argv[1];
int K = stoi(argv[2]);
unsigned long query_set_size = 1000000;
string base_prefix = "test";
string queryFilename = "";
bool TP = false;
if (argc > 3) {
queryFilename = argv[3];
}
if (argc > 4) {
base_prefix = argv[4];
}
if (argc > 5) {
query_set_size = stoi(argv[5]);
}
if (argc > 6) {
string TP_string = argv[6];
if (TP_string.compare("true") == 0)
TP = true;
else
assert(TP_string.compare("false") == 0);
}
unordered_set<kmer_t> read_kmers;
// parse input reads -- will build a kmer set from that
// cerr << "Parsing input fasta " << input_fasta << endl;
// auto start = std::chrono::system_clock::now();
vector<string> reads = parseFasta(input_fasta);
// auto end = std::chrono::system_clock::now();
// std::chrono::duration<double> elapsed_seconds = end - start;
// cerr << "Parsing: " << elapsed_seconds.count() << " s" << endl;
// cerr << "Getting kmers... " << endl;
// start = std::chrono::system_clock::now();
read_kmers = getKmers(reads, K);
// end = std::chrono::system_clock::now();
// elapsed_seconds = end - start;
// cerr << "Input kmers: " << read_kmers.size() << endl;
// cerr << "Getting kmers: " << elapsed_seconds.count() << " s" << endl;
// parse test reads and count the test kmers
// cerr << "Generating test kmers..." << endl;
// vector<kmer_t> query_kmers = sample_kmers(read_kmers, query_set_size, K, TP);
vector<kmer_t> query_kmers = getKmersVect(parseFasta(queryFilename), K);
// const std::vector<size_t> size_factors = {7, 8, 9, 10}; //1 hash function: []
const std::vector<size_t> size_factors = {26, 28, 30, 32, 35, 40};
std::cout << "{" << std::endl;
for (auto size_factor : size_factors) {
std::cout << " \"" << size_factor << "\": {" << std::endl;
std::string prefix = base_prefix + "_" + std::to_string(size_factor);
{
// Test the classic bloom filter
// cerr << "#### CLASSIC BLOOM FILTER ####" << endl;
auto start = std::chrono::high_resolution_clock::now();
BaseBloomFilter b(K, read_kmers.size(), size_factor);
b.populate(read_kmers);
auto end = std::chrono::high_resolution_clock::now();
cout << " \"classic_index\": " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "," << endl;
start = std::chrono::high_resolution_clock::now();
cout << " \"number_of_elements\": " << read_kmers.size() << "," << std::endl;
cout << " \"size\": " << read_kmers.size() * size_factor << "," << std::endl;
queryKmers(query_kmers, read_kmers, b, prefix + "_classic.txt");
end = std::chrono::high_resolution_clock::now();
cout << " \"classic_query\": " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "," << endl;
}
{
// Test KBF1
// cerr << "#### KBF1 ####" << endl;
auto start = std::chrono::high_resolution_clock::now();
KBF1 kbf1(K, read_kmers, size_factor, 1);
auto end = std::chrono::high_resolution_clock::now();
cout << " \"kbf1_index\": " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "," << endl;
start = std::chrono::high_resolution_clock::now();
queryKmers(query_kmers, read_kmers, kbf1, prefix + "_kbf1.txt");
end = std::chrono::high_resolution_clock::now();
cout << " \"kbf1_query\": " << std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count() << "" << endl;
}
std::cout << " }";
if (size_factor < 40) {
cout << ",";
}
cout << std::endl;
}
cout << "}";
}