Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Change -f 4 output. #7

Open
wants to merge 9 commits into
base: master
Choose a base branch
from
Prev Previous commit
Next Next commit
Reformat code with clang-format (version 12).
Reformat code with clang-format (version 12):

    clang-format -style=LLVM -i *.cpp *.hpp

Fix the following errors in cbust.cpp and
remove_overlapping_segments.hpp after reformatting:

    error: ‘>>’ should be ‘> >’ within a nested template argument list

Not all clang-format changes are applied (like formating of lines
that print text).
ghuls committed Apr 11, 2022
commit 2b79593cd20e6e70475587b1d03d93aeb8322df1
2 changes: 1 addition & 1 deletion MCFbio.hpp
Original file line number Diff line number Diff line change
@@ -89,7 +89,7 @@ void count_oligos(
std::vector<unsigned> &counts, // place to store counts
unsigned oli_len, // length of oligos to count
unsigned alphsize); // size of alphabet
}
} // namespace mcf

inline char mcf::number_to_DNA(unsigned b) {
static const char lookup[] = "acgtn";
2 changes: 1 addition & 1 deletion MCFgen.hpp
Original file line number Diff line number Diff line change
@@ -43,7 +43,7 @@ template <class It> bool is_reverse(It start, It end);
// no guarantee that the answer will be exactly 1
// the range had better contain floating-point values!
template <class It> void normalize(It start, It end);
}
} // namespace mcf

template <class T> inline std::string mcf::tostring(T x) {
std::ostringstream temp;
39 changes: 27 additions & 12 deletions args.cpp
Original file line number Diff line number Diff line change
@@ -20,7 +20,7 @@ bool genomic_coordinates = false;
bool zero_based = false;
double tau = 0;
bool verbose = false;
}
} // namespace args

void args::parse(int argc, char **argv) {
const string doc =
@@ -218,20 +218,32 @@ void args::parse(int argc, char **argv) {
"-h Help: print documentation\n"
"-V Show version\n"
"-v Verbose\n"
"-c Cluster score threshold (" + mcf::tostring(score_thresh) + ")\n"
"-m Motif score threshold (" + mcf::tostring(motif_thresh) + ")\n"
"-c Cluster score threshold (" +
mcf::tostring(score_thresh) +
")\n"
"-m Motif score threshold (" +
mcf::tostring(motif_thresh) +
")\n"
"-g Expected gap (bp) between neighboring motifs in a cluster (" +
mcf::tostring(e_gap) + ")\n"
mcf::tostring(e_gap) +
")\n"
"-r Range in bp for counting local nucleotide abundances (" +
mcf::tostring(bg_range) + ")\n"
"-b Background padding in bp (" + mcf::tostring(bg_padding) + ")\n"
mcf::tostring(bg_range) +
")\n"
"-b Background padding in bp (" +
mcf::tostring(bg_padding) +
")\n"
"-l Mask lowercase letters\n"
"-p Pseudocount (" + mcf::tostring(pseudo) + ")\n"
"-p Pseudocount (" +
mcf::tostring(pseudo) +
")\n"
"-t Keep top X clusters per sequence (0 (= all))\n"
"-G Use genomic coordinates (extracted from sequence name)\n"
" 0: zero-based start coordinate\n"
" 1: one-based start coordinate\n"
"-f Output format (" + mcf::tostring(out_format) + ")\n"
"-f Output format (" +
mcf::tostring(out_format) +
")\n"
" 0: per sequence (default)\n"
" 1: per sequence, concise format\n"
" 2: sorted by cluster score\n"
@@ -350,13 +362,16 @@ void args::print(ostream &strm, uint seq_num, uint mat_num) {
<< "Pseudocount: " << pseudo << '\n'
<< "Keep top X clusters per sequence: "
<< (keep_top_x_clusters_per_sequence > 0
? mcf::tostring(keep_top_x_clusters_per_sequence) : "0 (= all)") << '\n'
? mcf::tostring(keep_top_x_clusters_per_sequence)
: "0 (= all)")
<< '\n'
<< "Extract genomic coordinates from sequence name: "
<< (genomic_coordinates
? (zero_based) ? "ON (zero-based)\n" : "ON (one-based)\n"
: "OFF\n")
? (zero_based) ? "ON (zero-based)\n" : "ON (one-based)\n"
: "OFF\n")
<< "Cluster score threshold: " << score_thresh << '\n'
<< "Motif score threshold: " << motif_thresh << '\n'
<< "Motif score threshold: " << motif_thresh
<< '\n'
// << "Tau: " << tau << '\n'
<< flush;
}
11 changes: 9 additions & 2 deletions args.hpp
Original file line number Diff line number Diff line change
@@ -11,7 +11,14 @@ extern double score_thresh; // Print clusters with scores >= this
extern double motif_thresh; // Print motifs inside clusters with scores >= this
extern double e_gap; // Expected distance between pairs of cis-elements
extern bool gap_specified; // Did the user specify a value for e_gap?
enum format { BY_SEQUENCE, BY_SEQUENCE_CONCISE, BY_SCORE, BY_SCORE_CONCISE, SEQUENCE_NAME_SORTED_BY_SCORE, BED };
enum format {
BY_SEQUENCE,
BY_SEQUENCE_CONCISE,
BY_SCORE,
BY_SCORE_CONCISE,
SEQUENCE_NAME_SORTED_BY_SCORE,
BED
};
extern format out_format;
extern uint bg_range; // Go up to this far either side of current base
// when counting local base abundances
@@ -28,4 +35,4 @@ extern bool verbose; // Be verbose or not

void parse(int argc, char **argv);
void print(ostream &strm, uint seq_num, uint mat_num);
}
} // namespace args
99 changes: 51 additions & 48 deletions cbust.cpp
Original file line number Diff line number Diff line change
@@ -145,35 +145,40 @@ void output_sequence_name_sorted_by_score(ostream &strm, const vector<seq_info>
}

// Return "log(1+exp(x))" evaluated carefully for largish "x".
// This is also called the "softplus": https://en.wikipedia.org/wiki/Rectifier_(neural_networks)
// transformation, being a smooth approximation to "max(0,x)".
// This is also called the "softplus":
// https://en.wikipedia.org/wiki/Rectifier_(neural_networks) transformation,
// being a smooth approximation to "max(0,x)".
//
// See:
// - Martin Maechler (2012) "Accurately Computing log(1 − exp(− |a|))": http://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf
// - Julia LegExpFunctions package: https://github.com/JuliaStats/LogExpFunctions.jl/blob/master/src/basicfuns.jl
// - Martin Maechler (2012) "Accurately Computing log(1 − exp(− |a|))":
// http://cran.r-project.org/web/packages/Rmpfr/vignettes/log1mexp-note.pdf
// - Julia LegExpFunctions package:
// https://github.com/JuliaStats/LogExpFunctions.jl/blob/master/src/basicfuns.jl
inline double log1pexp(double x) {
// Set thresholds x0, x1, x2 such that:
// - log1pexp(x) ≈ exp(x) for x ≤ x0
// - log1pexp(x) ≈ log1p(exp(x)) for x0 < x ≤ x1
// - log1pexp(x) ≈ x + exp(-x) for x1 < x ≤ x2
// - log1pexp(x) ≈ x for x > x2
// where the tolerances of the approximations are on the order of eps(typeof(x)).
double x0 = -36.7368005696771;
double x1 = 18.021826694558577;
double x2 = 33.23111882352963;

if (x < x0) {
return exp(x);
} else if (x < x1) {
return log1p(exp(x));
} else if (x < x2) {
return x + exp(-x);
} else {
return x;
}
// Set thresholds x0, x1, x2 such that:
// - log1pexp(x) ≈ exp(x) for x ≤ x0
// - log1pexp(x) ≈ log1p(exp(x)) for x0 < x ≤ x1
// - log1pexp(x) ≈ x + exp(-x) for x1 < x ≤ x2
// - log1pexp(x) ≈ x for x > x2
// where the tolerances of the approximations are on the order of
// eps(typeof(x)).
double x0 = -36.7368005696771;
double x1 = 18.021826694558577;
double x2 = 33.23111882352963;

if (x < x0) {
return exp(x);
} else if (x < x1) {
return log1p(exp(x));
} else if (x < x2) {
return x + exp(-x);
} else {
return x;
}
}

inline cb::seq_info cb::get_chrom_and_pos(string &seq_name, uint length, bool zero_based) {
inline cb::seq_info cb::get_chrom_and_pos(string &seq_name, uint length,
bool zero_based) {
// Extract chromosome name, start position and extra info from
// sequence name.
//
@@ -444,8 +449,8 @@ void cb::scan_seq(uint seq_num) {
vector<s_segment> s_segs;

if (args::bg_padding * 2 + 1 > seq.size()) {
mcf::die("Sequence should be at least "
+ to_string(args::bg_padding * 2 + 1) + " bp long.");
mcf::die("Sequence should be at least " +
to_string(args::bg_padding * 2 + 1) + " bp long.");
}

for (uint i = 0; i < mats.size(); ++i) {
@@ -460,8 +465,9 @@ void cb::scan_seq(uint seq_num) {

for (vector<segment>::iterator s = segs.begin(); s != segs.end(); ++s) {
// cerr << s->first << " " << s->second << endl;
uint start =
s->first + 2 > args::bg_padding + max_motif_width ? s->first + 2 - max_motif_width : args::bg_padding;
uint start = s->first + 2 > args::bg_padding + max_motif_width
? s->first + 2 - max_motif_width
: args::bg_padding;
start = backward(start, s->second, bg, scores, mat_names.size());
if (scores[start] > args::score_thresh) {
s_segs.push_back(s_segment(start, s->second, scores[start]));
@@ -474,12 +480,12 @@ void cb::scan_seq(uint seq_num) {
// Remove overlapping segements (returned segments are sorted by position).
mcf::remove_overlapping_segments(s_segs, s_segs); // overkill

if (args::keep_top_x_clusters_per_sequence > 0
&& args::keep_top_x_clusters_per_sequence < s_segs.size()) {
// Sort by cluster score again.
sort(s_segs.begin(), s_segs.end(), byscore<s_segment>());
// Keep only the top X clusters per sequence.
s_segs.resize(args::keep_top_x_clusters_per_sequence);
if (args::keep_top_x_clusters_per_sequence > 0 &&
args::keep_top_x_clusters_per_sequence < s_segs.size()) {
// Sort by cluster score again.
sort(s_segs.begin(), s_segs.end(), byscore<s_segment>());
// Keep only the top X clusters per sequence.
s_segs.resize(args::keep_top_x_clusters_per_sequence);
}

for (vector<s_segment>::const_iterator s = s_segs.begin(); s != s_segs.end();
@@ -599,10 +605,7 @@ void cb::get_matrices() {
cout << "\n";
}*/
transform(mats[m][0], mats[m][1], mats[m][0],
[&](double const& elem) {
return elem + term;
}
);
[&](double const &elem) { return elem + term; });
}
}

@@ -708,12 +711,11 @@ void cb::output_by_seq_bed(ostream &strm, const seq_info &seq) {
<< "-\t"
<< ((seq.extra_info != "") ? seq.extra_info : "-") << "\n";

for (vector<motif>::const_iterator h = r->hits.begin();
h != r->hits.end(); ++h) {
for (vector<motif>::const_iterator h = r->hits.begin(); h != r->hits.end();
++h) {
// Get motif type contribution score.
vector<string>::iterator mat_names_iterator = find(mat_names.begin(),
mat_names.end(),
h->name);
vector<string>::iterator mat_names_iterator =
find(mat_names.begin(), mat_names.end(), h->name);
int matrix_name_index = distance(mat_names.begin(), mat_names_iterator);

strm << seq.chrom << "\t"
@@ -843,9 +845,10 @@ int main(int argc, char **argv) {
is >> seq_name; // get first word (?)

if (args::genomic_coordinates) {
seqs.push_back(cb::get_chrom_and_pos(seq_name, cb::seq.size(), args::zero_based));
seqs.push_back(
cb::get_chrom_and_pos(seq_name, cb::seq.size(), args::zero_based));
} else {
seqs.push_back(cb::seq_info(seq_name, cb::seq.size()));
seqs.push_back(cb::seq_info(seq_name, cb::seq.size()));
}

if (args::verbose) {
@@ -894,9 +897,9 @@ int main(int argc, char **argv) {

if (args::out_format != args::SEQUENCE_NAME_SORTED_BY_SCORE &&
args::out_format != args::BED) {
cout.precision(6); // reset to default precision
args::print(cout, seqs.size(),
cb::mat_names.size()); // print command line arguments
cout.precision(6); // reset to default precision
args::print(cout, seqs.size(),
cb::mat_names.size()); // print command line arguments
}

cout << std::flush;
2 changes: 1 addition & 1 deletion get_cbust_pssm.hpp
Original file line number Diff line number Diff line change
@@ -12,4 +12,4 @@ std::istream &get_cbust_pssm(
float &weight, // = 1 by default if not specified
float &gap, // = -1 by default if not specified
unsigned alphsize = 4); // perhaps it should guess alphsize from the input
}
} // namespace cb
2 changes: 1 addition & 1 deletion matrix.hpp
Original file line number Diff line number Diff line change
@@ -57,6 +57,6 @@ template <class T> class matrix {
return mcf::is_reverse(data.begin(), data.end());
}
};
}
} // namespace mcf

#endif
2 changes: 1 addition & 1 deletion remove_overlapping_segments.hpp
Original file line number Diff line number Diff line change
@@ -18,7 +18,7 @@ namespace { // anonymous namespace: accessible from this file only
template <class T> struct lesspos { // sort criterion for segments
bool operator()(const T &a, const T &b) const { return a.end < b.start; }
};
}
} // namespace

template <class T>
void mcf::remove_overlapping_segments(const std::vector<T> &in,