From 9053414a7224b295157e1e7934f135e431984549 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Wed, 19 Jun 2013 14:24:55 -0300 Subject: [PATCH 01/23] Consensus class --- src/CMakeLists.txt | 4 +++- src/Consensus.cpp | 15 +++++++++++++ src/Consensus.hpp | 55 ++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 73 insertions(+), 1 deletion(-) create mode 100644 src/Consensus.cpp create mode 100644 src/Consensus.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 02d52640..5ea95957 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,7 +17,9 @@ include_directories(../lang/src .) ADD_LIBRARY(ToPS SHARED Symbol Alphabet util SparseMatrix MultipleAlignment GHMMStates DiscreteIIDModel VariableLengthMarkovChain InhomogeneousMarkovChain HiddenMarkovModel ConfigurationReader DiscreteIIDModelCreator VariableLengthMarkovChainCreator InhomogeneousMarkovChainCreator HiddenMarkovModelCreator GeneralizedHiddenMarkovModelCreator SequenceEntry SequenceFactory GeneralizedHiddenMarkovModel TrainVariableLengthMarkovChain TrainFixedLengthMarkovChain TrainInterpolatedMarkovChain TrainHMMBaumWelch TrainHMMMaximumLikelihood ContextTree TrainWeightArrayModel BayesianInformationCriteria TrainVariableLengthInhomogeneousMarkovChain ProbabilisticModelCreatorClient AkaikeInformationCriteria TrainPhasedMarkovChain FactorableModel DecodableModel InhomogeneousFactorableModel SmoothedHistogramKernelDensity FixedSequenceAtPosition FixedSequenceAtPositionCreator PhasedRunLengthDistribution PhasedRunLengthDistributionCreator SmoothedHistogramStanke TrainPhasedMarkovChainContextAlgorithm RemoveSequenceFromModel SmoothedHistogramBurge SequenceFormat TrainDiscreteIIDModel TargetModel TargetModelCreator ReverseComplementDNA ReverseComplementDNACreator ProbabilisticModelParameter TrainGHMMTransitions BernoulliModelCreator TrainInterpolatedPhasedMarkovChain SimilarityBasedSequenceWeightingCreator SimilarityBasedSequenceWeighting TrainSimilarityBasedSequenceWeighting MultipleSequentialModelCreator MultipleSequentialModel StoreLoadedModel PairHiddenMarkovModel PairHiddenMarkovModelCreator TrainPHMMBaumWelch MaximumDependenceDecomposition crossplatform.hpp ProbabilisticModelConfiguration ProbabilisticModelParameterValue2 -../lang/src/ASTNode ../lang/src/PropertyNode ../lang/src/KeyNode ../lang/src/ValueNode ../lang/src/StringNode ../lang/src/ListNode ../lang/src/ConfigurationNode ../lang/src/IntegerNode ../lang/src/FloatNode ../lang/src/ConditionalProbabilityNode ../lang/src/ConditionalProbabilityMapNode ../lang/src/ConditionNode ../lang/src/ProbabilityMapNode ../lang/src/ProbabilityNode ../lang/src/parser ../lang/src/tokens ../lang/src/lang ../lang/src/ToPSLangVisitor ProfileHiddenMarkovModel ProfileHiddenMarkovModelCreator TrainProfileHMMMaxLikelihood TrainProfileHMMBaumWelch) +../lang/src/ASTNode ../lang/src/PropertyNode ../lang/src/KeyNode ../lang/src/ValueNode ../lang/src/StringNode ../lang/src/ListNode ../lang/src/ConfigurationNode ../lang/src/IntegerNode ../lang/src/FloatNode ../lang/src/ConditionalProbabilityNode ../lang/src/ConditionalProbabilityMapNode ../lang/src/ConditionNode ../lang/src/ProbabilityMapNode ../lang/src/ProbabilityNode ../lang/src/parser ../lang/src/tokens ../lang/src/lang ../lang/src/ToPSLangVisitor ProfileHiddenMarkovModel ProfileHiddenMarkovModelCreator TrainProfileHMMMaxLikelihood TrainProfileHMMBaumWelch +Consensus +) diff --git a/src/Consensus.cpp b/src/Consensus.cpp new file mode 100644 index 00000000..f2834ee4 --- /dev/null +++ b/src/Consensus.cpp @@ -0,0 +1,15 @@ +#include "Consensus.hpp" + +namespace tops { + bool Consensus::is(int symbol) { + for (std::vector::iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + if (*it == symbol) + return true; + } + return false; + } + + double Consensus::getFrequency() { + return _frequency; + } +} \ No newline at end of file diff --git a/src/Consensus.hpp b/src/Consensus.hpp new file mode 100644 index 00000000..7ac312e7 --- /dev/null +++ b/src/Consensus.hpp @@ -0,0 +1,55 @@ +/* + * ConfigurationReader.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef CONFIGURATION_READER_HPP +#define CONFIGURATION_READER_HPP +#include + +#include "crossplatform.hpp" + + #include "Sequence.hpp" + +#include +#include +#include + + +using namespace std; + + +namespace tops { + + class Consensus { + public: + Consensus(Sequence symbols, double frequency):_symbols(symbols), _frequency(frequency) {} + bool is(int symbol); + double getFrequency(); + private: + Sequence _symbols; + double _frequency; + }; + +} + +#endif From 539b08c4ac663187a5496acde60abb5591896300 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 20 Jun 2013 11:15:47 -0300 Subject: [PATCH 02/23] chi-squared and dependece between the concensus and the sequence --- src/Consensus.cpp | 34 +++++++++++++++++++++++++++++++++- src/Consensus.hpp | 23 ++++++++++++++++++----- 2 files changed, 51 insertions(+), 6 deletions(-) diff --git a/src/Consensus.cpp b/src/Consensus.cpp index f2834ee4..6eb663eb 100644 --- a/src/Consensus.cpp +++ b/src/Consensus.cpp @@ -9,7 +9,39 @@ namespace tops { return false; } - double Consensus::getFrequency() { + vector Consensus::getFrequency() { return _frequency; } + + void ConsensusDependence::init() { + + } + + double ConsensusDependence::chi(int ci, int xi) { + vector freq_e = _consensus_sequence[ci].getFrequency(); + vector freq_o(freq_e.size(), 0); + std::vector::iterator it; + for (it = _sequences.begin(); it != _sequences.end(); it++) { + freq_o[(*it)[xi]] += 1; + } + + double chi = 0; + for (int i = 0; i < freq_e.size(); i++) { + chi += pow(freq_e[i] - (freq_o[i]/_sequences.size()), 2)/freq_e[i]; + } + + return chi; + } + + Matrix ConsensusDependence::dependences() { + Matrix dep(_consensus_sequence.size(), _consensus_sequence.size()); + for (int i = 0; i < _consensus_sequence.size(); i++) { + for (int j = 0; j < _consensus_sequence.size(); j++) { + if (i != j) { + dep(i, j) = chi(i, j); + } + } + } + return dep; + } } \ No newline at end of file diff --git a/src/Consensus.hpp b/src/Consensus.hpp index 7ac312e7..2a69a06e 100644 --- a/src/Consensus.hpp +++ b/src/Consensus.hpp @@ -27,8 +27,8 @@ #include #include "crossplatform.hpp" - - #include "Sequence.hpp" +#include "util.hpp" +#include "Sequence.hpp" #include #include @@ -42,12 +42,25 @@ namespace tops { class Consensus { public: - Consensus(Sequence symbols, double frequency):_symbols(symbols), _frequency(frequency) {} + Consensus(Sequence symbols, vector frequency):_symbols(symbols), _frequency(frequency) {} bool is(int symbol); - double getFrequency(); + vector getFrequency(); private: Sequence _symbols; - double _frequency; + vector _frequency; + }; + + typedef std::vector ConsensusSequence; + + class ConsensusDependence { + public: + ConsensusDependence(ConsensusSequence consensus_sequence, SequenceList sequences):_consensus_sequence(consensus_sequence), _sequences(sequences) { init(); } + void init(); + double chi(int ci, int xi); + Matrix dependences(); + private: + ConsensusSequence _consensus_sequence; + SequenceList _sequences; }; } From 18a1cfc2c439f605c369d5b6e4e4563560deb08e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 25 Jun 2013 15:26:31 -0300 Subject: [PATCH 03/23] fixing Consensus.hpp --- src/Consensus.hpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Consensus.hpp b/src/Consensus.hpp index 2a69a06e..9ac8ef7f 100644 --- a/src/Consensus.hpp +++ b/src/Consensus.hpp @@ -22,8 +22,8 @@ * MA 02110-1301, USA. */ -#ifndef CONFIGURATION_READER_HPP -#define CONFIGURATION_READER_HPP +#ifndef CONSENSUS_HPP +#define CONSENSUS_HPP #include #include "crossplatform.hpp" From 6b9ad4ab175b74f52d270c8f5d39a8e9efc7c6e8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 25 Jun 2013 16:18:23 -0300 Subject: [PATCH 04/23] chi-square test --- src/CMakeLists.txt | 1 + src/ChiSquare.cpp | 78 ++++++++++++++++++++++++++++++++++++++++++++++ src/ChiSquare.hpp | 36 +++++++++++++++++++++ 3 files changed, 115 insertions(+) create mode 100644 src/ChiSquare.cpp create mode 100644 src/ChiSquare.hpp diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 5ea95957..c8dee55d 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -19,6 +19,7 @@ ADD_LIBRARY(ToPS SHARED Symbol Alphabet util SparseMatrix MultipleAlignment GHM crossplatform.hpp ProbabilisticModelConfiguration ProbabilisticModelParameterValue2 ../lang/src/ASTNode ../lang/src/PropertyNode ../lang/src/KeyNode ../lang/src/ValueNode ../lang/src/StringNode ../lang/src/ListNode ../lang/src/ConfigurationNode ../lang/src/IntegerNode ../lang/src/FloatNode ../lang/src/ConditionalProbabilityNode ../lang/src/ConditionalProbabilityMapNode ../lang/src/ConditionNode ../lang/src/ProbabilityMapNode ../lang/src/ProbabilityNode ../lang/src/parser ../lang/src/tokens ../lang/src/lang ../lang/src/ToPSLangVisitor ProfileHiddenMarkovModel ProfileHiddenMarkovModelCreator TrainProfileHMMMaxLikelihood TrainProfileHMMBaumWelch Consensus +ChiSquare ) diff --git a/src/ChiSquare.cpp b/src/ChiSquare.cpp new file mode 100644 index 00000000..5c69a074 --- /dev/null +++ b/src/ChiSquare.cpp @@ -0,0 +1,78 @@ +#include "ChiSquare.hpp" + +namespace tops { + + double CHI_QUARE[31][9] = { + {0.00098, 1.64, 2.71, 3.84, 5.02, 6.63, 7.88, 10.83, 12.12}, + {0.051, 3.22, 4.61, 5.99, 7.38, 9.21, 10.60, 13.82, 15.20}, + {0.216, 4.64, 6.25, 7.81, 9.35, 11.34, 12.84, 16.27, 17.73}, + {0.48, 5.99, 7.78, 9.49, 11.14, 13.28, 14.86, 18.47, 20.00}, + {0.83, 7.29, 9.24, 11.07, 12.83, 15.09, 16.75, 20.51, 22.11}, + {1.24, 8.56, 10.64, 12.59, 14.45, 16.81, 18.55, 22.46, 24.10}, + {1.69, 9.80, 12.02, 14.07, 16.01, 18.48, 20.28, 24.32, 26.02}, + {2.18, 11.03, 13.36, 15.51, 17.53, 20.09, 21.95, 26.12, 27.87}, + {2.70, 12.24, 14.68, 16.92, 19.02, 21.67, 23.59, 27.88, 29.67}, + {3.25, 13.44, 15.99, 18.31, 20.48, 23.21, 25.19, 29.59, 31.42}, + {3.82, 14.63, 17.28, 19.68, 21.92, 24.73, 26.76, 31.26, 33.14}, + {4.40, 15.81, 18.55, 21.03, 23.34, 26.22, 28.30, 32.91, 34.82}, + {5.01, 16.98, 19.81, 22.36, 24.74, 27.69, 29.82, 34.53, 36.48}, + {5.63, 18.15, 21.06, 23.68, 26.12, 29.14, 31.32, 36.12, 38.11}, + {6.26, 19.31, 22.31, 25.00, 27.49, 30.58, 32.80, 37.70, 39.72}, + {6.91, 20.47, 23.54, 26.30, 28.85, 32.00, 34.27, 39.25, 41.31}, + {7.56, 21.61, 24.77, 27.59, 30.19, 33.41, 35.72, 40.79, 42.88}, + {8.23, 22.76, 25.99, 28.87, 31.53, 34.81, 37.16, 42.31, 44.43}, + {8.91, 23.90, 27.20, 30.14, 32.85, 36.19, 38.58, 43.82, 45.97}, + {9.59, 25.04, 28.41, 31.41, 34.17, 37.57, 40.00, 45.31, 47.50}, + {10.28, 26.17, 29.62, 32.67, 35.48, 38.93, 41.40, 46.80, 49.01}, + {10.98, 27.30, 30.81, 33.92, 36.78, 40.29, 42.80, 48.27, 50.51}, + {11.69, 28.43, 32.01, 35.17, 38.08, 41.64, 44.18, 49.73, 52.00}, + {12.40, 29.55, 33.20, 36.42, 39.36, 42.98, 45.56, 51.18, 53.48}, + {13.12, 30.68, 34.38, 37.65, 40.65, 44.31, 46.93, 52.62, 54.95}, + {16.79, 36.25, 40.26, 43.77, 46.98, 50.89, 53.67, 59.70, 62.16}, + {24.43, 47.27, 51.81, 55.76, 59.34, 63.69, 66.77, 73.40, 76.10}, + {32.36, 58.16, 63.17, 67.50, 71.42, 76.15, 79.49, 86.66, 89.56}, + {40.48, 68.97, 74.40, 79.08, 83.30, 88.38, 91.95, 99.61, 102.7}, + {57.15, 90.41, 96.58, 101.9, 106.6, 112.3, 116.3, 124.8, 128.3}, + {74.22, 111.7, 118.5, 124.3, 129.6, 135.8, 140.2, 149.4, 153.2} + }; + + int chiPi(double p) { + double prob[] = {0.975, 0.2, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001, 0.0005}; + int pi = 0; + for (int i = 0; i < 9; i++) { + if (p >= prob[i]) { + if (i == 0) + pi = 0; + else if (abs(p - prob[i]) > abs(p - prob[i-1])) + pi = i-1; + else + pi = i; + break; + } + } + return pi; + } + + int chiDfi(int df) { + if (df <= 25) + return df-1; + + int dfi = 30; + while ((df - dfi) > 0) { + dfi += 10; + if (dfi == 80) { + if (abs(df - 80) > abs(df - 100)) + return 30; + else + return 29; + } + } + return ((dfi - 30)/10 + 25); + } + + double chi(double p, int df) { + int pi = chiPi(p); + int dfi = chiDfi(df); + return CHI_QUARE[dfi][pi]; + } +} \ No newline at end of file diff --git a/src/ChiSquare.hpp b/src/ChiSquare.hpp new file mode 100644 index 00000000..f0938b05 --- /dev/null +++ b/src/ChiSquare.hpp @@ -0,0 +1,36 @@ +/* + * ConfigurationReader.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef CHI_SQUARE_HPP +#define CHI_SQUARE_HPP +#include + +#include "crossplatform.hpp" + + +namespace tops { + double chi(double p, int df); +} + +#endif From b1e9a395c3bd0380d3bf710a5370840d0a07e387 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 27 Jun 2013 09:55:29 -0300 Subject: [PATCH 05/23] consensus sequence --- app/CMakeLists.txt | 2 + src/CMakeLists.txt | 1 + src/ChiSquare.cpp | 2 +- src/ChiSquare.hpp | 2 +- src/Consensus.cpp | 38 +----- src/Consensus.hpp | 16 +-- src/MaximalDependenceDecomposition.cpp | 4 + src/MaximalDependenceDecomposition.hpp | 48 ++++++++ src/cmake_install.cmake | 156 ++++++++++++------------- 9 files changed, 142 insertions(+), 127 deletions(-) create mode 100644 src/MaximalDependenceDecomposition.cpp create mode 100644 src/MaximalDependenceDecomposition.hpp diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index a7e9f3bf..fe7a245a 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -36,6 +36,7 @@ ADD_EXECUTABLE(predalign predalign.cpp) ADD_EXECUTABLE(runAllPredalignTests runAllPredalignTests.cpp) ADD_EXECUTABLE(print_graph_from_model print_graph_from_model.cpp) ADD_EXECUTABLE(profileHMMtests profileHMMtests.cpp) +ADD_EXECUTABLE(teste_mdd teste_mdd.cpp) TARGET_LINK_LIBRARIES(evaluate ${APP_DEP} ) TARGET_LINK_LIBRARIES(train ${APP_DEP} ) @@ -60,6 +61,7 @@ TARGET_LINK_LIBRARIES(runAllPredalignTests ${APP_DEP} ) TARGET_LINK_LIBRARIES(posterior_decoding ${APP_DEP} ) TARGET_LINK_LIBRARIES(print_graph_from_model ${APP_DEP} ) TARGET_LINK_LIBRARIES(profileHMMtests ${APP_DEP} ) +TARGET_LINK_LIBRARIES(teste_mdd ${APP_DEP} ) install(TARGETS train viterbi_decoding mea_decoding simulate testeFBGHMM testeGPHMM simulateAlignment align predalign runAllPredalignTests bayes_classifier one_file_model sliding_window evaluate kullback_positional posterior_probabilities posterior_decoding print_graph_from_model profileHMMtests RUNTIME DESTINATION bin diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index c8dee55d..9efd3e60 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -20,6 +20,7 @@ crossplatform.hpp ProbabilisticModelConfiguration ProbabilisticModelParameterVal ../lang/src/ASTNode ../lang/src/PropertyNode ../lang/src/KeyNode ../lang/src/ValueNode ../lang/src/StringNode ../lang/src/ListNode ../lang/src/ConfigurationNode ../lang/src/IntegerNode ../lang/src/FloatNode ../lang/src/ConditionalProbabilityNode ../lang/src/ConditionalProbabilityMapNode ../lang/src/ConditionNode ../lang/src/ProbabilityMapNode ../lang/src/ProbabilityNode ../lang/src/parser ../lang/src/tokens ../lang/src/lang ../lang/src/ToPSLangVisitor ProfileHiddenMarkovModel ProfileHiddenMarkovModelCreator TrainProfileHMMMaxLikelihood TrainProfileHMMBaumWelch Consensus ChiSquare +MaximalDependenceDecomposition ) diff --git a/src/ChiSquare.cpp b/src/ChiSquare.cpp index 5c69a074..de39b08f 100644 --- a/src/ChiSquare.cpp +++ b/src/ChiSquare.cpp @@ -70,7 +70,7 @@ namespace tops { return ((dfi - 30)/10 + 25); } - double chi(double p, int df) { + double chiSquare(double p, int df) { int pi = chiPi(p); int dfi = chiDfi(df); return CHI_QUARE[dfi][pi]; diff --git a/src/ChiSquare.hpp b/src/ChiSquare.hpp index f0938b05..c59ee674 100644 --- a/src/ChiSquare.hpp +++ b/src/ChiSquare.hpp @@ -30,7 +30,7 @@ namespace tops { - double chi(double p, int df); + double chiSquare(double p, int df); } #endif diff --git a/src/Consensus.cpp b/src/Consensus.cpp index 6eb663eb..1c776bfb 100644 --- a/src/Consensus.cpp +++ b/src/Consensus.cpp @@ -9,39 +9,11 @@ namespace tops { return false; } - vector Consensus::getFrequency() { - return _frequency; - } - - void ConsensusDependence::init() { - - } - - double ConsensusDependence::chi(int ci, int xi) { - vector freq_e = _consensus_sequence[ci].getFrequency(); - vector freq_o(freq_e.size(), 0); - std::vector::iterator it; - for (it = _sequences.begin(); it != _sequences.end(); it++) { - freq_o[(*it)[xi]] += 1; - } - - double chi = 0; - for (int i = 0; i < freq_e.size(); i++) { - chi += pow(freq_e[i] - (freq_o[i]/_sequences.size()), 2)/freq_e[i]; - } - - return chi; - } - - Matrix ConsensusDependence::dependences() { - Matrix dep(_consensus_sequence.size(), _consensus_sequence.size()); - for (int i = 0; i < _consensus_sequence.size(); i++) { - for (int j = 0; j < _consensus_sequence.size(); j++) { - if (i != j) { - dep(i, j) = chi(i, j); - } - } + std::string Consensus::str() { + std::stringstream out; + for (std::vector::iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + out << (*it); } - return dep; + return out.str(); } } \ No newline at end of file diff --git a/src/Consensus.hpp b/src/Consensus.hpp index 9ac8ef7f..b762926f 100644 --- a/src/Consensus.hpp +++ b/src/Consensus.hpp @@ -42,27 +42,15 @@ namespace tops { class Consensus { public: - Consensus(Sequence symbols, vector frequency):_symbols(symbols), _frequency(frequency) {} + Consensus(Sequence symbols):_symbols(symbols) {} bool is(int symbol); - vector getFrequency(); + std::string str(); private: Sequence _symbols; - vector _frequency; }; typedef std::vector ConsensusSequence; - class ConsensusDependence { - public: - ConsensusDependence(ConsensusSequence consensus_sequence, SequenceList sequences):_consensus_sequence(consensus_sequence), _sequences(sequences) { init(); } - void init(); - double chi(int ci, int xi); - Matrix dependences(); - private: - ConsensusSequence _consensus_sequence; - SequenceList _sequences; - }; - } #endif diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp new file mode 100644 index 00000000..30d14b47 --- /dev/null +++ b/src/MaximalDependenceDecomposition.cpp @@ -0,0 +1,4 @@ +#include "MaximalDependenceDecomposition.hpp" + +namespace tops { +} \ No newline at end of file diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp new file mode 100644 index 00000000..5486d4bc --- /dev/null +++ b/src/MaximalDependenceDecomposition.hpp @@ -0,0 +1,48 @@ +/* + * MaximalDependenceDecomposition.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef MAXIMAL_DEPENDENCE_DECOMPOSITION_HPP +#define MAXIMAL_DEPENDENCE_DECOMPOSITION_HPP +#include + +#include "crossplatform.hpp" +#include "util.hpp" +#include "Sequence.hpp" + #include "Consensus.hpp" + +#include +#include +#include + + +using namespace std; + + +namespace tops { + + class MaximalDependenceDecomposition { + }; +} + +#endif diff --git a/src/cmake_install.cmake b/src/cmake_install.cmake index 8913670b..01fde601 100644 --- a/src/cmake_install.cmake +++ b/src/cmake_install.cmake @@ -1,4 +1,4 @@ -# Install script for directory: /Users/yoshiaki/work/programas/tops/src +# Install script for directory: /Users/igorbonadio/Projetos/tops/src # Set the install prefix IF(NOT DEFINED CMAKE_INSTALL_PREFIX) @@ -28,7 +28,7 @@ IF(NOT CMAKE_INSTALL_COMPONENT) ENDIF(NOT CMAKE_INSTALL_COMPONENT) IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") - FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/Users/yoshiaki/work/programas/tops/src/libToPS.dylib") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/Users/igorbonadio/Projetos/tops/src/libToPS.dylib") IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libToPS.dylib" AND NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libToPS.dylib") EXECUTE_PROCESS(COMMAND "/usr/bin/install_name_tool" @@ -42,82 +42,82 @@ ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unsp IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/tops" TYPE FILE FILES - "/Users/yoshiaki/work/programas/tops/src/AkaikeInformationCriteria.hpp" - "/Users/yoshiaki/work/programas/tops/src/Alphabet.hpp" - "/Users/yoshiaki/work/programas/tops/src/BayesianInformationCriteria.hpp" - "/Users/yoshiaki/work/programas/tops/src/ConfigurationReader.hpp" - "/Users/yoshiaki/work/programas/tops/src/ContextTree.hpp" - "/Users/yoshiaki/work/programas/tops/src/DecodableModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/DegenerateDistribution.hpp" - "/Users/yoshiaki/work/programas/tops/src/FactorableModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/FactorableModelPrefixSumArray.hpp" - "/Users/yoshiaki/work/programas/tops/src/DiscreteIIDModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/DiscreteIIDModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/FixedSequenceAtPosition.hpp" - "/Users/yoshiaki/work/programas/tops/src/FixedSequenceAtPositionCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/GHMMStates.hpp" - "/Users/yoshiaki/work/programas/tops/src/GeneralizedHiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/GeneralizedHiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/HiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/HiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/InhomogeneousFactorableModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/InhomogeneousMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/InhomogeneousMarkovChainCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/MultipleAlignment.hpp" - "/Users/yoshiaki/work/programas/tops/src/NullPrefixSumArray.hpp" - "/Users/yoshiaki/work/programas/tops/src/PhasedFactorableModelEvaluationAlgorithm.hpp" - "/Users/yoshiaki/work/programas/tops/src/PhasedRunLengthDistribution.hpp" - "/Users/yoshiaki/work/programas/tops/src/PhasedRunLengthDistributionCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/PrefixSumArray.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelCreatorClient.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelDecorator.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelParameter.hpp" - "/Users/yoshiaki/work/programas/tops/src/RemoveSequenceFromModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/ReverseComplementDNA.hpp" - "/Users/yoshiaki/work/programas/tops/src/ReverseComplementDNACreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/Sequence.hpp" - "/Users/yoshiaki/work/programas/tops/src/SequenceEntry.hpp" - "/Users/yoshiaki/work/programas/tops/src/SequenceFactory.hpp" - "/Users/yoshiaki/work/programas/tops/src/SequenceFormat.hpp" - "/Users/yoshiaki/work/programas/tops/src/SmoothedHistogramBurge.hpp" - "/Users/yoshiaki/work/programas/tops/src/SmoothedHistogramKernelDensity.hpp" - "/Users/yoshiaki/work/programas/tops/src/SmoothedHistogramStanke.hpp" - "/Users/yoshiaki/work/programas/tops/src/SparseMatrix.hpp" - "/Users/yoshiaki/work/programas/tops/src/Symbol.hpp" - "/Users/yoshiaki/work/programas/tops/src/TargetModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/TargetModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainDiscreteIIDModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainFixedLengthMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainInterpolatedMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainHMMBaumWelch.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainHMMMaximumLikelihood.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainPhasedMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainInterpolatedPhasedMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainPhasedMarkovChainContextAlgorithm.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainVariableLengthInhomogeneousMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainVariableLengthMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainWeightArrayModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/VariableLengthMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/VariableLengthMarkovChainCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/util.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainGHMMTransitions.hpp" - "/Users/yoshiaki/work/programas/tops/src/BernoulliModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/SimilarityBasedSequenceWeighting.hpp" - "/Users/yoshiaki/work/programas/tops/src/SimilarityBasedSequenceWeightingCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainSimilarityBasedSequenceWeighting.hpp" - "/Users/yoshiaki/work/programas/tops/src/MultipleSequentialModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/MultipleSequentialModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/StoreLoadedModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/PairHiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/PairHiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainPHMMBaumWelch.hpp" - "/Users/yoshiaki/work/programas/tops/src/MaximumDependenceDecomposition.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProfileHiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProfileHiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainProfileHMMMaxLikelihood.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainProfileHMMBaumWelch.hpp" + "/Users/igorbonadio/Projetos/tops/src/AkaikeInformationCriteria.hpp" + "/Users/igorbonadio/Projetos/tops/src/Alphabet.hpp" + "/Users/igorbonadio/Projetos/tops/src/BayesianInformationCriteria.hpp" + "/Users/igorbonadio/Projetos/tops/src/ConfigurationReader.hpp" + "/Users/igorbonadio/Projetos/tops/src/ContextTree.hpp" + "/Users/igorbonadio/Projetos/tops/src/DecodableModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/DegenerateDistribution.hpp" + "/Users/igorbonadio/Projetos/tops/src/FactorableModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/FactorableModelPrefixSumArray.hpp" + "/Users/igorbonadio/Projetos/tops/src/DiscreteIIDModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/DiscreteIIDModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/FixedSequenceAtPosition.hpp" + "/Users/igorbonadio/Projetos/tops/src/FixedSequenceAtPositionCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/GHMMStates.hpp" + "/Users/igorbonadio/Projetos/tops/src/GeneralizedHiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/GeneralizedHiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/HiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/HiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/InhomogeneousFactorableModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/InhomogeneousMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/InhomogeneousMarkovChainCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/MultipleAlignment.hpp" + "/Users/igorbonadio/Projetos/tops/src/NullPrefixSumArray.hpp" + "/Users/igorbonadio/Projetos/tops/src/PhasedFactorableModelEvaluationAlgorithm.hpp" + "/Users/igorbonadio/Projetos/tops/src/PhasedRunLengthDistribution.hpp" + "/Users/igorbonadio/Projetos/tops/src/PhasedRunLengthDistributionCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/PrefixSumArray.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelCreatorClient.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelDecorator.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelParameter.hpp" + "/Users/igorbonadio/Projetos/tops/src/RemoveSequenceFromModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/ReverseComplementDNA.hpp" + "/Users/igorbonadio/Projetos/tops/src/ReverseComplementDNACreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/Sequence.hpp" + "/Users/igorbonadio/Projetos/tops/src/SequenceEntry.hpp" + "/Users/igorbonadio/Projetos/tops/src/SequenceFactory.hpp" + "/Users/igorbonadio/Projetos/tops/src/SequenceFormat.hpp" + "/Users/igorbonadio/Projetos/tops/src/SmoothedHistogramBurge.hpp" + "/Users/igorbonadio/Projetos/tops/src/SmoothedHistogramKernelDensity.hpp" + "/Users/igorbonadio/Projetos/tops/src/SmoothedHistogramStanke.hpp" + "/Users/igorbonadio/Projetos/tops/src/SparseMatrix.hpp" + "/Users/igorbonadio/Projetos/tops/src/Symbol.hpp" + "/Users/igorbonadio/Projetos/tops/src/TargetModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/TargetModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainDiscreteIIDModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainFixedLengthMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainInterpolatedMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainHMMBaumWelch.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainHMMMaximumLikelihood.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainPhasedMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainInterpolatedPhasedMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainPhasedMarkovChainContextAlgorithm.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainVariableLengthInhomogeneousMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainVariableLengthMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainWeightArrayModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/VariableLengthMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/VariableLengthMarkovChainCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/util.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainGHMMTransitions.hpp" + "/Users/igorbonadio/Projetos/tops/src/BernoulliModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/SimilarityBasedSequenceWeighting.hpp" + "/Users/igorbonadio/Projetos/tops/src/SimilarityBasedSequenceWeightingCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainSimilarityBasedSequenceWeighting.hpp" + "/Users/igorbonadio/Projetos/tops/src/MultipleSequentialModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/MultipleSequentialModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/StoreLoadedModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/PairHiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/PairHiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainPHMMBaumWelch.hpp" + "/Users/igorbonadio/Projetos/tops/src/MaximumDependenceDecomposition.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProfileHiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProfileHiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainProfileHMMMaxLikelihood.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainProfileHMMBaumWelch.hpp" ) ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") From 04fd28aeaa04312a4e2cd2d5352543ab075b5891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 27 Jun 2013 15:10:58 -0300 Subject: [PATCH 06/23] mdd model --- src/MaximalDependenceDecomposition.cpp | 30 +++++++++++++++++++ src/MaximalDependenceDecomposition.hpp | 40 ++++++++++++++++++++++++-- 2 files changed, 68 insertions(+), 2 deletions(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 30d14b47..7403141f 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -1,4 +1,34 @@ #include "MaximalDependenceDecomposition.hpp" namespace tops { + int MaximalDependenceDecompositionNode::getSymbol() { + return _symbol; + } + + ProbabilisticModelPtr MaximalDependenceDecompositionNode::getModel() { + return _model; + } + + void MaximalDependenceDecompositionNode::setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right) { + _left = left; + _right = right; + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecompositionNode::getLeft() { + return _left; + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecompositionNode::getRight() { + return _right; + } + + void MaximalDependenceDecomposition::setMDDTree(MaximalDependenceDecompositionNodePtr root) { + _mdd_tree = root; + } + + void MaximalDependenceDecomposition::setConsensusSequence(ConsensusSequence consensus_sequence) { + _consensus_sequence = consensus_sequence; + } + + } \ No newline at end of file diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index 5486d4bc..7013faa6 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -29,7 +29,8 @@ #include "crossplatform.hpp" #include "util.hpp" #include "Sequence.hpp" - #include "Consensus.hpp" +#include "Consensus.hpp" +#include "ProbabilisticModel.hpp" #include #include @@ -41,8 +42,43 @@ using namespace std; namespace tops { - class MaximalDependenceDecomposition { + class MaximalDependenceDecompositionNode; + typedef boost::shared_ptr MaximalDependenceDecompositionNodePtr; + + class MaximalDependenceDecompositionNode { + public: + MaximalDependenceDecompositionNode(ProbabilisticModelPtr model, int symbol):_model(model), _symbol(symbol) {}; + + int getSymbol(); + ProbabilisticModelPtr getModel(); + + void setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right); + MaximalDependenceDecompositionNodePtr getLeft(); + MaximalDependenceDecompositionNodePtr getRight(); + private: + ProbabilisticModelPtr _model; + int _symbol; + MaximalDependenceDecompositionNodePtr _left; + MaximalDependenceDecompositionNodePtr _right; + }; + + class MaximalDependenceDecomposition : public ProbabilisticModel { + public: + MaximalDependenceDecomposition() {}; + void setMDDTree(MaximalDependenceDecompositionNodePtr root); + void setConsensusSequence(ConsensusSequence consensus_sequence); + + + + virtual std::string model_name() const { + return "MaximumDependenceDecomposition"; + } + private: + MaximalDependenceDecompositionNodePtr _mdd_tree; + ConsensusSequence _consensus_sequence; }; + + typedef boost::shared_ptr MaximalDependenceDecompositionPtr; } #endif From 17b71d58d0959840bf66450c7783da9d1bd1a875 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 27 Jun 2013 17:10:42 -0300 Subject: [PATCH 07/23] MMD can evaluate a sequence --- src/MaximalDependenceDecomposition.cpp | 37 +++++++++++++++++++++++--- src/MaximalDependenceDecomposition.hpp | 12 ++++++--- 2 files changed, 41 insertions(+), 8 deletions(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 7403141f..a53ca2b1 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -1,8 +1,9 @@ #include "MaximalDependenceDecomposition.hpp" +#include "InhomogeneousMarkovChain.hpp" namespace tops { - int MaximalDependenceDecompositionNode::getSymbol() { - return _symbol; + int MaximalDependenceDecompositionNode::getIndex() { + return _index; } ProbabilisticModelPtr MaximalDependenceDecompositionNode::getModel() { @@ -30,5 +31,33 @@ namespace tops { _consensus_sequence = consensus_sequence; } - -} \ No newline at end of file + double MaximalDependenceDecomposition::evaluate(const Sequence & s, unsigned int begin, unsigned int end) { + vector indexes; + return _evaluateAux(s, _mdd_tree, indexes); + } + + double MaximalDependenceDecomposition::_evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) { + double p = 0; + if (node->getLeft()) { + p = node->getModel()->inhomogeneous()->evaluatePosition(s, node->getIndex(), node->getIndex()); + indexes.push_back(node->getIndex()); + cout << node->getIndex() << endl; + cout << "tem filho" << endl; + if (_consensus_sequence[node->getIndex()].is(s[node->getIndex()])) { + cout << "eh consensus" << endl; + p += _evaluateAux(s, node->getLeft(), indexes); + } else { + cout << "nao eh consensus" << endl; + p += _evaluateAux(s, node->getRight(), indexes); + } + } else { // leaf + cout << "nao tem filho" << endl; + for (int i = 0; i < s.size(); i++) { + if (std::find(indexes.begin(), indexes.end(), i) == indexes.end()) { + p += node->getModel()->inhomogeneous()->evaluatePosition(s, i, i); + } + } + } + return p; + } +} diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index 7013faa6..d01742b9 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -47,17 +47,18 @@ namespace tops { class MaximalDependenceDecompositionNode { public: - MaximalDependenceDecompositionNode(ProbabilisticModelPtr model, int symbol):_model(model), _symbol(symbol) {}; + MaximalDependenceDecompositionNode(ProbabilisticModelPtr model, int index):_model(model), _index(index) {}; - int getSymbol(); + int getIndex(); ProbabilisticModelPtr getModel(); void setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right); MaximalDependenceDecompositionNodePtr getLeft(); MaximalDependenceDecompositionNodePtr getRight(); private: + vector _otherIndexes; ProbabilisticModelPtr _model; - int _symbol; + int _index; MaximalDependenceDecompositionNodePtr _left; MaximalDependenceDecompositionNodePtr _right; }; @@ -68,12 +69,15 @@ namespace tops { void setMDDTree(MaximalDependenceDecompositionNodePtr root); void setConsensusSequence(ConsensusSequence consensus_sequence); - + virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end); virtual std::string model_name() const { return "MaximumDependenceDecomposition"; } private: + + double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes); + MaximalDependenceDecompositionNodePtr _mdd_tree; ConsensusSequence _consensus_sequence; }; From 7ef3d6922f4902e1fad9a4bbcf4ffd9c890a295a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 27 Jun 2013 17:13:23 -0300 Subject: [PATCH 08/23] removing unused comments --- src/MaximalDependenceDecomposition.cpp | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index a53ca2b1..bf196c94 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -41,17 +41,17 @@ namespace tops { if (node->getLeft()) { p = node->getModel()->inhomogeneous()->evaluatePosition(s, node->getIndex(), node->getIndex()); indexes.push_back(node->getIndex()); - cout << node->getIndex() << endl; - cout << "tem filho" << endl; + // cout << node->getIndex() << endl; + // cout << "tem filho" << endl; if (_consensus_sequence[node->getIndex()].is(s[node->getIndex()])) { - cout << "eh consensus" << endl; + // cout << "eh consensus" << endl; p += _evaluateAux(s, node->getLeft(), indexes); } else { - cout << "nao eh consensus" << endl; + // cout << "nao eh consensus" << endl; p += _evaluateAux(s, node->getRight(), indexes); } } else { // leaf - cout << "nao tem filho" << endl; + // cout << "nao tem filho" << endl; for (int i = 0; i < s.size(); i++) { if (std::find(indexes.begin(), indexes.end(), i) == indexes.end()) { p += node->getModel()->inhomogeneous()->evaluatePosition(s, i, i); From dc916b36f0c66d761685e2ceed3465b9f738fa8c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 27 Jun 2013 21:30:42 -0300 Subject: [PATCH 09/23] MDD can choose sequences --- src/MaximalDependenceDecomposition.cpp | 24 ++++++++++++++++++++++++ src/MaximalDependenceDecomposition.hpp | 3 +++ 2 files changed, 27 insertions(+) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index bf196c94..1f1902c5 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -60,4 +60,28 @@ namespace tops { } return p; } + + Sequence & MaximalDependenceDecomposition::choose(Sequence & s, int size) { + s = Sequence(size, -1); + _chooseAux(s, _mdd_tree); + return s; + } + + int MaximalDependenceDecomposition::_chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node) { + if (node->getLeft()) { + s[node->getIndex()] = node->getModel()->inhomogeneous()->choosePosition(s, node->getIndex(), node->getIndex()); + if (_consensus_sequence[node->getIndex()].is(s[node->getIndex()])) { + _chooseAux(s, node->getLeft()); + } else { + _chooseAux(s, node->getRight()); + } + } else { // leaf + for (int i = 0; i < s.size(); i++) { + if (s[i] == -1) { + s[i] = node->getModel()->inhomogeneous()->choosePosition(s, i, i); + } + } + } + return 0; + } } diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index d01742b9..fffcf78a 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -70,6 +70,7 @@ namespace tops { void setConsensusSequence(ConsensusSequence consensus_sequence); virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end); + virtual Sequence & choose(Sequence & h, int size); virtual std::string model_name() const { return "MaximumDependenceDecomposition"; @@ -77,6 +78,8 @@ namespace tops { private: double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes); + int _chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node); + MaximalDependenceDecompositionNodePtr _mdd_tree; ConsensusSequence _consensus_sequence; From 8e50e8108a3d79ac7b158c172825f1a93229d926 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 16 Jul 2013 11:49:06 -0300 Subject: [PATCH 10/23] fixing const bugs --- src/Consensus.cpp | 4 +- src/Consensus.hpp | 2 +- src/MaximalDependenceDecomposition.cpp | 142 ++++++++++++++++++++++++- src/MaximalDependenceDecomposition.hpp | 20 +++- 4 files changed, 155 insertions(+), 13 deletions(-) diff --git a/src/Consensus.cpp b/src/Consensus.cpp index 1c776bfb..4b173878 100644 --- a/src/Consensus.cpp +++ b/src/Consensus.cpp @@ -1,8 +1,8 @@ #include "Consensus.hpp" namespace tops { - bool Consensus::is(int symbol) { - for (std::vector::iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + bool Consensus::is(int symbol) const { + for (std::vector::const_iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { if (*it == symbol) return true; } diff --git a/src/Consensus.hpp b/src/Consensus.hpp index b762926f..3acf539b 100644 --- a/src/Consensus.hpp +++ b/src/Consensus.hpp @@ -43,7 +43,7 @@ namespace tops { class Consensus { public: Consensus(Sequence symbols):_symbols(symbols) {} - bool is(int symbol); + bool is(int symbol) const; std::string str(); private: Sequence _symbols; diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 1f1902c5..5c4a2a4b 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -1,5 +1,6 @@ #include "MaximalDependenceDecomposition.hpp" #include "InhomogeneousMarkovChain.hpp" +#include "ContextTree.hpp" namespace tops { int MaximalDependenceDecompositionNode::getIndex() { @@ -31,12 +32,12 @@ namespace tops { _consensus_sequence = consensus_sequence; } - double MaximalDependenceDecomposition::evaluate(const Sequence & s, unsigned int begin, unsigned int end) { + double MaximalDependenceDecomposition::evaluate(const Sequence & s, unsigned int begin, unsigned int end) const { vector indexes; return _evaluateAux(s, _mdd_tree, indexes); } - double MaximalDependenceDecomposition::_evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) { + double MaximalDependenceDecomposition::_evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const { double p = 0; if (node->getLeft()) { p = node->getModel()->inhomogeneous()->evaluatePosition(s, node->getIndex(), node->getIndex()); @@ -61,13 +62,13 @@ namespace tops { return p; } - Sequence & MaximalDependenceDecomposition::choose(Sequence & s, int size) { + Sequence & MaximalDependenceDecomposition::choose(Sequence & s, int size) const { s = Sequence(size, -1); _chooseAux(s, _mdd_tree); return s; } - int MaximalDependenceDecomposition::_chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node) { + void MaximalDependenceDecomposition::_chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node) const { if (node->getLeft()) { s[node->getIndex()] = node->getModel()->inhomogeneous()->choosePosition(s, node->getIndex(), node->getIndex()); if (_consensus_sequence[node->getIndex()].is(s[node->getIndex()])) { @@ -82,6 +83,137 @@ namespace tops { } } } - return 0; + } + + InhomogeneousMarkovChainPtr MaximalDependenceDecomposition::trainInhomogeneousMarkovChain(SequenceEntryList & sequences) { + vector position_specific_context_trees; + for (int j = 0; j < sequences[0]->getSequence().size(); j++) { + SequenceEntryList imc_sequences; + + SequenceEntryPtr se = SequenceEntryPtr(new SequenceEntry()); + Sequence s; + for (int i = 0; i < sequences.size(); i++) { + s.push_back(sequences[i]->getSequence()[j]); + } + se->setSequence(s); + se->setName("s"); + imc_sequences.push_back(se); + + std::map w; + ContextTreePtr tree = ContextTreePtr(new ContextTree(_alphabet)); + tree->initializeCounter(imc_sequences, 0, w); + tree->normalize(); + position_specific_context_trees.push_back(tree); + } + InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); + model->setPositionSpecificDistribution(position_specific_context_trees); + return model; + } + + int MaximalDependenceDecomposition::getMaximalDependenceIndex(InhomogeneousMarkovChainPtr model, Sequence selected) { + Sequence s(_consensus_sequence.size(), -1); + double maximal = -HUGE; + double maximal_i = -1; + for (int i = 0; i < _consensus_sequence.size(); i++) { + double sum; + for (int j = 0; j < _consensus_sequence.size(); j++) { + if (i != j) { + double x; + double chi = -HUGE; + for (int k = 0; k < _alphabet->size(); k++) { + s[i] = k; + double e = _consensus_model->inhomogeneous()->evaluatePosition(s, i, i); + s[j] = k; + double o = model->evaluatePosition(s, j, j); + x = (o - e)+(o - e)-e; + chi = log_sum(chi, x); + } + // cout << chi << "\t"; + sum = log_sum(sum, chi); + } else { + // cout << "-" << "\t"; + } + } + // cout << sum <getSequence()[index])) { + consensus.push_back(sequences[i]); + } else { + nonconsensus.push_back(sequences[i]); + } + } + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::newNode(SequenceEntryList & sequences, int divmin, Sequence selected) { + InhomogeneousMarkovChainPtr model = trainInhomogeneousMarkovChain(sequences); + int consensus_index = getMaximalDependenceIndex(model, selected); + selected.push_back(consensus_index); + + MaximalDependenceDecompositionNodePtr mdd_node; + + SequenceEntryList consensus_sequences; + SequenceEntryList nonconsensus_sequences; + subset(consensus_index, sequences, consensus_sequences, nonconsensus_sequences); + + // cout << "**********************************" << endl; + // cout << "consensus_index = " << consensus_index << endl; + // cout << "consensus_siquences = " << consensus_sequences.size() << endl; + // cout << "nonconsensus_siquences = " << nonconsensus_sequences.size() << endl; + + if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(model, consensus_index)); + MaximalDependenceDecompositionNodePtr left = newNode(consensus_sequences, divmin, selected); + MaximalDependenceDecompositionNodePtr right = newNode(nonconsensus_sequences, divmin, selected); + mdd_node->setChildern(left, right); + } else { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(model, -1)); + } + + + return mdd_node; + } + + void MaximalDependenceDecomposition::train(SequenceEntryList & sequences, int divmin) { + Sequence selected; + setMDDTree(newNode(sequences, divmin, selected)); } } + + + + + + + + + + + + + + + + + diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index fffcf78a..a286e8a6 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -31,6 +31,7 @@ #include "Sequence.hpp" #include "Consensus.hpp" #include "ProbabilisticModel.hpp" +#include "InhomogeneousMarkovChain.hpp" #include #include @@ -65,24 +66,33 @@ namespace tops { class MaximalDependenceDecomposition : public ProbabilisticModel { public: - MaximalDependenceDecomposition() {}; + MaximalDependenceDecomposition(AlphabetPtr alphabet):_alphabet(alphabet) {}; void setMDDTree(MaximalDependenceDecompositionNodePtr root); void setConsensusSequence(ConsensusSequence consensus_sequence); + void setConsensusModel(ProbabilisticModelPtr model); - virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end); - virtual Sequence & choose(Sequence & h, int size); + InhomogeneousMarkovChainPtr trainInhomogeneousMarkovChain(SequenceEntryList & sequences); + int getMaximalDependenceIndex(InhomogeneousMarkovChainPtr model, Sequence selected); + void subset(int index, SequenceEntryList & sequences, SequenceEntryList & consensus, SequenceEntryList & nonconsensus); + MaximalDependenceDecompositionNodePtr newNode(SequenceEntryList & sequences, int divmin, Sequence selected); + void train(SequenceEntryList & sequences, int divmin); + + virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end) const; + virtual Sequence & choose(Sequence & h, int size) const; virtual std::string model_name() const { return "MaximumDependenceDecomposition"; } private: - double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes); - int _chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node); + double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const; + void _chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node) const; MaximalDependenceDecompositionNodePtr _mdd_tree; ConsensusSequence _consensus_sequence; + ProbabilisticModelPtr _consensus_model; + AlphabetPtr _alphabet; }; typedef boost::shared_ptr MaximalDependenceDecompositionPtr; From 1932b9943f08ecb04ecdff4967076c4107224b15 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 16 Jul 2013 11:51:26 -0300 Subject: [PATCH 11/23] MDD tests --- app/teste_mdd.cpp | 177 ++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 177 insertions(+) create mode 100644 app/teste_mdd.cpp diff --git a/app/teste_mdd.cpp b/app/teste_mdd.cpp new file mode 100644 index 00000000..1e8570c7 --- /dev/null +++ b/app/teste_mdd.cpp @@ -0,0 +1,177 @@ +#include +#include +#include + +#include "Consensus.hpp" +#include "ChiSquare.hpp" +#include "MaximalDependenceDecomposition.hpp" +#include "ProbabilisticModelCreatorClient.hpp" +#include "ContextTree.hpp" + +using namespace tops; +using namespace std; + +int main (int argc, char ** argv) +{ + cout << "TesteMDD" << endl; + + /***********************************************************/ + // Alphabet + AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); + alphabet->createSymbol("A"); + alphabet->createSymbol("C"); + alphabet->createSymbol("G"); + alphabet->createSymbol("T"); + + /***********************************************************/ + // Consensus Sequence + ConsensusSequence consensus_sequence; + + vector s1; + s1.push_back(0); s1.push_back(1); // a/c + Consensus c1(s1); + consensus_sequence.push_back(c1); + + int x1[] = {0, 2, 2, 3}; // AGGT + for (int i = 0; i < 4; i++) { + vector s; + s.push_back(x1[i]); + Consensus c(s); + consensus_sequence.push_back(c); + } + + vector s2; + s2.push_back(0); s2.push_back(2); // a/g + Consensus c2(s2); + consensus_sequence.push_back(c2); + + int x2[] = {0, 2, 3}; // agt + for (int i = 0; i < 3; i++) { + vector s; + s.push_back(x2[i]); + Consensus c(s); + consensus_sequence.push_back(c); + } + + for (std::vector::iterator it = consensus_sequence.begin() ; it != consensus_sequence.end(); ++it) { + cout << it->str() << '\t'; + } + cout << endl; + + /***********************************************************/ + // WMMs + ProbabilisticModelCreatorClient creator; + ProbabilisticModelPtr root = creator.create("_test2/root.txt"); + ProbabilisticModelPtr g5 = creator.create("_test2/g5.txt"); + ProbabilisticModelPtr h5 = creator.create("_test2/h5.txt"); + ProbabilisticModelPtr g5gm1 = creator.create("_test2/g5gm1.txt"); + ProbabilisticModelPtr g5hm1 = creator.create("_test2/g5hm1.txt"); + ProbabilisticModelPtr g5gm1am2 = creator.create("_test2/g5gm1am2.txt"); + ProbabilisticModelPtr g5gm1bm2 = creator.create("_test2/g5gm1bm2.txt"); + ProbabilisticModelPtr g5gm1am2u6 = creator.create("_test2/g5gm1am2u6.txt"); + ProbabilisticModelPtr g5gm1am2v6 = creator.create("_test2/g5gm1am2v6.txt"); + + /***********************************************************/ + // MDD tree + MaximalDependenceDecompositionNodePtr mdd_root = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(root, 7)); + MaximalDependenceDecompositionNodePtr mdd_g5 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5, 2)); + MaximalDependenceDecompositionNodePtr mdd_h5 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(h5, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5gm1, 1)); + MaximalDependenceDecompositionNodePtr mdd_g5hm1 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5hm1, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1am2 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5gm1am2, 8)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1bm2 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5gm1bm2, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1am2u6 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5gm1am2u6, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1am2v6 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode(g5gm1am2v6, -1)); + + mdd_root->setChildern(mdd_g5, mdd_h5); + mdd_g5->setChildern(mdd_g5gm1, mdd_g5hm1); + mdd_g5gm1->setChildern(mdd_g5gm1am2, mdd_g5gm1bm2); + mdd_g5gm1am2->setChildern(mdd_g5gm1am2u6, mdd_g5gm1am2v6); + + /***********************************************************/ + // MDD definition + + MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition(alphabet)); + mdd->setMDDTree(mdd_root); + mdd->setConsensusSequence(consensus_sequence); + + /***********************************************************/ + // Evaluate + + Sequence s; + s.push_back(1);s.push_back(0);s.push_back(2);s.push_back(2);s.push_back(3);s.push_back(2);s.push_back(0);s.push_back(0);s.push_back(3); + cout << mdd->evaluate(s, 0, 9) << endl; + + /***********************************************************/ + // Choose + + srand(time(NULL)); + Sequence new_sequence; + mdd->choose(new_sequence, 9); + + for (int i = 0; i < 9; i++) { + cout << new_sequence[i] << "\t"; + } + cout << endl; + + /***********************************************************/ + // Criando SequenceEntryList + SequenceEntryList sequences; + for (int i = 0; i < 10; i++) { + SequenceEntryPtr se = SequenceEntryPtr(new SequenceEntry(alphabet)); + Sequence random_sequence; + mdd->choose(random_sequence, 9); + se->setSequence(random_sequence); + se->setName("s"); + sequences.push_back(se); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + cout << sequences[i]->getSequence()[j]; + } + cout << endl; + } + + /***********************************************************/ + // Treinando MDD + MaximalDependenceDecompositionPtr trained_mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition(alphabet)); + trained_mdd->setConsensusSequence(consensus_sequence); + + ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); + trained_mdd->setConsensusModel(consensus_model); + + trained_mdd->train(sequences, 2); + + cout << "-------------------------" << endl; + Sequence new_sequence_from_trained_mdd; + trained_mdd->choose(new_sequence_from_trained_mdd, 9); + + for (int i = 0; i < 9; i++) { + cout << new_sequence_from_trained_mdd[i] << "\t"; + } + cout << endl; + + // std::map w; + // ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); + // tree->initializeCounter(sequences, 0, w); + // tree->normalize(); + + // cout << tree->getContext(0)->getDistribution()->str() << endl; + + return 0; +} + + + + + From 2ea65dfe839097416d668e50b52d6ea0d1d03e62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Sun, 21 Jul 2013 13:09:54 -0300 Subject: [PATCH 12/23] mdd prints its definition --- app/teste_mdd.cpp | 24 +++++---- src/Consensus.cpp | 18 ++++++- src/Consensus.hpp | 3 +- src/MaximalDependenceDecomposition.cpp | 69 +++++++++++++++++++++++--- src/MaximalDependenceDecomposition.hpp | 10 +++- 5 files changed, 102 insertions(+), 22 deletions(-) diff --git a/app/teste_mdd.cpp b/app/teste_mdd.cpp index 1e8570c7..befda0bd 100644 --- a/app/teste_mdd.cpp +++ b/app/teste_mdd.cpp @@ -74,23 +74,23 @@ int main (int argc, char ** argv) /***********************************************************/ // MDD tree MaximalDependenceDecompositionNodePtr mdd_root = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(root, 7)); + new MaximalDependenceDecompositionNode("root", root, 7)); MaximalDependenceDecompositionNodePtr mdd_g5 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5, 2)); + new MaximalDependenceDecompositionNode("g5", g5, 2)); MaximalDependenceDecompositionNodePtr mdd_h5 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(h5, -1)); + new MaximalDependenceDecompositionNode("h5", h5, -1)); MaximalDependenceDecompositionNodePtr mdd_g5gm1 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5gm1, 1)); + new MaximalDependenceDecompositionNode("g5gm1", g5gm1, 1)); MaximalDependenceDecompositionNodePtr mdd_g5hm1 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5hm1, -1)); + new MaximalDependenceDecompositionNode("g5hm1", g5hm1, -1)); MaximalDependenceDecompositionNodePtr mdd_g5gm1am2 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5gm1am2, 8)); + new MaximalDependenceDecompositionNode("g5gm1am2", g5gm1am2, 8)); MaximalDependenceDecompositionNodePtr mdd_g5gm1bm2 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5gm1bm2, -1)); + new MaximalDependenceDecompositionNode("g5gm1bm2", g5gm1bm2, -1)); MaximalDependenceDecompositionNodePtr mdd_g5gm1am2u6 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5gm1am2u6, -1)); + new MaximalDependenceDecompositionNode("g5gm1am2u6", g5gm1am2u6, -1)); MaximalDependenceDecompositionNodePtr mdd_g5gm1am2v6 = MaximalDependenceDecompositionNodePtr( - new MaximalDependenceDecompositionNode(g5gm1am2v6, -1)); + new MaximalDependenceDecompositionNode("g5gm1am2v6", g5gm1am2v6, -1)); mdd_root->setChildern(mdd_g5, mdd_h5); mdd_g5->setChildern(mdd_g5gm1, mdd_g5hm1); @@ -103,6 +103,8 @@ int main (int argc, char ** argv) MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition(alphabet)); mdd->setMDDTree(mdd_root); mdd->setConsensusSequence(consensus_sequence); + ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); + mdd->setConsensusModel(consensus_model); /***********************************************************/ // Evaluate @@ -147,7 +149,7 @@ int main (int argc, char ** argv) MaximalDependenceDecompositionPtr trained_mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition(alphabet)); trained_mdd->setConsensusSequence(consensus_sequence); - ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); + // ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); trained_mdd->setConsensusModel(consensus_model); trained_mdd->train(sequences, 2); @@ -168,6 +170,8 @@ int main (int argc, char ** argv) // cout << tree->getContext(0)->getDistribution()->str() << endl; + cout << trained_mdd->str() << endl; + return 0; } diff --git a/src/Consensus.cpp b/src/Consensus.cpp index 4b173878..8b87434a 100644 --- a/src/Consensus.cpp +++ b/src/Consensus.cpp @@ -1,4 +1,6 @@ #include "Consensus.hpp" +#include "Alphabet.hpp" +#include "Symbol.hpp" namespace tops { bool Consensus::is(int symbol) const { @@ -9,11 +11,23 @@ namespace tops { return false; } - std::string Consensus::str() { + std::string Consensus::str() const { std::stringstream out; - for (std::vector::iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + for (std::vector::const_iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { out << (*it); } return out.str(); } + + std::string Consensus::sym_str(AlphabetPtr alphabet) const { + std::stringstream out; + out << "\""; + for (std::vector::const_iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + out << alphabet->getSymbol(*it)->name(); + if ((it+1) != _symbols.end()) + out << " "; + } + out << "\""; + return out.str(); + } } \ No newline at end of file diff --git a/src/Consensus.hpp b/src/Consensus.hpp index 3acf539b..0ffd7b62 100644 --- a/src/Consensus.hpp +++ b/src/Consensus.hpp @@ -44,7 +44,8 @@ namespace tops { public: Consensus(Sequence symbols):_symbols(symbols) {} bool is(int symbol) const; - std::string str(); + std::string str() const; + std::string sym_str(AlphabetPtr alphabet) const; private: Sequence _symbols; }; diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 5c4a2a4b..c6e47b87 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -24,6 +24,34 @@ namespace tops { return _right; } + std::string MaximalDependenceDecompositionNode::tree_str() { + std::stringstream out; + if (_left && _right) { + out << "("; + out << _node_name; + out << ", "; + out << _left->tree_str(); + out << ", "; + out << _right->tree_str(); + out << ")"; + } else { + out << _node_name; + } + return out.str(); + } + + std::string MaximalDependenceDecompositionNode::model_str() { + std::stringstream out; + out << _node_name << " = [" << endl; + out << _model->str(); + out << "]" << endl; + if (_left && _right) { + out << _left->model_str(); + out << _right->model_str(); + } + return out.str(); + } + void MaximalDependenceDecomposition::setMDDTree(MaximalDependenceDecompositionNodePtr root) { _mdd_tree = root; } @@ -107,6 +135,7 @@ namespace tops { } InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); model->setPositionSpecificDistribution(position_specific_context_trees); + model->setAlphabet(_alphabet); return model; } @@ -166,7 +195,7 @@ namespace tops { } } - MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::newNode(SequenceEntryList & sequences, int divmin, Sequence selected) { + MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::newNode(std::string node_name, SequenceEntryList & sequences, int divmin, Sequence selected) { InhomogeneousMarkovChainPtr model = trainInhomogeneousMarkovChain(sequences); int consensus_index = getMaximalDependenceIndex(model, selected); selected.push_back(consensus_index); @@ -183,12 +212,16 @@ namespace tops { // cout << "nonconsensus_siquences = " << nonconsensus_sequences.size() << endl; if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { - mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(model, consensus_index)); - MaximalDependenceDecompositionNodePtr left = newNode(consensus_sequences, divmin, selected); - MaximalDependenceDecompositionNodePtr right = newNode(nonconsensus_sequences, divmin, selected); + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); + std::stringstream p; + p << "mdd_node_p" << consensus_index; + MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); + std::stringstream n; + n << "mdd_node_n" << consensus_index; + MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); mdd_node->setChildern(left, right); } else { - mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(model, -1)); + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, -1)); } @@ -197,12 +230,34 @@ namespace tops { void MaximalDependenceDecomposition::train(SequenceEntryList & sequences, int divmin) { Sequence selected; - setMDDTree(newNode(sequences, divmin, selected)); + setMDDTree(newNode("root", sequences, divmin, selected)); } -} + std::string MaximalDependenceDecomposition::str () const { + std::stringstream out; + out << "model_name = \"MaximalDependenceDecomposition\"" << endl; + out << _alphabet->str(); + + out << "consensus = ("; + for (int i = 0; i < _consensus_sequence.size(); i++) { + out << _consensus_sequence[i].sym_str(_alphabet); + if (i != (_consensus_sequence.size() - 1)) + out << ", "; + } + out << ")" << endl; + + out << "consensus_model = [" << endl; + out << _consensus_model->str(); + out<< "]" << endl; + + out << _mdd_tree->model_str(); + out << "tree = " << _mdd_tree->tree_str() << endl; + + return out.str(); + } +} diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index a286e8a6..e0ae0cca 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -48,7 +48,7 @@ namespace tops { class MaximalDependenceDecompositionNode { public: - MaximalDependenceDecompositionNode(ProbabilisticModelPtr model, int index):_model(model), _index(index) {}; + MaximalDependenceDecompositionNode(std::string node_name, ProbabilisticModelPtr model, int index):_model(model), _index(index), _node_name(node_name) {}; int getIndex(); ProbabilisticModelPtr getModel(); @@ -56,10 +56,14 @@ namespace tops { void setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right); MaximalDependenceDecompositionNodePtr getLeft(); MaximalDependenceDecompositionNodePtr getRight(); + + std::string tree_str(); + std::string model_str(); private: vector _otherIndexes; ProbabilisticModelPtr _model; int _index; + std::string _node_name; MaximalDependenceDecompositionNodePtr _left; MaximalDependenceDecompositionNodePtr _right; }; @@ -74,7 +78,7 @@ namespace tops { InhomogeneousMarkovChainPtr trainInhomogeneousMarkovChain(SequenceEntryList & sequences); int getMaximalDependenceIndex(InhomogeneousMarkovChainPtr model, Sequence selected); void subset(int index, SequenceEntryList & sequences, SequenceEntryList & consensus, SequenceEntryList & nonconsensus); - MaximalDependenceDecompositionNodePtr newNode(SequenceEntryList & sequences, int divmin, Sequence selected); + MaximalDependenceDecompositionNodePtr newNode(std::string node_name, SequenceEntryList & sequences, int divmin, Sequence selected); void train(SequenceEntryList & sequences, int divmin); virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end) const; @@ -83,6 +87,8 @@ namespace tops { virtual std::string model_name() const { return "MaximumDependenceDecomposition"; } + + virtual std::string str () const ; private: double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const; From ce1b8522134faf27085b43b924930755416132c2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Sun, 21 Jul 2013 17:53:06 -0300 Subject: [PATCH 13/23] reading mdd configuration --- app/teste_mdd.cpp | 12 +- src/CMakeLists.txt | 1 + src/ConfigurationReader.cpp | 12 +- src/MaximalDependenceDecomposition.cpp | 139 ++++++++++++++++++++++-- src/MaximalDependenceDecomposition.hpp | 8 +- src/ProbabilisticModelCreatorClient.cpp | 5 +- 6 files changed, 164 insertions(+), 13 deletions(-) diff --git a/app/teste_mdd.cpp b/app/teste_mdd.cpp index befda0bd..1d7f4eb5 100644 --- a/app/teste_mdd.cpp +++ b/app/teste_mdd.cpp @@ -100,7 +100,8 @@ int main (int argc, char ** argv) /***********************************************************/ // MDD definition - MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition(alphabet)); + MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + mdd->setAlphabet(alphabet); mdd->setMDDTree(mdd_root); mdd->setConsensusSequence(consensus_sequence); ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); @@ -146,7 +147,8 @@ int main (int argc, char ** argv) /***********************************************************/ // Treinando MDD - MaximalDependenceDecompositionPtr trained_mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition(alphabet)); + MaximalDependenceDecompositionPtr trained_mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + trained_mdd->setAlphabet(alphabet); trained_mdd->setConsensusSequence(consensus_sequence); // ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); @@ -170,7 +172,11 @@ int main (int argc, char ** argv) // cout << tree->getContext(0)->getDistribution()->str() << endl; - cout << trained_mdd->str() << endl; + // cout << mdd->str() << endl; + + cout << "--------------------------" << endl; + ProbabilisticModelPtr mdd_from_config = creator.create("_test2/mdd.txt"); + cout << mdd_from_config->str() << endl; return 0; } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 9efd3e60..69410af3 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -21,6 +21,7 @@ crossplatform.hpp ProbabilisticModelConfiguration ProbabilisticModelParameterVal Consensus ChiSquare MaximalDependenceDecomposition +MaximalDependenceDecompositionCreator ) diff --git a/src/ConfigurationReader.cpp b/src/ConfigurationReader.cpp index 553522f7..d4717ed9 100644 --- a/src/ConfigurationReader.cpp +++ b/src/ConfigurationReader.cpp @@ -471,15 +471,24 @@ namespace tops { rule config_file, parameter_spec, parameter_value, parameter_name, prob_table, string_vector, double_vector, - int_vector, word, word_p, string_map, transition_map, nested_configuration, nested_parameter_spec; + int_vector, word, word_p, string_map, transition_map, nested_configuration, nested_parameter_spec, + tree_p, tree; word_p = lexeme_d [ +(alnum_p | (ch_p('_') | '.' | '/' | '-' | ' ' | ',' | '+' ))] ; + tree_p + = lexeme_d [ +(alnum_p | (ch_p('_') | '.' | '/' | '-' | ' ' | ',' | '+' | '(' | ')' | ':' ))] + ; word = ch_p('"') >> word_p >> ch_p('"') ; + tree + = ch_p('{') + >> tree_p + >> ch_p('}') + ; double_vector = ch_p('(') >> real_p[create_double_vector(this)] @@ -541,6 +550,7 @@ namespace tops { = double_vector | parameter_name [set_parameter_value_string(this)] | word [set_parameter_value_word(this)] + | tree [set_parameter_value_word(this)] | string_vector | transition_map | strict_real_p [set_parameter_value_double(this)] diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index c6e47b87..128e49f5 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -1,6 +1,11 @@ #include "MaximalDependenceDecomposition.hpp" #include "InhomogeneousMarkovChain.hpp" #include "ContextTree.hpp" +#include "ProbabilisticModelParameter.hpp" +#include "Symbol.hpp" +#include "ProbabilisticModelCreatorClient.hpp" + +#include namespace tops { int MaximalDependenceDecompositionNode::getIndex() { @@ -27,13 +32,13 @@ namespace tops { std::string MaximalDependenceDecompositionNode::tree_str() { std::stringstream out; if (_left && _right) { - out << "("; - out << _node_name; - out << ", "; + out << "( "; + out << _node_name << ":" << _index; + out << " "; out << _left->tree_str(); - out << ", "; + out << " "; out << _right->tree_str(); - out << ")"; + out << " )"; } else { out << _node_name; } @@ -253,13 +258,133 @@ namespace tops { out<< "]" << endl; out << _mdd_tree->model_str(); - out << "tree = " << _mdd_tree->tree_str() << endl; + out << "tree = {" << _mdd_tree->tree_str() << "}" << endl; return out.str(); } -} + void MaximalDependenceDecomposition::initialize(const ProbabilisticModelParameters & parameters) { + ProbabilisticModelParameterValuePtr symbols = parameters.getMandatoryParameterValue("alphabet"); + ProbabilisticModelParameterValuePtr consensus_param = parameters.getMandatoryParameterValue("consensus"); + ProbabilisticModelParameterValuePtr consensus_model_param = parameters.getMandatoryParameterValue("consensus_model"); + ProbabilisticModelParameterValuePtr tree = parameters.getMandatoryParameterValue("tree"); + + AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); + alphabet->initializeFromVector(symbols->getStringVector()); + setAlphabet(alphabet); + + std::vector consensus_symbols = consensus_param->getStringVector(); + ConsensusSequence consensus_sequence; + for (int i = 0; i < consensus_symbols.size(); i++) { + std::vector syms; + boost::split(syms, consensus_symbols[i], boost::is_any_of(" ")); + vector s; + for (int j = 0; j < syms.size(); j++) { + s.push_back(alphabet->getSymbol(syms[j])->id()); + } + Consensus cons(s); + consensus_sequence.push_back(cons); + } + setConsensusSequence(consensus_sequence); + + std::string consensus_model_str = consensus_model_param->getString(); + consensus_model_str = consensus_model_str.substr(1, consensus_model_str.size() - 2); + ConfigurationReader consensus_model_reader; + ProbabilisticModelCreatorClient consensus_model_creator; + consensus_model_reader.load(consensus_model_str); + setConsensusModel(consensus_model_creator.create(*(consensus_model_reader.parameters()))); + + std::vector _tree; + string tree_str = tree->getString(); + boost::split(_tree, tree_str, boost::is_any_of(" ")); + std::vector tree_r; + for (int i = 0; i < _tree.size(); i++) + if (_tree[i] != "" && _tree[i] != " " && _tree[i] != "\n" && _tree[i] != "\t") + tree_r.push_back(_tree[i]); + setMDDTree(initializeTree(parameters, tree_r)); + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::initializeTree(const ProbabilisticModelParameters & parameters, std::vector& tree) { + MaximalDependenceDecompositionNodePtr node; + if (tree[0] == "(") { + std::vector tree_node; + + boost::split(tree_node, tree[1], boost::is_any_of(":")); + string node_name = tree_node[0]; + + int index = std::atoi(tree_node[1].c_str()); + + std::string model_str = parameters.getMandatoryParameterValue(node_name)->getString(); + model_str = model_str.substr(1, model_str.size() - 2); + ConfigurationReader model_reader; + ProbabilisticModelCreatorClient model_creator; + model_reader.load(model_str); + + cout << node_name << endl; + cout << index << endl; + + MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new + MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); + + int count = 0; + int i = 2; + std::vector tree_node_left; + tree_node_left.push_back(tree[2]); + if (tree[2] == "(") { + count = 1; + i = 2; + while (count > 0) { + i++; + if (tree[i] == "(") + count++; + else if (tree[i] == ")") + count--; + tree_node_left.push_back(tree[i]); + } + } + + std::vector tree_node_right; + tree_node_right.push_back(tree[i+1]); + if (tree[count] == "(") { + count = 1; + i++; + while (count > 0) { + i++; + if (tree[i] == "(") + count++; + else if (tree[i] == ")") + count--; + tree_node_right.push_back(tree[i]); + } + } + + // cout << "left:" << endl; + MaximalDependenceDecompositionNodePtr left_node = initializeTree(parameters, tree_node_left); + // cout << "right:" << endl; + MaximalDependenceDecompositionNodePtr right_node = initializeTree(parameters, tree_node_right); + + root->setChildern(left_node, right_node); + + node = root; + } else { + string node_name = tree[0]; + int index = -1; + // cout << "-> " << node_name << endl; + std::string model_str = parameters.getMandatoryParameterValue(node_name)->getString(); + model_str = model_str.substr(1, model_str.size() - 2); + ConfigurationReader model_reader; + ProbabilisticModelCreatorClient model_creator; + model_reader.load(model_str); + + MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new + MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); + + node = root; + } + return node; + } +} diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index e0ae0cca..29bcb924 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -70,7 +70,10 @@ namespace tops { class MaximalDependenceDecomposition : public ProbabilisticModel { public: - MaximalDependenceDecomposition(AlphabetPtr alphabet):_alphabet(alphabet) {}; + MaximalDependenceDecomposition() {}; + void setAlphabet(AlphabetPtr alphabet) { + _alphabet = alphabet; + } void setMDDTree(MaximalDependenceDecompositionNodePtr root); void setConsensusSequence(ConsensusSequence consensus_sequence); void setConsensusModel(ProbabilisticModelPtr model); @@ -89,6 +92,9 @@ namespace tops { } virtual std::string str () const ; + + virtual void initialize(const ProbabilisticModelParameters & parameters); + MaximalDependenceDecompositionNodePtr initializeTree(const ProbabilisticModelParameters & parameters, std::vector& tree); private: double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const; diff --git a/src/ProbabilisticModelCreatorClient.cpp b/src/ProbabilisticModelCreatorClient.cpp index 62b008f7..a2042818 100644 --- a/src/ProbabilisticModelCreatorClient.cpp +++ b/src/ProbabilisticModelCreatorClient.cpp @@ -43,6 +43,7 @@ #include "ProbabilisticModelParameter.hpp" #include "SimilarityBasedSequenceWeightingCreator.hpp" #include "MultipleSequentialModelCreator.hpp" +#include "MaximalDependenceDecompositionCreator.hpp" #include "util.hpp" @@ -154,7 +155,7 @@ namespace tops conf.append(line); } input.close(); - tops::lang::parse(conf); + // tops::lang::parse(conf); if(readConfig.load(conf)){ _p = *(readConfig.parameters()); return _p; @@ -314,6 +315,8 @@ namespace tops _createModelCommand["SimilarityBasedSequenceWeighting"] = SimilarityBasedSequenceWeightingCreatorPtr(new SimilarityBasedSequenceWeightingCreator()); _createModelCommand["MultipleSequentialModels"] = MultipleSequentialModelCreatorPtr( new MultipleSequentialModelCreator()); + _createModelCommand["MaximalDependenceDecomposition"] = + MaximalDependenceDecompositionCreatorPtr(new MaximalDependenceDecompositionCreator()); } } From 3cfb66e78c54d7cb5123c9a203b3f95c7dfd30ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Mon, 5 Aug 2013 11:32:53 -0300 Subject: [PATCH 14/23] training mdd --- app/train.cpp | 2 + src/CMakeLists.txt | 1 + src/MaximalDependenceDecompositionCreator.cpp | 18 ++++ src/MaximalDependenceDecompositionCreator.hpp | 45 ++++++++++ src/TrainMaximalDependenceDecomposition.cpp | 89 +++++++++++++++++++ src/TrainMaximalDependenceDecomposition.hpp | 58 ++++++++++++ 6 files changed, 213 insertions(+) create mode 100644 src/MaximalDependenceDecompositionCreator.cpp create mode 100644 src/MaximalDependenceDecompositionCreator.hpp create mode 100644 src/TrainMaximalDependenceDecomposition.cpp create mode 100644 src/TrainMaximalDependenceDecomposition.hpp diff --git a/app/train.cpp b/app/train.cpp index cf872138..6d57abc0 100644 --- a/app/train.cpp +++ b/app/train.cpp @@ -49,6 +49,7 @@ #include "TrainInterpolatedMarkovChain.hpp" #include "TrainSimilarityBasedSequenceWeighting.hpp" #include "TrainPhasedMarkovChainContextAlgorithm.hpp" +#include "TrainMaximalDependenceDecomposition.hpp" #include "TrainHMMMaximumLikelihood.hpp" #include "RemoveSequenceFromModel.hpp" #include "SequenceFormat.hpp" @@ -114,6 +115,7 @@ int main(int argc, char ** argv) { createModelCommand["SmoothedHistogramStanke"] = SmoothedHistogramStankePtr(new SmoothedHistogramStanke()); createModelCommand["SmoothedHistogramBurge"] = SmoothedHistogramBurgePtr(new SmoothedHistogramBurge()); createModelCommand["DiscreteIIDModel"] = TrainDiscreteIIDModelPtr(new TrainDiscreteIIDModel()); + createModelCommand["MaximalDependenceDecomposition"] = TrainMaximalDependenceDecompositionPtr(new TrainMaximalDependenceDecomposition()); modelSelectionCommand["BIC"] = BayesianInformationCriteriaPtr( diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 69410af3..85ca52dd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -22,6 +22,7 @@ Consensus ChiSquare MaximalDependenceDecomposition MaximalDependenceDecompositionCreator +TrainMaximalDependenceDecomposition ) diff --git a/src/MaximalDependenceDecompositionCreator.cpp b/src/MaximalDependenceDecompositionCreator.cpp new file mode 100644 index 00000000..1c9c208f --- /dev/null +++ b/src/MaximalDependenceDecompositionCreator.cpp @@ -0,0 +1,18 @@ +#include "MaximalDependenceDecompositionCreator.hpp" +#include "MaximalDependenceDecomposition.hpp" + +namespace tops { + ProbabilisticModelPtr MaximalDependenceDecompositionCreator::create(ProbabilisticModelParameters & parameters) const { + MaximalDependenceDecompositionPtr model = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + ProbabilisticModelParameterValuePtr symbols = parameters.getMandatoryParameterValue("alphabet"); + ProbabilisticModelParameterValuePtr concensus = parameters.getMandatoryParameterValue("consensus"); + ProbabilisticModelParameterValuePtr concensus_model = parameters.getMandatoryParameterValue("consensus_model"); + ProbabilisticModelParameterValuePtr tree = parameters.getMandatoryParameterValue("tree"); + if((symbols == NULL) || (concensus == NULL) || (concensus_model == NULL) || (tree == NULL)) + { + std::cerr << help() << std::endl; + } + model->initialize(parameters); + return model; + } +} \ No newline at end of file diff --git a/src/MaximalDependenceDecompositionCreator.hpp b/src/MaximalDependenceDecompositionCreator.hpp new file mode 100644 index 00000000..ed716e9e --- /dev/null +++ b/src/MaximalDependenceDecompositionCreator.hpp @@ -0,0 +1,45 @@ +/* + * MaximalDependenceDecompositionCreator.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef MDD_CREATOR_HPP +#define MDD_CREATOR_HPP + +#include "crossplatform.hpp" + +#include "ProbabilisticModelCreator.hpp" +#include "ProbabilisticModel.hpp" +#include + +namespace tops { + //! This class is a factory for the variable length markov chain + class DLLEXPORT MaximalDependenceDecompositionCreator : public ProbabilisticModelCreator + { + public: + MaximalDependenceDecompositionCreator() {} + virtual ProbabilisticModelPtr create(ProbabilisticModelParameters & parameters) const; + }; + typedef boost::shared_ptr < MaximalDependenceDecompositionCreator> MaximalDependenceDecompositionCreatorPtr; +} + +#endif diff --git a/src/TrainMaximalDependenceDecomposition.cpp b/src/TrainMaximalDependenceDecomposition.cpp new file mode 100644 index 00000000..06514aa9 --- /dev/null +++ b/src/TrainMaximalDependenceDecomposition.cpp @@ -0,0 +1,89 @@ +/* + * TrainMaximalDependenceDecomposition.cpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "ProbabilisticModel.hpp" +#include "ProbabilisticModelCreator.hpp" +#include "ConfigurationReader.hpp" +#include "TrainMaximalDependenceDecomposition.hpp" +#include "MaximalDependenceDecomposition.hpp" +#include "util.hpp" +#include "ProbabilisticModelCreatorClient.hpp" +#include "Symbol.hpp" +#include + +namespace tops { + + ProbabilisticModelPtr TrainMaximalDependenceDecomposition::create( ProbabilisticModelParameters & parameters) const + { + ProbabilisticModelParameterValuePtr alphabet_parameter = parameters.getOptionalParameterValue("alphabet"); + ProbabilisticModelParameterValuePtr consensus_parameter = parameters.getOptionalParameterValue("consensus"); + ProbabilisticModelParameterValuePtr consensus_model_parameter = parameters.getOptionalParameterValue("consensus_model"); + ProbabilisticModelParameterValuePtr training_set_parameter = parameters.getOptionalParameterValue("training_set"); + + if(alphabet_parameter == NULL || consensus_parameter == NULL || consensus_model_parameter == NULL || training_set_parameter == NULL) { + std::cerr << "ERROR: initial_specification is a mandatory paramenter\n" << std::endl; + return MaximalDependenceDecompositionPtr(); + } else { + MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); + alphabet->initializeFromVector(alphabet_parameter->getStringVector()); + mdd->setAlphabet(alphabet); + + std::vector consensus_symbols = consensus_parameter->getStringVector(); + ConsensusSequence consensus_sequence; + for (int i = 0; i < consensus_symbols.size(); i++) { + std::vector syms; + boost::split(syms, consensus_symbols[i], boost::is_any_of(" ")); + vector s; + for (int j = 0; j < syms.size(); j++) { + s.push_back(alphabet->getSymbol(syms[j])->id()); + } + Consensus cons(s); + consensus_sequence.push_back(cons); + } + mdd->setConsensusSequence(consensus_sequence); + + std::string consensus_model_str = consensus_model_parameter->getString(); + consensus_model_str = consensus_model_str.substr(1, consensus_model_str.size() - 2); + ConfigurationReader consensus_model_reader; + ProbabilisticModelCreatorClient consensus_model_creator; + consensus_model_reader.load(consensus_model_str); + mdd->setConsensusModel(consensus_model_creator.create(*(consensus_model_reader.parameters()))); + + SequenceEntryList sample_set; + readSequencesFromFile(sample_set, alphabet, training_set_parameter->getString()); + + cout << "-------------------------------" << endl; + + mdd->train(sample_set, 2); + + cout << "-------------------------------" << endl; + + return mdd; + } + } +}; + + + diff --git a/src/TrainMaximalDependenceDecomposition.hpp b/src/TrainMaximalDependenceDecomposition.hpp new file mode 100644 index 00000000..0993f977 --- /dev/null +++ b/src/TrainMaximalDependenceDecomposition.hpp @@ -0,0 +1,58 @@ +/* + * TrainMaximalDependenceDecomposition.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef TRAIN_MDD_HPP +#define TRAIN_MDD_HPP + +#include "crossplatform.hpp" + +#include "ProbabilisticModel.hpp" +#include "ProbabilisticModelCreator.hpp" +#include "ConfigurationReader.hpp" + + +namespace tops { + + //! Creates a HMM using Baum-Welch + class DLLEXPORT TrainMaximalDependenceDecomposition : public ProbabilisticModelCreator { + public: + TrainMaximalDependenceDecomposition () {} + virtual ~TrainMaximalDependenceDecomposition () {}; + //! Creates a probability model + /*! \param parameters is a set of parameters that is utilized to build the model */ + virtual ProbabilisticModelPtr create( ProbabilisticModelParameters & parameters) const ; + + //! Provides a help + virtual std::string help() const { + std::string s; + return s; + } + + + }; + typedef boost::shared_ptr TrainMaximalDependenceDecompositionPtr ; +}; + + +#endif From 89d024b3ee6d24ca4eddd845b6e3fc85485e9b1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Mon, 5 Aug 2013 11:40:06 -0300 Subject: [PATCH 15/23] removing some prints --- src/MaximalDependenceDecomposition.cpp | 4 ++-- src/TrainMaximalDependenceDecomposition.cpp | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 128e49f5..8289f452 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -319,8 +319,8 @@ namespace tops { ProbabilisticModelCreatorClient model_creator; model_reader.load(model_str); - cout << node_name << endl; - cout << index << endl; + // cout << node_name << endl; + // cout << index << endl; MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); diff --git a/src/TrainMaximalDependenceDecomposition.cpp b/src/TrainMaximalDependenceDecomposition.cpp index 06514aa9..943ab266 100644 --- a/src/TrainMaximalDependenceDecomposition.cpp +++ b/src/TrainMaximalDependenceDecomposition.cpp @@ -73,13 +73,9 @@ namespace tops { SequenceEntryList sample_set; readSequencesFromFile(sample_set, alphabet, training_set_parameter->getString()); - - cout << "-------------------------------" << endl; mdd->train(sample_set, 2); - cout << "-------------------------------" << endl; - return mdd; } } From b3366e09d5778e1f464857be0214becc3bb95b48 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Mon, 5 Aug 2013 11:49:15 -0300 Subject: [PATCH 16/23] bug fix: mdd can generate symbols --- src/MaximalDependenceDecomposition.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index 29bcb924..6ed766a1 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -74,6 +74,10 @@ namespace tops { void setAlphabet(AlphabetPtr alphabet) { _alphabet = alphabet; } + virtual AlphabetPtr alphabet() const + { + return _alphabet; + } void setMDDTree(MaximalDependenceDecompositionNodePtr root); void setConsensusSequence(ConsensusSequence consensus_sequence); void setConsensusModel(ProbabilisticModelPtr model); From d362d277a941ca7985d732c78e23c37d198428ba Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 6 Aug 2013 11:15:41 -0300 Subject: [PATCH 17/23] mdd's minimum_subset can be configured --- src/TrainMaximalDependenceDecomposition.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/TrainMaximalDependenceDecomposition.cpp b/src/TrainMaximalDependenceDecomposition.cpp index 943ab266..513a94b4 100644 --- a/src/TrainMaximalDependenceDecomposition.cpp +++ b/src/TrainMaximalDependenceDecomposition.cpp @@ -40,6 +40,7 @@ namespace tops { ProbabilisticModelParameterValuePtr consensus_parameter = parameters.getOptionalParameterValue("consensus"); ProbabilisticModelParameterValuePtr consensus_model_parameter = parameters.getOptionalParameterValue("consensus_model"); ProbabilisticModelParameterValuePtr training_set_parameter = parameters.getOptionalParameterValue("training_set"); + ProbabilisticModelParameterValuePtr minimum_subset_parameter = parameters.getOptionalParameterValue("minimum_subset"); if(alphabet_parameter == NULL || consensus_parameter == NULL || consensus_model_parameter == NULL || training_set_parameter == NULL) { std::cerr << "ERROR: initial_specification is a mandatory paramenter\n" << std::endl; @@ -74,7 +75,7 @@ namespace tops { SequenceEntryList sample_set; readSequencesFromFile(sample_set, alphabet, training_set_parameter->getString()); - mdd->train(sample_set, 2); + mdd->train(sample_set, minimum_subset_parameter->getInt()); return mdd; } From 92eb33e1e75a3f559544ebd057728a27f56f1fc9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 6 Aug 2013 14:15:52 -0300 Subject: [PATCH 18/23] invariant positions should not stop mdd divisions --- src/Consensus.hpp | 3 + src/MaximalDependenceDecomposition.cpp | 83 ++++++++++++++++---------- src/MaximalDependenceDecomposition.hpp | 1 + 3 files changed, 57 insertions(+), 30 deletions(-) diff --git a/src/Consensus.hpp b/src/Consensus.hpp index 0ffd7b62..c5c6888a 100644 --- a/src/Consensus.hpp +++ b/src/Consensus.hpp @@ -46,6 +46,9 @@ namespace tops { bool is(int symbol) const; std::string str() const; std::string sym_str(AlphabetPtr alphabet) const; + Sequence symbols() { + return _symbols; + } private: Sequence _symbols; }; diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 8289f452..f5edd6d7 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -21,6 +21,10 @@ namespace tops { _right = right; } + void MaximalDependenceDecompositionNode::setChild(MaximalDependenceDecompositionNodePtr child) { + _left = child; + } + MaximalDependenceDecompositionNodePtr MaximalDependenceDecompositionNode::getLeft() { return _left; } @@ -31,13 +35,16 @@ namespace tops { std::string MaximalDependenceDecompositionNode::tree_str() { std::stringstream out; - if (_left && _right) { + if (_left || _right) { out << "( "; out << _node_name << ":" << _index; out << " "; out << _left->tree_str(); out << " "; - out << _right->tree_str(); + if (_right) + out << _right->tree_str(); + else + out << "null"; out << " )"; } else { out << _node_name; @@ -50,9 +57,10 @@ namespace tops { out << _node_name << " = [" << endl; out << _model->str(); out << "]" << endl; - if (_left && _right) { + if (_left || _right) { out << _left->model_str(); - out << _right->model_str(); + if (_right) + out << _right->model_str(); } return out.str(); } @@ -201,32 +209,45 @@ namespace tops { } MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::newNode(std::string node_name, SequenceEntryList & sequences, int divmin, Sequence selected) { + InhomogeneousMarkovChainPtr null_model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); InhomogeneousMarkovChainPtr model = trainInhomogeneousMarkovChain(sequences); int consensus_index = getMaximalDependenceIndex(model, selected); selected.push_back(consensus_index); MaximalDependenceDecompositionNodePtr mdd_node; - - SequenceEntryList consensus_sequences; - SequenceEntryList nonconsensus_sequences; - subset(consensus_index, sequences, consensus_sequences, nonconsensus_sequences); - // cout << "**********************************" << endl; - // cout << "consensus_index = " << consensus_index << endl; - // cout << "consensus_siquences = " << consensus_sequences.size() << endl; - // cout << "nonconsensus_siquences = " << nonconsensus_sequences.size() << endl; - - if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { + Sequence s(_consensus_sequence.size(), -1); + s[consensus_index] = _consensus_sequence[consensus_index].symbols()[0]; + double prob = _consensus_model->inhomogeneous()->evaluatePosition(s, consensus_index, consensus_index); + if ( prob >= -0.001 && prob <= 0.001) { mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); std::stringstream p; p << "mdd_node_p" << consensus_index; - MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); - std::stringstream n; - n << "mdd_node_n" << consensus_index; - MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); - mdd_node->setChildern(left, right); + MaximalDependenceDecompositionNodePtr child = newNode(p.str(), sequences, divmin, selected); + mdd_node->setChild(child); } else { - mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, -1)); + + SequenceEntryList consensus_sequences; + SequenceEntryList nonconsensus_sequences; + subset(consensus_index, sequences, consensus_sequences, nonconsensus_sequences); + + // cout << "**********************************" << endl; + // cout << "consensus_index = " << consensus_index << endl; + // cout << "consensus_sequences = " << consensus_sequences.size() << endl; + // cout << "nonconsensus_sequences = " << nonconsensus_sequences.size() << endl; + + if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); + std::stringstream p; + p << "mdd_node_p" << consensus_index; + MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); + std::stringstream n; + n << "mdd_node_n" << consensus_index; + MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); + mdd_node->setChildern(left, right); + } else { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, -1)); + } } @@ -367,20 +388,22 @@ namespace tops { node = root; } else { string node_name = tree[0]; - int index = -1; + if (node_name != "null") { + int index = -1; - // cout << "-> " << node_name << endl; + // cout << "-> " << node_name << endl; - std::string model_str = parameters.getMandatoryParameterValue(node_name)->getString(); - model_str = model_str.substr(1, model_str.size() - 2); - ConfigurationReader model_reader; - ProbabilisticModelCreatorClient model_creator; - model_reader.load(model_str); + std::string model_str = parameters.getMandatoryParameterValue(node_name)->getString(); + model_str = model_str.substr(1, model_str.size() - 2); + ConfigurationReader model_reader; + ProbabilisticModelCreatorClient model_creator; + model_reader.load(model_str); - MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new - MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); + MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new + MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); - node = root; + node = root; + } } return node; } diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index 6ed766a1..c8059d9b 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -54,6 +54,7 @@ namespace tops { ProbabilisticModelPtr getModel(); void setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right); + void setChild(MaximalDependenceDecompositionNodePtr child); MaximalDependenceDecompositionNodePtr getLeft(); MaximalDependenceDecompositionNodePtr getRight(); From 20cf01bea8e3964997a3f3688cc378f76a0ca0b7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 6 Aug 2013 14:20:09 -0300 Subject: [PATCH 19/23] bug fix: mdd stops when the (n-1)th level of thre tree is reached --- src/MaximalDependenceDecomposition.cpp | 61 ++++++++++++++------------ 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index f5edd6d7..f857ba0a 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -209,44 +209,47 @@ namespace tops { } MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::newNode(std::string node_name, SequenceEntryList & sequences, int divmin, Sequence selected) { - InhomogeneousMarkovChainPtr null_model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); InhomogeneousMarkovChainPtr model = trainInhomogeneousMarkovChain(sequences); - int consensus_index = getMaximalDependenceIndex(model, selected); - selected.push_back(consensus_index); - MaximalDependenceDecompositionNodePtr mdd_node; - Sequence s(_consensus_sequence.size(), -1); - s[consensus_index] = _consensus_sequence[consensus_index].symbols()[0]; - double prob = _consensus_model->inhomogeneous()->evaluatePosition(s, consensus_index, consensus_index); - if ( prob >= -0.001 && prob <= 0.001) { - mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); - std::stringstream p; - p << "mdd_node_p" << consensus_index; - MaximalDependenceDecompositionNodePtr child = newNode(p.str(), sequences, divmin, selected); - mdd_node->setChild(child); - } else { - - SequenceEntryList consensus_sequences; - SequenceEntryList nonconsensus_sequences; - subset(consensus_index, sequences, consensus_sequences, nonconsensus_sequences); + int consensus_index = getMaximalDependenceIndex(model, selected); + + if (consensus_index > 0) { - // cout << "**********************************" << endl; - // cout << "consensus_index = " << consensus_index << endl; - // cout << "consensus_sequences = " << consensus_sequences.size() << endl; - // cout << "nonconsensus_sequences = " << nonconsensus_sequences.size() << endl; + selected.push_back(consensus_index); - if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { + Sequence s(_consensus_sequence.size(), -1); + s[consensus_index] = _consensus_sequence[consensus_index].symbols()[0]; + double prob = _consensus_model->inhomogeneous()->evaluatePosition(s, consensus_index, consensus_index); + if ( prob >= -0.001 && prob <= 0.001) { mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); std::stringstream p; p << "mdd_node_p" << consensus_index; - MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); - std::stringstream n; - n << "mdd_node_n" << consensus_index; - MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); - mdd_node->setChildern(left, right); + MaximalDependenceDecompositionNodePtr child = newNode(p.str(), sequences, divmin, selected); + mdd_node->setChild(child); } else { - mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, -1)); + + SequenceEntryList consensus_sequences; + SequenceEntryList nonconsensus_sequences; + subset(consensus_index, sequences, consensus_sequences, nonconsensus_sequences); + + // cout << "**********************************" << endl; + // cout << "consensus_index = " << consensus_index << endl; + // cout << "consensus_sequences = " << consensus_sequences.size() << endl; + // cout << "nonconsensus_sequences = " << nonconsensus_sequences.size() << endl; + + if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); + std::stringstream p; + p << "mdd_node_p" << consensus_index; + MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); + std::stringstream n; + n << "mdd_node_n" << consensus_index; + MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); + mdd_node->setChildern(left, right); + } else { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, -1)); + } } } From c48ee788ee529425320ec094d2dac36f45355595 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Thu, 8 Aug 2013 09:37:00 -0300 Subject: [PATCH 20/23] bug fix: mdd generates unique node names --- src/MaximalDependenceDecomposition.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index f857ba0a..13b1b8e9 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -224,7 +224,7 @@ namespace tops { if ( prob >= -0.001 && prob <= 0.001) { mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); std::stringstream p; - p << "mdd_node_p" << consensus_index; + p << node_name << "_p" << consensus_index; MaximalDependenceDecompositionNodePtr child = newNode(p.str(), sequences, divmin, selected); mdd_node->setChild(child); } else { @@ -241,10 +241,10 @@ namespace tops { if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); std::stringstream p; - p << "mdd_node_p" << consensus_index; + p << node_name << "_p" << consensus_index; MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); std::stringstream n; - n << "mdd_node_n" << consensus_index; + n << node_name << "_n" << consensus_index; MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); mdd_node->setChildern(left, right); } else { @@ -259,7 +259,7 @@ namespace tops { void MaximalDependenceDecomposition::train(SequenceEntryList & sequences, int divmin) { Sequence selected; - setMDDTree(newNode("root", sequences, divmin, selected)); + setMDDTree(newNode("node_r0", sequences, divmin, selected)); } std::string MaximalDependenceDecomposition::str () const { From cbb5c07cbbb0d035280b3af35185b53829dec293 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Fri, 9 Aug 2013 11:04:58 -0300 Subject: [PATCH 21/23] mdd#prefix_sum_array --- app/teste_mdd.cpp | 18 +++++++++++++++++- src/MaximalDependenceDecomposition.cpp | 22 +++++++++++++++++++++- src/MaximalDependenceDecomposition.hpp | 4 ++++ 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/app/teste_mdd.cpp b/app/teste_mdd.cpp index 1d7f4eb5..602f1136 100644 --- a/app/teste_mdd.cpp +++ b/app/teste_mdd.cpp @@ -176,7 +176,23 @@ int main (int argc, char ** argv) cout << "--------------------------" << endl; ProbabilisticModelPtr mdd_from_config = creator.create("_test2/mdd.txt"); - cout << mdd_from_config->str() << endl; + // cout << mdd_from_config->str() << endl; + + cout << trained_mdd->evaluate(new_sequence, 0, 9) << endl; + cout << trained_mdd->evaluate(new_sequence, 0, 1) << endl; + + new_sequence.push_back(0); + new_sequence.push_back(0); + new_sequence.push_back(0); + new_sequence.push_back(0); + + trained_mdd->initialize_prefix_sum_array(new_sequence); + cout << trained_mdd->prefix_sum_array_compute(0, 1) << endl; + cout << trained_mdd->prefix_sum_array_compute(0, 9) << endl; + cout << trained_mdd->prefix_sum_array_compute(1, 10) << endl; + cout << trained_mdd->prefix_sum_array_compute(2, 11) << endl; + cout << trained_mdd->prefix_sum_array_compute(3, 12) << endl; + cout << trained_mdd->prefix_sum_array_compute(4, 13) << endl; return 0; } diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 13b1b8e9..57a94c90 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -73,9 +73,29 @@ namespace tops { _consensus_sequence = consensus_sequence; } + double MaximalDependenceDecomposition::prefix_sum_array_compute(int begin , int end) { + if ((end - begin) != _consensus_sequence.size()) + return -HUGE; + return _prefix_sum_array[begin]; + } + + bool MaximalDependenceDecomposition::initialize_prefix_sum_array(const Sequence & s) { + int len = s.size(); + int clen = _consensus_sequence.size(); + for (int i = 0; i < (len - clen); i++) { + _prefix_sum_array.push_back(evaluate(s, i, i + clen)); + } + return true; + } + double MaximalDependenceDecomposition::evaluate(const Sequence & s, unsigned int begin, unsigned int end) const { + if ((end - begin) != _consensus_sequence.size()) + return -HUGE; + vector::const_iterator first = s.begin() + begin; + vector::const_iterator last = s.begin() + end; + vector subseq(first, last); vector indexes; - return _evaluateAux(s, _mdd_tree, indexes); + return _evaluateAux(subseq, _mdd_tree, indexes); } double MaximalDependenceDecomposition::_evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const { diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp index c8059d9b..cf4424bc 100644 --- a/src/MaximalDependenceDecomposition.hpp +++ b/src/MaximalDependenceDecomposition.hpp @@ -92,6 +92,9 @@ namespace tops { virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end) const; virtual Sequence & choose(Sequence & h, int size) const; + virtual bool initialize_prefix_sum_array(const Sequence & s); + virtual double prefix_sum_array_compute(int begin , int end); + virtual std::string model_name() const { return "MaximumDependenceDecomposition"; } @@ -110,6 +113,7 @@ namespace tops { ConsensusSequence _consensus_sequence; ProbabilisticModelPtr _consensus_model; AlphabetPtr _alphabet; + vector _prefix_sum_array; }; typedef boost::shared_ptr MaximalDependenceDecompositionPtr; From da1dbb06ee59b3bcfab77ce32ef0630fc0d49299 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Fri, 9 Aug 2013 19:17:43 -0300 Subject: [PATCH 22/23] bug fix: wrong indexes when evaluating mdd --- app/teste_mdd.cpp | 19 ++++++++++--------- src/MaximalDependenceDecomposition.cpp | 8 ++++---- 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/app/teste_mdd.cpp b/app/teste_mdd.cpp index 602f1136..4f8344e2 100644 --- a/app/teste_mdd.cpp +++ b/app/teste_mdd.cpp @@ -178,21 +178,22 @@ int main (int argc, char ** argv) ProbabilisticModelPtr mdd_from_config = creator.create("_test2/mdd.txt"); // cout << mdd_from_config->str() << endl; - cout << trained_mdd->evaluate(new_sequence, 0, 9) << endl; - cout << trained_mdd->evaluate(new_sequence, 0, 1) << endl; + // cout << trained_mdd->evaluate(new_sequence, 0, 9) << endl; + // cout << trained_mdd->evaluate(new_sequence, 0, 1) << endl; new_sequence.push_back(0); new_sequence.push_back(0); new_sequence.push_back(0); new_sequence.push_back(0); - trained_mdd->initialize_prefix_sum_array(new_sequence); - cout << trained_mdd->prefix_sum_array_compute(0, 1) << endl; - cout << trained_mdd->prefix_sum_array_compute(0, 9) << endl; - cout << trained_mdd->prefix_sum_array_compute(1, 10) << endl; - cout << trained_mdd->prefix_sum_array_compute(2, 11) << endl; - cout << trained_mdd->prefix_sum_array_compute(3, 12) << endl; - cout << trained_mdd->prefix_sum_array_compute(4, 13) << endl; + mdd->initialize_prefix_sum_array(new_sequence); + // cout << mdd->evaluate(new_sequence, 0, 8); + cout << mdd->prefix_sum_array_compute(0, 1) << endl; + cout << mdd->prefix_sum_array_compute(0, 8) << endl; + cout << mdd->prefix_sum_array_compute(1, 9) << endl; + cout << mdd->prefix_sum_array_compute(2, 10) << endl; + cout << mdd->prefix_sum_array_compute(3, 11) << endl; + cout << mdd->prefix_sum_array_compute(4, 12) << endl; return 0; } diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index 57a94c90..ff1e3ea6 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -74,7 +74,7 @@ namespace tops { } double MaximalDependenceDecomposition::prefix_sum_array_compute(int begin , int end) { - if ((end - begin) != _consensus_sequence.size()) + if ((end - begin + 1) != _consensus_sequence.size()) return -HUGE; return _prefix_sum_array[begin]; } @@ -83,16 +83,16 @@ namespace tops { int len = s.size(); int clen = _consensus_sequence.size(); for (int i = 0; i < (len - clen); i++) { - _prefix_sum_array.push_back(evaluate(s, i, i + clen)); + _prefix_sum_array.push_back(evaluate(s, i, i + clen - 1)); } return true; } double MaximalDependenceDecomposition::evaluate(const Sequence & s, unsigned int begin, unsigned int end) const { - if ((end - begin) != _consensus_sequence.size()) + if ((end - begin + 1) != _consensus_sequence.size()) return -HUGE; vector::const_iterator first = s.begin() + begin; - vector::const_iterator last = s.begin() + end; + vector::const_iterator last = s.begin() + end + 1; vector subseq(first, last); vector indexes; return _evaluateAux(subseq, _mdd_tree, indexes); From e220c2daddd73547ef38b4e59d1969f558cf02be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=C3=8Dgor=20Bonadio?= Date: Tue, 8 Oct 2013 20:55:10 -0300 Subject: [PATCH 23/23] bug fix: mdd can select the first consensus symbol --- src/MaximalDependenceDecomposition.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp index ff1e3ea6..9696855f 100644 --- a/src/MaximalDependenceDecomposition.cpp +++ b/src/MaximalDependenceDecomposition.cpp @@ -234,7 +234,7 @@ namespace tops { int consensus_index = getMaximalDependenceIndex(model, selected); - if (consensus_index > 0) { + if (consensus_index >= 0) { selected.push_back(consensus_index);