diff --git a/app/CMakeLists.txt b/app/CMakeLists.txt index a7e9f3bf..fe7a245a 100644 --- a/app/CMakeLists.txt +++ b/app/CMakeLists.txt @@ -36,6 +36,7 @@ ADD_EXECUTABLE(predalign predalign.cpp) ADD_EXECUTABLE(runAllPredalignTests runAllPredalignTests.cpp) ADD_EXECUTABLE(print_graph_from_model print_graph_from_model.cpp) ADD_EXECUTABLE(profileHMMtests profileHMMtests.cpp) +ADD_EXECUTABLE(teste_mdd teste_mdd.cpp) TARGET_LINK_LIBRARIES(evaluate ${APP_DEP} ) TARGET_LINK_LIBRARIES(train ${APP_DEP} ) @@ -60,6 +61,7 @@ TARGET_LINK_LIBRARIES(runAllPredalignTests ${APP_DEP} ) TARGET_LINK_LIBRARIES(posterior_decoding ${APP_DEP} ) TARGET_LINK_LIBRARIES(print_graph_from_model ${APP_DEP} ) TARGET_LINK_LIBRARIES(profileHMMtests ${APP_DEP} ) +TARGET_LINK_LIBRARIES(teste_mdd ${APP_DEP} ) install(TARGETS train viterbi_decoding mea_decoding simulate testeFBGHMM testeGPHMM simulateAlignment align predalign runAllPredalignTests bayes_classifier one_file_model sliding_window evaluate kullback_positional posterior_probabilities posterior_decoding print_graph_from_model profileHMMtests RUNTIME DESTINATION bin diff --git a/app/teste_mdd.cpp b/app/teste_mdd.cpp new file mode 100644 index 00000000..4f8344e2 --- /dev/null +++ b/app/teste_mdd.cpp @@ -0,0 +1,204 @@ +#include +#include +#include + +#include "Consensus.hpp" +#include "ChiSquare.hpp" +#include "MaximalDependenceDecomposition.hpp" +#include "ProbabilisticModelCreatorClient.hpp" +#include "ContextTree.hpp" + +using namespace tops; +using namespace std; + +int main (int argc, char ** argv) +{ + cout << "TesteMDD" << endl; + + /***********************************************************/ + // Alphabet + AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); + alphabet->createSymbol("A"); + alphabet->createSymbol("C"); + alphabet->createSymbol("G"); + alphabet->createSymbol("T"); + + /***********************************************************/ + // Consensus Sequence + ConsensusSequence consensus_sequence; + + vector s1; + 
s1.push_back(0); s1.push_back(1); // a/c + Consensus c1(s1); + consensus_sequence.push_back(c1); + + int x1[] = {0, 2, 2, 3}; // AGGT + for (int i = 0; i < 4; i++) { + vector s; + s.push_back(x1[i]); + Consensus c(s); + consensus_sequence.push_back(c); + } + + vector s2; + s2.push_back(0); s2.push_back(2); // a/g + Consensus c2(s2); + consensus_sequence.push_back(c2); + + int x2[] = {0, 2, 3}; // agt + for (int i = 0; i < 3; i++) { + vector s; + s.push_back(x2[i]); + Consensus c(s); + consensus_sequence.push_back(c); + } + + for (std::vector::iterator it = consensus_sequence.begin() ; it != consensus_sequence.end(); ++it) { + cout << it->str() << '\t'; + } + cout << endl; + + /***********************************************************/ + // WMMs + ProbabilisticModelCreatorClient creator; + ProbabilisticModelPtr root = creator.create("_test2/root.txt"); + ProbabilisticModelPtr g5 = creator.create("_test2/g5.txt"); + ProbabilisticModelPtr h5 = creator.create("_test2/h5.txt"); + ProbabilisticModelPtr g5gm1 = creator.create("_test2/g5gm1.txt"); + ProbabilisticModelPtr g5hm1 = creator.create("_test2/g5hm1.txt"); + ProbabilisticModelPtr g5gm1am2 = creator.create("_test2/g5gm1am2.txt"); + ProbabilisticModelPtr g5gm1bm2 = creator.create("_test2/g5gm1bm2.txt"); + ProbabilisticModelPtr g5gm1am2u6 = creator.create("_test2/g5gm1am2u6.txt"); + ProbabilisticModelPtr g5gm1am2v6 = creator.create("_test2/g5gm1am2v6.txt"); + + /***********************************************************/ + // MDD tree + MaximalDependenceDecompositionNodePtr mdd_root = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("root", root, 7)); + MaximalDependenceDecompositionNodePtr mdd_g5 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5", g5, 2)); + MaximalDependenceDecompositionNodePtr mdd_h5 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("h5", h5, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1 
= MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5gm1", g5gm1, 1)); + MaximalDependenceDecompositionNodePtr mdd_g5hm1 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5hm1", g5hm1, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1am2 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5gm1am2", g5gm1am2, 8)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1bm2 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5gm1bm2", g5gm1bm2, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1am2u6 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5gm1am2u6", g5gm1am2u6, -1)); + MaximalDependenceDecompositionNodePtr mdd_g5gm1am2v6 = MaximalDependenceDecompositionNodePtr( + new MaximalDependenceDecompositionNode("g5gm1am2v6", g5gm1am2v6, -1)); + + mdd_root->setChildern(mdd_g5, mdd_h5); + mdd_g5->setChildern(mdd_g5gm1, mdd_g5hm1); + mdd_g5gm1->setChildern(mdd_g5gm1am2, mdd_g5gm1bm2); + mdd_g5gm1am2->setChildern(mdd_g5gm1am2u6, mdd_g5gm1am2v6); + + /***********************************************************/ + // MDD definition + + MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + mdd->setAlphabet(alphabet); + mdd->setMDDTree(mdd_root); + mdd->setConsensusSequence(consensus_sequence); + ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); + mdd->setConsensusModel(consensus_model); + + /***********************************************************/ + // Evaluate + + Sequence s; + s.push_back(1);s.push_back(0);s.push_back(2);s.push_back(2);s.push_back(3);s.push_back(2);s.push_back(0);s.push_back(0);s.push_back(3); + cout << mdd->evaluate(s, 0, 9) << endl; + + /***********************************************************/ + // Choose + + srand(time(NULL)); + Sequence new_sequence; + mdd->choose(new_sequence, 9); 
+ + for (int i = 0; i < 9; i++) { + cout << new_sequence[i] << "\t"; + } + cout << endl; + + /***********************************************************/ + // Criando SequenceEntryList + SequenceEntryList sequences; + for (int i = 0; i < 10; i++) { + SequenceEntryPtr se = SequenceEntryPtr(new SequenceEntry(alphabet)); + Sequence random_sequence; + mdd->choose(random_sequence, 9); + se->setSequence(random_sequence); + se->setName("s"); + sequences.push_back(se); + } + + for (int i = 0; i < 10; i++) { + for (int j = 0; j < 10; j++) { + cout << sequences[i]->getSequence()[j]; + } + cout << endl; + } + + /***********************************************************/ + // Treinando MDD + MaximalDependenceDecompositionPtr trained_mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + trained_mdd->setAlphabet(alphabet); + trained_mdd->setConsensusSequence(consensus_sequence); + + // ProbabilisticModelPtr consensus_model = creator.create("_test2/consensus_model.txt"); + trained_mdd->setConsensusModel(consensus_model); + + trained_mdd->train(sequences, 2); + + cout << "-------------------------" << endl; + Sequence new_sequence_from_trained_mdd; + trained_mdd->choose(new_sequence_from_trained_mdd, 9); + + for (int i = 0; i < 9; i++) { + cout << new_sequence_from_trained_mdd[i] << "\t"; + } + cout << endl; + + // std::map w; + // ContextTreePtr tree = ContextTreePtr(new ContextTree(alphabet)); + // tree->initializeCounter(sequences, 0, w); + // tree->normalize(); + + // cout << tree->getContext(0)->getDistribution()->str() << endl; + + // cout << mdd->str() << endl; + + cout << "--------------------------" << endl; + ProbabilisticModelPtr mdd_from_config = creator.create("_test2/mdd.txt"); + // cout << mdd_from_config->str() << endl; + + // cout << trained_mdd->evaluate(new_sequence, 0, 9) << endl; + // cout << trained_mdd->evaluate(new_sequence, 0, 1) << endl; + + new_sequence.push_back(0); + new_sequence.push_back(0); + new_sequence.push_back(0); 
+ new_sequence.push_back(0); + + mdd->initialize_prefix_sum_array(new_sequence); + // cout << mdd->evaluate(new_sequence, 0, 8); + cout << mdd->prefix_sum_array_compute(0, 1) << endl; + cout << mdd->prefix_sum_array_compute(0, 8) << endl; + cout << mdd->prefix_sum_array_compute(1, 9) << endl; + cout << mdd->prefix_sum_array_compute(2, 10) << endl; + cout << mdd->prefix_sum_array_compute(3, 11) << endl; + cout << mdd->prefix_sum_array_compute(4, 12) << endl; + + return 0; +} + + + + + diff --git a/app/train.cpp b/app/train.cpp index cf872138..6d57abc0 100644 --- a/app/train.cpp +++ b/app/train.cpp @@ -49,6 +49,7 @@ #include "TrainInterpolatedMarkovChain.hpp" #include "TrainSimilarityBasedSequenceWeighting.hpp" #include "TrainPhasedMarkovChainContextAlgorithm.hpp" +#include "TrainMaximalDependenceDecomposition.hpp" #include "TrainHMMMaximumLikelihood.hpp" #include "RemoveSequenceFromModel.hpp" #include "SequenceFormat.hpp" @@ -114,6 +115,7 @@ int main(int argc, char ** argv) { createModelCommand["SmoothedHistogramStanke"] = SmoothedHistogramStankePtr(new SmoothedHistogramStanke()); createModelCommand["SmoothedHistogramBurge"] = SmoothedHistogramBurgePtr(new SmoothedHistogramBurge()); createModelCommand["DiscreteIIDModel"] = TrainDiscreteIIDModelPtr(new TrainDiscreteIIDModel()); + createModelCommand["MaximalDependenceDecomposition"] = TrainMaximalDependenceDecompositionPtr(new TrainMaximalDependenceDecomposition()); modelSelectionCommand["BIC"] = BayesianInformationCriteriaPtr( diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 02d52640..85ca52dd 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -17,7 +17,13 @@ include_directories(../lang/src .) 
ADD_LIBRARY(ToPS SHARED Symbol Alphabet util SparseMatrix MultipleAlignment GHMMStates DiscreteIIDModel VariableLengthMarkovChain InhomogeneousMarkovChain HiddenMarkovModel ConfigurationReader DiscreteIIDModelCreator VariableLengthMarkovChainCreator InhomogeneousMarkovChainCreator HiddenMarkovModelCreator GeneralizedHiddenMarkovModelCreator SequenceEntry SequenceFactory GeneralizedHiddenMarkovModel TrainVariableLengthMarkovChain TrainFixedLengthMarkovChain TrainInterpolatedMarkovChain TrainHMMBaumWelch TrainHMMMaximumLikelihood ContextTree TrainWeightArrayModel BayesianInformationCriteria TrainVariableLengthInhomogeneousMarkovChain ProbabilisticModelCreatorClient AkaikeInformationCriteria TrainPhasedMarkovChain FactorableModel DecodableModel InhomogeneousFactorableModel SmoothedHistogramKernelDensity FixedSequenceAtPosition FixedSequenceAtPositionCreator PhasedRunLengthDistribution PhasedRunLengthDistributionCreator SmoothedHistogramStanke TrainPhasedMarkovChainContextAlgorithm RemoveSequenceFromModel SmoothedHistogramBurge SequenceFormat TrainDiscreteIIDModel TargetModel TargetModelCreator ReverseComplementDNA ReverseComplementDNACreator ProbabilisticModelParameter TrainGHMMTransitions BernoulliModelCreator TrainInterpolatedPhasedMarkovChain SimilarityBasedSequenceWeightingCreator SimilarityBasedSequenceWeighting TrainSimilarityBasedSequenceWeighting MultipleSequentialModelCreator MultipleSequentialModel StoreLoadedModel PairHiddenMarkovModel PairHiddenMarkovModelCreator TrainPHMMBaumWelch MaximumDependenceDecomposition crossplatform.hpp ProbabilisticModelConfiguration ProbabilisticModelParameterValue2 -../lang/src/ASTNode ../lang/src/PropertyNode ../lang/src/KeyNode ../lang/src/ValueNode ../lang/src/StringNode ../lang/src/ListNode ../lang/src/ConfigurationNode ../lang/src/IntegerNode ../lang/src/FloatNode ../lang/src/ConditionalProbabilityNode ../lang/src/ConditionalProbabilityMapNode ../lang/src/ConditionNode ../lang/src/ProbabilityMapNode 
#include <cmath>

namespace tops {

  // Chi-square critical values.
  // Rows are degrees of freedom: 1..25 (rows 0..24), then 30, 40, 50, 60, 80
  // and 100 (rows 25..30), as selected by chiDfi().
  // Columns are the tail probabilities listed in chiPi():
  // 0.975, 0.2, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001, 0.0005.
  double CHI_QUARE[31][9] = {
    {0.00098, 1.64, 2.71, 3.84, 5.02, 6.63, 7.88, 10.83, 12.12},
    {0.051, 3.22, 4.61, 5.99, 7.38, 9.21, 10.60, 13.82, 15.20},
    {0.216, 4.64, 6.25, 7.81, 9.35, 11.34, 12.84, 16.27, 17.73},
    {0.48, 5.99, 7.78, 9.49, 11.14, 13.28, 14.86, 18.47, 20.00},
    {0.83, 7.29, 9.24, 11.07, 12.83, 15.09, 16.75, 20.51, 22.11},
    {1.24, 8.56, 10.64, 12.59, 14.45, 16.81, 18.55, 22.46, 24.10},
    {1.69, 9.80, 12.02, 14.07, 16.01, 18.48, 20.28, 24.32, 26.02},
    {2.18, 11.03, 13.36, 15.51, 17.53, 20.09, 21.95, 26.12, 27.87},
    {2.70, 12.24, 14.68, 16.92, 19.02, 21.67, 23.59, 27.88, 29.67},
    {3.25, 13.44, 15.99, 18.31, 20.48, 23.21, 25.19, 29.59, 31.42},
    {3.82, 14.63, 17.28, 19.68, 21.92, 24.73, 26.76, 31.26, 33.14},
    {4.40, 15.81, 18.55, 21.03, 23.34, 26.22, 28.30, 32.91, 34.82},
    {5.01, 16.98, 19.81, 22.36, 24.74, 27.69, 29.82, 34.53, 36.48},
    {5.63, 18.15, 21.06, 23.68, 26.12, 29.14, 31.32, 36.12, 38.11},
    {6.26, 19.31, 22.31, 25.00, 27.49, 30.58, 32.80, 37.70, 39.72},
    {6.91, 20.47, 23.54, 26.30, 28.85, 32.00, 34.27, 39.25, 41.31},
    {7.56, 21.61, 24.77, 27.59, 30.19, 33.41, 35.72, 40.79, 42.88},
    {8.23, 22.76, 25.99, 28.87, 31.53, 34.81, 37.16, 42.31, 44.43},
    {8.91, 23.90, 27.20, 30.14, 32.85, 36.19, 38.58, 43.82, 45.97},
    {9.59, 25.04, 28.41, 31.41, 34.17, 37.57, 40.00, 45.31, 47.50},
    {10.28, 26.17, 29.62, 32.67, 35.48, 38.93, 41.40, 46.80, 49.01},
    {10.98, 27.30, 30.81, 33.92, 36.78, 40.29, 42.80, 48.27, 50.51},
    {11.69, 28.43, 32.01, 35.17, 38.08, 41.64, 44.18, 49.73, 52.00},
    {12.40, 29.55, 33.20, 36.42, 39.36, 42.98, 45.56, 51.18, 53.48},
    {13.12, 30.68, 34.38, 37.65, 40.65, 44.31, 46.93, 52.62, 54.95},
    {16.79, 36.25, 40.26, 43.77, 46.98, 50.89, 53.67, 59.70, 62.16},
    {24.43, 47.27, 51.81, 55.76, 59.34, 63.69, 66.77, 73.40, 76.10},
    {32.36, 58.16, 63.17, 67.50, 71.42, 76.15, 79.49, 86.66, 89.56},
    {40.48, 68.97, 74.40, 79.08, 83.30, 88.38, 91.95, 99.61, 102.7},
    {57.15, 90.41, 96.58, 101.9, 106.6, 112.3, 116.3, 124.8, 128.3},
    {74.22, 111.7, 118.5, 124.3, 129.6, 135.8, 140.2, 149.4, 153.2}
  };

  // Maps a tail probability `p` to the column of CHI_QUARE holding the
  // nearest tabulated probability.
  // Probabilities above 0.975 use the first column; probabilities below
  // 0.0005 use the last column (they previously fell through to column 0,
  // i.e. the 0.975 column).
  int chiPi(double p) {
    static const double prob[] = {0.975, 0.2, 0.1, 0.05, 0.025, 0.01, 0.005, 0.001, 0.0005};
    const int columns = 9;
    for (int i = 0; i < columns; i++) {
      if (p >= prob[i]) {
        if (i == 0)
          return 0;
        // p lies between prob[i] and prob[i-1]: pick whichever is closer.
        // std::fabs is spelled out so the comparison can never bind to the
        // integer abs() and truncate both distances to zero.
        if (std::fabs(p - prob[i]) > std::fabs(p - prob[i-1]))
          return i - 1;
        return i;
      }
    }
    return columns - 1;
  }

  // Maps degrees of freedom `df` to the row of CHI_QUARE to use.
  //   df <= 0    -> row 0 (clamped; used to index row df-1 < 0)
  //   1..25      -> exact row
  //   26..60     -> next tabulated row at or above df (30, 40, 50, 60)
  //   61..90     -> row for df = 80
  //   above 90   -> row for df = 100
  int chiDfi(int df) {
    if (df < 1)
      return 0;
    if (df <= 25)
      return df - 1;
    if (df <= 30)
      return 25;
    if (df <= 40)
      return 26;
    if (df <= 50)
      return 27;
    if (df <= 60)
      return 28;
    return (df > 90) ? 30 : 29;
  }

  // Returns the tabulated chi-square critical value for tail probability `p`
  // and `df` degrees of freedom, using the nearest tabulated column/row as
  // chosen by chiPi() and chiDfi(). No interpolation is performed.
  double chiSquare(double p, int df) {
    int pi = chiPi(p);
    int dfi = chiDfi(df);
    return CHI_QUARE[dfi][pi];
  }
}
00000000..c59ee674 --- /dev/null +++ b/src/ChiSquare.hpp @@ -0,0 +1,36 @@ +/* + * ChiSquare.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef CHI_SQUARE_HPP +#define CHI_SQUARE_HPP +#include + +#include "crossplatform.hpp" + + +namespace tops { + double chiSquare(double p, int df); +} + +#endif diff --git a/src/ConfigurationReader.cpp b/src/ConfigurationReader.cpp index 553522f7..d4717ed9 100644 --- a/src/ConfigurationReader.cpp +++ b/src/ConfigurationReader.cpp @@ -471,15 +471,24 @@ namespace tops { rule config_file, parameter_spec, parameter_value, parameter_name, prob_table, string_vector, double_vector, - int_vector, word, word_p, string_map, transition_map, nested_configuration, nested_parameter_spec; + int_vector, word, word_p, string_map, transition_map, nested_configuration, nested_parameter_spec, + tree_p, tree; word_p = lexeme_d [ +(alnum_p | (ch_p('_') | '.' | '/' | '-' | ' ' | ',' | '+' ))] ; + tree_p + = lexeme_d [ +(alnum_p | (ch_p('_') | '.' 
| '/' | '-' | ' ' | ',' | '+' | '(' | ')' | ':' ))] + ; word = ch_p('"') >> word_p >> ch_p('"') ; + tree + = ch_p('{') + >> tree_p + >> ch_p('}') + ; double_vector = ch_p('(') >> real_p[create_double_vector(this)] @@ -541,6 +550,7 @@ namespace tops { = double_vector | parameter_name [set_parameter_value_string(this)] | word [set_parameter_value_word(this)] + | tree [set_parameter_value_word(this)] | string_vector | transition_map | strict_real_p [set_parameter_value_double(this)] diff --git a/src/Consensus.cpp b/src/Consensus.cpp new file mode 100644 index 00000000..8b87434a --- /dev/null +++ b/src/Consensus.cpp @@ -0,0 +1,33 @@ +#include "Consensus.hpp" +#include "Alphabet.hpp" +#include "Symbol.hpp" + +namespace tops { + bool Consensus::is(int symbol) const { + for (std::vector::const_iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + if (*it == symbol) + return true; + } + return false; + } + + std::string Consensus::str() const { + std::stringstream out; + for (std::vector::const_iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + out << (*it); + } + return out.str(); + } + + std::string Consensus::sym_str(AlphabetPtr alphabet) const { + std::stringstream out; + out << "\""; + for (std::vector::const_iterator it = _symbols.begin() ; it != _symbols.end(); ++it) { + out << alphabet->getSymbol(*it)->name(); + if ((it+1) != _symbols.end()) + out << " "; + } + out << "\""; + return out.str(); + } +} \ No newline at end of file diff --git a/src/Consensus.hpp b/src/Consensus.hpp new file mode 100644 index 00000000..c5c6888a --- /dev/null +++ b/src/Consensus.hpp @@ -0,0 +1,60 @@ +/* + * ConfigurationReader.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at 
your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef CONSENSUS_HPP +#define CONSENSUS_HPP +#include + +#include "crossplatform.hpp" +#include "util.hpp" +#include "Sequence.hpp" + +#include +#include +#include + + +using namespace std; + + +namespace tops { + + class Consensus { + public: + Consensus(Sequence symbols):_symbols(symbols) {} + bool is(int symbol) const; + std::string str() const; + std::string sym_str(AlphabetPtr alphabet) const; + Sequence symbols() { + return _symbols; + } + private: + Sequence _symbols; + }; + + typedef std::vector ConsensusSequence; + +} + +#endif diff --git a/src/MaximalDependenceDecomposition.cpp b/src/MaximalDependenceDecomposition.cpp new file mode 100644 index 00000000..9696855f --- /dev/null +++ b/src/MaximalDependenceDecomposition.cpp @@ -0,0 +1,445 @@ +#include "MaximalDependenceDecomposition.hpp" +#include "InhomogeneousMarkovChain.hpp" +#include "ContextTree.hpp" +#include "ProbabilisticModelParameter.hpp" +#include "Symbol.hpp" +#include "ProbabilisticModelCreatorClient.hpp" + +#include + +namespace tops { + int MaximalDependenceDecompositionNode::getIndex() { + return _index; + } + + ProbabilisticModelPtr MaximalDependenceDecompositionNode::getModel() { + return _model; + } + + void MaximalDependenceDecompositionNode::setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right) { + _left = left; + _right = right; + } + + void MaximalDependenceDecompositionNode::setChild(MaximalDependenceDecompositionNodePtr child) { 
+ _left = child; + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecompositionNode::getLeft() { + return _left; + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecompositionNode::getRight() { + return _right; + } + + std::string MaximalDependenceDecompositionNode::tree_str() { + std::stringstream out; + if (_left || _right) { + out << "( "; + out << _node_name << ":" << _index; + out << " "; + out << _left->tree_str(); + out << " "; + if (_right) + out << _right->tree_str(); + else + out << "null"; + out << " )"; + } else { + out << _node_name; + } + return out.str(); + } + + std::string MaximalDependenceDecompositionNode::model_str() { + std::stringstream out; + out << _node_name << " = [" << endl; + out << _model->str(); + out << "]" << endl; + if (_left || _right) { + out << _left->model_str(); + if (_right) + out << _right->model_str(); + } + return out.str(); + } + + void MaximalDependenceDecomposition::setMDDTree(MaximalDependenceDecompositionNodePtr root) { + _mdd_tree = root; + } + + void MaximalDependenceDecomposition::setConsensusSequence(ConsensusSequence consensus_sequence) { + _consensus_sequence = consensus_sequence; + } + + double MaximalDependenceDecomposition::prefix_sum_array_compute(int begin , int end) { + if ((end - begin + 1) != _consensus_sequence.size()) + return -HUGE; + return _prefix_sum_array[begin]; + } + + bool MaximalDependenceDecomposition::initialize_prefix_sum_array(const Sequence & s) { + int len = s.size(); + int clen = _consensus_sequence.size(); + for (int i = 0; i < (len - clen); i++) { + _prefix_sum_array.push_back(evaluate(s, i, i + clen - 1)); + } + return true; + } + + double MaximalDependenceDecomposition::evaluate(const Sequence & s, unsigned int begin, unsigned int end) const { + if ((end - begin + 1) != _consensus_sequence.size()) + return -HUGE; + vector::const_iterator first = s.begin() + begin; + vector::const_iterator last = s.begin() + end + 1; + vector subseq(first, last); + vector 
indexes; + return _evaluateAux(subseq, _mdd_tree, indexes); + } + + double MaximalDependenceDecomposition::_evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const { + double p = 0; + if (node->getLeft()) { + p = node->getModel()->inhomogeneous()->evaluatePosition(s, node->getIndex(), node->getIndex()); + indexes.push_back(node->getIndex()); + // cout << node->getIndex() << endl; + // cout << "tem filho" << endl; + if (_consensus_sequence[node->getIndex()].is(s[node->getIndex()])) { + // cout << "eh consensus" << endl; + p += _evaluateAux(s, node->getLeft(), indexes); + } else { + // cout << "nao eh consensus" << endl; + p += _evaluateAux(s, node->getRight(), indexes); + } + } else { // leaf + // cout << "nao tem filho" << endl; + for (int i = 0; i < s.size(); i++) { + if (std::find(indexes.begin(), indexes.end(), i) == indexes.end()) { + p += node->getModel()->inhomogeneous()->evaluatePosition(s, i, i); + } + } + } + return p; + } + + Sequence & MaximalDependenceDecomposition::choose(Sequence & s, int size) const { + s = Sequence(size, -1); + _chooseAux(s, _mdd_tree); + return s; + } + + void MaximalDependenceDecomposition::_chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node) const { + if (node->getLeft()) { + s[node->getIndex()] = node->getModel()->inhomogeneous()->choosePosition(s, node->getIndex(), node->getIndex()); + if (_consensus_sequence[node->getIndex()].is(s[node->getIndex()])) { + _chooseAux(s, node->getLeft()); + } else { + _chooseAux(s, node->getRight()); + } + } else { // leaf + for (int i = 0; i < s.size(); i++) { + if (s[i] == -1) { + s[i] = node->getModel()->inhomogeneous()->choosePosition(s, i, i); + } + } + } + } + + InhomogeneousMarkovChainPtr MaximalDependenceDecomposition::trainInhomogeneousMarkovChain(SequenceEntryList & sequences) { + vector position_specific_context_trees; + for (int j = 0; j < sequences[0]->getSequence().size(); j++) { + SequenceEntryList imc_sequences; + + 
SequenceEntryPtr se = SequenceEntryPtr(new SequenceEntry()); + Sequence s; + for (int i = 0; i < sequences.size(); i++) { + s.push_back(sequences[i]->getSequence()[j]); + } + se->setSequence(s); + se->setName("s"); + imc_sequences.push_back(se); + + std::map w; + ContextTreePtr tree = ContextTreePtr(new ContextTree(_alphabet)); + tree->initializeCounter(imc_sequences, 0, w); + tree->normalize(); + position_specific_context_trees.push_back(tree); + } + InhomogeneousMarkovChainPtr model = InhomogeneousMarkovChainPtr(new InhomogeneousMarkovChain()); + model->setPositionSpecificDistribution(position_specific_context_trees); + model->setAlphabet(_alphabet); + return model; + } + + int MaximalDependenceDecomposition::getMaximalDependenceIndex(InhomogeneousMarkovChainPtr model, Sequence selected) { + Sequence s(_consensus_sequence.size(), -1); + double maximal = -HUGE; + double maximal_i = -1; + for (int i = 0; i < _consensus_sequence.size(); i++) { + double sum; + for (int j = 0; j < _consensus_sequence.size(); j++) { + if (i != j) { + double x; + double chi = -HUGE; + for (int k = 0; k < _alphabet->size(); k++) { + s[i] = k; + double e = _consensus_model->inhomogeneous()->evaluatePosition(s, i, i); + s[j] = k; + double o = model->evaluatePosition(s, j, j); + x = (o - e)+(o - e)-e; + chi = log_sum(chi, x); + } + // cout << chi << "\t"; + sum = log_sum(sum, chi); + } else { + // cout << "-" << "\t"; + } + } + // cout << sum <getSequence()[index])) { + consensus.push_back(sequences[i]); + } else { + nonconsensus.push_back(sequences[i]); + } + } + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::newNode(std::string node_name, SequenceEntryList & sequences, int divmin, Sequence selected) { + InhomogeneousMarkovChainPtr model = trainInhomogeneousMarkovChain(sequences); + MaximalDependenceDecompositionNodePtr mdd_node; + + int consensus_index = getMaximalDependenceIndex(model, selected); + + if (consensus_index >= 0) { + + 
selected.push_back(consensus_index); + + Sequence s(_consensus_sequence.size(), -1); + s[consensus_index] = _consensus_sequence[consensus_index].symbols()[0]; + double prob = _consensus_model->inhomogeneous()->evaluatePosition(s, consensus_index, consensus_index); + if ( prob >= -0.001 && prob <= 0.001) { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); + std::stringstream p; + p << node_name << "_p" << consensus_index; + MaximalDependenceDecompositionNodePtr child = newNode(p.str(), sequences, divmin, selected); + mdd_node->setChild(child); + } else { + + SequenceEntryList consensus_sequences; + SequenceEntryList nonconsensus_sequences; + subset(consensus_index, sequences, consensus_sequences, nonconsensus_sequences); + + // cout << "**********************************" << endl; + // cout << "consensus_index = " << consensus_index << endl; + // cout << "consensus_sequences = " << consensus_sequences.size() << endl; + // cout << "nonconsensus_sequences = " << nonconsensus_sequences.size() << endl; + + if ((consensus_sequences.size() > divmin) && (nonconsensus_sequences.size() > divmin)) { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, consensus_index)); + std::stringstream p; + p << node_name << "_p" << consensus_index; + MaximalDependenceDecompositionNodePtr left = newNode(p.str(), consensus_sequences, divmin, selected); + std::stringstream n; + n << node_name << "_n" << consensus_index; + MaximalDependenceDecompositionNodePtr right = newNode(n.str(), nonconsensus_sequences, divmin, selected); + mdd_node->setChildern(left, right); + } else { + mdd_node = MaximalDependenceDecompositionNodePtr(new MaximalDependenceDecompositionNode(node_name, model, -1)); + } + } + } + + + return mdd_node; + } + + void MaximalDependenceDecomposition::train(SequenceEntryList & sequences, int divmin) { + Sequence selected; + 
setMDDTree(newNode("node_r0", sequences, divmin, selected)); + } + + std::string MaximalDependenceDecomposition::str () const { + std::stringstream out; + + out << "model_name = \"MaximalDependenceDecomposition\"" << endl; + + out << _alphabet->str(); + + out << "consensus = ("; + for (int i = 0; i < _consensus_sequence.size(); i++) { + out << _consensus_sequence[i].sym_str(_alphabet); + if (i != (_consensus_sequence.size() - 1)) + out << ", "; + } + out << ")" << endl; + + out << "consensus_model = [" << endl; + out << _consensus_model->str(); + out<< "]" << endl; + + out << _mdd_tree->model_str(); + out << "tree = {" << _mdd_tree->tree_str() << "}" << endl; + + return out.str(); + } + void MaximalDependenceDecomposition::initialize(const ProbabilisticModelParameters & parameters) { + ProbabilisticModelParameterValuePtr symbols = parameters.getMandatoryParameterValue("alphabet"); + ProbabilisticModelParameterValuePtr consensus_param = parameters.getMandatoryParameterValue("consensus"); + ProbabilisticModelParameterValuePtr consensus_model_param = parameters.getMandatoryParameterValue("consensus_model"); + ProbabilisticModelParameterValuePtr tree = parameters.getMandatoryParameterValue("tree"); + + AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); + alphabet->initializeFromVector(symbols->getStringVector()); + setAlphabet(alphabet); + + std::vector consensus_symbols = consensus_param->getStringVector(); + ConsensusSequence consensus_sequence; + for (int i = 0; i < consensus_symbols.size(); i++) { + std::vector syms; + boost::split(syms, consensus_symbols[i], boost::is_any_of(" ")); + vector s; + for (int j = 0; j < syms.size(); j++) { + s.push_back(alphabet->getSymbol(syms[j])->id()); + } + Consensus cons(s); + consensus_sequence.push_back(cons); + } + setConsensusSequence(consensus_sequence); + + std::string consensus_model_str = consensus_model_param->getString(); + consensus_model_str = consensus_model_str.substr(1, consensus_model_str.size() - 2); + 
ConfigurationReader consensus_model_reader; + ProbabilisticModelCreatorClient consensus_model_creator; + consensus_model_reader.load(consensus_model_str); + setConsensusModel(consensus_model_creator.create(*(consensus_model_reader.parameters()))); + + std::vector _tree; + string tree_str = tree->getString(); + boost::split(_tree, tree_str, boost::is_any_of(" ")); + std::vector tree_r; + for (int i = 0; i < _tree.size(); i++) + if (_tree[i] != "" && _tree[i] != " " && _tree[i] != "\n" && _tree[i] != "\t") + tree_r.push_back(_tree[i]); + setMDDTree(initializeTree(parameters, tree_r)); + } + + MaximalDependenceDecompositionNodePtr MaximalDependenceDecomposition::initializeTree(const ProbabilisticModelParameters & parameters, std::vector& tree) { + MaximalDependenceDecompositionNodePtr node; + if (tree[0] == "(") { + std::vector tree_node; + + boost::split(tree_node, tree[1], boost::is_any_of(":")); + string node_name = tree_node[0]; + + int index = std::atoi(tree_node[1].c_str()); + + std::string model_str = parameters.getMandatoryParameterValue(node_name)->getString(); + model_str = model_str.substr(1, model_str.size() - 2); + ConfigurationReader model_reader; + ProbabilisticModelCreatorClient model_creator; + model_reader.load(model_str); + + // cout << node_name << endl; + // cout << index << endl; + + MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new + MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); + + int count = 0; + int i = 2; + std::vector tree_node_left; + tree_node_left.push_back(tree[2]); + if (tree[2] == "(") { + count = 1; + i = 2; + while (count > 0) { + i++; + if (tree[i] == "(") + count++; + else if (tree[i] == ")") + count--; + tree_node_left.push_back(tree[i]); + } + } + + std::vector tree_node_right; + tree_node_right.push_back(tree[i+1]); + if (tree[count] == "(") { + count = 1; + i++; + while (count > 0) { + i++; + if (tree[i] == "(") + count++; + else 
if (tree[i] == ")") + count--; + tree_node_right.push_back(tree[i]); + } + } + + // cout << "left:" << endl; + MaximalDependenceDecompositionNodePtr left_node = initializeTree(parameters, tree_node_left); + // cout << "right:" << endl; + MaximalDependenceDecompositionNodePtr right_node = initializeTree(parameters, tree_node_right); + + root->setChildern(left_node, right_node); + + node = root; + } else { + string node_name = tree[0]; + if (node_name != "null") { + int index = -1; + + // cout << "-> " << node_name << endl; + + std::string model_str = parameters.getMandatoryParameterValue(node_name)->getString(); + model_str = model_str.substr(1, model_str.size() - 2); + ConfigurationReader model_reader; + ProbabilisticModelCreatorClient model_creator; + model_reader.load(model_str); + + MaximalDependenceDecompositionNodePtr root = MaximalDependenceDecompositionNodePtr(new + MaximalDependenceDecompositionNode(node_name, model_creator.create(*(model_reader.parameters())), index)); + + node = root; + } + } + return node; + } +} + + + + + + + + + + + + diff --git a/src/MaximalDependenceDecomposition.hpp b/src/MaximalDependenceDecomposition.hpp new file mode 100644 index 00000000..cf4424bc --- /dev/null +++ b/src/MaximalDependenceDecomposition.hpp @@ -0,0 +1,122 @@ +/* + * MaximalDependenceDecomposition.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef MAXIMAL_DEPENDENCE_DECOMPOSITION_HPP +#define MAXIMAL_DEPENDENCE_DECOMPOSITION_HPP +#include + +#include "crossplatform.hpp" +#include "util.hpp" +#include "Sequence.hpp" +#include "Consensus.hpp" +#include "ProbabilisticModel.hpp" +#include "InhomogeneousMarkovChain.hpp" + +#include +#include +#include + + +using namespace std; + + +namespace tops { + + class MaximalDependenceDecompositionNode; + typedef boost::shared_ptr MaximalDependenceDecompositionNodePtr; + + class MaximalDependenceDecompositionNode { + public: + MaximalDependenceDecompositionNode(std::string node_name, ProbabilisticModelPtr model, int index):_model(model), _index(index), _node_name(node_name) {}; + + int getIndex(); + ProbabilisticModelPtr getModel(); + + void setChildern(MaximalDependenceDecompositionNodePtr left, MaximalDependenceDecompositionNodePtr right); + void setChild(MaximalDependenceDecompositionNodePtr child); + MaximalDependenceDecompositionNodePtr getLeft(); + MaximalDependenceDecompositionNodePtr getRight(); + + std::string tree_str(); + std::string model_str(); + private: + vector _otherIndexes; + ProbabilisticModelPtr _model; + int _index; + std::string _node_name; + MaximalDependenceDecompositionNodePtr _left; + MaximalDependenceDecompositionNodePtr _right; + }; + + class MaximalDependenceDecomposition : public ProbabilisticModel { + public: + MaximalDependenceDecomposition() {}; + void setAlphabet(AlphabetPtr alphabet) { + _alphabet = alphabet; + } + virtual AlphabetPtr alphabet() const + { + return _alphabet; + } + void setMDDTree(MaximalDependenceDecompositionNodePtr root); + void setConsensusSequence(ConsensusSequence consensus_sequence); + void setConsensusModel(ProbabilisticModelPtr model); + + InhomogeneousMarkovChainPtr 
trainInhomogeneousMarkovChain(SequenceEntryList & sequences); + int getMaximalDependenceIndex(InhomogeneousMarkovChainPtr model, Sequence selected); + void subset(int index, SequenceEntryList & sequences, SequenceEntryList & consensus, SequenceEntryList & nonconsensus); + MaximalDependenceDecompositionNodePtr newNode(std::string node_name, SequenceEntryList & sequences, int divmin, Sequence selected); + void train(SequenceEntryList & sequences, int divmin); + + virtual double evaluate(const Sequence & s, unsigned int begin, unsigned int end) const; + virtual Sequence & choose(Sequence & h, int size) const; + + virtual bool initialize_prefix_sum_array(const Sequence & s); + virtual double prefix_sum_array_compute(int begin , int end); + + virtual std::string model_name() const { + return "MaximumDependenceDecomposition"; + } + + virtual std::string str () const ; + + virtual void initialize(const ProbabilisticModelParameters & parameters); + MaximalDependenceDecompositionNodePtr initializeTree(const ProbabilisticModelParameters & parameters, std::vector& tree); + private: + + double _evaluateAux(const Sequence & s, MaximalDependenceDecompositionNodePtr node, vector &indexes) const; + void _chooseAux(Sequence & s, MaximalDependenceDecompositionNodePtr node) const; + + + MaximalDependenceDecompositionNodePtr _mdd_tree; + ConsensusSequence _consensus_sequence; + ProbabilisticModelPtr _consensus_model; + AlphabetPtr _alphabet; + vector _prefix_sum_array; + }; + + typedef boost::shared_ptr MaximalDependenceDecompositionPtr; +} + +#endif diff --git a/src/MaximalDependenceDecompositionCreator.cpp b/src/MaximalDependenceDecompositionCreator.cpp new file mode 100644 index 00000000..1c9c208f --- /dev/null +++ b/src/MaximalDependenceDecompositionCreator.cpp @@ -0,0 +1,18 @@ +#include "MaximalDependenceDecompositionCreator.hpp" +#include "MaximalDependenceDecomposition.hpp" + +namespace tops { + ProbabilisticModelPtr 
MaximalDependenceDecompositionCreator::create(ProbabilisticModelParameters & parameters) const { + MaximalDependenceDecompositionPtr model = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + ProbabilisticModelParameterValuePtr symbols = parameters.getMandatoryParameterValue("alphabet"); + ProbabilisticModelParameterValuePtr concensus = parameters.getMandatoryParameterValue("consensus"); + ProbabilisticModelParameterValuePtr concensus_model = parameters.getMandatoryParameterValue("consensus_model"); + ProbabilisticModelParameterValuePtr tree = parameters.getMandatoryParameterValue("tree"); + if((symbols == NULL) || (concensus == NULL) || (concensus_model == NULL) || (tree == NULL)) + { + std::cerr << help() << std::endl; + } + model->initialize(parameters); + return model; + } +} \ No newline at end of file diff --git a/src/MaximalDependenceDecompositionCreator.hpp b/src/MaximalDependenceDecompositionCreator.hpp new file mode 100644 index 00000000..ed716e9e --- /dev/null +++ b/src/MaximalDependenceDecompositionCreator.hpp @@ -0,0 +1,45 @@ +/* + * MaximalDependenceDecompositionCreator.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. 
namespace tops {
  //! Factory for MaximalDependenceDecomposition models.
  class DLLEXPORT MaximalDependenceDecompositionCreator : public ProbabilisticModelCreator
  {
  public:
    MaximalDependenceDecompositionCreator() {}
    //! Creates a MaximalDependenceDecomposition model.
    /*! \param parameters parsed configuration; the implementation reads the
                          "alphabet", "consensus", "consensus_model" and
                          "tree" entries */
    virtual ProbabilisticModelPtr create(ProbabilisticModelParameters & parameters) const;
  };
  typedef boost::shared_ptr < MaximalDependenceDecompositionCreator> MaximalDependenceDecompositionCreatorPtr;
}
TrainMaximalDependenceDecomposition.cpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#include "ProbabilisticModel.hpp" +#include "ProbabilisticModelCreator.hpp" +#include "ConfigurationReader.hpp" +#include "TrainMaximalDependenceDecomposition.hpp" +#include "MaximalDependenceDecomposition.hpp" +#include "util.hpp" +#include "ProbabilisticModelCreatorClient.hpp" +#include "Symbol.hpp" +#include + +namespace tops { + + ProbabilisticModelPtr TrainMaximalDependenceDecomposition::create( ProbabilisticModelParameters & parameters) const + { + ProbabilisticModelParameterValuePtr alphabet_parameter = parameters.getOptionalParameterValue("alphabet"); + ProbabilisticModelParameterValuePtr consensus_parameter = parameters.getOptionalParameterValue("consensus"); + ProbabilisticModelParameterValuePtr consensus_model_parameter = parameters.getOptionalParameterValue("consensus_model"); + ProbabilisticModelParameterValuePtr training_set_parameter = parameters.getOptionalParameterValue("training_set"); + ProbabilisticModelParameterValuePtr minimum_subset_parameter = parameters.getOptionalParameterValue("minimum_subset"); + + if(alphabet_parameter == NULL || consensus_parameter == NULL 
|| consensus_model_parameter == NULL || training_set_parameter == NULL) { + std::cerr << "ERROR: initial_specification is a mandatory paramenter\n" << std::endl; + return MaximalDependenceDecompositionPtr(); + } else { + MaximalDependenceDecompositionPtr mdd = MaximalDependenceDecompositionPtr(new MaximalDependenceDecomposition()); + AlphabetPtr alphabet = AlphabetPtr(new Alphabet()); + alphabet->initializeFromVector(alphabet_parameter->getStringVector()); + mdd->setAlphabet(alphabet); + + std::vector consensus_symbols = consensus_parameter->getStringVector(); + ConsensusSequence consensus_sequence; + for (int i = 0; i < consensus_symbols.size(); i++) { + std::vector syms; + boost::split(syms, consensus_symbols[i], boost::is_any_of(" ")); + vector s; + for (int j = 0; j < syms.size(); j++) { + s.push_back(alphabet->getSymbol(syms[j])->id()); + } + Consensus cons(s); + consensus_sequence.push_back(cons); + } + mdd->setConsensusSequence(consensus_sequence); + + std::string consensus_model_str = consensus_model_parameter->getString(); + consensus_model_str = consensus_model_str.substr(1, consensus_model_str.size() - 2); + ConfigurationReader consensus_model_reader; + ProbabilisticModelCreatorClient consensus_model_creator; + consensus_model_reader.load(consensus_model_str); + mdd->setConsensusModel(consensus_model_creator.create(*(consensus_model_reader.parameters()))); + + SequenceEntryList sample_set; + readSequencesFromFile(sample_set, alphabet, training_set_parameter->getString()); + + mdd->train(sample_set, minimum_subset_parameter->getInt()); + + return mdd; + } + } +}; + + + diff --git a/src/TrainMaximalDependenceDecomposition.hpp b/src/TrainMaximalDependenceDecomposition.hpp new file mode 100644 index 00000000..0993f977 --- /dev/null +++ b/src/TrainMaximalDependenceDecomposition.hpp @@ -0,0 +1,58 @@ +/* + * TrainMaximalDependenceDecomposition.hpp + * + * Copyright 2011 Andre Yoshiaki Kashiwabara + * Ígor Bonadio + * Vitor Onuchic + * Alan Mitchell Durham + * + 
* This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, + * MA 02110-1301, USA. + */ + +#ifndef TRAIN_MDD_HPP +#define TRAIN_MDD_HPP + +#include "crossplatform.hpp" + +#include "ProbabilisticModel.hpp" +#include "ProbabilisticModelCreator.hpp" +#include "ConfigurationReader.hpp" + + +namespace tops { + + //! Creates a HMM using Baum-Welch + class DLLEXPORT TrainMaximalDependenceDecomposition : public ProbabilisticModelCreator { + public: + TrainMaximalDependenceDecomposition () {} + virtual ~TrainMaximalDependenceDecomposition () {}; + //! Creates a probability model + /*! \param parameters is a set of parameters that is utilized to build the model */ + virtual ProbabilisticModelPtr create( ProbabilisticModelParameters & parameters) const ; + + //! 
Provides a help + virtual std::string help() const { + std::string s; + return s; + } + + + }; + typedef boost::shared_ptr TrainMaximalDependenceDecompositionPtr ; +}; + + +#endif diff --git a/src/cmake_install.cmake b/src/cmake_install.cmake index 8913670b..01fde601 100644 --- a/src/cmake_install.cmake +++ b/src/cmake_install.cmake @@ -1,4 +1,4 @@ -# Install script for directory: /Users/yoshiaki/work/programas/tops/src +# Install script for directory: /Users/igorbonadio/Projetos/tops/src # Set the install prefix IF(NOT DEFINED CMAKE_INSTALL_PREFIX) @@ -28,7 +28,7 @@ IF(NOT CMAKE_INSTALL_COMPONENT) ENDIF(NOT CMAKE_INSTALL_COMPONENT) IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") - FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/Users/yoshiaki/work/programas/tops/src/libToPS.dylib") + FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/lib" TYPE SHARED_LIBRARY FILES "/Users/igorbonadio/Projetos/tops/src/libToPS.dylib") IF(EXISTS "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libToPS.dylib" AND NOT IS_SYMLINK "$ENV{DESTDIR}${CMAKE_INSTALL_PREFIX}/lib/libToPS.dylib") EXECUTE_PROCESS(COMMAND "/usr/bin/install_name_tool" @@ -42,82 +42,82 @@ ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unsp IF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified") FILE(INSTALL DESTINATION "${CMAKE_INSTALL_PREFIX}/include/tops" TYPE FILE FILES - "/Users/yoshiaki/work/programas/tops/src/AkaikeInformationCriteria.hpp" - "/Users/yoshiaki/work/programas/tops/src/Alphabet.hpp" - "/Users/yoshiaki/work/programas/tops/src/BayesianInformationCriteria.hpp" - "/Users/yoshiaki/work/programas/tops/src/ConfigurationReader.hpp" - "/Users/yoshiaki/work/programas/tops/src/ContextTree.hpp" - "/Users/yoshiaki/work/programas/tops/src/DecodableModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/DegenerateDistribution.hpp" - 
"/Users/yoshiaki/work/programas/tops/src/FactorableModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/FactorableModelPrefixSumArray.hpp" - "/Users/yoshiaki/work/programas/tops/src/DiscreteIIDModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/DiscreteIIDModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/FixedSequenceAtPosition.hpp" - "/Users/yoshiaki/work/programas/tops/src/FixedSequenceAtPositionCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/GHMMStates.hpp" - "/Users/yoshiaki/work/programas/tops/src/GeneralizedHiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/GeneralizedHiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/HiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/HiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/InhomogeneousFactorableModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/InhomogeneousMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/InhomogeneousMarkovChainCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/MultipleAlignment.hpp" - "/Users/yoshiaki/work/programas/tops/src/NullPrefixSumArray.hpp" - "/Users/yoshiaki/work/programas/tops/src/PhasedFactorableModelEvaluationAlgorithm.hpp" - "/Users/yoshiaki/work/programas/tops/src/PhasedRunLengthDistribution.hpp" - "/Users/yoshiaki/work/programas/tops/src/PhasedRunLengthDistributionCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/PrefixSumArray.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelCreatorClient.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelDecorator.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProbabilisticModelParameter.hpp" - "/Users/yoshiaki/work/programas/tops/src/RemoveSequenceFromModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/ReverseComplementDNA.hpp" - 
"/Users/yoshiaki/work/programas/tops/src/ReverseComplementDNACreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/Sequence.hpp" - "/Users/yoshiaki/work/programas/tops/src/SequenceEntry.hpp" - "/Users/yoshiaki/work/programas/tops/src/SequenceFactory.hpp" - "/Users/yoshiaki/work/programas/tops/src/SequenceFormat.hpp" - "/Users/yoshiaki/work/programas/tops/src/SmoothedHistogramBurge.hpp" - "/Users/yoshiaki/work/programas/tops/src/SmoothedHistogramKernelDensity.hpp" - "/Users/yoshiaki/work/programas/tops/src/SmoothedHistogramStanke.hpp" - "/Users/yoshiaki/work/programas/tops/src/SparseMatrix.hpp" - "/Users/yoshiaki/work/programas/tops/src/Symbol.hpp" - "/Users/yoshiaki/work/programas/tops/src/TargetModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/TargetModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainDiscreteIIDModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainFixedLengthMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainInterpolatedMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainHMMBaumWelch.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainHMMMaximumLikelihood.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainPhasedMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainInterpolatedPhasedMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainPhasedMarkovChainContextAlgorithm.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainVariableLengthInhomogeneousMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainVariableLengthMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainWeightArrayModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/VariableLengthMarkovChain.hpp" - "/Users/yoshiaki/work/programas/tops/src/VariableLengthMarkovChainCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/util.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainGHMMTransitions.hpp" - "/Users/yoshiaki/work/programas/tops/src/BernoulliModelCreator.hpp" - 
"/Users/yoshiaki/work/programas/tops/src/SimilarityBasedSequenceWeighting.hpp" - "/Users/yoshiaki/work/programas/tops/src/SimilarityBasedSequenceWeightingCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainSimilarityBasedSequenceWeighting.hpp" - "/Users/yoshiaki/work/programas/tops/src/MultipleSequentialModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/MultipleSequentialModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/StoreLoadedModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/PairHiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/PairHiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainPHMMBaumWelch.hpp" - "/Users/yoshiaki/work/programas/tops/src/MaximumDependenceDecomposition.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProfileHiddenMarkovModel.hpp" - "/Users/yoshiaki/work/programas/tops/src/ProfileHiddenMarkovModelCreator.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainProfileHMMMaxLikelihood.hpp" - "/Users/yoshiaki/work/programas/tops/src/TrainProfileHMMBaumWelch.hpp" + "/Users/igorbonadio/Projetos/tops/src/AkaikeInformationCriteria.hpp" + "/Users/igorbonadio/Projetos/tops/src/Alphabet.hpp" + "/Users/igorbonadio/Projetos/tops/src/BayesianInformationCriteria.hpp" + "/Users/igorbonadio/Projetos/tops/src/ConfigurationReader.hpp" + "/Users/igorbonadio/Projetos/tops/src/ContextTree.hpp" + "/Users/igorbonadio/Projetos/tops/src/DecodableModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/DegenerateDistribution.hpp" + "/Users/igorbonadio/Projetos/tops/src/FactorableModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/FactorableModelPrefixSumArray.hpp" + "/Users/igorbonadio/Projetos/tops/src/DiscreteIIDModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/DiscreteIIDModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/FixedSequenceAtPosition.hpp" + "/Users/igorbonadio/Projetos/tops/src/FixedSequenceAtPositionCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/GHMMStates.hpp" + 
"/Users/igorbonadio/Projetos/tops/src/GeneralizedHiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/GeneralizedHiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/HiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/HiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/InhomogeneousFactorableModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/InhomogeneousMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/InhomogeneousMarkovChainCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/MultipleAlignment.hpp" + "/Users/igorbonadio/Projetos/tops/src/NullPrefixSumArray.hpp" + "/Users/igorbonadio/Projetos/tops/src/PhasedFactorableModelEvaluationAlgorithm.hpp" + "/Users/igorbonadio/Projetos/tops/src/PhasedRunLengthDistribution.hpp" + "/Users/igorbonadio/Projetos/tops/src/PhasedRunLengthDistributionCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/PrefixSumArray.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelCreatorClient.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelDecorator.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProbabilisticModelParameter.hpp" + "/Users/igorbonadio/Projetos/tops/src/RemoveSequenceFromModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/ReverseComplementDNA.hpp" + "/Users/igorbonadio/Projetos/tops/src/ReverseComplementDNACreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/Sequence.hpp" + "/Users/igorbonadio/Projetos/tops/src/SequenceEntry.hpp" + "/Users/igorbonadio/Projetos/tops/src/SequenceFactory.hpp" + "/Users/igorbonadio/Projetos/tops/src/SequenceFormat.hpp" + "/Users/igorbonadio/Projetos/tops/src/SmoothedHistogramBurge.hpp" + "/Users/igorbonadio/Projetos/tops/src/SmoothedHistogramKernelDensity.hpp" + "/Users/igorbonadio/Projetos/tops/src/SmoothedHistogramStanke.hpp" + 
"/Users/igorbonadio/Projetos/tops/src/SparseMatrix.hpp" + "/Users/igorbonadio/Projetos/tops/src/Symbol.hpp" + "/Users/igorbonadio/Projetos/tops/src/TargetModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/TargetModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainDiscreteIIDModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainFixedLengthMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainInterpolatedMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainHMMBaumWelch.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainHMMMaximumLikelihood.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainPhasedMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainInterpolatedPhasedMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainPhasedMarkovChainContextAlgorithm.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainVariableLengthInhomogeneousMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainVariableLengthMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainWeightArrayModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/VariableLengthMarkovChain.hpp" + "/Users/igorbonadio/Projetos/tops/src/VariableLengthMarkovChainCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/util.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainGHMMTransitions.hpp" + "/Users/igorbonadio/Projetos/tops/src/BernoulliModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/SimilarityBasedSequenceWeighting.hpp" + "/Users/igorbonadio/Projetos/tops/src/SimilarityBasedSequenceWeightingCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainSimilarityBasedSequenceWeighting.hpp" + "/Users/igorbonadio/Projetos/tops/src/MultipleSequentialModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/MultipleSequentialModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/StoreLoadedModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/PairHiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/PairHiddenMarkovModelCreator.hpp" + 
"/Users/igorbonadio/Projetos/tops/src/TrainPHMMBaumWelch.hpp" + "/Users/igorbonadio/Projetos/tops/src/MaximumDependenceDecomposition.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProfileHiddenMarkovModel.hpp" + "/Users/igorbonadio/Projetos/tops/src/ProfileHiddenMarkovModelCreator.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainProfileHMMMaxLikelihood.hpp" + "/Users/igorbonadio/Projetos/tops/src/TrainProfileHMMBaumWelch.hpp" ) ENDIF(NOT CMAKE_INSTALL_COMPONENT OR "${CMAKE_INSTALL_COMPONENT}" STREQUAL "Unspecified")