diff --git a/Fred2/Core/Generator.py b/Fred2/Core/Generator.py
index 31037186..45a40b89 100644
--- a/Fred2/Core/Generator.py
+++ b/Fred2/Core/Generator.py
@@ -366,108 +366,6 @@ def _generate_combinations(tId, vs, seq, usedVs, offset, isReverse=False):
         yield Transcript("".join(varSeq), geneid, tId, vars=varComb)
 
 
-def generate_transcripts_from_tumor_variants(normal, tumor, dbadapter, id_type):
-    """
-    Generates all possible :class:`~Fred2.Core.Transcript.Transcript` variations of the given
-    :class:`~Fred2.Core.Variant.Variant`.
-
-    The result is a generator.
-
-    :param normal: A list of variants of the normal tissue
-    :type normal: list(:class:`~Fred2.Core.Variant.Variant`)
-    :param tumor: A list of variants of the cancer tissue for which transcripts should be generated
-    :type tumor: list(:class:`~Fred2.Core.Variant.Variant`)
-    :param dbadapter: a DBAdapter to fetch the transcript sequences
-    :type dbadapter: :class:`~Fred2.IO.ADBAdapter.ADBAdapter`
-    :param id_type: The type of the transcript IDs used in annotation of variants (e.g. REFSEQ, ENSEMBL)
-    :type id_type: :func:`~Fred2.IO.ADBAdapter.EIdentifierTypes`
-    :return: A generator of transcripts with all possible variations determined by the given variant list
-    :rtype: Generator(:class:`~Fred2.Core.Transcript.Transcript`)
-    """
-    def _generate_combinations(tId, vs, seq, usedVs, offset):
-        """
-        recursive variant combination generator
-        """
-        transOff = generate_transcripts_from_tumor_variants.transOff
-        #print "TransOffset ", transOff, tId, usedVs
-        if vs:
-            flag, v = vs.pop()
-
-            if v.isHomozygous:
-                pos = v.coding[tId].tranPos + offset
-                usedVs[pos] = v
-                offset = _incorp.get(v.type, lambda a, b, c, d, e: e)(seq, v, tId, pos, offset)
-                for s in _generate_combinations(tId, vs, seq, usedVs, offset):
-                    yield s
-            else:
-                vs_tmp = vs[:]
-                tmp_seq = seq[:]
-                tmp_usedVs = usedVs.copy()
-
-                if flag:
-                    for s in _generate_combinations(tId, vs_tmp, tmp_seq, tmp_usedVs, offset):
-                        yield s
-
-                # update the transcript variant id
-                generate_transcripts_from_tumor_variants.transOff += 1
-                pos = v.coding[tId].tranPos + offset
-                usedVs[pos] = v
-
-                offset = _incorp.get(v.type, lambda a, b, c, d, e: e)(seq, v, tId, pos, offset)
-
-                for s in _generate_combinations(tId, vs, seq, usedVs, offset):
-                    yield s
-        else:
-            yield tId+":FRED2_%i"%transOff, seq, usedVs
-
-    #1) get all transcripts and sort the variants to transcripts
-
-    #For a transcript do:
-    #A) get transcript sequences
-    #B) generate all possible combinations of variants
-    #C) apply variants to transcript and generate transcript object
-
-    if not isinstance(dbadapter, ADBAdapter):
-        raise TypeError("The given dbadapter is not of type ADBAdapter")
-
-    transToVar = {}
-    for v in tumor:
-        for trans_id in v.coding.iterkeys():
-            transToVar.setdefault(trans_id, []).append((False, v))
-
-    for v in normal:
-        for trans_id in v.coding.iterkeys():
-            if trans_id in transToVar:
-                transToVar.setdefault(trans_id, []).append((True, v))
-
-    for tId, vs in transToVar.iteritems():
-        query = dbadapter.get_transcript_information(tId, type=id_type)
-        if query is None:
-            warnings.warn("Transcript with ID %s not found in DB"%tId)
-            continue
-
-        tSeq = query[EAdapterFields.SEQ]
-        geneid = query[EAdapterFields.GENE]
-        strand = query[EAdapterFields.STRAND]
-
-        #if it is a reverse transcript, form the complement of the variants
-        if strand == REVERS:
-            for flag, v in transToVar[tId]:
-                v.ref = v.ref[::-1].translate(COMPLEMENT)
-                v.obs = v.obs[::-1].translate(COMPLEMENT)
-
-        vs.sort(key=lambda (isTumor, v): v.genomePos-1
-                if v.type in [VariationType.FSINS, VariationType.INS]
-                else v.genomePos, reverse=True)
-        if not _check_for_problematic_variants(map(lambda x: x[1], vs)):
-            warnings.warn("Intersecting variants found for Transcript %s"%tId)
-            continue
-
-        generate_transcripts_from_tumor_variants.transOff = 0
-        for tId, varSeq, varComb in _generate_combinations(tId, vs, list(tSeq), {}, 0):
-            yield Transcript("".join(varSeq), geneid, tId, vars=varComb)
-
-
 ################################################################################
 #        T R A N S C R I P T    = = >    P R O T E I N
 ################################################################################
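For reference, the deleted test in Fred2/test/TestProtein.py (removed further below) exercised this generator essentially as follows; `var_n2`, `var_t1` and `DummyAdapter` come from the test fixtures in Fred2.test.VariantsForTesting:

```python
from Fred2.Core.Generator import generate_transcripts_from_tumor_variants
from Fred2.IO.ADBAdapter import EIdentifierTypes

dummy_db = DummyAdapter()   # any ADBAdapter implementation works here
normal_vars = [var_n2]      # variants of the normal tissue
tumor_vars = [var_t1]       # tumor-only variants
for trans in generate_transcripts_from_tumor_variants(normal_vars, tumor_vars,
                                                      dummy_db, EIdentifierTypes.REFSEQ):
    print repr(trans)       # one Transcript per variant combination
```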
diff --git a/Fred2/Distance2Self/Distance2Self.py b/Fred2/Distance2Self/Distance2Self.py
deleted file mode 100644
index 58823388..00000000
--- a/Fred2/Distance2Self/Distance2Self.py
+++ /dev/null
@@ -1,70 +0,0 @@
-# This code is part of the Fred2 distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-__author__ = 'mohr, walzer'
-
-from logging import warning
-from tempfile import NamedTemporaryFile
-
-from Fred2.Core.Peptide import Peptide
-from Fred2.Core.Result import Distance2SelfResult
-from Fred2 import d2s
-
-
-def p2s(peptides):
-    if not isinstance(peptides, list):
-        warning("no list of peptides given - returning None!")
-        return None
-
-    if any(not(isinstance(p, Peptide) or isinstance(p, str)) for p in peptides):
-        warning("no list with peptides given - returning None!")
-        return None
-
-    revi = list()
-    for x in peptides:
-        if isinstance(x, Peptide):
-            revi.append(str(x))
-        elif isinstance(x, str):
-            revi.append(x)
-        else:
-            warning("no list of peptides given - returning None!")
-            return None
-    return revi
-
-
-class Distance2Self(object):
-    """
-    Implements calculation routine of distance to (self) peptides.
-    Calculates k closest distances of a peptide to a peptide set represented as a trie.
-
-    All our matrices have the same ordering of letters.
-    If you use a new matrix, please make sure to use the same ordering! Otherwise the tries have to be recomputed!
-    """
-    def __init__(self, matrix, selfpeptides, max_res=10):
-        """
-
-        :param matrix: a DistanceMatrix
-        :param selfpeptides: list of Peptides or str of Peptides, all of the same length
-        :param max_res: max number of minimum distances returned
-        """
-        self.__matrix = matrix
-        self.__matrixfile = NamedTemporaryFile(delete=False)
-        self.__matrixfile.close()
-        matrix.to_file(self.__matrixfile.name)
-        self.__max_res = max_res
-        #TODO @mohr: default behaviour for heterogeneous lists?
-        self.__d2s = d2s.Distance2Self(self.__matrixfile.name, p2s(selfpeptides))  #TODO @mohr: what default?
-    #TODO get trie from d2s and write like done with __matrixfile
-
-    def calculate_distances(self, peptides):
-        import sys
-        print "@@@", sys.getrecursionlimit()
-        dd = self.__d2s.getDistance2Self(p2s(peptides), self.__max_res)
-        print "derp"
-        result = {key: value.values() for key, value in dd.iteritems()}
-        # create temporary file with peptides for distance computation
-
-        resultDf = Distance2SelfResult.from_dict(result)
-        resultDf['matrix'] = self.__matrix.name
-
-        return resultDf
\ No newline at end of file
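For orientation, the removed module was driven roughly as below. This is a sketch only; the tiny two-letter OrderedDict stands in for a real 20x20 amino-acid distance matrix:

```python
from collections import OrderedDict
from Fred2.Distance2Self.DistanceMatrix import DistanceMatrix
from Fred2.Distance2Self.Distance2Self import Distance2Self

# toy two-letter alphabet; real matrices cover all 20 amino acids
data = OrderedDict([("A", OrderedDict([("A", 0.0), ("R", 2.9)])),
                    ("R", OrderedDict([("A", 2.9), ("R", 0.0)]))])
d2s = Distance2Self(DistanceMatrix(data), selfpeptides=["SYFPEITHI"], max_res=10)
result_df = d2s.calculate_distances(["SYFPEITHV"])  # k closest self-distances per query peptide
```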
diff --git a/Fred2/Distance2Self/DistanceMatrix.py b/Fred2/Distance2Self/DistanceMatrix.py
deleted file mode 100644
index d757c0b7..00000000
--- a/Fred2/Distance2Self/DistanceMatrix.py
+++ /dev/null
@@ -1,52 +0,0 @@
-
-__author__ = 'mohr'
-
-import datetime
-import os
-from collections import OrderedDict
-
-
-class DistanceMatrix(object):
-
-    def __init__(self, data, name="DistanceMatrix"):
-        self.__matrix = data
-        self.name = name
-        assert isinstance(data, OrderedDict), 'DistanceMatrix object has to be initiated with dictionary.'
-
-    def __str__(self):
-        repr = '\t'
-        repr += '\t'.join(self.__matrix.keys())
-        repr += '\n'
-        for k in self.__matrix.keys():
-            lineStr = '%s\t' % k
-            for kk in self.__matrix[k].keys():
-                lineStr += '%s\t' % self.__matrix[k][kk]
-
-            repr += '%s\n' % lineStr
-        return repr
-
-    def to_file(self, file_path):
-        with open(file_path, 'w') as newMatrixFile:
-            for k in self.__matrix.keys():
-                lineStr = '%s ' % k
-                for kk in self.__matrix[k].keys():
-                    lineStr += '%s ' % self.__matrix[k][kk]
-                newMatrixFile.write(lineStr + '\n')
-
-    def from_file(self, file_path):
-        with open(file_path, 'r') as matrixFile:
-            matrixDict = OrderedDict([])
-            aas = []
-            distances = []
-            for line in matrixFile:
-                values = line.strip().split(' ')
-                aas.append(values[0])
-                distances.append(values[1:])
-
-            for aa, vals in zip(aas, distances):
-                matrixValues = OrderedDict([])
-                for aaa, v in zip(aas, vals):
-                    matrixValues[aaa] = v
-                matrixDict[aa] = matrixValues
-            self.__matrix = matrixDict
-
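The `to_file`/`from_file` pair above round-trips a plain space-separated layout: one row per residue, the letter first, then its distances in the alphabet's order. Illustrative (invented) values for a three-letter alphabet:

```
A 0.0 2.9 2.6
R 2.9 0.0 3.1
N 2.6 3.1 0.0
```

Note that `from_file` reads the distances back as strings rather than floats, so consumers of `self.__matrix` had to cast them.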
diff --git a/Fred2/Distance2Self/__init__.py b/Fred2/Distance2Self/__init__.py
deleted file mode 100644
index 06c213ba..00000000
--- a/Fred2/Distance2Self/__init__.py
+++ /dev/null
@@ -1 +0,0 @@
-__author__ = 'mohr, walzer'
diff --git a/Fred2/Distance2Self/hw.cpp b/Fred2/Distance2Self/hw.cpp
deleted file mode 100644
index 5af87111..00000000
--- a/Fred2/Distance2Self/hw.cpp
+++ /dev/null
@@ -1,18 +0,0 @@
-#define BOOST_PYTHON_STATIC_LIB
-#include <boost/python.hpp>
-using namespace boost::python;
-
-struct World
-{
-    void set(std::string msg) { this->msg = msg; }
-    std::string greet() { return msg; }
-    std::string msg;
-};
-
-
-BOOST_PYTHON_MODULE(helloworld)
-{
-    class_<World>("World")
-        .def("greet", &World::greet)
-        .def("set", &World::set);
-}
\ No newline at end of file
diff --git a/Fred2/Distance2Self/src/Matrix.hpp b/Fred2/Distance2Self/src/Matrix.hpp
deleted file mode 100755
index 42307c41..00000000
--- a/Fred2/Distance2Self/src/Matrix.hpp
+++ /dev/null
@@ -1,70 +0,0 @@
-#include <fstream>
-
-using namespace std;
-
-class Matrix
-{
-  public:
-    typedef vector<size_t> IndexSequence;
-
-  protected:
-    vector<double> matrix_;
-    size_t alphabet_size_;
-    string alphabet_;
-    map<char, size_t> char_to_index_;
-    map<size_t, char> index_to_char_;
-
-  public:
-    Matrix(const string& filename);
-    vector<double> data_;
-
-    size_t charToIndex(char c) { return char_to_index_[c]; }
-    char indexToChar(size_t i) { return index_to_char_[i]; }
-
-    void translate(const string& s, IndexSequence& sequence);
-  protected:
-    void setupMaps_();
-  private:
-    Matrix();
-};
-
-Matrix::Matrix(const string& filename)
-  : alphabet_(""), data_(20 * 20)
-{
-    //TODO: check file exists
-    ifstream is(filename.c_str());
-    data_.resize(20 * 20);
-    for (size_t i = 0; i < 20; ++i)
-    {
-        char c;
-        is >> c;
-        alphabet_.push_back(c);
-        double score;
-        for (size_t j = 0; j < 20; ++j)
-        {
-            is >> score;
-            data_[i * 20 + j] = score;
-        }
-    }
-    setupMaps_();
-}
-
-void Matrix::setupMaps_()
-{
-    for (size_t i = 0; i < alphabet_.size(); ++i)
-    {
-        index_to_char_.insert(make_pair(i, alphabet_[i]));
-        char_to_index_.insert(make_pair(alphabet_[i], i));
-    }
-}
-
-// Matrices are stored with integers as indices, not with letters
-void Matrix::translate(const string& s, Matrix::IndexSequence& sequence)
-{
-    sequence.resize(s.size());
-    for (size_t i = 0; i < s.size(); ++i)
-    {
-        sequence[i] = char_to_index_[s[i]];
-    }
-}
-
diff --git a/Fred2/Distance2Self/src/Sequences.hpp b/Fred2/Distance2Self/src/Sequences.hpp
deleted file mode 100644
index 0f28d928..00000000
--- a/Fred2/Distance2Self/src/Sequences.hpp
+++ /dev/null
@@ -1,98 +0,0 @@
-class Sequences
-{
-  public:
-    typedef vector<string> SequenceVector;
-
-
-  protected:
-    SequenceVector data_;
-    SequenceVector desc_;
-
-  public:
-    string& operator [] (size_t index) { return data_[index]; }
-    const string& operator [] (size_t index) const { return data_[index]; }
-
-    string& description(size_t index) { return desc_[index]; }
-    const string& description(size_t index) const { return desc_[index]; }
-
-    size_t size() const { return data_.size(); }
-
-    void clear() { data_.resize(0); }
-
-    Sequences(size_t size = 0) : data_(size), desc_(size) {}
-
-    Sequences(const string& filename)
-    {
-
-        ifstream is(filename.c_str());
-        if (not is)
-            throw "Cannot open File!";
-
-        string line;
-        bool done = false;
-
-        while (not done and getline(is, line))
-        {
-            // quit reading after a blank line
-            if (line.size() == 0) break;
-
-            // compare if expectations are met...
-            if (line[0] != '>')
-                throw "FASTA sequence doesn't start with '>'";
-
-            // Parse the header
-            string desc = line;
-
-            // Parse the sequence
-            string s;
-            while (is)
-            {
-                char c = is.get();
-
-                // bail on EOF
-                if (not is) { done = true; break; }
-                is.putback(c);
-
-                // finish this sequence before beginning of next sequence
-                if (c == '>') break;
-
-                // read the next line of letters
-                getline(is, line);
-
-                // quit reading after a blank line
-                if (not line.size()) { done = true; break; }
-
-                // add the letters in
-                s += line;
-            }
-
-            data_.push_back(s);
-            desc_.push_back(desc);
-        }
-    }
-
-    vector<string>::iterator getDataBegin()
-    {
-        return data_.begin();
-    }
-
-
-    vector<string>::iterator getDataEnd()
-    {
-        return data_.end();
-    }
-
-    void push_back(const string& s)
-    {
-        data_.push_back(s);
-        desc_.push_back("");
-    }
-
-    void push_back(const string& s, const string& d)
-    {
-        data_.push_back(s);
-        desc_.push_back(d);
-    }
-
-
-};
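The parser above implements a fairly strict FASTA dialect: records start with '>', following sequence lines are concatenated, and a blank line ends the input. A rough Python equivalent of those rules, for orientation only:

```python
def read_fasta(path):
    """Rough Python equivalent of the deleted C++ reader: '>' opens a record,
    subsequent lines are concatenated, the first blank line ends the file."""
    records = []
    desc, seq = None, []
    with open(path) as handle:
        for line in handle:
            line = line.rstrip("\n")
            if not line:
                break                       # the C++ reader stops at a blank line
            if line.startswith(">"):
                if desc is not None:
                    records.append((desc, "".join(seq)))
                desc, seq = line, []
            else:
                seq.append(line)
    if desc is not None:
        records.append((desc, "".join(seq)))
    return records
```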
diff --git a/Fred2/Distance2Self/src/Trie.hpp b/Fred2/Distance2Self/src/Trie.hpp
deleted file mode 100755
index 5aca3ca2..00000000
--- a/Fred2/Distance2Self/src/Trie.hpp
+++ /dev/null
@@ -1,62 +0,0 @@
-#include <boost/graph/adjacency_list.hpp>
-
-using namespace std;
-using namespace boost;
-
-class Trie
-{
-  public:
-    typedef property<vertex_index_t, size_t> VertexIndex;
-    typedef adjacency_list<vecS, vecS, directedS, VertexIndex> TrieGraph;
-    typedef property_map<TrieGraph, vertex_index_t>::type PropertyMap;
-    typedef TrieGraph::adjacency_iterator VertexIterator;
-    typedef graph_traits<TrieGraph>::vertex_descriptor Vertex;
-    TrieGraph g;
-    PropertyMap map;
-    Vertex root;
-
-    Trie()
-    {
-        root = add_vertex(0, g);
-    }
-
-    void add(const vector<size_t>& sequence)
-    {
-        Vertex u = root;
-        PropertyMap idx = get(vertex_index_t(), g);
-        for (size_t i = 0; i < sequence.size(); ++i)
-        {
-            graph_traits<TrieGraph>::adjacency_iterator vi, vi_end;
-            tie(vi, vi_end) = adjacent_vertices(u, g);
-            if (vi != vi_end)
-            {
-                // There are vertices on this level, now we need to figure out
-                // whether there is a node labeled with sequence[i] already.
-                for (; vi != vi_end; ++vi)
-                {
-                    if (idx[*vi] == sequence[i])
-                    {
-                        u = *vi;
-                        break;
-                    }
-                }
-                if (vi == vi_end)
-                {
-                    // We didn't find a matching node, add it to the trie.
-                    Vertex v = add_vertex(sequence[i], g);
-                    add_edge(u, v, g);
-                    u = v;
-                }
-            }
-            else
-            {
-                // No node on this level exists, so we can safely add a node with
-                // sequence[i] on this level.
-                Vertex v = add_vertex(sequence[i], g);
-                add_edge(u, v, g);
-                u = v;
-            }
-        }
-    }
-
-};
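`Trie::add` walks the existing children looking for a node already labeled with the next symbol and only appends a fresh vertex when none matches. The same logic takes a few lines of Python with nested dicts in place of the BGL graph (a sketch):

```python
def trie_add(root, sequence):
    """Insert a sequence of alphabet indices into a nested-dict trie --
    the same walk-or-extend logic as Trie::add above."""
    node = root
    for symbol in sequence:
        node = node.setdefault(symbol, {})  # reuse a matching child, else create one
    return root

trie = {}
trie_add(trie, [3, 7, 1])   # e.g. a translated tripeptide
trie_add(trie, [3, 7, 5])   # shares the [3, 7] prefix
```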
diff --git a/Fred2/Distance2Self/src/TrieArray.hpp b/Fred2/Distance2Self/src/TrieArray.hpp
deleted file mode 100755
index 7e5262f1..00000000
--- a/Fred2/Distance2Self/src/TrieArray.hpp
+++ /dev/null
@@ -1,230 +0,0 @@
-#include <string>
-#include <vector>
-#include <set>
-#include <fstream>
-#include <iostream>
-//#include <boost/archive/binary_iarchive.hpp> breaks on OS X > 10.9
-#include "Trie.hpp"
-#include "Matrix.hpp"
-#include <boost/archive/text_iarchive.hpp>
-#include <boost/archive/text_oarchive.hpp>
-#include <boost/serialization/vector.hpp>
-
-using namespace std;
-
-class TrieArray
-{
-  public:
-    typedef size_t PackedIndex;
-    typedef vector<PackedIndex> LevelArray;
-    typedef vector<LevelArray> PackedTrie;
-
-    PackedTrie data;
-    TrieArray();
-    TrieArray(string ta);
-    TrieArray(const Trie& t);
-    void add(string s);
-
-    template<class Archive>
-    void save(Archive & ar, const unsigned int version) const;
-
-    template<class Archive>
-    void load(Archive & ar, const unsigned int version);
-
-    BOOST_SERIALIZATION_SPLIT_MEMBER();
-
-  private:
-    friend class boost::serialization::access;
-
-};
-BOOST_CLASS_VERSION(TrieArray, 1);
-
-
-TrieArray::TrieArray()
-  : data ()
-{
-}
-
-TrieArray::TrieArray(string ta)
-  : data ()
-{
-    std::ifstream ifs(ta.c_str());
-    boost::archive::text_iarchive ia(ifs);
-    load(ia, 1);
-}
-
-
-TrieArray::TrieArray(const Trie& t)
-{
-    vector<Trie::Vertex> nodes;
-    nodes.push_back(t.root);
-
-    data.push_back(LevelArray(1, 0));
-    while (!nodes.empty())
-    {
-        size_t number_of_nodes = 0;
-        for (size_t i = 0; i < nodes.size(); ++i)
-        {
-            data.back()[i] |= number_of_nodes;
-            number_of_nodes += out_degree(nodes[i], t.g);
-        }
-        data.push_back(LevelArray());
-        data.back().resize(number_of_nodes);
-        //cout << "Layer " << data.size() << " has " << number_of_nodes << " nodes." << endl;
-        size_t j = 0;
-        vector<Trie::Vertex> new_nodes(number_of_nodes);
-        for (size_t i = 0; i < nodes.size(); ++i)
-        {
-            Trie::TrieGraph::adjacency_iterator ai, a_end;
-            for (tie(ai, a_end) = adjacent_vertices(nodes[i], t.g); ai != a_end; ++ai, ++j)
-            {
-                data.back()[j] = get(vertex_index_t(), t.g, *ai) << 56;
-                new_nodes[j] = *ai;
-            }
-        }
-        nodes = new_nodes;
-    }
-}
-
-void TrieArray::add(string s)
-{
-    //cout << "Adding: " << s << endl;
-    //TODO
-}
-
-template<class Archive>
-void TrieArray::save(Archive & ar, const unsigned int version) const
-{
-    for (size_t i = 0; i < data.size(); ++i)
-    {
-        ar & data[i];
-    }
-}
-
-template<class Archive>
-void TrieArray::load(Archive & ar, const unsigned int version)
-{
-    for (size_t i = 0; i < 11; ++i)
-    {
-        LevelArray v;
-        ar & v;
-        data.push_back(v);
-    }
-}
-
-
-typedef pair<size_t, size_t> Node;
-typedef vector<Node> Children;
-typedef vector<size_t> Peptide;
-
-static Children getChildren(const TrieArray& ta, Node n)
-{
-    Children children;
-    if (n.first == ta.data.size()-1) //leaf
-    {
-        return children;
-    }
-    else
-    {
-        size_t v_e;
-
-        size_t v = ta.data[n.first][n.second]; //contents of the current node
-        size_t v_s = (v & 0x00ffffffffffffff); //pointer to the first child, i.e. index of the first child in the next level
-
-        if (n.second >= ta.data[n.first].size()-1) //last sibling
-        {
-            v_e = ta.data[n.first + 1].size();
-        }
-        else
-        {
-            size_t v_next = ta.data[n.first][n.second+1]; //contents of the next sibling node, needed for the pointer to the end of the current node's children
-            v_e = (v_next & 0x00ffffffffffffff); //pointer to the last child, i.e. index of the last child in the next level
-        }
-
-        //if (v_s > v_e)
-        //{
-        //    cout << v << " " << v_s << " " << v_e << " n1: " << n.first << " n2: " << n.second << " " << ta.data[n.first].size() << endl;
-        //}
-
-        for (size_t i = v_s; i < v_e; i++)
-        {
-            size_t level = n.first + 1;
-            size_t sibling = i;
-            Node c (level, sibling);
-            children.push_back(c);
-        }
-        return children;
-    }
-}
-
-
-static multiset<pair<double, string> > DFS_BnB_x_pair(const TrieArray& ta, Node n, double& dist, Matrix& m, Peptide& p, Peptide& s, multiset<pair<double, string> >& dist_min, size_t x=10)
-/*
-Returns the x closest peptides
-Returns pairs of distance and peptide
-Parameters:
-TrieArray: The search trie
-dist: distance of search peptide s to
-m: Distance matrix //we have to see what we will do with similarity matrices...
-p: path to the current node in the trie. keeps track of the words stored in the nodes.
-s: search peptide
-dist_min: minimal distances found so far. Required for break in branch and bound.
-*/
-{
-    if (dist_min.size() >= x) //If not enough distances found so far
-    {
-        if (dist > (*--dist_min.end()).first) //check if the bound criterion is fulfilled (distance is already greater or equal to maximal distance selected so far)
-        {
-            //cout << "break: " << *--dist_min.end() << " (" << dist << ") " << "-----";
-            return dist_min;
-        }
-    }
-
-    Children c = getChildren(ta, n);
-    if (c.size() == 0) //at leaf
-    {
-        string seq;
-        for (size_t pos = 0; pos < p.size(); ++pos)
-            seq += m.indexToChar(p[pos]);
-        if (dist_min.size() < x)
-        {
-            pair<double, string> temp_pair(dist, seq);
-            dist_min.insert(temp_pair);
-            //cout << dist << "\t" << seq << endl;
-        }
-        else if (dist < (*--dist_min.end()).first) //peptide with new minimal distance found.
-        {
-            pair<double, string> temp_pair(dist, seq);
-            //cout << seq << endl;
-            dist_min.insert(temp_pair);
-            //cout << dist << "\t" << seq << endl;
-            dist_min.erase(--dist_min.end()); //remove largest distance from list
-            //cout << --dist_min.end() << endl;
-        }
-
-        return dist_min;
-    }
-
-    for (size_t i = 0; i < c.size(); ++i)
-    {
-        size_t a = (ta.data[c[i].first][c[i].second] >> 56);
-        size_t b = s[(c[i].first-1)];
-        p.push_back(a);
-        double d = m.data_[a*20 + b];
-        if (abs(d) < 0.0000001)
-            {d = 0.0;}
-        dist += d;
-        multiset<pair<double, string> > dist_min_tmp;
-        dist_min_tmp = DFS_BnB_x_pair(ta, c[i], dist, m, p, s, dist_min, x);
-        dist_min = dist_min_tmp;
-        p.pop_back();
-        dist -= d;
-        if (abs(dist) < 0.0000001)
-            {dist = 0.0;}
-    }
-    return dist_min;
-}
-
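`DFS_BnB_x_pair` is a depth-first branch-and-bound: it grows a path through the trie, accumulating per-position distances, and prunes any branch whose running distance already exceeds the worst of the k best complete matches. A compact Python sketch of that idea (not the deleted implementation; it assumes the nested-dict trie from the sketch after Trie.hpp, a dict-of-dicts distance matrix, and query/self peptides of equal length):

```python
import heapq

def k_closest(trie, query, dist, k, matrix, prefix=(), best=None):
    """Branch-and-bound search for the k self peptides closest to `query`.
    `best` is a max-heap of (-distance, peptide) pairs capped at k entries."""
    if best is None:
        best = []
    if len(best) == k and dist > -best[0][0]:
        return best                      # bound: cannot beat the current k-th best
    if not trie:                         # leaf: a complete self peptide
        heapq.heappush(best, (-dist, prefix))
        if len(best) > k:
            heapq.heappop(best)          # drop the farthest candidate
        return best
    depth = len(prefix)
    for symbol, child in trie.items():   # branch over all children
        d = matrix[symbol][query[depth]]
        k_closest(child, query, dist + d, k, matrix, prefix + (symbol,), best)
    return best
```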
diff --git a/Fred2/Distance2Self/src/distance2self.cpp b/Fred2/Distance2Self/src/distance2self.cpp
deleted file mode 100644
index c8f71293..00000000
--- a/Fred2/Distance2Self/src/distance2self.cpp
+++ /dev/null
@@ -1,77 +0,0 @@
-#define BOOST_PYTHON_STATIC_LIB
-#include "distance2self.hpp"
-
-using namespace std;
-using namespace boost::python;
-
-Distance2Self::Distance2Self(string f_m, boost::python::list selfs):
-    m_(f_m), ta_()
-{
-    //Matrix m("/abi-projects/dist2self/matrices/BLOSUM45_distance_normal.dat"); cout << "Initializing trie. " << endl; Trie t;
-    Trie t;
-    Matrix::IndexSequence indices;
-    for (int i = 0; i < len(selfs); ++i)
-    {
-        //TODO check for equi-length?!
-        string s = boost::python::extract<string>(selfs[i]);
-        m_.translate(s, indices);
-        t.add(indices);
-    }
-    ta_ = TrieArray(t);
-}
-
-boost::python::dict Distance2Self::getDistance2Self(boost::python::list sequences, int max_result)
-{
-    dict peptides;
-    for (int i = 0; i < len(sequences); ++i)
-    {
-        //http://stackoverflow.com/questions/17659572/boostpython-passing-reference-of-pythonlist
-        string seq = boost::python::extract<string>(sequences[i]);
-
-        vector<size_t> p, seq_i;
-        m_.translate(seq, seq_i); //translate peptide sequence to matrix indices. seq contains the translated peptide sequence.
-
-        multiset<pair<double, string> > dist_min, dt;
-        double dist = 0.0;
-        pair<size_t, size_t> n (0, 0); //Node (0,0) : start at top of the trie
-        dist_min = DFS_BnB_x_pair(ta_, n, dist, m_, p, seq_i, dt, max_result);
-        dict dists;
-        for (multiset<pair<double, string> >::iterator dit = dist_min.begin(); dit != dist_min.end(); ++dit)
-        {
-            dists[dit->second] = dit->first;
-        }
-        peptides[seq] = dists;
-    }
-    return peptides;
-}
-
-//~ template<typename KeyType, typename ValueType>
-//~ dict map2BoostPythonDict(std::map<KeyType, ValueType> map) {
-//~     typename std::map<KeyType, ValueType>::iterator it;
-//~     dict bpd;
-//~     for (it = map.begin(); it != map.end(); ++it) {
-//~         bpd[it->first] = it->second;
-//~     }
-//~     return bpd;
-//~ }
-
-void Distance2Self::setTrieArray(string f_ta)
-{
-    ta_ = TrieArray(f_ta);
-}
-
-void Distance2Self::setMatrix(string f_m)
-{
-    m_ = Matrix(f_m);
-}
-
-#include <boost/python/module.hpp>
-#include <boost/python/class.hpp>
-BOOST_PYTHON_MODULE(d2s)
-{
-    class_<Distance2Self>("Distance2Self", init<string, boost::python::list>())
-        .def("setTrieArray", &Distance2Self::setTrieArray)
-        .def("getDistance2Self", &Distance2Self::getDistance2Self)
-        .def("setTrieArray", &Distance2Self::setTrieArray)
-        .def("setMatrix", &Distance2Self::setMatrix);
-}
diff --git a/Fred2/Distance2Self/src/distance2self.hpp b/Fred2/Distance2Self/src/distance2self.hpp
deleted file mode 100644
index 581ef44c..00000000
--- a/Fred2/Distance2Self/src/distance2self.hpp
+++ /dev/null
@@ -1,27 +0,0 @@
-#ifndef DISTANCE2SELF_H
-#define DISTANCE2SELF_H
-
-#include <string>
-#include <vector>
-#include <boost/python.hpp>
-#include "TrieArray.hpp"
-
-using namespace std;
-
-class Distance2Self
-{
-  private:
-    Distance2Self();
-  protected:
-    Matrix m_;
-    TrieArray ta_;
-
-  public:
-    Distance2Self(string f_m, boost::python::list selfs);
-    boost::python::dict getDistance2Self(boost::python::list sequences, int max_result);
-    void setTrieArray(string f_ta);
-    void setMatrix(string f_m);
-//    Distance2Self();
-};
-
-#endif // DISTANCE2SELF_H
diff --git a/Fred2/EpitopeAssembly/MosaicVaccine.py b/Fred2/EpitopeAssembly/MosaicVaccine.py
deleted file mode 100644
index aefecee5..00000000
--- a/Fred2/EpitopeAssembly/MosaicVaccine.py
+++ /dev/null
@@ -1,624 +0,0 @@
-# This code is part of the Fred2 distribution and governed by its
-# license. Please see the LICENSE file that should have been included
-# as part of this package.
-"""
-.. module:: EpitopeAssembly.MosaicVaccine
-    :synopsis: Embedding of the mosaic vaccine design problem into the asymmetric orienteering problem.
-The methods offer an exact solution for small to medium sized problems, as well as heuristics based on a matheuristic
-using Tabu Search and Branch-and-Bound for large problems.
-
-The heuristic proceeds as follows:
-
-I: initialize solution s_best via greedy construction
-
-s_current = s_best
-WHILE convergence is not reached DO:
-    I: s <- Tabu Search(s_current)
-    II: s <- Intensification via local MIP(s) solution (allow only alpha arcs to change)
-    if s > s_best:
-        s_best = s
-    III: Diversification(s) to escape local maxima
-END
-
-.. moduleauthor:: schubert
-
-"""
-
-from __future__ import division
-
-import itertools as itr
-import multiprocessing as mp
-import math
-
-import numpy as np
-import collections
-import string
-import copy
-
-from pyomo.environ import *
-from pyomo.opt import SolverFactory
-
-from Fred2.Core import EpitopePredictionResult
-
-
-class TabuList(collections.MutableSet):
-
-    def __init__(self, iterable=None, size=None):
-        self.end = end = []
-        end += [None, end, end]         # sentinel node for doubly linked list
-        self.map = {}                   # key --> [key, prev, next]
-        self.maxSize = size
-        if iterable is not None:
-            self |= iterable
-
-    def __len__(self):
-        return len(self.map)
-
-    def __contains__(self, key):
-        return tuple(key) in self.map
-
-    def add(self, key):
-        key = tuple(key)
-        if key not in self.map:
-            if len(self.map) >= self.maxSize:
-                self.pop()
-            end = self.end
-            curr = end[1]
-            curr[2] = end[1] = self.map[key] = [key, curr, end]
-
-    def discard(self, key):
-        if key in self.map:
-            key, prev, next = self.map.pop(key)
-            prev[2] = next
-            next[1] = prev
-
-    def __iter__(self):
-        end = self.end
-        curr = end[2]
-        while curr is not end:
-            yield curr[0]
-            curr = curr[2]
-
-    def __reversed__(self):
-        end = self.end
-        curr = end[1]
-        while curr is not end:
-            yield curr[0]
-            curr = curr[1]
-
-    def pop(self, last=True):
-        if not self:
-            raise KeyError('set is empty')
-        key = self.end[1][0] if last else self.end[2][0]
-        self.discard(key)
-        return key
-
-    def __repr__(self):
-        if not self:
-            return '%s()' % (self.__class__.__name__,)
-        return '%s(%r)' % (self.__class__.__name__, list(self))
-
-    def __eq__(self, other):
-        if isinstance(other, OrderedSet):
-            return len(self) == len(other) and list(self) == list(other)
-        return set(self) == set(other)
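TabuList is a bounded ordered set keyed on tuples of arcs. One subtlety worth noting: once maxSize is reached, add() calls pop(), which removes the most recently added entry, not the oldest. A small behavior sketch:

```python
tabu = TabuList(size=2)
tabu.add([(0, 3), (3, 0)])
tabu.add([(0, 5), (5, 0)])
tabu.add([(0, 7), (7, 0)])          # pops [(0, 5), (5, 0)] first to make room
print [(0, 5), (5, 0)] in tabu      # False
print [(0, 3), (3, 0)] in tabu      # True
```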
-
-
-def suffixPrefixMatch(m):
-    ''' Return length of longest suffix of x of length at least k that
-        matches a prefix of y.  Return 0 if no suffix/prefix
-        match has length at least k.
-    '''
-    x, y, k = m
-    if x == y:
-        return 0
-    if x == "start":
-        return len(y)
-    if y == "start":
-        return 0
-
-    if len(x) < k or len(y) < k:
-        return 0
-    idx = len(y)  # start at the right end of y
-    # Search right-to-left in y for length-k suffix of x
-    while True:
-        hit = string.rfind(y, x[-k:], 0, idx)
-        if hit == -1:  # not found
-            return len(y)
-        ln = hit + k
-        # See if match can be extended to include entire prefix of y
-        if x[-ln:] == y[:ln]:
-            return len(y)-ln  # return length of prefix
-        idx = hit + k - 1  # keep searching to left in y
-    return -1
-
-
-def _start(args):
-    return _move(*args)
-
-
-def _move(func, args):
-    return func(*args)
-
-
-def _vertexDel(vins, vdel, best_obj, best_sol, __imm, __arcCost):
-    """
-    simultaneously insert a new vertex and delete an
-    existing vertex
-    """
-    pertuped = []
-    for e in best_sol:
-        if vdel == e[0]:
-            pertuped.append((vins, e[1]))
-        elif vdel == e[1]:
-            pertuped.append((e[0], vins))
-        else:
-            pertuped.append(e)
-    return best_obj+__imm[vins]-__imm[vdel], sum(__arcCost[i][j] for i, j in pertuped), pertuped, vins
-
-
-def _vertexIns(vertex, edge, best_obj, best_sol, __imm, __arcCost):
-    """
-    Insert a new vertex between vertex i and vertex j
-    """
-    pertuped = []
-    for e in best_sol:
-        if edge == e:
-            pertuped.append((e[0], vertex))
-            pertuped.append((vertex, e[1]))
-        else:
-            pertuped.append(e)
-    length = sum(__arcCost[i][j] for i, j in pertuped)
-    return best_obj+__imm[vertex], length, pertuped, vertex
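`suffixPrefixMatch` returns the arc cost between two epitopes: the number of characters of y left uncovered by the longest suffix-of-x / prefix-of-y overlap, with the pseudo-vertex "start" costing the full length of y. A quick check of that convention (4-mers instead of real 9-mers, for readability):

```python
print suffixPrefixMatch(("SYFP", "YFPE", 1))   # overlap "YFP" -> cost 1
print suffixPrefixMatch(("start", "YFPE", 1))  # leaving the start node costs len(y) = 4
print suffixPrefixMatch(("SYFP", "SYFP", 1))   # identical peptides -> 0
```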
-
-
-class MosaicVaccineTS:
-
-    def __init__(self, _results, threshold=None, k=10, solver="glpk", verbosity=0):
-
-        #check input data
-        if not isinstance(_results, EpitopePredictionResult):
-            raise ValueError("first input parameter is not of type EpitopePredictionResult")
-
-        #start constructing model
-        self.__pool = mp.Pool(mp.cpu_count())
-        self.__solver = SolverFactory(solver)  #, solver_io = "python")
-        self.__verbosity = verbosity
-        self.__changed = True
-        a = _results.columns.tolist()
-        self.__alleleProb = self.__init_alleles(a)
-        self.__k = k
-        self.__result = None
-        self.__thresh = {} if threshold is None else threshold
-        self.__imm, self.__peps = self.__init_imm(_results)
-        self.__n = len(self.__peps)
-        self.__arcCost = self.__init_arc_cost()
-        self.instance = self.__init_model()
-
-    def __init_alleles(self, _alleles, verbosity=0):
-        """
-        initializes allele objects and tests if they have probabilities;
-        if not, copies them and uniformly distributes the probability within the locus
-        """
-        prob = []
-        no_prob = []
-        for a in _alleles:
-            if a.prob is None:
-                no_prob.append(copy.deepcopy(a))
-            else:
-                prob.append(a)
-
-        if len(no_prob) > 0:
-            #group by locus
-            no_prob_grouped = {}
-            prob_grouped = {}
-            for a in no_prob:
-                no_prob_grouped.setdefault(a.locus, []).append(a)
-            for a in prob:
-                prob_grouped.setdefault(a.locus, []).append(a)
-
-            for g, v in no_prob_grouped.iteritems():
-                total_loc_a = len(v)
-                if g in prob_grouped:
-                    remaining_mass = 1.0 - sum(a.prob for a in prob_grouped[g])
-                    for a in v:
-                        a.prob = remaining_mass/total_loc_a
-                else:
-                    for a in v:
-                        a.prob = 1.0/total_loc_a
-
-        if verbosity:
-            for a in _alleles:
-                print a.name, a.prob
-        return prob+no_prob
-
-    def __init_imm(self, _results):
-        __thresh = self.__thresh
-        res_df = _results.xs(_results.index.values[0][1], level="Method")
-        #print "before", len(res_df)
-        res_df = res_df[res_df.apply(lambda x: any(x[a] > __thresh.get(a.name, -float("inf"))
-                                                   for a in res_df.columns), axis=1)]
-        pep = ["start"]
-        imm = [0]
-
-        for tup in res_df.itertuples():
-            p = tup[0]
-            pep.append(p)
-            imm.append(sum(a.prob*s for a, s in itr.izip(self.__alleleProb, tup[1:])
-                           if s > __thresh.get(a.name, -float("inf"))))
-        return imm, pep
-
-    def __init_arc_cost(self):
-
-        pool = self.__pool
-        return [pool.map(suffixPrefixMatch, ((str(i), str(j), 1) for j in self.__peps)) for i in self.__peps]
-
-    def __init_model(self):
-        """
-        initializes MIP model for OR with MTZ subtour elimination
-        """
-        model = ConcreteModel()
-
-        #Sets
-        model.Nodes = RangeSet(0, len(self.__peps)-1)
-        def arc_init(model):
-            return ((i, j) for j in model.Nodes for i in model.Nodes if i != j)
-        model.Arcs = Set(dimen=2, initialize=arc_init)
-        def NodesOut_init(model, node):
-            return [j for (i, j) in model.Arcs if i == node]
-        model.NodesOut = Set(model.Nodes, initialize=NodesOut_init)
-        def NodesIn_init(model, node):
-            return [i for (i, j) in model.Arcs if j == node]
-        model.NodesIn = Set(model.Nodes, initialize=NodesIn_init)
-
-        #Params
-        def i_init(model, i):
-            return self.__imm[i]
-        model.i = Param(model.Nodes, initialize=i_init)
-        def d_init(model, i, j):
-            return self.__arcCost[i][j]
-        model.d = Param(model.Arcs, initialize=d_init)
-        model.TMAX = Param(initialize=self.__k, within=PositiveIntegers, mutable=True)
-
-        #Variables
-        model.x = Var(model.Arcs, domain=Binary, bounds=(0, 1), initialize=0)
-        model.u = Var(model.Nodes - set([0]), bounds=(1.0, len(self.__peps)-1))
-
-        model.Obj = Objective(rule=lambda model: sum(model.x[i, j]*model.i[i] for i, j in model.Arcs),
-                              sense=maximize)
-
-        #connectivity constraint
-        def Conn1_rule(model, node):
-            return sum(model.x[node, j] for j in model.NodesOut[node]) <= 1.0
-        model.Conn1 = Constraint(model.Nodes, rule=Conn1_rule)
-        def Conn2_rule(model, node):
-            return sum(model.x[i, node] for i in model.NodesIn[node]) <= 1.0
-        model.Conn2 = Constraint(model.Nodes, rule=Conn1_rule)
-        #Equality constraint
-        def Equal_rule(model, node):
-            return sum(model.x[node, j] for j in model.NodesOut[node]) == sum(model.x[i, node] for i in model.NodesIn[node])
-        model.Equal = Constraint(model.Nodes, rule=Equal_rule)
-        #Knapsack constraint
-        def Knapsack_rule(model):
-            return sum(model.d[i, j]*model.x[i, j] for i, j in model.Arcs) <= self.__k
-        model.Knapsack = Constraint(rule=Knapsack_rule)
-        #Subtour elimination (MTZ)
-        def Subtour_rule(model, i, j):
-            return model.u[i]-model.u[j]+(len(self.__peps)-1)*model.x[i, j] <= len(self.__peps)-2
-        model.SubTour = Constraint(((i, j) for i in xrange(1, len(self.__peps))
-                                    for j in xrange(1, len(self.__peps)) if i != j), rule=Subtour_rule)
-        model.c = ConstraintList()
-        model.tabu = ConstraintList()
-
-        return model
-
-    def solve(self, options=None):
-        """
-        solves the model optimally
-        """
-        options = dict() if options is None else options
-
-        instance = self.instance
-
-        res = self.__solver.solve(instance, options=options, tee=False)
-        instance.load(res)
-        if self.__verbosity > 0:
-            res.write(num=1)
-        sol = []
-        unsorted = dict([(i, j) for i, j in instance.Arcs if instance.x[i, j].value > 0])
-        i = 0
-        while unsorted:
-            j = unsorted[i]
-            sol.append((i, j))
-            del unsorted[i]
-            i = j
-        return instance.Obj(), sol
-
-    def approximate(self, phi=0.05, options=None, _greedyLP=True, _tabu=True, _intensify=True, _jump=True,
-                    max_iter=10000, delta_change=1e-4, max_delta=101, seed=23478234):
-        """
-        Matheuristic using Tabu Search
-        """
-        options = dict() if options is None else options
-        def __greedy_init():
-            __imm = self.__imm
-            __arcCost = self.__arcCost
-            __k = self.__k
-            normalized_gain = np.divide(np.array(__imm), np.array(__arcCost))
-            imm = 0
-            length = 0
-            possible = set(xrange(1, self.__n))
-            i = 0
-            result = []
-            while length < __k:
-                if length == 0:
-                    j = _, j = max([(normalized_gain[0, j], j) for j in possible])
-                    result.append((0, j))
-                    length += __arcCost[0][j]
-                    i = j
-                else:
-                    possible.discard(i)
-                    _, j = max([(normalized_gain[i, j], j) for j in possible])
-                    length += __arcCost[i][j]
-                    imm += __imm[i]
-                    if length > __k:
-                        result.append((i, 0))
-                        return imm, result
-                    result.append((i, j))
-                    i = j
-                    imm += __imm[j]
-            result.append((j, 0))
-            return imm, result
-
-        def __lp_init():
-            """
-            initializes first construction based on LP relaxation of the global problem with rounding;
-            allows for complete constraints, not only the capacity and such;
-            see TABU SEARCH FOR MIXED INTEGER PROGRAMMING, Joao Pedro Pedroso
-            """
-            #set variables to non-negative reals to obtain the relaxation
-
-            #copy is suboptimal for large instances
-            #but it seems as if the change of domain can only be done once??
-            #at least an error occurs when solving the MIP after solving its relaxation
-            instance = copy.deepcopy(self.instance)
-            solver = self.__solver
-            __n = self.__n
-            __k = self.__k
-            __imm = self.__imm
-            __arcCost = self.__arcCost
-
-            instance.x.domain = NonNegativeReals
-            result = []
-            imm = 0
-            length = 0
-            possible = set(xrange(1, __n))
-            i = 0
-            while length < __k:
-                lp_result = solver.solve(instance, options=options)
-                instance.solutions.load_from(lp_result)
-                if length == 0:
-                    _, j = max([(instance.x[0, j].value, j) for j in possible])
-                    instance.x[0, j].setlb(1)
-                    instance.x[0, j].setub(1)
-                    result.append((0, j))
-                    length += __arcCost[0][j]
-                    i = j
-                else:
-                    possible.discard(i)
-                    _, j = max([(instance.x[i, j].value, j) for j in possible])
-                    length += __arcCost[i][j]
-                    imm += __imm[i]
-                    if length > __k:
-                        result.append((i, 0))
-                        return imm, result
-                    #possible = remove_j(possible, i)
-                    result.append((i, j))
-                    instance.x[i, j].setlb(1)
-                    instance.x[i, j].setub(1)
-                    i = j
-                    imm += __imm[j]
-            result.append((j, 0))
-            del instance
-            return imm, result
-
-        def __tabu_search(best_obj, best_sol):
-            """
-            performs Tabu Search according to Liang et al., IEEE 2002;
-            some moves are missing -> arc swap for example; don't know if
-            needed.
-            """
-            __k = self.__k
-            __imm = self.__imm
-            __arcCost = self.__arcCost
-            #generate tasks
-            best_vertices = set(sum(best_sol, ()))
-            remaining_vertices = set(xrange(self.__n)) - best_vertices
-            tasks = []
-            for v in remaining_vertices:
-                for e in best_sol:
-                    if tenure.get(e[0], 0) < curr_iter and tenure.get(e[1], 0) < curr_iter:
-                        tasks.append((_vertexIns, (v, e, best_obj, best_sol, __imm, __arcCost)))
-            for r in remaining_vertices:
-                for v in best_vertices:
-                    if tenure.get(v, 0) < curr_iter:
-                        tasks.append((_vertexDel, (r, v, best_obj, best_sol, __imm, __arcCost)))
-            #make neighborhood
-            neighbor = self.__pool.map(_start, tasks)
-            changed_vertex = None
-            best_length = 0
-            b_obj = -float('inf')
-            b_sol = None
-            #filter neighbours with tabu list and best_obj
-            for obj, length, sol, v in neighbor:
-                if sol not in tabu_list and obj > b_obj and length <= __k:
-                    b_sol = sol
-                    b_obj = obj
-                    best_length = length
-                    changed_vertex = v
-            if b_sol is None:
-                return best_obj, best_sol
-            tabu_list.add(b_sol)
-            #iteration dependent? the earlier the iteration the longer tabu?
-            tenure[changed_vertex] = curr_iter + math.ceil(math.sqrt((len(best_vertices)+len(remaining_vertices))/curr_iter)) \
-                                     + math.floor((len(best_vertices)/(self.__k/9.))*best_length)
-            return b_obj, b_sol
-
-        def __mip_intensification(phi, curr_sol):
-            """
-            refine solution by allowing at most phi new variables
-            """
-            #set warmstart
-            instance = self.instance
-            instance.x.domain = Binary
-            selected_vars = set(curr_sol)
-            sorted_nodes = [i for (i, j) in curr_sol]
-            for i, j in instance.Arcs:
-                instance.x[i, j] = 1 if (i, j) in selected_vars else 0
-                if i != 0:
-                    try:
-                        k = sorted_nodes.index(i)
-                        instance.u[i] = k+1
-                    except ValueError:
-                        instance.u[i] = 1
-                if j != 0:
-                    try:
-                        k = sorted_nodes.index(j)
-                        instance.u[j] = k+1
-                    except ValueError:
-                        instance.u[j] = 1
-            #clear possibly contained constraints in constraint list and add new constraint
-            instance.c.clear()
-            instance.tabu.clear()
-            instance.c.add(sum(1-instance.x[i, j] for i, j in curr_sol) <= max(math.ceil(phi*len(curr_sol)), 2))
-            options["timelimit"] = "10"
-            result = self.__solver.solve(instance, options=options, warmstart=True)  # tee=True)
-            instance.solutions.load_from(result)
-            sol = []
-            unsorted = dict([(i, j) for i, j in instance.Arcs if instance.x[i, j].value > 0])
-            i = 0
-            while unsorted:
-                j = unsorted[i]
-                sol.append((i, j))
-                del unsorted[i]
-                i = j
-            del options["timelimit"]
-            return instance.Obj(), sol
-
-        def __jump(best_sol):
-            """
-            change 50% of the current solution with new nodes;
-            use __lp_init with an initial starting set of vertices
-            (in random order)
-            """
-            #set variables to non-negative reals to obtain the relaxation
-            instance = copy.deepcopy(self.instance)
-            solver = self.__solver
-            rand = np.random.rand
-            __n = self.__n
-            __k = self.__k
-            __imm = self.__imm
-            __arcCost = self.__arcCost
-
-            instance.x.domain = NonNegativeReals
-            instance.c.clear()
-            instance.tabu.clear()
-            for sol in tabu_list:
-                instance.tabu.add(sum(1 - instance.x[i, j] for i, j in sol) >= 1)
-            #best_vertices = set(sum(best_sol, ()))
-            #selected = []
-            for k, (i, j) in enumerate(best_sol):
-                if rand() >= 0.5:
-                    instance.x[i, j].setlb(1)
-                    instance.x[i, j].setub(1)
-                    #selected.append((k, i, j))
-                    #if i > 0:
-                    #    instance.u[i].setlb(k+1)
-                    #    instance.u[i].setub(k+1)
-                    #if j > 0:
-                    #    instance.u[j].setlb(k+2)
-                    #    instance.u[j].setub(k+2)
-            #instance.preprocess()
-            result = []
-            imm = 0
-            length = 0
-            possible = set(xrange(1, __n))
-            i = 0
-            while length < __k:
-                options["timelimit"] = "20"
-                lp_result = solver.solve(instance, options=options)
-                instance.load(lp_result)
-                if length == 0:
-                    _, j = max([(instance.x[0, j].value, j) for j in possible])
-                    instance.x[0, j].setlb(1)
-                    instance.x[0, j].setub(1)
-                    result.append((0, j))
-                    length += __arcCost[0][j]
-                    i = j
-                else:
-                    possible.discard(i)
-                    _, j = max([(instance.x[i, j].value, j) for j in possible])
-                    length += __arcCost[i][j]
-                    imm += __imm[i]
-                    if length > __k:
-                        result.append((i, 0))
-                        return imm, result
-                    #possible = remove_j(possible, i)
-                    result.append((i, j))
-                    instance.x[i, j].setlb(1)
-                    instance.x[i, j].setub(1)
-                    i = j
-                    imm += __imm[j]
-            result.append((j, 0))
-            del options["timelimit"]
-            return imm, result
-        ##########################################
-
-        np.random.seed(seed)
-
-        if _greedyLP:
-            best_obj, best_sol = __lp_init()
-        else:
-            best_obj, best_sol = __greedy_init()
-        curr_obj, curr_sol = best_obj, best_sol
-
-        if not _tabu:
-            return best_obj, best_sol
-        print "Start solution: ", best_obj, best_sol
-        curr_iter = 1
-        delta = 1
-        tabu_list = TabuList(size=self.__k)
-        tabu_list.add(best_sol)
-        tenure = {0: max_iter}
-        while (curr_iter < max_iter) and (delta < max_delta):
-            curr_obj, curr_sol = __tabu_search(curr_obj, curr_sol)
-
-            if curr_obj > best_obj:
-                best_obj = curr_obj
-                best_sol = curr_sol
-
-            if abs(best_obj - curr_obj) <= delta_change or curr_obj < best_obj:
-                delta += 1
-
-            if _intensify:
-                #should not be done every time
-                #TODO: find good rule when to apply intensification
-                if not curr_iter % 50:
-                    curr_obj, curr_sol = __mip_intensification(phi, curr_sol)
-                    if curr_obj > best_obj:
-                        best_obj = curr_obj
-                        best_sol = curr_sol
-                        delta = 0
-            #if _jump:
-                #diversification should be performed after each
-                #increase of objective (?)
-            #    curr_obj, curr_sol = __jump(curr_sol)
-            if delta % max(math.floor(max_delta/5), 10) == 0 and _jump:
-                curr_obj, curr_sol = __jump(curr_sol)
-            curr_iter += 1
-
-        self.__pool.close()
-        return best_obj, sum(self.__arcCost[i][j] for i, j in best_sol), best_sol
-
-
diff --git a/Fred2/EpitopeAssembly/__init__.py b/Fred2/EpitopeAssembly/__init__.py
index 81215210..edcac429 100644
--- a/Fred2/EpitopeAssembly/__init__.py
+++ b/Fred2/EpitopeAssembly/__init__.py
@@ -1,4 +1,3 @@
 __author__ = 'schubert'
 
-from Fred2.EpitopeAssembly.EpitopeAssembly import *
-from Fred2.EpitopeAssembly.MosaicVaccine import *
\ No newline at end of file
+from Fred2.EpitopeAssembly.EpitopeAssembly import *
\ No newline at end of file
diff --git a/Fred2/EpitopeSelection/OptiTope.py b/Fred2/EpitopeSelection/OptiTope.py
index 08e753b7..81fa2f2d 100644
--- a/Fred2/EpitopeSelection/OptiTope.py
+++ b/Fred2/EpitopeSelection/OptiTope.py
@@ -137,9 +137,13 @@ def __init__(self, results, threshold=None, k=10, solver="glpk", verbosity=0):
                 peps[seq] = p
                 for a, s in itr.izip(res_df.columns, tup[1:]):
                     if method in ["smm", "smmpmbec", "arb", "comblibsidney"]:
-                        thr = min(1., max(0.0, 1.0 - math.log(self.__thresh.get(a.name),
-                                          50000))) if a.name in self.__thresh else -float("inf")
-                        if s > thr:
+                        try:
+                            thr = min(1., max(0.0, 1.0 - math.log(self.__thresh.get(a.name),
+                                              50000))) if a.name in self.__thresh else -float("inf")
+                        except:
+                            thr = 0
+
+                        if s >= thr:
                             alleles_I.setdefault(a.name, set()).add(seq)
                             imm[seq, a.name] = min(1., max(0.0, 1.0 - math.log(s, 50000)))
                         else:
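Both the deleted branch and its replacement map IC50-style scores onto [0, 1] via 1 - log_50000(score), clamped. A small standalone check of the transform:

```python
import math

def ic50_to_score(ic50):
    # 1 - log_50000(IC50), clamped to [0, 1]; 50000 nM maps to 0, 1 nM to 1
    return min(1., max(0.0, 1.0 - math.log(ic50, 50000)))

print ic50_to_score(1)       # 1.0
print ic50_to_score(500)     # ~0.426 (500 nM, a common binder cutoff)
print ic50_to_score(50000)   # 0.0
```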
diff --git a/Fred2/test/TestProtein.py b/Fred2/test/TestProtein.py
index 0da167c7..94a863a2 100644
--- a/Fred2/test/TestProtein.py
+++ b/Fred2/test/TestProtein.py
@@ -9,7 +9,7 @@
 from Fred2.test.VariantsForTesting import *
 from Fred2.Core.Protein import Protein
-from Fred2.Core.Generator import generate_peptides_from_proteins, generate_transcripts_from_tumor_variants
+from Fred2.Core.Generator import generate_peptides_from_proteins
 from Fred2.Core.Generator import generate_proteins_from_transcripts
 from Fred2.IO.ADBAdapter import EIdentifierTypes
 
@@ -127,14 +127,6 @@ def test3_protein_from_variants(self):
         ## GENERATE Peptides:
         peptides = generate_peptides_from_proteins(proteins, 2)
 
-    def test1_protein_from_tumor_variants(self):
-        dummy_db = DummyAdapter()
-        normal_vars = [var_n2]
-        tumor_vars = [var_t1]
-        t = list(generate_transcripts_from_tumor_variants(normal_vars, tumor_vars, dummy_db, EIdentifierTypes.REFSEQ))
-        for trans in generate_transcripts_from_tumor_variants(normal_vars, tumor_vars, dummy_db,
-                                                              EIdentifierTypes.REFSEQ):
-            print "Tumor trans", repr(trans)
 
     def test4_peptides_from_variants(self):
         """
diff --git a/setup.py b/setup.py
index 24bb2e6d..f550c724 100644
--- a/setup.py
+++ b/setup.py
@@ -43,7 +43,7 @@
     name='Fred2',
 
     # Version:
-    version='2.0.0b2',
+    version='2.0.0',
 
     description='A Framework for Epitope Detection and Vaccine Design',
     long_description=readme,