diff --git a/gffutils/__init__.py b/gffutils/__init__.py
index 5b1ac81c..aea0e288 100644
--- a/gffutils/__init__.py
+++ b/gffutils/__init__.py
@@ -5,5 +5,9 @@
 from gffutils.helpers import example_filename
 from gffutils.exceptions import FeatureNotFoundError, DuplicateIDError
 from gffutils.version import version as __version__
-
-
+# BioPython is an optional dependency; skip the integration module when it
+# is not installed instead of breaking `import gffutils`.
+try:
+    from gffutils import biopython_integration
+except ImportError:
+    pass
diff --git a/gffutils/biopython_integration.py b/gffutils/biopython_integration.py
new file mode 100644
index 00000000..3b7376b0
--- /dev/null
+++ b/gffutils/biopython_integration.py
@@ -0,0 +1,80 @@
+"""
+Module for integration with BioPython, specifically SeqRecords and SeqFeature
+objects.
+"""
+try:
+    from Bio.SeqFeature import SeqFeature, FeatureLocation
+except ImportError:
+    raise ImportError(
+        "BioPython must be installed to use this module")
+from . import Feature
+
+# Maps GFF strand symbols to BioPython strand integers.
+_biopython_strand = {
+    '+': 1,
+    '-': -1,
+    '.': 0,
+}
+_feature_strand = dict((v, k) for k, v in _biopython_strand.items())
+
+# GFF fields that to_seqfeature() stores as qualifiers.
+_gff_field_qualifiers = ('source', 'score', 'seqid', 'frame')
+
+
+def to_seqfeature(f):
+    """
+    Converts a gffutils.Feature object to a Bio.SeqFeature object.
+
+    The GFF fields `source`, `score`, `seqid`, and `frame` are stored as
+    qualifiers.  GFF `attributes` are also stored as qualifiers.
+
+    GFF coordinates are 1-based and inclusive, while BioPython locations are
+    0-based and half-open, so the start is shifted down by one and the stop
+    is used unchanged.
+    """
+    qualifiers = {
+        'source': [f.source],
+        'score': [f.score],
+        'seqid': [f.seqid],
+        'frame': [f.frame],
+    }
+    qualifiers.update(f.attributes)
+    return SeqFeature(
+        # The strand is carried by the location object.
+        FeatureLocation(
+            f.start - 1, f.stop, strand=_biopython_strand[f.strand]),
+        id=f.id,
+        type=f.featuretype,
+        qualifiers=qualifiers
+    )
+
+
+def from_seqfeature(s, **kwargs):
+    """
+    Converts a Bio.SeqFeature object to a gffutils.Feature object.
+
+    The GFF fields `source`, `score`, `seqid`, and `frame` are assumed to be
+    stored as qualifiers; any of them that is missing defaults to ".".  Any
+    other qualifiers will be assumed to be GFF attributes.
+
+    The 0-based, half-open BioPython location is converted back to 1-based,
+    inclusive GFF coordinates, and a location without a strand (None) maps
+    to the GFF strand ".".  `s.id` is not copied onto the Feature.  Extra
+    keyword arguments are passed through to the Feature constructor.
+    """
+    source = s.qualifiers.get('source', '.')[0]
+    score = s.qualifiers.get('score', '.')[0]
+    seqid = s.qualifiers.get('seqid', '.')[0]
+    frame = s.qualifiers.get('frame', '.')[0]
+    strand = s.location.strand
+    strand = '.' if strand is None else _feature_strand[strand]
+    start = int(s.location.start) + 1
+    stop = int(s.location.end)
+    featuretype = s.type
+    # Copy so the SeqFeature's own qualifiers are left untouched, then drop
+    # the GFF fields; they are optional, so a missing one must not raise.
+    attributes = dict(s.qualifiers)
+    for key in _gff_field_qualifiers:
+        attributes.pop(key, None)
+    return Feature(seqid, source, featuretype, start, stop, score, strand,
+                   frame, attributes, **kwargs)
diff --git a/gffutils/test/test_biopython_integration.py b/gffutils/test/test_biopython_integration.py
new file mode 100644
index 00000000..4a99b14d
--- /dev/null
+++ b/gffutils/test/test_biopython_integration.py
@@ -0,0 +1,41 @@
+from Bio.SeqFeature import SeqFeature, FeatureLocation
+
+from gffutils import example_filename, create, parser, feature
+import gffutils
+import gffutils.biopython_integration as bp
+
+
+def test_roundtrip():
+    """
+    Feature -> SeqFeature -> Feature should be invariant.
+    """
+    db_fname = gffutils.example_filename("gff_example1.gff3")
+    db = gffutils.create_db(db_fname, ':memory:')
+    feature = db['ENSMUSG00000033845']
+    feature.keep_order = True
+    dialect = feature.dialect
+    s = bp.to_seqfeature(feature)
+    f = bp.from_seqfeature(s, dialect=dialect, keep_order=True)
+    assert feature == f
+
+
+def test_coordinates():
+    """
+    1-based inclusive GFF coordinates map to a 0-based half-open location.
+    """
+    db_fname = gffutils.example_filename("gff_example1.gff3")
+    db = gffutils.create_db(db_fname, ':memory:')
+    gene = db['ENSMUSG00000033845']
+    s = bp.to_seqfeature(gene)
+    assert int(s.location.start) == gene.start - 1
+    assert int(s.location.end) == gene.stop
+
+
+def test_missing_qualifiers():
+    """
+    A SeqFeature lacking the GFF field qualifiers should still convert.
+    """
+    s = SeqFeature(FeatureLocation(0, 10, strand=1), type='gene')
+    f = bp.from_seqfeature(s)
+    assert (f.start, f.stop, f.strand) == (1, 10, '+')
+    assert (f.seqid, f.source) == ('.', '.')