Skip to content

Commit

Permalink
Fix GFF parser (bebop#302)
Browse files Browse the repository at this point in the history
* fixed bebop#152
---------

Co-authored-by: Timothy Stiles <[email protected]>
  • Loading branch information
toothsy and TimothyStiles authored Mar 24, 2023
1 parent 09599af commit 22b6bfa
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 4 deletions.
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -175,3 +175,4 @@ Network Trash Folder
Temporary Items
.apdisk

.vscode/launch.json
3 changes: 3 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"cSpell.words": ["bioinformatics", "genbank", "polyjson"]
}
3 changes: 3 additions & 0 deletions data/ecoli-mg1655.gff
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
##gff-version 3
#!genome-build ASM584v2
#!genome-build-accession NCBI_Assembly:GCA_000005845.2
#!annotation-source NCBI RefSeq
##sequence-region U00096.3 1 4641652
U00096.3 feature gene 190 255 . + . db_xref=EcoGene:EG11277;gene=thrL;gene_synonym=ECK0001,JW4367;locus_tag=b0001
U00096.3 feature CDS 190 255 . + 0 codon_start=1;db_xref=GI:1786182,ASAP:ABE-0000006,UniProtKB/Swiss-Prot:P0AD86,EcoGene:EG11277;function=leader%3B Amino acid biosynthesis: Threonine,1.5.1.8 metabolism%3B building block biosynthesis%3B amino acids%3B threonine;gene=thrL;gene_synonym=ECK0001,JW4367;locus_tag=b0001;note=GO_process: GO:0009088 - threonine biosynthetic process;product=thr operon leader peptide;protein_id=AAC73112.1;transl_table=11;translation=MKRISTTITTTITITTGNGAG
Expand Down
34 changes: 30 additions & 4 deletions io/gff/gff.go
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ package gff

import (
"bytes"
"errors"
"io"
"os"
"sort"
Expand Down Expand Up @@ -127,10 +128,12 @@ func Parse(file io.Reader) (Gff, error) {
gff.Meta.CheckSum = blake3.Sum256(fileBytes)

lines := strings.Split(gffString, "\n")
metaString := lines[0:2]
regionStringArray, endOfMetaInfo, err := extractInfoFromField(lines, "##sequence-region")
metaString := lines[0:endOfMetaInfo]
versionString := metaString[0]
regionStringArray := strings.Split(metaString[1], " ")

if err != nil {
return Gff{}, err
}
// get name for general meta
meta := Meta{}
meta.Name = regionStringArray[1] // Formally region name, but changed to name here for generality/interoperability.
Expand All @@ -154,7 +157,7 @@ func Parse(file io.Reader) (Gff, error) {
fastaFlag = true
} else if len(line) == 0 {
continue
} else if line[0:2] == "##" {
} else if line[0:2] == "##" || line[0:2] == "#!" {
continue
} else if fastaFlag && line[0:1] != ">" {
// sequence.Sequence = sequence.Sequence + line
Expand Down Expand Up @@ -206,6 +209,29 @@ func Parse(file io.Reader) (Gff, error) {
return gff, err
}

// extractInfoFromField scans the header section at the top of a GFF file for
// the line that mentions fieldName.
//
// lines is the file content split on newlines. The header section is the
// leading run of lines that start with '#' (this covers both "##" directives
// and "#!" pragmas); scanning stops at the first line that does not start
// with '#', including an empty line.
//
// It returns:
//   - the matching header line split on single spaces; if several header
//     lines mention fieldName, the last one wins,
//   - the index of the first non-header line, or len(lines) when every line
//     is a header line,
//   - an error when no header line mentions fieldName (in which case the
//     other two results are nil and 0).
func extractInfoFromField(lines []string, fieldName string) ([]string, int, error) {
	// -1 means "not found"; 0 is a legitimate position for a header field.
	fieldIndex := -1
	// Default to "the whole input is header" so a file with no feature lines
	// still reports a usable end-of-header index.
	endOfMetaInfo := len(lines)

	for lineIndex, line := range lines {
		// Only a leading '#' marks a header line; a '#' inside a feature's
		// attribute column must not extend the header section.
		if !strings.HasPrefix(line, "#") {
			endOfMetaInfo = lineIndex
			break
		}
		if strings.Contains(line, fieldName) {
			fieldIndex = lineIndex
		}
	}

	if fieldIndex < 0 {
		return nil, 0, errors.New("the given file does not have any meta information")
	}
	return strings.Split(lines[fieldIndex], " "), endOfMetaInfo, nil
}

// Build takes an Annotated sequence and returns a byte array representing a gff to be written out.
func Build(sequence Gff) ([]byte, error) {
var gffBuffer bytes.Buffer
Expand Down

0 comments on commit 22b6bfa

Please sign in to comment.