From 6728d39756130ddc9889259953362d63f60e2455 Mon Sep 17 00:00:00 2001 From: Maxim Koltsov Date: Thu, 16 May 2024 16:04:53 +0300 Subject: [PATCH] version 0.1.5.4 Fix Fasta parser with modification (#74) --- ChangeLog.md | 3 +++ package.yaml | 2 +- src/Bio/FASTA.hs | 2 +- src/Bio/FASTA/Parser.hs | 8 +++++++- test/FASTA/order10.fasta | 2 ++ test/FASTA/order9.fasta | 8 ++++++++ test/FASTASpec.hs | 20 ++++++++++++++++---- 7 files changed, 38 insertions(+), 7 deletions(-) create mode 100644 test/FASTA/order10.fasta create mode 100644 test/FASTA/order9.fasta diff --git a/ChangeLog.md b/ChangeLog.md index ca6c332..4da6cc7 100644 --- a/ChangeLog.md +++ b/ChangeLog.md @@ -2,6 +2,9 @@ ## [Unreleased] +## [0.1.5.4] - 2024-05-16 +- Fix Fasta parser for unknown modifications at the end of the line. + ## [0.1.5.3] - 2023-12-08 - Update tests and dependencies. diff --git a/package.yaml b/package.yaml index 1c45c4d..74431c6 100644 --- a/package.yaml +++ b/package.yaml @@ -1,5 +1,5 @@ name: cobot-io -version: 0.1.5.3 +version: 0.1.5.4 github: "biocad/cobot-io" license: BSD3 category: Bio diff --git a/src/Bio/FASTA.hs b/src/Bio/FASTA.hs index 0c2ff4b..59848ea 100644 --- a/src/Bio/FASTA.hs +++ b/src/Bio/FASTA.hs @@ -28,7 +28,7 @@ import Bio.FASTA.Writer (WritableFastaToken (..), fastaToText) -- | Reads 'FastaSequence' from given file. -- -fromFile :: (MonadFail m, MonadIO m) => FilePath -> m (Fasta Char) +fromFile :: (MonadFail m, MonadIO m, ParsableFastaToken a) => FilePath -> m (Fasta a) fromFile f = liftIO (readFile f) >>= either (fail . errorBundlePretty) pure . parse fastaP (takeBaseName f) -- | Writes 'FastaSequence' to file. 
diff --git a/src/Bio/FASTA/Parser.hs b/src/Bio/FASTA/Parser.hs index 4eb7cc0..c7bf238 100644 --- a/src/Bio/FASTA/Parser.hs +++ b/src/Bio/FASTA/Parser.hs @@ -36,8 +36,14 @@ type Parser = Parsec Void Text parseOnly :: Parsec Void Text a -> Text -> Either String a parseOnly p s = first errorBundlePretty $ parse p "input.fasta" s +-- Using 'hspace1' instead of just 'space1' because our 'fastaLine' parser +-- expects each line to end with a line ending or end of file. If 'sc' consumed the end of line, +-- 'lexeme' in 'unknownP' would consume it too, so 'fastaLine' would not know that the line has ended +-- and would expect more symbols. +-- +-- 'hspace1' consumes only "horizontal" space, leaving the line ending for 'fastaLine'. sc :: Parser () -sc = L.space space1 empty empty +sc = L.space hspace1 empty empty lexeme :: Parser a -> Parser a lexeme = L.lexeme sc diff --git a/test/FASTA/order10.fasta b/test/FASTA/order10.fasta new file mode 100644 index 0000000..b8e091c --- /dev/null +++ b/test/FASTA/order10.fasta @@ -0,0 +1,2 @@ +>mol1 +[FAM]ACGT[UNK][ diff --git a/test/FASTA/order9.fasta b/test/FASTA/order9.fasta new file mode 100644 index 0000000..9e800ad --- /dev/null +++ b/test/FASTA/order9.fasta @@ -0,0 +1,8 @@ +>mol1 +[FAM]ACGT[UNK] + +>mol2 +[HEX]ACCGT + +>mol3 +[HEX]ACGTCA[UNK] diff --git a/test/FASTASpec.hs b/test/FASTASpec.hs index 619e4a0..7e0380c 100644 --- a/test/FASTASpec.hs +++ b/test/FASTASpec.hs @@ -10,9 +10,9 @@ import Prelude hiding (readFile, writeFile) import System.Directory (removeFile) import Test.Hspec -import Bio.FASTA (fastaP, fromFile, toFile) +import Bio.FASTA (ParsableFastaToken, fastaP, fromFile, toFile) import Bio.FASTA.Parser (parseOnly) -import Bio.FASTA.Type (Fasta, FastaItem (..)) +import Bio.FASTA.Type (Fasta, FastaItem (..), ModItem (..), Modification (..)) import Bio.Sequence (bareSequence) correctFasta1 :: Fasta Char @@ -45,6 +45,16 @@ badFasta7 = Left "input.fasta:2:1:\n |\n2 | 5\8217-CTTCAAGAGAGAGACCTGCGT-3\8217 badFasta8 :: Either String (Fasta Char) 
badFasta8 = Left "input.fasta:21:5:\n |\n21 | CMV + enhMCK + prcTnT-2\r\n | ^^\nunexpected \"+ \"\nexpecting end of input, end of line, or letter\n" +correctFasta9 :: Fasta ModItem +correctFasta9 = + [ FastaItem "mol1" $ bareSequence [Mod (Unknown "[FAM]"),Letter 'A',Letter 'C',Letter 'G',Letter 'T',Mod (Unknown "[UNK]")] + , FastaItem "mol2" $ bareSequence [Mod (Unknown "[HEX]"),Letter 'A',Letter 'C',Letter 'C',Letter 'G',Letter 'T'] + , FastaItem "mol3" $ bareSequence [Mod (Unknown "[HEX]"),Letter 'A',Letter 'C',Letter 'G',Letter 'T',Letter 'C',Letter 'A',Mod (Unknown "[UNK]")] + ] + +badFasta10 :: Either String (Fasta ModItem) +badFasta10 = Left "input.fasta:2:16:\n|\n2|[FAM]ACGT[UNK][\n|^\nunexpectednewline\nexpectingmodificationname\n" + fastaSpec :: Spec fastaSpec = describe "Fasta files parser" $ do describe "fromFile" $ do @@ -56,19 +66,21 @@ fastaSpec = describe "Fasta files parser" $ do parseBadFile "test/FASTA/order6.fasta" badFasta6 parseBadFile "test/FASTA/order7.fasta" badFasta7 parseBadFile "test/FASTA/order8.fasta" badFasta8 + parseFile "test/FASTA/order9.fasta" correctFasta9 + parseBadFile "test/FASTA/order10.fasta" badFasta10 describe "toFile" $ do writeFile "test/FASTA/input.fasta" correctFasta5 writeFile "test/FASTA/input.fasta" correctFasta1 writeFile "test/FASTA/input.fasta" correctFasta3 -parseFile :: FilePath -> Fasta Char -> Spec +parseFile :: (Show a, Eq a, ParsableFastaToken a) => FilePath -> Fasta a -> Spec parseFile path cf = it ("correctly parses good fasta from file " <> path) $ do fasta <- fromFile path fasta `shouldBe` cf -parseBadFile :: FilePath -> Either String (Fasta Char) -> Spec +parseBadFile :: (Show a, Eq a, ParsableFastaToken a) => FilePath -> Either String (Fasta a) -> Spec parseBadFile path cf = it ("correctly parses bad fasta from file " <> path) $ do res <- liftIO (readFile path)