diff --git a/README.md b/README.md index 1c49d81..5427f28 100644 --- a/README.md +++ b/README.md @@ -24,6 +24,9 @@ Options: --annottable=ANNOTTABLE Name of table with annotation assignments, at a minium must contain 'protein' and 'profile' columns. The 'profile' column might contain multiple profiles, separated by '&'. + --maxscore=MAXSCORE + Maximum sequence score; overrides the sequence score in scorefile if higher than this value, default Inf + --minscore=MINSCORE Minimum score, default 0 @@ -79,6 +82,10 @@ your output files are named after the profile, minus the `.hmm` and plus `.tblou The score file doesn't have to include all profiles, any that are not there will be set to 0 or whatever you specify with the `--minscore` parameter to the script. +If you have a lot of fragmentary sequences, scores in hmm files can be a little too strict (e.g. +TIGRFAMs). You can use `--maxscore` to limit the minimum score globally. In the output, the correct +score will appear so you can filter afterwards. + ## Running `hmmrank` with an annotation table You can provide a table with annotation data using the `--annottable` option. The table must contain @@ -99,4 +106,5 @@ that match both profiles will be assigned to whichever of the two they match bes Also note that score filtering, if asked for, takes place before evaluating combinations. Long proteins that match multiple profiles may be penalized by this if the target sequence is a fragment. You may want to lower high minimum scores to deal with this problem; take a particular look at -TIGRFAMs since they typically are long proteins with high default GA scores. +TIGRFAMs since they typically are long proteins with high default GA scores. See also description of +`--maxscore` above. diff --git a/conda/meta.yaml b/conda/meta.yaml index 6694e15..cae877c 100644 --- a/conda/meta.yaml +++ b/conda/meta.yaml @@ -7,6 +7,9 @@ package: source: url: https://github.com/erikrikarddaniel/hmmrank/archive/v{{ version }}.tar.gz +build: + noarch: generic + requirements: build: - python diff --git a/src/R-test/Makefile b/src/R-test/Makefile index fb89b27..1e7bc87 100644 --- a/src/R-test/Makefile +++ b/src/R-test/Makefile @@ -4,7 +4,7 @@ CHECK = if [ ` $(DIFF) | wc -l ` -eq 0 ]; then echo "*** $@ OK ***"; else echo " all: hmmrank -hmmrank: hmmrank.00 hmmrank.01 hmmrank.02 hmmrank.03 hmmrank.04 hmmrank.05 hmmrank.06 hmmrank.07 +hmmrank: hmmrank.00 hmmrank.01 hmmrank.02 hmmrank.03 hmmrank.04 hmmrank.05 hmmrank.06 hmmrank.07 hmmrank.08 hmmrank.00: ../R/hmmrank.r --outfile=$@.out $@.d/*.tblout @@ -39,3 +39,8 @@ hmmrank.06: hmmrank.07: ../R/hmmrank.r --annottable=$@.annottable.tsv --minscore=30 --scorefile=$@.profile_scores.tsv --qfromfname --outfile=$@.out $@.d/*.tblout @$(CHECK) + +# New option: --maxscore +hmmrank.08: + ../R/hmmrank.r --annottable=$@.annottable.tsv --minscore=30 --maxscore=100 --scorefile=$@.profile_scores.tsv --qfromfname --outfile=$@.out $@.d/*.tblout + @$(CHECK) diff --git a/src/R-test/hmmrank.08.annottable.tsv b/src/R-test/hmmrank.08.annottable.tsv new file mode 120000 index 0000000..e3d3d39 --- /dev/null +++ b/src/R-test/hmmrank.08.annottable.tsv @@ -0,0 +1 @@ +hmmrank.07.annottable.tsv \ No newline at end of file diff --git a/src/R-test/hmmrank.08.d b/src/R-test/hmmrank.08.d new file mode 120000 index 0000000..e3ee918 --- /dev/null +++ b/src/R-test/hmmrank.08.d @@ -0,0 +1 @@ +hmmrank.06.d \ No newline at end of file diff --git a/src/R-test/hmmrank.08.expect b/src/R-test/hmmrank.08.expect new file mode 100644 index 0000000..7485673 --- /dev/null +++ b/src/R-test/hmmrank.08.expect @@ -0,0 +1,51 @@ +accno profile evalue score min_score rank domain protein ec_number specific +envision.uc0.99_ENV1_ADD_022_k141_148422_1_1 PF06180 1.1e-5 32.5 26.2 1 All CbiK 4.99.1.3 T +envision.uc0.99_ENV1_ADD_022_k141_170195_19_1 PF06180 1.2e-5 32.4 26.2 1 All CbiK 4.99.1.3 T +envision.uc0.99_ENV1_ADD_022_k141_274526_4_1 arCOG01044 3.1e-38 139 30 1 Archaea SirC 1.3.1.76 T +envision.uc0.99_ENV1_ADD_022_k141_286016_2_1 ENOG4108Z73 1.8e-68 239.2 30 1 Bacteria SirCa 1.3.1.76 T +envision.uc0.99_ENV1_ADD_022_k141_29689_3_1 TIGR01469 8.4e-94 321.3 185 1 All CysG 2.1.1.107 F +envision.uc0.99_ENV1_ADD_022_k141_29689_3_1 TIGR01469 8.4e-94 321.3 185 1 All CobA 2.1.1.107 F +envision.uc0.99_ENV1_ADD_022_k141_309625_1_1 TIGR01469 1.5e-94 323.8 185 1 All CysG 2.1.1.107 F +envision.uc0.99_ENV1_ADD_022_k141_309625_1_1 TIGR01469 1.5e-94 323.8 185 1 All CobA 2.1.1.107 F +envision.uc0.99_ENV1_ADD_022_k141_43200_29_1 TIGR01469 2.2e-93 320 185 1 All CysG 2.1.1.107 F +envision.uc0.99_ENV1_ADD_022_k141_43200_29_1 TIGR01469 2.2e-93 320 185 1 All CobA 2.1.1.107 F +envision.uc0.99_ENV1_ADD_022_k141_43200_29_1 arCOG01044 1.1e-37 137.2 30 2 Archaea SirC 1.3.1.76 T +envision.uc0.99_ENV1_MSO_k141_109200_2_1 arCOG01044 2e-38 139.6 30 1 Archaea SirC 1.3.1.76 T +envision.uc0.99_ENV2_MSO_k141_13207_6_1 TIGR01466 9.8e-78 268.6 246.35 1 All CobI 2.1.1.130 T +envision.uc0.99_ENV2_S6_k141_202097_1_1 PF06180 2.6e-7 37.9 26.2 1 All CbiK 4.99.1.3 T +envision.uc0.99_ENV3_MSO_k141_117317_2_1 PF06180 5.7e-6 33.5 26.2 1 All CbiK 4.99.1.3 T +envision.uc0.99_ENV3_MSO_k141_120235_7_1 TIGR01467 4.5e-61 213.7 163.25 1 All CobJ 2.1.1.131 T +envision.uc0.99_ENV3_MSO_k141_218191_4_1 ENOG4108Z73 1.2e-74 259.5 30 1 Bacteria SirCa 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_218191_4_1 arCOG01044 1.4e-41 150 30 2 Archaea SirC 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_261229_1_1 ENOG4108Z73 3.3e-72 251.5 30 1 Bacteria SirCa 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_261229_1_1 arCOG01044 5e-41 148.1 30 2 Archaea SirC 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_275855_5_1 TIGR01469 3.6e-94 322.5 185 1 All CysG 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_275855_5_1 TIGR01469 3.6e-94 322.5 185 1 All CobA 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_275855_5_1 PF00590 3e-53 188.6 27.8 2 All FP00 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_27979_3_1 ENOG4108Z73 4.2e-67 234.7 30 1 Bacteria SirCa 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_32180_6_1 arCOG01044 8.9e-39 140.8 30 1 Archaea SirC 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_372698_2_1 TIGR01466 6.6e-76 262.6 246.35 1 All CobI 2.1.1.130 T +envision.uc0.99_ENV3_MSO_k141_383762_1_1 TIGR01469 5.1e-93 318.8 185 1 All CysG 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_383762_1_1 TIGR01469 5.1e-93 318.8 185 1 All CobA 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_383762_1_1 ENOG4108Z73 3.7e-68 238.1 30 2 Bacteria SirCa 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_383762_1_1 PF00590 3.2e-52 185.2 27.8 3 All FP00 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_41650_1_1 TIGR01469 5.6e-94 321.9 185 1 All CysG 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_41650_1_1 TIGR01469 5.6e-94 321.9 185 1 All CobA 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_41650_1_1 PF00590 1e-53 190.1 27.8 2 All FP00 2.1.1.107 F +envision.uc0.99_ENV3_MSO_k141_486766_2_1 TIGR01466 & TIGR01467 1.2e-80 486.40000000000003 409.6 1 Bacteria CobIJ 2.1.1.130 & 2.1.1.131 T +envision.uc0.99_ENV3_MSO_k141_486766_2_1 TIGR01466 & TIGR01467 1.2e-80 486.40000000000003 409.6 1 Archaea CobIJ 2.1.1.130 & 2.1.1.131 T +envision.uc0.99_ENV3_MSO_k141_486766_2_1 TIGR01466 1.2e-80 278.1 246.35 2 All CobI 2.1.1.130 T +envision.uc0.99_ENV3_MSO_k141_486766_2_1 TIGR01467 2e-59 208.3 163.25 3 All CobJ 2.1.1.131 T +envision.uc0.99_ENV3_MSO_k141_581444_2_1 ENOG4108Z73 6.2e-69 240.7 30 1 Bacteria SirCa 1.3.1.76 T +envision.uc0.99_ENV3_MSO_k141_588185_1_1 PF00590 4.5e-52 184.7 27.8 1 All FP00 2.1.1.107 F +envision.uc0.99_ENV3_S3_k141_16845_4_1 PF00590 3.1e-52 185.3 27.8 1 All FP00 2.1.1.107 F +envision.uc0.99_ENV3_S3_k141_232529_2_1 PF00590 6.6e-52 184.2 27.8 1 All FP00 2.1.1.107 F +envision.uc0.99_ENV3_S3_k141_237307_2_1 TIGR01467 2.2e-60 211.5 163.25 1 All CobJ 2.1.1.131 T +envision.uc0.99_ENV3_S3_k141_286858_3_1 PF06180 3.2e-6 34.3 26.2 1 All CbiK 4.99.1.3 T +envision.uc0.99_ENV3_S6_k141_296524_2_1 PF06180 2.5e-5 31.4 26.2 1 All CbiK 4.99.1.3 T +testing_presence_in_all_TIGRFAMS TIGR01466 & TIGR01467 1e-10 200 409.6 1 Bacteria CobIJ 2.1.1.130 & 2.1.1.131 T +testing_presence_in_all_TIGRFAMS TIGR01466 & TIGR01467 1e-10 200 409.6 1 Archaea CobIJ 2.1.1.130 & 2.1.1.131 T +testing_presence_in_all_TIGRFAMS TIGR01466 1e-10 100 246.35 3 All CobI 2.1.1.130 T +testing_presence_in_all_TIGRFAMS TIGR01467 1e-10 100 163.25 3 All CobJ 2.1.1.131 T +testing_presence_in_all_TIGRFAMS TIGR01469 1e-10 100 185 3 All CysG 2.1.1.107 F +testing_presence_in_all_TIGRFAMS TIGR01469 1e-10 100 185 3 All CobA 2.1.1.107 F diff --git a/src/R-test/hmmrank.08.profile_scores.tsv b/src/R-test/hmmrank.08.profile_scores.tsv new file mode 100644 index 0000000..ed9bfda --- /dev/null +++ b/src/R-test/hmmrank.08.profile_scores.tsv @@ -0,0 +1,6 @@ +profile score_type seq_score domain_score +PF00590 GA 27.80 27.80 +PF06180 GA 26.20 26.20 +TIGR01466 GA 246.35 246.35 +TIGR01467 GA 163.25 163.25 +TIGR01469 GA 185.00 185.00 diff --git a/src/R/hmmrank.r b/src/R/hmmrank.r index df8e274..d252e19 100755 --- a/src/R/hmmrank.r +++ b/src/R/hmmrank.r @@ -14,19 +14,23 @@ suppressPackageStartupMessages(library(tidyr)) suppressPackageStartupMessages(library(data.table)) suppressPackageStartupMessages(library(stringr)) -SCRIPT_VERSION = '1.4.0' +SCRIPT_VERSION = '1.4.1' # Get arguments # For interactive testing: # opt <- list('options' = list('verbose' = TRUE, qfromfname = FALSE, minscore = 0, scorefile = 'hmmrank.02.profile_scores.tsv'), 'args' = c('hmmrank.02.d/NrdAe.tblout', 'hmmrank.02.d/NrdAg.tblout', 'hmmrank.02.d/NrdAh.tblout', 'hmmrank.02.d/NrdAi.tblout', 'hmmrank.02.d/NrdAk.tblout', 'hmmrank.02.d/NrdAm.tblout', 'hmmrank.02.d/NrdAn.tblout', 'hmmrank.02.d/NrdAq.tblout', 'hmmrank.02.d/NrdA.tblout', 'hmmrank.02.d/NrdAz3.tblout', 'hmmrank.02.d/NrdAz4.tblout', 'hmmrank.02.d/NrdAz.tblout')) # # Annotation table testing: -# opt <- list('options' = list('verbose' = TRUE, qfromfname = TRUE, minscore = 0, scorefile = 'hmmrank.06.profile_scores.tsv', annottable = 'hmmrank.06.annottable.tsv'), 'args' = c('hmmrank.06.d/arCOG01044.tblout', 'hmmrank.06.d/ENOG4108Z73.tblout', 'hmmrank.06.d/PF00590.tblout', 'hmmrank.06.d/PF06180.tblout', 'hmmrank.06.d/TIGR01466.tblout', 'hmmrank.06.d/TIGR01467.tblout', 'hmmrank.06.d/TIGR01469.tblout')) +# opt <- list('options' = list('verbose' = TRUE, qfromfname = TRUE, minscore = 0, maxscore=100, scorefile = 'hmmrank.08.profile_scores.tsv', annottable = 'hmmrank.08.annottable.tsv'), 'args' = c('hmmrank.08.d/arCOG01044.tblout', 'hmmrank.08.d/ENOG4108Z73.tblout', 'hmmrank.08.d/PF00590.tblout', 'hmmrank.08.d/PF06180.tblout', 'hmmrank.08.d/TIGR01466.tblout', 'hmmrank.08.d/TIGR01467.tblout', 'hmmrank.08.d/TIGR01469.tblout')) option_list = list( make_option( '--annottable', type = "character", help = "Name of table with annotation assignments, at a minium must contain 'protein' and 'profile' columns. The 'profile' column might contain multiple profiles, separated by '&'." ), + make_option( + c('--maxscore'), type='double', default=Inf, + help='Maximum sequence score; overrides the sequence score in scorefile if higher than this value, default %default' + ), make_option( c('--minscore'), type='double', default=0.0, help='Minimum score, default %default' @@ -130,13 +134,20 @@ if ( length(opt$options$scorefile) > 0 ) { tblout[s, on = 'profile', min_score := i.min_score] tblout[is.na(min_score)]$min_score <- opt$options$minscore + tblout$max_score <- opt$options$maxscore } else { logmsg(sprintf("Setting minimum scores to %5.2f", opt$options$minscore)) tblout$min_score <- opt$options$minscore + tblout$max_score <- opt$options$maxscore } # Filter away entries with lower score than the minimum -tblout <- tblout[score >= min_score] +tblout <- lazy_dt(tblout) %>% + group_by(accno, profile) %>% + filter(score >= min(min_score, max_score)) %>% + ungroup() %>% + select(-max_score) %>% + as.data.table() # Read the annottable, if given, split into individual profiles, left join with tableout and summarise if ( length(opt$options$annottable) > 0 ) {