Adding option to maximize score from scorefile (--maxscore)

erikrikarddaniel · Aug 7, 2020 · 0d2af3a · 0d2af3a
1 parent 19c6bc8
commit 0d2af3a
Show file tree

Hide file tree

Showing 8 changed files with 91 additions and 5 deletions.
diff --git a/README.md b/README.md
@@ -24,6 +24,9 @@ Options:
         --annottable=ANNOTTABLE
                 Name of table with annotation assignments, at a minium must contain 'protein' and 'profile' columns. The 'profile' column might contain multiple profiles, separated by '&'.
 
+        --maxscore=MAXSCORE
+                Maximum sequence score; overrides the sequence score in scorefile if higher than this value, default Inf
+
         --minscore=MINSCORE
                 Minimum score, default 0
 
@@ -79,6 +82,10 @@ your output files are named after the profile, minus the `.hmm` and plus `.tblou
 The score file doesn't have to include all profiles, any that are not there will be set to 0 or
 whatever you specify with the `--minscore` parameter to the script.
 
+If you have a lot of fragmentary sequences, scores in hmm files can be a little too strict (e.g.
+TIGRFAMs). You can use `--maxscore` to limit the minimum score globally. In the output, the correct
+score will appear so you can filter afterwards.
+
 ## Running `hmmrank` with an annotation table
 
 You can provide a table with annotation data using the `--annottable` option. The table must contain
@@ -99,4 +106,5 @@ that match both profiles will be assigned to whichever of the two they match bes
 Also note that score filtering, if asked for, takes place before evaluating combinations. Long
 proteins that match multiple profiles may be penalized by this if the target sequence is a fragment.
 You may want to lower high minimum scores to deal with this problem; take a particular look at
-TIGRFAMs since they typically are long proteins with high default GA scores.
+TIGRFAMs since they typically are long proteins with high default GA scores. See also description of
+`--maxscore` above.
diff --git a/conda/meta.yaml b/conda/meta.yaml
@@ -7,6 +7,9 @@ package:
 source:
   url: https://github.com/erikrikarddaniel/hmmrank/archive/v{{ version }}.tar.gz
 
+build:
+  noarch: generic
+
 requirements:
   build:
     - python

diff --git a/src/R-test/Makefile b/src/R-test/Makefile
@@ -4,7 +4,7 @@ CHECK = if [ ` $(DIFF) | wc -l ` -eq 0 ]; then echo "*** $@ OK ***"; else echo "
 
 all: hmmrank
 
-hmmrank: hmmrank.00 hmmrank.01 hmmrank.02 hmmrank.03 hmmrank.04 hmmrank.05 hmmrank.06 hmmrank.07
+hmmrank: hmmrank.00 hmmrank.01 hmmrank.02 hmmrank.03 hmmrank.04 hmmrank.05 hmmrank.06 hmmrank.07 hmmrank.08
 
 hmmrank.00:
 	../R/hmmrank.r --outfile=$@.out $@.d/*.tblout
@@ -39,3 +39,8 @@ hmmrank.06:
 hmmrank.07:
 	../R/hmmrank.r --annottable=$@.annottable.tsv --minscore=30 --scorefile=$@.profile_scores.tsv --qfromfname --outfile=$@.out $@.d/*.tblout
 	@$(CHECK)
+
+# New option: --maxscore
+hmmrank.08:
+	../R/hmmrank.r --annottable=$@.annottable.tsv --minscore=30 --maxscore=100 --scorefile=$@.profile_scores.tsv --qfromfname --outfile=$@.out $@.d/*.tblout
+	@$(CHECK)
diff --git a/src/R-test/hmmrank.08.annottable.tsv b/src/R-test/hmmrank.08.annottable.tsv
@@ -0,0 +1 @@
+hmmrank.07.annottable.tsv
diff --git a/src/R-test/hmmrank.08.d b/src/R-test/hmmrank.08.d
@@ -0,0 +1 @@
+hmmrank.06.d
diff --git a/src/R-test/hmmrank.08.expect b/src/R-test/hmmrank.08.expect
@@ -0,0 +1,51 @@
+accno	profile	evalue	score	min_score	rank	domain	protein	ec_number	specific
+envision.uc0.99_ENV1_ADD_022_k141_148422_1_1	PF06180	1.1e-5	32.5	26.2	1	All	CbiK	4.99.1.3	T
+envision.uc0.99_ENV1_ADD_022_k141_170195_19_1	PF06180	1.2e-5	32.4	26.2	1	All	CbiK	4.99.1.3	T
+envision.uc0.99_ENV1_ADD_022_k141_274526_4_1	arCOG01044	3.1e-38	139	30	1	Archaea	SirC	1.3.1.76	T
+envision.uc0.99_ENV1_ADD_022_k141_286016_2_1	ENOG4108Z73	1.8e-68	239.2	30	1	Bacteria	SirCa	1.3.1.76	T
+envision.uc0.99_ENV1_ADD_022_k141_29689_3_1	TIGR01469	8.4e-94	321.3	185	1	All	CysG	2.1.1.107	F
+envision.uc0.99_ENV1_ADD_022_k141_29689_3_1	TIGR01469	8.4e-94	321.3	185	1	All	CobA	2.1.1.107	F
+envision.uc0.99_ENV1_ADD_022_k141_309625_1_1	TIGR01469	1.5e-94	323.8	185	1	All	CysG	2.1.1.107	F
+envision.uc0.99_ENV1_ADD_022_k141_309625_1_1	TIGR01469	1.5e-94	323.8	185	1	All	CobA	2.1.1.107	F
+envision.uc0.99_ENV1_ADD_022_k141_43200_29_1	TIGR01469	2.2e-93	320	185	1	All	CysG	2.1.1.107	F
+envision.uc0.99_ENV1_ADD_022_k141_43200_29_1	TIGR01469	2.2e-93	320	185	1	All	CobA	2.1.1.107	F
+envision.uc0.99_ENV1_ADD_022_k141_43200_29_1	arCOG01044	1.1e-37	137.2	30	2	Archaea	SirC	1.3.1.76	T
+envision.uc0.99_ENV1_MSO_k141_109200_2_1	arCOG01044	2e-38	139.6	30	1	Archaea	SirC	1.3.1.76	T
+envision.uc0.99_ENV2_MSO_k141_13207_6_1	TIGR01466	9.8e-78	268.6	246.35	1	All	CobI	2.1.1.130	T
+envision.uc0.99_ENV2_S6_k141_202097_1_1	PF06180	2.6e-7	37.9	26.2	1	All	CbiK	4.99.1.3	T
+envision.uc0.99_ENV3_MSO_k141_117317_2_1	PF06180	5.7e-6	33.5	26.2	1	All	CbiK	4.99.1.3	T
+envision.uc0.99_ENV3_MSO_k141_120235_7_1	TIGR01467	4.5e-61	213.7	163.25	1	All	CobJ	2.1.1.131	T
+envision.uc0.99_ENV3_MSO_k141_218191_4_1	ENOG4108Z73	1.2e-74	259.5	30	1	Bacteria	SirCa	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_218191_4_1	arCOG01044	1.4e-41	150	30	2	Archaea	SirC	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_261229_1_1	ENOG4108Z73	3.3e-72	251.5	30	1	Bacteria	SirCa	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_261229_1_1	arCOG01044	5e-41	148.1	30	2	Archaea	SirC	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_275855_5_1	TIGR01469	3.6e-94	322.5	185	1	All	CysG	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_275855_5_1	TIGR01469	3.6e-94	322.5	185	1	All	CobA	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_275855_5_1	PF00590	3e-53	188.6	27.8	2	All	FP00	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_27979_3_1	ENOG4108Z73	4.2e-67	234.7	30	1	Bacteria	SirCa	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_32180_6_1	arCOG01044	8.9e-39	140.8	30	1	Archaea	SirC	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_372698_2_1	TIGR01466	6.6e-76	262.6	246.35	1	All	CobI	2.1.1.130	T
+envision.uc0.99_ENV3_MSO_k141_383762_1_1	TIGR01469	5.1e-93	318.8	185	1	All	CysG	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_383762_1_1	TIGR01469	5.1e-93	318.8	185	1	All	CobA	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_383762_1_1	ENOG4108Z73	3.7e-68	238.1	30	2	Bacteria	SirCa	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_383762_1_1	PF00590	3.2e-52	185.2	27.8	3	All	FP00	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_41650_1_1	TIGR01469	5.6e-94	321.9	185	1	All	CysG	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_41650_1_1	TIGR01469	5.6e-94	321.9	185	1	All	CobA	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_41650_1_1	PF00590	1e-53	190.1	27.8	2	All	FP00	2.1.1.107	F
+envision.uc0.99_ENV3_MSO_k141_486766_2_1	TIGR01466 & TIGR01467	1.2e-80	486.40000000000003	409.6	1	Bacteria	CobIJ	2.1.1.130 & 2.1.1.131	T
+envision.uc0.99_ENV3_MSO_k141_486766_2_1	TIGR01466 & TIGR01467	1.2e-80	486.40000000000003	409.6	1	Archaea	CobIJ	2.1.1.130 & 2.1.1.131	T
+envision.uc0.99_ENV3_MSO_k141_486766_2_1	TIGR01466	1.2e-80	278.1	246.35	2	All	CobI	2.1.1.130	T
+envision.uc0.99_ENV3_MSO_k141_486766_2_1	TIGR01467	2e-59	208.3	163.25	3	All	CobJ	2.1.1.131	T
+envision.uc0.99_ENV3_MSO_k141_581444_2_1	ENOG4108Z73	6.2e-69	240.7	30	1	Bacteria	SirCa	1.3.1.76	T
+envision.uc0.99_ENV3_MSO_k141_588185_1_1	PF00590	4.5e-52	184.7	27.8	1	All	FP00	2.1.1.107	F
+envision.uc0.99_ENV3_S3_k141_16845_4_1	PF00590	3.1e-52	185.3	27.8	1	All	FP00	2.1.1.107	F
+envision.uc0.99_ENV3_S3_k141_232529_2_1	PF00590	6.6e-52	184.2	27.8	1	All	FP00	2.1.1.107	F
+envision.uc0.99_ENV3_S3_k141_237307_2_1	TIGR01467	2.2e-60	211.5	163.25	1	All	CobJ	2.1.1.131	T
+envision.uc0.99_ENV3_S3_k141_286858_3_1	PF06180	3.2e-6	34.3	26.2	1	All	CbiK	4.99.1.3	T
+envision.uc0.99_ENV3_S6_k141_296524_2_1	PF06180	2.5e-5	31.4	26.2	1	All	CbiK	4.99.1.3	T
+testing_presence_in_all_TIGRFAMS	TIGR01466 & TIGR01467	1e-10	200	409.6	1	Bacteria	CobIJ	2.1.1.130 & 2.1.1.131	T
+testing_presence_in_all_TIGRFAMS	TIGR01466 & TIGR01467	1e-10	200	409.6	1	Archaea	CobIJ	2.1.1.130 & 2.1.1.131	T
+testing_presence_in_all_TIGRFAMS	TIGR01466	1e-10	100	246.35	3	All	CobI	2.1.1.130	T
+testing_presence_in_all_TIGRFAMS	TIGR01467	1e-10	100	163.25	3	All	CobJ	2.1.1.131	T
+testing_presence_in_all_TIGRFAMS	TIGR01469	1e-10	100	185	3	All	CysG	2.1.1.107	F
+testing_presence_in_all_TIGRFAMS	TIGR01469	1e-10	100	185	3	All	CobA	2.1.1.107	F
diff --git a/src/R-test/hmmrank.08.profile_scores.tsv b/src/R-test/hmmrank.08.profile_scores.tsv
@@ -0,0 +1,6 @@
+profile	score_type	seq_score	domain_score
+PF00590	GA	27.80	27.80
+PF06180	GA	26.20	26.20
+TIGR01466	GA	246.35	246.35
+TIGR01467	GA	163.25	163.25
+TIGR01469	GA	185.00	185.00
diff --git a/src/R/hmmrank.r b/src/R/hmmrank.r
@@ -14,19 +14,23 @@ suppressPackageStartupMessages(library(tidyr))
 suppressPackageStartupMessages(library(data.table))
 suppressPackageStartupMessages(library(stringr))
 
-SCRIPT_VERSION = '1.4.0'
+SCRIPT_VERSION = '1.4.1'
 
 # Get arguments
 # For interactive testing:
 # opt <- list('options' = list('verbose' = TRUE, qfromfname = FALSE, minscore = 0, scorefile = 'hmmrank.02.profile_scores.tsv'), 'args' = c('hmmrank.02.d/NrdAe.tblout', 'hmmrank.02.d/NrdAg.tblout', 'hmmrank.02.d/NrdAh.tblout', 'hmmrank.02.d/NrdAi.tblout', 'hmmrank.02.d/NrdAk.tblout', 'hmmrank.02.d/NrdAm.tblout', 'hmmrank.02.d/NrdAn.tblout', 'hmmrank.02.d/NrdAq.tblout', 'hmmrank.02.d/NrdA.tblout', 'hmmrank.02.d/NrdAz3.tblout', 'hmmrank.02.d/NrdAz4.tblout', 'hmmrank.02.d/NrdAz.tblout'))
 #
 # Annotation table testing:
-# opt <- list('options' = list('verbose' = TRUE, qfromfname = TRUE, minscore = 0, scorefile = 'hmmrank.06.profile_scores.tsv', annottable = 'hmmrank.06.annottable.tsv'), 'args' = c('hmmrank.06.d/arCOG01044.tblout', 'hmmrank.06.d/ENOG4108Z73.tblout', 'hmmrank.06.d/PF00590.tblout', 'hmmrank.06.d/PF06180.tblout', 'hmmrank.06.d/TIGR01466.tblout', 'hmmrank.06.d/TIGR01467.tblout', 'hmmrank.06.d/TIGR01469.tblout'))
+# opt <- list('options' = list('verbose' = TRUE, qfromfname = TRUE, minscore = 0, maxscore=100, scorefile = 'hmmrank.08.profile_scores.tsv', annottable = 'hmmrank.08.annottable.tsv'), 'args' = c('hmmrank.08.d/arCOG01044.tblout', 'hmmrank.08.d/ENOG4108Z73.tblout', 'hmmrank.08.d/PF00590.tblout', 'hmmrank.08.d/PF06180.tblout', 'hmmrank.08.d/TIGR01466.tblout', 'hmmrank.08.d/TIGR01467.tblout', 'hmmrank.08.d/TIGR01469.tblout'))
 option_list = list(
   make_option(
     '--annottable', type = "character",
     help = "Name of table with annotation assignments, at a minium must contain 'protein' and 'profile' columns. The 'profile' column might contain multiple profiles, separated by '&'."
   ),
+  make_option(
+    c('--maxscore'), type='double', default=Inf,
+    help='Maximum sequence score; overrides the sequence score in scorefile if higher than this value, default %default'
+  ),
   make_option(
     c('--minscore'), type='double', default=0.0,
     help='Minimum score, default %default'
@@ -130,13 +134,20 @@ if ( length(opt$options$scorefile) > 0 ) {
 
   tblout[s, on = 'profile', min_score := i.min_score] 
   tblout[is.na(min_score)]$min_score <- opt$options$minscore
+  tblout$max_score <- opt$options$maxscore
 } else {
   logmsg(sprintf("Setting minimum scores to %5.2f", opt$options$minscore))
   tblout$min_score <- opt$options$minscore
+  tblout$max_score <- opt$options$maxscore
 }
 
 # Filter away entries with lower score than the minimum
-tblout <- tblout[score >= min_score]
+tblout <- lazy_dt(tblout) %>%
+  group_by(accno, profile) %>%
+  filter(score >= min(min_score, max_score)) %>%
+  ungroup() %>%
+  select(-max_score) %>%
+  as.data.table()
 
 # Read the annottable, if given, split into individual profiles, left join with tableout and summarise
 if ( length(opt$options$annottable) > 0 ) {