diff --git a/COORDS/ce10/RefSeqExon.ce10.bed.gz b/COORDS/ce10/RefSeqExon.ce10.bed.gz index 6c66b1c..2e8bb6f 100644 Binary files a/COORDS/ce10/RefSeqExon.ce10.bed.gz and b/COORDS/ce10/RefSeqExon.ce10.bed.gz differ diff --git a/COORDS/ce11/RefSeqExon.ce11.bed.gz b/COORDS/ce11/RefSeqExon.ce11.bed.gz index e60ef47..a3a9d89 100644 Binary files a/COORDS/ce11/RefSeqExon.ce11.bed.gz and b/COORDS/ce11/RefSeqExon.ce11.bed.gz differ diff --git a/COORDS/ce6/RefSeqExon.ce6.bed.gz b/COORDS/ce6/RefSeqExon.ce6.bed.gz index fd25b08..6d6455b 100644 Binary files a/COORDS/ce6/RefSeqExon.ce6.bed.gz and b/COORDS/ce6/RefSeqExon.ce6.bed.gz differ diff --git a/COORDS/danRer10/RefSeqExon.danRer10.bed.gz b/COORDS/danRer10/RefSeqExon.danRer10.bed.gz index 3b99725..0d21a70 100644 Binary files a/COORDS/danRer10/RefSeqExon.danRer10.bed.gz and b/COORDS/danRer10/RefSeqExon.danRer10.bed.gz differ diff --git a/COORDS/danRer11/RefSeqExon.danRer11.bed.gz b/COORDS/danRer11/RefSeqExon.danRer11.bed.gz index 349ce7b..5baa7ff 100644 Binary files a/COORDS/danRer11/RefSeqExon.danRer11.bed.gz and b/COORDS/danRer11/RefSeqExon.danRer11.bed.gz differ diff --git a/COORDS/danRer7/RefSeqExon.danRer7.bed.gz b/COORDS/danRer7/RefSeqExon.danRer7.bed.gz index 20b74f1..f3a98e9 100644 Binary files a/COORDS/danRer7/RefSeqExon.danRer7.bed.gz and b/COORDS/danRer7/RefSeqExon.danRer7.bed.gz differ diff --git a/COORDS/dm3/RefSeqExon.dm3.bed.gz b/COORDS/dm3/RefSeqExon.dm3.bed.gz index 82c8133..db2970e 100644 Binary files a/COORDS/dm3/RefSeqExon.dm3.bed.gz and b/COORDS/dm3/RefSeqExon.dm3.bed.gz differ diff --git a/COORDS/dm6/RefSeqExon.dm6.bed.gz b/COORDS/dm6/RefSeqExon.dm6.bed.gz index ffd99c2..d091853 100644 Binary files a/COORDS/dm6/RefSeqExon.dm6.bed.gz and b/COORDS/dm6/RefSeqExon.dm6.bed.gz differ diff --git a/COORDS/hg18/RefSeqExon.hg18.bed.gz b/COORDS/hg18/RefSeqExon.hg18.bed.gz index 9f537c3..825bead 100644 Binary files a/COORDS/hg18/RefSeqExon.hg18.bed.gz and b/COORDS/hg18/RefSeqExon.hg18.bed.gz differ diff --git a/COORDS/hg19/RefSeqExon.hg19.bed.gz b/COORDS/hg19/RefSeqExon.hg19.bed.gz index 413fd30..a662534 100644 Binary files a/COORDS/hg19/RefSeqExon.hg19.bed.gz and b/COORDS/hg19/RefSeqExon.hg19.bed.gz differ diff --git a/COORDS/hg38/RefSeqExon.hg38.bed.gz b/COORDS/hg38/RefSeqExon.hg38.bed.gz index 0dc57bb..b1a65dc 100644 Binary files a/COORDS/hg38/RefSeqExon.hg38.bed.gz and b/COORDS/hg38/RefSeqExon.hg38.bed.gz differ diff --git a/COORDS/mm10/RefSeqExon.mm10.bed.gz b/COORDS/mm10/RefSeqExon.mm10.bed.gz index 2177215..2d2546c 100644 Binary files a/COORDS/mm10/RefSeqExon.mm10.bed.gz and b/COORDS/mm10/RefSeqExon.mm10.bed.gz differ diff --git a/COORDS/mm9/RefSeqExon.mm9.bed.gz b/COORDS/mm9/RefSeqExon.mm9.bed.gz index 2bdee77..3238d59 100644 Binary files a/COORDS/mm9/RefSeqExon.mm9.bed.gz and b/COORDS/mm9/RefSeqExon.mm9.bed.gz differ diff --git a/COORDS/rn5/RefSeqExon.rn5.bed.gz b/COORDS/rn5/RefSeqExon.rn5.bed.gz index e3a264c..eed6a89 100644 Binary files a/COORDS/rn5/RefSeqExon.rn5.bed.gz and b/COORDS/rn5/RefSeqExon.rn5.bed.gz differ diff --git a/COORDS/rn6/RefSeqExon.rn6.bed.gz b/COORDS/rn6/RefSeqExon.rn6.bed.gz index c4225e4..1044219 100644 Binary files a/COORDS/rn6/RefSeqExon.rn6.bed.gz and b/COORDS/rn6/RefSeqExon.rn6.bed.gz differ diff --git a/ChromHMM.jar b/ChromHMM.jar index 6349cd0..7341af4 100644 Binary files a/ChromHMM.jar and b/ChromHMM.jar differ diff --git a/ChromHMM.zip b/ChromHMM.zip index 1a364e5..e1f1f01 100644 Binary files a/ChromHMM.zip and b/ChromHMM.zip differ diff --git a/ChromHMM_manual.pdf b/ChromHMM_manual.pdf index 6aff163..55e5a32 100644 Binary files a/ChromHMM_manual.pdf and b/ChromHMM_manual.pdf differ diff --git a/edu/mit/compbio/ChromHMM/BrowserOutput.java b/edu/mit/compbio/ChromHMM/BrowserOutput.java index 0f3fd48..a4118aa 100644 --- a/edu/mit/compbio/ChromHMM/BrowserOutput.java +++ b/edu/mit/compbio/ChromHMM/BrowserOutput.java @@ -19,6 +19,7 @@ import java.io.*; import java.util.*; +import java.util.zip.GZIPOutputStream; /** * This class handles generating the browser output. @@ -119,9 +120,14 @@ else if (n1 > n2) */ int numstates; + /** + * True if files should be in gzip format + */ + boolean bgzip; + public BrowserOutput(String szsegmentfile, String szcolormapping,String szidlabelmapping, - String szsegmentationname, String szoutputfileprefix, int numstates) throws IOException + String szsegmentationname, String szoutputfileprefix, int numstates, boolean bgzip) throws IOException { this.szsegmentfile = szsegmentfile; this.szcolormapping =szcolormapping; @@ -129,6 +135,7 @@ public BrowserOutput(String szsegmentfile, String szcolormapping,String szidlabe this.szsegmentationname = szsegmentationname; this.szoutputfileprefix = szoutputfileprefix; this.numstates = numstates; + this.bgzip = bgzip; hmcolor = new HashMap(); hmlabelExtend = new HashMap(); @@ -358,43 +365,95 @@ private void makeLabelMapping() throws IOException */ public void makebrowserdense() throws IOException { - System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed"); - PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed")); + if (bgzip) + { + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed.gz"); + //PrintWriter pwzip = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed.gz")); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed.gz")); - BufferedReader brsegment = Util.getBufferedReader(szsegmentfile); - String szLine; - boolean bfirst = true; + BufferedReader brsegment = Util.getBufferedReader(szsegmentfile); + String szLine; + boolean bfirst = true; - while ((szLine =brsegment.readLine())!=null) + while ((szLine =brsegment.readLine())!=null) + { + StringTokenizer st = new StringTokenizer(szLine,"\t"); + String szcurrchrom = st.nextToken(); + int nbegin = Integer.parseInt(st.nextToken()); + int nend = Integer.parseInt(st.nextToken()); + String szFullID = st.nextToken(); + String szID = szFullID.substring(1); //this removes ordering type + if (bfirst) + { + String szout = "track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) + +" ordered)"+"\" visibility=1 itemRgb=\"On\""+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + //pw.println("track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) + // +" ordered)"+"\" visibility=1 itemRgb=\"On\""); + bfirst = false; + } + String szColor = (String) hmcolor.get(szID); + if (szColor == null) + { + throw new IllegalArgumentException("Color not given for "+szID); + } + + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get(szFullID))!=null) + { + szID = szID+"_"+szsuffix; + } + + String szout = szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor +"\n"; + //pw.println(szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor); + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + } + brsegment.close(); + pwzip.finish(); + pwzip.close(); + } + else { - StringTokenizer st = new StringTokenizer(szLine,"\t"); - String szcurrchrom = st.nextToken(); - int nbegin = Integer.parseInt(st.nextToken()); - int nend = Integer.parseInt(st.nextToken()); - String szFullID = st.nextToken(); - String szID = szFullID.substring(1); //this removes ordering type - if (bfirst) - { - pw.println("track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) - +" ordered)"+"\" visibility=1 itemRgb=\"On\""); - bfirst = false; - } - String szColor = (String) hmcolor.get(szID); - if (szColor == null) - { - throw new IllegalArgumentException("Color not given for "+szID); - } + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed"); + PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed")); - String szsuffix; - if ((szsuffix = (String) hmlabelExtend.get(szFullID))!=null) - { - szID = szID+"_"+szsuffix; - } + BufferedReader brsegment = Util.getBufferedReader(szsegmentfile); + String szLine; + boolean bfirst = true; - pw.println(szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor); - } - brsegment.close(); - pw.close(); + while ((szLine =brsegment.readLine())!=null) + { + StringTokenizer st = new StringTokenizer(szLine,"\t"); + String szcurrchrom = st.nextToken(); + int nbegin = Integer.parseInt(st.nextToken()); + int nend = Integer.parseInt(st.nextToken()); + String szFullID = st.nextToken(); + String szID = szFullID.substring(1); //this removes ordering type + if (bfirst) + { + pw.println("track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) + +" ordered)"+"\" visibility=1 itemRgb=\"On\""); + bfirst = false; + } + String szColor = (String) hmcolor.get(szID); + if (szColor == null) + { + throw new IllegalArgumentException("Color not given for "+szID); + } + + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get(szFullID))!=null) + { + szID = szID+"_"+szsuffix; + } + + pw.println(szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor); + } + brsegment.close(); + pw.close(); + } } @@ -407,8 +466,14 @@ public void makebrowserdense() throws IOException */ public void makebrowserexpanded() throws IOException { - System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed"); - PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed")); + if (bgzip) + { + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed.gz"); + } + else + { + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed"); + } String szLine; @@ -491,62 +556,148 @@ public void makebrowserexpanded() throws IOException } Arrays.sort(szChroms); - pw.println("track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) - +" ordered)"+"\" visibility=2 itemRgb=\"On\""); - int nbrowserend = (int) (((Integer)hmchromMax.get(szChroms[0])).intValue()*.001)+1; - pw.println("browser position "+szChroms[0]+":1-"+nbrowserend); - - for (int nlabel = szLabels.length-1; nlabel >=0; nlabel--) + if (bgzip) { - //UCSC browser seems to reverse the ordering of browser track files - String szcolor = (String) hmcolor.get(""+szLabels[nlabel]); - for (int nchrom = 0; nchrom < szChroms.length; nchrom++) - { - //omits those segment labels not observed at all on chromosome - ArrayList alRecs = (ArrayList) hmcoords.get(szChroms[nchrom]+"\t"+szLabels[nlabel]); - if (alRecs == null) continue; - - int nmax = ((Integer) hmchromMax.get(szChroms[nchrom])).intValue(); - - //this forces browser to display segment until the end of the chromosome - alRecs.add(new BeginEndRec(nmax-1,nmax)); - - int nsize = alRecs.size(); - int nmin = ((BeginEndRec) alRecs.get(0)).nbegin; - int nfinalend = nmax; - - String szoutlabel; - String szsuffix; - if ((szsuffix = (String) hmlabelExtend.get((String) hmlabelToFull.get(szLabels[nlabel])))!=null) - { - szoutlabel = szLabels[nlabel]+"_"+szsuffix; - } - else - { - szoutlabel = szLabels[nlabel]; - } + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed.gz")); + String szout = "track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) + +" ordered)"+"\" visibility=2 itemRgb=\"On\""+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println("track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) + // +" ordered)"+"\" visibility=2 itemRgb=\"On\""); + int nbrowserend = (int) (((Integer)hmchromMax.get(szChroms[0])).intValue()*.001)+1; + + szout = "browser position "+szChroms[0]+":1-"+nbrowserend+"\n"; + btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + //pwzip.println("browser position "+szChroms[0]+":1-"+nbrowserend); + + for (int nlabel = szLabels.length-1; nlabel >=0; nlabel--) + { + //UCSC browser seems to reverse the ordering of browser track files + String szcolor = (String) hmcolor.get(""+szLabels[nlabel]); + for (int nchrom = 0; nchrom < szChroms.length; nchrom++) + { + //omits those segment labels not observed at all on chromosome + ArrayList alRecs = (ArrayList) hmcoords.get(szChroms[nchrom]+"\t"+szLabels[nlabel]); + if (alRecs == null) continue; + + int nmax = ((Integer) hmchromMax.get(szChroms[nchrom])).intValue(); + + //this forces browser to display segment until the end of the chromosome + alRecs.add(new BeginEndRec(nmax-1,nmax)); + + int nsize = alRecs.size(); + int nmin = ((BeginEndRec) alRecs.get(0)).nbegin; + int nfinalend = nmax; + + String szoutlabel; + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get((String) hmlabelToFull.get(szLabels[nlabel])))!=null) + { + szoutlabel = szLabels[nlabel]+"_"+szsuffix; + } + else + { + szoutlabel = szLabels[nlabel]; + } + + StringBuffer sbout = new StringBuffer(); + sbout.append(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); + //pw.print(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); + //pw.print(0); //forcing the display to start at the beginning of the chromosome + sbout.append(0); + for (int ni = 0; ni < nsize; ni++) + { + BeginEndRec theBeginEndRec = (BeginEndRec) alRecs.get(ni); + int ndiff = theBeginEndRec.nend - theBeginEndRec.nbegin; + sbout.append(","); + sbout.append(ndiff); + //pw.print(","); + //pw.print(ndiff); + } + sbout.append("\t"); + sbout.append(0); + //pw.print("\t"); + //pw.print(0); + for (int ni = 0; ni < nsize; ni++) + { + int nloc = ((BeginEndRec) alRecs.get(ni)).nbegin; + sbout.append(","); + sbout.append(nloc); + //pw.print(","); + //pw.print(nloc); + } + sbout.append("\n"); + // pw.println(); + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + } + } + pwzip.finish(); + pwzip.close(); + } + else + { + PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed")); + pw.println("track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) + +" ordered)"+"\" visibility=2 itemRgb=\"On\""); + int nbrowserend = (int) (((Integer)hmchromMax.get(szChroms[0])).intValue()*.001)+1; + pw.println("browser position "+szChroms[0]+":1-"+nbrowserend); - pw.print(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); - pw.print(0); //forcing the display to start at the beginning of the chromosome - for (int ni = 0; ni < nsize; ni++) - { - BeginEndRec theBeginEndRec = (BeginEndRec) alRecs.get(ni); - int ndiff = theBeginEndRec.nend - theBeginEndRec.nbegin; - pw.print(","); - pw.print(ndiff); - } - pw.print("\t"); - pw.print(0); - for (int ni = 0; ni < nsize; ni++) - { - int nloc = ((BeginEndRec) alRecs.get(ni)).nbegin; - pw.print(","); - pw.print(nloc); - } - pw.println(); - } + for (int nlabel = szLabels.length-1; nlabel >=0; nlabel--) + { + //UCSC browser seems to reverse the ordering of browser track files + String szcolor = (String) hmcolor.get(""+szLabels[nlabel]); + for (int nchrom = 0; nchrom < szChroms.length; nchrom++) + { + //omits those segment labels not observed at all on chromosome + ArrayList alRecs = (ArrayList) hmcoords.get(szChroms[nchrom]+"\t"+szLabels[nlabel]); + if (alRecs == null) continue; + + int nmax = ((Integer) hmchromMax.get(szChroms[nchrom])).intValue(); + + //this forces browser to display segment until the end of the chromosome + alRecs.add(new BeginEndRec(nmax-1,nmax)); + + int nsize = alRecs.size(); + int nmin = ((BeginEndRec) alRecs.get(0)).nbegin; + int nfinalend = nmax; + + String szoutlabel; + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get((String) hmlabelToFull.get(szLabels[nlabel])))!=null) + { + szoutlabel = szLabels[nlabel]+"_"+szsuffix; + } + else + { + szoutlabel = szLabels[nlabel]; + } + + pw.print(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); + pw.print(0); //forcing the display to start at the beginning of the chromosome + for (int ni = 0; ni < nsize; ni++) + { + BeginEndRec theBeginEndRec = (BeginEndRec) alRecs.get(ni); + int ndiff = theBeginEndRec.nend - theBeginEndRec.nbegin; + pw.print(","); + pw.print(ndiff); + } + pw.print("\t"); + pw.print(0); + for (int ni = 0; ni < nsize; ni++) + { + int nloc = ((BeginEndRec) alRecs.get(ni)).nbegin; + pw.print(","); + pw.print(nloc); + } + pw.println(); + } + } + pw.close(); } - pw.close(); } } diff --git a/edu/mit/compbio/ChromHMM/ChromHMM.java b/edu/mit/compbio/ChromHMM/ChromHMM.java index 2351f94..8c72246 100644 --- a/edu/mit/compbio/ChromHMM/ChromHMM.java +++ b/edu/mit/compbio/ChromHMM/ChromHMM.java @@ -506,6 +506,11 @@ public class ChromHMM boolean bpseudo = false; + /** + * true if output should be zipped files + */ + boolean bgzip = false; + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Stores an integer index and array of boolean flags @@ -610,7 +615,7 @@ public ChromHMM(String szinputdir, String szoutputdir, String szinputfilelist,St int nmaxiterations,double dcovergediff,int nmaxseconds,boolean bprintposterior, boolean bprintsegment,boolean bprintstatebyline, int nbinsize,String szoutfileID,int nstateorder,boolean bordercols,int nzerotransitionpower, Color theColor, boolean bnormalEM, int nmaxprocessors, boolean blowmem, - int numincludeseq, boolean bprintimage, boolean bscaleemissions, boolean bpseudo) throws IOException + int numincludeseq, boolean bprintimage, boolean bscaleemissions, boolean bpseudo, boolean bgzip) throws IOException { this.szinputdir = szinputdir; this.szoutputdir = szoutputdir; @@ -643,6 +648,7 @@ public ChromHMM(String szinputdir, String szoutputdir, String szinputfilelist,St this.bprintimage = bprintimage; this.bscaleemissions = bscaleemissions; this.bpseudo = bpseudo; + this.bgzip = bgzip; hmlabelExtend = new HashMap(); theRandom = new Random(nseed); @@ -746,7 +752,8 @@ public ChromHMM(String szInitFile, String szoutputdir, String szstateorderingfil * Constructor initializes the variable and loads the data used for making segmentation from a model */ public ChromHMM(String szinputdir, String szinputfilelist, String szchromlengthfile, String szoutputdir, String szInitFile, String szoutfileID, - int nbinsize, boolean bprintposterior, boolean bprintsegment,boolean bprintstatebyline, boolean blowmem, boolean bscaleemissions) throws IOException + int nbinsize, boolean bprintposterior, boolean bprintsegment,boolean bprintstatebyline, + boolean blowmem, boolean bscaleemissions, boolean bgzip) throws IOException { this.szinputdir = szinputdir; this.szinputfilelist = szinputfilelist; @@ -760,6 +767,7 @@ public ChromHMM(String szinputdir, String szinputfilelist, String szchromlengthf this.nbinsize = nbinsize; this.blowmem = blowmem; this.bscaleemissions = bscaleemissions; + this.bgzip = bgzip; hmlabelExtend = new HashMap(); if (blowmem) @@ -4601,62 +4609,149 @@ else if (ch=='1') } hsprefix.add(szprefix); + GZIPOutputStream pwprobszip = null; PrintWriter pwprobs = null; - if (bprintposterior) + + GZIPOutputStream pwmaxzip = null; + PrintWriter pwmax = null; + + GZIPOutputStream pwbedzip = null; + PrintWriter pwbed = null; + + if (bgzip) { - //creates the posterior file + //PrintWriter pwprobs = null; + if (bprintposterior) + { + //creates the posterior file - String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION+".gz"; + + System.out.println("Writing to file "+szposterioroutfilename); + pwprobszip = new GZIPOutputStream(new FileOutputStream(szposterioroutfilename)); + //pwprobs = new PrintWriter(szposterioroutfilename); + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]+"\n"; + + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + + StringBuffer sbout = new StringBuffer(); + //pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + for (int ni = 0; ni < numstates-1; ni++) + { + sbout.append(""+chorder+(ni+1)+"\t"); + //pwprobs.print(""+chorder+(ni+1)+"\t"); + } + sbout.append(""+chorder+(numstates)+"\n"); + //pwprobs.println(""+chorder+(numstates)); + + btformat = sbout.toString().getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + //PrintWriter pwmax = null; - System.out.println("Writing to file "+szposterioroutfilename); - pwprobs = new PrintWriter(szposterioroutfilename); - pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - for (int ni = 0; ni < numstates-1; ni++) + + if (bprintstatebyline) { - pwprobs.print(""+chorder+(ni+1)+"\t"); - } - pwprobs.println(""+chorder+(numstates)); - } + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION+".gz"; + + System.out.println("Writing to file "+szmaxoutfilename); + //pwmax = new PrintWriter(szmaxoutfilename); + pwmaxzip = new GZIPOutputStream(new FileOutputStream(szmaxoutfilename)); + + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]+"\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + + szout = "MaxState "+chorder+"\n"; + btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwmax.println("MaxState "+chorder); + } - PrintWriter pwmax = null; - if (bprintstatebyline) - { - //creates a file which has the state with the maximum posterior probability - String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; + //PrintWriter pwbed = null; + //GZIPOutputStream pwbedzip = null; - System.out.println("Writing to file "+szmaxoutfilename); - pwmax = new PrintWriter(szmaxoutfilename); - pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - pwmax.println("MaxState "+chorder); - } + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbedzip = (GZIPOutputStream) hmcellToFourColPW.get(cellSeq[nordered_nseq]); + // (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - PrintWriter pwbed = null; - if (bprintsegment) + if (pwbedzip == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION+".gz"; + + pwbedzip = new GZIPOutputStream(new FileOutputStream(szsegmentoutfilename)); + //pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbedzip); + } + } + } + else { - //creates a file which has the maximum segmentation - //we only have one file per cell type here - pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - if (pwbed == null) + if (bprintposterior) { - //haven't seen this cell type - String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + //creates the posterior file + + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; + + System.out.println("Writing to file "+szposterioroutfilename); + pwprobs = new PrintWriter(szposterioroutfilename); + pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + for (int ni = 0; ni < numstates-1; ni++) + { + pwprobs.print(""+chorder+(ni+1)+"\t"); + } + pwprobs.println(""+chorder+(numstates)); + } - pwbed = new PrintWriter(szsegmentoutfilename); - System.out.println("Writing to file "+szsegmentoutfilename); + if (bprintstatebyline) + { + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; - hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + System.out.println("Writing to file "+szmaxoutfilename); + pwmax = new PrintWriter(szmaxoutfilename); + pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + pwmax.println("MaxState "+chorder); } - } + //PrintWriter pwbed = null; + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); + + if (pwbed == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + + pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + } + } + } if (bscaleemissions) - { + { for (int ni = 0; ni < nobserved; ni++) { //going through each combination of marks - //if (traindataObservedSeqFlags_nseq[ni]) + //if (traindataObservedSeqFlags_nseq[ni]) //{ //this signature of marks is observed on the current chromosome so //updating its emission probabilities @@ -4966,70 +5061,197 @@ else if (ch=='1') double dmaxval = 0; int nmaxstate = 0; - //handling the first line - for (int ns = 0; ns < gamma_nt.length; ns++) - { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) - { - if (ns > 0) - { - //print with tab if not the first - pwprobs.print("\t"+nf.format(dprob)); - } - else - { - pwprobs.print(nf.format(dprob)); - } - } + if (bgzip) + { + //handling the first line + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + //print with tab if not the first + szout = "\t"+nf.format(dprob); + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } - if (dprob > dmaxval) - { - //best one found so far - dmaxval = dprob; - nmaxstate = ns; - } - } + if (dprob > dmaxval) + { + //best one found so far + dmaxval = dprob; + nmaxstate = ns; + } + } - if (bprintposterior) - { - pwprobs.println(); - } + if (bprintposterior) + { + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(); + } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } + if (bprintstatebyline) + { + String szout = (nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } - //this contains the best state of the previous interval - int nmaxstateprev = nmaxstate; + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; - for (int nt = 1; nt < numtime_nseq; nt++) - { - gamma_nt = gamma[nt]; + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; + + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + //print with tab the first time + szout = "\t"+nf.format(dprob); + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(); + } - dmaxval = 0; - nmaxstate = 0; + if (bprintstatebyline) + { + String szout =""+(nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmaxzip.finish(); + pwmaxzip.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobszip.finish(); + pwprobszip.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + GZIPOutputStream pwzip = (GZIPOutputStream) itr.next(); + //PrintWriter pw = (PrintWriter) itr.next(); + pwzip.finish(); + pwzip.close(); + //pw.close(); + } + } + } + else + { + //handling the first line for (int ns = 0; ns < gamma_nt.length; ns++) { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + + if (bprintposterior) { if (ns > 0) { - //print with tab the first time pwprobs.print("\t"+nf.format(dprob)); - } - else - { + } + else + { pwprobs.print(nf.format(dprob)); } } if (dprob > dmaxval) { + //best one found so far dmaxval = dprob; nmaxstate = ns; } @@ -5037,64 +5259,108 @@ else if (ch=='1') if (bprintposterior) { - pwprobs.println(); + pwprobs.println(); } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } - - if (bprintsegment&&(nmaxstateprev != nmaxstate)) + if (bprintstatebyline) { - //print out last segment we are done with - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); - //start a new segment now - nstart = nt; - nmaxstateprev = nmaxstate; - } - } + pwmax.println(""+(nmaxstate+1)); + } - if (bprintsegment) - { - int nlastcoordinate; - Integer objMaxCoord = null; - if (hmMaxCoord != null) - { - objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); - } + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; - if (objMaxCoord != null) - { - nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); - } - else - { - nlastcoordinate = numtime_nseq*nbinsize; - } - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); - } + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; - //close out the max state file if that was requested - if (bprintstatebyline) - { - pwmax.close(); - } - //close out the posterior state file if that was requested - if (bprintposterior) - { - pwprobs.close(); - } - } + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + if (ns > 0) + { + //print with tab the first time + pwprobs.print("\t"+nf.format(dprob)); + } + else + { + pwprobs.print(nf.format(dprob)); + } + } - //if segment print was requested then we are going to go close those printwriters - if (bprintsegment) - { - Iterator itr = hmcellToFourColPW.values().iterator(); - while (itr.hasNext()) - { - PrintWriter pw = (PrintWriter) itr.next(); - pw.close(); + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + pwprobs.println(); + } + + if (bprintstatebyline) + { + pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmax.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobs.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + PrintWriter pw = (PrintWriter) itr.next(); + pw.close(); + } + } } } } @@ -5209,51 +5475,125 @@ public void makeSegmentation() throws IOException } hsprefix.add(szprefix); + + GZIPOutputStream pwprobszip = null; + GZIPOutputStream pwmaxzip = null; + GZIPOutputStream pwbedzip = null; PrintWriter pwprobs = null; - if (bprintposterior) + PrintWriter pwmax = null; + PrintWriter pwbed = null; + + if (bgzip) { - //creates the posterior file + if (bprintposterior) + { + //creates the posterior file - String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION+".gz"; + + System.out.println("Writing to file "+szposterioroutfilename); + //pwprobs = new PrintWriter(szposterioroutfilename); + pwprobszip = new GZIPOutputStream(new FileOutputStream(szposterioroutfilename)); + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]+"\n"; + + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + + //pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + StringBuffer sbout = new StringBuffer(); + for (int ni = 0; ni < numstates-1; ni++) + { + sbout.append(""+chorder+(ni+1)+"\t"); + //pwprobs.print(""+chorder+(ni+1)+"\t"); + } + sbout.append(""+chorder+(numstates)+"\n"); + + btformat = sbout.toString().getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(""+chorder+(numstates)); + } - System.out.println("Writing to file "+szposterioroutfilename); - pwprobs = new PrintWriter(szposterioroutfilename); - pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - for (int ni = 0; ni < numstates-1; ni++) + if (bprintstatebyline) { - pwprobs.print(""+chorder+(ni+1)+"\t"); - } - pwprobs.println(""+chorder+(numstates)); - } + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION+".gz"; + System.out.println("Writing to file "+szmaxoutfilename); + pwmaxzip = new GZIPOutputStream(new FileOutputStream(szmaxoutfilename)); + //pwmax = new PrintWriter(szmaxoutfilename); + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq] +"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + + szout = "MaxState "+chorder+"\n"; + + btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + //pwmax.println("MaxState "+chorder); + } - PrintWriter pwmax = null; - if (bprintstatebyline) - { - //creates a file which has the state with the maximum posterior probability - String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbedzip = (GZIPOutputStream) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - System.out.println("Writing to file "+szmaxoutfilename); - pwmax = new PrintWriter(szmaxoutfilename); - pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - pwmax.println("MaxState "+chorder); - } + if (pwbedzip == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION+".gz"; - PrintWriter pwbed = null; - if (bprintsegment) + pwbedzip = new GZIPOutputStream(new FileOutputStream(szsegmentoutfilename)); + //pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbedzip); + } + } + } + else { - //creates a file which has the maximum segmentation - //we only have one file per cell type here - pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); + if (bprintposterior) + { + //creates the posterior file + + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; - if (pwbed == null) + System.out.println("Writing to file "+szposterioroutfilename); + pwprobs = new PrintWriter(szposterioroutfilename); + pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + for (int ni = 0; ni < numstates-1; ni++) + { + pwprobs.print(""+chorder+(ni+1)+"\t"); + } + pwprobs.println(""+chorder+(numstates)); + } + + if (bprintstatebyline) { - //haven't seen this cell type - String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; + System.out.println("Writing to file "+szmaxoutfilename); + pwmax = new PrintWriter(szmaxoutfilename); + pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + pwmax.println("MaxState "+chorder); + } - pwbed = new PrintWriter(szsegmentoutfilename); - System.out.println("Writing to file "+szsegmentoutfilename); + //PrintWriter pwbed = null; + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + if (pwbed == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + + pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + } } } @@ -5572,70 +5912,197 @@ public void makeSegmentation() throws IOException double dmaxval = 0; int nmaxstate = 0; - //handling the first line - for (int ns = 0; ns < gamma_nt.length; ns++) - { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) - { - if (ns > 0) - { - //print with tab if not the first - pwprobs.print("\t"+nf.format(dprob)); - } - else - { - pwprobs.print(nf.format(dprob)); - } - } + if (bgzip) + { + //handling the first line + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + szout = "\t"+nf.format(dprob); + //print with tab if not the first + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } - if (dprob > dmaxval) - { - //best one found so far - dmaxval = dprob; - nmaxstate = ns; - } - } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } - if (bprintposterior) - { - pwprobs.println(); - } + if (dprob > dmaxval) + { + //best one found so far + dmaxval = dprob; + nmaxstate = ns; + } + } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } + if (bprintposterior) + { + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(); + } - //this contains the best state of the previous interval - int nmaxstateprev = nmaxstate; + if (bprintstatebyline) + { + String szout = ""+(nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } - for (int nt = 1; nt < numtime_nseq; nt++) - { - gamma_nt = gamma[nt]; + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; + + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; + + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + szout = "\t"+nf.format(dprob); + //print with tab the first time + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + //pwprobs.println(); + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + if (bprintstatebyline) + { + String szout = (nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1) +"\n"; + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); - dmaxval = 0; - nmaxstate = 0; + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmaxzip.finish(); + pwmaxzip.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobszip.finish(); + pwprobszip.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + GZIPOutputStream pwzip = (GZIPOutputStream) itr.next(); + pwzip.finish(); + pwzip.close(); + } + } + } + else + { + //handling the first line for (int ns = 0; ns < gamma_nt.length; ns++) { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) { if (ns > 0) { - //print with tab the first time + //print with tab if not the first pwprobs.print("\t"+nf.format(dprob)); - } - else - { + } + else + { pwprobs.print(nf.format(dprob)); } } if (dprob > dmaxval) { + //best one found so far dmaxval = dprob; nmaxstate = ns; } @@ -5646,61 +6113,105 @@ public void makeSegmentation() throws IOException pwprobs.println(); } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } - - if (bprintsegment&&(nmaxstateprev != nmaxstate)) + if (bprintstatebyline) { - //print out last segment we are done with - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); - //start a new segment now - nstart = nt; - nmaxstateprev = nmaxstate; - } - } + pwmax.println(""+(nmaxstate+1)); + } - if (bprintsegment) - { - int nlastcoordinate; - Integer objMaxCoord = null; - if (hmMaxCoord != null) - { - objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); - } + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; - if (objMaxCoord != null) - { - nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); - } - else - { - nlastcoordinate = numtime_nseq*nbinsize; - } - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); - } + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; - //close out the max state file if that was requested - if (bprintstatebyline) - { - pwmax.close(); - } - //close out the posterior state file if that was requested - if (bprintposterior) - { - pwprobs.close(); - } - } + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + if (ns > 0) + { + //print with tab the first time + pwprobs.print("\t"+nf.format(dprob)); + } + else + { + pwprobs.print(nf.format(dprob)); + } + } - //if segment print was requested then we are going to go close those printwriters - if (bprintsegment) - { - Iterator itr = hmcellToFourColPW.values().iterator(); - while (itr.hasNext()) - { - PrintWriter pw = (PrintWriter) itr.next(); - pw.close(); + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + pwprobs.println(); + } + + if (bprintstatebyline) + { + pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmax.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobs.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + PrintWriter pw = (PrintWriter) itr.next(); + pw.close(); + } + } } } } @@ -10665,10 +11176,11 @@ public static void main(String[] args) throws IOException if (szcommand.equalsIgnoreCase("Version")) { - System.out.println("This is Version 1.15 of ChromHMM (c) Copyright 2008-2012 Massachusetts Institute of Technology"); + System.out.println("This is Version 1.16 of ChromHMM (c) Copyright 2008-2012 Massachusetts Institute of Technology"); } else if ((szcommand.equals("BinarizeBam"))||(szcommand.equalsIgnoreCase("BinarizeBed"))) { + boolean bgzip = false; boolean bpairend = false; String szcontroldir=null; int nflankwidthcontrol = 5; @@ -10725,6 +11237,10 @@ else if (args[nargindex].equals("-g")) { dcountthresh = Double.parseDouble(args[++nargindex]); } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-n")) { nshift = Integer.parseInt(args[++nargindex]); @@ -10830,7 +11346,7 @@ else if (args[nargindex].equals("-w")) nshift,bcenterinterval, noffsetleft,noffsetright,szoutputsignaldir, szoutputbinarydir,szoutputcontroldir, dpoissonthresh,dfoldthresh,bcontainsthresh, - npseudocountcontrol,nbinsize,szcolfields,bpeaks, dcountthresh,szcommand.equalsIgnoreCase("BinarizeBam"),bpairend); + npseudocountcontrol,nbinsize,szcolfields,bpeaks, dcountthresh,szcommand.equalsIgnoreCase("BinarizeBam"),bpairend, bgzip); } else @@ -10844,14 +11360,14 @@ else if (args[nargindex].equals("-w")) if (szcommand.equalsIgnoreCase("BinarizeBed")) { System.out.println("usage BinarizeBed [-b binsize][-c controldir][-center][-colfields chromosome,start,end[,strand]][-e offsetend][-f foldthresh]"+ - "[-g signalthresh][-n shift][-o outputcontroldir][-p poissonthresh][-peaks][-s offsetstart][-strictthresh][-t outputsignaldir]"+ + "[-g signalthresh][-gzip][-n shift][-o outputcontroldir][-p poissonthresh][-peaks][-s offsetstart][-strictthresh][-t outputsignaldir]"+ "[-u pseudocountcontrol][-w flankwidthcontrol] "+ "chromosomelengthfile inputbeddir cellmarkfiletable outputbinarydir"); } else { System.out.println("usage BinarizeBam [-b binsize][-c controldir][-e offsetend][-f foldthresh]"+ - "[-g signalthresh][-o outputcontroldir][-p poissonthresh][-paired|[-center][-n shift][-peaks]]"+ + "[-g signalthresh][-gzip][-o outputcontroldir][-p poissonthresh][-paired|[-center][-n shift][-peaks]]"+ "[-s offsetstart][-strictthresh][-t outputsignaldir]"+ "[-u pseudocountcontrol][-w flankwidthcontrol] "+ "chromosomelengthfile inputbamdir cellmarkfiletable outputbinarydir"); @@ -10861,6 +11377,7 @@ else if (args[nargindex].equals("-w")) else if (szcommand.equalsIgnoreCase("BinarizeSignal")) { boolean bcontainsthresh = true; + boolean bgzip = false; double dfoldthresh = 0; double dcountthresh = 0; double dpoissonthresh = 0.0001; @@ -10894,6 +11411,10 @@ else if (args[nargindex].equals("-g")) { dcountthresh = Double.parseDouble(args[++nargindex]); } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-p")) { dpoissonthresh = Double.parseDouble(args[++nargindex]); @@ -10940,11 +11461,13 @@ else if (args[nargindex].equals("-w")) if (szcontroldir!=null) { Preprocessing.makeBinaryDataFromSignalAgainstControl(szsignaldir,szcontroldir, szoutputdir, - dpoissonthresh, dfoldthresh,bcontainsthresh, nflankwidthcontrol,npseudocountcontrol, dcountthresh); + dpoissonthresh, dfoldthresh,bcontainsthresh, + nflankwidthcontrol,npseudocountcontrol, dcountthresh,bgzip); } else { - Preprocessing.makeBinaryDataFromSignalUniform(szsignaldir, szoutputdir, dpoissonthresh, dfoldthresh, bcontainsthresh,dcountthresh); + Preprocessing.makeBinaryDataFromSignalUniform(szsignaldir, szoutputdir, dpoissonthresh, + dfoldthresh, bcontainsthresh,dcountthresh,bgzip); } } else @@ -10955,7 +11478,7 @@ else if (args[nargindex].equals("-w")) if (!bok) { - System.out.println("usage BinarizeSignal [-c controldir][-f foldthresh][-g signalthresh][-p poissonthresh][-strictthresh][-u pseudocountcontrol][-w flankwidth] signaldir outputdir"); + System.out.println("usage BinarizeSignal [-c controldir][-f foldthresh][-g signalthresh][-gzip][-p poissonthresh][-strictthresh][-u pseudocountcontrol][-w flankwidth] signaldir outputdir"); } } @@ -11208,7 +11731,7 @@ else if (szcommand.equalsIgnoreCase("MakeSegmentation")) int nbinsize = ChromHMM.DEFAULT_BINSIZEBASEPAIRS; String szoutfileID = ""; boolean blowmem = false; - + boolean bgzip = false; int nargindex = 1; try @@ -11223,6 +11746,10 @@ else if (args[nargindex].equals("-f")) { szinputfilelist = args[++nargindex]; } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-i")) { szoutfileID = args[++nargindex]; @@ -11307,7 +11834,7 @@ else if (args[nargindex].equals("-printstatesbyline")) { ChromHMM theHMM = new ChromHMM(szinputdir, szinputfilelist,szchromlengthfile, szoutputdir, szmodelfile, szoutfileID, nbinsize, bprintposterior, - bprintsegments,bprintstatebyline, blowmem,bscaleemissions); + bprintsegments,bprintstatebyline, blowmem,bscaleemissions, bgzip); if (blowmem) { @@ -11330,7 +11857,7 @@ else if (args[nargindex].equals("-printstatesbyline")) if (!bok) { - System.out.println("usage: MakeSegmentation [-b binsize][-f inputfilelist][-i outfileID][-l chromosomelengthfile][-lowmem][-many][-nobed]"+ + System.out.println("usage: MakeSegmentation [-b binsize][-f inputfilelist][-gzip][-i outfileID][-l chromosomelengthfile][-lowmem][-many][-nobed]"+ "[-printposterior][-printstatesbyline]"+ " modelfile inputdir outputdir"); } @@ -11339,6 +11866,7 @@ else if (szcommand.equalsIgnoreCase("MakeBrowserFiles")) { String szcolormapping = null; String szlabelmapping = null; + boolean bgzip = false; int nargindex = 1; int numstates = -1; @@ -11349,6 +11877,10 @@ else if (szcommand.equalsIgnoreCase("MakeBrowserFiles")) { szcolormapping = args[++nargindex]; } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if ((args[nargindex].equals("-l"))||(args[nargindex].equals("-m"))) { //the -l is for backwards compatibility @@ -11372,7 +11904,7 @@ else if (args[nargindex].equals("-n")) String szsegmentationname = args[nargindex++]; String szoutputfileprefix =args[nargindex]; BrowserOutput theBrowserOutput = new BrowserOutput(szsegmentfile,szcolormapping,szlabelmapping, - szsegmentationname, szoutputfileprefix,numstates); + szsegmentationname, szoutputfileprefix,numstates,bgzip); theBrowserOutput.makebrowserdense(); theBrowserOutput.makebrowserexpanded(); } @@ -11383,7 +11915,7 @@ else if (args[nargindex].equals("-n")) if (!bok) { - System.out.println("usage: MakeBrowserFiles [-c colormappingfile][-m labelmappingfile][-n numstates] segmentfile segmentationname outputfileprefix"); + System.out.println("usage: MakeBrowserFiles [-c colormappingfile][-gzip][-m labelmappingfile][-n numstates] segmentfile segmentationname outputfileprefix"); } } else if (szcommand.equalsIgnoreCase("OverlapEnrichment")) @@ -11729,6 +12261,10 @@ else if (szcommand.equalsIgnoreCase("LearnModel")) String path = ChromHMM.class.getProtectionDomain().getCodeSource().getLocation().getPath(); String decodedPath = URLDecoder.decode(path, "UTF-8"); String szprefixpath = decodedPath.substring(0, decodedPath.lastIndexOf("/") + 1); + //String szfullprefixpathchromsizedir = szprefixpath+"/"+CHROMSIZESDIR; + String szfullprefixpathcoorddir = szprefixpath+"/"+COORDDIR; + String szfullprefixpathanchorfiledir = szprefixpath+"/"+ANCHORFILEDIR; + //System.out.println("prefix path is "+szprefixpath); @@ -11750,6 +12286,7 @@ else if (szcommand.equalsIgnoreCase("LearnModel")) boolean blowmem = false; boolean bscaleemissions = false; boolean bpseudo = false; + boolean bgzip = false; String szinputfilelist = null; String szchromlengthfile = null; @@ -11802,6 +12339,10 @@ else if (args[nargindex].equals("-f")) { szinputfilelist = args[++nargindex]; } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-h")) { dinformationsmooth = Double.parseDouble(args[++nargindex]); @@ -11918,6 +12459,18 @@ else if (args[nargindex].equals("-printstatebyline")) { bprintstatebyline = true; } + //else if (args[nargindex].equals("-u")) + //{ + // szfullprefixpathchromsizedir = args[++nargindex]; + //} + else if (args[nargindex].equals("-u")) + { + szfullprefixpathcoorddir = args[++nargindex]; + } + else if (args[nargindex].equals("-v")) + { + szfullprefixpathanchorfiledir = args[++nargindex]; + } else if (args[nargindex].equals("-x")) { nmaxseconds = Integer.parseInt(args[++nargindex]); @@ -12000,9 +12553,12 @@ else if (args[nargindex].equals("-z")) if (szchromlengthfile == null) { + //updated v.1.16 File flength = new File(szprefixpath+"/"+CHROMSIZESDIR+"/"+szassembly+".txt"); + //File flength = new File(szfullprefixpathchromsizedir+"/"+szassembly+".txt"); if (flength.exists()) { + //szchromlengthfile = szfullprefixpathchromsizedir+"/"+szassembly+".txt"; szchromlengthfile = szprefixpath+"/"+CHROMSIZESDIR+"/"+szassembly+".txt"; } } @@ -12010,7 +12566,7 @@ else if (args[nargindex].equals("-z")) szInitFile,dloadsmoothemission,dloadsmoothtransition,dinformationsmooth, nmaxiterations,dconvergediff,nmaxseconds, bprintposterior,bprintsegments,bprintstatebyline, nbinsize,szoutfileID,nstateorder,bordercols,nzerotransitionpower,theColor,bnormalEM, nmaxprocessors, - blowmem,numincludeseq,bprintimage,bscaleemissions, bpseudo); + blowmem,numincludeseq,bprintimage,bscaleemissions, bpseudo, bgzip); theHMM.buildModel(); @@ -12083,7 +12639,15 @@ else if (args[nargindex].equals("-z")) for (int nfile = 0; nfile < prefixes.length; nfile++) { - String szsegmentfile = prefixes[nfile]+ChromHMM.SZSEGMENTEXTENSION; + String szsegmentfile; + if (bgzip) + { + szsegmentfile = prefixes[nfile]+ChromHMM.SZSEGMENTEXTENSION+".gz"; + } + else + { + szsegmentfile = prefixes[nfile]+ChromHMM.SZSEGMENTEXTENSION; + } pwweb.println("