diff --git a/COORDS/ce10/RefSeqExon.ce10.bed.gz b/COORDS/ce10/RefSeqExon.ce10.bed.gz index 6c66b1c..2e8bb6f 100644 Binary files a/COORDS/ce10/RefSeqExon.ce10.bed.gz and b/COORDS/ce10/RefSeqExon.ce10.bed.gz differ diff --git a/COORDS/ce11/RefSeqExon.ce11.bed.gz b/COORDS/ce11/RefSeqExon.ce11.bed.gz index e60ef47..a3a9d89 100644 Binary files a/COORDS/ce11/RefSeqExon.ce11.bed.gz and b/COORDS/ce11/RefSeqExon.ce11.bed.gz differ diff --git a/COORDS/ce6/RefSeqExon.ce6.bed.gz b/COORDS/ce6/RefSeqExon.ce6.bed.gz index fd25b08..6d6455b 100644 Binary files a/COORDS/ce6/RefSeqExon.ce6.bed.gz and b/COORDS/ce6/RefSeqExon.ce6.bed.gz differ diff --git a/COORDS/danRer10/RefSeqExon.danRer10.bed.gz b/COORDS/danRer10/RefSeqExon.danRer10.bed.gz index 3b99725..0d21a70 100644 Binary files a/COORDS/danRer10/RefSeqExon.danRer10.bed.gz and b/COORDS/danRer10/RefSeqExon.danRer10.bed.gz differ diff --git a/COORDS/danRer11/RefSeqExon.danRer11.bed.gz b/COORDS/danRer11/RefSeqExon.danRer11.bed.gz index 349ce7b..5baa7ff 100644 Binary files a/COORDS/danRer11/RefSeqExon.danRer11.bed.gz and b/COORDS/danRer11/RefSeqExon.danRer11.bed.gz differ diff --git a/COORDS/danRer7/RefSeqExon.danRer7.bed.gz b/COORDS/danRer7/RefSeqExon.danRer7.bed.gz index 20b74f1..f3a98e9 100644 Binary files a/COORDS/danRer7/RefSeqExon.danRer7.bed.gz and b/COORDS/danRer7/RefSeqExon.danRer7.bed.gz differ diff --git a/COORDS/dm3/RefSeqExon.dm3.bed.gz b/COORDS/dm3/RefSeqExon.dm3.bed.gz index 82c8133..db2970e 100644 Binary files a/COORDS/dm3/RefSeqExon.dm3.bed.gz and b/COORDS/dm3/RefSeqExon.dm3.bed.gz differ diff --git a/COORDS/dm6/RefSeqExon.dm6.bed.gz b/COORDS/dm6/RefSeqExon.dm6.bed.gz index ffd99c2..d091853 100644 Binary files a/COORDS/dm6/RefSeqExon.dm6.bed.gz and b/COORDS/dm6/RefSeqExon.dm6.bed.gz differ diff --git a/COORDS/hg18/RefSeqExon.hg18.bed.gz b/COORDS/hg18/RefSeqExon.hg18.bed.gz index 9f537c3..825bead 100644 Binary files a/COORDS/hg18/RefSeqExon.hg18.bed.gz and b/COORDS/hg18/RefSeqExon.hg18.bed.gz differ diff --git a/COORDS/hg19/RefSeqExon.hg19.bed.gz b/COORDS/hg19/RefSeqExon.hg19.bed.gz index 413fd30..a662534 100644 Binary files a/COORDS/hg19/RefSeqExon.hg19.bed.gz and b/COORDS/hg19/RefSeqExon.hg19.bed.gz differ diff --git a/COORDS/hg38/RefSeqExon.hg38.bed.gz b/COORDS/hg38/RefSeqExon.hg38.bed.gz index 0dc57bb..b1a65dc 100644 Binary files a/COORDS/hg38/RefSeqExon.hg38.bed.gz and b/COORDS/hg38/RefSeqExon.hg38.bed.gz differ diff --git a/COORDS/mm10/RefSeqExon.mm10.bed.gz b/COORDS/mm10/RefSeqExon.mm10.bed.gz index 2177215..2d2546c 100644 Binary files a/COORDS/mm10/RefSeqExon.mm10.bed.gz and b/COORDS/mm10/RefSeqExon.mm10.bed.gz differ diff --git a/COORDS/mm9/RefSeqExon.mm9.bed.gz b/COORDS/mm9/RefSeqExon.mm9.bed.gz index 2bdee77..3238d59 100644 Binary files a/COORDS/mm9/RefSeqExon.mm9.bed.gz and b/COORDS/mm9/RefSeqExon.mm9.bed.gz differ diff --git a/COORDS/rn5/RefSeqExon.rn5.bed.gz b/COORDS/rn5/RefSeqExon.rn5.bed.gz index e3a264c..eed6a89 100644 Binary files a/COORDS/rn5/RefSeqExon.rn5.bed.gz and b/COORDS/rn5/RefSeqExon.rn5.bed.gz differ diff --git a/COORDS/rn6/RefSeqExon.rn6.bed.gz b/COORDS/rn6/RefSeqExon.rn6.bed.gz index c4225e4..1044219 100644 Binary files a/COORDS/rn6/RefSeqExon.rn6.bed.gz and b/COORDS/rn6/RefSeqExon.rn6.bed.gz differ diff --git a/ChromHMM.jar b/ChromHMM.jar index 6349cd0..7341af4 100644 Binary files a/ChromHMM.jar and b/ChromHMM.jar differ diff --git a/ChromHMM.zip b/ChromHMM.zip index 1a364e5..e1f1f01 100644 Binary files a/ChromHMM.zip and b/ChromHMM.zip differ diff --git a/ChromHMM_manual.pdf b/ChromHMM_manual.pdf index 6aff163..55e5a32 100644 Binary files a/ChromHMM_manual.pdf and b/ChromHMM_manual.pdf differ diff --git a/edu/mit/compbio/ChromHMM/BrowserOutput.java b/edu/mit/compbio/ChromHMM/BrowserOutput.java index 0f3fd48..a4118aa 100644 --- a/edu/mit/compbio/ChromHMM/BrowserOutput.java +++ b/edu/mit/compbio/ChromHMM/BrowserOutput.java @@ -19,6 +19,7 @@ import java.io.*; import java.util.*; +import java.util.zip.GZIPOutputStream; /** * This class handles generating the browser output. @@ -119,9 +120,14 @@ else if (n1 > n2) */ int numstates; + /** + * True if files should be in gzip format + */ + boolean bgzip; + public BrowserOutput(String szsegmentfile, String szcolormapping,String szidlabelmapping, - String szsegmentationname, String szoutputfileprefix, int numstates) throws IOException + String szsegmentationname, String szoutputfileprefix, int numstates, boolean bgzip) throws IOException { this.szsegmentfile = szsegmentfile; this.szcolormapping =szcolormapping; @@ -129,6 +135,7 @@ public BrowserOutput(String szsegmentfile, String szcolormapping,String szidlabe this.szsegmentationname = szsegmentationname; this.szoutputfileprefix = szoutputfileprefix; this.numstates = numstates; + this.bgzip = bgzip; hmcolor = new HashMap(); hmlabelExtend = new HashMap(); @@ -358,43 +365,95 @@ private void makeLabelMapping() throws IOException */ public void makebrowserdense() throws IOException { - System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed"); - PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed")); + if (bgzip) + { + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed.gz"); + //PrintWriter pwzip = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed.gz")); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed.gz")); - BufferedReader brsegment = Util.getBufferedReader(szsegmentfile); - String szLine; - boolean bfirst = true; + BufferedReader brsegment = Util.getBufferedReader(szsegmentfile); + String szLine; + boolean bfirst = true; - while ((szLine =brsegment.readLine())!=null) + while ((szLine =brsegment.readLine())!=null) + { + StringTokenizer st = new StringTokenizer(szLine,"\t"); + String szcurrchrom = st.nextToken(); + int nbegin = Integer.parseInt(st.nextToken()); + int nend = Integer.parseInt(st.nextToken()); + String szFullID = st.nextToken(); + String szID = szFullID.substring(1); //this removes ordering type + if (bfirst) + { + String szout = "track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) + +" ordered)"+"\" visibility=1 itemRgb=\"On\""+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + //pw.println("track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) + // +" ordered)"+"\" visibility=1 itemRgb=\"On\""); + bfirst = false; + } + String szColor = (String) hmcolor.get(szID); + if (szColor == null) + { + throw new IllegalArgumentException("Color not given for "+szID); + } + + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get(szFullID))!=null) + { + szID = szID+"_"+szsuffix; + } + + String szout = szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor +"\n"; + //pw.println(szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor); + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + } + brsegment.close(); + pwzip.finish(); + pwzip.close(); + } + else { - StringTokenizer st = new StringTokenizer(szLine,"\t"); - String szcurrchrom = st.nextToken(); - int nbegin = Integer.parseInt(st.nextToken()); - int nend = Integer.parseInt(st.nextToken()); - String szFullID = st.nextToken(); - String szID = szFullID.substring(1); //this removes ordering type - if (bfirst) - { - pw.println("track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) - +" ordered)"+"\" visibility=1 itemRgb=\"On\""); - bfirst = false; - } - String szColor = (String) hmcolor.get(szID); - if (szColor == null) - { - throw new IllegalArgumentException("Color not given for "+szID); - } + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed"); + PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSERDENSEEXTENSION+".bed")); - String szsuffix; - if ((szsuffix = (String) hmlabelExtend.get(szFullID))!=null) - { - szID = szID+"_"+szsuffix; - } + BufferedReader brsegment = Util.getBufferedReader(szsegmentfile); + String szLine; + boolean bfirst = true; - pw.println(szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor); - } - brsegment.close(); - pw.close(); + while ((szLine =brsegment.readLine())!=null) + { + StringTokenizer st = new StringTokenizer(szLine,"\t"); + String szcurrchrom = st.nextToken(); + int nbegin = Integer.parseInt(st.nextToken()); + int nend = Integer.parseInt(st.nextToken()); + String szFullID = st.nextToken(); + String szID = szFullID.substring(1); //this removes ordering type + if (bfirst) + { + pw.println("track name=\""+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szFullID.charAt(0)) + +" ordered)"+"\" visibility=1 itemRgb=\"On\""); + bfirst = false; + } + String szColor = (String) hmcolor.get(szID); + if (szColor == null) + { + throw new IllegalArgumentException("Color not given for "+szID); + } + + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get(szFullID))!=null) + { + szID = szID+"_"+szsuffix; + } + + pw.println(szcurrchrom+"\t"+nbegin+"\t"+nend+"\t"+szID+"\t0\t.\t"+nbegin+"\t"+nend+"\t"+szColor); + } + brsegment.close(); + pw.close(); + } } @@ -407,8 +466,14 @@ public void makebrowserdense() throws IOException */ public void makebrowserexpanded() throws IOException { - System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed"); - PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed")); + if (bgzip) + { + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed.gz"); + } + else + { + System.out.println("Writing to file "+szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed"); + } String szLine; @@ -491,62 +556,148 @@ public void makebrowserexpanded() throws IOException } Arrays.sort(szChroms); - pw.println("track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) - +" ordered)"+"\" visibility=2 itemRgb=\"On\""); - int nbrowserend = (int) (((Integer)hmchromMax.get(szChroms[0])).intValue()*.001)+1; - pw.println("browser position "+szChroms[0]+":1-"+nbrowserend); - - for (int nlabel = szLabels.length-1; nlabel >=0; nlabel--) + if (bgzip) { - //UCSC browser seems to reverse the ordering of browser track files - String szcolor = (String) hmcolor.get(""+szLabels[nlabel]); - for (int nchrom = 0; nchrom < szChroms.length; nchrom++) - { - //omits those segment labels not observed at all on chromosome - ArrayList alRecs = (ArrayList) hmcoords.get(szChroms[nchrom]+"\t"+szLabels[nlabel]); - if (alRecs == null) continue; - - int nmax = ((Integer) hmchromMax.get(szChroms[nchrom])).intValue(); - - //this forces browser to display segment until the end of the chromosome - alRecs.add(new BeginEndRec(nmax-1,nmax)); - - int nsize = alRecs.size(); - int nmin = ((BeginEndRec) alRecs.get(0)).nbegin; - int nfinalend = nmax; - - String szoutlabel; - String szsuffix; - if ((szsuffix = (String) hmlabelExtend.get((String) hmlabelToFull.get(szLabels[nlabel])))!=null) - { - szoutlabel = szLabels[nlabel]+"_"+szsuffix; - } - else - { - szoutlabel = szLabels[nlabel]; - } + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed.gz")); + String szout = "track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) + +" ordered)"+"\" visibility=2 itemRgb=\"On\""+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println("track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) + // +" ordered)"+"\" visibility=2 itemRgb=\"On\""); + int nbrowserend = (int) (((Integer)hmchromMax.get(szChroms[0])).intValue()*.001)+1; + + szout = "browser position "+szChroms[0]+":1-"+nbrowserend+"\n"; + btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + //pwzip.println("browser position "+szChroms[0]+":1-"+nbrowserend); + + for (int nlabel = szLabels.length-1; nlabel >=0; nlabel--) + { + //UCSC browser seems to reverse the ordering of browser track files + String szcolor = (String) hmcolor.get(""+szLabels[nlabel]); + for (int nchrom = 0; nchrom < szChroms.length; nchrom++) + { + //omits those segment labels not observed at all on chromosome + ArrayList alRecs = (ArrayList) hmcoords.get(szChroms[nchrom]+"\t"+szLabels[nlabel]); + if (alRecs == null) continue; + + int nmax = ((Integer) hmchromMax.get(szChroms[nchrom])).intValue(); + + //this forces browser to display segment until the end of the chromosome + alRecs.add(new BeginEndRec(nmax-1,nmax)); + + int nsize = alRecs.size(); + int nmin = ((BeginEndRec) alRecs.get(0)).nbegin; + int nfinalend = nmax; + + String szoutlabel; + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get((String) hmlabelToFull.get(szLabels[nlabel])))!=null) + { + szoutlabel = szLabels[nlabel]+"_"+szsuffix; + } + else + { + szoutlabel = szLabels[nlabel]; + } + + StringBuffer sbout = new StringBuffer(); + sbout.append(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); + //pw.print(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); + //pw.print(0); //forcing the display to start at the beginning of the chromosome + sbout.append(0); + for (int ni = 0; ni < nsize; ni++) + { + BeginEndRec theBeginEndRec = (BeginEndRec) alRecs.get(ni); + int ndiff = theBeginEndRec.nend - theBeginEndRec.nbegin; + sbout.append(","); + sbout.append(ndiff); + //pw.print(","); + //pw.print(ndiff); + } + sbout.append("\t"); + sbout.append(0); + //pw.print("\t"); + //pw.print(0); + for (int ni = 0; ni < nsize; ni++) + { + int nloc = ((BeginEndRec) alRecs.get(ni)).nbegin; + sbout.append(","); + sbout.append(nloc); + //pw.print(","); + //pw.print(nloc); + } + sbout.append("\n"); + // pw.println(); + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + } + } + pwzip.finish(); + pwzip.close(); + } + else + { + PrintWriter pw = new PrintWriter(new FileWriter(szoutputfileprefix+ChromHMM.SZBROWSEREXPANDEDEXTENSION+".bed")); + pw.println("track name=\"Expanded_"+szsegmentationname+"\" description=\" "+szsegmentationname+" ("+ChromHMM.convertCharOrderToStringOrder(szLabelFull.charAt(0)) + +" ordered)"+"\" visibility=2 itemRgb=\"On\""); + int nbrowserend = (int) (((Integer)hmchromMax.get(szChroms[0])).intValue()*.001)+1; + pw.println("browser position "+szChroms[0]+":1-"+nbrowserend); - pw.print(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); - pw.print(0); //forcing the display to start at the beginning of the chromosome - for (int ni = 0; ni < nsize; ni++) - { - BeginEndRec theBeginEndRec = (BeginEndRec) alRecs.get(ni); - int ndiff = theBeginEndRec.nend - theBeginEndRec.nbegin; - pw.print(","); - pw.print(ndiff); - } - pw.print("\t"); - pw.print(0); - for (int ni = 0; ni < nsize; ni++) - { - int nloc = ((BeginEndRec) alRecs.get(ni)).nbegin; - pw.print(","); - pw.print(nloc); - } - pw.println(); - } + for (int nlabel = szLabels.length-1; nlabel >=0; nlabel--) + { + //UCSC browser seems to reverse the ordering of browser track files + String szcolor = (String) hmcolor.get(""+szLabels[nlabel]); + for (int nchrom = 0; nchrom < szChroms.length; nchrom++) + { + //omits those segment labels not observed at all on chromosome + ArrayList alRecs = (ArrayList) hmcoords.get(szChroms[nchrom]+"\t"+szLabels[nlabel]); + if (alRecs == null) continue; + + int nmax = ((Integer) hmchromMax.get(szChroms[nchrom])).intValue(); + + //this forces browser to display segment until the end of the chromosome + alRecs.add(new BeginEndRec(nmax-1,nmax)); + + int nsize = alRecs.size(); + int nmin = ((BeginEndRec) alRecs.get(0)).nbegin; + int nfinalend = nmax; + + String szoutlabel; + String szsuffix; + if ((szsuffix = (String) hmlabelExtend.get((String) hmlabelToFull.get(szLabels[nlabel])))!=null) + { + szoutlabel = szLabels[nlabel]+"_"+szsuffix; + } + else + { + szoutlabel = szLabels[nlabel]; + } + + pw.print(szChroms[nchrom]+"\t"+0+"\t"+nfinalend+"\t"+szoutlabel+"\t0\t.\t"+nmin+"\t"+nfinalend+"\t"+szcolor+"\t"+(nsize+1)+"\t"); + pw.print(0); //forcing the display to start at the beginning of the chromosome + for (int ni = 0; ni < nsize; ni++) + { + BeginEndRec theBeginEndRec = (BeginEndRec) alRecs.get(ni); + int ndiff = theBeginEndRec.nend - theBeginEndRec.nbegin; + pw.print(","); + pw.print(ndiff); + } + pw.print("\t"); + pw.print(0); + for (int ni = 0; ni < nsize; ni++) + { + int nloc = ((BeginEndRec) alRecs.get(ni)).nbegin; + pw.print(","); + pw.print(nloc); + } + pw.println(); + } + } + pw.close(); } - pw.close(); } } diff --git a/edu/mit/compbio/ChromHMM/ChromHMM.java b/edu/mit/compbio/ChromHMM/ChromHMM.java index 2351f94..8c72246 100644 --- a/edu/mit/compbio/ChromHMM/ChromHMM.java +++ b/edu/mit/compbio/ChromHMM/ChromHMM.java @@ -506,6 +506,11 @@ public class ChromHMM boolean bpseudo = false; + /** + * true if output should be zipped files + */ + boolean bgzip = false; + /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /** * Stores an integer index and array of boolean flags @@ -610,7 +615,7 @@ public ChromHMM(String szinputdir, String szoutputdir, String szinputfilelist,St int nmaxiterations,double dcovergediff,int nmaxseconds,boolean bprintposterior, boolean bprintsegment,boolean bprintstatebyline, int nbinsize,String szoutfileID,int nstateorder,boolean bordercols,int nzerotransitionpower, Color theColor, boolean bnormalEM, int nmaxprocessors, boolean blowmem, - int numincludeseq, boolean bprintimage, boolean bscaleemissions, boolean bpseudo) throws IOException + int numincludeseq, boolean bprintimage, boolean bscaleemissions, boolean bpseudo, boolean bgzip) throws IOException { this.szinputdir = szinputdir; this.szoutputdir = szoutputdir; @@ -643,6 +648,7 @@ public ChromHMM(String szinputdir, String szoutputdir, String szinputfilelist,St this.bprintimage = bprintimage; this.bscaleemissions = bscaleemissions; this.bpseudo = bpseudo; + this.bgzip = bgzip; hmlabelExtend = new HashMap(); theRandom = new Random(nseed); @@ -746,7 +752,8 @@ public ChromHMM(String szInitFile, String szoutputdir, String szstateorderingfil * Constructor initializes the variable and loads the data used for making segmentation from a model */ public ChromHMM(String szinputdir, String szinputfilelist, String szchromlengthfile, String szoutputdir, String szInitFile, String szoutfileID, - int nbinsize, boolean bprintposterior, boolean bprintsegment,boolean bprintstatebyline, boolean blowmem, boolean bscaleemissions) throws IOException + int nbinsize, boolean bprintposterior, boolean bprintsegment,boolean bprintstatebyline, + boolean blowmem, boolean bscaleemissions, boolean bgzip) throws IOException { this.szinputdir = szinputdir; this.szinputfilelist = szinputfilelist; @@ -760,6 +767,7 @@ public ChromHMM(String szinputdir, String szinputfilelist, String szchromlengthf this.nbinsize = nbinsize; this.blowmem = blowmem; this.bscaleemissions = bscaleemissions; + this.bgzip = bgzip; hmlabelExtend = new HashMap(); if (blowmem) @@ -4601,62 +4609,149 @@ else if (ch=='1') } hsprefix.add(szprefix); + GZIPOutputStream pwprobszip = null; PrintWriter pwprobs = null; - if (bprintposterior) + + GZIPOutputStream pwmaxzip = null; + PrintWriter pwmax = null; + + GZIPOutputStream pwbedzip = null; + PrintWriter pwbed = null; + + if (bgzip) { - //creates the posterior file + //PrintWriter pwprobs = null; + if (bprintposterior) + { + //creates the posterior file - String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION+".gz"; + + System.out.println("Writing to file "+szposterioroutfilename); + pwprobszip = new GZIPOutputStream(new FileOutputStream(szposterioroutfilename)); + //pwprobs = new PrintWriter(szposterioroutfilename); + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]+"\n"; + + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + + StringBuffer sbout = new StringBuffer(); + //pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + for (int ni = 0; ni < numstates-1; ni++) + { + sbout.append(""+chorder+(ni+1)+"\t"); + //pwprobs.print(""+chorder+(ni+1)+"\t"); + } + sbout.append(""+chorder+(numstates)+"\n"); + //pwprobs.println(""+chorder+(numstates)); + + btformat = sbout.toString().getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + //PrintWriter pwmax = null; - System.out.println("Writing to file "+szposterioroutfilename); - pwprobs = new PrintWriter(szposterioroutfilename); - pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - for (int ni = 0; ni < numstates-1; ni++) + + if (bprintstatebyline) { - pwprobs.print(""+chorder+(ni+1)+"\t"); - } - pwprobs.println(""+chorder+(numstates)); - } + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION+".gz"; + + System.out.println("Writing to file "+szmaxoutfilename); + //pwmax = new PrintWriter(szmaxoutfilename); + pwmaxzip = new GZIPOutputStream(new FileOutputStream(szmaxoutfilename)); + + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]+"\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + + szout = "MaxState "+chorder+"\n"; + btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwmax.println("MaxState "+chorder); + } - PrintWriter pwmax = null; - if (bprintstatebyline) - { - //creates a file which has the state with the maximum posterior probability - String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; + //PrintWriter pwbed = null; + //GZIPOutputStream pwbedzip = null; - System.out.println("Writing to file "+szmaxoutfilename); - pwmax = new PrintWriter(szmaxoutfilename); - pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - pwmax.println("MaxState "+chorder); - } + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbedzip = (GZIPOutputStream) hmcellToFourColPW.get(cellSeq[nordered_nseq]); + // (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - PrintWriter pwbed = null; - if (bprintsegment) + if (pwbedzip == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION+".gz"; + + pwbedzip = new GZIPOutputStream(new FileOutputStream(szsegmentoutfilename)); + //pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbedzip); + } + } + } + else { - //creates a file which has the maximum segmentation - //we only have one file per cell type here - pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - if (pwbed == null) + if (bprintposterior) { - //haven't seen this cell type - String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + //creates the posterior file + + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; + + System.out.println("Writing to file "+szposterioroutfilename); + pwprobs = new PrintWriter(szposterioroutfilename); + pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + for (int ni = 0; ni < numstates-1; ni++) + { + pwprobs.print(""+chorder+(ni+1)+"\t"); + } + pwprobs.println(""+chorder+(numstates)); + } - pwbed = new PrintWriter(szsegmentoutfilename); - System.out.println("Writing to file "+szsegmentoutfilename); + if (bprintstatebyline) + { + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; - hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + System.out.println("Writing to file "+szmaxoutfilename); + pwmax = new PrintWriter(szmaxoutfilename); + pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + pwmax.println("MaxState "+chorder); } - } + //PrintWriter pwbed = null; + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); + + if (pwbed == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + + pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + } + } + } if (bscaleemissions) - { + { for (int ni = 0; ni < nobserved; ni++) { //going through each combination of marks - //if (traindataObservedSeqFlags_nseq[ni]) + //if (traindataObservedSeqFlags_nseq[ni]) //{ //this signature of marks is observed on the current chromosome so //updating its emission probabilities @@ -4966,70 +5061,197 @@ else if (ch=='1') double dmaxval = 0; int nmaxstate = 0; - //handling the first line - for (int ns = 0; ns < gamma_nt.length; ns++) - { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) - { - if (ns > 0) - { - //print with tab if not the first - pwprobs.print("\t"+nf.format(dprob)); - } - else - { - pwprobs.print(nf.format(dprob)); - } - } + if (bgzip) + { + //handling the first line + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + //print with tab if not the first + szout = "\t"+nf.format(dprob); + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } - if (dprob > dmaxval) - { - //best one found so far - dmaxval = dprob; - nmaxstate = ns; - } - } + if (dprob > dmaxval) + { + //best one found so far + dmaxval = dprob; + nmaxstate = ns; + } + } - if (bprintposterior) - { - pwprobs.println(); - } + if (bprintposterior) + { + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(); + } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } + if (bprintstatebyline) + { + String szout = (nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } - //this contains the best state of the previous interval - int nmaxstateprev = nmaxstate; + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; - for (int nt = 1; nt < numtime_nseq; nt++) - { - gamma_nt = gamma[nt]; + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; + + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + //print with tab the first time + szout = "\t"+nf.format(dprob); + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(); + } - dmaxval = 0; - nmaxstate = 0; + if (bprintstatebyline) + { + String szout =""+(nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmaxzip.finish(); + pwmaxzip.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobszip.finish(); + pwprobszip.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + GZIPOutputStream pwzip = (GZIPOutputStream) itr.next(); + //PrintWriter pw = (PrintWriter) itr.next(); + pwzip.finish(); + pwzip.close(); + //pw.close(); + } + } + } + else + { + //handling the first line for (int ns = 0; ns < gamma_nt.length; ns++) { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + + if (bprintposterior) { if (ns > 0) { - //print with tab the first time pwprobs.print("\t"+nf.format(dprob)); - } - else - { + } + else + { pwprobs.print(nf.format(dprob)); } } if (dprob > dmaxval) { + //best one found so far dmaxval = dprob; nmaxstate = ns; } @@ -5037,64 +5259,108 @@ else if (ch=='1') if (bprintposterior) { - pwprobs.println(); + pwprobs.println(); } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } - - if (bprintsegment&&(nmaxstateprev != nmaxstate)) + if (bprintstatebyline) { - //print out last segment we are done with - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); - //start a new segment now - nstart = nt; - nmaxstateprev = nmaxstate; - } - } + pwmax.println(""+(nmaxstate+1)); + } - if (bprintsegment) - { - int nlastcoordinate; - Integer objMaxCoord = null; - if (hmMaxCoord != null) - { - objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); - } + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; - if (objMaxCoord != null) - { - nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); - } - else - { - nlastcoordinate = numtime_nseq*nbinsize; - } - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); - } + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; - //close out the max state file if that was requested - if (bprintstatebyline) - { - pwmax.close(); - } - //close out the posterior state file if that was requested - if (bprintposterior) - { - pwprobs.close(); - } - } + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + if (ns > 0) + { + //print with tab the first time + pwprobs.print("\t"+nf.format(dprob)); + } + else + { + pwprobs.print(nf.format(dprob)); + } + } - //if segment print was requested then we are going to go close those printwriters - if (bprintsegment) - { - Iterator itr = hmcellToFourColPW.values().iterator(); - while (itr.hasNext()) - { - PrintWriter pw = (PrintWriter) itr.next(); - pw.close(); + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + pwprobs.println(); + } + + if (bprintstatebyline) + { + pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmax.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobs.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + PrintWriter pw = (PrintWriter) itr.next(); + pw.close(); + } + } } } } @@ -5209,51 +5475,125 @@ public void makeSegmentation() throws IOException } hsprefix.add(szprefix); + + GZIPOutputStream pwprobszip = null; + GZIPOutputStream pwmaxzip = null; + GZIPOutputStream pwbedzip = null; PrintWriter pwprobs = null; - if (bprintposterior) + PrintWriter pwmax = null; + PrintWriter pwbed = null; + + if (bgzip) { - //creates the posterior file + if (bprintposterior) + { + //creates the posterior file - String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION+".gz"; + + System.out.println("Writing to file "+szposterioroutfilename); + //pwprobs = new PrintWriter(szposterioroutfilename); + pwprobszip = new GZIPOutputStream(new FileOutputStream(szposterioroutfilename)); + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]+"\n"; + + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + + //pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + StringBuffer sbout = new StringBuffer(); + for (int ni = 0; ni < numstates-1; ni++) + { + sbout.append(""+chorder+(ni+1)+"\t"); + //pwprobs.print(""+chorder+(ni+1)+"\t"); + } + sbout.append(""+chorder+(numstates)+"\n"); + + btformat = sbout.toString().getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(""+chorder+(numstates)); + } - System.out.println("Writing to file "+szposterioroutfilename); - pwprobs = new PrintWriter(szposterioroutfilename); - pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - for (int ni = 0; ni < numstates-1; ni++) + if (bprintstatebyline) { - pwprobs.print(""+chorder+(ni+1)+"\t"); - } - pwprobs.println(""+chorder+(numstates)); - } + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION+".gz"; + System.out.println("Writing to file "+szmaxoutfilename); + pwmaxzip = new GZIPOutputStream(new FileOutputStream(szmaxoutfilename)); + //pwmax = new PrintWriter(szmaxoutfilename); + String szout = cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq] +"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + + szout = "MaxState "+chorder+"\n"; + + btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + //pwmax.println("MaxState "+chorder); + } - PrintWriter pwmax = null; - if (bprintstatebyline) - { - //creates a file which has the state with the maximum posterior probability - String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbedzip = (GZIPOutputStream) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - System.out.println("Writing to file "+szmaxoutfilename); - pwmax = new PrintWriter(szmaxoutfilename); - pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); - pwmax.println("MaxState "+chorder); - } + if (pwbedzip == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION+".gz"; - PrintWriter pwbed = null; - if (bprintsegment) + pwbedzip = new GZIPOutputStream(new FileOutputStream(szsegmentoutfilename)); + //pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbedzip); + } + } + } + else { - //creates a file which has the maximum segmentation - //we only have one file per cell type here - pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); + if (bprintposterior) + { + //creates the posterior file + + String szposterioroutfilename = szoutputdir+"/POSTERIOR/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZPOSTERIOREXTENSION; - if (pwbed == null) + System.out.println("Writing to file "+szposterioroutfilename); + pwprobs = new PrintWriter(szposterioroutfilename); + pwprobs.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + for (int ni = 0; ni < numstates-1; ni++) + { + pwprobs.print(""+chorder+(ni+1)+"\t"); + } + pwprobs.println(""+chorder+(numstates)); + } + + if (bprintstatebyline) { - //haven't seen this cell type - String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + //creates a file which has the state with the maximum posterior probability + String szmaxoutfilename = szoutputdir+"/STATEBYLINE/"+szprefix+"_"+chromSeq[nordered_nseq]+ChromHMM.SZSTATEBYLINEEXTENSION; + System.out.println("Writing to file "+szmaxoutfilename); + pwmax = new PrintWriter(szmaxoutfilename); + pwmax.println(cellSeq[nordered_nseq]+"\t"+chromSeq[nordered_nseq]); + pwmax.println("MaxState "+chorder); + } - pwbed = new PrintWriter(szsegmentoutfilename); - System.out.println("Writing to file "+szsegmentoutfilename); + //PrintWriter pwbed = null; + if (bprintsegment) + { + //creates a file which has the maximum segmentation + //we only have one file per cell type here + pwbed = (PrintWriter) hmcellToFourColPW.get(cellSeq[nordered_nseq]); - hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + if (pwbed == null) + { + //haven't seen this cell type + String szsegmentoutfilename = szoutputdir+"/" + szprefix+SZSEGMENTEXTENSION; + + pwbed = new PrintWriter(szsegmentoutfilename); + System.out.println("Writing to file "+szsegmentoutfilename); + hmcellToFourColPW.put(cellSeq[nordered_nseq],pwbed); + } } } @@ -5572,70 +5912,197 @@ public void makeSegmentation() throws IOException double dmaxval = 0; int nmaxstate = 0; - //handling the first line - for (int ns = 0; ns < gamma_nt.length; ns++) - { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) - { - if (ns > 0) - { - //print with tab if not the first - pwprobs.print("\t"+nf.format(dprob)); - } - else - { - pwprobs.print(nf.format(dprob)); - } - } + if (bgzip) + { + //handling the first line + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + szout = "\t"+nf.format(dprob); + //print with tab if not the first + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } - if (dprob > dmaxval) - { - //best one found so far - dmaxval = dprob; - nmaxstate = ns; - } - } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } - if (bprintposterior) - { - pwprobs.println(); - } + if (dprob > dmaxval) + { + //best one found so far + dmaxval = dprob; + nmaxstate = ns; + } + } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } + if (bprintposterior) + { + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + //pwprobs.println(); + } - //this contains the best state of the previous interval - int nmaxstateprev = nmaxstate; + if (bprintstatebyline) + { + String szout = ""+(nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } - for (int nt = 1; nt < numtime_nseq; nt++) - { - gamma_nt = gamma[nt]; + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; + + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; + + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + String szout; + if (ns > 0) + { + szout = "\t"+nf.format(dprob); + //print with tab the first time + //pwprobs.print("\t"+nf.format(dprob)); + } + else + { + szout = nf.format(dprob); + //pwprobs.print(nf.format(dprob)); + } + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + //pwprobs.println(); + String szout = "\n"; + byte[] btformat = szout.getBytes(); + pwprobszip.write(btformat,0,btformat.length); + } + + if (bprintstatebyline) + { + String szout = (nmaxstate+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwmaxzip.write(btformat,0,btformat.length); + //pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1) +"\n"; + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + //pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); - dmaxval = 0; - nmaxstate = 0; + String szout = chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)+"\n"; + byte[] btformat = szout.getBytes(); + pwbedzip.write(btformat,0,btformat.length); + + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmaxzip.finish(); + pwmaxzip.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobszip.finish(); + pwprobszip.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + GZIPOutputStream pwzip = (GZIPOutputStream) itr.next(); + pwzip.finish(); + pwzip.close(); + } + } + } + else + { + //handling the first line for (int ns = 0; ns < gamma_nt.length; ns++) { - int nmappedstate = stateordering[ns]; //maps new state to old - double dprob = gamma_nt[nmappedstate]; - if (bprintposterior) + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) { if (ns > 0) { - //print with tab the first time + //print with tab if not the first pwprobs.print("\t"+nf.format(dprob)); - } - else - { + } + else + { pwprobs.print(nf.format(dprob)); } } if (dprob > dmaxval) { + //best one found so far dmaxval = dprob; nmaxstate = ns; } @@ -5646,61 +6113,105 @@ public void makeSegmentation() throws IOException pwprobs.println(); } - if (bprintstatebyline) - { - pwmax.println(""+(nmaxstate+1)); - } - - if (bprintsegment&&(nmaxstateprev != nmaxstate)) + if (bprintstatebyline) { - //print out last segment we are done with - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); - //start a new segment now - nstart = nt; - nmaxstateprev = nmaxstate; - } - } + pwmax.println(""+(nmaxstate+1)); + } - if (bprintsegment) - { - int nlastcoordinate; - Integer objMaxCoord = null; - if (hmMaxCoord != null) - { - objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); - } + //this contains the best state of the previous interval + int nmaxstateprev = nmaxstate; - if (objMaxCoord != null) - { - nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); - } - else - { - nlastcoordinate = numtime_nseq*nbinsize; - } - pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); - } + for (int nt = 1; nt < numtime_nseq; nt++) + { + gamma_nt = gamma[nt]; - //close out the max state file if that was requested - if (bprintstatebyline) - { - pwmax.close(); - } - //close out the posterior state file if that was requested - if (bprintposterior) - { - pwprobs.close(); - } - } + dmaxval = 0; + nmaxstate = 0; + for (int ns = 0; ns < gamma_nt.length; ns++) + { + int nmappedstate = stateordering[ns]; //maps new state to old + double dprob = gamma_nt[nmappedstate]; + if (bprintposterior) + { + if (ns > 0) + { + //print with tab the first time + pwprobs.print("\t"+nf.format(dprob)); + } + else + { + pwprobs.print(nf.format(dprob)); + } + } - //if segment print was requested then we are going to go close those printwriters - if (bprintsegment) - { - Iterator itr = hmcellToFourColPW.values().iterator(); - while (itr.hasNext()) - { - PrintWriter pw = (PrintWriter) itr.next(); - pw.close(); + if (dprob > dmaxval) + { + dmaxval = dprob; + nmaxstate = ns; + } + } + + if (bprintposterior) + { + pwprobs.println(); + } + + if (bprintstatebyline) + { + pwmax.println(""+(nmaxstate+1)); + } + + if (bprintsegment&&(nmaxstateprev != nmaxstate)) + { + //print out last segment we are done with + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+(nt*nbinsize)+"\t"+chorder+(nmaxstateprev+1)); + //start a new segment now + nstart = nt; + nmaxstateprev = nmaxstate; + } + } + + if (bprintsegment) + { + int nlastcoordinate; + Integer objMaxCoord = null; + if (hmMaxCoord != null) + { + objMaxCoord = ((Integer) hmMaxCoord.get(chromSeq[nordered_nseq])); + } + + if (objMaxCoord != null) + { + nlastcoordinate = Math.min(numtime_nseq*nbinsize,((Integer) objMaxCoord).intValue()); + } + else + { + nlastcoordinate = numtime_nseq*nbinsize; + } + pwbed.println(chromSeq[nordered_nseq]+"\t"+(nstart*nbinsize)+"\t"+nlastcoordinate+"\t"+chorder+(nmaxstateprev+1)); + } + + //close out the max state file if that was requested + if (bprintstatebyline) + { + pwmax.close(); + } + //close out the posterior state file if that was requested + if (bprintposterior) + { + pwprobs.close(); + } + + //if segment print was requested then we are going to go close those printwriters + if (bprintsegment) + { + Iterator itr = hmcellToFourColPW.values().iterator(); + while (itr.hasNext()) + { + PrintWriter pw = (PrintWriter) itr.next(); + pw.close(); + } + } } } } @@ -10665,10 +11176,11 @@ public static void main(String[] args) throws IOException if (szcommand.equalsIgnoreCase("Version")) { - System.out.println("This is Version 1.15 of ChromHMM (c) Copyright 2008-2012 Massachusetts Institute of Technology"); + System.out.println("This is Version 1.16 of ChromHMM (c) Copyright 2008-2012 Massachusetts Institute of Technology"); } else if ((szcommand.equals("BinarizeBam"))||(szcommand.equalsIgnoreCase("BinarizeBed"))) { + boolean bgzip = false; boolean bpairend = false; String szcontroldir=null; int nflankwidthcontrol = 5; @@ -10725,6 +11237,10 @@ else if (args[nargindex].equals("-g")) { dcountthresh = Double.parseDouble(args[++nargindex]); } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-n")) { nshift = Integer.parseInt(args[++nargindex]); @@ -10830,7 +11346,7 @@ else if (args[nargindex].equals("-w")) nshift,bcenterinterval, noffsetleft,noffsetright,szoutputsignaldir, szoutputbinarydir,szoutputcontroldir, dpoissonthresh,dfoldthresh,bcontainsthresh, - npseudocountcontrol,nbinsize,szcolfields,bpeaks, dcountthresh,szcommand.equalsIgnoreCase("BinarizeBam"),bpairend); + npseudocountcontrol,nbinsize,szcolfields,bpeaks, dcountthresh,szcommand.equalsIgnoreCase("BinarizeBam"),bpairend, bgzip); } else @@ -10844,14 +11360,14 @@ else if (args[nargindex].equals("-w")) if (szcommand.equalsIgnoreCase("BinarizeBed")) { System.out.println("usage BinarizeBed [-b binsize][-c controldir][-center][-colfields chromosome,start,end[,strand]][-e offsetend][-f foldthresh]"+ - "[-g signalthresh][-n shift][-o outputcontroldir][-p poissonthresh][-peaks][-s offsetstart][-strictthresh][-t outputsignaldir]"+ + "[-g signalthresh][-gzip][-n shift][-o outputcontroldir][-p poissonthresh][-peaks][-s offsetstart][-strictthresh][-t outputsignaldir]"+ "[-u pseudocountcontrol][-w flankwidthcontrol] "+ "chromosomelengthfile inputbeddir cellmarkfiletable outputbinarydir"); } else { System.out.println("usage BinarizeBam [-b binsize][-c controldir][-e offsetend][-f foldthresh]"+ - "[-g signalthresh][-o outputcontroldir][-p poissonthresh][-paired|[-center][-n shift][-peaks]]"+ + "[-g signalthresh][-gzip][-o outputcontroldir][-p poissonthresh][-paired|[-center][-n shift][-peaks]]"+ "[-s offsetstart][-strictthresh][-t outputsignaldir]"+ "[-u pseudocountcontrol][-w flankwidthcontrol] "+ "chromosomelengthfile inputbamdir cellmarkfiletable outputbinarydir"); @@ -10861,6 +11377,7 @@ else if (args[nargindex].equals("-w")) else if (szcommand.equalsIgnoreCase("BinarizeSignal")) { boolean bcontainsthresh = true; + boolean bgzip = false; double dfoldthresh = 0; double dcountthresh = 0; double dpoissonthresh = 0.0001; @@ -10894,6 +11411,10 @@ else if (args[nargindex].equals("-g")) { dcountthresh = Double.parseDouble(args[++nargindex]); } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-p")) { dpoissonthresh = Double.parseDouble(args[++nargindex]); @@ -10940,11 +11461,13 @@ else if (args[nargindex].equals("-w")) if (szcontroldir!=null) { Preprocessing.makeBinaryDataFromSignalAgainstControl(szsignaldir,szcontroldir, szoutputdir, - dpoissonthresh, dfoldthresh,bcontainsthresh, nflankwidthcontrol,npseudocountcontrol, dcountthresh); + dpoissonthresh, dfoldthresh,bcontainsthresh, + nflankwidthcontrol,npseudocountcontrol, dcountthresh,bgzip); } else { - Preprocessing.makeBinaryDataFromSignalUniform(szsignaldir, szoutputdir, dpoissonthresh, dfoldthresh, bcontainsthresh,dcountthresh); + Preprocessing.makeBinaryDataFromSignalUniform(szsignaldir, szoutputdir, dpoissonthresh, + dfoldthresh, bcontainsthresh,dcountthresh,bgzip); } } else @@ -10955,7 +11478,7 @@ else if (args[nargindex].equals("-w")) if (!bok) { - System.out.println("usage BinarizeSignal [-c controldir][-f foldthresh][-g signalthresh][-p poissonthresh][-strictthresh][-u pseudocountcontrol][-w flankwidth] signaldir outputdir"); + System.out.println("usage BinarizeSignal [-c controldir][-f foldthresh][-g signalthresh][-gzip][-p poissonthresh][-strictthresh][-u pseudocountcontrol][-w flankwidth] signaldir outputdir"); } } @@ -11208,7 +11731,7 @@ else if (szcommand.equalsIgnoreCase("MakeSegmentation")) int nbinsize = ChromHMM.DEFAULT_BINSIZEBASEPAIRS; String szoutfileID = ""; boolean blowmem = false; - + boolean bgzip = false; int nargindex = 1; try @@ -11223,6 +11746,10 @@ else if (args[nargindex].equals("-f")) { szinputfilelist = args[++nargindex]; } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-i")) { szoutfileID = args[++nargindex]; @@ -11307,7 +11834,7 @@ else if (args[nargindex].equals("-printstatesbyline")) { ChromHMM theHMM = new ChromHMM(szinputdir, szinputfilelist,szchromlengthfile, szoutputdir, szmodelfile, szoutfileID, nbinsize, bprintposterior, - bprintsegments,bprintstatebyline, blowmem,bscaleemissions); + bprintsegments,bprintstatebyline, blowmem,bscaleemissions, bgzip); if (blowmem) { @@ -11330,7 +11857,7 @@ else if (args[nargindex].equals("-printstatesbyline")) if (!bok) { - System.out.println("usage: MakeSegmentation [-b binsize][-f inputfilelist][-i outfileID][-l chromosomelengthfile][-lowmem][-many][-nobed]"+ + System.out.println("usage: MakeSegmentation [-b binsize][-f inputfilelist][-gzip][-i outfileID][-l chromosomelengthfile][-lowmem][-many][-nobed]"+ "[-printposterior][-printstatesbyline]"+ " modelfile inputdir outputdir"); } @@ -11339,6 +11866,7 @@ else if (szcommand.equalsIgnoreCase("MakeBrowserFiles")) { String szcolormapping = null; String szlabelmapping = null; + boolean bgzip = false; int nargindex = 1; int numstates = -1; @@ -11349,6 +11877,10 @@ else if (szcommand.equalsIgnoreCase("MakeBrowserFiles")) { szcolormapping = args[++nargindex]; } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if ((args[nargindex].equals("-l"))||(args[nargindex].equals("-m"))) { //the -l is for backwards compatibility @@ -11372,7 +11904,7 @@ else if (args[nargindex].equals("-n")) String szsegmentationname = args[nargindex++]; String szoutputfileprefix =args[nargindex]; BrowserOutput theBrowserOutput = new BrowserOutput(szsegmentfile,szcolormapping,szlabelmapping, - szsegmentationname, szoutputfileprefix,numstates); + szsegmentationname, szoutputfileprefix,numstates,bgzip); theBrowserOutput.makebrowserdense(); theBrowserOutput.makebrowserexpanded(); } @@ -11383,7 +11915,7 @@ else if (args[nargindex].equals("-n")) if (!bok) { - System.out.println("usage: MakeBrowserFiles [-c colormappingfile][-m labelmappingfile][-n numstates] segmentfile segmentationname outputfileprefix"); + System.out.println("usage: MakeBrowserFiles [-c colormappingfile][-gzip][-m labelmappingfile][-n numstates] segmentfile segmentationname outputfileprefix"); } } else if (szcommand.equalsIgnoreCase("OverlapEnrichment")) @@ -11729,6 +12261,10 @@ else if (szcommand.equalsIgnoreCase("LearnModel")) String path = ChromHMM.class.getProtectionDomain().getCodeSource().getLocation().getPath(); String decodedPath = URLDecoder.decode(path, "UTF-8"); String szprefixpath = decodedPath.substring(0, decodedPath.lastIndexOf("/") + 1); + //String szfullprefixpathchromsizedir = szprefixpath+"/"+CHROMSIZESDIR; + String szfullprefixpathcoorddir = szprefixpath+"/"+COORDDIR; + String szfullprefixpathanchorfiledir = szprefixpath+"/"+ANCHORFILEDIR; + //System.out.println("prefix path is "+szprefixpath); @@ -11750,6 +12286,7 @@ else if (szcommand.equalsIgnoreCase("LearnModel")) boolean blowmem = false; boolean bscaleemissions = false; boolean bpseudo = false; + boolean bgzip = false; String szinputfilelist = null; String szchromlengthfile = null; @@ -11802,6 +12339,10 @@ else if (args[nargindex].equals("-f")) { szinputfilelist = args[++nargindex]; } + else if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } else if (args[nargindex].equals("-h")) { dinformationsmooth = Double.parseDouble(args[++nargindex]); @@ -11918,6 +12459,18 @@ else if (args[nargindex].equals("-printstatebyline")) { bprintstatebyline = true; } + //else if (args[nargindex].equals("-u")) + //{ + // szfullprefixpathchromsizedir = args[++nargindex]; + //} + else if (args[nargindex].equals("-u")) + { + szfullprefixpathcoorddir = args[++nargindex]; + } + else if (args[nargindex].equals("-v")) + { + szfullprefixpathanchorfiledir = args[++nargindex]; + } else if (args[nargindex].equals("-x")) { nmaxseconds = Integer.parseInt(args[++nargindex]); @@ -12000,9 +12553,12 @@ else if (args[nargindex].equals("-z")) if (szchromlengthfile == null) { + //updated v.1.16 File flength = new File(szprefixpath+"/"+CHROMSIZESDIR+"/"+szassembly+".txt"); + //File flength = new File(szfullprefixpathchromsizedir+"/"+szassembly+".txt"); if (flength.exists()) { + //szchromlengthfile = szfullprefixpathchromsizedir+"/"+szassembly+".txt"; szchromlengthfile = szprefixpath+"/"+CHROMSIZESDIR+"/"+szassembly+".txt"; } } @@ -12010,7 +12566,7 @@ else if (args[nargindex].equals("-z")) szInitFile,dloadsmoothemission,dloadsmoothtransition,dinformationsmooth, nmaxiterations,dconvergediff,nmaxseconds, bprintposterior,bprintsegments,bprintstatebyline, nbinsize,szoutfileID,nstateorder,bordercols,nzerotransitionpower,theColor,bnormalEM, nmaxprocessors, - blowmem,numincludeseq,bprintimage,bscaleemissions, bpseudo); + blowmem,numincludeseq,bprintimage,bscaleemissions, bpseudo, bgzip); theHMM.buildModel(); @@ -12083,7 +12639,15 @@ else if (args[nargindex].equals("-z")) for (int nfile = 0; nfile < prefixes.length; nfile++) { - String szsegmentfile = prefixes[nfile]+ChromHMM.SZSEGMENTEXTENSION; + String szsegmentfile; + if (bgzip) + { + szsegmentfile = prefixes[nfile]+ChromHMM.SZSEGMENTEXTENSION+".gz"; + } + else + { + szsegmentfile = prefixes[nfile]+ChromHMM.SZSEGMENTEXTENSION; + } pwweb.println("
  • "+prefixes[nfile]+" Segmentation File (Four Column Bed File)
    "); } @@ -12107,17 +12671,33 @@ else if (args[nargindex].equals("-z")) { String szprefix = prefixes[nprefixindex]; - String szsegmentfile = szoutputdir+"/"+szprefix+ChromHMM.SZSEGMENTEXTENSION; + String szsegmentfile; + if (bgzip) + { + szsegmentfile = szoutputdir+"/"+szprefix+ChromHMM.SZSEGMENTEXTENSION+".gz"; + } + else + { + szsegmentfile = szoutputdir+"/"+szprefix+ChromHMM.SZSEGMENTEXTENSION; + } + BrowserOutput theBrowserOutput = new BrowserOutput(szsegmentfile,null,null, - szprefix,szoutputdir+"/"+szprefix,numstates); + szprefix,szoutputdir+"/"+szprefix,numstates,bgzip); theBrowserOutput.makebrowserdense(); theBrowserOutput.makebrowserexpanded(); - pwweb.println("
  • "+szprefix+" Browser Custom Track Dense File
    "); - pwweb.println("
  • "+szprefix+" Browser Custom Track Expanded File
    "); - + if (bgzip) + { + pwweb.println("
  • "+szprefix+" Browser Custom Track Dense File
    "); + pwweb.println("
  • "+szprefix+" Browser Custom Track Expanded File
    "); + } + else + { + pwweb.println("
  • "+szprefix+" Browser Custom Track Dense File
    "); + pwweb.println("
  • "+szprefix+" Browser Custom Track Expanded File
    "); + } } } @@ -12130,13 +12710,22 @@ else if (args[nargindex].equals("-z")) { String szprefix = prefixes[nprefixindex]; pwweb.println("

    "+szprefix+" Enrichments

    "); - String szsegmentfile = szoutputdir+"/"+szprefix+ChromHMM.SZSEGMENTEXTENSION; - File fcoordassembly = new File(szprefixpath+"/"+COORDDIR+"/"+szassembly); + String szsegmentfile; + if (bgzip) + { + szsegmentfile = szoutputdir+"/"+szprefix+ChromHMM.SZSEGMENTEXTENSION+".gz"; + } + else + { + szsegmentfile = szoutputdir+"/"+szprefix+ChromHMM.SZSEGMENTEXTENSION; + } + //File fcoordassembly = new File(szprefixpath+"/"+COORDDIR+"/"+szassembly); + File fcoordassembly = new File(szfullprefixpathcoorddir+"/"+szassembly); if (fcoordassembly.exists()) { if (blowmem) { - StateAnalysis.enrichmentMaxLowMem(szsegmentfile,szprefixpath+"/"+COORDDIR+"/"+szassembly,null,//szinputcoordlist, + StateAnalysis.enrichmentMaxLowMem(szsegmentfile,szfullprefixpathcoorddir+"/"+szassembly,null,//szinputcoordlist, ChromHMM.DEFAULT_OVERLAPENRICHMENT_NOFFSETLEFT, ChromHMM.DEFAULT_OVERLAPENRICHMENT_NOFFSETRIGHT,nbinsize, ChromHMM.DEFAULT_OVERLAPENRICHMENT_BCENTER, !ChromHMM.DEFAULT_OVERLAPENRICHMENT_BCOUNTMULTI, @@ -12146,7 +12735,9 @@ else if (args[nargindex].equals("-z")) } else { - StateAnalysis.enrichmentMax(szsegmentfile,szprefixpath+"/"+COORDDIR+"/"+szassembly,null,//szinputcoordlist, + //update v.1.16 + //StateAnalysis.enrichmentMax(szsegmentfile,szprefixpath+"/"+COORDDIR+"/"+szassembly,null,//szinputcoordlist, + StateAnalysis.enrichmentMax(szsegmentfile,szfullprefixpathcoorddir+"/"+szassembly,null,//szinputcoordlist, ChromHMM.DEFAULT_OVERLAPENRICHMENT_NOFFSETLEFT, ChromHMM.DEFAULT_OVERLAPENRICHMENT_NOFFSETRIGHT,nbinsize, ChromHMM.DEFAULT_OVERLAPENRICHMENT_BCENTER, !ChromHMM.DEFAULT_OVERLAPENRICHMENT_BCOUNTMULTI, @@ -12165,10 +12756,12 @@ else if (args[nargindex].equals("-z")) } else { - System.out.println("Warning: No coordinate directory found for assembly "+szassembly+" in "+szprefixpath+"/"+COORDDIR); + System.out.println("Warning: No coordinate directory found for assembly "+szassembly+" in "+szfullprefixpathcoorddir);//szprefixpath+"/"+COORDDIR); } - File fanchorassembly = new File(szprefixpath+"/"+ANCHORFILEDIR+"/"+szassembly); + //szfullprefixpathanchorfiledir + //File fanchorassembly = new File(szprefixpath+"/"+ANCHORFILEDIR+"/"+szassembly); + File fanchorassembly = new File(szfullprefixpathanchorfiledir+"/"+szassembly); if (fanchorassembly.exists()) { String[] dir = fanchorassembly.list(); @@ -12194,8 +12787,8 @@ else if (args[nargindex].equals("-z")) if (blowmem) { - StateAnalysis.neighborhoodMaxLowMem(szsegmentfile,szprefixpath+"/"+ - ANCHORFILEDIR+"/"+szassembly+"/"+dir[nfile],nbinsize,ChromHMM.DEFAULT_NEIGHBORHOOD_NUMLEFT, + StateAnalysis.neighborhoodMaxLowMem(szsegmentfile, + szfullprefixpathanchorfiledir+"/"+szassembly+"/"+dir[nfile],nbinsize,ChromHMM.DEFAULT_NEIGHBORHOOD_NUMLEFT, ChromHMM.DEFAULT_NEIGHBORHOOD_NUMRIGHT, nbinsize,//nspacing ChromHMM.DEFAULT_NEIGHBORHOOD_BUSESTRAND,ChromHMM.DEFAULT_NEIGHBORHOOD_BUSESIGNAL,null,//szcolfields, ChromHMM.DEFAULT_NEIGHBORHOOD_NOFFSETANCHOR,szoutputdir+"/"+szprefix+"_"+szanchorname+"_neighborhood", @@ -12203,8 +12796,8 @@ else if (args[nargindex].equals("-z")) } else { - StateAnalysis.neighborhoodMax(szsegmentfile,szprefixpath+"/"+ - ANCHORFILEDIR+"/"+szassembly+"/"+dir[nfile],nbinsize,ChromHMM.DEFAULT_NEIGHBORHOOD_NUMLEFT, + StateAnalysis.neighborhoodMax(szsegmentfile, + szfullprefixpathanchorfiledir+"/"+szassembly+"/"+dir[nfile],nbinsize,ChromHMM.DEFAULT_NEIGHBORHOOD_NUMLEFT, ChromHMM.DEFAULT_NEIGHBORHOOD_NUMRIGHT, nbinsize,//nspacing ChromHMM.DEFAULT_NEIGHBORHOOD_BUSESTRAND,ChromHMM.DEFAULT_NEIGHBORHOOD_BUSESIGNAL,null,//szcolfields, ChromHMM.DEFAULT_NEIGHBORHOOD_NOFFSETANCHOR,szoutputdir+"/"+szprefix+"_"+szanchorname+"_neighborhood", @@ -12222,7 +12815,7 @@ else if (args[nargindex].equals("-z")) } else { - System.out.println("Warning: No coordinate directory found for assembly "+szassembly+" in "+szprefixpath+"/"+ANCHORFILEDIR); + System.out.println("Warning: No coordinate directory found for assembly "+szassembly+" in "+szfullprefixpathanchorfiledir); } } } @@ -12247,11 +12840,11 @@ else if (args[nargindex].equals("-z")) if (!bok) { - System.out.println("usage: LearnModel [-b binsize][-color r,g,b][-d convergedelta][-e loadsmoothemission][-f inputfilelist][-h informationsmooth]"+ + System.out.println("usage: LearnModel [-b binsize][-color r,g,b][-d convergedelta][-e loadsmoothemission][-f inputfilelist][-gzip][-h informationsmooth]"+ "[-holdcolumnorder][-i outfileID][-init information|random|load][-l chromosomelengthfile][-lowmem][-m modelinitialfile][-many]"+ "[-n numseq][-nobed][-nobrowser][-noenrich][-noimage][-p maxprocessors][-pseudo][-printposterior][-printstatebyline][-r maxiterations][-s seed]"+ "[-stateordering emission|transition]"+ - "[-t loadsmoothtransition][-x maxseconds][-z zerotransitionpower] inputdir outputdir numstates assembly"); + "[-t loadsmoothtransition][-u coorddir][-v anchorfiledir][-x maxseconds][-z zerotransitionpower] inputdir outputdir numstates assembly"); } } else if (szcommand.equalsIgnoreCase("Reorder")) @@ -12380,12 +12973,124 @@ else if (args[nargindex].equals("-holdcolumnorder")) "[-m labelmappingfile][-noimage][-o stateorderingfile][-stateordering emission|transition] inputmodel outputdir"); } + } + else if (szcommand.equalsIgnoreCase("ConvertGeneTable")) + { + String path = ChromHMM.class.getProtectionDomain().getCodeSource().getLocation().getPath(); + String decodedPath = URLDecoder.decode(path, "UTF-8"); + String szprefixpath = decodedPath.substring(0, decodedPath.lastIndexOf("/") + 1); + //String szfullprefixpathchromsizedir = szprefixpath+"/"+CHROMSIZESDIR; + String szfullprefixpathcoorddir = szprefixpath+"/"+COORDDIR; + String szfullprefixpathanchorfiledir = szprefixpath+"/"+ANCHORFILEDIR; + String szchromlengthfile = null; + int nargindex = 1; + int npromoterwindow = 2000; + boolean bgzip = false; + String szcoorddir; + String szanchordir; + try + { + while (nargindex < args.length-3) + { + //else if (args[nargindex].equals("-u")) + // { + // szfullprefixpathchromsizedir = args[++nargindex]; + // } + if (args[nargindex].equals("-gzip")) + { + bgzip = true; + } + else if (args[nargindex].equals("-l")) + { + szchromlengthfile = args[++nargindex]; + } + else if (args[nargindex].equals("-u")) + { + szfullprefixpathcoorddir = args[++nargindex]; + } + else if (args[nargindex].equals("-v")) + { + szfullprefixpathanchorfiledir = args[++nargindex]; + } + else if (args[nargindex].equals("-w")) + { + npromoterwindow = Integer.parseInt(args[++nargindex]); + } + else + { + bok = false; + } + nargindex++; + } + } + catch (NumberFormatException ex) + { + bok = false; + } + + if (bok && (nargindex==args.length-3)) + { + String sztable = args[nargindex++]; + //String szchromlengths = args[nargindex++]; + String szprefix = args[nargindex++]; + String szassembly = args[nargindex++]; + + szcoorddir = szfullprefixpathcoorddir + "/" + szassembly; + szanchordir = szfullprefixpathanchorfiledir + "/" + szassembly; + + File f = new File(szcoorddir); + if (!f.exists()) + { + if (!f.mkdirs()) + { + throw new IllegalArgumentException(szcoorddir+" does not exist and could not be created!"); + } + } + + f = new File(szanchordir); + if (!f.exists()) + { + if (!f.mkdirs()) + { + throw new IllegalArgumentException(szanchordir+" does not exist and could not be created!"); + } + } + + if (szchromlengthfile == null) + { + //updated v.1.16 + String szdefaultlengthfile = szprefixpath+"/"+CHROMSIZESDIR+"/"+szassembly+".txt"; + File flength = new File(szdefaultlengthfile); + //File flength = new File(szfullprefixpathchromsizedir+"/"+szassembly+".txt"); + if (flength.exists()) + { + //szchromlengthfile = szfullprefixpathchromsizedir+"/"+szassembly+".txt"; + szchromlengthfile = szdefaultlengthfile; + } + else + { + throw new IllegalArgumentException("chromosome length file not specified and default "+szdefaultlengthfile+" not found"); + } + } + + + ConvertGeneTable.convertGeneTableToAnnotations(sztable, szprefix, szassembly, szcoorddir, szanchordir, szchromlengthfile, npromoterwindow, bgzip); + } + else + { + bok = false; + } + + if (!bok) + { + System.out.println("usage: ConvertGeneTable [-gzip][-l chromosomelengthfile][-u coorddir][-v anchordir][-w promoterwindow] inputgenetable prefix assembly"); + } } else { - System.out.println("Need to specify the mode BinarizeBam|BinarizeBed|BinarizeSignal|CompareModels|EvalSubset|LearnModel|MakeBrowserFiles"+ + System.out.println("Need to specify the mode BinarizeBam|BinarizeBed|BinarizeSignal|CompareModels|ConvertGeneTable|EvalSubset|LearnModel|MakeBrowserFiles"+ "|MakeSegmentation|NeighborhoodEnrichment|StatePruning|OverlapEnrichment|Reorder|Version"); } diff --git a/edu/mit/compbio/ChromHMM/ConvertGeneTable.java b/edu/mit/compbio/ChromHMM/ConvertGeneTable.java new file mode 100644 index 0000000..79d961b --- /dev/null +++ b/edu/mit/compbio/ChromHMM/ConvertGeneTable.java @@ -0,0 +1,362 @@ +/** + * ChromHMM - automating chromatin state discovery and characterization + * Copyright (C) 2008-2012 Massachusetts Institute of Technology + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + **/ + +package edu.mit.compbio.ChromHMM; + +import java.io.*; +import java.util.*; +import java.util.zip.GZIPOutputStream; + +public class ConvertGeneTable +{ + + /** + * Reads in a RefSeq or equivalently formatted gene table from the UCSC browser and outputs converted files + * the columns in order from the RefSeq gene table are (additional subsequent columns are ignored if present) + * bin + * name + * chrom + * strand + * txStart + * txEnd + * cdsStart + * cdsEnd + * exonCount + * exonStarts + * exonEnds + */ + static void convertGeneTableToAnnotations(String sztable, String szprefix, + String szassembly, String szcoorddir, + String szanchordir, String szchromlengths, + int npromoterwindow, boolean bgzip) throws IOException + { + + //String sztable = args[0]; + //String szassembly = args[1]; + //String szoutdir = args[2]; + //String szanchordir = args[3]; + //String szchromlengths = args[4]; + String szLine; + HashMap hmlengths = new HashMap(); + BufferedReader brlength = new BufferedReader(new FileReader(szchromlengths)); + while ((szLine = brlength.readLine())!=null) + { + StringTokenizer st = new StringTokenizer(szLine,"\t"); + hmlengths.put(st.nextToken(),Integer.valueOf(st.nextToken())); + } + brlength.close(); + BufferedReader br = new BufferedReader(new FileReader(sztable)); + + + PrintWriter pwtss = null; + PrintWriter pwanchortss = null; + PrintWriter pwanchortes = null; + PrintWriter pwgene = null; + PrintWriter pwexon = null; + PrintWriter pwtes = null; + PrintWriter pwtss2kb = null; + + GZIPOutputStream pwtsszip = null; + GZIPOutputStream pwanchortsszip = null; + GZIPOutputStream pwanchorteszip = null; + GZIPOutputStream pwgenezip = null; + GZIPOutputStream pwexonzip = null; + GZIPOutputStream pwteszip = null; + GZIPOutputStream pwtss2kbzip = null; + + if (bgzip) + { + pwtsszip = new GZIPOutputStream(new FileOutputStream(szcoorddir+"/"+szprefix+"TSS."+szassembly+".bed.gz")); + pwanchortsszip = new GZIPOutputStream(new FileOutputStream(szanchordir+"/"+szprefix+"TSS."+szassembly+".txt.gz")); + pwanchorteszip = new GZIPOutputStream(new FileOutputStream(szanchordir+"/"+szprefix+"TES."+szassembly+".txt.gz")); + pwgenezip = new GZIPOutputStream(new FileOutputStream(szcoorddir+"/"+szprefix+"Gene."+szassembly+".bed.gz")); + + pwexonzip = new GZIPOutputStream(new FileOutputStream(szcoorddir+"/"+szprefix+"Exon."+szassembly+".bed.gz")); + pwteszip = new GZIPOutputStream(new FileOutputStream(szcoorddir+"/"+szprefix+"TES."+szassembly+".bed.gz")); + + if (npromoterwindow % 1000 == 0) + { + pwtss2kbzip = new GZIPOutputStream(new FileOutputStream(szcoorddir+"/"+szprefix+"TSS"+(npromoterwindow/1000)+"kb."+szassembly+".bed.gz")); + } + else + { + pwtss2kbzip = new GZIPOutputStream(new FileOutputStream(szcoorddir+"/"+szprefix+"TSS"+npromoterwindow+"bp."+szassembly+".bed.gz")); + } + } + else + { + pwtss = new PrintWriter(szcoorddir+"/"+szprefix+"TSS."+szassembly+".bed"); + pwanchortss = new PrintWriter(szanchordir+"/"+szprefix+"TSS."+szassembly+".txt"); + pwanchortes = new PrintWriter(szanchordir+"/"+szprefix+"TES."+szassembly+".txt"); + pwgene = new PrintWriter(szcoorddir+"/"+szprefix+"Gene."+szassembly+".bed"); + + pwexon = new PrintWriter(szcoorddir+"/"+szprefix+"Exon."+szassembly+".bed"); + pwtes = new PrintWriter(szcoorddir+"/"+szprefix+"TES."+szassembly+".bed"); + + if (npromoterwindow % 1000 == 0) + { + pwtss2kb = new PrintWriter(szcoorddir+"/"+szprefix+"TSS"+(npromoterwindow/1000)+"kb."+szassembly+".bed"); + } + else + { + pwtss2kb = new PrintWriter(szcoorddir+"/"+szprefix+"TSS"+npromoterwindow+"bp."+szassembly+".bed"); + } + } + + HashSet hstss = new HashSet(); + HashSet hsanchortss = new HashSet(); + HashSet hsanchortes = new HashSet(); + HashSet hsgene = new HashSet(); + HashSet hsexon = new HashSet(); + HashSet hstes = new HashSet(); + HashSet hstss2kb = new HashSet(); + + br.readLine(); + + while ((szLine = br.readLine())!=null) + { + StringTokenizer st = new StringTokenizer(szLine,"\t",true); + String szbin = st.nextToken(); + if (!szbin.equals("\t")) + st.nextToken(); + String szname = st.nextToken(); + if (!szname.equals("\t")) + st.nextToken(); + String szchrom = st.nextToken(); + if (!szchrom.equals("\t")) + st.nextToken(); + String szstrand = st.nextToken(); + if (!szstrand.equals("\t")) + st.nextToken(); + String sztxStart = st.nextToken(); + if (!sztxStart.equals("\t")) + st.nextToken(); + String sztxEnd = st.nextToken(); + if (!sztxEnd.equals("\t")) + st.nextToken(); + String szcdsStart = st.nextToken(); + if (!szcdsStart.equals("\t")) + st.nextToken(); + String szcdsEnd = st.nextToken(); + if (!szcdsEnd.equals("\t")) + st.nextToken(); + String szexonCount = st.nextToken(); + if (!szexonCount.equals("\t")) + st.nextToken(); + String szexonStarts = st.nextToken(); + if (!szexonStarts.equals("\t")) + st.nextToken(); + String szexonEnds = st.nextToken(); + if (!szexonEnds.equals("\t")) + st.nextToken(); + //String szscore = st.nextToken(); + //if (!szscore.equals("\t")) + //st.nextToken(); + //String szname2 = st.nextToken(); + //if (!szname2.equals("\t")) + //st.nextToken(); + //String szcdsStartStat = st.nextToken(); + //if (!szcdsStartStat.equals("\t")) + // st.nextToken(); + //String szcdsEndStat = st.nextToken(); + //if (!szcdsEndStat.equals("\t")) + //st.nextToken(); + //String szexonFrames = st.nextToken(); + + + int ntes=-1; + int ntss=-1; + if (szstrand.equals("+")) + { + ntss = Integer.parseInt(sztxStart); + ntes = Integer.parseInt(sztxEnd)-1; + } + else if (szstrand.equals("-")) + { + ntss = Integer.parseInt(sztxEnd)-1; + ntes = Integer.parseInt(sztxStart); + } + else + { + throw new IllegalArgumentException("invalid strand\t"+szstrand); + } + + int nchromlength =((Integer) hmlengths.get(szchrom)).intValue(); + String sztssOut = szchrom+"\t"+ntss+"\t"+(ntss+1); + String sztesOut = szchrom+"\t"+ntes+"\t"+(ntes+1); + String sztss2kbOut = szchrom+"\t"+Math.max((ntss-npromoterwindow),0)+"\t"+Math.min(ntss+npromoterwindow+1,nchromlength); + String szgeneOut = szchrom+"\t"+sztxStart+"\t"+sztxEnd; + String szanchorTSSOut = szchrom+"\t"+ntss+"\t"+szstrand; + String szanchorTESOut = szchrom+"\t"+ntes+"\t"+szstrand; + StringTokenizer stexonStarts = new StringTokenizer(szexonStarts,","); + StringTokenizer stexonEnds = new StringTokenizer(szexonEnds,","); + + if (bgzip) + { + if (!hstss.contains(sztssOut)) + { + byte[] btformat = (sztssOut+"\n").getBytes(); + pwtsszip.write(btformat,0,btformat.length); + //pwtss.println(sztssOut); + hstss.add(sztssOut); + } + + if (!hsanchortss.contains(szanchorTSSOut)) + { + byte[] btformat = (szanchorTSSOut+"\n").getBytes(); + pwanchortsszip.write(btformat,0,btformat.length); + + //pwanchortss.println(szanchorTSSOut); + hsanchortss.add(szanchorTSSOut); + } + + if (!hsanchortes.contains(szanchorTESOut)) + { + byte[] btformat = (szanchorTESOut+"\n").getBytes(); + pwanchorteszip.write(btformat,0,btformat.length); + + //pwanchortes.println(szanchorTESOut); + hsanchortes.add(szanchorTESOut); + } + + if (!hstes.contains(sztesOut)) + { + byte[] btformat = (sztesOut+"\n").getBytes(); + pwteszip.write(btformat,0,btformat.length); + + //pwtes.println(sztesOut); + hstes.add(sztesOut); + } + + if (!hstss2kb.contains(sztss2kbOut)) + { + byte[] btformat = (sztss2kbOut+"\n").getBytes(); + pwtss2kbzip.write(btformat,0,btformat.length); + + //pwtss2kb.println(sztss2kbOut); + hstss2kb.add(sztss2kbOut); + } + + if (!hsgene.contains(szgeneOut)) + { + byte[] btformat = (szgeneOut+"\n").getBytes(); + pwgenezip.write(btformat,0,btformat.length); + + //pwgene.println(szgeneOut); + hsgene.add(szgeneOut); + } + + while (stexonStarts.hasMoreTokens()) + { + String szexonOut = szchrom+"\t"+stexonStarts.nextToken()+"\t"+stexonEnds.nextToken()+"\n"; + if (!hsexon.contains(szexonOut)) + { + byte[] btformat = szexonOut.getBytes(); + pwexonzip.write(btformat,0,btformat.length); + + hsexon.add(szexonOut); + } + + //pwexon.println(szexonOut); + } + } + else + { + if (!hstss.contains(sztssOut)) + { + pwtss.println(sztssOut); + hstss.add(sztssOut); + } + + if (!hsanchortss.contains(szanchorTSSOut)) + { + pwanchortss.println(szanchorTSSOut); + hsanchortss.add(szanchorTSSOut); + } + + if (!hsanchortes.contains(szanchorTESOut)) + { + pwanchortes.println(szanchorTESOut); + hsanchortes.add(szanchorTESOut); + } + + if (!hstes.contains(sztesOut)) + { + pwtes.println(sztesOut); + hstes.add(sztesOut); + } + + if (!hstss2kb.contains(sztss2kbOut)) + { + pwtss2kb.println(sztss2kbOut); + hstss2kb.add(sztss2kbOut); + } + + if (!hsgene.contains(szgeneOut)) + { + pwgene.println(szgeneOut); + hsgene.add(szgeneOut); + } + + while (stexonStarts.hasMoreTokens()) + { + String szexonOut = szchrom+"\t"+stexonStarts.nextToken()+"\t"+stexonEnds.nextToken(); + + if (!hsexon.contains(szexonOut)) + { + pwexon.println(szexonOut); + hsexon.add(szexonOut); + } + } + } + } + br.close(); + + if (bgzip) + { + pwtsszip.finish(); //tss + pwgenezip.finish(); //gene + pwanchortsszip.finish(); + pwanchorteszip.finish(); + pwexonzip.finish(); //exon + pwteszip.finish(); //tss + pwtss2kbzip.finish(); //2kb + + + pwtsszip.close(); //tss + pwgenezip.close(); //gene + pwanchortsszip.close(); + pwanchorteszip.close(); + pwexonzip.close(); //exon + pwteszip.close(); //tss + pwtss2kbzip.close(); //2kb + } + else + { + pwtss.close(); //tss + pwgene.close(); //gene + pwanchortss.close(); + pwanchortes.close(); + pwexon.close(); //exon + pwtes.close(); //tss + pwtss2kb.close(); //2kb + } + + } + +} diff --git a/edu/mit/compbio/ChromHMM/Preprocessing.java b/edu/mit/compbio/ChromHMM/Preprocessing.java index 56be164..08eb040 100644 --- a/edu/mit/compbio/ChromHMM/Preprocessing.java +++ b/edu/mit/compbio/ChromHMM/Preprocessing.java @@ -22,6 +22,7 @@ import htsjdk.samtools.*; import java.io.*; import java.util.*; +import java.util.zip.GZIPOutputStream; /** * This class supports functions to convert either read level data or @@ -554,7 +555,8 @@ public static void makeBinaryDataFromBed(String szchromlengthfile, String szmark int nshift, boolean bcenterinterval,int noffsetleft, int noffsetright, String szoutputsignaldir,String szoutputbinarydir, String szoutputcontroldir, double dpoissonthresh, double dfoldthresh,boolean bcontainsthresh, int npseudocountcontrol,int nbinsize, - String szcolfields, boolean bpeaks, double dcountthresh, boolean bbinarizebam, boolean bpairend + String szcolfields, boolean bpeaks, double dcountthresh, boolean bbinarizebam, boolean bpairend, + boolean bgzip ) throws IOException { @@ -756,29 +758,80 @@ public static void makeBinaryDataFromBed(String szchromlengthfile, String szmark if (bpresent[nchrom]) { //we have signal for this chromosome - String szfile = szoutputsignaldir+"/"+szcell+"_"+chroms[nchrom]+"_signal.txt"; - System.out.println("Writing to file "+szfile); - PrintWriter pw = new PrintWriter(szfile); - pw.println(szcell+"\t"+chroms[nchrom]); - //outputs mark header - for (int nmark = 0; nmark < marks.length-1; nmark++) - { - pw.print(marks[nmark]+"\t"); - } - pw.println(marks[marks.length-1]); - - //outputs mark signal data - int[][] grid_nchrom = grid[nchrom]; - for (int nbin = 0; nbin < grid_nchrom.length; nbin++) - { - int[] grid_nchrom_nbin = grid_nchrom[nbin]; - for (int nmark = 0; nmark < grid_nchrom_nbin.length-1; nmark++) - { - pw.print(grid_nchrom_nbin[nmark]+"\t"); + if (bgzip) + { + String szfile = szoutputsignaldir+"/"+szcell+"_"+chroms[nchrom]+"_signal.txt.gz"; + System.out.println("Writing to file "+szfile); + + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szfile)); + + //PrintWriter pw = new PrintWriter(szfile); + String szout = szcell+"\t"+chroms[nchrom] +"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + //pw.println(szcell+"\t"+chroms[nchrom]); + + //outputs mark header + StringBuffer sbout = new StringBuffer(); + for (int nmark = 0; nmark < marks.length-1; nmark++) + { + sbout.append(marks[nmark]+"\t"); + //pw.print(marks[nmark]+"\t"); + } + sbout.append(marks[marks.length-1]+"\n"); + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(marks[marks.length-1]); + + //outputs mark signal data + int[][] grid_nchrom = grid[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + + sbout = new StringBuffer(); + for (int nmark = 0; nmark < grid_nchrom_nbin.length-1; nmark++) + { + sbout.append(grid_nchrom_nbin[nmark]+"\t"); + //pw.print(grid_nchrom_nbin[nmark]+"\t"); + } + sbout.append(grid_nchrom_nbin[marks.length-1]+"\n"); + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(grid_nchrom_nbin[marks.length-1]); + } - pw.println(grid_nchrom_nbin[marks.length-1]); + pwzip.finish(); + pwzip.close(); } - pw.close(); + else + { + String szfile = szoutputsignaldir+"/"+szcell+"_"+chroms[nchrom]+"_signal.txt"; + PrintWriter pw = new PrintWriter(szfile); + pw.println(szcell+"\t"+chroms[nchrom]); + //outputs mark header + + for (int nmark = 0; nmark < marks.length-1; nmark++) + { + pw.print(marks[nmark]+"\t"); + } + pw.println(marks[marks.length-1]); + + //outputs mark signal data + int[][] grid_nchrom = grid[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + for (int nmark = 0; nmark < grid_nchrom_nbin.length-1; nmark++) + { + pw.print(grid_nchrom_nbin[nmark]+"\t"); + } + pw.println(grid_nchrom_nbin[marks.length-1]); + } + pw.close(); + } } } } @@ -791,44 +844,115 @@ public static void makeBinaryDataFromBed(String szchromlengthfile, String szmark { if (bpresent[nchrom]) { - //we have signal for this chromosome - String szfile = szoutputcontroldir+"/"+szcell+"_"+chroms[nchrom]+"_controlsignal.txt"; - System.out.println("Writing to file "+szfile); - PrintWriter pw = new PrintWriter(szfile); - pw.println(szcell+"\t"+chroms[nchrom]); - //outputs mark header - if (numcontrolmarks == 1) + if (bgzip) { - pw.println("Control"); + //we have signal for this chromosome + String szfile = szoutputcontroldir+"/"+szcell+"_"+chroms[nchrom]+"_controlsignal.txt.gz"; + System.out.println("Writing to file "+szfile); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szfile)); + //PrintWriter pw = new PrintWriter(szfile); + String szout = szcell+"\t"+chroms[nchrom]+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(szcell+"\t"+chroms[nchrom]); + //outputs mark header + if (numcontrolmarks == 1) + { + szout = "Control\n";//szcell+"\t"+chroms[nchrom]+"\n"; + btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + //pw.println("Control"); + } + else + { + StringBuffer sbout = new StringBuffer(); + for (int nmark = 0; nmark < marks.length-1; nmark++) + { + sbout.append(marks[nmark]+"\t"); + //pw.print(marks[nmark]+"\t"); + } + sbout.append(marks[marks.length-1]+"\n"); + //pw.println(marks[marks.length-1]); + + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + } + + //outputs mark signal data + int[][] gridcontrol_nchrom = gridcontrol[nchrom]; + + for (int nbin = 0; nbin < gridcontrol[nchrom].length; nbin++) + { + int[] gridcontrol_nchrom_nbin = gridcontrol_nchrom[nbin]; + if (gridcontrol_nchrom_nbin.length == 1) + { + szout = (gridcontrol_nchrom_nbin[0]-npseudocountcontrol)+"\n"; + + btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(gridcontrol_nchrom_nbin[0]-npseudocountcontrol); + } + else + { + StringBuffer sbout = new StringBuffer(); + for (int nmark = 0; nmark < gridcontrol_nchrom_nbin.length-1; nmark++) + { + sbout.append((gridcontrol_nchrom_nbin[nmark]-npseudocountcontrol)+"\t"); + //pw.print((gridcontrol_nchrom_nbin[nmark]-npseudocountcontrol)+"\t"); + } + sbout.append((gridcontrol_nchrom_nbin[marks.length-1]-npseudocountcontrol)+"\n"); + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(gridcontrol_nchrom_nbin[marks.length-1]-npseudocountcontrol); + } + } + pwzip.finish(); + pwzip.close(); } else { - for (int nmark = 0; nmark < marks.length-1; nmark++) - { - pw.print(marks[nmark]+"\t"); - } - pw.println(marks[marks.length-1]); - } - - //outputs mark signal data - int[][] gridcontrol_nchrom = gridcontrol[nchrom]; - for (int nbin = 0; nbin < gridcontrol[nchrom].length; nbin++) - { - int[] gridcontrol_nchrom_nbin = gridcontrol_nchrom[nbin]; - if (gridcontrol_nchrom_nbin.length == 1) + //we have signal for this chromosome + String szfile = szoutputcontroldir+"/"+szcell+"_"+chroms[nchrom]+"_controlsignal.txt"; + System.out.println("Writing to file "+szfile); + PrintWriter pw = new PrintWriter(szfile); + pw.println(szcell+"\t"+chroms[nchrom]); + //outputs mark header + if (numcontrolmarks == 1) { - pw.println(gridcontrol_nchrom_nbin[0]-npseudocountcontrol); + pw.println("Control"); } else { - for (int nmark = 0; nmark < gridcontrol_nchrom_nbin.length-1; nmark++) - { - pw.print((gridcontrol_nchrom_nbin[nmark]-npseudocountcontrol)+"\t"); - } - pw.println(gridcontrol_nchrom_nbin[marks.length-1]-npseudocountcontrol); + for (int nmark = 0; nmark < marks.length-1; nmark++) + { + pw.print(marks[nmark]+"\t"); + } + pw.println(marks[marks.length-1]); } + + //outputs mark signal data + int[][] gridcontrol_nchrom = gridcontrol[nchrom]; + for (int nbin = 0; nbin < gridcontrol[nchrom].length; nbin++) + { + int[] gridcontrol_nchrom_nbin = gridcontrol_nchrom[nbin]; + if (gridcontrol_nchrom_nbin.length == 1) + { + pw.println(gridcontrol_nchrom_nbin[0]-npseudocountcontrol); + } + else + { + for (int nmark = 0; nmark < gridcontrol_nchrom_nbin.length-1; nmark++) + { + pw.print((gridcontrol_nchrom_nbin[nmark]-npseudocountcontrol)+"\t"); + } + pw.println(gridcontrol_nchrom_nbin[marks.length-1]-npseudocountcontrol); + } + } + pw.close(); } - pw.close(); } } } @@ -855,77 +979,178 @@ public static void makeBinaryDataFromBed(String szchromlengthfile, String szmark { if ((bpresent[nchrom])&&(bpresentcontrol[nchrom])) { - String szfile = szoutputbinarydir+"/"+szcell+"_"+chroms[nchrom]+"_binary.txt"; - System.out.println("Writing to file "+szfile); - PrintWriter pw = new PrintWriter(szfile); - //we have both primary and control data for the mark - pw.println(szcell+"\t"+chroms[nchrom]); - for (int nmark = 0; nmark < nummarks_m1; nmark++) + if (bgzip) { - pw.print(marks[nmark]+"\t"); - } - pw.println(marks[nummarks_m1]); + String szfile = szoutputbinarydir+"/"+szcell+"_"+chroms[nchrom]+"_binary.txt.gz"; + System.out.println("Writing to file "+szfile); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szfile)); + //PrintWriter pw = new PrintWriter(szfile); + //we have both primary and control data for the mark + String szout = szcell+"\t"+chroms[nchrom] +"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(szcell+"\t"+chroms[nchrom]); + StringBuffer sbout = new StringBuffer(); + for (int nmark = 0; nmark < nummarks_m1; nmark++) + { + sbout.append(marks[nmark]+"\t"); + //pw.print(marks[nmark]+"\t"); + } + sbout.append(marks[nummarks_m1]+"\n"); + //pw.println(marks[nummarks_m1]); + + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); - int[][] grid_nchrom = grid[nchrom]; - int[][] sumgridcontrol_nchrom = sumgridcontrol[nchrom]; - for (int nbin = 0; nbin < grid_nchrom.length; nbin++) - { - int[] grid_nchrom_nbin = grid_nchrom[nbin]; - int[] sumgrid_nchrom_nbin = sumgridcontrol_nchrom[nbin]; - - for (int nmark = 0; nmark < nummarks_m1; nmark++) - { - int ncontrolval; - if (numcontrolmarks == 1) - { - ncontrolval = sumgrid_nchrom_nbin[0]; - } - else - { - ncontrolval = sumgrid_nchrom_nbin[nmark]; - } - //printing one if count exceeds background threshold - - if (!bpresentmarks[nmark]) - { - pw.print("2\t"); + int[][] grid_nchrom = grid[nchrom]; + int[][] sumgridcontrol_nchrom = sumgridcontrol[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + int[] sumgrid_nchrom_nbin = sumgridcontrol_nchrom[nbin]; + + sbout = new StringBuffer(); + + for (int nmark = 0; nmark < nummarks_m1; nmark++) + { + int ncontrolval; + if (numcontrolmarks == 1) + { + ncontrolval = sumgrid_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgrid_nchrom_nbin[nmark]; + } + //printing one if count exceeds background threshold + + if (!bpresentmarks[nmark]) + { + sbout.append("2\t"); + //pw.print("2\t"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nmark]>=1)) ||(!bpeaks&&(thresholds[nmark][ncontrolval] <= grid_nchrom_nbin[nmark]))) + { + sbout.append("1\t"); + //pw.print("1\t"); + } + else + { + sbout.append("0\t"); + //pw.print("0\t"); + } } - else if ((bpeaks&&(grid_nchrom_nbin[nmark]>=1)) ||(!bpeaks&&(thresholds[nmark][ncontrolval] <= grid_nchrom_nbin[nmark]))) + + int ncontrolval; + if (numcontrolmarks == 1) { - pw.print("1\t"); - } + ncontrolval = sumgrid_nchrom_nbin[0]; + } else { - pw.print("0\t"); + ncontrolval = sumgrid_nchrom_nbin[nummarks_m1]; } - } - int ncontrolval; - if (numcontrolmarks == 1) - { - ncontrolval = sumgrid_nchrom_nbin[0]; + + if (!bpresentmarks[nummarks_m1]) + { + sbout.append("2\n"); + //pw.println("2"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nummarks_m1]>=1))||(!bpeaks&&(thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]))) + { + sbout.append("1\n"); + //pw.println("1"); + } + else + { + sbout.append("0\n"); + //pw.println("0"); + } + + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + } - else + pwzip.finish(); + pwzip.close(); + } + else + { + String szfile = szoutputbinarydir+"/"+szcell+"_"+chroms[nchrom]+"_binary.txt"; + System.out.println("Writing to file "+szfile); + PrintWriter pw = new PrintWriter(szfile); + //we have both primary and control data for the mark + pw.println(szcell+"\t"+chroms[nchrom]); + + for (int nmark = 0; nmark < nummarks_m1; nmark++) { - ncontrolval = sumgrid_nchrom_nbin[nummarks_m1]; - } + pw.print(marks[nmark]+"\t"); + } + pw.println(marks[nummarks_m1]); + + int[][] grid_nchrom = grid[nchrom]; + int[][] sumgridcontrol_nchrom = sumgridcontrol[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + int[] sumgrid_nchrom_nbin = sumgridcontrol_nchrom[nbin]; + + for (int nmark = 0; nmark < nummarks_m1; nmark++) + { + int ncontrolval; + if (numcontrolmarks == 1) + { + ncontrolval = sumgrid_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgrid_nchrom_nbin[nmark]; + } + //printing one if count exceeds background threshold + if (!bpresentmarks[nmark]) + { + pw.print("2\t"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nmark]>=1)) ||(!bpeaks&&(thresholds[nmark][ncontrolval] <= grid_nchrom_nbin[nmark]))) + { + pw.print("1\t"); + } + else + { + pw.print("0\t"); + } + } - if (!bpresentmarks[nummarks_m1]) - { - pw.println("2"); - } - else if ((bpeaks&&(grid_nchrom_nbin[nummarks_m1]>=1))||(!bpeaks&&(thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]))) - { - pw.println("1"); - } - else - { - pw.println("0"); + int ncontrolval; + if (numcontrolmarks == 1) + { + ncontrolval = sumgrid_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgrid_nchrom_nbin[nummarks_m1]; + } + + + if (!bpresentmarks[nummarks_m1]) + { + pw.println("2"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nummarks_m1]>=1))||(!bpeaks&&(thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]))) + { + pw.println("1"); + } + else + { + pw.println("0"); + } } + pw.close(); } - pw.close(); } else { @@ -954,51 +1179,127 @@ else if (!bpresent[nchrom]&&bpresentcontrol[nchrom]) { if (bpresent[nchrom]) { - String szfile = szoutputbinarydir+"/"+szcell+"_"+chroms[nchrom]+"_binary.txt"; - System.out.println("Writing to file "+szfile); - PrintWriter pw = new PrintWriter(szfile); - pw.println(szcell+"\t"+chroms[nchrom]); - for (int nmark = 0; nmark < marks.length-1; nmark++) - { - pw.print(marks[nmark]+"\t"); - } - pw.println(marks[nummarks_m1]); + if (bgzip) + { + String szfile = szoutputbinarydir+"/"+szcell+"_"+chroms[nchrom]+"_binary.txt.gz"; + System.out.println("Writing to file "+szfile); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szfile)); + //PrintWriter pw = new PrintWriter(szfile); + + String szout = szcell+"\t"+chroms[nchrom]+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + StringBuffer sbout = new StringBuffer(); + //pw.println(szcell+"\t"+chroms[nchrom]); + for (int nmark = 0; nmark < marks.length-1; nmark++) + { + sbout.append(marks[nmark]+"\t"); + //pw.print(marks[nmark]+"\t"); + } + sbout.append(marks[nummarks_m1]+"\n"); + //pw.println(marks[nummarks_m1]); + + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + - int[][] grid_nchrom = grid[nchrom]; - for (int nbin = 0; nbin < grid_nchrom.length; nbin++) - { - //printing 1 if signal has met data threshold and 0 otherwise - int[] grid_nchrom_nbin = grid_nchrom[nbin]; - for (int nmark = 0; nmark < nummarks_m1; nmark++) - { - if (!bpresentmarks[nmark]) - { - pw.print("2\t"); - } - else if ((bpeaks&&(grid_nchrom_nbin[nmark]>=1))||(!bpeaks&&(thresholds[nmark] <= grid_nchrom_nbin[nmark]))) - { - pw.print("1\t"); - } - else - { - pw.print("0\t"); - } - } + int[][] grid_nchrom = grid[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + sbout = new StringBuffer(); + //printing 1 if signal has met data threshold and 0 otherwise + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + for (int nmark = 0; nmark < nummarks_m1; nmark++) + { + if (!bpresentmarks[nmark]) + { + sbout.append("2\t"); + //pw.print("2\t"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nmark]>=1))||(!bpeaks&&(thresholds[nmark] <= grid_nchrom_nbin[nmark]))) + { + sbout.append("1\t"); + //pw.print("1\t"); + } + else + { + sbout.append("0\t"); + //pw.print("0\t"); + } + } - if (!bpresentmarks[nummarks_m1]) - { - pw.println("2"); - } - else if ((bpeaks&&(grid_nchrom_nbin[nummarks_m1]>=1))||(!bpeaks&&(thresholds[nummarks_m1] <= grid_nchrom_nbin[nummarks_m1]))) - { - pw.println("1"); - } - else - { - pw.println("0"); - } - } - pw.close(); + if (!bpresentmarks[nummarks_m1]) + { + sbout.append("2\n"); + //pw.println("2"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nummarks_m1]>=1))||(!bpeaks&&(thresholds[nummarks_m1] <= grid_nchrom_nbin[nummarks_m1]))) + { + sbout.append("1\n"); + //pw.println("1"); + } + else + { + sbout.append("0\n"); + //pw.println("0"); + } + + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + + } + pwzip.finish(); + pwzip.close(); + } + else + { + String szfile = szoutputbinarydir+"/"+szcell+"_"+chroms[nchrom]+"_binary.txt"; + System.out.println("Writing to file "+szfile); + PrintWriter pw = new PrintWriter(szfile); + pw.println(szcell+"\t"+chroms[nchrom]); + for (int nmark = 0; nmark < marks.length-1; nmark++) + { + pw.print(marks[nmark]+"\t"); + } + pw.println(marks[nummarks_m1]); + + int[][] grid_nchrom = grid[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + //printing 1 if signal has met data threshold and 0 otherwise + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + for (int nmark = 0; nmark < nummarks_m1; nmark++) + { + if (!bpresentmarks[nmark]) + { + pw.print("2\t"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nmark]>=1))||(!bpeaks&&(thresholds[nmark] <= grid_nchrom_nbin[nmark]))) + { + pw.print("1\t"); + } + else + { + pw.print("0\t"); + } + } + + if (!bpresentmarks[nummarks_m1]) + { + pw.println("2"); + } + else if ((bpeaks&&(grid_nchrom_nbin[nummarks_m1]>=1))||(!bpeaks&&(thresholds[nummarks_m1] <= grid_nchrom_nbin[nummarks_m1]))) + { + pw.println("1"); + } + else + { + pw.println("0"); + } + } + pw.close(); + } } } } @@ -1238,7 +1539,7 @@ public static double[] determineMarkThresholdsFromBinnedDataArray(int[][][] grid */ public static void makeBinaryDataFromSignalAgainstControl(String szbinneddataDIR, String szcontrolDIR, String szoutputDIR, double dpoissonthresh, double dfoldthresh, boolean bcontainsthresh, int nflankwidthcontrol, - int npseudocountcontrol, double dcountthresh) throws IOException + int npseudocountcontrol, double dcountthresh, boolean bgzip) throws IOException { int nummarks=-1; int nummarkscontrol=-1; @@ -1488,74 +1789,167 @@ else if (ncurrnummarkscontrol != nummarkscontrol) for (int nchrom = 0; nchrom < grid.length; nchrom++) { String szchrom = chroms[nchrom]; - String szfile = szoutputDIR+"/"+szcell+"_"+szchrom+"_binary.txt"; - System.out.println("Writing to file "+szfile); - PrintWriter pw = new PrintWriter(new FileWriter(szfile)); - - int nummarks_m1 = nummarks - 1; - pw.println(szcell+"\t"+szchrom); - pw.println(szHeaderLine2); - int[][] grid_nchrom = grid[nchrom]; - int[][] sumgridcontrol_nchrom = sumgridcontrol[nchrom]; - for (int nbin = 0; nbin < grid_nchrom.length; nbin++) - { - int[] grid_nchrom_nbin = grid_nchrom[nbin]; - int[] sumgridcontrol_nchrom_nbin = sumgridcontrol_nchrom[nbin]; - - for (int nmark = 0; nmark < nummarks_m1; nmark++) - { - int ncontrolval; - if (nummarkscontrol == 1) + if (bgzip) + { + String szfile = szoutputDIR+"/"+szcell+"_"+szchrom+"_binary.txt.gz"; + System.out.println("Writing to file "+szfile); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szfile)); + //PrintWriter pw = new PrintWriter(new FileWriter(szfile)); + + int nummarks_m1 = nummarks - 1; + //pw.println(szcell+"\t"+szchrom); + String szout = szcell+"\t"+szchrom+"\n"; + byte[] btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(szHeaderLine2); + szout = szHeaderLine2+"\n"; + btformat = szout.getBytes(); + pwzip.write(btformat,0,btformat.length); + + + int[][] grid_nchrom = grid[nchrom]; + int[][] sumgridcontrol_nchrom = sumgridcontrol[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + int[] sumgridcontrol_nchrom_nbin = sumgridcontrol_nchrom[nbin]; + StringBuffer sbout = new StringBuffer(); + + for (int nmark = 0; nmark < nummarks_m1; nmark++) { - //always use the first control mark column if only one column - ncontrolval = sumgridcontrol_nchrom_nbin[0]; + int ncontrolval; + if (nummarkscontrol == 1) + { + //always use the first control mark column if only one column + ncontrolval = sumgridcontrol_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgridcontrol_nchrom_nbin[nmark]; + } + + //printing one if count exceeds background determined threshold + if (grid_nchrom_nbin[nmark] == -1) + { + sbout.append("2\t"); + //pw.print("2\t"); + } + else if (thresholds[nmark][ncontrolval] <= grid_nchrom_nbin[nmark]) + { + sbout.append("1\t"); + //pw.print("1\t"); + } + else + { + sbout.append("0\t"); + //pw.print("0\t"); + } } - else + + int ncontrolval; + if (nummarkscontrol == 1) { - ncontrolval = sumgridcontrol_nchrom_nbin[nmark]; - } + ncontrolval = sumgridcontrol_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgridcontrol_nchrom_nbin[nummarks_m1]; + } - //printing one if count exceeds background determined threshold - if (grid_nchrom_nbin[nmark] == -1) - { - pw.print("2\t"); - } - else if (thresholds[nmark][ncontrolval] <= grid_nchrom_nbin[nmark]) + if (grid_nchrom_nbin[nummarks_m1] == -1) { - pw.print("1\t"); - } + sbout.append("2\n"); + //pw.println("2"); + } + else if (thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]) + { + sbout.append("1\n"); + //pw.println("1"); + } else { - pw.print("0\t"); - } - } + sbout.append("0\n"); + //pw.println("0"); + } - int ncontrolval; - if (nummarkscontrol == 1) - { - ncontrolval = sumgridcontrol_nchrom_nbin[0]; - } - else - { - ncontrolval = sumgridcontrol_nchrom_nbin[nummarks_m1]; + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); } + pwzip.finish(); + pwzip.close(); + } + else + { + String szfile = szoutputDIR+"/"+szcell+"_"+szchrom+"_binary.txt"; + System.out.println("Writing to file "+szfile); + PrintWriter pw = new PrintWriter(new FileWriter(szfile)); + + int nummarks_m1 = nummarks - 1; + pw.println(szcell+"\t"+szchrom); + pw.println(szHeaderLine2); + + int[][] grid_nchrom = grid[nchrom]; + int[][] sumgridcontrol_nchrom = sumgridcontrol[nchrom]; + for (int nbin = 0; nbin < grid_nchrom.length; nbin++) + { + int[] grid_nchrom_nbin = grid_nchrom[nbin]; + int[] sumgridcontrol_nchrom_nbin = sumgridcontrol_nchrom[nbin]; + + for (int nmark = 0; nmark < nummarks_m1; nmark++) + { + int ncontrolval; + if (nummarkscontrol == 1) + { + //always use the first control mark column if only one column + ncontrolval = sumgridcontrol_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgridcontrol_nchrom_nbin[nmark]; + } + //printing one if count exceeds background determined threshold + if (grid_nchrom_nbin[nmark] == -1) + { + pw.print("2\t"); + } + else if (thresholds[nmark][ncontrolval] <= grid_nchrom_nbin[nmark]) + { + pw.print("1\t"); + } + else + { + pw.print("0\t"); + } + } - if (grid_nchrom_nbin[nummarks_m1] == -1) - { - pw.println("2"); - } - else if (thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]) - { - pw.println("1"); + int ncontrolval; + if (nummarkscontrol == 1) + { + ncontrolval = sumgridcontrol_nchrom_nbin[0]; + } + else + { + ncontrolval = sumgridcontrol_nchrom_nbin[nummarks_m1]; + } + + if (grid_nchrom_nbin[nummarks_m1] == -1) + { + pw.println("2"); + } + else if (thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]) + { + pw.println("1"); + } + else + { + pw.println("0"); + } } - else - { - pw.println("0"); - } + pw.close(); } - pw.close(); } } } @@ -1575,7 +1969,7 @@ else if (thresholds[nummarks_m1][ncontrolval] <= grid_nchrom_nbin[nummarks_m1]) **/ public static void makeBinaryDataFromSignalUniform(String szbinneddataDIR, String szoutputDIR, double dpoissonthresh, double dfoldthresh, - boolean bcontainsthresh, double dcountthresh) throws IOException + boolean bcontainsthresh, double dcountthresh, boolean bgzip) throws IOException { //this computes the binarization without storing in main memory all the data @@ -1739,48 +2133,117 @@ else if (nummarks != sumtags.length) throw new IllegalArgumentException(szbinneddataDIR+"/"+szfilename+" header line must have a two columns delimited by a tab found "+ szChromCellLine); } - String szfile = szoutputDIR+"/"+st.nextToken()+"_"+st.nextToken()+"_binary.txt"; - System.out.println("Writing to file "+szfile); - PrintWriter pw = new PrintWriter(new FileWriter(szfile)); - pw.println(szChromCellLine); - pw.println(szMarkLine); - String szLine; - while ((szLine = br.readLine())!=null) + + if (bgzip) { - st = new StringTokenizer(szLine,"\t"); - for (int ncol = 0; ncol < nummarks-1; ncol++) + String szfile = szoutputDIR+"/"+st.nextToken()+"_"+st.nextToken()+"_binary.txt.gz"; + System.out.println("Writing to file "+szfile); + GZIPOutputStream pwzip = new GZIPOutputStream(new FileOutputStream(szfile)); + //PrintWriter pw = new PrintWriter(new FileWriter(szfile)); + byte[] btformat = (szChromCellLine+"\n").getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(szChromCellLine); + btformat = (szMarkLine+"\n").getBytes(); + pwzip.write(btformat,0,btformat.length); + + //pw.println(szMarkLine); + String szLine; + while ((szLine = br.readLine())!=null) { - double dval = Double.parseDouble(st.nextToken()); - if (dval == -1) + st = new StringTokenizer(szLine,"\t"); + StringBuffer sbout = new StringBuffer(); + + for (int ncol = 0; ncol < nummarks-1; ncol++) { - pw.print("2\t"); + double dval = Double.parseDouble(st.nextToken()); + if (dval == -1) + { + sbout.append("2\t"); + //pw.print("2\t"); + } + else if (thresholds[ncol] <= dval) + { + sbout.append("1\t"); + //pw.print("1\t"); + } + else + { + sbout.append("0\t"); + //pw.print("0\t"); + } } - else if (thresholds[ncol] <= dval) + + double dval = Double.parseDouble(st.nextToken()); + if (dval == -1) { - pw.print("1\t"); + sbout.append("2\n"); + //pw.println("2"); } - else + else if (thresholds[nummarks-1] <= dval) { - pw.print("0\t"); + sbout.append("1\n"); + //pw.println("1"); + } + else + { + sbout.append("0\n"); + //pw.println("0"); } - } - double dval = Double.parseDouble(st.nextToken()); - if (dval == -1) - { - pw.println("2"); - } - else if (thresholds[nummarks-1] <= dval) - { - pw.println("1"); + btformat = sbout.toString().getBytes(); + pwzip.write(btformat,0,btformat.length); + } - else + br.close(); + pwzip.finish(); + pwzip.close(); + } + else + { + String szfile = szoutputDIR+"/"+st.nextToken()+"_"+st.nextToken()+"_binary.txt"; + System.out.println("Writing to file "+szfile); + PrintWriter pw = new PrintWriter(new FileWriter(szfile)); + pw.println(szChromCellLine); + pw.println(szMarkLine); + String szLine; + while ((szLine = br.readLine())!=null) { - pw.println("0"); - } + st = new StringTokenizer(szLine,"\t"); + for (int ncol = 0; ncol < nummarks-1; ncol++) + { + double dval = Double.parseDouble(st.nextToken()); + if (dval == -1) + { + pw.print("2\t"); + } + else if (thresholds[ncol] <= dval) + { + pw.print("1\t"); + } + else + { + pw.print("0\t"); + } + } + + double dval = Double.parseDouble(st.nextToken()); + if (dval == -1) + { + pw.println("2"); + } + else if (thresholds[nummarks-1] <= dval) + { + pw.println("1"); + } + else + { + pw.println("0"); + } + } + br.close(); + pw.close(); } - br.close(); - pw.close(); } } }