diff --git a/src/main/java/genepi/riskscore/io/csv/TabixTableReader.java b/src/main/java/genepi/riskscore/io/csv/TabixTableReader.java
new file mode 100644
index 0000000..6fee83b
--- /dev/null
+++ b/src/main/java/genepi/riskscore/io/csv/TabixTableReader.java
@@ -0,0 +1,194 @@
+package genepi.riskscore.io.csv;
+
+import genepi.io.FileUtil;
+import genepi.io.table.reader.AbstractTableReader;
+import genepi.io.table.reader.CsvTableReader;
+import genepi.io.text.LineReader;
+import genepi.riskscore.io.dbsnp.DbSnpReader;
+import htsjdk.tribble.readers.TabixReader;
+
+import java.io.DataInputStream;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+import java.util.Vector;
+
+/**
+ * Table reader for tab-separated files that are accompanied by a tabix index.
+ * The leading "#" lines and the column header line are read from the start of
+ * the file; data rows are then obtained from a tabix query for one chromosome
+ * and coordinate range.
+ */
+public class TabixTableReader extends AbstractTableReader {
+
+	/** Lines starting with "#" found before the column header line. */
+	private List<String> headerLines = new Vector<String>();
+
+	/** Column names taken from the first line not starting with "#". */
+	private String[] columns = null;
+
+	/** Maps a column name to its position in {@link #columns}. */
+	private Map<String, Integer> index = new HashMap<String, Integer>();
+
+	/** Cells of the current row, or null when no row is available. */
+	private String[] row = null;
+
+	private TabixReader tabixReader;
+
+	private TabixReader.Iterator iterator;
+
+	/** Number of rows returned by the query so far (used in error messages). */
+	private int lineNumber = 0;
+
+	/**
+	 * Opens the file and queries all records of the given chromosome
+	 * (range 0 to Integer.MAX_VALUE).
+	 *
+	 * @param filename   path of the data file
+	 * @param chromosome sequence name to query
+	 * @throws IOException if the file cannot be read or has no column header line
+	 */
+	public TabixTableReader(String filename, String chromosome) throws IOException {
+		this(filename, chromosome, 0, Integer.MAX_VALUE);
+	}
+
+	/**
+	 * Opens the file, reads its header and starts a tabix query.
+	 *
+	 * NOTE(review): start and end are passed unchanged to TabixReader.query;
+	 * confirm its coordinate convention so that records located exactly at the
+	 * range boundaries are handled the way callers expect.
+	 *
+	 * @param filename   path of the data file
+	 * @param chromosome sequence name to query
+	 * @param start      start of the queried range
+	 * @param end        end of the queried range
+	 * @throws IOException if the file cannot be read or has no column header line
+	 */
+	public TabixTableReader(String filename, String chromosome, int start, int end) throws IOException {
+		readHeader(filename);
+		tabixReader = new TabixReader(filename);
+		iterator = tabixReader.query(chromosome, start, end);
+	}
+
+	/**
+	 * Collects the "#" lines and parses the column header line.
+	 */
+	private void readHeader(String filename) throws IOException {
+		LineReader reader = new LineReader(openTxtOrGzipStream(filename));
+		try {
+			while (reader.next()) {
+				String line = reader.get();
+				if (line.startsWith("#")) {
+					headerLines.add(line);
+				} else {
+					columns = line.split("\t");
+					for (int i = 0; i < columns.length; i++) {
+						index.put(columns[i], i);
+					}
+					break;
+				}
+			}
+		} finally {
+			// release the stream even if reading the header fails
+			reader.close();
+		}
+		if (columns == null) {
+			// without a header line every later column lookup would fail
+			throw new IOException("No column header line found in file '" + filename + "'.");
+		}
+	}
+
+	/**
+	 * @return the lines starting with "#" that precede the column header line
+	 */
+	public List<String> getHeader() {
+		return headerLines;
+	}
+
+	@Override
+	public String[] getColumns() {
+		return columns;
+	}
+
+	/**
+	 * Advances to the next record of the query.
+	 *
+	 * @return true if a row was read, false when the query is exhausted
+	 * @throws RuntimeException if reading fails or a row has a different
+	 *                          number of cells than the header line
+	 */
+	@Override
+	public boolean next() {
+		String line = null;
+		// defensive: a missing iterator is treated like an empty query result
+		if (iterator != null) {
+			try {
+				line = iterator.next();
+			} catch (IOException e) {
+				throw new RuntimeException(e);
+			}
+		}
+		if (line == null) {
+			row = null;
+			return false;
+		}
+		// limit -1 keeps trailing empty cells
+		row = line.split("\t", -1);
+		lineNumber++;
+		if (row.length != columns.length) {
+			throw new RuntimeException("Different number of columns in line " + lineNumber + ": " + line);
+		}
+		return true;
+	}
+
+	/**
+	 * @throws IllegalArgumentException if the column is not in the header line
+	 */
+	@Override
+	public int getColumnIndex(String column) {
+		Integer position = index.get(column);
+		if (position == null) {
+			throw new IllegalArgumentException("Column '" + column + "' not found.");
+		}
+		return position;
+	}
+
+	@Override
+	public String[] getRow() {
+		return row;
+	}
+
+	@Override
+	public void close() {
+		tabixReader.close();
+	}
+
+	@Override
+	public boolean hasColumn(String column) {
+		return index.containsKey(column);
+	}
+
+	/**
+	 * Opens the file and wraps it via FileUtil.decompressStream (presumably so plain and gzip input both work).
+	 */
+	private static DataInputStream openTxtOrGzipStream(String filename) throws IOException {
+		FileInputStream inputStream = new FileInputStream(filename);
+		boolean opened = false;
+		try {
+			InputStream in2 = FileUtil.decompressStream(inputStream);
+			DataInputStream result = new DataInputStream(in2);
+			opened = true;
+			return result;
+		} finally {
+			if (!opened) {
+				// do not leak the file handle when wrapping fails
+				inputStream.close();
+			}
+		}
+	}
+
+}
diff --git a/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java b/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java index d98646c..488f22b 100644 --- a/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java +++ b/src/main/java/genepi/riskscore/io/scores/MergedRiskScoreCollection.java @@ -1,9 +1,11 @@ package genepi.riskscore.io.scores; +import genepi.io.table.reader.ITableReader; import genepi.riskscore.io.Chunk; import genepi.riskscore.io.RiskScoreFile; import genepi.riskscore.io.VariantFile; import genepi.riskscore.io.csv.CsvWithHeaderTableReader; +import genepi.riskscore.io.csv.TabixTableReader; import genepi.riskscore.io.formats.RiskScoreFormatFactory.RiskScoreFormat; import genepi.riskscore.model.ReferenceVariant; import genepi.riskscore.model.RiskScoreSummary; @@ -34,6 +36,8 @@ public class MergedRiskScoreCollection implements IRiskScoreCollection { public static String META_EXTENSION = ".info"; + public static String INDEX_EXTENSION = ".tbi"; + public static String COLUMN_CHROMOSOME = 
"chr_name"; public static String COLUMN_POSITION = "chr_position"; @@ -88,8 +92,10 @@ public void buildIndex(String chromosome, Chunk chunk, String dbsnp, String prox } readerMeta.close(); - - CsvWithHeaderTableReader reader = new CsvWithHeaderTableReader(filename, '\t'); + if (chunk == null){ + chunk = new Chunk(); + } + TabixTableReader reader = new TabixTableReader(filename, chromosome, chunk.getStart(), chunk.getEnd()); String[] columns = reader.getColumns(); numberRiskScores = columns.length - COLUMNS.size(); @@ -114,19 +120,6 @@ public void buildIndex(String chromosome, Chunk chunk, String dbsnp, String prox String _chromosome = reader.getString(COLUMN_CHROMOSOME); int position = reader.getInteger(COLUMN_POSITION); - if (!_chromosome.equals(chromosome)){ - continue; - } - if (chunk != null) { - if (position < chunk.getStart()) { - continue; - } - - if (position > chunk.getEnd()) { - break; - } - } - String otherAllele = reader.getString(COLUMN_OTHER_ALLELE); String effectAllele = reader.getString(COLUMN_EFFECT_ALLELE);