Skip to content

Commit

Permalink
Merge pull request #411 from hammerlab/issue-410
Browse files Browse the repository at this point in the history
Better handle 2bit files with huge headers
  • Loading branch information
ihodes committed May 9, 2016
2 parents e768565 + 0efc85c commit 775e35c
Show file tree
Hide file tree
Showing 6 changed files with 102 additions and 10 deletions.
2 changes: 1 addition & 1 deletion .travis.yml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
sudo: false # Use container-based infrastructure
language: node_js
node_js:
- "0.12"
- "5.1"

script: >
npm run build &&
Expand Down
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@
"mocha-phantomjs-istanbul": "0.0.2",
"number-to-locale-string": "^1.0.0",
"parse-data-uri": "^0.2.0",
"phantomjs": "^1.9.17",
"phantomjs": "1.9.17",
"prepush-hook": "^0.1.0",
"react-addons-test-utils": "^0.14.0",
"sinon": "^1.12.2",
Expand Down
99 changes: 92 additions & 7 deletions src/main/data/TwoBit.js
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,18 @@ var BASE_PAIRS = [
'G' // 3=11
];

/**
The following chunk sizes are optimized against
the human reference genome (hg19.2bit). Assuming that
pileup.js is mostly being used for human genome,
increasing the following numbers might cause unnecessary
network traffic and might also break our unit tests
that make use of mapped 2bit files.
*/
var FIRST_HEADER_CHUNKSIZE = 16 * 1024, // 16 KB
FIRST_SEQUENCE_CHUNKSIZE = (4 * 1024) - 1, // ~ 4KB
MAX_CHUNKSIZE = 1024 * 1024; // 1 MB

type FileIndexEntry = {
name: string;
offset: number;
Expand Down Expand Up @@ -125,6 +137,66 @@ function markUnknownDNA(basePairs: Array<string>, dnaStartIndex: number, sequenc
}


/**
* An umbrella error type to describe issues with parsing an
* incomplete chunk of data with JBinary's read. If this is being
* raised, we either need to ask for more data (a bigger chunk) or
* report to the user that there might be a problem with the 2bit
* file, specifically with its header.
*/
function IncompleteChunkError(message) {
this.name = "IncompleteChunkError";
this.message = (message || "");
}
IncompleteChunkError.prototype = Error.prototype;

/**
* Wraps a parsing attempt, captures errors related to
* incomplete data and re-throws a specialized error:
* IncompleteChunkError. Otherwise, whatever other error
* is being raised gets escalated.
*/
function parseWithException(parseFunc: Function) {
try {
return parseFunc();
} catch(error) {
// Chrome-like browsers: RangeError; phantomjs: DOMException
if (error.name == "RangeError" || error.name == "INDEX_SIZE_ERR") {
console.log(`Error name: ${error.name}`);
throw new IncompleteChunkError(error);
} else {
throw error;
}
}
}

/**
* Try getting a bigger chunk of the remote file
* until the Incomplete Chunk Error is resolved. This is useful when we need to
* parse the header, but when we don't know the size of the header up front.
If the initial request returns an incomplete header and hence the
* parsing fails, we next try doubling the requested size.
* The size of the request is capped with `untilSize` so that
* we don't start asking for MBs of data for no use.
Instead we throw an error if we reach the cap,
* potentially meaning a corrupted 2bit file.
*/
function retryRemoteGet(remoteFile: RemoteFile, start: number, size: number, untilSize: number, promisedFunc: Function) {
return remoteFile.getBytes(start, size).then(promisedFunc).catch(error => {
if(error.name == "IncompleteChunkError") {
// Do not attempt to download more than `untilSize`
if(size > untilSize) {
throw `Couldn't parse the header ` +
`from the first ${size} bytes of the file. ` +
`Corrupted 2bit file?`;
}
return retryRemoteGet(remoteFile, start, size*2, untilSize, promisedFunc);
} else {
throw error;
}
});
}

class TwoBit {
remoteFile: RemoteFile;
header: Q.Promise<TwoBitHeader>;
Expand All @@ -133,10 +205,15 @@ class TwoBit {
this.remoteFile = remoteFile;
var deferredHeader = Q.defer();
this.header = deferredHeader.promise;

// TODO: if 16k is insufficient, fetch the right amount.
this.remoteFile.getBytes(0, 16*1024).then(function(buffer) {
var header = parseHeader(buffer);
retryRemoteGet(
this.remoteFile,
0, // Beginning of the file
FIRST_HEADER_CHUNKSIZE,
MAX_CHUNKSIZE,
buffer => {
var header = parseWithException(() => {
return parseHeader(buffer);
});
deferredHeader.resolve(header);
}).done();
}
Expand Down Expand Up @@ -178,9 +255,17 @@ class TwoBit {
}
var seq = maybeSeq; // for flow, see facebook/flow#266

// TODO: if 4k is insufficient, fetch the right amount.
return this.remoteFile.getBytes(seq.offset, 4095).then(
buf => parseSequenceRecord(buf, seq.offset));
return retryRemoteGet(
this.remoteFile,
seq.offset,
FIRST_SEQUENCE_CHUNKSIZE,
MAX_CHUNKSIZE,
buffer => {
return parseWithException(() => {
return parseSequenceRecord(buffer, seq.offset);
});
}
);
});
}
}
Expand Down
1 change: 0 additions & 1 deletion src/main/sources/TwoBitDataSource.js
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,6 @@ var createFromTwoBitFile = function(remoteSource: TwoBit): TwoBitSource {
return Q.when(); // empty promise
}

console.log(`Fetching ${span} base pairs`);
remoteSource.getFeaturesInRange(range.contig, range.start(), range.stop())
.then(letters => {
if (!letters) return;
Expand Down
8 changes: 8 additions & 0 deletions src/test/data/TwoBit-test.js
Original file line number Diff line number Diff line change
Expand Up @@ -46,5 +46,13 @@ describe('TwoBit', function() {
});
});

// Regression test for issue #410: a header larger than the initial
// 16 KB request must trigger progressively larger fetches instead of
// failing to parse.
it('should parse huge headers', function() {
var twoBit = new TwoBit(new RemoteFile('/test-data/susScr3-head.2bit'));
// shouldn't throw an exception
return twoBit.header.then(header => {
expect(header.sequenceCount).to.equal(4583);
});
});

// TODO: masked regions
});
Binary file added test-data/susScr3-head.2bit
Binary file not shown.

0 comments on commit 775e35c

Please sign in to comment.