-
-
Notifications
You must be signed in to change notification settings - Fork 93
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
12ad7e7
commit 6fe0e33
Showing
6 changed files
with
404 additions
and
4 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,9 @@ | ||
# Changelog | ||
|
||
## 0.12.0 (provisional) | ||
|
||
* Adding `InvertedIndex`. | ||
|
||
## 0.11.0 | ||
|
||
* Adding a bunch of set functions. | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,258 @@ | ||
/** | ||
* Mnemonist Inverted Index | ||
* ========================= | ||
* | ||
* JavaScript implementation of an inverted index. | ||
*/ | ||
var iterateOver = require('./utils/iterate.js'), | ||
helpers = require('./set.js'); | ||
|
||
var intersect = helpers.intersect; | ||
|
||
/**
 * Fallback tokenizer handing its argument back untouched.
 *
 * @param  {any} x - Any value.
 * @return {any}   - The very same value.
 */
function identity(x) {
  return x;
}
|
||
/**
 * InvertedIndex.
 *
 * @constructor
 * @param {function|Array} [descriptor] - Either a single tokenizer function
 *   used for both documents & queries, or an array whose first item is the
 *   document tokenizer and whose second item is the query tokenizer. Any
 *   missing (falsy) tokenizer falls back to the identity function.
 * @throws {Error} - If a given tokenizer is truthy but not a function.
 */
function InvertedIndex(descriptor) {
  this.clear();

  if (Array.isArray(descriptor)) {
    this.documentTokenizer = descriptor[0];
    this.queryTokenizer = descriptor[1];
  }
  else {
    this.documentTokenizer = descriptor;
    this.queryTokenizer = descriptor;
  }

  // Falsy tokenizers are replaced by the identity function
  if (!this.documentTokenizer)
    this.documentTokenizer = identity;
  if (!this.queryTokenizer)
    this.queryTokenizer = identity;

  if (typeof this.documentTokenizer !== 'function')
    throw new Error('mnemonist/InvertedIndex.constructor: document tokenizer is not a function.');

  if (typeof this.queryTokenizer !== 'function')
    throw new Error('mnemonist/InvertedIndex.constructor: query tokenizer is not a function.');
}
|
||
/**
 * Method used to clear the InvertedIndex, i.e. to reset every piece of
 * internal state to its empty value.
 *
 * @return {undefined}
 */
InvertedIndex.prototype.clear = function() {

  // Properties
  this.items = [];          // Stored documents, their position being their key
  this.mapping = new Map(); // token => Set of document keys
  this.size = 0;            // Number of documents
  this.dimension = 0;       // Number of distinct tokens
};
|
||
/**
 * Method used to add a document to the index.
 *
 * The document is tokenized & validated before anything is stored, so that
 * a misbehaving tokenizer cannot leave the index in a half-updated state.
 *
 * @param  {any} doc - Item to add.
 * @return {InvertedIndex}
 * @throws {Error} - If the document tokenizer does not return an array.
 */
InvertedIndex.prototype.add = function(doc) {

  // Tokenizing the document first: nothing is mutated if this fails
  var tokens = this.documentTokenizer(doc);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.add: tokenizer function should return an array of tokens.');

  // Storing document, its key being its position in the items array
  var key = this.items.length;
  this.items.push(doc);

  // Increasing size
  this.size++;

  // Indexing
  var token,
      container;

  for (var i = 0, l = tokens.length; i < l; i++) {
    token = tokens[i];
    container = this.mapping.get(token);

    // First time we see this token
    if (!container) {
      container = new Set();
      this.mapping.set(token, container);
    }

    container.add(key);
  }

  this.dimension = this.mapping.size;

  return this;
};
|
||
/**
 * Method used to query the index.
 *
 * Always returns an array, which is empty when the index holds no document,
 * when the query yields no token or when at least one token of the query
 * is unknown to the index.
 *
 * @param  {any} query - Query
 * @return {Array}     - Documents matching every token of the query.
 * @throws {Error}     - If the query tokenizer does not return an array.
 */
InvertedIndex.prototype.get = function(query) {

  // Early termination
  if (!this.size)
    return [];

  // First we need to tokenize the query
  var tokens = this.queryTokenizer(query);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.get: tokenizer function should return an array of tokens.');

  // A query without any token cannot match anything
  if (!tokens.length)
    return [];

  var matchingSet,
      token,
      set,
      i,
      l;

  for (i = 0, l = tokens.length; i < l; i++) {
    token = tokens[i];
    set = this.mapping.get(token);

    // Empty intersection
    if (!set || !set.size)
      return [];

    // Copying the first set so that the index's own sets are never mutated.
    // NOTE: `intersect` is expected to narrow its first argument in place,
    // since its return value is not used here.
    if (!matchingSet)
      matchingSet = new Set(set);
    else
      intersect(matchingSet, set);

    // No need to go further once nothing matches anymore
    if (!matchingSet.size)
      return [];
  }

  // Resolving the matching keys into their documents
  var items = this.items,
      results = new Array(matchingSet.size);

  i = 0;

  matchingSet.forEach(function(key) {
    results[i++] = items[key];
  });

  return results;
};
|
||
/**
 * Method used to iterate over each of the documents.
 *
 * The callback receives the document, its position & the index itself.
 *
 * @param  {function}  callback - Function to call for each item.
 * @param  {object}    scope    - Optional scope (defaults to the index).
 * @return {undefined}
 */
InvertedIndex.prototype.forEach = function(callback, scope) {
  scope = arguments.length > 1 ? scope : this;

  // Documents are stored in `items` (`documents` is the iterator method)
  var items = this.items;

  for (var i = 0, l = items.length; i < l; i++)
    callback.call(scope, items[i], i, this);
};
|
||
/**
 * InvertedIndex Iterator class.
 *
 * @constructor
 * @param {function} next - Function stored as the iterator's `next` method.
 */
function InvertedIndexIterator(next) {
  this.next = next;
}
|
||
/**
 * Method returning an iterator over the index's documents.
 *
 * The number of documents is read once, when the iterator is created:
 * documents added afterwards are not yielded by this iterator.
 *
 * @return {InvertedIndexIterator}
 */
InvertedIndex.prototype.documents = function() {
  var documents = this.items,
      l = documents.length,
      i = 0;

  return new InvertedIndexIterator(function() {

    // Exhausted
    if (i >= l)
      return {
        done: true
      };

    var value = documents[i++];

    return {
      value: value,
      done: false
    };
  });
};
|
||
/**
 * Method returning an iterator over the index's tokens.
 *
 * The returned object is the key iterator of the underlying mapping, whose
 * `constructor` property is overridden to point to InvertedIndexIterator.
 *
 * @return {InvertedIndexIterator}
 */
InvertedIndex.prototype.tokens = function() {
  var iterator = this.mapping.keys();

  // Non-enumerable own property shadowing the native iterator's constructor
  Object.defineProperty(iterator, 'constructor', {
    value: InvertedIndexIterator,
    enumerable: false
  });

  return iterator;
};
|
||
/**
 * Attaching the #.documents method to Symbol.iterator if possible.
 */
if (typeof Symbol !== 'undefined')
  InvertedIndex.prototype[Symbol.iterator] = InvertedIndex.prototype.documents;
|
||
// TODO: fuzzy inverted index | ||
|
||
/**
 * Convenience known methods.
 *
 * #.inspect returns a shallow copy of the stored documents whose
 * `constructor` property points to InvertedIndex.
 *
 * @return {Array}
 */
InvertedIndex.prototype.inspect = function() {

  // Documents are stored in `items` (`documents` is the iterator method)
  var array = this.items.slice();

  // Trick so that node displays the name of the constructor
  Object.defineProperty(array, 'constructor', {
    value: InvertedIndex,
    enumerable: false
  });

  return array;
};
|
||
/**
 * Static @.from function taking an arbitrary iterable & converting it into
 * an InvertedIndex.
 *
 * @param  {Iterable}       iterable   - Target iterable.
 * @param  {function|Array} descriptor - Tokenizer function, or array holding
 *                                       the document & query tokenizers,
 *                                       passed as-is to the constructor.
 * @return {InvertedIndex}
 */
InvertedIndex.from = function(iterable, descriptor) {
  var index = new InvertedIndex(descriptor);

  // Every item of the iterable is added as a document
  iterateOver(iterable, function(doc) {
    index.add(doc);
  });

  return index;
};
|
||
/** | ||
* Exporting. | ||
*/ | ||
module.exports = InvertedIndex; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.