Skip to content

Commit

Permalink
Working on inverted index
Browse files Browse the repository at this point in the history
  • Loading branch information
Yomguithereal committed Apr 13, 2017
1 parent 12ad7e7 commit 6fe0e33
Show file tree
Hide file tree
Showing 6 changed files with 404 additions and 4 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,9 @@
# Changelog

## 0.12.0 (provisional)

* Adding `InvertedIndex`.

## 0.11.0

* Adding a bunch of set functions.
Expand Down
258 changes: 258 additions & 0 deletions inverted-index.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,258 @@
/**
* Mnemonist Inverted Index
* =========================
*
* JavaScript implementation of an inverted index.
*/
var iterateOver = require('./utils/iterate.js'),
helpers = require('./set.js');

var intersect = helpers.intersect;

/**
 * Fallback tokenizer: hands back whatever it receives, untouched.
 *
 * @param  {any} value - Any value.
 * @return {any}       - The very same value.
 */
function identity(value) {
  return value;
}

/**
 * InvertedIndex.
 *
 * The index starts out empty (state is initialised through `#.clear`).
 *
 * @constructor
 * @param {function|Array<function>} [descriptor] - Either a single tokenizer
 *   used for both documents and queries, or a `[documentTokenizer,
 *   queryTokenizer]` pair. Any missing (falsy) tokenizer falls back to
 *   `identity`.
 * @throws {Error} - If a provided tokenizer is not a function.
 */
function InvertedIndex(descriptor) {
  this.clear();

  // Unpacking the descriptor: a pair gives distinct tokenizers, anything
  // else is used for both roles.
  var isPair = Array.isArray(descriptor),
      documentTokenizer = isPair ? descriptor[0] : descriptor,
      queryTokenizer = isPair ? descriptor[1] : descriptor;

  // Missing tokenizers default to the identity function
  this.documentTokenizer = documentTokenizer || identity;
  this.queryTokenizer = queryTokenizer || identity;

  if (typeof this.documentTokenizer !== 'function') {
    throw new Error('mnemonist/InvertedIndex.constructor: document tokenizer is not a function.');
  }

  if (typeof this.queryTokenizer !== 'function') {
    throw new Error('mnemonist/InvertedIndex.constructor: query tokenizer is not a function.');
  }
}

/**
* Method used to clear the InvertedIndex.
*
* Resets every piece of internal state to its empty value. It is also
* called by the constructor to initialise a fresh instance.
*
* @return {undefined}
*/
InvertedIndex.prototype.clear = function() {

// Properties
// `items` stores the documents; a document's position in this array is the
// key recorded in the posting sets (see `#.add`).
this.items = [];
// `mapping` associates each token with the Set of document keys holding it.
this.mapping = new Map();
// `size` counts the stored documents.
this.size = 0;
// `dimension` mirrors `mapping.size`, i.e. the number of distinct tokens.
this.dimension = 0;
};

/**
 * Method used to add a document to the index.
 *
 * The document is tokenized with the document tokenizer and its position in
 * the internal storage is registered in the posting set of each token.
 *
 * @param  {any} doc - Item to add.
 * @return {InvertedIndex} - The index itself, for chaining.
 * @throws {Error} - If the tokenizer does not return an array. In that case
 *   the index is left untouched.
 */
InvertedIndex.prototype.add = function(doc) {

  // Tokenizing & validating *before* mutating anything, so that a faulty
  // tokenizer cannot leave a stored-but-unindexed document behind.
  var tokens = this.documentTokenizer(doc);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.add: tokenizer function should return an array of tokens.');

  // Storing document: its key is its position in the storage array
  var key = this.items.length;
  this.items.push(doc);

  // Increasing size
  this.size++;

  // Indexing
  var token,
      container;

  for (var i = 0, l = tokens.length; i < l; i++) {
    token = tokens[i];
    container = this.mapping.get(token);

    // First time this token is seen: creating its posting set
    if (!container) {
      container = new Set();
      this.mapping.set(token, container);
    }

    container.add(key);
  }

  this.dimension = this.mapping.size;

  return this;
};

/**
 * Method used to query the index.
 *
 * The query is tokenized with the query tokenizer and the documents
 * containing *every* token are returned.
 *
 * @param  {any} query - Query.
 * @return {Array} - Documents matching all of the query's tokens. Always an
 *   array: empty if the index is empty, if the query yields no token or if
 *   no document matches.
 * @throws {Error} - If the tokenizer does not return an array.
 */
InvertedIndex.prototype.get = function(query) {

  // Early termination
  if (!this.size)
    return [];

  // First we need to tokenize the query
  var tokens = this.queryTokenizer(query);

  if (!Array.isArray(tokens))
    throw new Error('mnemonist/InvertedIndex.get: tokenizer function should return an array of tokens.');

  // A query without tokens cannot match anything
  if (!tokens.length)
    return [];

  var matchingSet,
      set,
      i,
      l;

  for (i = 0, l = tokens.length; i < l; i++) {
    set = this.mapping.get(tokens[i]);

    // Empty intersection
    if (!set || !set.size)
      return [];

    if (!matchingSet) {

      // Copying so that the index's own posting set is never mutated
      matchingSet = new Set(set);
    }
    else {

      // NOTE: `intersect` is expected to prune `matchingSet` in place, since
      // its return value is not used.
      intersect(matchingSet, set);

      // Nothing left to match, no need to check the remaining tokens
      if (!matchingSet.size)
        return [];
    }
  }

  // Resolving the matching keys into the actual documents
  var items = this.items,
      results = [];

  matchingSet.forEach(function(key) {
    results.push(items[key]);
  });

  return results;
};

/**
 * Method used to iterate over each of the documents.
 *
 * The callback receives `(document, position, index)`.
 *
 * @param  {function}  callback - Function to call for each item.
 * @param  {object}    scope    - Optional scope, defaulting to the index.
 * @return {undefined}
 */
InvertedIndex.prototype.forEach = function(callback, scope) {
  scope = arguments.length > 1 ? scope : this;

  // Documents live in `items`; `documents` is the iterator-returning method
  // and must not be read as an array here.
  var items = this.items;

  for (var i = 0, l = items.length; i < l; i++)
    callback.call(scope, items[i], i, this);
};

/**
 * InvertedIndex Iterator class.
 *
 * Thin wrapper exposing the given function as the iterator's `next` method.
 *
 * @constructor
 * @param {function} step - Function returning the successive iteration steps.
 */
function InvertedIndexIterator(step) {
  this.next = step;
}

/**
 * Method returning an iterator over the index's documents.
 *
 * The number of documents is captured when the iterator is created:
 * documents added afterwards are not visited.
 *
 * @return {InvertedIndexIterator}
 */
InvertedIndex.prototype.documents = function() {
  var items = this.items,
      limit = items.length,
      cursor = 0;

  return new InvertedIndexIterator(function() {

    // Still something to yield
    if (cursor < limit) {
      return {
        value: items[cursor++],
        done: false
      };
    }

    // Exhausted
    return {
      done: true
    };
  });
};

/**
 * Method returning an iterator over the index's tokens.
 *
 * The result is the native iterator over the mapping's keys, tagged with a
 * non-enumerable `constructor` property pointing to InvertedIndexIterator.
 *
 * @return {InvertedIndexIterator}
 */
InvertedIndex.prototype.tokens = function() {
  var tag = {
    value: InvertedIndexIterator,
    enumerable: false
  };

  // `Object.defineProperty` returns the object it was given
  return Object.defineProperty(this.mapping.keys(), 'constructor', tag);
};

/**
 * Attaching the #.documents method to Symbol.iterator if possible, so that
 * an index can be consumed as an iterable of its documents.
 */
if (typeof Symbol !== 'undefined') {
  InvertedIndex.prototype[Symbol.iterator] = InvertedIndex.prototype.documents;
}

// TODO: fuzzy inverted index

/**
 * Convenience known methods.
 */

/**
 * Method returning a representation of the index for display purposes:
 * a shallow copy of the stored documents, tagged with the InvertedIndex
 * constructor.
 *
 * @return {Array} - Copy of the documents.
 */
InvertedIndex.prototype.inspect = function() {

  // Documents live in `items`; `documents` is the iterator-returning method
  // and has no `slice`.
  var array = this.items.slice();

  // Trick so that node displays the name of the constructor
  Object.defineProperty(array, 'constructor', {
    value: InvertedIndex,
    enumerable: false
  });

  return array;
};

/**
 * Static @.from function taking an arbitrary iterable & converting it into
 * an InvertedIndex.
 *
 * @param  {Iterable} iterable - Target iterable.
 * @param  {function|Array<function>} descriptor - Tokenizer descriptor,
 *   handed as-is to the InvertedIndex constructor.
 * @return {InvertedIndex}
 */
InvertedIndex.from = function(iterable, descriptor) {
  var instance = new InvertedIndex(descriptor);

  // Only the document is forwarded to #.add
  iterateOver(iterable, function(item) {
    instance.add(item);
  });

  return instance;
};

/**
* Exporting.
*/
module.exports = InvertedIndex;
1 change: 0 additions & 1 deletion multi-index.js
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,6 @@ MultiIndex.prototype.forEach = function(callback, scope) {
});
};


/**
* MultiIndex Iterator class.
*/
Expand Down
1 change: 1 addition & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
"damerau-levenshtein": "^1.0.3",
"eslint": "^3.8.1",
"leven": "^2.0.0",
"lodash": "^4.17.4",
"mocha": "^3.1.2"
},
"eslintConfig": {
Expand Down
18 changes: 15 additions & 3 deletions set.js
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ exports.intersection = function() {
if (arguments.length < 2)
throw new Error('mnemonist/Set.intersection: needs at least two arguments.');

var I = new Set();

// First we need to find the smallest set
var smallestSize = Infinity,
smallestSet = null;
Expand All @@ -24,15 +26,17 @@ exports.intersection = function() {
for (i = 0; i < l; i++) {
s = arguments[i];

// If one of the set has no items, we can stop right there
if (s.size === 0)
return I;

if (s.size < smallestSize) {
smallestSize = s.size;
smallestSet = s;
}
}

// Now we need to interset this set with the others
var I = new Set();

// Now we need to intersect this set with the others
var iterator = smallestSet.values(),
step,
item,
Expand Down Expand Up @@ -97,6 +101,14 @@ exports.union = function() {
* @return {Set} - The difference.
*/
exports.difference = function(A, B) {

// If first set is empty
if (!A.size)
return new Set();

if (!B.size)
return new Set(A);

var D = new Set();

var iterator = A.values(),
Expand Down
Loading

0 comments on commit 6fe0e33

Please sign in to comment.