diff --git a/CHANGELOG.md b/CHANGELOG.md index 6ae3892f..cda0c04b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,9 @@ # Changelog +## 0.12.0 (provisional) + +* Adding `InvertedIndex`. + ## 0.11.0 * Adding bunch of set functions. diff --git a/inverted-index.js b/inverted-index.js new file mode 100644 index 00000000..c19c7abb --- /dev/null +++ b/inverted-index.js @@ -0,0 +1,258 @@ +/** + * Mnemonist Inverted Index + * ========================= + * + * JavaScript implementation of an inverted index. + */ +var iterateOver = require('./utils/iterate.js'), + helpers = require('./set.js'); + +var intersect = helpers.intersect; + +function identity(x) { + return x; +} + +/** + * InvertedIndex. + * + * @constructor + * @param {function} tokenizer - Tokenizer function. + */ +function InvertedIndex(descriptor) { + this.clear(); + + if (Array.isArray(descriptor)) { + this.documentTokenizer = descriptor[0]; + this.queryTokenizer = descriptor[1]; + } + else { + this.documentTokenizer = descriptor; + this.queryTokenizer = descriptor; + } + + if (!this.documentTokenizer) + this.documentTokenizer = identity; + if (!this.queryTokenizer) + this.queryTokenizer = identity; + + if (typeof this.documentTokenizer !== 'function') + throw new Error('mnemonist/InvertedIndex.constructor: document tokenizer is not a function.'); + + if (typeof this.queryTokenizer !== 'function') + throw new Error('mnemonist/InvertedIndex.constructor: query tokenizer is not a function.'); +} + +/** + * Method used to clear the InvertedIndex. + * + * @return {undefined} + */ +InvertedIndex.prototype.clear = function() { + + // Properties + this.items = []; + this.mapping = new Map(); + this.size = 0; + this.dimension = 0; +}; + +/** + * Method used to add a document to the index. + * + * @param {any} doc - Item to add. 
+ * @return {InvertedIndex}
+ */
+InvertedIndex.prototype.add = function(doc) {
+
+  // Increasing size
+  this.size++;
+
+  // Storing document
+  var key = this.items.length;
+  this.items.push(doc);
+
+  // Tokenizing the document
+  var tokens = this.documentTokenizer(doc);
+
+  if (!Array.isArray(tokens))
+    throw new Error('mnemonist/InvertedIndex.add: tokenizer function should return an array of tokens.');
+
+  // Indexing
+  var token,
+      container;
+
+  for (var i = 0, l = tokens.length; i < l; i++) {
+    token = tokens[i];
+    container = this.mapping.get(token);
+
+    if (!container) {
+      container = new Set();
+      this.mapping.set(token, container);
+    }
+
+    container.add(key);
+  }
+
+  this.dimension = this.mapping.size;
+
+  return this;
+};
+
+/**
+ * Method used to query the index.
+ *
+ * @param {any} query - Query
+ * @return {array} - Documents matching every token of the query.
+ */
+InvertedIndex.prototype.get = function(query) {
+
+  // Early termination
+  if (!this.size)
+    return [];
+
+  // First we need to tokenize the query
+  var tokens = this.queryTokenizer(query);
+
+  if (!Array.isArray(tokens))
+    throw new Error('mnemonist/InvertedIndex.get: tokenizer function should return an array of tokens.');
+
+  var matchingSet,
+      token,
+      set,
+      i,
+      l;
+
+  for (i = 0, l = tokens.length; i < l; i++) {
+    token = tokens[i];
+    set = this.mapping.get(token);
+
+    // Empty intersection
+    if (!set || !set.size)
+      return [];
+
+    if (!matchingSet)
+      matchingSet = new Set(set);
+    else
+      intersect(matchingSet, set);
+  }
+
+  // An empty query (no tokens) never built a matching set: return no documents
+  var results = matchingSet ? new Array(matchingSet.size) : [],
+      iterator = (matchingSet || new Set()).values(),
+      step;
+
+  i = 0;
+
+  while ((step = iterator.next(), !step.done))
+    results[i++] = this.items[step.value];
+
+  return results;
+};
+
+/**
+ * Method used to iterate over each of the documents.
+ *
+ * @param {function} callback - Function to call for each item.
+ * @param {object} scope - Optional scope.
+ * @return {undefined}
+ */
+InvertedIndex.prototype.forEach = function(callback, scope) {
+  scope = arguments.length > 1 ? scope : this;
+
+  // NOTE: documents are stored in #.items; #.documents is the iterator method
+  for (var i = 0, l = this.items.length; i < l; i++)
+    callback.call(scope, this.items[i], i, this);
+};
+
+/**
+ * InvertedIndex Iterator class.
+ */
+function InvertedIndexIterator(next) {
+  this.next = next;
+}
+
+/**
+ * Method returning an iterator over the index's documents.
+ *
+ * @return {InvertedIndexIterator}
+ */
+InvertedIndex.prototype.documents = function() {
+  var documents = this.items,
+      l = documents.length,
+      i = 0;
+
+  return new InvertedIndexIterator(function() {
+    if (i >= l)
+      return {
+        done: true
+      };
+
+    var value = documents[i++];
+
+    return {
+      value: value,
+      done: false
+    };
+  });
+};
+
+/**
+ * Method returning an iterator over the index's tokens.
+ *
+ * @return {InvertedIndexIterator}
+ */
+InvertedIndex.prototype.tokens = function() {
+  var iterator = this.mapping.keys();
+
+  Object.defineProperty(iterator, 'constructor', {
+    value: InvertedIndexIterator,
+    enumerable: false
+  });
+
+  return iterator;
+};
+
+/**
+ * Attaching the #.documents method to Symbol.iterator if possible.
+ */
+if (typeof Symbol !== 'undefined')
+  InvertedIndex.prototype[Symbol.iterator] = InvertedIndex.prototype.documents;
+
+// TODO: fuzzy inverted index
+
+/**
+ * Convenience known methods.
+ */
+InvertedIndex.prototype.inspect = function() {
+  var array = this.items.slice();
+
+  // Trick so that node displays the name of the constructor
+  Object.defineProperty(array, 'constructor', {
+    value: InvertedIndex,
+    enumerable: false
+  });
+
+  return array;
+};
+
+/**
+ * Static @.from function taking an arbitrary iterable & converting it into
+ * an InvertedIndex.
+ *
+ * @param {Iterable} iterable - Target iterable.
+ * @param {function|array} descriptor - Tokenizer function(s).
+ * @return {InvertedIndex} + */ +InvertedIndex.from = function(iterable, descriptor) { + var index = new InvertedIndex(descriptor); + + iterateOver(iterable, function(doc) { + index.add(doc); + }); + + return index; +}; + +/** + * Exporting. + */ +module.exports = InvertedIndex; diff --git a/multi-index.js b/multi-index.js index 194da1aa..6f8df2c6 100644 --- a/multi-index.js +++ b/multi-index.js @@ -113,7 +113,6 @@ MultiIndex.prototype.forEach = function(callback, scope) { }); }; - /** * MultiIndex Iterator class. */ diff --git a/package.json b/package.json index 422ed158..5a6cb67b 100644 --- a/package.json +++ b/package.json @@ -50,6 +50,7 @@ "damerau-levenshtein": "^1.0.3", "eslint": "^3.8.1", "leven": "^2.0.0", + "lodash": "^4.17.4", "mocha": "^3.1.2" }, "eslintConfig": { diff --git a/set.js b/set.js index 08a7f8c0..3abfcd1c 100644 --- a/set.js +++ b/set.js @@ -15,6 +15,8 @@ exports.intersection = function() { if (arguments.length < 2) throw new Error('mnemonist/Set.intersection: needs at least two arguments.'); + var I = new Set(); + // First we need to find the smallest set var smallestSize = Infinity, smallestSet = null; @@ -24,15 +26,17 @@ exports.intersection = function() { for (i = 0; i < l; i++) { s = arguments[i]; + // If one of the set has no items, we can stop right there + if (s.size === 0) + return I; + if (s.size < smallestSize) { smallestSize = s.size; smallestSet = s; } } - // Now we need to interset this set with the others - var I = new Set(); - + // Now we need to intersect this set with the others var iterator = smallestSet.values(), step, item, @@ -97,6 +101,14 @@ exports.union = function() { * @return {Set} - The difference. 
*/ exports.difference = function(A, B) { + + // If first set is empty + if (!A.size) + return new Set(); + + if (!B.size) + return new Set(A); + var D = new Set(); var iterator = A.values(), diff --git a/test/inverted-index.js b/test/inverted-index.js new file mode 100644 index 00000000..938800cf --- /dev/null +++ b/test/inverted-index.js @@ -0,0 +1,126 @@ +/* eslint no-new: 0 */ +/** + * Mnemonist Inverted Index Unit Tests + * ==================================== + */ +var assert = require('assert'), + InvertedIndex = require('../inverted-index.js'), + words = require('lodash/words'); + +var STOPWORDS = new Set(['a', 'the', 'i', 'is', 'to']); + +var stemmer = function(word) { + return word.replace(/s$/, ''); +}; + +var tokenizer = function(text) { + return words(text.toLowerCase()) + .filter(function(word) { + return !STOPWORDS.has(word); + }) + .map(stemmer); +}; + +var documentTokenizer = function(doc) { + return tokenizer(doc.text); +}; + +var DOCS = [ + 'The cat eats the mouse.', + 'The mouse likes cheese.', + 'Cheese is something mouses really like to eat.' 
+]; + +var OBJECT_DOCS = DOCS.map(function(text) { + return {text: text}; +}); + +describe('InvertedIndex', function() { + + it('should throw if given invalid tokenizer function.', function() { + + assert.throws(function() { + new InvertedIndex({hello: 'world'}); + }, /tokenizer/); + }); + + it('should throw if the tokenizer does not return an array.', function() { + assert.throws(function() { + var index = new InvertedIndex(); + + index.add(OBJECT_DOCS[0]); + }, /array/); + }); + + it('should be possible to add items to the index.', function() { + var index = new InvertedIndex(documentTokenizer); + + OBJECT_DOCS.forEach(function(doc) { + index.add(doc); + }); + + assert.strictEqual(index.size, 3); + assert.strictEqual(index.dimension, 7); + }); + + it('should be possible to create an index from an arbitrary iterable.', function() { + var index = InvertedIndex.from(OBJECT_DOCS, documentTokenizer); + + assert.strictEqual(index.size, 3); + assert.strictEqual(index.dimension, 7); + }); + + it('should be possible to query the index.', function() { + var index = InvertedIndex.from(DOCS, tokenizer); + + var results = index.get('A mouse.'); + assert.deepEqual(results, DOCS); + + results = index.get('cheese'); + assert.deepEqual(results, DOCS.slice(1)); + + results = index.get('The cat'); + assert.deepEqual(results, [DOCS[0]]); + + results = index.get('The cat likes'); + assert.deepEqual(results, []); + + results = index.get('really something'); + assert.deepEqual(results, DOCS.slice(-1)); + }); + + it('should be possible to iterate using #.forEach', function() { + var index = InvertedIndex.from(DOCS, tokenizer); + + index.forEach(function(doc, i, instance) { + assert.strictEqual(instance, index); + assert.strictEqual(doc, DOCS[i]); + }); + }); + + it('should be possible to create an iterator over documents.', function() { + var index = InvertedIndex.from(OBJECT_DOCS, documentTokenizer); + + var iterator = index.documents(); + + assert.deepEqual(iterator.next().value, 
OBJECT_DOCS[0]); + assert.deepEqual(iterator.next().value, OBJECT_DOCS[1]); + assert.deepEqual(iterator.next().value, OBJECT_DOCS[2]); + assert.strictEqual(iterator.next().done, true); + }); + + it('should be possible to create an iterator over tokens.', function() { + var index = InvertedIndex.from(OBJECT_DOCS, documentTokenizer); + + var iterator = index.tokens(); + + assert.deepEqual(iterator.next().value, 'cat'); + assert.deepEqual(iterator.next().value, 'eat'); + assert.deepEqual(iterator.next().value, 'mouse'); + assert.deepEqual(iterator.next().value, 'like'); + assert.deepEqual(iterator.next().value, 'cheese'); + assert.deepEqual(iterator.next().value, 'something'); + assert.deepEqual(iterator.next().value, 'really'); + assert.strictEqual(iterator.next().done, true); + }); +});