From bebaed0fda1aae3d8d94988566fca0e655867e32 Mon Sep 17 00:00:00 2001 From: Joel Lonbeck Date: Fri, 31 Aug 2018 14:54:05 -0700 Subject: [PATCH] reorganized index file --- src/WordMap.ts | 140 +++++++++++++++++++++++++++++++++++ src/__tests__/map.ts | 8 +- src/__tests__/titus_map.ts | 36 ++++++--- src/index.ts | 145 ++----------------------------------- 4 files changed, 176 insertions(+), 153 deletions(-) create mode 100644 src/WordMap.ts diff --git a/src/WordMap.ts b/src/WordMap.ts new file mode 100644 index 0000000..af6d927 --- /dev/null +++ b/src/WordMap.ts @@ -0,0 +1,140 @@ +import Lexer, {Token} from "wordmap-lexer"; +import AlignmentOccurrences from "./algorithms/AlignmentOccurrences"; +import AlignmentPosition from "./algorithms/AlignmentPosition"; +import CharacterLength from "./algorithms/CharacterLength"; +import NgramFrequency from "./algorithms/NgramFrequency"; +import NgramLength from "./algorithms/NgramLength"; +import PhrasePlausibility from "./algorithms/PhrasePlausibility"; +import Uniqueness from "./algorithms/Uniqueness"; +import Engine from "./Engine"; +import Alignment from "./structures/Alignment"; +import Ngram from "./structures/Ngram"; +import Prediction from "./structures/Prediction"; +import Suggestion from "./structures/Suggestion"; + +/** + * Multi-Lingual Word Alignment Prediction + */ +export default class WordMap { + private engine: Engine; + + constructor(opts = {}) { + + this.engine = new Engine(opts); + this.engine.registerAlgorithm(new NgramFrequency()); + this.engine.registerAlgorithm(new AlignmentPosition()); + this.engine.registerAlgorithm(new PhrasePlausibility()); + this.engine.registerAlgorithm(new NgramLength()); + this.engine.registerAlgorithm(new CharacterLength()); + this.engine.registerAlgorithm(new AlignmentOccurrences()); + this.engine.registerAlgorithm(new Uniqueness()); + } + + /** + * Adds an array of corpus + * @param {string[][]} corpus + */ + public appendCorpus(corpus: string[][]) { + for (const pair of corpus) { + this.appendCorpusString(pair[0], pair[1]); + } + } + + /** + * Add corpus to the MAP. + * These may be single sentences or multiple sentence delimited by new lines. + * @param {string} source + * @param {string} target + */ + public appendCorpusString(source: string, target: string) { + const sourceSentences = source.split("\n"); + const targetSentences = target.split("\n"); + const sourceTokens: Token[][] = []; + const targetTokens: Token[][] = []; + + for (const s of sourceSentences) { + sourceTokens.push(Lexer.tokenize(s)); + } + for (const s of targetSentences) { + targetTokens.push(Lexer.tokenize(s)); + } + + this.engine.addCorpus(sourceTokens, targetTokens); + } + + public appendSavedAlignments(alignments: Alignment[]) { + this.engine.addSavedAlignments(alignments); + } + + /** + * Appends some saved alignments. + * This may be multiple lines of text or a single line. + * + * @param {string} source - a string of source phrases separated by new lines + * @param {string} target - a string of target phrases separated by new lines + * @return {Alignment[]} an array of alignment objects (as a convenience) + */ + public appendSavedAlignmentsString(source: string, target: string): Alignment[] { + const alignments: Alignment[] = []; + const sourceLines = source.split("\n"); + const targetLines = target.split("\n"); + const sourceLinesLength = sourceLines.length; + if (sourceLinesLength !== targetLines.length) { + throw new Error("source and target lines must be the same length"); + } + for (let i = 0; i < sourceLinesLength; i++) { + const sourceTokens = Lexer.tokenize(sourceLines[i]); + const targetTokens = Lexer.tokenize(targetLines[i]); + alignments.push(new Alignment( + new Ngram(sourceTokens), + new Ngram(targetTokens) + )); + } + this.appendSavedAlignments(alignments); + return alignments; + } + + /** + * Predicts the word alignments between the sentences + * @param {string} sourceSentence + * @param {string} targetSentence + * @param {number} maxSuggestions + * @return {Suggestion[]} + */ + public predict(sourceSentence: string, targetSentence: string, maxSuggestions: number = 1): Suggestion[] { + const sourceTokens = Lexer.tokenize(sourceSentence); + const targetTokens = Lexer.tokenize(targetSentence); + + let predictions = this.engine.run(sourceTokens, targetTokens); + predictions = this.engine.score(predictions); + return Engine.suggest(predictions, maxSuggestions); + } + + /** + * Predicts word alignments between the sentences. + * Returns an array of suggestions that match the benchmark. + * + * @param {string} sourceSentence + * @param {string} targetSentence + * @param {Suggestion} benchmark + * @param {number} maxSuggestions + * @return {Suggestion[]} + */ + public predictWithBenchmark(sourceSentence: string, targetSentence: string, benchmark: Alignment[], maxSuggestions: number = 1): Suggestion[] { + const sourceTokens = Lexer.tokenize(sourceSentence); + const targetTokens = Lexer.tokenize(targetSentence); + + let predictions = this.engine.run(sourceTokens, targetTokens); + predictions = this.engine.score(predictions); + + const validPredictions: Prediction[] = []; + for (const p of predictions) { + for (const a of benchmark) { + if (a.key === p.alignment.key) { + validPredictions.push(p); + } + } + } + return Engine.suggest(validPredictions, maxSuggestions); + } +} diff --git a/src/__tests__/map.ts b/src/__tests__/map.ts index fb05a81..ec113d7 100644 --- a/src/__tests__/map.ts +++ b/src/__tests__/map.ts @@ -1,8 +1,8 @@ import * as fs from "fs-extra"; import * as path from "path"; -import WordMap from "../index"; import Alignment from "../structures/Alignment"; import {makeMockAlignment} from "../util/testUtils"; +import WordMap from "../WordMap"; describe("MAP", () => { @@ -170,7 +170,8 @@ describe("MAP", () => { ); const predictions = suggestions[0].getPredictions(); expect(predictions).toHaveLength(6); - expect(predictions[4].key).not.toEqual("n:φιλάνδρους->n:love:their:own:husbands"); + expect(predictions[4].key).not.toEqual( + "n:φιλάνδρους->n:love:their:own:husbands"); expect(predictions[5].key).toEqual("n:φιλοτέκνους->n:and:children"); }); @@ -185,7 +186,8 @@ describe("MAP", () => { ); const predictions = suggestions[0].getPredictions(); expect(predictions).toHaveLength(7); - expect(predictions[4].key).toEqual("n:φιλάνδρους->n:love:their:own:husbands"); + expect(predictions[4].key).toEqual( + "n:φιλάνδρους->n:love:their:own:husbands"); expect(predictions[6].key).toEqual("n:φιλοτέκνους->n:and:children"); }); }); diff --git a/src/__tests__/titus_map.ts b/src/__tests__/titus_map.ts index 2db2668..419add9 100644 --- a/src/__tests__/titus_map.ts +++ b/src/__tests__/titus_map.ts @@ -1,14 +1,14 @@ import * as fs from "fs-extra"; import * as path from "path"; -import WordMap from "../index"; import Alignment from "../structures/Alignment"; import {makeMockAlignment, scoreSuggestion} from "../util/testUtils"; +import WordMap from "../WordMap"; describe("MAP predictions in Titus", () => { const greek = path.join(__dirname, "fixtures/corpus/tit/greek.txt"); const english = path.join(__dirname, "fixtures/corpus/tit/english.txt"); const map = new WordMap(); - loadCorpus(map, greek, english); + loadCorpus(map, greek, english); it("predicts the first verse", () => { const unalignedPair = [ @@ -16,7 +16,10 @@ describe("MAP predictions in Titus", () => { "Paul a servant of God and an apostle of Jesus Christ for the faith of God s chosen people and the knowledge of the truth that agrees with godliness" ]; const suggestions = map.predict(unalignedPair[0], unalignedPair[1], 2); - const chapterOneAlignmentPath = path.join(__dirname, "fixtures/corpus/tit/alignmentData/1.json"); + const chapterOneAlignmentPath = path.join( + __dirname, + "fixtures/corpus/tit/alignmentData/1.json" + ); scoreSuggestion(suggestions[0], getAlignments(chapterOneAlignmentPath, 1)); console.log("suggestions\n", suggestions.map((s) => { return s.toString(); @@ -42,12 +45,24 @@ describe("MAP predictions in Titus", () => { benchmark.push(makeMockAlignment("ἀδελφοὺς", "brothers")); benchmark.push(makeMockAlignment("αὐτοῦ", "his")); - console.log("suggestions\n", map.predict(secondUnalignedPair[0], secondUnalignedPair[1], 2).map((s) => { - return s.toString(); - }).join("\n")); - console.log("benchmarks\n", map.predictWithBenchmark(secondUnalignedPair[0], secondUnalignedPair[1], benchmark, 2).map((s) => { - return s.toString(); - }).join("\n")); + console.log( + "suggestions\n", + map.predict(secondUnalignedPair[0], secondUnalignedPair[1], 2) + .map((s) => { + return s.toString(); + }) + .join("\n") + ); + console.log( + "benchmarks\n", + map.predictWithBenchmark(secondUnalignedPair[0], + secondUnalignedPair[1], + benchmark, + 2 + ).map((s) => { + return s.toString(); + }).join("\n") + ); }); }); @@ -73,6 +88,7 @@ function loadCorpus(map: WordMap, sourcePath: string, targetPath: string) { * @return {object} */ function getAlignments(filePath: string, verse: number): object { - const verseAlignments = JSON.parse(fs.readFileSync(filePath).toString("utf-8")); + const verseAlignments = JSON.parse(fs.readFileSync(filePath) + .toString("utf-8")); return verseAlignments[verse.toString()].alignments; } diff --git a/src/index.ts b/src/index.ts index 9cc242a..8654f6a 100644 --- a/src/index.ts +++ b/src/index.ts @@ -1,140 +1,5 @@ -import AlignmentOccurrences from "./algorithms/AlignmentOccurrences"; -import AlignmentPosition from "./algorithms/AlignmentPosition"; -import CharacterLength from "./algorithms/CharacterLength"; -import NgramFrequency from "./algorithms/NgramFrequency"; -import NgramLength from "./algorithms/NgramLength"; -import PhrasePlausibility from "./algorithms/PhrasePlausibility"; -import Uniqueness from "./algorithms/Uniqueness"; -import Engine from "./Engine"; -import Lexer, {Token} from "wordmap-lexer"; -import Alignment from "./structures/Alignment"; -import Ngram from "./structures/Ngram"; -import Prediction from "./structures/Prediction"; -import Suggestion from "./structures/Suggestion"; - -/** - * Multi-Lingual Word Alignment Prediction - */ -export default class WordMap { - private engine: Engine; - - constructor(opts = {}) { - - this.engine = new Engine(opts); - this.engine.registerAlgorithm(new NgramFrequency()); - this.engine.registerAlgorithm(new AlignmentPosition()); - this.engine.registerAlgorithm(new PhrasePlausibility()); - this.engine.registerAlgorithm(new NgramLength()); - this.engine.registerAlgorithm(new CharacterLength()); - this.engine.registerAlgorithm(new AlignmentOccurrences()); - this.engine.registerAlgorithm(new Uniqueness()); - } - - /** - * Adds an array of corpus - * @param {string[][]} corpus - */ - public appendCorpus(corpus: string[][]) { - for (const pair of corpus) { - this.appendCorpusString(pair[0], pair[1]); - } - } - - /** - * Add corpus to the MAP. - * These may be single sentences or multiple sentence delimited by new lines. - * @param {string} source - * @param {string} target - */ - public appendCorpusString(source: string, target: string) { - const sourceSentences = source.split("\n"); - const targetSentences = target.split("\n"); - const sourceTokens: Token[][] = []; - const targetTokens: Token[][] = []; - - for (const s of sourceSentences) { - sourceTokens.push(Lexer.tokenize(s)); - } - for (const s of targetSentences) { - targetTokens.push(Lexer.tokenize(s)); - } - - this.engine.addCorpus(sourceTokens, targetTokens); - } - - public appendSavedAlignments(alignments: Alignment[]) { - this.engine.addSavedAlignments(alignments); - } - - /** - * Appends some saved alignments. - * This may be multiple lines of text or a single line. - * - * @param {string} source - a string of source phrases separated by new lines - * @param {string} target - a string of target phrases separated by new lines - * @return {Alignment[]} an array of alignment objects (as a convenience) - */ - public appendSavedAlignmentsString(source: string, target: string): Alignment[] { - const alignments: Alignment[] = []; - const sourceLines = source.split("\n"); - const targetLines = target.split("\n"); - const sourceLinesLength = sourceLines.length; - if (sourceLinesLength !== targetLines.length) { - throw new Error("source and target lines must be the same length"); - } - for (let i = 0; i < sourceLinesLength; i++) { - const sourceTokens = Lexer.tokenize(sourceLines[i]); - const targetTokens = Lexer.tokenize(targetLines[i]); - alignments.push(new Alignment( - new Ngram(sourceTokens), - new Ngram(targetTokens) - )); - } - this.appendSavedAlignments(alignments); - return alignments; - } - - /** - * Predicts the word alignments between the sentences - * @param {string} sourceSentence - * @param {string} targetSentence - * @param {number} maxSuggestions - * @return {Suggestion[]} - */ - public predict(sourceSentence: string, targetSentence: string, maxSuggestions: number = 1): Suggestion[] { - const sourceTokens = Lexer.tokenize(sourceSentence); - const targetTokens = Lexer.tokenize(targetSentence); - - let predictions = this.engine.run(sourceTokens, targetTokens); - predictions = this.engine.score(predictions); - return Engine.suggest(predictions, maxSuggestions); - } - - /** - * Predicts word alignments between the sentences. - * Returns an array of suggestions that match the benchmark. - * - * @param {string} sourceSentence - * @param {string} targetSentence - * @param {Suggestion} benchmark - * @param {number} maxSuggestions - * @return {Suggestion[]} - */ - public predictWithBenchmark(sourceSentence: string, targetSentence: string, benchmark: Alignment[], maxSuggestions: number = 1): Suggestion[] { - const sourceTokens = Lexer.tokenize(sourceSentence); - const targetTokens = Lexer.tokenize(targetSentence); - - let predictions = this.engine.run(sourceTokens, targetTokens); - predictions = this.engine.score(predictions); - - const validPredictions: Prediction[] = []; - for (const p of predictions) { - for (const a of benchmark) { - if (a.key === p.alignment.key) { - validPredictions.push(p); - } - } - } - return Engine.suggest(validPredictions, maxSuggestions); - } -} +export {default} from "./WordMap"; +export {default as Alignment} from "./structures/Alignment"; +export {default as Ngram} from "./structures/Ngram"; +export {default as Prediction} from "./structures/Prediction"; +export {default as Suggestion} from "./structures/Suggestion";