Commit bebaed0
reorganized index file
da1nerd committed Aug 31, 2018
1 parent 8b1dc5d commit bebaed0
Showing 4 changed files with 176 additions and 153 deletions.
140 changes: 140 additions & 0 deletions src/WordMap.ts
@@ -0,0 +1,140 @@
import Lexer, {Token} from "wordmap-lexer";
import AlignmentOccurrences from "./algorithms/AlignmentOccurrences";
import AlignmentPosition from "./algorithms/AlignmentPosition";
import CharacterLength from "./algorithms/CharacterLength";
import NgramFrequency from "./algorithms/NgramFrequency";
import NgramLength from "./algorithms/NgramLength";
import PhrasePlausibility from "./algorithms/PhrasePlausibility";
import Uniqueness from "./algorithms/Uniqueness";
import Engine from "./Engine";
import Alignment from "./structures/Alignment";
import Ngram from "./structures/Ngram";
import Prediction from "./structures/Prediction";
import Suggestion from "./structures/Suggestion";

/**
* Multi-Lingual Word Alignment Prediction
*/
export default class WordMap {
private engine: Engine;

constructor(opts = {}) {

this.engine = new Engine(opts);
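// register the algorithms that score alignment predictions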
this.engine.registerAlgorithm(new NgramFrequency());
this.engine.registerAlgorithm(new AlignmentPosition());
this.engine.registerAlgorithm(new PhrasePlausibility());
this.engine.registerAlgorithm(new NgramLength());
this.engine.registerAlgorithm(new CharacterLength());
this.engine.registerAlgorithm(new AlignmentOccurrences());
this.engine.registerAlgorithm(new Uniqueness());
}

/**
* Appends an array of [source, target] corpus pairs.
* @param {string[][]} corpus
*/
public appendCorpus(corpus: string[][]) {
for (const pair of corpus) {
this.appendCorpusString(pair[0], pair[1]);
}
}

/**
* Add corpus to the MAP.
* These may be single sentences or multiple sentences delimited by new lines.
* @param {string} source
* @param {string} target
*/
public appendCorpusString(source: string, target: string) {
const sourceSentences = source.split("\n");
const targetSentences = target.split("\n");
const sourceTokens: Token[][] = [];
const targetTokens: Token[][] = [];

for (const s of sourceSentences) {
sourceTokens.push(Lexer.tokenize(s));
}
for (const s of targetSentences) {
targetTokens.push(Lexer.tokenize(s));
}

this.engine.addCorpus(sourceTokens, targetTokens);
}

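/**
* Appends saved alignments to the engine.
* @param {Alignment[]} alignments
*/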
public appendSavedAlignments(alignments: Alignment[]) {
this.engine.addSavedAlignments(alignments);
}

/**
* Appends saved alignments parsed from strings.
* Each string may be a single line or multiple lines delimited by new lines.
*
* @param {string} source - a string of source phrases separated by new lines
* @param {string} target - a string of target phrases separated by new lines
* @return {Alignment[]} an array of alignment objects (as a convenience)
*/
public appendSavedAlignmentsString(source: string, target: string): Alignment[] {
const alignments: Alignment[] = [];
const sourceLines = source.split("\n");
const targetLines = target.split("\n");
const sourceLinesLength = sourceLines.length;
if (sourceLinesLength !== targetLines.length) {
throw new Error("source and target must have the same number of lines");
}
for (let i = 0; i < sourceLinesLength; i++) {
const sourceTokens = Lexer.tokenize(sourceLines[i]);
const targetTokens = Lexer.tokenize(targetLines[i]);
alignments.push(new Alignment(
new Ngram(sourceTokens),
new Ngram(targetTokens)
));
}
this.appendSavedAlignments(alignments);
return alignments;
}

/**
* Predicts the word alignments between the sentences
* @param {string} sourceSentence
* @param {string} targetSentence
* @param {number} maxSuggestions
* @return {Suggestion[]}
*/
public predict(sourceSentence: string, targetSentence: string, maxSuggestions: number = 1): Suggestion[] {
const sourceTokens = Lexer.tokenize(sourceSentence);
const targetTokens = Lexer.tokenize(targetSentence);

let predictions = this.engine.run(sourceTokens, targetTokens);
predictions = this.engine.score(predictions);
return Engine.suggest(predictions, maxSuggestions);
}

/**
* Predicts word alignments between the sentences.
* Returns suggestions built only from predictions that match the benchmark.
*
* @param {string} sourceSentence
* @param {string} targetSentence
* @param {Alignment[]} benchmark
* @param {number} maxSuggestions
* @return {Suggestion[]}
*/
public predictWithBenchmark(sourceSentence: string, targetSentence: string, benchmark: Alignment[], maxSuggestions: number = 1): Suggestion[] {
const sourceTokens = Lexer.tokenize(sourceSentence);
const targetTokens = Lexer.tokenize(targetSentence);

let predictions = this.engine.run(sourceTokens, targetTokens);
predictions = this.engine.score(predictions);

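// keep only predictions whose alignment appears in the benchmark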
const validPredictions: Prediction[] = [];
for (const p of predictions) {
for (const a of benchmark) {
if (a.key === p.alignment.key) {
validPredictions.push(p);
}
}
}
return Engine.suggest(validPredictions, maxSuggestions);
}
}
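
A minimal usage sketch of the new WordMap class (the corpus and alignment strings below are placeholders, and the import path assumes a consumer inside src/):

import WordMap from "./WordMap";

const map = new WordMap();
// corpus strings are newline-delimited; line i of source pairs with line i of target
map.appendCorpusString("Παῦλος δοῦλος Θεοῦ", "Paul a servant of God");
// record a previously confirmed alignment; the returned Alignment[] is a convenience
map.appendSavedAlignmentsString("δοῦλος", "servant");
// ask for up to 3 alignment suggestions for an unaligned sentence pair
const suggestions = map.predict("Παῦλος δοῦλος Θεοῦ", "Paul a servant of God", 3);
console.log(suggestions.map((s) => s.toString()).join("\n"));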
8 changes: 5 additions & 3 deletions src/__tests__/map.ts
@@ -1,8 +1,8 @@
import * as fs from "fs-extra";
import * as path from "path";
import WordMap from "../index";
import Alignment from "../structures/Alignment";
import {makeMockAlignment} from "../util/testUtils";
import WordMap from "../WordMap";

describe("MAP", () => {

@@ -170,7 +170,8 @@ describe("MAP", () => {
);
const predictions = suggestions[0].getPredictions();
expect(predictions).toHaveLength(6);
expect(predictions[4].key).not.toEqual("n:φιλάνδρους->n:love:their:own:husbands");
expect(predictions[4].key).not.toEqual(
"n:φιλάνδρους->n:love:their:own:husbands");
expect(predictions[5].key).toEqual("n:φιλοτέκνους->n:and:children");
});

@@ -185,7 +186,8 @@ describe("MAP", () => {
);
const predictions = suggestions[0].getPredictions();
expect(predictions).toHaveLength(7);
expect(predictions[4].key).toEqual("n:φιλάνδρους->n:love:their:own:husbands");
expect(predictions[4].key).toEqual(
"n:φιλάνδρους->n:love:their:own:husbands");
expect(predictions[6].key).toEqual("n:φιλοτέκνους->n:and:children");
});
});
36 changes: 26 additions & 10 deletions src/__tests__/titus_map.ts
@@ -1,22 +1,25 @@
import * as fs from "fs-extra";
import * as path from "path";
import WordMap from "../index";
import Alignment from "../structures/Alignment";
import {makeMockAlignment, scoreSuggestion} from "../util/testUtils";
import WordMap from "../WordMap";

describe("MAP predictions in Titus", () => {
const greek = path.join(__dirname, "fixtures/corpus/tit/greek.txt");
const english = path.join(__dirname, "fixtures/corpus/tit/english.txt");
const map = new WordMap();
loadCorpus(map, greek, english);

it("predicts the first verse", () => {
const unalignedPair = [
"Παῦλος, δοῦλος Θεοῦ, ἀπόστολος δὲ Ἰησοῦ Χριστοῦ, κατὰ πίστιν ἐκλεκτῶν Θεοῦ, καὶ ἐπίγνωσιν ἀληθείας, τῆς κατ’ εὐσέβειαν",
"Paul a servant of God and an apostle of Jesus Christ for the faith of God s chosen people and the knowledge of the truth that agrees with godliness"
];
const suggestions = map.predict(unalignedPair[0], unalignedPair[1], 2);
const chapterOneAlignmentPath = path.join(__dirname, "fixtures/corpus/tit/alignmentData/1.json");
const chapterOneAlignmentPath = path.join(
__dirname,
"fixtures/corpus/tit/alignmentData/1.json"
);
scoreSuggestion(suggestions[0], getAlignments(chapterOneAlignmentPath, 1));
console.log("suggestions\n", suggestions.map((s) => {
return s.toString();
@@ -42,12 +45,24 @@ describe("MAP predictions in Titus", () => {
benchmark.push(makeMockAlignment("ἀδελφοὺς", "brothers"));
benchmark.push(makeMockAlignment("αὐτοῦ", "his"));

console.log("suggestions\n", map.predict(secondUnalignedPair[0], secondUnalignedPair[1], 2).map((s) => {
return s.toString();
}).join("\n"));
console.log("benchmarks\n", map.predictWithBenchmark(secondUnalignedPair[0], secondUnalignedPair[1], benchmark, 2).map((s) => {
return s.toString();
}).join("\n"));
console.log(
"suggestions\n",
map.predict(secondUnalignedPair[0], secondUnalignedPair[1], 2)
.map((s) => {
return s.toString();
})
.join("\n")
);
console.log(
"benchmarks\n",
map.predictWithBenchmark(secondUnalignedPair[0],
secondUnalignedPair[1],
benchmark,
2
).map((s) => {
return s.toString();
}).join("\n")
);
});
});

@@ -73,6 +88,7 @@ function loadCorpus(map: WordMap, sourcePath: string, targetPath: string) {
* @return {object}
*/
function getAlignments(filePath: string, verse: number): object {
const verseAlignments = JSON.parse(fs.readFileSync(filePath).toString("utf-8"));
const verseAlignments = JSON.parse(fs.readFileSync(filePath)
.toString("utf-8"));
return verseAlignments[verse.toString()].alignments;
}
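
The benchmark test above exercises predictWithBenchmark, which drops every scored prediction whose alignment key is absent from the benchmark before building suggestions. A short sketch using the same test helpers (the sentence strings are placeholders):

import {makeMockAlignment} from "../util/testUtils";
import WordMap from "../WordMap";

const map = new WordMap();
// the benchmark is a list of known-good alignments to filter predictions by
const benchmark = [makeMockAlignment("ἀδελφοὺς", "brothers")];
const suggestions = map.predictWithBenchmark("source sentence", "target sentence", benchmark, 2);
console.log(suggestions.map((s) => s.toString()).join("\n"));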
145 changes: 5 additions & 140 deletions src/index.ts
@@ -1,140 +1,5 @@
import AlignmentOccurrences from "./algorithms/AlignmentOccurrences";
import AlignmentPosition from "./algorithms/AlignmentPosition";
import CharacterLength from "./algorithms/CharacterLength";
import NgramFrequency from "./algorithms/NgramFrequency";
import NgramLength from "./algorithms/NgramLength";
import PhrasePlausibility from "./algorithms/PhrasePlausibility";
import Uniqueness from "./algorithms/Uniqueness";
import Engine from "./Engine";
import Lexer, {Token} from "wordmap-lexer";
import Alignment from "./structures/Alignment";
import Ngram from "./structures/Ngram";
import Prediction from "./structures/Prediction";
import Suggestion from "./structures/Suggestion";

/**
* Multi-Lingual Word Alignment Prediction
*/
export default class WordMap {
private engine: Engine;

constructor(opts = {}) {

this.engine = new Engine(opts);
this.engine.registerAlgorithm(new NgramFrequency());
this.engine.registerAlgorithm(new AlignmentPosition());
this.engine.registerAlgorithm(new PhrasePlausibility());
this.engine.registerAlgorithm(new NgramLength());
this.engine.registerAlgorithm(new CharacterLength());
this.engine.registerAlgorithm(new AlignmentOccurrences());
this.engine.registerAlgorithm(new Uniqueness());
}

/**
* Appends an array of [source, target] corpus pairs.
* @param {string[][]} corpus
*/
public appendCorpus(corpus: string[][]) {
for (const pair of corpus) {
this.appendCorpusString(pair[0], pair[1]);
}
}

/**
* Add corpus to the MAP.
* These may be single sentences or multiple sentences delimited by new lines.
* @param {string} source
* @param {string} target
*/
public appendCorpusString(source: string, target: string) {
const sourceSentences = source.split("\n");
const targetSentences = target.split("\n");
const sourceTokens: Token[][] = [];
const targetTokens: Token[][] = [];

for (const s of sourceSentences) {
sourceTokens.push(Lexer.tokenize(s));
}
for (const s of targetSentences) {
targetTokens.push(Lexer.tokenize(s));
}

this.engine.addCorpus(sourceTokens, targetTokens);
}

public appendSavedAlignments(alignments: Alignment[]) {
this.engine.addSavedAlignments(alignments);
}

/**
* Appends saved alignments parsed from strings.
* Each string may be a single line or multiple lines delimited by new lines.
*
* @param {string} source - a string of source phrases separated by new lines
* @param {string} target - a string of target phrases separated by new lines
* @return {Alignment[]} an array of alignment objects (as a convenience)
*/
public appendSavedAlignmentsString(source: string, target: string): Alignment[] {
const alignments: Alignment[] = [];
const sourceLines = source.split("\n");
const targetLines = target.split("\n");
const sourceLinesLength = sourceLines.length;
if (sourceLinesLength !== targetLines.length) {
throw new Error("source and target must have the same number of lines");
}
for (let i = 0; i < sourceLinesLength; i++) {
const sourceTokens = Lexer.tokenize(sourceLines[i]);
const targetTokens = Lexer.tokenize(targetLines[i]);
alignments.push(new Alignment(
new Ngram(sourceTokens),
new Ngram(targetTokens)
));
}
this.appendSavedAlignments(alignments);
return alignments;
}

/**
* Predicts the word alignments between the sentences
* @param {string} sourceSentence
* @param {string} targetSentence
* @param {number} maxSuggestions
* @return {Suggestion[]}
*/
public predict(sourceSentence: string, targetSentence: string, maxSuggestions: number = 1): Suggestion[] {
const sourceTokens = Lexer.tokenize(sourceSentence);
const targetTokens = Lexer.tokenize(targetSentence);

let predictions = this.engine.run(sourceTokens, targetTokens);
predictions = this.engine.score(predictions);
return Engine.suggest(predictions, maxSuggestions);
}

/**
* Predicts word alignments between the sentences.
* Returns suggestions built only from predictions that match the benchmark.
*
* @param {string} sourceSentence
* @param {string} targetSentence
* @param {Alignment[]} benchmark
* @param {number} maxSuggestions
* @return {Suggestion[]}
*/
public predictWithBenchmark(sourceSentence: string, targetSentence: string, benchmark: Alignment[], maxSuggestions: number = 1): Suggestion[] {
const sourceTokens = Lexer.tokenize(sourceSentence);
const targetTokens = Lexer.tokenize(targetSentence);

let predictions = this.engine.run(sourceTokens, targetTokens);
predictions = this.engine.score(predictions);

const validPredictions: Prediction[] = [];
for (const p of predictions) {
for (const a of benchmark) {
if (a.key === p.alignment.key) {
validPredictions.push(p);
}
}
}
return Engine.suggest(validPredictions, maxSuggestions);
}
}
export {default} from "./WordMap";
export {default as Alignment} from "./structures/Alignment";
export {default as Ngram} from "./structures/Ngram";
export {default as Prediction} from "./structures/Prediction";
export {default as Suggestion} from "./structures/Suggestion";
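
Because index.ts is now just re-exports, existing consumers are unaffected: the package root still exposes WordMap as its default export alongside the core structures. A sketch (assuming the published package name is wordmap):

import WordMap, {Alignment, Ngram, Prediction, Suggestion} from "wordmap";

const map = new WordMap();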
