-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
176 additions
and
153 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,140 @@ | ||
import Lexer, {Token} from "wordmap-lexer"; | ||
import AlignmentOccurrences from "./algorithms/AlignmentOccurrences"; | ||
import AlignmentPosition from "./algorithms/AlignmentPosition"; | ||
import CharacterLength from "./algorithms/CharacterLength"; | ||
import NgramFrequency from "./algorithms/NgramFrequency"; | ||
import NgramLength from "./algorithms/NgramLength"; | ||
import PhrasePlausibility from "./algorithms/PhrasePlausibility"; | ||
import Uniqueness from "./algorithms/Uniqueness"; | ||
import Engine from "./Engine"; | ||
import Alignment from "./structures/Alignment"; | ||
import Ngram from "./structures/Ngram"; | ||
import Prediction from "./structures/Prediction"; | ||
import Suggestion from "./structures/Suggestion"; | ||
|
||
/** | ||
* Multi-Lingual Word Alignment Prediction | ||
*/ | ||
export default class WordMap {
  /** Engine that receives the corpus and saved alignments and produces predictions. */
  private readonly engine: Engine;

  /**
   * Creates the prediction engine and registers the default set of algorithms.
   * @param opts - options forwarded unchanged to the {@link Engine} constructor
   */
  constructor(opts = {}) {
    this.engine = new Engine(opts);
    // Registration order is kept exactly as originally written.
    this.engine.registerAlgorithm(new NgramFrequency());
    this.engine.registerAlgorithm(new AlignmentPosition());
    this.engine.registerAlgorithm(new PhrasePlausibility());
    this.engine.registerAlgorithm(new NgramLength());
    this.engine.registerAlgorithm(new CharacterLength());
    this.engine.registerAlgorithm(new AlignmentOccurrences());
    this.engine.registerAlgorithm(new Uniqueness());
  }

  /**
   * Adds several corpus entries at once.
   * Each entry is a `[source, target]` pair that is passed to
   * {@link WordMap.appendCorpusString}.
   * @param {string[][]} corpus - list of `[source, target]` text pairs
   */
  public appendCorpus(corpus: string[][]): void {
    for (const pair of corpus) {
      this.appendCorpusString(pair[0], pair[1]);
    }
  }

  /**
   * Adds corpus text to the engine.
   * Each argument may be a single sentence or several sentences delimited by
   * new lines; every line is tokenized separately.
   * @param {string} source - source text, one sentence per line
   * @param {string} target - target text, one sentence per line
   */
  public appendCorpusString(source: string, target: string): void {
    const sourceSentences = source.split("\n");
    const targetSentences = target.split("\n");
    const sourceTokens: Token[][] = [];
    const targetTokens: Token[][] = [];

    for (const s of sourceSentences) {
      sourceTokens.push(Lexer.tokenize(s));
    }
    for (const s of targetSentences) {
      targetTokens.push(Lexer.tokenize(s));
    }

    // NOTE(review): line counts are not compared here; presumably the engine
    // validates that both sides have the same number of sentences - confirm.
    this.engine.addCorpus(sourceTokens, targetTokens);
  }

  /**
   * Adds previously saved alignments to the engine.
   * @param {Alignment[]} alignments - the alignments to add
   */
  public appendSavedAlignments(alignments: Alignment[]): void {
    this.engine.addSavedAlignments(alignments);
  }

  /**
   * Appends saved alignments given as text.
   * Line `i` of `source` is aligned with line `i` of `target`; each line is
   * tokenized and wrapped in an n-gram.
   *
   * @param {string} source - a string of source phrases separated by new lines
   * @param {string} target - a string of target phrases separated by new lines
   * @return {Alignment[]} an array of alignment objects (as a convenience)
   * @throws {Error} if `source` and `target` have a different number of lines
   */
  public appendSavedAlignmentsString(source: string, target: string): Alignment[] {
    const alignments: Alignment[] = [];
    const sourceLines = source.split("\n");
    const targetLines = target.split("\n");
    const sourceLinesLength = sourceLines.length;
    if (sourceLinesLength !== targetLines.length) {
      throw new Error("source and target lines must be the same length");
    }
    for (let i = 0; i < sourceLinesLength; i++) {
      const sourceTokens = Lexer.tokenize(sourceLines[i]);
      const targetTokens = Lexer.tokenize(targetLines[i]);
      alignments.push(new Alignment(
        new Ngram(sourceTokens),
        new Ngram(targetTokens)
      ));
    }
    this.appendSavedAlignments(alignments);
    return alignments;
  }

  /**
   * Predicts the word alignments between the sentences.
   * The sentences are tokenized, run through the engine, scored, and then
   * reduced to suggestions.
   * @param {string} sourceSentence
   * @param {string} targetSentence
   * @param {number} maxSuggestions - passed to `Engine.suggest`; defaults to 1
   * @return {Suggestion[]}
   */
  public predict(sourceSentence: string, targetSentence: string, maxSuggestions: number = 1): Suggestion[] {
    const sourceTokens = Lexer.tokenize(sourceSentence);
    const targetTokens = Lexer.tokenize(targetSentence);

    let predictions = this.engine.run(sourceTokens, targetTokens);
    predictions = this.engine.score(predictions);
    return Engine.suggest(predictions, maxSuggestions);
  }

  /**
   * Predicts word alignments between the sentences, keeping only the
   * predictions whose alignment key also appears in the benchmark.
   *
   * @param {string} sourceSentence
   * @param {string} targetSentence
   * @param {Alignment[]} benchmark - alignments the predictions are filtered against
   * @param {number} maxSuggestions - passed to `Engine.suggest`; defaults to 1
   * @return {Suggestion[]}
   */
  public predictWithBenchmark(sourceSentence: string, targetSentence: string, benchmark: Alignment[], maxSuggestions: number = 1): Suggestion[] {
    const sourceTokens = Lexer.tokenize(sourceSentence);
    const targetTokens = Lexer.tokenize(targetSentence);

    let predictions = this.engine.run(sourceTokens, targetTokens);
    predictions = this.engine.score(predictions);

    // Collect the benchmark keys once so each prediction needs a single lookup.
    // This also keeps a prediction from being added more than once when the
    // benchmark contains the same alignment key several times.
    const benchmarkKeys = new Set(benchmark.map((a) => a.key));

    const validPredictions: Prediction[] = [];
    for (const p of predictions) {
      if (benchmarkKeys.has(p.alignment.key)) {
        validPredictions.push(p);
      }
    }
    return Engine.suggest(validPredictions, maxSuggestions);
  }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,140 +1,5 @@ | ||
import AlignmentOccurrences from "./algorithms/AlignmentOccurrences"; | ||
import AlignmentPosition from "./algorithms/AlignmentPosition"; | ||
import CharacterLength from "./algorithms/CharacterLength"; | ||
import NgramFrequency from "./algorithms/NgramFrequency"; | ||
import NgramLength from "./algorithms/NgramLength"; | ||
import PhrasePlausibility from "./algorithms/PhrasePlausibility"; | ||
import Uniqueness from "./algorithms/Uniqueness"; | ||
import Engine from "./Engine"; | ||
import Lexer, {Token} from "wordmap-lexer"; | ||
import Alignment from "./structures/Alignment"; | ||
import Ngram from "./structures/Ngram"; | ||
import Prediction from "./structures/Prediction"; | ||
import Suggestion from "./structures/Suggestion"; | ||
|
||
/** | ||
* Multi-Lingual Word Alignment Prediction | ||
*/ | ||
export default class WordMap {
  private engine: Engine;

  /**
   * Builds the engine from the given options and registers the default
   * algorithms, one after another, in a fixed order.
   */
  constructor(opts = {}) {
    this.engine = new Engine(opts);

    // Each algorithm is instantiated right before it is registered.
    const defaultAlgorithms = [
      NgramFrequency,
      AlignmentPosition,
      PhrasePlausibility,
      NgramLength,
      CharacterLength,
      AlignmentOccurrences,
      Uniqueness
    ];
    for (const AlgorithmType of defaultAlgorithms) {
      this.engine.registerAlgorithm(new AlgorithmType());
    }
  }

  /**
   * Appends every `[source, target]` pair in the list to the corpus.
   * @param {string[][]} corpus - list of `[source, target]` text pairs
   */
  public appendCorpus(corpus: string[][]) {
    for (const [sourceText, targetText] of corpus) {
      this.appendCorpusString(sourceText, targetText);
    }
  }

  /**
   * Appends corpus text to the engine.
   * Both arguments are split on new lines and every line is tokenized.
   * @param {string} source - one or more source sentences, one per line
   * @param {string} target - one or more target sentences, one per line
   */
  public appendCorpusString(source: string, target: string) {
    const sourceLines = source.split("\n");
    const targetLines = target.split("\n");

    const sourceTokens: Token[][] = sourceLines.map((line) => Lexer.tokenize(line));
    const targetTokens: Token[][] = targetLines.map((line) => Lexer.tokenize(line));

    this.engine.addCorpus(sourceTokens, targetTokens);
  }

  /**
   * Hands the given saved alignments to the engine.
   * @param {Alignment[]} alignments
   */
  public appendSavedAlignments(alignments: Alignment[]) {
    this.engine.addSavedAlignments(alignments);
  }

  /**
   * Builds saved alignments from text and appends them.
   * Source and target are split on new lines and paired line by line.
   *
   * @param {string} source - source phrases separated by new lines
   * @param {string} target - target phrases separated by new lines
   * @return {Alignment[]} the alignments that were appended
   * @throws {Error} when the two inputs do not have the same number of lines
   */
  public appendSavedAlignmentsString(source: string, target: string): Alignment[] {
    const sourcePhrases = source.split("\n");
    const targetPhrases = target.split("\n");
    if (sourcePhrases.length !== targetPhrases.length) {
      throw new Error("source and target lines must be the same length");
    }

    const alignments: Alignment[] = sourcePhrases.map((sourcePhrase, index) => {
      // Tokenize both sides before wrapping either of them in an n-gram.
      const sourceTokens = Lexer.tokenize(sourcePhrase);
      const targetTokens = Lexer.tokenize(targetPhrases[index]);
      return new Alignment(new Ngram(sourceTokens), new Ngram(targetTokens));
    });

    this.appendSavedAlignments(alignments);
    return alignments;
  }

  /**
   * Tokenizes both sentences, runs and scores them with the engine, and
   * returns the resulting suggestions.
   * @param {string} sourceSentence
   * @param {string} targetSentence
   * @param {number} maxSuggestions
   * @return {Suggestion[]}
   */
  public predict(sourceSentence: string, targetSentence: string, maxSuggestions: number = 1): Suggestion[] {
    const sourceTokens = Lexer.tokenize(sourceSentence);
    const targetTokens = Lexer.tokenize(targetSentence);

    const rawPredictions = this.engine.run(sourceTokens, targetTokens);
    const scoredPredictions = this.engine.score(rawPredictions);
    return Engine.suggest(scoredPredictions, maxSuggestions);
  }

  /**
   * Same as `predict`, except that only predictions whose alignment key
   * equals the key of a benchmark alignment are turned into suggestions.
   *
   * @param {string} sourceSentence
   * @param {string} targetSentence
   * @param {Alignment[]} benchmark
   * @param {number} maxSuggestions
   * @return {Suggestion[]}
   */
  public predictWithBenchmark(sourceSentence: string, targetSentence: string, benchmark: Alignment[], maxSuggestions: number = 1): Suggestion[] {
    const sourceTokens = Lexer.tokenize(sourceSentence);
    const targetTokens = Lexer.tokenize(targetSentence);

    const rawPredictions = this.engine.run(sourceTokens, targetTokens);
    const scoredPredictions = this.engine.score(rawPredictions);

    const matching: Prediction[] = [];
    for (const candidate of scoredPredictions) {
      for (const expected of benchmark) {
        if (expected.key !== candidate.alignment.key) {
          continue;
        }
        // A candidate is added once for every benchmark entry it matches.
        matching.push(candidate);
      }
    }
    return Engine.suggest(matching, maxSuggestions);
  }
}
export {default} from "./WordMap"; | ||
export {default as Alignment} from "./structures/Alignment"; | ||
export {default as Ngram} from "./structures/Ngram"; | ||
export {default as Prediction} from "./structures/Prediction"; | ||
export {default as Suggestion} from "./structures/Suggestion"; |