-
Notifications
You must be signed in to change notification settings - Fork 53
SpellCorrect Exercise Solution
jasonbaldridge edited this page Feb 5, 2013
·
3 revisions
Here is a solution that covers all the steps in SpellCorrect-Exercise.
object SpellingCorrector {
def main(args: Array[String]) {
// The input sentence
val input = args(0)
// Get the vocabulary from one or both input files (when available)
val firstVocab = getVocab(args(1))
val secondVocab = if (args.length>2) getVocab(args(2)) else Set[String]()
val vocab = firstVocab ++ secondVocab
// A map from words to their counts. Can be used later to look up
// vectors for candidates without needing to recompute the counts.
// (Trading use of more space to make cosine computations faster.)
val vocabVectors: Map[String,Map[String,Int]] =
vocab.map(word => (word, getNgramCounts(word))).toMap
// Build the inverted index.
val invertedIndex = vocabVectors
.toSeq
.flatMap { case(word, ngrams) => {
ngrams.map { case(ngram, count) => (ngram,word) }.toSeq
}}
.groupBy(x=>x._1)
.mapValues(_.map(_._2))
.withDefault(x=>Seq[String]())
// Process the input sentence.
println("Detecting spelling errors in: " + input)
input.split(" ").foreach { token => {
if (!vocab(token)) {
println("ERROR: " + token)
val typoNgrams = getNgramCounts(token)
// Get all the candidates and output 10.
val candidates = typoNgrams.keys.flatMap(invertedIndex).toSeq
println(" Candidates: " + candidates.take(10).mkString(" "))
}
}}
}
// Get a word list from a file with one word per line
def getVocab(filename: String) =
io.Source.fromFile(filename).getLines.toSet
// Get the character ngrams in a word with their counts
def getNgramCounts(word: String, size: Int = 3): Map[String,Int] =
("#"+word+"#")
.sliding(size)
.toSeq
.groupBy(x=>x)
.mapValues(_.length)
.withDefault(x=>0)
}