diff --git a/src/main/scala/chalk/corpora/MascUtil.scala b/src/main/scala/chalk/corpora/MascUtil.scala
index a62d25f..52a693a 100644
--- a/src/main/scala/chalk/corpora/MascUtil.scala
+++ b/src/main/scala/chalk/corpora/MascUtil.scala
@@ -269,7 +269,7 @@ object MascUtil {
   def getNodes(doc: Elem) = (doc \\ "node").toSeq.flatMap { nxml =>
     val link = (nxml \ "link")
-    if (!link.isEmpty) {
+    if (link.nonEmpty) {
       val targets = (link.head \ "@targets").toString.split(" ").toSeq
       Some(MNode(xmlId(nxml), targets))
     } else throw new Exception("Missing link element.") //None OK?
@@ -291,7 +291,7 @@ object MascUtil {
   // Have to go through some pains to make sure we get a POS for every token.
   def getPos(anno: MAnnotation) = {
     if (anno.features.isDefinedAt("msd")) anno.features("msd")
-    else if (anno.features.get("kind").getOrElse("") == "urlAddress") "URL"
+    else if (anno.features.getOrElse("kind", "") == "urlAddress") "URL"
     else if (anno.features.isDefinedAt("categor")) anno.features("categor")
     else "UNK"
   }
diff --git a/src/main/scala/chalk/lang/eng/PorterStemmer.scala b/src/main/scala/chalk/lang/eng/PorterStemmer.scala
index ea265f1..4bcc185 100644
--- a/src/main/scala/chalk/lang/eng/PorterStemmer.scala
+++ b/src/main/scala/chalk/lang/eng/PorterStemmer.scala
@@ -77,10 +77,10 @@ class PorterStemmer {
   def vowelInStem(s: String): Boolean = {
     for (i <- 0 to b.length - 1 - s.length) {
       if (!cons(i)) {
         return true
       }
     }
-    return false
+    false
   }

   /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
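Note on vowelInStem above: the `return true` has to keep its `return`. A bare `true` as the body of a `for` loop is a discarded value in Scala, so without the early exit the method would always fall through to `false`. If the goal is to drop `return` entirely, the loop itself has to go; a minimal standalone sketch (the `cons` predicate and `b` are passed in here as stand-ins for the class members):

    // True iff some position in 0..(b.length - 1 - s.length) is a non-consonant.
    // `exists` short-circuits exactly like the return-based loop does.
    def vowelInStem(b: String, s: String, cons: Int => Boolean): Boolean =
      (0 to b.length - 1 - s.length).exists(i => !cons(i))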
diff --git a/src/main/scala/chalk/text/HTML.scala b/src/main/scala/chalk/text/HTML.scala
index 472ef0e..00f0a17 100644
--- a/src/main/scala/chalk/text/HTML.scala
+++ b/src/main/scala/chalk/text/HTML.scala
@@ -22,35 +22,35 @@ package chalk.text
  */
 object HTML {
-  val regex = "&#?\\w+;".r;
+  val regex = "&#?\\w+;".r

   /** Unescapes all HTML entities in the given input. */
   def unescapeEntitiesIn(input : String) = {
     regex.replaceAllIn(input, m => {
-      val txt = m.group(0);
+      val txt = m.group(0)
       try {
         val codepoint = {
           if (txt(1) == '#') {
             if (txt(2) == 'x') {
-              Integer.parseInt(txt.substring(3, txt.length-1), 16);
+              Integer.parseInt(txt.substring(3, txt.length-1), 16)
             } else {
-              Integer.parseInt(txt.substring(2, txt.length-1));
+              Integer.parseInt(txt.substring(2, txt.length-1))
             }
           } else {
-            entities(txt.substring(1, txt.length-1));
+            entities(txt.substring(1, txt.length-1))
           }
         }
         if (codepoint == '$') {
-          new String("\\$");
+          new String("\\$")
         } else {
-          new String(Character.toChars(codepoint));
+          new String(Character.toChars(codepoint))
         }
       } catch {
         case _:Exception =>
           // exception while processing .. append raw input
-          txt;
+          txt
       }
-    });
+    })
   }

   /** List of HTML entities with their corresponding code points borrowed from pythons htmlentities package.
     */
@@ -307,5 +307,5 @@ object HTML {
     "ouml" -> 246,
     "raquo" -> 187,
     "sigma" -> 963
-  );
+  )
 }
diff --git a/src/main/scala/chalk/text/Unicode.scala b/src/main/scala/chalk/text/Unicode.scala
index 5de977f..44f89f2 100644
--- a/src/main/scala/chalk/text/Unicode.scala
+++ b/src/main/scala/chalk/text/Unicode.scala
@@ -24,21 +24,21 @@ package chalk.text
 object Unicode {
   private def inRanges(cp : Int, rangeStarts : Array[Int], rangeEnds : Array[Int]) : Boolean = {
     if (cp < 0) {
-      return false;
+      return false
     }
-    var i = 0;
+    var i = 0
     while (i < rangeStarts.length && cp < rangeStarts(i)) {
-      i += 1;
+      i += 1
     }
-    (i < rangeEnds.length) && (cp <= rangeEnds(i));
+    (i < rangeEnds.length) && (cp <= rangeEnds(i))
   }

   private val punctuationRangeStarts =
-    Array(0xFF01,0xFF1A,0xFF3B,0xFF5B,0xFFE0,0xFE10,0xFE30,0x3000,0xFE50,0x2E00,0x0021,0x003A,0x005B,0x007B,0x2000,0x0080,0x00A1,0x00B4,0x00B6,0x00BF,0x00D7,0x00F7).sorted;
+    Array(0xFF01,0xFF1A,0xFF3B,0xFF5B,0xFFE0,0xFE10,0xFE30,0x3000,0xFE50,0x2E00,0x0021,0x003A,0x005B,0x007B,0x2000,0x0080,0x00A1,0x00B4,0x00B6,0x00BF,0x00D7,0x00F7).sorted

   private val punctuationRangeEnds =
-    Array(0xFF0F,0xFF20,0xFF40,0xFF65,0xFFEE,0xFE1F,0xFE4F,0x303F,0xFE6F,0x2E7F,0x002F,0x003F,0x0060,0x007E,0x206F,0x00FF,0x00B1,0x00B4,0x00BB,0x00BF,0x00D7,0x00F7).sorted;
+    Array(0xFF0F,0xFF20,0xFF40,0xFF65,0xFFEE,0xFE1F,0xFE4F,0x303F,0xFE6F,0x2E7F,0x002F,0x003F,0x0060,0x007E,0x206F,0x00FF,0x00B1,0x00B4,0x00BB,0x00BF,0x00D7,0x00F7).sorted

   /** Returns true if the given unicode code point is punctuation. */
   def isPunctuation(cp : Int) = {
diff --git a/src/main/scala/chalk/text/analyze/CaseFolder.scala b/src/main/scala/chalk/text/analyze/CaseFolder.scala
index 5ebefda..5761c3e 100644
--- a/src/main/scala/chalk/text/analyze/CaseFolder.scala
+++ b/src/main/scala/chalk/text/analyze/CaseFolder.scala
@@ -27,5 +27,5 @@ class CaseFolder extends Analyzer {
 }

 object CaseFolder extends CaseFolder {
-  override def apply(in: String): String = in.toLowerCase;
+  override def apply(in: String): String = in.toLowerCase
 }
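One thing worth flagging in the untouched context of `Unicode.inRanges`: the scan only advances while `cp < rangeStarts(i)`, so any code point at or above the smallest range start stops at `i = 0` and is compared against the smallest range end. For example 0x3001 (ideographic comma, inside the 0x3000..0x303F block) appears to come back false because it is tested against `rangeEnds(0) = 0x002F`. A sketch of a containment test over the parallel sorted arrays, assuming the `starts(i) <= ends(i)` pairing survives the independent sorts:

    // True iff cp falls inside some [starts(i), ends(i)] range.
    def inRanges(cp: Int, starts: Array[Int], ends: Array[Int]): Boolean =
      cp >= 0 && starts.indices.exists(i => starts(i) <= cp && cp <= ends(i))

This is outside the scope of the semicolon cleanup, but it affects what `isPunctuation` reports for most non-ASCII punctuation.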
diff --git a/src/main/scala/chalk/text/analyze/EnglishWordClassGenerator.scala b/src/main/scala/chalk/text/analyze/EnglishWordClassGenerator.scala
index 625a18e..076c45c 100644
--- a/src/main/scala/chalk/text/analyze/EnglishWordClassGenerator.scala
+++ b/src/main/scala/chalk/text/analyze/EnglishWordClassGenerator.scala
@@ -10,60 +10,60 @@ object EnglishWordClassGenerator extends Analyzer with Serializable {
   def apply(x: String) = signatureFor(x)

   def signatureFor(word: String) = {
-    val sb = new StringBuilder;
-    val wlen = word.length();
-    val numCaps = (word: Seq[Char]).count(_.isUpper);
-    val hasDigit = word.exists(_.isDigit);
-    val hasDash = word.contains('-');
-    val hasLower = numCaps < wlen;
-    val ch0 = word.charAt(0);
-    val lowered = word.toLowerCase();
+    val sb = new StringBuilder
+    val wlen = word.length()
+    val numCaps = (word: Seq[Char]).count(_.isUpper)
+    val hasDigit = word.exists(_.isDigit)
+    val hasDash = word.contains('-')
+    val hasLower = numCaps < wlen
+    val ch0 = word.charAt(0)
+    val lowered = word.toLowerCase()
     if (Character.isUpperCase(ch0) || Character.isTitleCase(ch0)) {
       if (numCaps == 1) {
-        sb.append("-INITC");
+        sb.append("-INITC")
       } else {
-        sb.append("-CAPS");
+        sb.append("-CAPS")
       }
     } else if (!Character.isLetter(ch0) && numCaps > 0) {
-      sb.append("-CAPS");
+      sb.append("-CAPS")
     } else if (hasLower) {
-      sb.append("-LC");
+      sb.append("-LC")
     }

     if (hasDigit) {
-      sb.append("-NUM");
+      sb.append("-NUM")
     }
     if (hasDash) {
-      sb.append("-DASH");
+      sb.append("-DASH")
     }
     if (lowered.endsWith("s") && wlen >= 3) {
       // here length 3, so you don't miss out on ones like 80s
-      val ch2 = lowered.charAt(wlen - 2);
+      val ch2 = lowered.charAt(wlen - 2)
       // not -ess suffixes or greek/latin -us, -is
       if (ch2 != 's' && ch2 != 'i' && ch2 != 'u') {
-        sb.append("-s");
+        sb.append("-s")
       }
     } else if (word.length() >= 5 && !hasDash && !(hasDigit && numCaps > 0)) {
       if (lowered.endsWith("ed")) {
-        sb.append("-ed");
+        sb.append("-ed")
       } else if (lowered.endsWith("ing")) {
-        sb.append("-ing");
+        sb.append("-ing")
       } else if (lowered.endsWith("ion")) {
-        sb.append("-ion");
+        sb.append("-ion")
       } else if (lowered.endsWith("er")) {
-        sb.append("-er");
+        sb.append("-er")
       } else if (lowered.endsWith("est")) {
-        sb.append("-est");
+        sb.append("-est")
       } else if (lowered.endsWith("ly")) {
-        sb.append("-ly");
+        sb.append("-ly")
       } else if (lowered.endsWith("ity")) {
-        sb.append("-ity");
+        sb.append("-ity")
       } else if (lowered.endsWith("y")) {
-        sb.append("-y");
+        sb.append("-y")
       } else if (lowered.endsWith("al")) {
-        sb.append("-al");
+        sb.append("-al")
       }
     }
-    sb.toString;
+    sb.toString
   }
 }
diff --git a/src/main/scala/chalk/text/analyze/PorterStemmer.scala b/src/main/scala/chalk/text/analyze/PorterStemmer.scala
index b9b402b..96d6748 100644
--- a/src/main/scala/chalk/text/analyze/PorterStemmer.scala
+++ b/src/main/scala/chalk/text/analyze/PorterStemmer.scala
@@ -66,7 +66,7 @@ object PorterStemmer extends Stemmer {
   def extra(w: String) = {
     if (w.endsWith("at") || w.endsWith("bl") || w.endsWith("iz")) w + 'e'
     // double consonant:
-    else if (doublec(w) && !("lsz".contains(w.last))) w.substring(0, w.length - 1);
+    else if (doublec(w) && !("lsz".contains(w.last))) w.substring(0, w.length - 1)
     else if (m(w) == 1 && cvc(w)) w + "e"
     else w
   }
diff --git a/src/main/scala/chalk/text/analyze/WordShapeGenerator.scala b/src/main/scala/chalk/text/analyze/WordShapeGenerator.scala
index 7b5a199..97b04cf 100644
--- a/src/main/scala/chalk/text/analyze/WordShapeGenerator.scala
+++ b/src/main/scala/chalk/text/analyze/WordShapeGenerator.scala
@@ -9,19 +9,19 @@ object WordShapeGenerator extends Analyzer with Serializable {
   def apply(v1: String) = signatureFor(v1)

   def signatureFor(word: String) = {
-    val result = new StringBuilder(word.length);
-    var i = 0;
+    val result = new StringBuilder(word.length)
+    var i = 0
     while (i < word.length) {
-      val c = word(i);
-      val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else if (c.isDigit) 'd' else c;
+      val c = word(i)
+      val x = if (c.isLetter && c.isUpper) 'X' else if (c.isLetter) 'x' else if (c.isDigit) 'd' else c
       if (result.length > 1 && (result.last == x) && result(result.length - 2) == x) {
         result += 'e'
       } else if (result.length > 1 && result.last == 'e' && result(result.length - 2) == x) {
         () // nothing
       } else {
-        result += x;
+        result += x
       }
-      i += 1;
+      i += 1
     }
     result.toString
   }
 }
diff --git a/src/main/scala/chalk/text/tokenize/RegexSearchTokenizer.scala b/src/main/scala/chalk/text/tokenize/RegexSearchTokenizer.scala
index 942e339..08bbb35 100644
--- a/src/main/scala/chalk/text/tokenize/RegexSearchTokenizer.scala
+++ b/src/main/scala/chalk/text/tokenize/RegexSearchTokenizer.scala
@@ -24,6 +24,6 @@ package chalk.text.tokenize
 case class RegexSearchTokenizer(pattern : String) extends Tokenizer {
   override def apply(doc : String) = new Iterable[String] {
-    override def iterator = (pattern.r.findAllIn(doc));
+    override def iterator = (pattern.r.findAllIn(doc))
   }
 }
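For reference, the shape signature built above maps uppercase letters to `X`, lowercase to `x`, and digits to `d`, and compresses a run of three or more same-class characters into two characters plus a trailing `e`. Hand-traced examples, worth double-checking against the real object:

    WordShapeGenerator.signatureFor("Hello")   // "Xxxe"  (run of lowercase collapsed)
    WordShapeGenerator.signatureFor("CS2103")  // "XXdde" (run of digits collapsed)
    WordShapeGenerator.signatureFor("A1-b")    // "Xd-x"  (other characters pass through)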
diff --git a/src/main/scala/chalk/text/tokenize/RegexSplitTokenizer.scala b/src/main/scala/chalk/text/tokenize/RegexSplitTokenizer.scala
index 982cdaf..1cec832 100644
--- a/src/main/scala/chalk/text/tokenize/RegexSplitTokenizer.scala
+++ b/src/main/scala/chalk/text/tokenize/RegexSplitTokenizer.scala
@@ -22,6 +22,6 @@ package chalk.text.tokenize
  * @author dramage
  */
 case class RegexSplitTokenizer(pattern : String) extends Tokenizer {
-  override def apply(doc : String) = doc.split(pattern);
+  override def apply(doc : String) = doc.split(pattern)
 }
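The two regex tokenizers are duals: `RegexSearchTokenizer` keeps what the pattern matches, while `RegexSplitTokenizer` keeps what lies between matches. A small usage sketch (patterns are illustrative):

    RegexSearchTokenizer("\\w+")("to be, or not")  // keeps matches: to / be / or / not
    RegexSplitTokenizer(",\\s*")("a, b,c")         // keeps the gaps: a / b / c

One caveat inherited from `String.split`: the split variant yields an empty first token when the input starts with a delimiter.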
diff --git a/src/main/scala/chalk/text/tokenize/SimpleEnglishTokenizer.scala b/src/main/scala/chalk/text/tokenize/SimpleEnglishTokenizer.scala
index 9ff1e9f..a6b2875 100644
--- a/src/main/scala/chalk/text/tokenize/SimpleEnglishTokenizer.scala
+++ b/src/main/scala/chalk/text/tokenize/SimpleEnglishTokenizer.scala
@@ -31,34 +31,34 @@ import breeze.io.TextReader;
  *
  * @author dramage
  */
-trait SimpleEnglishTokenizer extends Tokenizer;
+trait SimpleEnglishTokenizer extends Tokenizer

 object SimpleEnglishTokenizer {
-  def apply() = V1();
+  def apply() = V1()

   /** Version 0 of the SimpleEnglishTokenizer. */
   class V0 extends SimpleEnglishTokenizer {
     override def apply(in : String) : Iterable[String] = {
-      var string = in;
-      string = V0.r1.replaceAllIn(string, "");
-      string = V0.r2.replaceAllIn(string, "$1 ");
-      string = V0.r3.replaceAllIn(string, " $1");
-      string.split("\\s+");
+      var string = in
+      string = V0.r1.replaceAllIn(string, "")
+      string = V0.r2.replaceAllIn(string, "$1 ")
+      string = V0.r3.replaceAllIn(string, " $1")
+      string.split("\\s+")
     }
   }

   object V0 {
     // delete word-final hyphens when followed by newlines
-    val r1 = "(?<=\\w)-\\s*\n\\s*".r;
+    val r1 = "(?<=\\w)-\\s*\n\\s*".r

     // add spaces around non-word-internal punctuation
-    val r2 = "(?<=\\W)(\\p{P})(?! )".r;
-    val r3 = "(?! )(\\p{P})(?=\\W)".r;
+    val r2 = "(?<=\\W)(\\p{P})(?! )".r
+    val r3 = "(?! )(\\p{P})(?=\\W)".r

-    private val _instance = new V0();
-    def apply() = _instance;
+    private val _instance = new V0()
+    def apply() = _instance

     def name = "SimpleEnglishTokenizer.V0"
   }
@@ -74,51 +74,51 @@ object SimpleEnglishTokenizer {
       apply(TextReader.fromString(in)).toIterable

     def apply(in : TextReader) : Iterator[String] = new Iterator[String] {
-      var nv : String = null;
-      var sb = new java.lang.StringBuilder();
+      var nv : String = null
+      var sb = new java.lang.StringBuilder()

-      prepare();
+      prepare()

       private def prepare() {
-        in.skipWhitespace();
+        in.skipWhitespace()

-        val cp = in.peek();
+        val cp = in.peek()
         if (cp == -1) {
-          nv = null;
+          nv = null
         } else if (Character.isLetterOrDigit(cp)) {
-          nv = in.readWhile(Character.isLetterOrDigit);
+          nv = in.readWhile(Character.isLetterOrDigit)
           if (Unicode.isPunctuation(in.peek(0)) && Character.isLetterOrDigit(in.peek(1))) {
-            sb.setLength(0);
-            sb.append(nv);
+            sb.setLength(0)
+            sb.append(nv)
             do {
-              sb.append(Character.toChars(in.read));
-              sb.append(in.readWhile(Character.isLetterOrDigit));
-            } while (Unicode.isPunctuation(in.peek(0)) && Character.isLetterOrDigit(in.peek(1)));
-            nv = sb.toString;
+              sb.append(Character.toChars(in.read))
+              sb.append(in.readWhile(Character.isLetterOrDigit))
+            } while (Unicode.isPunctuation(in.peek(0)) && Character.isLetterOrDigit(in.peek(1)))
+            nv = sb.toString
           }
         } else if (Unicode.isPunctuation(cp)) {
-          nv = in.readWhile(Unicode.isPunctuation);
+          nv = in.readWhile(Unicode.isPunctuation)
         } else {
-          nv = in.readWhile((c : Int) => !Character.isWhitespace(c));
+          nv = in.readWhile((c : Int) => !Character.isWhitespace(c))
        }
       }

       def hasNext =
-        nv != null;
+        nv != null

       def next = {
-        val rv = nv;
-        prepare();
-        rv;
+        val rv = nv
+        prepare()
+        rv
       }
     }
   }

   object V1 {
-    private val _instance = new V1();
-    def apply() = _instance;
+    private val _instance = new V1()
+    def apply() = _instance
   }
 }
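Rough intended behavior of V1 above: maximal letter-or-digit runs become tokens, but a single punctuation code point sandwiched between letters or digits is folded into the token, so contractions and decimals survive intact. A hand-traced expectation (not verified output; the exact result depends on `TextReader`'s peek semantics):

    val tok = SimpleEnglishTokenizer()  // the shared V1 instance
    tok("Smith can't pay 3.50 today.").toList
    // expected: List("Smith", "can't", "pay", "3.50", "today", ".")

The final period splits off because `peek(1)` past it hits end of input rather than a letter or digit.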
diff --git a/src/main/scala/chalk/text/tokenize/Tokenizer.scala b/src/main/scala/chalk/text/tokenize/Tokenizer.scala
index e5738fb..26e0849 100644
--- a/src/main/scala/chalk/text/tokenize/Tokenizer.scala
+++ b/src/main/scala/chalk/text/tokenize/Tokenizer.scala
@@ -29,10 +29,10 @@ import chalk.text.transform.Transformer
 @SerialVersionUID(1)
 trait Tokenizer extends (String => Iterable[String]) with Serializable {
   def andThen(g : Transformer) : Tokenizer =
-    this ~> g;
+    this ~> g

   def ~> (g : Transformer) =
-    new Tokenizer.Chain(this, g);
+    new Tokenizer.Chain(this, g)

   override def toString = getClass.getName
 }
@@ -48,13 +48,13 @@ trait Tokenizer extends (String => Iterable[String]) with Serializable {
 object Tokenizer {
   /** Standard implementation wrapping an underlying function of String => Iterable[String].
     */
   class Impl(val f : String => Iterable[String], val name : String) extends Tokenizer {
-    override def apply(txt : String) = f(txt);
-    override def toString = name;
+    override def apply(txt : String) = f(txt)
+    override def toString = name
     override def equals(other : Any) = other match {
-      case that : Impl => this.f == that.f;
-      case _ => false;
+      case that : Impl => this.f == that.f
+      case _ => false
     }
-    override def hashCode = f.hashCode;
+    override def hashCode = f.hashCode
   }

@@ -68,16 +68,16 @@ object Tokenizer {
   */
   class Chain(val f : Tokenizer, val g : Transformer) extends Tokenizer {
     protected val tokenize : (String => Iterable[String]) =
-      if (f.isInstanceOf[Transformer]) f else f.andThen((i : Iterable[String]) => i.view);
+      if (f.isInstanceOf[Transformer]) f else f.andThen((i : Iterable[String]) => i.view)

-    override def apply(txt : String) = g(tokenize(txt));
-    override def toString = f.toString + " ~> " + g.toString;
+    override def apply(txt : String) = g(tokenize(txt))
+    override def toString = f.toString + " ~> " + g.toString

     override def equals(other : Any) = other match {
-      case that : Chain => this.f == that.f && this.g == that.g;
-      case _ => false;
+      case that : Chain => this.f == that.f && this.g == that.g
+      case _ => false
     }
-    override def hashCode = f.hashCode * 37 + g.hashCode;
+    override def hashCode = f.hashCode * 37 + g.hashCode
   }
diff --git a/src/main/scala/chalk/text/tokenize/WhitespaceTokenizer.scala b/src/main/scala/chalk/text/tokenize/WhitespaceTokenizer.scala
index 83b6f67..f16c7d1 100644
--- a/src/main/scala/chalk/text/tokenize/WhitespaceTokenizer.scala
+++ b/src/main/scala/chalk/text/tokenize/WhitespaceTokenizer.scala
@@ -20,13 +20,13 @@ package chalk.text.tokenize
  *
  * @author dramage
  */
-class WhitespaceTokenizer() extends RegexSplitTokenizer("\\s+");
+class WhitespaceTokenizer() extends RegexSplitTokenizer("\\s+")

 object WhitespaceTokenizer {
-  def apply() : WhitespaceTokenizer = new WhitespaceTokenizer;
+  def apply() : WhitespaceTokenizer = new WhitespaceTokenizer

-  private val _instance : WhitespaceTokenizer = apply();
-  def apply(in : String) : Iterable[String] = _instance(in);
+  private val _instance : WhitespaceTokenizer = apply()
+  def apply(in : String) : Iterable[String] = _instance(in)
 }
diff --git a/src/main/scala/chalk/text/transform/MinimumLengthFilter.scala b/src/main/scala/chalk/text/transform/MinimumLengthFilter.scala
index 8647578..aad1f16 100644
--- a/src/main/scala/chalk/text/transform/MinimumLengthFilter.scala
+++ b/src/main/scala/chalk/text/transform/MinimumLengthFilter.scala
@@ -22,5 +22,5 @@ package chalk.text.transform
  */
 case class MinimumLengthFilter(minLength: Int) extends Transformer {
   override def apply(doc: Iterable[String]): Iterable[String] =
-    doc.filter(token => token.length >= minLength);
+    doc.filter(token => token.length >= minLength)
 }
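The `~>` combinator composes directly with the transformers in this patch; a minimal end-to-end sketch:

    // Tokenize on whitespace, then drop tokens shorter than three characters.
    val pipeline = WhitespaceTokenizer() ~> MinimumLengthFilter(3)
    pipeline("an ox ate the corn")  // ate / the / corn

Because `Chain` wraps a plain tokenizer's output in a view, the filter runs over a lazy view of the token stream rather than a strict collection.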
diff --git a/src/main/scala/chalk/text/transform/WordsAndNumbersOnlyFilter.scala b/src/main/scala/chalk/text/transform/WordsAndNumbersOnlyFilter.scala
index 7dd97d1..d3de507 100644
--- a/src/main/scala/chalk/text/transform/WordsAndNumbersOnlyFilter.scala
+++ b/src/main/scala/chalk/text/transform/WordsAndNumbersOnlyFilter.scala
@@ -7,7 +7,7 @@ package chalk.text.transform
  */
 case class WordsAndNumbersOnlyFilter() extends Transformer {
   override def apply(terms : Iterable[String]) =
-    terms.filter(term => TokenType.Word.matches(term) || TokenType.Number.matches(term));
+    terms.filter(term => TokenType.Word.matches(term) || TokenType.Number.matches(term))
 }

 /**
@@ -22,23 +22,23 @@ sealed trait TokenType
 object TokenType {
   abstract class RegexToken(val pattern : java.util.regex.Pattern) extends TokenType {
     def matches(token : String) =
-      pattern.matcher(token).matches;
+      pattern.matcher(token).matches
   }

-  case object Number extends RegexToken("^.*\\p{N}.*$".r.pattern);
+  case object Number extends RegexToken("^.*\\p{N}.*$".r.pattern)
   case object Punctuation extends RegexToken("^[\\p{P}\\p{S}]+$".r.pattern)
-  case object Word extends RegexToken("^.*\\p{L}+.*$".r.pattern);
-  case object Other extends TokenType;
+  case object Word extends RegexToken("^.*\\p{L}+.*$".r.pattern)
+  case object Other extends TokenType

   def apply(token : String) : TokenType = {
     if (Word.matches(token)) {
-      Word;
+      Word
     } else if (Number.matches(token)) {
-      Number;
+      Number
     } else if (Punctuation.matches(token)) {
-      Punctuation;
+      Punctuation
     } else {
-      Other;
+      Other
     }
   }
 }
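Since `TokenType.apply` tests `Word` before `Number`, any token containing a letter classifies as a word even when digits are present; the order of the checks matters. Hand-checked expectations against the regexes above:

    TokenType("hello")   // Word        (contains a letter)
    TokenType("scala3")  // Word        (the letter test wins over the digit test)
    TokenType("3.14")    // Number      (digits but no letters)
    TokenType("?!")      // Punctuation (only \p{P}/\p{S} characters)
    TokenType(" ")       // Other       (whitespace matches none of the patterns)

This ordering is also why `WordsAndNumbersOnlyFilter` keeps both `scala3` and `3.14` but drops `?!`.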