#18 progress. Moved non-actor bits out of AnalysisEngine.scala into AnalysisFunction.scala and created example AnalysisPipeline main method for processing w/o Actors.
scalanlp · Jul 21, 2013 · 2edd8c5 · 2edd8c5
1 parent b6d0999
commit 2edd8c5
Show file tree

Hide file tree

Showing 2 changed files with 59 additions and 32 deletions.
diff --git a/src/main/scala/chalk/slab/AnalysisEngine.scala b/src/main/scala/chalk/slab/AnalysisEngine.scala
@@ -6,23 +6,11 @@ import akka.util.Timeout
 import scala.collection.mutable.ListBuffer
 import scala.concurrent.duration._
 
-/**
-  * An analysis function that takes a Slab with declared annotation types in it and outputs
-  * a new Slab with additional annotations of a new type.
-  *
-  * Documentation for the type variables:
-  *   C = Content type
-  *   B = Base annonation type
-  *   I = Input annotation type
-  *   O = Output annotation type
-  */ 
-trait AnalysisFunction[C,B,-I<:B,+O<:B] extends (Slab[C,B,I] => Slab[C,B,B with O])
-
 /**
   * An actor that mixes-in an AnalysisFunction and hands Slabs contained in Process messages over
   * to the function.
   */
-trait AnalysisComponent[C,B,-I<:B,+O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] {
+trait AnalysisComponent[C,B,I<:B,O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] {
   import AnalysisComponent._
   def receive = {
     case Process(slab) => sender ! apply(slab.asInstanceOf[Slab[C,B,I]])
@@ -36,30 +24,12 @@ object AnalysisComponent {
   case class Process[C,B,I<:B](slab: Slab[C,B,I])
 }
 
-/**
-  * A simple regex sentence segmenter.
-  */
-trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
-  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) =
-    slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
-}
-
 /**
   * An actor that uses SentenceSegmenter.
   *
   * It combines the SentenceSegmenter analysis function with AnalysisComponent,
   * with the type arguments fixed to String content, StringAnnotation as both the
   * base and the input annotation type, and Sentence as the output annotation type.
   */
 class SentenceSegmenterActor extends SentenceSegmenter
     with AnalysisComponent[String,StringAnnotation,StringAnnotation,Sentence]
 
-/**
-  * A simple regex tokenizer.
-  */
-trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
-  def apply(slab: Slab[String, StringAnnotation, Sentence]) =
-    slab ++ slab.iterator[Sentence].flatMap(sentence =>
-      "\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
-        Token(sentence.begin + m.start, sentence.begin + m.end)))
-}
-
 /**
   * An actor that uses Tokenizer.
   */
@@ -73,8 +43,9 @@ object AnalysisEngine {
   import AnalysisComponent._
   import StringAnnotation._
 
+  val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
+
   def main(args: Array[String]) {
-    val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"
 
     val slab = Slab(text)
     val system = ActorSystem("ChalkSystem")

diff --git a/src/main/scala/chalk/slab/AnalysisFunction.scala b/src/main/scala/chalk/slab/AnalysisFunction.scala
@@ -0,0 +1,56 @@
+package chalk.slab
+
+/**
+  * A function from Slab to Slab: it accepts a Slab that already carries annotations
+  * of the declared input type and yields a new Slab that additionally carries
+  * annotations of the output type.
+  *
+  * @tparam C the content type
+  * @tparam B the base annotation type
+  * @tparam I the input annotation type (a subtype of B)
+  * @tparam O the output annotation type (a subtype of B)
+  */
+trait AnalysisFunction[C, B, I <: B, O <: B]
+  extends Function1[Slab[C, B, I], Slab[C, B, B with I with O]]
+
+/**
+  * An AnalysisFunction over String content that hands back the Slab it was given,
+  * unchanged. AnalysisPipeline uses it as the first stage of its example pipeline.
+  */
+object StringIdentityAnalyzer
+    extends AnalysisFunction[String, StringAnnotation, StringAnnotation, StringAnnotation] {
+
+  override def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) = {
+    slab
+  }
+}
+
+/**
+  * A simple regex sentence segmenter.
+  *
+  * Every match of the sentence pattern in the slab content becomes a Sentence
+  * annotation built from the match's start and end offsets. The pattern requires a
+  * closing '.', '!' or '?', so trailing text without one yields no Sentence.
+  */
+trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
+  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) = {
+    val sentencePattern = "[^\\s.!?]+[^.!?]+[.!?]".r
+    val sentences = sentencePattern
+      .findAllMatchIn(slab.content)
+      .map(found => Sentence(found.start, found.end))
+    slab ++ sentences
+  }
+}
+
+/**
+  * A simple regex tokenizer.
+  *
+  * For every Sentence annotation in the slab, each maximal run of letters (\p{L}),
+  * punctuation (\p{P}) or digits (\p{N}) in that sentence's content becomes a Token.
+  * Match offsets are shifted by `sentence.begin` before the Token is built, since the
+  * regex is run against the sentence's own content rather than the whole slab.
+  */
+trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
+  def apply(slab: Slab[String, StringAnnotation, Sentence]) = {
+    // Compile the pattern once per call; previously it was recompiled for every sentence.
+    val tokenPattern = "\\p{L}+|\\p{P}+|\\p{N}+".r
+    slab ++ slab.iterator[Sentence].flatMap(sentence =>
+      tokenPattern.findAllMatchIn(sentence.in(slab).content).map(m =>
+        Token(sentence.begin + m.start, sentence.begin + m.end)))
+  }
+}
+
+
+/**
+  * Example of chaining analysis functions as plain function composition, without actors.
+  */
+object AnalysisPipeline {
+  import StringAnnotation._
+
+  /**
+    * Runs the identity analyzer, sentence segmenter and tokenizer over
+    * `AnalysisEngine.text`, then prints the resulting sentences followed by the tokens.
+    *
+    * @param args command-line arguments (unused)
+    */
+  def main(args: Array[String]): Unit = {
+    val sentenceSegmenter = new SentenceSegmenter {}
+    val tokenizer = new Tokenizer {}
+    // Each stage's output Slab type feeds the next stage's input type.
+    val pipeline = StringIdentityAnalyzer andThen sentenceSegmenter andThen tokenizer
+    val slab = pipeline(Slab(AnalysisEngine.text))
+
+    // Notice that the last sentence (lacking EOS char) is missing.
+    val sentences = slab.iterator[Sentence].toList
+    println("\nSENTENCES\n\n" + sentences.map(_.in(slab).content).mkString("\n"))
+
+    val tokens = slab.iterator[Token].toList
+    println("\nTOKENS\n\n" + tokens.map(_.in(slab).content).mkString("\n"))
+  }
+}