Skip to content
This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Permalink
#18 progress. Moved non-actor bits out of AnalysisEngine.scala into A…
Browse files Browse the repository at this point in the history
…nalysisFunction.scala and created example AnalysisPipeline main method for processing w/o Actors.
  • Loading branch information
jasonbaldridge committed Jul 21, 2013
1 parent b6d0999 commit 2edd8c5
Show file tree
Hide file tree
Showing 2 changed files with 59 additions and 32 deletions.
35 changes: 3 additions & 32 deletions src/main/scala/chalk/slab/AnalysisEngine.scala
Original file line number Diff line number Diff line change
Expand Up @@ -6,23 +6,11 @@ import akka.util.Timeout
import scala.collection.mutable.ListBuffer
import scala.concurrent.duration._

/**
* An analysis function that takes a Slab with declared annotation types in it and outputs
* a new Slab with additional annotations of a new type.
*
* As a function it maps a `Slab[C,B,I]` to a `Slab[C,B,B with O]`.
*
* Documentation for the type variables:
* C = Content type
* B = Base annotation type
* I = Input annotation type (contravariant, bounded above by B)
* O = Output annotation type (covariant, bounded above by B)
*/
trait AnalysisFunction[C,B,-I<:B,+O<:B] extends (Slab[C,B,I] => Slab[C,B,B with O])

/**
* An actor that mixes-in an AnalysisFunction and hands Slabs contained in Process messages over
* to the function.
*/
trait AnalysisComponent[C,B,-I<:B,+O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] {
trait AnalysisComponent[C,B,I<:B,O<:B] extends Actor with ActorLogging with AnalysisFunction[C,B,I,O] {
import AnalysisComponent._
def receive = {
case Process(slab) => sender ! apply(slab.asInstanceOf[Slab[C,B,I]])
Expand All @@ -36,30 +24,12 @@ object AnalysisComponent {
case class Process[C,B,I<:B](slab: Slab[C,B,I])
}

/**
* A simple regex sentence segmenter.
*
* Every regex match over the slab's content becomes a Sentence(start, end) annotation, and
* those annotations are added to the incoming slab with `++`.
*
* A match starts at a character that is neither whitespace nor one of `.`, `!`, `?`, and must
* end with exactly one of `.`, `!` or `?`. Text that is not followed by one of those
* characters therefore produces no Sentence.
*/
trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) =
slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))
}

/**
* An actor that uses SentenceSegmenter.
*
* Combines SentenceSegmenter with AnalysisComponent using the same type arguments. The class
* body is empty, so all behavior comes from those two traits.
*/
class SentenceSegmenterActor extends SentenceSegmenter
with AnalysisComponent[String,StringAnnotation,StringAnnotation,Sentence]

/**
* A simple regex tokenizer.
*
* For every Sentence annotation in the slab, the regex is matched against
* `sentence.in(slab).content` (presumably the text covered by that sentence -- confirm against
* the annotation API). A match is a run of letters (`\p{L}`), a run of punctuation (`\p{P}`)
* or a run of numeric characters (`\p{N}`).
*
* Match offsets are relative to the text that was matched, so `sentence.begin` is added to
* both ends before building each Token. The resulting Tokens are added to the slab with `++`.
*/
trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
def apply(slab: Slab[String, StringAnnotation, Sentence]) =
slab ++ slab.iterator[Sentence].flatMap(sentence =>
"\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
Token(sentence.begin + m.start, sentence.begin + m.end)))
}

/**
* An actor that uses Tokenizer.
*/
Expand All @@ -73,8 +43,9 @@ object AnalysisEngine {
import AnalysisComponent._
import StringAnnotation._

val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

def main(args: Array[String]) {
val text = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

val slab = Slab(text)
val system = ActorSystem("ChalkSystem")
Expand Down
56 changes: 56 additions & 0 deletions src/main/scala/chalk/slab/AnalysisFunction.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
package chalk.slab

/**
* An analysis function that takes a Slab with declared annotation types in it and outputs
* a new Slab with additional annotations of a new type.
*
* As a function it maps a `Slab[C,B,I]` to a `Slab[C,B,B with I with O]`, i.e. the result type
* mentions both the input annotation type and the output annotation type.
*
* Documentation for the type variables:
* C = Content type
* B = Base annotation type
* I = Input annotation type (bounded above by B)
* O = Output annotation type (bounded above by B)
*/
trait AnalysisFunction[C,B,I<:B,O<:B] extends (Slab[C,B,I] => Slab[C,B,B with I with O])

/**
 * A no-op analysis function over String slabs: the slab that is passed in is handed back
 * unchanged, with no annotations added.
 */
object StringIdentityAnalyzer
  extends AnalysisFunction[String, StringAnnotation, StringAnnotation, StringAnnotation] {

  /**
   * @param slab the slab to "analyze"
   * @return the very same slab
   */
  override def apply(slab: Slab[String, StringAnnotation, StringAnnotation]): Slab[String, StringAnnotation, StringAnnotation] = {
    slab
  }
}

/**
 * Regex-based sentence segmentation for String slabs.
 *
 * Each match of the sentence pattern over the slab's content is turned into a
 * Sentence(start, end) annotation, and those annotations are added to the slab with `++`.
 * A match has to end in one of `.`, `!` or `?`, so text that is not followed by one of those
 * characters yields no Sentence.
 */
trait SentenceSegmenter extends AnalysisFunction[String, StringAnnotation, StringAnnotation, Sentence] {
  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) = {
    // Start on a non-whitespace, non-terminator character and run up to the next terminator.
    val sentencePattern = "[^\\s.!?]+[^.!?]+[.!?]".r
    val sentences =
      for (found <- sentencePattern.findAllMatchIn(slab.content))
        yield Sentence(found.start, found.end)
    slab ++ sentences
  }
}

/**
 * A simple regex tokenizer.
 *
 * For every Sentence annotation in the slab, the token pattern is matched against
 * `sentence.in(slab).content`. A token is a run of letters (`\p{L}`), a run of punctuation
 * (`\p{P}`) or a run of numeric characters (`\p{N}`); anything else (e.g. whitespace) is
 * skipped.
 *
 * Match offsets are relative to the text that was matched, so `sentence.begin` is added to
 * both ends before building each Token. The resulting Tokens are added to the slab with `++`.
 */
trait Tokenizer extends AnalysisFunction[String, StringAnnotation, Sentence, Token] {
  def apply(slab: Slab[String, StringAnnotation, Sentence]) = {
    // Compile the pattern once per call instead of once per sentence inside the flatMap.
    val tokenPattern = "\\p{L}+|\\p{P}+|\\p{N}+".r
    slab ++ slab.iterator[Sentence].flatMap(sentence =>
      tokenPattern.findAllMatchIn(sentence.in(slab).content).map(m =>
        Token(sentence.begin + m.start, sentence.begin + m.end)))
  }
}


/**
 * Example of running the analysis functions as a plain function pipeline, without actors.
 */
object AnalysisPipeline {
  import StringAnnotation._

  /**
   * Composes the identity analyzer, the sentence segmenter and the tokenizer with `andThen`,
   * applies the composed function to a Slab built from AnalysisEngine.text, and prints the
   * text of every Sentence and every Token found.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val sentenceSegmenter = new SentenceSegmenter {}
    val tokenizer = new Tokenizer {}
    val pipeline = StringIdentityAnalyzer andThen sentenceSegmenter andThen tokenizer
    val slab = pipeline(Slab(AnalysisEngine.text))

    // Notice that the last sentence (lacking EOS char) is missing.
    val sentences = slab.iterator[Sentence].toList
    println("\nSENTENCES\n\n" + sentences.map(_.in(slab).content).mkString("\n"))

    val tokens = slab.iterator[Token].toList
    println("\nTOKENS\n\n" + tokens.map(_.in(slab).content).mkString("\n"))
  }
}

0 comments on commit 2edd8c5

Please sign in to comment.