This repository has been archived by the owner on May 29, 2020. It is now read-only.

Commit

Merge pull request #21 from bethard/master
Renames StringAnnotation to Span, adds StringSlab alias
jasonbaldridge committed Jul 22, 2013
2 parents d356125 + d30fa19 commit b3d3421
Showing 4 changed files with 50 additions and 34 deletions.
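The change is mostly mechanical: the base annotation trait StringAnnotation becomes Span, and the Slab companion gains a StringSlab type alias so that Slab[String, Span, A] can be written as StringSlab[A] throughout the signatures below. As orientation, here is a minimal, self-contained sketch of that alias pattern; the simplified Slab and Sentence classes are illustrative stand-ins, not chalk's actual types.

    // Illustrative stand-ins only; chalk's real Slab also carries annotation
    // lookup, a HasBounds instance, and a ++ method that this toy omits.
    object AliasSketch {
      trait Span { def begin: Int; def end: Int }
      final case class Sentence(begin: Int, end: Int) extends Span

      // Content plus the annotations attached to it so far.
      final case class Slab[Content, Base, A <: Base](content: Content, annotations: Vector[A])

      // The alias collapses the three-parameter type into one parameter,
      // mirroring `type StringSlab[+AnnotationTypes <: Span] = Slab[String, Span, AnnotationTypes]`.
      type StringSlab[A <: Span] = Slab[String, Span, A]

      def main(args: Array[String]): Unit = {
        val full: Slab[String, Span, Sentence] =
          Slab[String, Span, Sentence]("Here is an example text.", Vector(Sentence(0, 24)))
        val short: StringSlab[Sentence] = full // the same type, spelled with the alias
        println(short.annotations.map(s => short.content.substring(s.begin, s.end)))
      }
    }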
20 changes: 11 additions & 9 deletions src/main/scala/chalk/slab/AnalysisEngine.scala
@@ -25,14 +25,14 @@ object AnalysisComponent {
case class Process[C,B,I<:B](slab: Slab[C,B,I])
}

-trait StringAnalysisComponent[I<:StringAnnotation,O<:StringAnnotation]
-  extends AnalysisComponent[String,StringAnnotation,I,O]
+trait StringAnalysisComponent[I<:Span,O<:Span]
+  extends AnalysisComponent[String,Span,I,O]

/**
* An actor that uses SentenceSegmenter.
*/
-class SentenceSegmenterActor extends SentenceSegmenter[StringAnnotation]
-  with StringAnalysisComponent[StringAnnotation,Sentence]
+class SentenceSegmenterActor extends SentenceSegmenter[Span]
+  with StringAnalysisComponent[Span,Sentence]

/**
* An actor that uses Tokenizer.
@@ -51,7 +51,8 @@ class AnalysisEngine extends Actor with ActorLogging {

import AnalysisComponent._
import AnalysisEngine._
-  import StringAnnotation._
+  import Span._
+  import Slab.StringSlab
implicit val ec = context.dispatcher
implicit val timeout = Timeout(10 seconds)

@@ -62,8 +63,8 @@ class AnalysisEngine extends Actor with ActorLogging {
case Process(slab) =>
log.info("Processing slab:\n " + slab.content)
(for {
-        slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[Slab[String,StringAnnotation,Sentence]]
-        slab2 <- (tokenizer ? Process(slab1)).mapTo[Slab[String,StringAnnotation,Sentence with Token]]
+        slab1 <- (sentenceSegmenter ? Process(slab)).mapTo[StringSlab[Sentence]]
+        slab2 <- (tokenizer ? Process(slab1)).mapTo[StringSlab[Sentence with Token]]
} yield {
slab2
}) pipeTo sender
@@ -81,7 +82,8 @@ object AnalysisEngine {
case class ProcessCorpus(corpus: Iterator[String])

import AnalysisComponent._
-  import StringAnnotation._
+  import Span._
+  import Slab.StringSlab

val text1 = "Here is an example text. It has four sentences and it mentions Jimi Hendrix and Austin, Texas! In this third sentence, it also brings up Led Zeppelin and Radiohead, but does it ask a question? It also has a straggler sentence that doesn't end with punctuation"

@@ -99,7 +101,7 @@ object AnalysisEngine {
val corpus = Iterator(text1,text2,text3)

for {
-      slabs <- (engine ? ProcessCorpus(corpus)).mapTo[Iterator[Slab[String,StringAnnotation,Sentence with Token]]]
+      slabs <- (engine ? ProcessCorpus(corpus)).mapTo[Iterator[StringSlab[Sentence with Token]]]
slab <- slabs
} {
// Notice that the last sentence (lacking EOS char) is missing.
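The receive block above chains two actor asks in a for-comprehension, narrowing each untyped reply with mapTo before piping the final slab back to the sender. Below is a standalone sketch of that shape using plain scala.concurrent futures; askSegmenter and askTokenizer are hypothetical placeholders standing in for `sentenceSegmenter ? Process(...)` and `tokenizer ? Process(...)`.

    import scala.concurrent.{Await, Future}
    import scala.concurrent.duration._
    import scala.concurrent.ExecutionContext.Implicits.global

    object AskShapeSketch {
      // Placeholders for the actor asks; a real engine gets Future[Any] back from `actorRef ? msg`.
      def askSegmenter(text: String): Future[Any] =
        Future(Vector("Here is an example text.", "It has two sentences!"))
      def askTokenizer(sentences: Vector[String]): Future[Any] =
        Future(sentences.map(_.split("\\s+").toVector))

      def main(args: Array[String]): Unit = {
        val result = for {
          sentences <- askSegmenter("Here is an example text. It has two sentences!").mapTo[Vector[String]] // first stage, like the segmenter ask
          tokens    <- askTokenizer(sentences).mapTo[Vector[Vector[String]]]                                  // second stage, like the tokenizer ask
        } yield tokens
        println(Await.result(result, 10.seconds))
      }
    }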
22 changes: 12 additions & 10 deletions src/main/scala/chalk/slab/AnalysisFunction.scala
@@ -1,5 +1,7 @@
package chalk.slab

+import Slab.StringSlab

/**
* An analysis function that takes a Slab with declared annotation types in it and outputs
* a new Slab with additional annotations of a new type.
@@ -12,17 +14,17 @@ package chalk.slab
*/
trait AnalysisFunction[C,B,I<:B,O<:B] extends (Slab[C,B,I] => Slab[C,B,B with I with O])

-trait StringAnalysisFunction[I<:StringAnnotation,O<:StringAnnotation] extends (Slab[String,StringAnnotation,I] => Slab[String,StringAnnotation,StringAnnotation with I with O])
+trait StringAnalysisFunction[I<:Span,O<:Span] extends (StringSlab[I] => StringSlab[Span with I with O])

-object StringIdentityAnalyzer extends StringAnalysisFunction[StringAnnotation, StringAnnotation] {
-  def apply(slab: Slab[String, StringAnnotation, StringAnnotation]) = slab
+object StringIdentityAnalyzer extends StringAnalysisFunction[Span, Span] {
+  def apply(slab: StringSlab[Span]) = slab
}

/**
* A simple regex sentence segmenter.
*/
-trait SentenceSegmenter[I <: StringAnnotation] extends StringAnalysisFunction[I, Sentence] {
-  def apply(slab: Slab[String, StringAnnotation, I]) =
+trait SentenceSegmenter[I <: Span] extends StringAnalysisFunction[I, Sentence] {
+  def apply(slab: StringSlab[I]) =
// the [Sentence] is required because of https://issues.scala-lang.org/browse/SI-7647
slab.++[Sentence]("[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end)))
}
@@ -31,7 +33,7 @@ trait SentenceSegmenter[I <: StringAnnotation] extends StringAnalysisFunction[I,
* A simple regex tokenizer.
*/
trait Tokenizer[I <: Sentence] extends StringAnalysisFunction[I, Token] {
-  def apply(slab: Slab[String, StringAnnotation, I]) =
+  def apply(slab: StringSlab[I]) =
// the [Token] is required because of https://issues.scala-lang.org/browse/SI-7647
slab.++[Token](slab.iterator[Sentence].flatMap(sentence =>
"\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
@@ -40,15 +42,15 @@ trait Tokenizer[I <: Sentence] extends StringAnalysisFunction[I, Token] {


object AnalysisPipeline {
-  import StringAnnotation._
+  import Span._

// added only to demonstrate necesssity of [I] parameter on analyzers
-  private[AnalysisPipeline] case class Document(val begin: Int, val end: Int) extends StringAnnotation
-  private[AnalysisPipeline] def documentAdder(slab: Slab[String, StringAnnotation, StringAnnotation]) =
+  private[AnalysisPipeline] case class Document(val begin: Int, val end: Int) extends Span
+  private[AnalysisPipeline] def documentAdder(slab: StringSlab[Span]) =
slab ++ Iterator(Document(0, slab.content.length))

def main (args: Array[String]) {
-    def sentenceSegmenter[I <: StringAnnotation] = new SentenceSegmenter[I]{}
+    def sentenceSegmenter[I <: Span] = new SentenceSegmenter[I]{}
def tokenizer[I <: Sentence] = new Tokenizer[I]{}
val pipeline = StringIdentityAnalyzer andThen documentAdder andThen sentenceSegmenter andThen tokenizer
val slab = pipeline(Slab(AnalysisEngine.text1))
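Each analyzer in this file is ultimately just a function from one slab type to another, which is why AnalysisPipeline.main can build the pipeline with plain andThen. Here is a toy version of that composition with ordinary Function1 values; the Doc record is a crude hypothetical stand-in for a slab (it drops the type-level tracking of annotation types), while the regexes are the ones from the diff.

    object ComposeSketch {
      // Crude stand-in for a slab: the text plus whatever annotations have accumulated.
      final case class Doc(content: String,
                           sentences: Vector[(Int, Int)] = Vector.empty,
                           tokens: Vector[(Int, Int)] = Vector.empty)

      private val sentenceRe = "[^\\s.!?]+[^.!?]+[.!?]".r
      private val tokenRe    = "\\p{L}+|\\p{P}+|\\p{N}+".r

      val segment: Doc => Doc = d =>
        d.copy(sentences = sentenceRe.findAllMatchIn(d.content).map(m => (m.start, m.end)).toVector)

      val tokenize: Doc => Doc = d =>
        d.copy(tokens = d.sentences.flatMap { case (b, e) =>
          tokenRe.findAllMatchIn(d.content.substring(b, e)).map(m => (b + m.start, b + m.end)).toVector
        })

      def main(args: Array[String]): Unit = {
        val pipeline = segment andThen tokenize // composition, as in the real pipeline
        val out = pipeline(Doc("Here is an example text. It has two sentences!"))
        println(out.tokens.map { case (b, e) => out.content.substring(b, e) })
      }
    }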
34 changes: 22 additions & 12 deletions src/main/scala/chalk/slab/Slab.scala
@@ -38,34 +38,44 @@ abstract class SlabAnnotationOps[ContentType, BaseAnnotationType, AnnotationType
// =========================
// Annotation infrastructure
// =========================
-trait StringAnnotation {
+trait Span {
val begin: Int
val end: Int
-  def in[AnnotationTypes <: StringAnnotation](slab: Slab[String, StringAnnotation, AnnotationTypes]) =
-    new SlabAnnotationOps(this, slab) {
-      def content = this.slab.content.substring(this.annotation.begin, this.annotation.end)
-    }
}

-object StringAnnotation {
-  implicit object StringAnnotationHasBounds extends Slab.HasBounds[StringAnnotation] {
-    def covers(annotation1: StringAnnotation, annotation2: StringAnnotation): Boolean =
+object Span {
+  implicit class SpanInStringSlab(val span: Span) extends AnyVal {
+    def in[AnnotationTypes <: Span](slab: Slab.StringSlab[AnnotationTypes]) =
+      new StringSpanAnnotationOps(this.span, slab)
+  }
+
+  class StringSpanAnnotationOps[AnnotationType >: AnnotationTypes <: Span: ClassTag, AnnotationTypes <: Span](
+    annotation: AnnotationType,
+    slab: Slab.StringSlab[AnnotationTypes])
+    extends SlabAnnotationOps[String, Span, AnnotationType, AnnotationTypes](annotation, slab) {
+    def content = this.slab.content.substring(this.annotation.begin, this.annotation.end)
+  }
+
+  implicit object StringAnnotationHasBounds extends Slab.HasBounds[Span] {
+    def covers(annotation1: Span, annotation2: Span): Boolean =
annotation1.begin <= annotation2.begin && annotation2.end <= annotation1.end
-    def follows(annotation1: StringAnnotation, annotation2: StringAnnotation): Boolean =
+    def follows(annotation1: Span, annotation2: Span): Boolean =
annotation2.end <= annotation1.begin
-    def precedes(annotation1: StringAnnotation, annotation2: StringAnnotation): Boolean =
+    def precedes(annotation1: Span, annotation2: Span): Boolean =
annotation1.end <= annotation2.begin
}
}

// ===========
// Annotations
// ===========
-case class Sentence(val begin: Int, val end: Int) extends StringAnnotation
-case class Token(val begin: Int, val end: Int) extends StringAnnotation
+case class Sentence(val begin: Int, val end: Int) extends Span
+case class Token(val begin: Int, val end: Int) extends Span


object Slab {
+  type StringSlab[+AnnotationTypes <: Span] = Slab[String, Span, AnnotationTypes]

def apply[ContentType, BaseAnnotationType: HasBounds](content: ContentType): Slab[ContentType, BaseAnnotationType, BaseAnnotationType] =
new HorribleInefficientSlab(content)

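The HasBounds instance above reduces to three interval predicates over begin/end offsets. Here they are copied into a standalone check so the definitions can be read at a glance; the S case class is just a throwaway stand-in for Span.

    object BoundsSketch {
      final case class S(begin: Int, end: Int)

      // Same bodies as StringAnnotationHasBounds above.
      def covers(a: S, b: S): Boolean   = a.begin <= b.begin && b.end <= a.end
      def follows(a: S, b: S): Boolean  = b.end <= a.begin // a starts at or after b's end
      def precedes(a: S, b: S): Boolean = a.end <= b.begin // a ends at or before b's start

      def main(args: Array[String]): Unit = {
        val sentence = S(0, 25)
        val token    = S(5, 9)
        val later    = S(30, 35)
        println(covers(sentence, token))   // true: the token lies inside the sentence
        println(precedes(sentence, later)) // true: the sentence ends before `later` begins
        println(follows(later, sentence))  // true: `later` starts after the sentence ends
      }
    }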
8 changes: 5 additions & 3 deletions src/test/scala/chalk/slab/SlabTest.scala
@@ -10,12 +10,14 @@ class SlabTest extends FunSuite {
// =========
// Analyzers
// =========
-  val stringBegin = (slab: Slab[String, StringAnnotation, StringAnnotation]) => slab
+  import Slab.StringSlab
+
+  val stringBegin = (slab: StringSlab[Span]) => slab

-  def sentenceSegmenter[AnnotationTypes <: StringAnnotation](slab: Slab[String, StringAnnotation, AnnotationTypes]) =
+  def sentenceSegmenter[AnnotationTypes <: Span](slab: StringSlab[AnnotationTypes]) =
slab ++ "[^\\s.!?]+[^.!?]+[.!?]".r.findAllMatchIn(slab.content).map(m => Sentence(m.start, m.end))

-  def tokenizer[AnnotationTypes <: Sentence](slab: Slab[String, StringAnnotation, AnnotationTypes]) =
+  def tokenizer[AnnotationTypes <: Sentence](slab: StringSlab[AnnotationTypes]) =
slab ++ slab.iterator[Sentence].flatMap(sentence =>
"\\p{L}+|\\p{P}+|\\p{N}+".r.findAllMatchIn(sentence.in(slab).content).map(m =>
Token(sentence.begin + m.start, sentence.begin + m.end)))
