Skip to content

Commit

Permalink
Use a separate field Resource.article to hold the parsed/cleaned-up…
Browse files Browse the repository at this point in the history
… DOM tree, so it’s clear whether article extraction has been performed or not. This also leaves the original `Document` untouched, so chained plugins can continue to operate on the original DOM tree.
  • Loading branch information
chimbori committed May 24, 2022
1 parent cd85faa commit 30c2267
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 2 deletions.
6 changes: 6 additions & 0 deletions src/main/kotlin/com/chimbori/crux/Crux.kt
Original file line number Diff line number Diff line change
Expand Up @@ -96,6 +96,9 @@ public data class Resource(
/** Parsed DOM tree for this resource, if available. */
val document: Document? = null,

/** Extracted and cleaned-up DOM tree for this resource, if available. */
val article: Document? = null,

/**
* Text fields extracted from this resource, stored as key-value pairs. It is recommended to use well-defined keys
* from [com.chimbori.crux.Fields] for all standard fields. Custom fields are also supported, in case none of the
Expand Down Expand Up @@ -126,8 +129,10 @@ public data class Resource(
/**
 * Combines this [Resource] with [anotherResource] and returns the result as a new [Resource].
 *
 * For `url`, `document`, and `article`, the value from [anotherResource] is used when it is non-null; otherwise the
 * value already held by this resource is kept. The `fields`, `urls`, and `objects` collections are joined with `+`,
 * with this resource's collection as the left-hand operand.
 */
public operator fun plus(anotherResource: Resource): Resource {
  // Single-valued properties: take the incoming value if present, else fall back to our own.
  val mergedUrl = anotherResource.url ?: url
  val mergedDocument = anotherResource.document ?: document
  val mergedArticle = anotherResource.article ?: article

  // Key-value properties: combine ours with the incoming ones.
  val mergedFields = fields + anotherResource.fields
  val mergedUrls = urls + anotherResource.urls
  val mergedObjects = objects + anotherResource.objects

  return Resource(
    url = mergedUrl,
    document = mergedDocument,
    article = mergedArticle,
    fields = mergedFields,
    urls = mergedUrls,
    objects = mergedObjects,
  )
}

/**
Expand All @@ -137,6 +142,7 @@ public data class Resource(
// Returns a copy of this Resource in which:
//  - `fields` keeps only entries whose value is neither null nor blank,
//  - `urls` keeps only entries whose value is non-null,
//  - `objects` keeps only entries whose value is non-null.
// All other properties are carried over unchanged by `copy`.
public fun removeNullValues(): Resource {
  val nonBlankFields = fields.filterValues { value -> !value.isNullOrBlank() }
  val nonNullUrls = urls.filterValues { value -> value != null }
  val nonNullObjects = objects.filterValues { value -> value != null }
  return copy(
    fields = nonBlankFields,
    urls = nonNullUrls,
    objects = nonNullObjects,
  )
}

/** For any potential extension functions to be defined on the companion object. */
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ public class ArticleExtractor : Plugin {
val extractedDoc = PostprocessHelpers.postprocess(bestMatchElement)
return Resource(
objects = mapOf(DURATION_MS to extractedDoc.estimatedReadingTimeMs()),
document = extractedDoc
article = extractedDoc
)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ class ArticleExtractorTest {
val readingTimeMinutes = (parsed.objects.get(DURATION_MS) as? Int)?.div(60_000)
assertEquals(51, readingTimeMinutes)

val extractedArticle = parsed.document
val extractedArticle = parsed.article
assertNotNull(extractedArticle)
assertStartsWith(
""""Galileo" redirects here. For other uses, see Galileo (disambiguation) and Galileo Galilei (disambiguation).""",
Expand Down

0 comments on commit 30c2267

Please sign in to comment.