Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add Familienkarte Collector #602

Merged
merged 4 commits into from
Feb 4, 2025
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -16,45 +16,44 @@ import java.time.Clock
import java.util.concurrent.Callable

class Fetcher(
private val clock: Clock,
private val sleeper: Sleeper,
private val httpClient: HttpClientWrapper,
private val manualSetDelay: Long? = null
private val manualSetDelay: Long? = null,
private val userAgent: String = "boudicca.events.crawler/1.0 (https://boudicca.events/)",
private val clock: Clock = Clock.systemDefaultZone(),
private val sleeper: Sleeper = Sleeper { ms -> Thread.sleep(ms) },
private val httpClient: HttpClientWrapper = createHttpClientWrapper(userAgent)
) {
companion object {
@Volatile
var fetcherCache: FetcherCache = NoopFetcherCache
}

constructor(manualSetDelay: Long? = null) : this(
Clock.systemDefaultZone(),
Sleeper { ms -> Thread.sleep(ms) },
createHttpClientWrapper(),
manualSetDelay
)

private val LOG = LoggerFactory.getLogger(this::class.java)

private var lastRequestEnd = 0L
private var lastRequestDuration = 0L

fun fetchUrl(url: String): String {
if (fetcherCache.containsEntry(url)) {
LOG.debug("Returning Cached entry for URL $url")
return fetcherCache.getEntry(url)
}
Collections.startHttpCall(url)
val response = doRequest(url) { httpClient.doGet(url) }
fetcherCache.putEntry(url, response)
LOG.debug("Added new entry to cache with URL: $url")
return response
}

fun fetchUrlPost(url: String, contentType: String, content: ByteArray): String {
val cacheKey = "$url|${String(content)}"
fun fetchUrlPost(url: String, contentType: String, content: String): String {
val cacheKey = "$url|${content}"
if (fetcherCache.containsEntry(cacheKey)) {
LOG.debug("Using cached entry for Key: $cacheKey")
return fetcherCache.getEntry(cacheKey)
}
Collections.startHttpCall(url, String(content)) //TODO what if this is not string content?

Collections.startHttpCall(url, content)
val response = doRequest(url) { httpClient.doPost(url, contentType, content) }
LOG.debug("Added new entry to cache with Key: $cacheKey")
fetcherCache.putEntry(cacheKey, response)
return response
}
Expand Down Expand Up @@ -105,27 +104,25 @@ class Fetcher(
}
return waitTime
}

}

private fun createHttpClientWrapper(): HttpClientWrapper {
private fun createHttpClientWrapper(userAgent: String): HttpClientWrapper {
val httpClient = HttpClient.newBuilder()
.followRedirects(HttpClient.Redirect.NORMAL)
.build()
return object : HttpClientWrapper {
override fun doGet(url: String): Pair<Int, String> {
val request = HttpRequest.newBuilder(URI.create(url))
.GET()
.header("User-Agent", "boudicca.events collector")
.header("User-Agent", userAgent)
.build()

return doRequest(request)
}

override fun doPost(url: String, contentType: String, content: ByteArray): Pair<Int, String> {
override fun doPost(url: String, contentType: String, content: String): Pair<Int, String> {
val request = HttpRequest.newBuilder(URI.create(url))
.POST(BodyPublishers.ofByteArray(content))
.header("User-Agent", "boudicca.events collector")
.POST(BodyPublishers.ofString(content))
.header("User-Agent", userAgent)
.header("Content-Type", contentType)
.build()

Expand All @@ -136,7 +133,5 @@ private fun createHttpClientWrapper(): HttpClientWrapper {
val response = httpClient.send(request, BodyHandlers.ofString())
return Pair(response.statusCode(), response.body())
}

}
}

Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,5 @@ package base.boudicca.api.eventcollector.fetcher

interface HttpClientWrapper {
fun doGet(url: String): Pair<Int, String>
fun doPost(url: String, contentType: String, content: ByteArray): Pair<Int, String>
fun doPost(url: String, contentType: String, content: String): Pair<Int, String>
}
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,16 @@ class FetcherTest {
return callback.call()
}

override fun doPost(url: String, contentType: String, content: ByteArray): Pair<Int, String> {
override fun doPost(url: String, contentType: String, content: String): Pair<Int, String> {
return callback.call()
}

}

private val fetcher = Fetcher(
instantSource.withZone(ZoneId.systemDefault()),
{ instantSource.setMillis(instantSource.millis() + it) },
httpClientWrapper
clock = instantSource.withZone(ZoneId.systemDefault()),
sleeper = { instantSource.setMillis(instantSource.millis() + it) },
httpClient = httpClientWrapper
)

@BeforeEach
Expand Down Expand Up @@ -137,10 +137,10 @@ class FetcherTest {
@Test
fun testManualSetDelay() {
val customFetcher = Fetcher(
instantSource.withZone(ZoneId.systemDefault()),
{ instantSource.setMillis(instantSource.millis() + it) },
httpClientWrapper,
6789
manualSetDelay = 6789,
clock = instantSource.withZone(ZoneId.systemDefault()),
sleeper = { instantSource.setMillis(instantSource.millis() + it) },
httpClient = httpClientWrapper
)
httpClientWrapper.callback = Callable {
Pair(200, "OK")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ fun main() {
.addEventCollector(ZeroxACollector())
.addEventCollector(CCCEventsCollector())
.addEventCollector(ClerieDeChaosEventsCollector())
.addEventCollector(FamilienkarteEventCollector())
.build()

eventCollectorCoordinator.startWebUi()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,5 @@ fun main() {
// .debug(RemoteCollectorCollector("http://localhost:8080"))

//enable/add any collectors you want to test here
// .debug(AlpenvereinCollector())
.debug(ClerieDeChaosEventsCollector())
.debug(FamilienkarteEventCollector())
}
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import java.time.format.DateTimeFormatter

class AlpenvereinCollector : TwoStepEventCollector<String>("alpenverein") {

private val fetcher = Fetcher(12 * 1000) //they have a crawl-delay of 12 seconds...
private val fetcher = Fetcher(manualSetDelay = 12 * 1000) //they have a crawl-delay of 12 seconds...

override fun getAllUnparsedEvents(): List<String> {

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
package events.boudicca.eventcollector.collectors

import base.boudicca.SemanticKeys
import base.boudicca.api.eventcollector.Fetcher
import base.boudicca.api.eventcollector.TwoStepEventCollector
import base.boudicca.model.structured.StructuredEvent
import org.jsoup.Jsoup
import org.jsoup.nodes.Element
import java.net.URI
import java.time.LocalDate
import java.time.LocalTime
import java.time.OffsetDateTime
import java.time.ZoneId
import java.time.format.DateTimeFormatter

class FamilienkarteEventCollector : TwoStepEventCollector<String>("familienkarte") {

// TODO: handle pagination; currently only the first 25 results are parsed

Check warning on line 18 in boudicca.events/eventcollectors/src/main/kotlin/events/boudicca/eventcollector/collectors/FamilienkarteEventCollector.kt

View workflow job for this annotation

GitHub Actions / Check Code Quality

[detekt] reported by reviewdog 🐶 Forbidden TODO todo marker in comment, please do the changes. Raw Output: /github/workspace/boudicca.events/eventcollectors/src/main/kotlin/events/boudicca/eventcollector/collectors/FamilienkarteEventCollector.kt:18:5: warning: Forbidden TODO todo marker in comment, please do the changes. (detekt.ForbiddenComment)
// TODO: handle other categories and locations (and adjust the type respectively)

Check warning on line 19 in boudicca.events/eventcollectors/src/main/kotlin/events/boudicca/eventcollector/collectors/FamilienkarteEventCollector.kt

View workflow job for this annotation

GitHub Actions / Check Code Quality

[detekt] reported by reviewdog 🐶 Forbidden TODO todo marker in comment, please do the changes. Raw Output: /github/workspace/boudicca.events/eventcollectors/src/main/kotlin/events/boudicca/eventcollector/collectors/FamilienkarteEventCollector.kt:19:5: warning: Forbidden TODO todo marker in comment, please do the changes. (detekt.ForbiddenComment)

private val fetcher = Fetcher()
private val baseUrl = "https://www.familienkarte.at"

/**
 * Loads the event calendar overview page for the range "today" to "today plus one year"
 * and collects the `href` of every link found below `div.detailButton`.
 *
 * The returned values are used as-is; [parseStructuredEvent] appends each one to [baseUrl].
 *
 * @return the `href` attribute of each detail link, in document order
 */
override fun getAllUnparsedEvents(): List<String> {
    val isoDate = DateTimeFormatter.ofPattern("yyyy-MM-dd")
    val from = LocalDate.now().format(isoDate)
    val to = LocalDate.now().plusYears(1).format(isoDate)
    @Suppress("MaxLineLength") // this is a URL that would not gain readability by line-breaking
    val overviewUrl = "$baseUrl/de/freizeit/veranstaltungen/veranstaltungskalender.html?events_cat_key=8&date_from=$from&date_to=$to"
    val overviewHtml = fetcher.fetchUrl(overviewUrl)
    return Jsoup.parse(overviewHtml)
        .select("div.detailButton a")
        .map { link -> link.attr("href") }
}

/**
 * Fetches the detail page of a single event and converts it into a [StructuredEvent].
 *
 * @param event the link value produced by [getAllUnparsedEvents]; it is appended to [baseUrl]
 * @return the event with name, start date, URL, sources, type, end date, description,
 *         picture URL and location name filled from the detail page
 * @throws IllegalArgumentException if the date/time block cannot be located or parsed
 *         (raised by [parseStartAndEndDateTime])
 */
override fun parseStructuredEvent(event: String): StructuredEvent {
    val detailUrl = baseUrl + event
    val detailPage = Jsoup.parse(fetcher.fetchUrl(detailUrl))

    val title = detailPage.select("div.eventDetailWrapper h1").text()
    val dates = parseStartAndEndDateTime(detailPage)
    // The picture is optional: stays null when the page has no matching <img>.
    val image: URI? = detailPage.select("div.eventEntry img")
        .first()
        ?.attr("src")
        ?.let(URI::create)
    val description = detailPage.select("div.eventDetailDescr").text()
    val locationName = detailPage.select("div.eventDetailLocation").text()

    val builder = StructuredEvent.builder(title, dates.first)
    return builder
        .withProperty(SemanticKeys.URL_PROPERTY, URI.create(detailUrl))
        .withProperty(SemanticKeys.SOURCES_PROPERTY, listOf(detailUrl))
        // The type is fixed for now; see the TODO on the class about other categories.
        .withProperty(SemanticKeys.TYPE_PROPERTY, "theater")
        .withProperty(SemanticKeys.ENDDATE_PROPERTY, dates.second)
        .withProperty(SemanticKeys.DESCRIPTION_TEXT_PROPERTY, description)
        .withProperty(SemanticKeys.PICTURE_URL_PROPERTY, image)
        .withProperty(SemanticKeys.LOCATION_NAME_PROPERTY, locationName)
        .build()
}


/**
 * Extracts the start and (optional) end date-time of an event from its detail page.
 *
 * The text of the first `div.eventDetailWrapper` is searched for a date and a
 * time (range). Both start and end are placed on the same calendar date and
 * interpreted in the `Europe/Vienna` zone.
 *
 * @param element the parsed detail page (or any element containing the wrapper div)
 * @return the start date-time and the end date-time, the latter being `null` when the
 *         page lists only a start time
 * @throws IllegalArgumentException if the wrapper element, the date or the start time
 *         cannot be found
 */
private fun parseStartAndEndDateTime(element: Element): Pair<OffsetDateTime, OffsetDateTime?> {
    val fullDateString = element.select("div.eventDetailWrapper").first()?.text()
        ?: throw IllegalArgumentException("Could not find element containing start date")

    val zone = ZoneId.of("Europe/Vienna")
    val localDate = parseDate(fullDateString)
    val (startTime, endTime) = parseStartAndEndtime(fullDateString)

    val start = localDate.atTime(startTime).atZone(zone).toOffsetDateTime()
    // The end time is optional. LocalDate.atTime(null) throws a NullPointerException,
    // so only build the end date-time when an end time was actually found.
    val end = endTime?.let { localDate.atTime(it).atZone(zone).toOffsetDateTime() }

    return Pair(start, end)
}

/**
 * Finds the first `dd.MM.yy` date in the given text and parses it.
 *
 * @param fullDateString free text expected to contain a date such as `01.03.25`
 * @return the parsed date
 * @throws IllegalArgumentException if no `dd.MM.yy` token is present
 * @throws java.time.format.DateTimeParseException if the token is not a valid date
 */
private fun parseDate(fullDateString: String): LocalDate {
    val pattern = Regex("""\b(\d{2})\.(\d{2})\.(\d{2})\b""")
    val hit = requireNotNull(pattern.find(fullDateString)) {
        "Could not find date in $fullDateString"
    }
    // The matched text already has the exact "dd.MM.yy" shape, so it can be parsed directly.
    return LocalDate.parse(hit.value, DateTimeFormatter.ofPattern("dd.MM.uu"))
}

/**
 * Finds the first `HH:mm` time in the given text, optionally followed by `- HH:mm`.
 *
 * @param fullDateString free text expected to contain a time or a time range
 * @return the start time and the end time; the end time is `null` when the text
 *         contains only a single time
 * @throws IllegalArgumentException if no `HH:mm` token is present
 * @throws java.time.format.DateTimeParseException if a token is not a valid time of day
 */
private fun parseStartAndEndtime(fullDateString: String): Pair<LocalTime?, LocalTime?> {
    val pattern = Regex("""\b(\d{2}:\d{2})(?:\s*-\s*(\d{2}:\d{2}))?\b""")
    val hit = requireNotNull(pattern.find(fullDateString)) {
        "Could not find start (& endtime) in $fullDateString"
    }

    val hourMinute = DateTimeFormatter.ofPattern("HH:mm")
    val begin = LocalTime.parse(hit.groupValues[1], hourMinute)
    // An unmatched optional group is reported as an empty string.
    val rawEnd = hit.groupValues[2]
    val finish = if (rawEnd.isEmpty()) null else LocalTime.parse(rawEnd, hourMinute)

    return Pair(begin, finish)
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class LandestheaterLinzCollector :
fetcher.fetchUrlPost(
"https://www.landestheater-linz.at/DE/repos/evoscripts/lth/getEvents",
"application/x-www-form-urlencoded",
"cal=${now}&monthTo=${to}".encodeToByteArray()
"cal=${now}&monthTo=${to}"
)
)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ import java.time.LocalTime
import java.time.OffsetDateTime
import java.time.ZoneId
import java.time.format.DateTimeFormatter
import java.util.*
import java.util.Locale
import java.util.regex.Pattern

class PlanetTTCollector : TwoStepEventCollector<Element>("planettt") {
Expand All @@ -34,9 +34,7 @@ class PlanetTTCollector : TwoStepEventCollector<Element>("planettt") {
val response = fetcher.fetchUrlPost(
"https://planet.tt/wp-admin/admin-ajax.php",
"application/x-www-form-urlencoded; charset=UTF-8",
"action=pl_events_list&_ajax_nonce=${nonces.first}&start=0&length=200&search=&location=&eventid=-1".toByteArray(
Charsets.UTF_8
)
"action=pl_events_list&_ajax_nonce=${nonces.first}&start=0&length=200&search=&location=&eventid=-1"
)
val jsonResponse = Parser.default().parse(StringReader(response)) as JsonObject
val events = Jsoup.parse(jsonResponse.obj("data")!!.string("events")!!)
Expand Down Expand Up @@ -64,9 +62,7 @@ class PlanetTTCollector : TwoStepEventCollector<Element>("planettt") {
val response = fetcher.fetchUrlPost(
"https://planet.tt/wp-admin/admin-ajax.php",
"application/x-www-form-urlencoded; charset=UTF-8",
"action=pl_events_modal&_ajax_nonce=${modalNonce}&eventid=$eventId&postid=$postId".toByteArray(
Charsets.UTF_8
)
"action=pl_events_modal&_ajax_nonce=${modalNonce}&eventid=$eventId&postid=$postId"
)
val jsonResponse = Parser.default().parse(StringReader(response)) as JsonObject
val fullEvent = Jsoup.parse(jsonResponse.string("data")!!)
Expand Down