Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
MartinWeise committed Jan 3, 2025
0 parents commit 2e89732
Show file tree
Hide file tree
Showing 16 changed files with 5,318 additions and 0 deletions.
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# IDE
.idea/

# Environment
venv/
.env
12 changes: 12 additions & 0 deletions Pipfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
[[source]]
url = "https://pypi.org/simple"
verify_ssl = true
name = "pypi"

[packages]
requests = "*"
lxml = "*"
ics = "*"

[requires]
python_version = "3.11"
482 changes: 482 additions & 0 deletions Pipfile.lock

Large diffs are not rendered by default.

30 changes: 30 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Web-Kalender Zistersdorf

Kleines Hobby-Projekt, um die Zugänglichkeit (♿) des Web-Kalenders der
Gemeinde [Zistersdorf](https://www.zistersdorf.gv.at/system/web/kalender.aspx) zu erhöhen, indem die Webseite
automatisch aufgerufen und der Kalender im iCal-Format gespeichert wird.

## Download

* Ganzer Kalender als [.ics](zistersdorf.ics)

### Gefiltert nach Ort:

* Kalender *Stadt 1* als [.ics](zistersdorf_stadt_1.ics)
* Kalender *Stadt 2* als [.ics](zistersdorf_stadt_2.ics)
* Kalender *Ort 1* als [.ics](zistersdorf_ort_1.ics)
* Kalender *Ort 2* als [.ics](zistersdorf_ort_2.ics)

### Gefiltert nach Typ:

* Kalender *Stillgruppe* als [.ics](zistersdorf_stillgruppe.ics)
* Kalender *Mutterberatung* als [.ics](zistersdorf_mutterberatung.ics)

## Ausführen

Benötigt Python 3.10 oder neuer (getestet mit Python 3.11, siehe `Pipfile`):

```shell
pipenv install
pipenv run python3 ./crawler_trash.py
```
97 changes: 97 additions & 0 deletions crawler_trash.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,97 @@
#!/usr/bin/env python3
import codecs
import datetime
import logging
import re
import sys
from io import StringIO
from time import sleep

import requests
from ics import Calendar
from lxml import etree

from dto.dto import ParseResponse, Event

logging.basicConfig(level=logging.DEBUG)


def crawl_page(page: int = 0) -> str:
    """Fetch one page of the Zistersdorf web calendar.

    :param page: zero-based page index of the web calendar.
    :return: the raw HTML of the page, decoded as UTF-8.

    Terminates the process with exit code 4 when the server does not
    answer with HTTP 200.
    """
    url = f'https://www.zistersdorf.gv.at/system/web/kalender.aspx?page={page}'
    # Without a timeout a stalled server would block the crawler forever.
    res = requests.get(url=url, timeout=30)
    if res.status_code != 200:
        logging.error(f'Failed to crawl page {page}: status code is not 200 (OK)')
        sys.exit(4)
    logging.debug(f'successfully crawled page {page}')
    return res.content.decode("utf-8")


# parse page and return max number of pages
def parse_page(html: str, pages: int | None) -> ParseResponse:
tree = etree.parse(StringIO(html), parser=etree.HTMLParser())
size = tree.xpath('//a[@title="Letzte Seite"]')
response = ParseResponse()
if pages is None:
if not size:
logging.error(f'Failed to find last element!')
sys.exit(1)
match = re.search('page=([0-9]+)', size[0].get('href', ''))
if not match.group(1):
logging.error(f'Failed to find last page number from: {match}')
sys.exit(2)
response.pages = int(match.group(1))
logging.info(f'Found {response.pages} pages to crawl')
else:
response.pages = pages
response.events = []
dates: list[datetime] = [parse_date(date.text) for date in tree.xpath('//tr//td[position()=1]')]
titles: list[str] = [title.text for title in tree.xpath('//tr//td[position()=2]//a')]
calendar_types: list[str] = [calendar_type.text for calendar_type in tree.xpath('//tr//td[position()=3]//span')]
for i in range(len(dates)):
event = Event(name=f'{titles[i]} ({calendar_types[i]})', location=calendar_types[i], begin=dates[i],
created=datetime.datetime.now())
event.make_all_day()
response.events.append(event)
logging.debug(f'found {len(dates)} events')
return response


def parse_date(text: str) -> datetime.datetime:
    """Extract a ``DD.MM.YYYY`` date from *text* and return it as a datetime.

    :param text: arbitrary text containing a German-style date.
    :return: the parsed date at midnight (no time component in the source).

    Terminates the process with exit code 3 when no date is present.
    """
    # Raw string: '\.' in a plain literal is an invalid escape sequence
    # (SyntaxWarning on modern Python).
    match = re.search(r'([0-9]{2}\.[0-9]{2}\.[0-9]{4})', text)
    # BUG fix: check the match object itself; the old code called
    # match.group(1) first and raised AttributeError on non-matching input.
    if match is None:
        logging.error(f'Failed to find date from: {text}')
        sys.exit(3)
    return datetime.datetime.strptime(match.group(1), '%d.%m.%Y')


def _distribute_events(events, calendar_all, calendars) -> None:
    """Add each event to the full calendar and to every matching filtered one."""
    for event in events:
        calendar_all.events.add(event)
        for calendar in calendars:
            # the calendar's creator field carries the filter keyword,
            # which parse_page stored in the event's location
            if calendar.creator in event.location:
                calendar.events.add(event)


if __name__ == '__main__':
    calendar_all = Calendar()
    # one extra, filtered calendar per place/type keyword (see README)
    # NOTE: renamed from `filter`, which shadowed the builtin
    filter_names = ['Stadt 1', 'Stadt 2', 'Ort 1', 'Ort 2', 'Mutterberatung', 'Stillgruppe']
    calendars = [Calendar(creator=name) for name in filter_names]
    # page 0 also tells us how many pages there are in total
    res = parse_page(crawl_page(0), None)
    _distribute_events(res.events, calendar_all, calendars)
    # pages 1..n
    for page in range(1, res.pages + 1):
        logging.debug(f'sleep for 3s...')
        sleep(3)  # be polite to the municipal web server
        res = parse_page(crawl_page(page), res.pages)
        _distribute_events(res.events, calendar_all, calendars)
    with codecs.open('zistersdorf.ics', 'w', 'utf-8') as f:
        f.writelines(calendar_all.serialize_iter())
    for calendar in calendars:
        with codecs.open(f'zistersdorf_{calendar.creator.lower().replace(" ", "_")}.ics', 'w', 'utf-8') as f:
            f.writelines(calendar.serialize_iter())
Empty file added dto/__init__.py
Empty file.
Binary file added dto/__pycache__/__init__.cpython-311.pyc
Binary file not shown.
Binary file added dto/__pycache__/dto.cpython-311.pyc
Binary file not shown.
8 changes: 8 additions & 0 deletions dto/dto.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from typing import List

from ics import Event


class ParseResponse:
    """Result of parsing one page of the web calendar."""

    events: List[Event]  # the calendar events found on the page
    pages: int  # total number of pages reported by the pagination links

    def __init__(self) -> None:
        # Initialise with safe defaults so attribute access never raises
        # AttributeError before the parser has filled the fields in.
        self.events = []
        self.pages = 0
Loading

0 comments on commit 2e89732

Please sign in to comment.