From 287ccc1a083a531bf846b535f5117994e38a0390 Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 20:55:08 +0100 Subject: [PATCH 1/8] Added github workflow for automatic parsing and updated code accordingly --- .github/workflows/parse_fyrliste.yaml | 71 +++++++++++++++++++++++++++ parse_fyrlys/parse.py | 15 ++++-- parse_fyrlys/requirements.txt | 2 + 3 files changed, 85 insertions(+), 3 deletions(-) create mode 100644 .github/workflows/parse_fyrliste.yaml create mode 100644 parse_fyrlys/requirements.txt diff --git a/.github/workflows/parse_fyrliste.yaml b/.github/workflows/parse_fyrliste.yaml new file mode 100644 index 0000000..da5dcf0 --- /dev/null +++ b/.github/workflows/parse_fyrliste.yaml @@ -0,0 +1,71 @@ +name: Run Python Script and Create PR + +on: + push: + branches: + - main + schedule: + - cron: "0 0 * * *" # Runs daily at midnight UTC + workflow_dispatch: # Allows manual triggering + +jobs: + generate-and-pr: + runs-on: ubuntu-latest + + steps: + # Step 1: Check out the repository + - name: Checkout Repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 # Needed for creating branches + + # Step 2: Set up Python + - name: Set up Python + uses: actions/setup-python@v4 + with: + python-version: "3.11" # Specify your Python version + + # Step 3: Install dependencies (if any) + - name: Install Dependencies + run: | + python -m pip install --upgrade pip + pip install -r parse_fyrlys/requirements.txt # If you have dependencies + + # Step 4: Run the Python script + - name: Run Script + run: | + python parse_fyrlys/parse.py # Update with your script path + + # Step 5: Configure Git + - name: Configure Git + run: | + git config user.name "github-actions[bot]" + git config user.email "github-actions[bot]@users.noreply.github.com" + + # Step 6: Check for changes and commit + - name: Commit Changes + id: commit_changes + run: | + git add lighthouses.qml parse_fyrlys/lighthouses.json + if git diff --cached --quiet; then + echo "No changes to commit." + echo "::set-output name=changes::false" + else + git commit -m "Update generated files [skip ci]" + echo "::set-output name=changes::true" + fi + + # Step 7: Create Pull Request if there are changes + - name: Create Pull Request + if: steps.commit_changes.outputs.changes == 'true' + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ secrets.GITHUB_TOKEN }} + commit-message: Update generated files + branch: update-generated-files-${{ github.run_number }} + title: "Update Generated Files" + body: | + This PR updates the generated files based on the latest run. + labels: automated-pr + # You can specify the base branch if different from the default + base: main diff --git a/parse_fyrlys/parse.py b/parse_fyrlys/parse.py index 8f24568..7c0e79d 100644 --- a/parse_fyrlys/parse.py +++ b/parse_fyrlys/parse.py @@ -1,5 +1,6 @@ import json import time +import os from tqdm import tqdm import pdfplumber from parse_utils import color_map, dump_qml,extract_character, merge_text_elements, extract_character, find_text, find_element_containing_point, find_text_element_containing_point, extract_text_elements, perform_text_extraction, SCALING_FACTOR @@ -153,7 +154,15 @@ def should_keep_lighthouse(lighthouse): } return lighthouses_on_page.values() -pdf_path = "Fyrliste_HeleLandet.pdf" + +pdf_path = "parse_fyrlys/Fyrliste_HeleLandet.pdf" +if not os.path.exists(pdf_path): + print("Downloading Fyrliste_HeleLandet.pdf from https://nfs.kystverket.no/fyrlister/Fyrliste_HeleLandet.pdf") + # Download from https://nfs.kystverket.no/fyrlister/Fyrliste_HeleLandet.pdf + import requests + response = requests.get("https://nfs.kystverket.no/fyrlister/Fyrliste_HeleLandet.pdf") + with open(pdf_path, "wb") as f: + f.write(response.content) total_number_of_lighthouses = 0 lighthouses = [] @@ -171,10 +180,10 @@ def should_keep_lighthouse(lighthouse): lighthouses_on_page = parse_lighthouses(text_elements) lighthouses.extend(lighthouses_on_page) lighthouses_as_dicts = [asdict(lighthouse) for lighthouse in lighthouses] -with open("lighthouses.json", "w") as f: +with open("parse_fyrlys/lighthouses.json", "w") as f: json.dump(lighthouses_as_dicts, f, indent=2, ensure_ascii=False) qml_string = dump_qml(lighthouses_as_dicts) -with open("../lighthouses.qml", "w") as f: +with open("lighthouses.qml", "w") as f: f.write(qml_string) print("total_number_of_lighthouses: ", total_number_of_lighthouses) print("total_real_number_of_lighthouses: ", len(lighthouses)) diff --git a/parse_fyrlys/requirements.txt b/parse_fyrlys/requirements.txt new file mode 100644 index 0000000..72789a5 --- /dev/null +++ b/parse_fyrlys/requirements.txt @@ -0,0 +1,2 @@ +pdfplumber==0.11.4 +tqdm==4.67.1 \ No newline at end of file From 78821df4139101181a9918637bd13e7508159569 Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 20:58:37 +0100 Subject: [PATCH 2/8] Run on PRs --- .github/workflows/parse_fyrliste.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/parse_fyrliste.yaml b/.github/workflows/parse_fyrliste.yaml index da5dcf0..aa454fa 100644 --- a/.github/workflows/parse_fyrliste.yaml +++ b/.github/workflows/parse_fyrliste.yaml @@ -4,6 +4,8 @@ on: push: branches: - main + pull_request: + types: [opened, synchronize, reopened] schedule: - cron: "0 0 * * *" # Runs daily at midnight UTC workflow_dispatch: # Allows manual triggering From 2ebf438a54e26fd94a5fdd5a8442c689bfec6498 Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 20:59:27 +0100 Subject: [PATCH 3/8] Added requests to requirements --- parse_fyrlys/requirements.txt | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/parse_fyrlys/requirements.txt b/parse_fyrlys/requirements.txt index 72789a5..0e1b14f 100644 --- a/parse_fyrlys/requirements.txt +++ b/parse_fyrlys/requirements.txt @@ -1,2 +1,3 @@ pdfplumber==0.11.4 -tqdm==4.67.1 \ No newline at end of file +tqdm==4.67.1 +requests==2.32.3 \ No newline at end of file From 189c0f1b0cd099fd62acb0b9df054f6fa3290dbe Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 21:01:30 +0100 Subject: [PATCH 4/8] WIP: only run a few pages --- parse_fyrlys/parse.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/parse_fyrlys/parse.py b/parse_fyrlys/parse.py index 7c0e79d..1e4a4c9 100644 --- a/parse_fyrlys/parse.py +++ b/parse_fyrlys/parse.py @@ -179,6 +179,8 @@ def should_keep_lighthouse(lighthouse): text_elements = perform_text_extraction(pdf_page) lighthouses_on_page = parse_lighthouses(text_elements) lighthouses.extend(lighthouses_on_page) + if i > 100: + break lighthouses_as_dicts = [asdict(lighthouse) for lighthouse in lighthouses] with open("parse_fyrlys/lighthouses.json", "w") as f: json.dump(lighthouses_as_dicts, f, indent=2, ensure_ascii=False) From bdef52574e7a008aa0a4af33b81bf7800b174b4b Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 21:16:21 +0100 Subject: [PATCH 5/8] WIP: ref PR branch --- .github/workflows/parse_fyrliste.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/parse_fyrliste.yaml b/.github/workflows/parse_fyrliste.yaml index aa454fa..974fb4d 100644 --- a/.github/workflows/parse_fyrliste.yaml +++ b/.github/workflows/parse_fyrliste.yaml @@ -20,6 +20,7 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 # Needed for creating branches + ref: anders/github_workflow # Step 2: Set up Python - name: Set up Python From 1ee40a013bed5e0197d99d87e63479a4da5be3c6 Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 21:18:52 +0100 Subject: [PATCH 6/8] Only run manual or cron --- .github/workflows/parse_fyrliste.yaml | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/.github/workflows/parse_fyrliste.yaml b/.github/workflows/parse_fyrliste.yaml index 974fb4d..847f35c 100644 --- a/.github/workflows/parse_fyrliste.yaml +++ b/.github/workflows/parse_fyrliste.yaml @@ -1,11 +1,6 @@ name: Run Python Script and Create PR on: - push: - branches: - - main - pull_request: - types: [opened, synchronize, reopened] schedule: - cron: "0 0 * * *" # Runs daily at midnight UTC workflow_dispatch: # Allows manual triggering @@ -20,7 +15,7 @@ jobs: uses: actions/checkout@v3 with: fetch-depth: 0 # Needed for creating branches - ref: anders/github_workflow + ref: main # Step 2: Set up Python - name: Set up Python From 4c7638f8e243ea5c30121befee171f52305b3d6d Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 21:19:21 +0100 Subject: [PATCH 7/8] Run every week --- .github/workflows/parse_fyrliste.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/parse_fyrliste.yaml b/.github/workflows/parse_fyrliste.yaml index 847f35c..e040eeb 100644 --- a/.github/workflows/parse_fyrliste.yaml +++ b/.github/workflows/parse_fyrliste.yaml @@ -2,7 +2,7 @@ name: Run Python Script and Create PR on: schedule: - - cron: "0 0 * * *" # Runs daily at midnight UTC + - cron: "0 0 * * 0" # Runs weekly at midnight UTC on Sunday workflow_dispatch: # Allows manual triggering jobs: From 7e95fcb6f3afb921bc02f6e40881f4a74f933041 Mon Sep 17 00:00:00 2001 From: Anders Hafreager Date: Wed, 8 Jan 2025 21:19:41 +0100 Subject: [PATCH 8/8] Run all pages again --- parse_fyrlys/parse.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/parse_fyrlys/parse.py b/parse_fyrlys/parse.py index 1e4a4c9..328b0bf 100644 --- a/parse_fyrlys/parse.py +++ b/parse_fyrlys/parse.py @@ -179,8 +179,7 @@ def should_keep_lighthouse(lighthouse): text_elements = perform_text_extraction(pdf_page) lighthouses_on_page = parse_lighthouses(text_elements) lighthouses.extend(lighthouses_on_page) - if i > 100: - break + lighthouses_as_dicts = [asdict(lighthouse) for lighthouse in lighthouses] with open("parse_fyrlys/lighthouses.json", "w") as f: json.dump(lighthouses_as_dicts, f, indent=2, ensure_ascii=False)