Skip to content

Commit

Permalink
[release_table] Improve script (#305)
Browse files Browse the repository at this point in the history
- Add strict typing to the fields. This makes the script fail if some column does not have the expected type (for example because of a change in the HTML page).
- Support regex and templating for all fields (not only the releaseCycle). This makes it possible to extract only the necessary information without having to do some sort of 'magic' cleanup (replacements in dates have been reverted).
- Do not inject 'releaseCycle' anymore in the JSON (there is already the name).
  • Loading branch information
marcwrobel committed Feb 14, 2024
1 parent c6881fe commit a801200
Show file tree
Hide file tree
Showing 2 changed files with 80 additions and 28 deletions.
6 changes: 0 additions & 6 deletions src/common/dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,16 +43,10 @@ def parse_datetime(text: str, formats: list[str] = frozenset([
# so that we don't have to deal with some special cases in formats
text = (
text.strip()
.replace("th, ", " ") # November 10th, 2015 -> November 10, 2015
.replace("st, ", " ") # March 31st, 2015 -> March 31, 2015
.replace("Augu ", "August ") # 17 Augu 2023 -> 17 August 2023 - revert after st replacement
.replace("augu ", "August ") # 17 Augu 2023 -> 17 august 2023 - revert after st replacement
.replace("rd, ", " ") # March 3rd, 2015 -> March 3, 2015
.replace(", ", " ") # November 10, 2015 -> November 10 2015
.replace(". ", " ") # November 10. 2015 -> November 10 2015
.replace("(", "") # (November 10 2015) -> November 10 2015)
.replace(")", "") # (November 10 2015) -> (November 10 2015
.replace("*", "") # November 10 2015* -> November 10 2015
)
for fmt in formats:
try:
Expand Down
102 changes: 80 additions & 22 deletions src/release_table.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,10 @@
import re
import sys
from datetime import datetime

from bs4 import BeautifulSoup
from bs4 import BeautifulSoup, PageElement
from common import dates, endoflife, http, releasedata
from liquid import Template

"""Fetch release-level data from an HTML table in a web page.
Expand All @@ -27,6 +30,72 @@

METHOD = "release_table"


class Field:
    """Describe how to extract one release field from a row of an HTML table.

    A field is bound to a table column (looked up by its lower-cased header),
    a type ("date" or "string"), a list of regexes the cell text must match,
    an optional list of regexes that make the cell be ignored, and a Liquid
    template rendered with the named groups of the matching regex.
    """

    SUPPORTED_TYPES = ["date", "string"]
    # Fields whose type is forced to "date", whatever the definition says.
    DATE_FIELDS = ["releaseDate", "support", "eol", "extendedSupport"]
    DEFAULT_REGEX = r"^(?P<value>.+)$"
    DEFAULT_TEMPLATE = "{{value}}"
    DEFAULT_RELEASE_REGEX = r"^v?(?P<value>\d+(\.\d+)?)$"

    def __init__(self, name: str, definition: str | dict, columns: list[str]) -> None:
        """Build a field from its configuration.

        Args:
            name: name of the field (e.g. "releaseCycle", "eol").
            definition: either the column name, or a dict with a mandatory
                "column" key and optional "type", "regex", "regex_exclude"
                and "template" keys.
            columns: lower-cased table headers, in column order.

        Raises:
            KeyError: if a dict definition has no "column" key.
            ValueError: if the column is not one of ``columns`` or if the
                requested type is not supported.
        """
        # Work on a private copy so that the defaults applied below never leak
        # into the dict handed over by the caller (typically configuration data).
        definition = {"column": definition} if isinstance(definition, str) else dict(definition)

        self.name = name
        if self.name == "releaseCycle":
            # The release cycle is always a string and has stricter defaults.
            definition["type"] = "string"
            definition.setdefault("regex", [self.DEFAULT_RELEASE_REGEX])
            definition.setdefault("template", self.DEFAULT_TEMPLATE)

        # str() keeps non-string column names (e.g. a purely numeric header) usable.
        self.column = str(definition["column"]).lower()
        try:
            self.column_index = columns.index(self.column)
        except ValueError:
            msg = f"column {self.column} not found in {columns}"
            raise ValueError(msg) from None

        self.type = definition.get("type", "string")
        if self.name in self.DATE_FIELDS:
            self.type = "date"  # override type for known date fields
        elif self.type not in self.SUPPORTED_TYPES:
            msg = f"unsupported type: {self.type} for field {self.name}"
            raise ValueError(msg)

        # Both regex settings accept a single pattern or a list of patterns.
        regex = definition.get("regex", [self.DEFAULT_REGEX])
        regex = regex if isinstance(regex, list) else [regex]
        self.include_version_patterns = [re.compile(r, re.MULTILINE) for r in regex]

        exclude_regex = definition.get("regex_exclude", [])
        exclude_regex = exclude_regex if isinstance(exclude_regex, list) else [exclude_regex]
        self.exclude_version_patterns = [re.compile(r, re.MULTILINE) for r in exclude_regex]

        # No template is built only when no template was configured and the
        # regex list is empty.
        if "template" in definition or regex:
            self.template = Template(definition.get("template", self.DEFAULT_TEMPLATE))
        else:
            self.template = None

    def extract_from(self, cells: list[PageElement]) -> str | datetime | None:
        """Extract this field's value from the cells of a table row.

        Args:
            cells: cells of the row; the one at ``self.column_index`` is read.

        Returns:
            ``None`` if the cell text matches an exclude pattern, or if no
            include pattern matches and the field is "releaseCycle".
            Otherwise the rendered template (or the raw text when there is no
            template), passed through ``dates.parse_date`` for "date" fields.

        Raises:
            ValueError: if no include pattern matches and the field is not
                "releaseCycle".
        """
        raw_value = cells[self.column_index].get_text(strip=True)

        # Exclusions take precedence over inclusions.
        if any(pattern.match(raw_value) for pattern in self.exclude_version_patterns):
            return None

        # The first matching include pattern wins.
        for include_pattern in self.include_version_patterns:
            match = include_pattern.match(raw_value)
            if not match:
                continue

            str_value = self.template.render(**match.groupdict()) if self.template else raw_value
            if self.type == "date":
                return dates.parse_date(str_value)
            return str_value

        if self.name == "releaseCycle":
            return None  # skipping entire rows is allowed

        # Strict mode: an unexpected cell content must make the script fail.
        msg = f"{raw_value} is not matching any regex in {self.include_version_patterns}"
        raise ValueError(msg)


p_filter = sys.argv[1] if len(sys.argv) > 1 else None
m_filter = sys.argv[2] if len(sys.argv) > 2 else None
for config in endoflife.list_configs(p_filter, METHOD, m_filter):
Expand All @@ -39,31 +108,20 @@
message = f"No table found for {config.product} with selector {config.data['selector']}"
raise ValueError(message)

index_by_target = {}
headers = [th.get_text().strip().lower() for th in table.select(config.data["headers_selector"])]
for target, column in config.data["mapping"].items():
index_by_target[target] = headers.index(str(column).lower())
release_cycle_field = Field("releaseCycle", config.data["fields"].pop("releaseCycle"), headers)
fields = [Field(name, definition, headers) for name, definition in config.data["fields"].items()]
min_column_count = max([f.column_index for f in fields] + [release_cycle_field.column_index]) + 1

min_column_count = max(index_by_target.values()) + 1
release_cycle_index = index_by_target.pop("releaseCycle")
for row in table.select(config.data["rows_selector"]):
cells = row.findAll("td")
if len(cells) < min_column_count:
row_cells = row.findAll("td")
if len(row_cells) < min_column_count:
continue

release_cycle = cells[release_cycle_index].get_text().strip()
release_cycle_match = config.first_match(release_cycle)
if not release_cycle_match:
release_cycle = release_cycle_field.extract_from(row_cells)
if not release_cycle:
continue

release = product_data.get_release(config.render(release_cycle_match))
release.set_field("releaseCycle", release.name())
for target, index in index_by_target.items():
value_str = cells[index].get_text().strip()

try:
value = dates.parse_date(value_str)
except ValueError:
value = value_str

release.set_field(target, value)
release = product_data.get_release(release_cycle)
for field in fields:
release.set_field(field.name, field.extract_from(row_cells))

0 comments on commit a801200

Please sign in to comment.