Skip to content

Commit

Permalink
feat: All tenders will have a closing date. Improved logic for obtaining tenders based on the OCDS API and CSV. Created the first integration tests
Browse files Browse the repository at this point in the history
  • Loading branch information
nguaman committed Dec 14, 2024
1 parent 17d3857 commit 40ec62f
Show file tree
Hide file tree
Showing 34 changed files with 1,084 additions and 278 deletions.
1 change: 0 additions & 1 deletion .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ on:
push:
tags:
- "*.*.*"
workflow_dispatch:

jobs:
pypi-publish:
Expand Down
2 changes: 2 additions & 0 deletions examples/L1_published_IV_this_month.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,7 @@
"code": tender.code,
"region": tender.region,
"status": tender.status,
"closing_date": tender.closing_date,
"opening_date": tender.opening_date,
}
)
3 changes: 2 additions & 1 deletion examples/tender_by_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

licitpy = Licitpy()

tender = licitpy.tenders.from_code("750301-54-L124")
tender = licitpy.tenders.from_code("2446-900-L124")

pprint(
{
Expand All @@ -13,6 +13,7 @@
"title": tender.title,
"status": tender.status,
"opening_date": tender.opening_date,
"closing_date": tender.closing_date,
"region": tender.region,
}
)
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ prerelease_token = "alpha"

[tool.pytest.ini_options]
addopts = "--cov=src/licitpy --cov-config=.coveragerc --cov-report=term-missing --cov-fail-under=80"
markers = ["integration: Tests that interact with live external systems"]


pythonpath = ["src"]

[tool.mypy]
Expand Down
179 changes: 87 additions & 92 deletions src/licitpy/downloader/tender.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional
from typing import Dict, List, Optional, Set

import pandas
from pydantic import HttpUrl, ValidationError
Expand All @@ -9,17 +9,17 @@

from licitpy.downloader.base import BaseDownloader
from licitpy.parsers.tender import TenderParser
from licitpy.settings import settings
from licitpy.types.attachments import Attachment
from licitpy.types.download import MassiveDownloadSource
from licitpy.types.tender.open_contract import OpenContract
from licitpy.types.tender.status import StatusFromCSV
from licitpy.types.tender.status import Status
from licitpy.types.tender.tender import (
EnrichedTender,
Question,
QuestionAnswer,
TenderDataConsolidated,
TenderFromAPI,
TenderFromCSV,
TenderFromSource,
)


Expand Down Expand Up @@ -60,7 +60,7 @@ def get_tenders_codes_from_api(

# Extract tender codes from the first batch of data
tenders = [
TenderFromAPI(CodigoExterno=str(tender["urlTender"]).split("/")[-1])
TenderFromAPI(code=str(tender["urlTender"]).split("/")[-1])
for tender in records["data"]
]

Expand All @@ -83,7 +83,7 @@ def get_tenders_codes_from_api(

# Append tender codes from the current batch to the tenders list
tenders.extend(
TenderFromAPI(CodigoExterno=str(tender["urlTender"]).split("/")[-1])
TenderFromAPI(code=str(tender["urlTender"]).split("/")[-1])
for tender in records["data"]
)

Expand All @@ -97,10 +97,11 @@ def get_tenders_from_csv(
columns: List[str] = [
"CodigoExterno",
"FechaPublicacion",
"RegionUnidad",
# "RegionUnidad",
"Estado",
"Nombre",
"Descripcion",
# "Nombre",
# "Descripcion",
# "FechaCierre",
]

dates_columns = ["FechaPublicacion"]
Expand All @@ -113,18 +114,21 @@ def get_tenders_from_csv(
if any(df.groupby("CodigoExterno")["FechaPublicacion"].nunique() > 1):
raise ValueError("Inconsistent publication dates found")

# The FechaPublicacion comes in a date string format
df["FechaPublicacion"] = df["FechaPublicacion"].dt.date
# # The FechaPublicacion comes in a date string format
# df["FechaPublicacion"] = df["FechaPublicacion"].dt.date

# # The FechaCierre comes in a date string format
# df["FechaCierre"] = df["FechaCierre"].dt.date

# Strip leading and trailing whitespace from the 'RegionUnidad' column
df["RegionUnidad"] = df["RegionUnidad"].str.strip()
# df["RegionUnidad"] = df["RegionUnidad"].str.strip()

# Drop duplicate records based on the 'CodigoExterno' column, keeping the first occurrence
df = df.drop_duplicates(subset="CodigoExterno", keep="first")

# Sort the DataFrame by 'opening_date' in ascending order
# The date is in the following format YYYY-MM-DD (ISO 8601)
df = df.sort_values(by="FechaPublicacion", ascending=True)
# # Sort the DataFrame by 'opening_date' in ascending order
# # The date is in the following format YYYY-MM-DD (ISO 8601)
# df = df.sort_values(by="FechaPublicacion", ascending=True)

# Reset the index of the DataFrame after dropping duplicates
df.reset_index(drop=True, inplace=True)
Expand All @@ -135,12 +139,13 @@ def get_tenders_from_csv(

tenders = [
TenderFromCSV(
RegionUnidad=tender["RegionUnidad"],
FechaPublicacion=tender["FechaPublicacion"],
# RegionUnidad=tender["RegionUnidad"],
# FechaPublicacion=tender["FechaPublicacion"],
CodigoExterno=tender["CodigoExterno"],
Estado=tender["Estado"],
Nombre=tender["Nombre"],
Descripcion=tender["Descripcion"],
# Nombre=tender["Nombre"],
# Descripcion=tender["Descripcion"],
# FechaCierre=tender["FechaCierre"],
)
for tender in df.to_dict(orient="records")
]
Expand Down Expand Up @@ -176,101 +181,91 @@ def get_tender_ocds_data_from_api(self, code: str) -> OpenContract:
try:
return OpenContract(**data)
except ValidationError as e:
raise Exception(f"Error parsing OCDS data for tender {code}") from e
raise Exception(f"Error downloading OCDS data for tender {code}") from e

def enrich_tender_with_ocds(self, code: str) -> EnrichedTender:
def get_tender_ocds_data_from_codes(
self, tenders: List[TenderDataConsolidated]
) -> Dict[str, OpenContract]:

data = self.get_tender_ocds_data_from_api(code)
data_tenders: Dict[str, OpenContract] = {}

# 2024-11-16 10:27:00-03:00 <class 'datetime.datetime'>
opening_date = self.parser.get_tender_opening_date_from_tender_ocds_data(data)
with ThreadPoolExecutor(max_workers=32) as executor:

region = self.parser.get_tender_region_from_tender_ocds_data(data)
status = self.parser.get_tender_status_from_tender_ocds_data(data)
title = self.parser.get_tender_title_from_tender_ocds_data(data)
desc = self.parser.get_tender_description_from_tender_ocds_data(data)
future_to_tender = {
executor.submit(self.get_tender_ocds_data_from_api, tender.code): tender
for tender in tenders
}

return EnrichedTender(
title=title,
description=desc,
region=region,
status=status.name,
opening_date=opening_date.date(),
)
# for future in as_completed(future_to_tender):
for future in tqdm(
as_completed(future_to_tender),
total=len(tenders),
desc="Downloading OCDS data",
):

def get_tenders(self, year: int, month: int) -> List[TenderFromCSV]:
tender = future_to_tender[future]
data = future.result()

# From the API:
# [
# TenderFromAPI(CodigoExterno='2943-12-LQ24')
# ]
tenders_from_api: List[TenderFromAPI] = self.get_tenders_codes_from_api(
year, month
)
data_tenders[tender.code] = data

# From the CSV.
# [TenderFromCSV(
# CodigoExterno='3149-41-LP24',
# RegionUnidad=<Region.II: 'Región de Antofagasta'>,
# FechaPublicacion=datetime.date(2024, 11, 1),
# Estado=<StatusFromCSV.AWARDED: 'Adjudicada'>,
# Nombre='SERVICIO ....',
# Descripcion='El objetivo de esta contratación es para amenizar el ...')
# ]
return data_tenders

def get_consolidated_tender_data(
self, year: int, month: int
) -> List[TenderDataConsolidated]:
"""
Retrieves and consolidates tenders from both the API (OCDS) and CSV sources for a given year and month.
tenders_from_csv: List[TenderFromCSV] = self.get_tenders_from_csv(year, month)
This method fetches tender codes from the API and tender details from the CSV, then merges them into a single list
of consolidated tender data. The consolidation ensures that each tender is uniquely represented by its code.
# Filtering tenders that are internal QA tests from Mercado Publico.
# eg: 500977-191-LS24 : Nombre Unidad : MpOperacionesC
Args:
year (int): The year for which to retrieve tenders.
month (int): The month for which to retrieve tenders.
csv_tender_codes = {
tender.CodigoExterno
for tender in tenders_from_csv
if not tender.CodigoExterno.startswith("500977-")
}
Returns:
List[TenderDataConsolidated]: A list of consolidated tender data, including tender codes and statuses.
"""

api_tenders_missing_date_codes = [
tender
for tender in tenders_from_api
if tender.CodigoExterno not in csv_tender_codes
and not tender.CodigoExterno.startswith("500977-")
]
# Consolidate the tenders from the CSV and the API
tenders_consolidated: List[TenderDataConsolidated] = []

api_tenders_enriched: List[TenderFromCSV] = []
# Get only the tender codes from the API (OCDS)
tenders_codes_from_api = self.get_tenders_codes_from_api(year, month)

with ThreadPoolExecutor(max_workers=16) as executor:
# Get the tenders from the CSV
tenders_from_csv = self.get_tenders_from_csv(year, month)

futures = {
executor.submit(
self.enrich_tender_with_ocds, tender.CodigoExterno
): tender
for tender in api_tenders_missing_date_codes
}
existing_codes: Set[str] = set()

for future in tqdm(
as_completed(futures),
total=len(futures),
desc=f"Fetching publication dates {year}-{month:02}",
disable=settings.disable_progress_bar,
):
# Merge the tenders from the CSV and the API

# From the CSV, we retrieve the following fields:
# - The tender code
# - The tender status (Published, Awarded, etc.)

tender_code: str = futures[future].CodigoExterno
enrichment = future.result()
for csv_tender in tenders_from_csv:

tender = TenderFromCSV(
CodigoExterno=tender_code,
FechaPublicacion=enrichment.opening_date,
RegionUnidad=enrichment.region.value,
Estado=StatusFromCSV[enrichment.status.name],
Nombre=enrichment.title,
Descripcion=enrichment.description,
tenders_consolidated.append(
TenderDataConsolidated(
code=csv_tender.CodigoExterno,
status=Status(csv_tender.Estado.name),
)
)

existing_codes.add(csv_tender.CodigoExterno)

# From the API, we only retrieve the tender code because we download
# the indexes from the OCDS API.
for api_tender in tenders_codes_from_api:

api_tenders_enriched.append(tender)
if api_tender.code in existing_codes:
continue

tenders = tenders_from_csv + api_tenders_enriched
tenders_consolidated.append(TenderDataConsolidated(code=api_tender.code))
existing_codes.add(api_tender.code)

return sorted(tenders, key=lambda tender: tender.FechaPublicacion, reverse=True)
return tenders_consolidated

def get_tender_url_from_code(self, code: str) -> HttpUrl:
"""
Expand Down
12 changes: 7 additions & 5 deletions src/licitpy/entities/tender.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def __init__(
status: Optional[Status] = None,
title: Optional[str] = None,
description: Optional[str] = None,
opening_date: Optional[date] = None,
opening_date: Optional[datetime] = None,
closing_date: Optional[datetime] = None,
services: Optional[TenderServices] = None,
):

Expand All @@ -42,9 +43,10 @@ def __init__(
self._description: Optional[str] = description

self._ocds: Optional[OpenContract] = None
self._opening_date: Optional[date] = opening_date
self._opening_date: Optional[datetime] = opening_date
self._closing_date: Optional[datetime] = closing_date

self._tier: Optional[Tier] = None
self._closing_date: Optional[datetime] = None
self._attachment_url: Optional[HttpUrl] = None
self._attachments: Optional[List[Attachment]] = None
self._signed_base: Optional[Attachment] = None
Expand Down Expand Up @@ -73,7 +75,7 @@ def html(self) -> str:
return self._html

@property
def opening_date(self) -> date:
def opening_date(self) -> datetime:
if self._opening_date is None:
self._opening_date = self.services.get_opening_date(self.ocds)
return self._opening_date
Expand Down Expand Up @@ -171,7 +173,7 @@ def from_data(
status: Optional[Status] = None,
title: Optional[str] = None,
description: Optional[str] = None,
opening_date: Optional[date] = None,
opening_date: Optional[datetime] = None,
services: Optional[TenderServices] = None,
) -> Tender:
return cls(
Expand Down
6 changes: 3 additions & 3 deletions src/licitpy/entities/tenders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def in_region(self, region: Region) -> Tenders:
def to_pandas(self) -> pandas.DataFrame:
raise NotImplementedError

@classmethod
def from_tenders(cls, tenders: List[Tender]) -> Tenders:
return cls(tenders)
# @classmethod
# def from_tenders(cls, tenders: List[Tender]) -> Tenders:
# return cls(tenders)

@property
def codes(self) -> List[str]:
Expand Down
6 changes: 6 additions & 0 deletions src/licitpy/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ def get_attribute_by_element_id(
raise ElementNotFoundException(f"Element with ID '{element_id}' not found")

attribute_elements = html_element[0].xpath(f".//{attribute}")

if not attribute_elements:
raise ElementNotFoundException(
f"Element with ID '{element_id}' has no attribute '{attribute}'"
)

value: str = attribute_elements[0]

return value.strip()
Expand Down
Loading

0 comments on commit 40ec62f

Please sign in to comment.