Skip to content

Commit

Permalink
Merge pull request #35 from nguaman/feat/34-tenders-without-explicit-…
Browse files Browse the repository at this point in the history
…closing-dates

feat: All tenders will have a closing date. Improved logic for obtain…
  • Loading branch information
nguaman authored Dec 14, 2024
2 parents 17d3857 + 40ec62f commit 13997ce
Show file tree
Hide file tree
Showing 34 changed files with 1,084 additions and 278 deletions.
1 change: 0 additions & 1 deletion .github/workflows/publish.yml
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,6 @@ on:
push:
tags:
- "*.*.*"
workflow_dispatch:

jobs:
pypi-publish:
Expand Down
2 changes: 2 additions & 0 deletions examples/L1_published_IV_this_month.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,5 +22,7 @@
"code": tender.code,
"region": tender.region,
"status": tender.status,
"closing_date": tender.closing_date,
"opening_date": tender.opening_date,
}
)
3 changes: 2 additions & 1 deletion examples/tender_by_code.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

licitpy = Licitpy()

tender = licitpy.tenders.from_code("750301-54-L124")
tender = licitpy.tenders.from_code("2446-900-L124")

pprint(
{
Expand All @@ -13,6 +13,7 @@
"title": tender.title,
"status": tender.status,
"opening_date": tender.opening_date,
"closing_date": tender.closing_date,
"region": tender.region,
}
)
Expand Down
3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,9 @@ prerelease_token = "alpha"

[tool.pytest.ini_options]
addopts = "--cov=src/licitpy --cov-config=.coveragerc --cov-report=term-missing --cov-fail-under=80"
markers = ["integration: Tests that interact with live external systems"]


pythonpath = ["src"]

[tool.mypy]
Expand Down
179 changes: 87 additions & 92 deletions src/licitpy/downloader/tender.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from concurrent.futures import ThreadPoolExecutor, as_completed
from typing import List, Optional
from typing import Dict, List, Optional, Set

import pandas
from pydantic import HttpUrl, ValidationError
Expand All @@ -9,17 +9,17 @@

from licitpy.downloader.base import BaseDownloader
from licitpy.parsers.tender import TenderParser
from licitpy.settings import settings
from licitpy.types.attachments import Attachment
from licitpy.types.download import MassiveDownloadSource
from licitpy.types.tender.open_contract import OpenContract
from licitpy.types.tender.status import StatusFromCSV
from licitpy.types.tender.status import Status
from licitpy.types.tender.tender import (
EnrichedTender,
Question,
QuestionAnswer,
TenderDataConsolidated,
TenderFromAPI,
TenderFromCSV,
TenderFromSource,
)


Expand Down Expand Up @@ -60,7 +60,7 @@ def get_tenders_codes_from_api(

# Extract tender codes from the first batch of data
tenders = [
TenderFromAPI(CodigoExterno=str(tender["urlTender"]).split("/")[-1])
TenderFromAPI(code=str(tender["urlTender"]).split("/")[-1])
for tender in records["data"]
]

Expand All @@ -83,7 +83,7 @@ def get_tenders_codes_from_api(

# Append tender codes from the current batch to the tenders list
tenders.extend(
TenderFromAPI(CodigoExterno=str(tender["urlTender"]).split("/")[-1])
TenderFromAPI(code=str(tender["urlTender"]).split("/")[-1])
for tender in records["data"]
)

Expand All @@ -97,10 +97,11 @@ def get_tenders_from_csv(
columns: List[str] = [
"CodigoExterno",
"FechaPublicacion",
"RegionUnidad",
# "RegionUnidad",
"Estado",
"Nombre",
"Descripcion",
# "Nombre",
# "Descripcion",
# "FechaCierre",
]

dates_columns = ["FechaPublicacion"]
Expand All @@ -113,18 +114,21 @@ def get_tenders_from_csv(
if any(df.groupby("CodigoExterno")["FechaPublicacion"].nunique() > 1):
raise ValueError("Inconsistent publication dates found")

# The FechaPublicacion comes in a date string format
df["FechaPublicacion"] = df["FechaPublicacion"].dt.date
# # The FechaPublicacion comes in a date string format
# df["FechaPublicacion"] = df["FechaPublicacion"].dt.date

# # The FechaCierre comes in a date string format
# df["FechaCierre"] = df["FechaCierre"].dt.date

# Strip leading and trailing whitespace from the 'RegionUnidad' column
df["RegionUnidad"] = df["RegionUnidad"].str.strip()
# df["RegionUnidad"] = df["RegionUnidad"].str.strip()

# Drop duplicate records based on the 'CodigoExterno' column, keeping the first occurrence
df = df.drop_duplicates(subset="CodigoExterno", keep="first")

# Sort the DataFrame by 'opening_date' in ascending order
# The date is in the following format YYYY-MM-DD (ISO 8601)
df = df.sort_values(by="FechaPublicacion", ascending=True)
# # Sort the DataFrame by 'opening_date' in ascending order
# # The date is in the following format YYYY-MM-DD (ISO 8601)
# df = df.sort_values(by="FechaPublicacion", ascending=True)

# Reset the index of the DataFrame after dropping duplicates
df.reset_index(drop=True, inplace=True)
Expand All @@ -135,12 +139,13 @@ def get_tenders_from_csv(

tenders = [
TenderFromCSV(
RegionUnidad=tender["RegionUnidad"],
FechaPublicacion=tender["FechaPublicacion"],
# RegionUnidad=tender["RegionUnidad"],
# FechaPublicacion=tender["FechaPublicacion"],
CodigoExterno=tender["CodigoExterno"],
Estado=tender["Estado"],
Nombre=tender["Nombre"],
Descripcion=tender["Descripcion"],
# Nombre=tender["Nombre"],
# Descripcion=tender["Descripcion"],
# FechaCierre=tender["FechaCierre"],
)
for tender in df.to_dict(orient="records")
]
Expand Down Expand Up @@ -176,101 +181,91 @@ def get_tender_ocds_data_from_api(self, code: str) -> OpenContract:
try:
return OpenContract(**data)
except ValidationError as e:
raise Exception(f"Error parsing OCDS data for tender {code}") from e
raise Exception(f"Error downloading OCDS data for tender {code}") from e

def enrich_tender_with_ocds(self, code: str) -> EnrichedTender:
def get_tender_ocds_data_from_codes(
self, tenders: List[TenderDataConsolidated]
) -> Dict[str, OpenContract]:

data = self.get_tender_ocds_data_from_api(code)
data_tenders: Dict[str, OpenContract] = {}

# 2024-11-16 10:27:00-03:00 <class 'datetime.datetime'>
opening_date = self.parser.get_tender_opening_date_from_tender_ocds_data(data)
with ThreadPoolExecutor(max_workers=32) as executor:

region = self.parser.get_tender_region_from_tender_ocds_data(data)
status = self.parser.get_tender_status_from_tender_ocds_data(data)
title = self.parser.get_tender_title_from_tender_ocds_data(data)
desc = self.parser.get_tender_description_from_tender_ocds_data(data)
future_to_tender = {
executor.submit(self.get_tender_ocds_data_from_api, tender.code): tender
for tender in tenders
}

return EnrichedTender(
title=title,
description=desc,
region=region,
status=status.name,
opening_date=opening_date.date(),
)
# for future in as_completed(future_to_tender):
for future in tqdm(
as_completed(future_to_tender),
total=len(tenders),
desc="Downloading OCDS data",
):

def get_tenders(self, year: int, month: int) -> List[TenderFromCSV]:
tender = future_to_tender[future]
data = future.result()

# From the API:
# [
# TenderFromAPI(CodigoExterno='2943-12-LQ24')
# ]
tenders_from_api: List[TenderFromAPI] = self.get_tenders_codes_from_api(
year, month
)
data_tenders[tender.code] = data

# From the CSV.
# [TenderFromCSV(
# CodigoExterno='3149-41-LP24',
# RegionUnidad=<Region.II: 'Región de Antofagasta'>,
# FechaPublicacion=datetime.date(2024, 11, 1),
# Estado=<StatusFromCSV.AWARDED: 'Adjudicada'>,
# Nombre='SERVICIO ....',
# Descripcion='El objetivo de esta contratación es para amenizar el ...')
# ]
return data_tenders

def get_consolidated_tender_data(
self, year: int, month: int
) -> List[TenderDataConsolidated]:
"""
Retrieves and consolidates tenders from both the API (OCDS) and CSV sources for a given year and month.
tenders_from_csv: List[TenderFromCSV] = self.get_tenders_from_csv(year, month)
This method fetches tender codes from the API and tender details from the CSV, then merges them into a single list
of consolidated tender data. The consolidation ensures that each tender is uniquely represented by its code.
# Filtering tenders that are internal QA tests from Mercado Publico.
# eg: 500977-191-LS24 : Nombre Unidad : MpOperacionesC
Args:
year (int): The year for which to retrieve tenders.
month (int): The month for which to retrieve tenders.
csv_tender_codes = {
tender.CodigoExterno
for tender in tenders_from_csv
if not tender.CodigoExterno.startswith("500977-")
}
Returns:
List[TenderDataConsolidated]: A list of consolidated tender data, including tender codes and statuses.
"""

api_tenders_missing_date_codes = [
tender
for tender in tenders_from_api
if tender.CodigoExterno not in csv_tender_codes
and not tender.CodigoExterno.startswith("500977-")
]
# Consolidate the tenders from the CSV and the API
tenders_consolidated: List[TenderDataConsolidated] = []

api_tenders_enriched: List[TenderFromCSV] = []
# Get only the tender codes from the API (OCDS)
tenders_codes_from_api = self.get_tenders_codes_from_api(year, month)

with ThreadPoolExecutor(max_workers=16) as executor:
# Get the tenders from the CSV
tenders_from_csv = self.get_tenders_from_csv(year, month)

futures = {
executor.submit(
self.enrich_tender_with_ocds, tender.CodigoExterno
): tender
for tender in api_tenders_missing_date_codes
}
existing_codes: Set[str] = set()

for future in tqdm(
as_completed(futures),
total=len(futures),
desc=f"Fetching publication dates {year}-{month:02}",
disable=settings.disable_progress_bar,
):
# Merge the tenders from the CSV and the API

# From the CSV, we retrieve the following fields:
# - The tender code
# - The tender status (Published, Awarded, etc.)

tender_code: str = futures[future].CodigoExterno
enrichment = future.result()
for csv_tender in tenders_from_csv:

tender = TenderFromCSV(
CodigoExterno=tender_code,
FechaPublicacion=enrichment.opening_date,
RegionUnidad=enrichment.region.value,
Estado=StatusFromCSV[enrichment.status.name],
Nombre=enrichment.title,
Descripcion=enrichment.description,
tenders_consolidated.append(
TenderDataConsolidated(
code=csv_tender.CodigoExterno,
status=Status(csv_tender.Estado.name),
)
)

existing_codes.add(csv_tender.CodigoExterno)

# From the API, we only retrieve the tender code because we download
# the indexes from the OCDS API.
for api_tender in tenders_codes_from_api:

api_tenders_enriched.append(tender)
if api_tender.code in existing_codes:
continue

tenders = tenders_from_csv + api_tenders_enriched
tenders_consolidated.append(TenderDataConsolidated(code=api_tender.code))
existing_codes.add(api_tender.code)

return sorted(tenders, key=lambda tender: tender.FechaPublicacion, reverse=True)
return tenders_consolidated

def get_tender_url_from_code(self, code: str) -> HttpUrl:
"""
Expand Down
12 changes: 7 additions & 5 deletions src/licitpy/entities/tender.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,8 @@ def __init__(
status: Optional[Status] = None,
title: Optional[str] = None,
description: Optional[str] = None,
opening_date: Optional[date] = None,
opening_date: Optional[datetime] = None,
closing_date: Optional[datetime] = None,
services: Optional[TenderServices] = None,
):

Expand All @@ -42,9 +43,10 @@ def __init__(
self._description: Optional[str] = description

self._ocds: Optional[OpenContract] = None
self._opening_date: Optional[date] = opening_date
self._opening_date: Optional[datetime] = opening_date
self._closing_date: Optional[datetime] = closing_date

self._tier: Optional[Tier] = None
self._closing_date: Optional[datetime] = None
self._attachment_url: Optional[HttpUrl] = None
self._attachments: Optional[List[Attachment]] = None
self._signed_base: Optional[Attachment] = None
Expand Down Expand Up @@ -73,7 +75,7 @@ def html(self) -> str:
return self._html

@property
def opening_date(self) -> date:
def opening_date(self) -> datetime:
if self._opening_date is None:
self._opening_date = self.services.get_opening_date(self.ocds)
return self._opening_date
Expand Down Expand Up @@ -171,7 +173,7 @@ def from_data(
status: Optional[Status] = None,
title: Optional[str] = None,
description: Optional[str] = None,
opening_date: Optional[date] = None,
opening_date: Optional[datetime] = None,
services: Optional[TenderServices] = None,
) -> Tender:
return cls(
Expand Down
6 changes: 3 additions & 3 deletions src/licitpy/entities/tenders.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,9 +30,9 @@ def in_region(self, region: Region) -> Tenders:
def to_pandas(self) -> pandas.DataFrame:
raise NotImplementedError

@classmethod
def from_tenders(cls, tenders: List[Tender]) -> Tenders:
return cls(tenders)
# @classmethod
# def from_tenders(cls, tenders: List[Tender]) -> Tenders:
# return cls(tenders)

@property
def codes(self) -> List[str]:
Expand Down
6 changes: 6 additions & 0 deletions src/licitpy/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,12 @@ def get_attribute_by_element_id(
raise ElementNotFoundException(f"Element with ID '{element_id}' not found")

attribute_elements = html_element[0].xpath(f".//{attribute}")

if not attribute_elements:
raise ElementNotFoundException(
f"Element with ID '{element_id}' has no attribute '{attribute}'"
)

value: str = attribute_elements[0]

return value.strip()
Expand Down
Loading

0 comments on commit 13997ce

Please sign in to comment.