fix: Fixed handling of metadata only records
monotasker committed Nov 12, 2024
1 parent 4abf0b7 commit d8c3728
Showing 3 changed files with 95 additions and 72 deletions.
2 changes: 2 additions & 0 deletions invenio_record_importer_kcworks/record_loader.py
@@ -1003,7 +1003,9 @@ def import_record_to_invenio(
app.logger.warning(
f"files in metadata: {pformat(metadata_record.get('files'))}"
)
FilesHelper(is_draft=is_draft).set_to_metadata_only(draft_id)
metadata_record["files"]["enabled"] = False
existing_record["files"]["enabled"] = False

# Attach the record to the communities
result[
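For context, a minimal usage sketch of the call pattern introduced in the hunk above (not part of the commit): the wrapper function name and its arguments are invented for illustration, while the three calls inside it come directly from the changed lines.

# Hypothetical helper wrapping the calls shown in the record_loader.py hunk.
# The function name and arguments are invented for illustration only.
def make_record_metadata_only(draft_id, is_draft, metadata_record, existing_record):
    # Remove any stored files and mark the saved record as metadata-only.
    FilesHelper(is_draft=is_draft).set_to_metadata_only(draft_id)
    # Keep the in-memory copies consistent with the stored record.
    metadata_record["files"]["enabled"] = False
    existing_record["files"]["enabled"] = False
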
19 changes: 18 additions & 1 deletion invenio_record_importer_kcworks/services/files.py
@@ -1,7 +1,7 @@
from flask import current_app as app
from invenio_access.permissions import system_identity
from invenio_db import db
from invenio_files_rest.errors import InvalidKeyError, BucketLockedError
from invenio_pidstore.errors import PIDUnregistered
from invenio_rdm_records.proxies import (
    current_rdm_records_service as records_service,
)
@@ -35,6 +35,23 @@ def __init__(self, is_draft: bool):
records_service.draft_files if is_draft else records_service.files
)

    @unit_of_work()
    def set_to_metadata_only(
        self, draft_id: str, uow: Optional[UnitOfWork] = None
    ):
        try:
            record = records_service.read(system_identity, draft_id)._record
        except PIDUnregistered:
            record = records_service.read_draft(
                system_identity, draft_id
            )._record
        if record.files.entries:
            for k in record.files.entries.keys():
                self._delete_file(draft_id, k, uow)
        record.files.enabled = False
        record["access"]["status"] = "metadata-only"
        uow.register(RecordCommitOp(record))

    @unit_of_work()
    def _delete_file(
        self, draft_id: str, key: str, uow: Optional[UnitOfWork] = None
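The new method relies on a read-then-fall-back pattern: published records are loaded with records_service.read(), while unpublished drafts raise PIDUnregistered and are loaded with read_draft() instead. A standalone sketch of that pattern, using the same imports as the file above (the helper name is invented; illustration only):

# Illustration only: the same lookup pattern as set_to_metadata_only,
# extracted into a hypothetical standalone helper.
from invenio_access.permissions import system_identity
from invenio_pidstore.errors import PIDUnregistered
from invenio_rdm_records.proxies import (
    current_rdm_records_service as records_service,
)

def load_record_or_draft(record_id):
    try:
        # Published records resolve through the regular read service.
        return records_service.read(system_identity, record_id)._record
    except PIDUnregistered:
        # Unpublished drafts have no registered PID yet; fall back to the draft.
        return records_service.read_draft(system_identity, record_id)._record
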
146 changes: 75 additions & 71 deletions invenio_record_importer_kcworks/services/stats/stats.py
@@ -290,18 +290,12 @@ def get_field_value(record: dict, field: str) -> int:
print(f"downloads: {downloads}")
print(f"record creation date: {record_creation}")

pid = record.pid

files_request = records_service.files.list_files(
system_identity, record_id
).to_dict()

first_file = files_request["entries"][0]
file_id = first_file["file_id"]
file_key = first_file["key"]
size = first_file["size"]
bucket_id = first_file["bucket_id"]

pid = record.pid

# Check for existing view and download events
# imported events are flagged with country: "imported"
# this is a hack
@@ -311,12 +305,7 @@ def get_field_value(record: dict, field: str) -> int:
app.logger.warning(f"Error searching for view events: {e}")
print(f"Error searching for view events: {e}")
existing_view_events = []
try:
existing_download_events = download_events_search(file_id)
except NotFoundError as e:
app.logger.warning(f"Error searching for download events: {e}")
print(f"Error searching for download events: {e}")
existing_download_events = []

if verbose:
app.logger.info(
"existing view events: "
@@ -398,67 +387,82 @@ def get_field_value(record: dict, field: str) -> int:
)
current_stats.publish("record-view", view_events)

existing_download_count = len(existing_download_events)
if existing_download_count == downloads:
app.logger.info(
" skipping download events creation. "
f"{existing_download_count} "
"download events already exist."
)
else:
if existing_download_count > downloads:
# TODO: we should probably log this, but
# we don't want to raise an error and prevent
# the import from completing
#
# raise TooManyDownloadEventsError(
# " existing imported download events exceed expected "
# "count."
# )
pass
else:
# only create enough new download events to reach the expected
# count
if existing_download_count > 0:
downloads -= existing_download_count
download_events = []
for idx, dt in enumerate(
self.generate_datetimes(record_creation, downloads)
):
uid = str(uuid.uuid4())
doc = {
"timestamp": dt.naive.isoformat(),
"bucket_id": str(bucket_id), # UUID
"file_id": str(file_id), # UUID
"file_key": file_key,
"size": size,
"recid": record_id,
"parent_recid": metadata_record["parent"]["id"],
"is_robot": False,
"user_id": uid,
"session_id": uid,
"country": "imported",
"unique_id": f"{str(bucket_id)}_{str(file_id)}",
"via_api": False,
}
# doc = anonymize_user(doc)
# NOTE: no longer using anonymize_user to generate
# unique_session_id and visitor_id from
# user_id, session_id, user_agent, ip_address
# instead, we're using a UUID that should be unique
# to ensure events are aggregated as unique
doc["unique_session_id"] = uid
doc["visitor_id"] = uid
download_events.append(build_file_unique_id(doc))
print(
"DOWNLOAD EVENTS being published:",
pformat(len(download_events)),
if files_request["enabled"] is True:
first_file = files_request["entries"][0]
file_id = first_file["file_id"]
file_key = first_file["key"]
size = first_file["size"]
bucket_id = first_file["bucket_id"]

try:
existing_download_events = download_events_search(file_id)
except NotFoundError as e:
app.logger.warning(f"Error searching for download events: {e}")
print(f"Error searching for download events: {e}")
existing_download_events = []

existing_download_count = len(existing_download_events)

if existing_download_count == downloads:
app.logger.info(
" skipping download events creation. "
f"{existing_download_count} "
"download events already exist."
)
current_stats.publish("file-download", download_events)
else:
if existing_download_count > downloads:
# TODO: we should probably log this, but
# we don't want to raise an error and prevent
# the import from completing
#
# raise TooManyDownloadEventsError(
# " existing imported download events exceed expected "
# "count."
# )
pass
else:
# only create enough new download events to reach the expected
# count
if existing_download_count > 0:
downloads -= existing_download_count
download_events = []
for idx, dt in enumerate(
self.generate_datetimes(record_creation, downloads)
):
uid = str(uuid.uuid4())
doc = {
"timestamp": dt.naive.isoformat(),
"bucket_id": str(bucket_id), # UUID
"file_id": str(file_id), # UUID
"file_key": file_key,
"size": size,
"recid": record_id,
"parent_recid": metadata_record["parent"]["id"],
"is_robot": False,
"user_id": uid,
"session_id": uid,
"country": "imported",
"unique_id": f"{str(bucket_id)}_{str(file_id)}",
"via_api": False,
}
# doc = anonymize_user(doc)
# NOTE: no longer using anonymize_user to generate
# unique_session_id and visitor_id from
# user_id, session_id, user_agent, ip_address
# instead, we're using a UUID that should be unique
# to ensure events are aggregated as unique
doc["unique_session_id"] = uid
doc["visitor_id"] = uid
download_events.append(build_file_unique_id(doc))
print(
"DOWNLOAD EVENTS being published:",
pformat(len(download_events)),
)
current_stats.publish("file-download", download_events)

try:
app.logger.warning("Trying to process events...")
app.logger.warning(f"EAGER: { eager }")
app.logger.warning(f"EAGER: {eager}")
if eager:
app.logger.warning("Processing events...")
events = process_events(["record-view", "file-download"])
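The substance of the stats.py change is that synthetic download events are now generated only when the record actually has files enabled, so metadata-only records get view events but no download events. A simplified, self-contained restatement of that decision logic (the function name and signature are invented; the checks mirror the hunk above, but this is not the verbatim code):

# Hypothetical, simplified restatement of the new control flow in stats.py.
def plan_download_events(files_request, downloads, existing_download_events):
    """Return how many synthetic download events should be created."""
    if files_request.get("enabled") is not True:
        # Metadata-only record: no files, so no download events at all.
        return 0
    existing = len(existing_download_events)
    if existing >= downloads:
        # Enough (or too many) imported events already exist; create none.
        return 0
    # Only create enough new events to reach the expected count.
    return downloads - existing

# Example: a metadata-only record yields no download events.
assert plan_download_events({"enabled": False}, 5, []) == 0
# Example: 2 events already imported, 5 expected -> create 3 more.
assert plan_download_events({"enabled": True}, 5, [{}, {}]) == 3
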
