Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bayesian format obsolescence modeling #116

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions AIPscan/API/namespace_report_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,3 +140,14 @@ class AgentData(Resource):
def get(self, storage_service_id):
"""List user agents and their transfers"""
return report_data.agents_transfers(storage_service_id=storage_service_id)


@api.route("/bayesian-format-modeling/<storage_service_id>")
class BayesianList(Resource):
    # Read-only endpoint: hands back the report built by
    # report_data.bayesian_format_modeling for the requested storage
    # service without any further processing.
    @api.doc("formats and creation dates")
    def get(self, storage_service_id):
        """List file formats and file creation dates"""
        return report_data.bayesian_format_modeling(
            storage_service_id=storage_service_id
        )
1 change: 1 addition & 0 deletions AIPscan/Data/fields.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

FIELD_COUNT = "Count"
FIELD_CREATED_DATE = "CreatedDate"
# Plural variant: keys the list of "YYYY-MM-DD" creation-date strings that
# the Bayesian format modeling report collects for each format.
FIELD_CREATED_DATES = "CreatedDates"

FIELD_DERIVATIVE_COUNT = "DerivativeCount"
FIELD_DERIVATIVE_FORMAT = "DerivativeFormat"
Expand Down
60 changes: 60 additions & 0 deletions AIPscan/Data/report_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,3 +341,63 @@ def agents_transfers(storage_service_id):
ingests.append(log_line)
report[fields.FIELD_INGESTS] = ingests
return report


def _add_date_to_formats_list(formats, file_format):
    """Merge a single-file format record into the running formats list.

    If a record with the same PUID is already present, only the created
    date of ``file_format`` is appended to that record's list of dates.
    Otherwise the whole ``file_format`` record is appended as a new
    entry.

    :param formats: List of format records collected so far. It is
        modified in place.
    :param file_format: Format record for one file; its created dates
        list is expected to hold exactly one date.

    :returns: The same ``formats`` list that was passed in.
    """
    for fmt in formats:
        if fmt[fields.FIELD_PUID] == file_format[fields.FIELD_PUID]:
            fmt[fields.FIELD_CREATED_DATES].append(
                file_format[fields.FIELD_CREATED_DATES][0]
            )
            return formats
    # No record with this PUID yet (this also covers an empty list), so
    # the record is added exactly once. Special-casing the empty list
    # and then falling through to the loop would match the record that
    # was just appended and count its date twice.
    formats.append(file_format)
    return formats


def bayesian_format_modeling(storage_service_id):
    """Build the per-format creation date report for a storage service.

    Every original file with a creation date, in every AIP of the
    storage service, contributes its date (formatted ``YYYY-MM-DD``) to
    the record of its format. Records are grouped by PUID.

    :param storage_service_id: Identifier used to look up the storage
        service.

    :returns: Dictionary holding the storage service name and, when a
        storage service was found, the list of format records. When no
        storage service is found the name is ``None`` and no format
        list is included.
    """
    storage_service = _get_storage_service(storage_service_id)

    report = {}
    try:
        report[fields.FIELD_STORAGE_NAME] = storage_service.name
    except AttributeError:
        # The lookup gave us nothing usable, so there is nothing to
        # report on.
        report[fields.FIELD_STORAGE_NAME] = None
        return report

    collected = []
    for aip in AIP.query.filter_by(storage_service_id=storage_service.id).all():
        originals = File.query.filter_by(aip_id=aip.id, file_type=FileType.original)
        for original in originals:
            created = original.date_created
            if not created:
                # No date recorded for this file, e.g. it was extracted
                # from a zip.
                continue

            if original.format_version:
                format_name = "{} {}".format(
                    original.file_format, original.format_version
                )
            else:
                format_name = original.file_format

            record = {
                fields.FIELD_CREATED_DATES: [created.strftime("%Y-%m-%d")],
                fields.FIELD_PUID: original.puid or "Unknown",
                fields.FIELD_FORMAT: format_name,
            }
            collected = _add_date_to_formats_list(collected, record)

    report[fields.FIELD_ALL_AIPS] = collected
    return report
9 changes: 8 additions & 1 deletion AIPscan/Data/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ def _create_test_file(**kwargs):
uuid=kwargs.get("uuid", str(uuid.uuid4())),
file_type=kwargs.get("file_type", FileType.original),
size=kwargs.get("size", 0),
date_created=kwargs.get("date_created", datetime.now()),
date_created=kwargs.get(
"date_created", datetime.strptime("1970-01-01", "%Y-%m-%d")
),
puid=kwargs.get("puid", "fmt/test-1"),
file_format=kwargs.get("file_format", "ACME File Format"),
format_version=kwargs.get("format_version", "0.0.0"),
Expand Down Expand Up @@ -185,13 +187,15 @@ def app_with_populated_files(scope="package"):
size=ORIGINAL_FILE_SIZE,
puid=TIFF_PUID,
file_format=TIFF_FILE_FORMAT,
date_created=datetime.strptime("1970-01-01", "%Y-%m-%d"),
)

_ = _create_test_file(
file_type=FileType.preservation,
size=PRESERVATION_FILE_SIZE,
puid=TIFF_PUID,
file_format=TIFF_FILE_FORMAT,
date_created=datetime.strptime("1970-01-02", "%Y-%m-%d"),
)

user_agent = _create_test_agent()
Expand Down Expand Up @@ -241,6 +245,7 @@ def app_with_populated_format_versions(scope="package"):
file_format=JPEG_FILE_FORMAT,
format_version=JPEG_1_01_FORMAT_VERSION,
aip_id=aip1.id,
date_created=datetime.strptime("1970-01-03", "%Y-%m-%d"),
)

_ = _create_test_file(
Expand All @@ -249,6 +254,7 @@ def app_with_populated_format_versions(scope="package"):
file_format=JPEG_FILE_FORMAT,
format_version=JPEG_1_02_FORMAT_VERSION,
aip_id=aip2.id,
date_created=datetime.strptime("1970-01-04", "%Y-%m-%d"),
)

_ = _create_test_file(
Expand All @@ -257,6 +263,7 @@ def app_with_populated_format_versions(scope="package"):
file_format="ISO Disk Image File",
format_version=None,
aip_id=aip2.id,
date_created=datetime.strptime("1970-01-05", "%Y-%m-%d"),
)

yield app
Expand Down
57 changes: 57 additions & 0 deletions AIPscan/Data/tests/test_bayesian_format_modeling_data.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
# -*- coding: utf-8 -*-

from AIPscan.Data import fields, report_data
from AIPscan.Data.tests import MOCK_STORAGE_SERVICE_ID

TEST_STORAGE_SERVICE = "test storage service"


def test_bayesian_modeling_data_one(app_with_populated_files):
    """Check the data returned from the Bayesian format modeling
    endpoint for the first application fixture, which records just one
    format.
    """
    report = report_data.bayesian_format_modeling(
        storage_service_id=MOCK_STORAGE_SERVICE_ID
    )
    assert report[fields.FIELD_STORAGE_NAME] == TEST_STORAGE_SERVICE

    all_formats = report[fields.FIELD_ALL_AIPS]
    assert len(all_formats) == 1

    only_format = all_formats[0]
    assert only_format[fields.FIELD_FORMAT] == "Tagged Image File Format 0.0.0"
    assert only_format[fields.FIELD_PUID] == "fmt/353"
    # NOTE(review): presumably this count depends on how
    # _add_date_to_formats_list records the first format it sees -
    # confirm against the number of original files in the fixture.
    assert len(only_format[fields.FIELD_CREATED_DATES]) == 2


def test_bayesian_modeling_data_two(app_with_populated_format_versions):
    """Check the data returned from the Bayesian format modeling
    endpoint for the second application fixture, which records three
    formats.
    """
    report = report_data.bayesian_format_modeling(
        storage_service_id=MOCK_STORAGE_SERVICE_ID
    )
    assert report[fields.FIELD_STORAGE_NAME] == TEST_STORAGE_SERVICE

    all_formats = report[fields.FIELD_ALL_AIPS]
    assert len(all_formats) == 3

    expected_formats = {"JPEG 1.01", "JPEG 1.02", "ISO Disk Image File"}
    expected_puids = {"fmt/43", "fmt/44", "fmt/468"}
    # NOTE(review): the doubled first date presumably mirrors how
    # _add_date_to_formats_list records the first format it sees -
    # confirm this is the intended output.
    expected_dates = [
        ["1970-01-03", "1970-01-03"],
        ["1970-01-04"],
        ["1970-01-05"],
    ]

    # Order of the records is not asserted, only their content.
    assert expected_formats == {entry[fields.FIELD_FORMAT] for entry in all_formats}
    assert expected_puids == {entry[fields.FIELD_PUID] for entry in all_formats}

    result_dates = [entry[fields.FIELD_CREATED_DATES] for entry in all_formats]
    for expected in expected_dates:
        assert expected in result_dates
102 changes: 102 additions & 0 deletions AIPscan/Reporter/report_bayesian_modeling.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,102 @@
# -*- coding: utf-8 -*-

"""Report Bayesian formats provides the user with information on the
distribution of a specific file format (PUID) over time. Through the
lens of this distribution we might understand more about when a format
first came into existence - when a format was abundant - and signs that
the format is on its way to becoming obsolete.

Based on Nick Krabbenhoeft's "Bayesian Modeling of File Format
Obsolescence"
"""

import base64
from datetime import datetime
from io import BytesIO

import seaborn as sns
from flask import render_template, request
from matplotlib import pyplot as plt

from AIPscan.Data import fields, report_data
from AIPscan.Reporter import reporter, request_params

sns.set_theme()


def retrieve_year(date):
    """Return the year part of a ``YYYY-MM-DD`` date string.

    :param date: Date formatted as ``%Y-%m-%d``.

    :returns: The year as a string.
    """
    parsed = datetime.strptime(date, "%Y-%m-%d")
    return parsed.strftime("%Y")


# Default threshold: a format needs more than this many recorded dates
# before a rug plot is produced for it.
SIGNIFICANT_RESULTS = 10


def save_rug_plots(format_report, significant_results=SIGNIFICANT_RESULTS):
    """Return Base64 encoded figures to caller

    One rug plot of creation years is rendered for every format in the
    report that has more than ``significant_results`` recorded dates.

    :param format_report: Bayesian format modeling report.
    :param significant_results: Number of results worth returning in
        these reports, e.g. 1 does not tell us a lot of information.
        Formats with this many dates or fewer are skipped.

    :returns: Empty array or array of Base64 encoded images to be
        rendered. Each item is a single-entry dictionary mapping the
        format's PUID to its Base64 encoded PNG.
    """

    # Seaborn setup.
    sns.set(rc={"figure.figsize": (30, 8)})
    sns.set_theme(style="ticks", palette="icefire")

    plot_output = []
    for aip in format_report.get(fields.FIELD_ALL_AIPS, []):
        puid = aip.get(fields.FIELD_PUID)
        # Treat a missing dates list the same as an empty one.
        format_dates = aip.get(fields.FIELD_CREATED_DATES) or []
        if not format_dates or len(format_dates) <= significant_results:
            continue
        years = [int(retrieve_year(date)) for date in format_dates]

        fig, axes = plt.subplots()
        try:
            # Setup axes to be useful to the reader: pad a decade either
            # side of the data and tick every five years.
            min_year = min(years) - 10
            max_year = max(years) + 10
            axes.set_xlim(min_year, max_year)
            axes.set_xticks(range(min_year, max_year, 5))

            # Plot our chart.
            plot = sns.rugplot(
                data=years, height=1, y=None, x=years, legend=True, ax=axes
            )
            plot.set(yticklabels=[])

            # Save the chart image to memory. The format is explicit
            # because the template embeds the result as image/png.
            img = BytesIO()
            fig.savefig(
                img,
                format="png",
                bbox_inches="tight",
                pad_inches=0.3,
                transparent=True,
            )
        finally:
            # pyplot keeps a reference to every figure it creates until
            # it is closed; clearing the figure is not enough to release
            # it in a long-running process.
            plt.close(fig)

        # Convert bytes to Base64 encoding for rendering later.
        encoded = base64.b64encode(img.getvalue()).decode("utf8")
        plot_output.append({puid: encoded})

    return plot_output


@reporter.route("/bayesian_rug_plot/", methods=["GET"])
def report_bayesian_modeling_rug():
    """Render the Bayesian format modeling rug plot report.

    The storage service is taken from the request's query string, the
    format report is built for it, and the resulting figures are passed
    to the ``bayesian_rug_plot.html`` template.
    """
    ss_id = request.args.get(request_params["storage_service_id"])
    report = report_data.bayesian_format_modeling(ss_id)

    context = {
        "storage_service_id": ss_id,
        "storage_service_name": report.get(fields.FIELD_STORAGE_NAME),
        "figures": save_rug_plots(report),
    }
    return render_template("bayesian_rug_plot.html", **context)
26 changes: 26 additions & 0 deletions AIPscan/Reporter/templates/bayesian_rug_plot.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{% extends "report_base.html" %}

{% block content %}
{# Bayesian format modeling report: a header naming the storage service, followed by one rug plot image per format. #}

<div class="alert alert-secondary">
<a class="noprint" onClick="window.print();return false"><button type="button" class="btn btn-info" style="float:right;">Print</button></a>
<strong>Report:</strong>
<span title="Represents a distribution of file-format instance creation dates. Thicker lines demonstrate a greater number of format instances recorded for a single date in time and less-dense portions of the distribution either side of rug-plot show fewer instances of file-formats being recorded. Those to the right given a long-enough gap between today and the last lines may begin to show us a file-format in the process of becoming obsolete.">
Bayesian format modeling
</span>
<br>
<strong>Storage Service:</strong> {{ storage_service_name }}
<br>
</div>

{# Each item in `figures` is a single-entry mapping of PUID to Base64 encoded image data, so the inner loop runs once per figure. #}
{% for figure in figures %}
{% for value in figure %}
<b>&nbsp; {{ value }}:</b>
<img class="plot" src="data:image/png;base64, {{ figure[value] }}" />
<br>
{% endfor %}
{% endfor %}

{# NOTE(review): this closing tag has no matching opening <div> inside this template - confirm whether report_base.html relies on it. #}
</div>

{% endblock %}
21 changes: 21 additions & 0 deletions AIPscan/Reporter/templates/reports.html
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,16 @@
</tr>
{% endif %}

<tr>
<td>Bayesian format modeling</td>
<td></td>
<td></td>
<td></td>
<td>
<a href="#"><button type="button" id="bayesianRugPlot" class="btn btn-info" style="margin-left:5px; margin-bottom:5px;">Rug plot</button></a>
</td>
</tr>

<tr>
<td>Transfers log </td>
<td></td>
Expand Down Expand Up @@ -406,6 +416,17 @@
);
window.open(url);
});
// Build the rug plot report URL for the storage service currently selected in #ss and open it with window.open.
$("#bayesianRugPlot").on("click", function() {
const URL_AIP_CONTENTS = "/reporter/bayesian_rug_plot/";
var storageServiceID = $('#ss').val();
var url = (
window.location.origin +
URL_AIP_CONTENTS +
'?amss_id=' +
storageServiceID
);
window.open(url);
});
$("#transferLogTabular").on("click", function() {
const URL_AIP_CONTENTS = "/reporter/ingest_log_tabular/";
var storageServiceID = $('#ss').val();
Expand Down
Loading