From 472089c989fe8f047c653cf2142e3fe9ffff16fd Mon Sep 17 00:00:00 2001 From: Ross Spencer Date: Fri, 19 Feb 2021 10:48:57 -0500 Subject: [PATCH] Bayesian format obsolescence modeling --- AIPscan/API/namespace_report_data.py | 11 ++ AIPscan/Data/fields.py | 1 + AIPscan/Data/report_data.py | 60 ++++++++++ AIPscan/Data/tests/conftest.py | 9 +- .../test_bayesian_format_modeling_data.py | 57 +++++++++ AIPscan/Reporter/report_bayesian_modeling.py | 102 ++++++++++++++++ .../Reporter/templates/bayesian_rug_plot.html | 26 +++++ AIPscan/Reporter/templates/reports.html | 21 ++++ .../tests/test_bayesian_format_modeling.py | 110 ++++++++++++++++++ AIPscan/Reporter/views.py | 1 + AIPscan/static/css/custom.css | 8 ++ requirements/base.txt | 2 + 12 files changed, 407 insertions(+), 1 deletion(-) create mode 100644 AIPscan/Data/tests/test_bayesian_format_modeling_data.py create mode 100644 AIPscan/Reporter/report_bayesian_modeling.py create mode 100644 AIPscan/Reporter/templates/bayesian_rug_plot.html create mode 100644 AIPscan/Reporter/tests/test_bayesian_format_modeling.py diff --git a/AIPscan/API/namespace_report_data.py b/AIPscan/API/namespace_report_data.py index aaf4d070..9658d3c5 100644 --- a/AIPscan/API/namespace_report_data.py +++ b/AIPscan/API/namespace_report_data.py @@ -140,3 +140,14 @@ class AgentData(Resource): def get(self, storage_service_id): """List user agents and their transfers""" return report_data.agents_transfers(storage_service_id=storage_service_id) + + +@api.route("/bayesian-format-modeling/") +class BayesianList(Resource): + @api.doc("formats and creation dates") + def get(self, storage_service_id): + """List file formats and file creation dates""" + aip_data = report_data.bayesian_format_modeling( + storage_service_id=storage_service_id + ) + return aip_data diff --git a/AIPscan/Data/fields.py b/AIPscan/Data/fields.py index 2427bf59..25b9594e 100644 --- a/AIPscan/Data/fields.py +++ b/AIPscan/Data/fields.py @@ -11,6 +11,7 @@ FIELD_COUNT = "Count" 
FIELD_CREATED_DATE = "CreatedDate" +FIELD_CREATED_DATES = "CreatedDates" FIELD_DERIVATIVE_COUNT = "DerivativeCount" FIELD_DERIVATIVE_FORMAT = "DerivativeFormat" diff --git a/AIPscan/Data/report_data.py b/AIPscan/Data/report_data.py index 55140046..77d17aa0 100644 --- a/AIPscan/Data/report_data.py +++ b/AIPscan/Data/report_data.py @@ -341,3 +341,63 @@ def agents_transfers(storage_service_id): ingests.append(log_line) report[fields.FIELD_INGESTS] = ingests return report + + +def _add_date_to_formats_list(formats, file_format): + """Add the created date associated with a format instance to a + formats list if it already exists. If the format is not already in + the list then add the entire format record. + """ + if not formats: + formats.append(file_format) + for fmt in formats: + if fmt[fields.FIELD_PUID] == file_format[fields.FIELD_PUID]: + fmt[fields.FIELD_CREATED_DATES].append( + file_format[fields.FIELD_CREATED_DATES][0] + ) + return formats + formats.append(file_format) + return formats + + +def bayesian_format_modeling(storage_service_id): + """Return a breakdown of file formats and the date they were + recorded as being created in the system. + """ + report = {} + formats = [] + + storage_service = _get_storage_service(storage_service_id) + try: + report[fields.FIELD_STORAGE_NAME] = storage_service.name + except AttributeError: + # No storage service has been returned and so we have nothing + # to return. + report[fields.FIELD_STORAGE_NAME] = None + return report + + aips = AIP.query.filter_by(storage_service_id=storage_service.id).all() + for aip in aips: + original_files = File.query.filter_by( + aip_id=aip.id, file_type=FileType.original + ) + for original in original_files: + if not original.date_created: + # Date isn't available for the file, e.g. extracted + # from zip. 
+ continue + file_format = {} + file_format[fields.FIELD_CREATED_DATES] = [ + original.date_created.strftime("%Y-%m-%d") + ] + file_format[fields.FIELD_PUID] = original.puid + if not original.puid: + file_format[fields.FIELD_PUID] = "Unknown" + file_format[fields.FIELD_FORMAT] = "{} {}".format( + original.file_format, original.format_version + ) + if not original.format_version: + file_format[fields.FIELD_FORMAT] = original.file_format + formats = _add_date_to_formats_list(formats, file_format) + report[fields.FIELD_ALL_AIPS] = formats + return report diff --git a/AIPscan/Data/tests/conftest.py b/AIPscan/Data/tests/conftest.py index d681c6b0..4bb6e9bb 100644 --- a/AIPscan/Data/tests/conftest.py +++ b/AIPscan/Data/tests/conftest.py @@ -109,7 +109,9 @@ def _create_test_file(**kwargs): uuid=kwargs.get("uuid", str(uuid.uuid4())), file_type=kwargs.get("file_type", FileType.original), size=kwargs.get("size", 0), - date_created=kwargs.get("date_created", datetime.now()), + date_created=kwargs.get( + "date_created", datetime.strptime("1970-01-01", "%Y-%m-%d") + ), puid=kwargs.get("puid", "fmt/test-1"), file_format=kwargs.get("file_format", "ACME File Format"), format_version=kwargs.get("format_version", "0.0.0"), @@ -185,6 +187,7 @@ def app_with_populated_files(scope="package"): size=ORIGINAL_FILE_SIZE, puid=TIFF_PUID, file_format=TIFF_FILE_FORMAT, + date_created=datetime.strptime("1970-01-01", "%Y-%m-%d"), ) _ = _create_test_file( @@ -192,6 +195,7 @@ def app_with_populated_files(scope="package"): size=PRESERVATION_FILE_SIZE, puid=TIFF_PUID, file_format=TIFF_FILE_FORMAT, + date_created=datetime.strptime("1970-01-02", "%Y-%m-%d"), ) user_agent = _create_test_agent() @@ -241,6 +245,7 @@ def app_with_populated_format_versions(scope="package"): file_format=JPEG_FILE_FORMAT, format_version=JPEG_1_01_FORMAT_VERSION, aip_id=aip1.id, + date_created=datetime.strptime("1970-01-03", "%Y-%m-%d"), ) _ = _create_test_file( @@ -249,6 +254,7 @@ def 
app_with_populated_format_versions(scope="package"): file_format=JPEG_FILE_FORMAT, format_version=JPEG_1_02_FORMAT_VERSION, aip_id=aip2.id, + date_created=datetime.strptime("1970-01-04", "%Y-%m-%d"), ) _ = _create_test_file( @@ -257,6 +263,7 @@ def app_with_populated_format_versions(scope="package"): file_format="ISO Disk Image File", format_version=None, aip_id=aip2.id, + date_created=datetime.strptime("1970-01-05", "%Y-%m-%d"), ) yield app diff --git a/AIPscan/Data/tests/test_bayesian_format_modeling_data.py b/AIPscan/Data/tests/test_bayesian_format_modeling_data.py new file mode 100644 index 00000000..5e054b89 --- /dev/null +++ b/AIPscan/Data/tests/test_bayesian_format_modeling_data.py @@ -0,0 +1,57 @@ +# -*- coding: utf-8 -*- + +from AIPscan.Data import fields, report_data +from AIPscan.Data.tests import MOCK_STORAGE_SERVICE_ID + +TEST_STORAGE_SERVICE = "test storage service" + + +def test_bayesian_modeling_data_one(app_with_populated_files,): + """Ensure that the data returned from the Bayesian format modeling + endpoint is returned as expected for our first application fixture + with just one format recorded. + """ + report = report_data.bayesian_format_modeling( + storage_service_id=MOCK_STORAGE_SERVICE_ID + ) + + assert report[fields.FIELD_STORAGE_NAME] == TEST_STORAGE_SERVICE + assert len(report[fields.FIELD_ALL_AIPS]) == 1 + assert ( + report[fields.FIELD_ALL_AIPS][0][fields.FIELD_FORMAT] + == "Tagged Image File Format 0.0.0" + ) + assert report[fields.FIELD_ALL_AIPS][0][fields.FIELD_PUID] == "fmt/353" + assert len(report[fields.FIELD_ALL_AIPS][0][fields.FIELD_CREATED_DATES]) == 2 + + +def test_bayesian_modeling_data_two(app_with_populated_format_versions,): + """Ensure that the data returned from the Bayesian format modeling + endpoint is returned as expected for our second application fixture + with three formats recorded. 
+ """ + report = report_data.bayesian_format_modeling( + storage_service_id=MOCK_STORAGE_SERVICE_ID + ) + + assert report[fields.FIELD_STORAGE_NAME] == TEST_STORAGE_SERVICE + assert len(report[fields.FIELD_ALL_AIPS]) == 3 + + formats = ["JPEG 1.01", "JPEG 1.02", "ISO Disk Image File"] + puids = ["fmt/43", "fmt/44", "fmt/468"] + dates = [["1970-01-03", "1970-01-03"], ["1970-01-04"], ["1970-01-05"]] + + result_formats = [ + item[fields.FIELD_FORMAT] for item in report[fields.FIELD_ALL_AIPS] + ] + result_puids = [item[fields.FIELD_PUID] for item in report[fields.FIELD_ALL_AIPS]] + result_dates = [ + item[fields.FIELD_CREATED_DATES] for item in report[fields.FIELD_ALL_AIPS] + ] + + assert set(formats) == set(result_formats) + assert set(puids) == set(result_puids) + + assert dates[0] in result_dates + assert dates[1] in result_dates + assert dates[2] in result_dates diff --git a/AIPscan/Reporter/report_bayesian_modeling.py b/AIPscan/Reporter/report_bayesian_modeling.py new file mode 100644 index 00000000..780d28dd --- /dev/null +++ b/AIPscan/Reporter/report_bayesian_modeling.py @@ -0,0 +1,102 @@ +# -*- coding: utf-8 -*- + +"""Report Bayesian formats provides the user with information on the +distribution of a specific file format (PUID) over time. Through the +lens of this distribution we might understand more about when a format +first came into existence - when a format was abundant - and signs that +the format is on its way to becoming obsolete. 
+ +Based on Nick Krabbenhoeft's "Bayesian Modeling of File Format +Obsolescence" +""" + +import base64 +from datetime import datetime +from io import BytesIO + +import seaborn as sns +from flask import render_template, request +from matplotlib import pyplot as plt + +from AIPscan.Data import fields, report_data +from AIPscan.Reporter import reporter, request_params + +sns.set_theme() + + +def retrieve_year(date): + """Retrieve year from date.""" + return datetime.strptime(date, "%Y-%m-%d").strftime("%Y") + + +SIGNIFICANT_RESULTS = 10 + + +def save_rug_plots(format_report, significant_results=SIGNIFICANT_RESULTS): + """Return Base64 encoded figures to caller + + :param format_report: Bayesian format modeling report. + :param significant_results: Number of results worth returning in + these reports, e.g. 1 does not tell us a lot of information. + + :returns: Empty array or array of Base64 encoded images to be + rendered. + """ + + # Seaborn setup. + sns.set(rc={"figure.figsize": (30, 8)}) + sns.set_theme(style="ticks", palette="icefire") + + # Process AIP format data. + all_aips = format_report.get(fields.FIELD_ALL_AIPS, []) + plot_output = [] + dates = [] + idx = 0 + for idx, aip in enumerate(all_aips): + dates = [] + PUID = aip.get(fields.FIELD_PUID) + format_dates = aip.get(fields.FIELD_CREATED_DATES) + if len(format_dates) <= significant_results: + continue + year = [int(retrieve_year(date)) for date in format_dates] + dates.extend(year) + fig, axes = plt.subplots() + + # Setup axes to be useful to the reader. + min_date = min(dates) - 10 + max_date = max(dates) + 10 + axes.set_xlim(min_date, max_date) + axes.set_xticks(range(min_date, max_date, 5)) + + # Plot our chart. + plot = sns.rugplot(data=dates, height=1, y=None, x=dates, legend=True, ax=axes) + plot.set(yticklabels=[]) + + # Save the chart image to memory. 
+ img = BytesIO() + fig.savefig(img, bbox_inches="tight", pad_inches=0.3, transparent=True) + fig.clf() + img.seek(0) + + # Convert bytes to Base64 encoding for rendering later. + plot = base64.b64encode(img.getvalue()).decode("utf8") + plot_output.append({PUID: plot}) + + return plot_output + + +@reporter.route("/bayesian_rug_plot/", methods=["GET"]) +def report_bayesian_modeling_rug(): + """Bayesian format modeling rug plot.""" + + storage_service_id = request.args.get(request_params["storage_service_id"]) + + format_report = report_data.bayesian_format_modeling(storage_service_id) + figures = save_rug_plots(format_report) + + return render_template( + "bayesian_rug_plot.html", + storage_service_id=storage_service_id, + storage_service_name=format_report.get(fields.FIELD_STORAGE_NAME), + figures=figures, + ) diff --git a/AIPscan/Reporter/templates/bayesian_rug_plot.html b/AIPscan/Reporter/templates/bayesian_rug_plot.html new file mode 100644 index 00000000..1ccc3044 --- /dev/null +++ b/AIPscan/Reporter/templates/bayesian_rug_plot.html @@ -0,0 +1,26 @@ +{% extends "report_base.html" %} + +{% block content %} + +
+ + Report: + + Bayesian format modeling + +
+ Storage Service: {{ storage_service_name }} +
+
+ + {% for figure in figures %} + {% for value in figure %} +   {{ value }}: + +
+ {% endfor %} + {% endfor %} + + + +{% endblock %} diff --git a/AIPscan/Reporter/templates/reports.html b/AIPscan/Reporter/templates/reports.html index c1d54e42..93c113ae 100644 --- a/AIPscan/Reporter/templates/reports.html +++ b/AIPscan/Reporter/templates/reports.html @@ -148,6 +148,16 @@ {% endif %} + + Bayesian format modeling + + + + + + + + Transfers log @@ -406,6 +416,17 @@ ); window.open(url); }); + $("#bayesianRugPlot").on("click", function() { + const URL_AIP_CONTENTS = "/reporter/bayesian_rug_plot/"; + var storageServiceID = $('#ss').val(); + var url = ( + window.location.origin + + URL_AIP_CONTENTS + + '?amss_id=' + + storageServiceID + ); + window.open(url); + }); $("#transferLogTabular").on("click", function() { const URL_AIP_CONTENTS = "/reporter/ingest_log_tabular/"; var storageServiceID = $('#ss').val(); diff --git a/AIPscan/Reporter/tests/test_bayesian_format_modeling.py b/AIPscan/Reporter/tests/test_bayesian_format_modeling.py new file mode 100644 index 00000000..01ff3af2 --- /dev/null +++ b/AIPscan/Reporter/tests/test_bayesian_format_modeling.py @@ -0,0 +1,110 @@ +# -*- coding: utf-8 -*- + +import json + +import pytest + +from AIPscan.Reporter.report_bayesian_modeling import save_rug_plots + +format_report = """ +{ + "AllAIPs": [ + { + "CreatedDates": [ + "2020-02-11", + "2020-02-11", + "2019-09-16", + "2019-09-16" + ], + "PUID": "fmt/199", + "Format": "MPEG-4 Media File None" + }, + { + "CreatedDates": [ + "2020-01-15", + "2020-03-04", + "2020-01-08", + "2018-08-09", + "1987-07-27", + "2018-08-09" + ], + "PUID": "x-fmt/111", + "Format": "Plain Text None" + }, + { + "CreatedDates": [ + "2018-08-02", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2020-01-08", + "2018-08-09", + "2018-01-22", + "2018-01-31", + "2019-02-25", + "2018-08-02", + "2020-11-19", + "1986-03-11", + "2018-08-09", + "2020-01-08", 
+ "2020-01-08" + ], + "PUID": "fmt/43", + "Format": "JPEG 1.01" + } + ] +} +""" + + +@pytest.mark.parametrize( + "number_of_significant_results, length_of_results, keys_to_test", + [ + # Threshold for significant results means that all results are + # returned. + (0, 3, ["fmt/199", "x-fmt/111", "fmt/43"]), + # Threshold for significant results means that two results are + # returned. + (5, 2, ["x-fmt/111", "fmt/43"]), + # Threshold for significant results means that one result is + # returned. + (10, 1, ["fmt/43"]), + # Threshold for significant results means zero results are + # returned. + (24, 0, []), + ], +) +def test_save_rug_plots( + mocker, number_of_significant_results, length_of_results, keys_to_test +): + """Ensure that we retrieve the correctly shaped data for the rug + plot report. + """ + format_report_json = json.loads(format_report) + + ENCODED_DATA = "encoded data" + + mocker.patch("base64.b64encode", return_value=ENCODED_DATA.encode("utf8")) + result = save_rug_plots(format_report_json, number_of_significant_results) + + if length_of_results == 0 and len(result) == 0: + # Output is as was expected, we can happily return from here. 
+ return + + assert len(result) == length_of_results + + puids = [] + for res in result: + puids.extend(res.keys()) + assert [*res.values()][0] == ENCODED_DATA + + assert set(puids) == set(keys_to_test) diff --git a/AIPscan/Reporter/views.py b/AIPscan/Reporter/views.py index aa7dfd6e..299a0aeb 100644 --- a/AIPscan/Reporter/views.py +++ b/AIPscan/Reporter/views.py @@ -17,6 +17,7 @@ report_aip_contents, report_aips_by_format, report_aips_by_puid, + report_bayesian_modeling, report_format_versions_count, report_formats_count, report_ingest_log, diff --git a/AIPscan/static/css/custom.css b/AIPscan/static/css/custom.css index ee178dbd..d096d589 100755 --- a/AIPscan/static/css/custom.css +++ b/AIPscan/static/css/custom.css @@ -10,6 +10,14 @@ body { font-family: 'Roboto', sans-serif } +img.plot { + padding: 0; + display: block; + margin: 0 auto; + max-height: 100%; + max-width: 100%; +} + #console { padding: 10px; border-radius: .25rem; diff --git a/requirements/base.txt b/requirements/base.txt index 2ad5206a..50cc3d96 100644 --- a/requirements/base.txt +++ b/requirements/base.txt @@ -18,12 +18,14 @@ Jinja2==2.11.1 kombu==4.6.10 lxml==4.6.2 MarkupSafe==1.1.1 +matplotlib==3.3.4 metsrw==0.3.15 natsort==7.0.1 pandas==1.1.4 plotly-express==0.4.1 pytz==2020.1 requests==2.23.0 +seaborn==0.11.1 six==1.14.0 SQLAlchemy==1.3.15 urllib3==1.25.8