-
Notifications
You must be signed in to change notification settings - Fork 3
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Bayesian format obsolescence modeling
- Loading branch information
1 parent
6f8f18a
commit 8fc6d4b
Showing
12 changed files
with
407 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,57 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
from AIPscan.Data import fields, report_data | ||
from AIPscan.Data.tests import MOCK_STORAGE_SERVICE_ID | ||
|
||
TEST_STORAGE_SERVICE = "test storage service" | ||
|
||
|
||
def test_bayesian_modeling_data_one(app_with_populated_files):
    """Check the Bayesian format modeling report for the first fixture.

    The first application fixture has just one format recorded, so the
    report is expected to hold a single entry carrying that format's
    name, its PUID and two creation dates.
    """
    report = report_data.bayesian_format_modeling(
        storage_service_id=MOCK_STORAGE_SERVICE_ID
    )

    assert report[fields.FIELD_STORAGE_NAME] == TEST_STORAGE_SERVICE

    entries = report[fields.FIELD_ALL_AIPS]
    assert len(entries) == 1

    only_entry = entries[0]
    assert only_entry[fields.FIELD_FORMAT] == "Tagged Image File Format 0.0.0"
    assert only_entry[fields.FIELD_PUID] == "fmt/353"
    assert len(only_entry[fields.FIELD_CREATED_DATES]) == 2
|
||
|
||
def test_bayesian_modeling_data_two(app_with_populated_format_versions):
    """Check the Bayesian format modeling report for the second fixture.

    The second application fixture has three formats recorded. The
    order of the report entries is not relied upon: formats and PUIDs
    are compared as sets and each expected group of creation dates only
    has to appear somewhere in the results.
    """
    report = report_data.bayesian_format_modeling(
        storage_service_id=MOCK_STORAGE_SERVICE_ID
    )

    assert report[fields.FIELD_STORAGE_NAME] == TEST_STORAGE_SERVICE

    entries = report[fields.FIELD_ALL_AIPS]
    assert len(entries) == 3

    expected_formats = {"JPEG 1.01", "JPEG 1.02", "ISO Disk Image File"}
    expected_puids = {"fmt/43", "fmt/44", "fmt/468"}
    expected_dates = (
        ["1970-01-03", "1970-01-03"],
        ["1970-01-04"],
        ["1970-01-05"],
    )

    found_formats = {entry[fields.FIELD_FORMAT] for entry in entries}
    found_puids = {entry[fields.FIELD_PUID] for entry in entries}
    found_dates = [entry[fields.FIELD_CREATED_DATES] for entry in entries]

    assert expected_formats == found_formats
    assert expected_puids == found_puids

    for date_group in expected_dates:
        assert date_group in found_dates
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,102 @@ | ||
# -*- coding: utf-8 -*- | ||
|
||
"""Report Bayesian formats provides the user with information on the | ||
distribution of a specific file format (PUID) over time. Through the | ||
lens of this distribution we might understand more about when a format | ||
first came into existence - when a format was abundant - and signs that | ||
the format is on its way to becoming obsolete. | ||
Based on Nick Krabbenhoeft's "Bayesian Modeling of File Format | ||
Obsolescence" | ||
""" | ||
|
||
import base64 | ||
from datetime import datetime | ||
from io import BytesIO | ||
|
||
import seaborn as sns | ||
from flask import render_template, request | ||
from matplotlib import pyplot as plt | ||
|
||
from AIPscan.Data import fields, report_data | ||
from AIPscan.Reporter import reporter, request_params | ||
|
||
sns.set_theme() | ||
|
||
|
||
def retrieve_year(date):
    """Return the year of a ``YYYY-MM-DD`` date string.

    :param date: Date string in ``%Y-%m-%d`` format.
    :returns: The year formatted as a string via ``strftime("%Y")``.
    """
    parsed_date = datetime.strptime(date, "%Y-%m-%d")
    return parsed_date.strftime("%Y")
|
||
|
||
# Default cut-off for save_rug_plots: an entry needs more than this many
# creation dates before a plot is drawn for it.
SIGNIFICANT_RESULTS = 10


def _encode_rug_plot(years):
    """Draw a rug plot of ``years`` and return it as a Base64 PNG string.

    :param years: Non-empty list of integer years to plot.
    :returns: Base64 encoded PNG image as a ``str``.
    """
    fig, axes = plt.subplots()
    try:
        # Setup axes to be useful to the reader: pad the range by a
        # decade either side and place a tick every five years.
        min_year = min(years) - 10
        max_year = max(years) + 10
        axes.set_xlim(min_year, max_year)
        axes.set_xticks(range(min_year, max_year, 5))

        # Plot our chart.
        plot = sns.rugplot(
            data=years, height=1, y=None, x=years, legend=True, ax=axes
        )
        plot.set(yticklabels=[])

        # Save the chart image to memory. The format is explicit because
        # the template embeds the result as "data:image/png".
        img = BytesIO()
        fig.savefig(
            img,
            format="png",
            bbox_inches="tight",
            pad_inches=0.3,
            transparent=True,
        )
    finally:
        # pyplot keeps a reference to every figure it creates until it
        # is closed; clearing the figure alone does not release it.
        plt.close(fig)

    # Convert bytes to Base64 encoding for rendering later.
    return base64.b64encode(img.getvalue()).decode("utf8")


def save_rug_plots(format_report, significant_results=SIGNIFICANT_RESULTS):
    """Return Base64 encoded figures to caller

    One rug plot of creation years is produced for every entry of the
    report that has more than ``significant_results`` creation dates.

    :param format_report: Bayesian format modeling report.
    :param significant_results: Number of results worth returning in
        these reports, e.g. 1 does not tell us a lot of information.
        Entries with this many dates or fewer are skipped.
    :returns: Empty array or array of single-item dicts, each mapping a
        PUID to its Base64 encoded image to be rendered.
    """

    # Seaborn setup.
    sns.set(rc={"figure.figsize": (30, 8)})
    sns.set_theme(style="ticks", palette="icefire")

    # Process AIP format data.
    plot_output = []
    for aip in format_report.get(fields.FIELD_ALL_AIPS, []):
        # Treat a missing or empty date list as "nothing to plot" rather
        # than failing on len(None) or min() of an empty sequence.
        format_dates = aip.get(fields.FIELD_CREATED_DATES) or []
        if not format_dates or len(format_dates) <= significant_results:
            continue
        years = [int(retrieve_year(date)) for date in format_dates]
        puid = aip.get(fields.FIELD_PUID)
        plot_output.append({puid: _encode_rug_plot(years)})

    return plot_output
|
||
|
||
@reporter.route("/bayesian_rug_plot/", methods=["GET"])
def report_bayesian_modeling_rug():
    """Render the Bayesian format modeling rug plot report.

    Reads the storage service ID from the request arguments, fetches the
    Bayesian format modeling data for it and passes the rug plots built
    from that data to the ``bayesian_rug_plot.html`` template.
    """
    service_id = request.args.get(request_params["storage_service_id"])

    report = report_data.bayesian_format_modeling(service_id)
    rug_plots = save_rug_plots(report)

    template_context = {
        "storage_service_id": service_id,
        "storage_service_name": report.get(fields.FIELD_STORAGE_NAME),
        "figures": rug_plots,
    }
    return render_template("bayesian_rug_plot.html", **template_context)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,26 @@ | ||
{% extends "report_base.html" %}

{% block content %}

<div class="alert alert-secondary">
  <a class="noprint" onClick="window.print();return false"><button type="button" class="btn btn-info" style="float:right;">Print</button></a>
  <strong>Report:</strong>
  <span title="Represents a distribution of file-format instance creation dates. Thicker lines demonstrate a greater number of format instances recorded for a single date in time and less-dense portions of the distribution either side of rug-plot show fewer instances of file-formats being recorded. Those to the right given a long-enough gap between today and the last lines may begin to show us a file-format in the process of becoming obsolete.">
    Bayesian format modeling
  </span>
  <br>
  <strong>Storage Service:</strong> {{ storage_service_name }}
  <br>
</div>

{# Each figure is a single-item mapping of PUID to a Base64 encoded image. #}
{% for figure in figures %}
  {% for puid, image in figure.items() %}
    <b> {{ puid }}:</b>
    <img src="data:image/png;base64, {{ image }}" alt="Rug plot of creation dates for {{ puid }}" />
    <br>
  {% endfor %}
{% endfor %}

{# NOTE(review): a closing div with no matching opener in this block was removed here - confirm report_base.html does not depend on it. #}

{% endblock %}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.