Skip to content

Commit

Permalink
Use real format types in data generator (#297)
Browse files Browse the repository at this point in the history
  • Loading branch information
mcantelon committed Mar 19, 2024
1 parent f3313fd commit 0d846a2
Show file tree
Hide file tree
Showing 2 changed files with 43 additions and 8 deletions.
24 changes: 23 additions & 1 deletion tools/generate-test-data
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,13 @@ from config import CONFIGS
help="Maximum number of files to create per AIP (default 30).",
type=int,
)
@click.option(
"--number-of-format-types",
"-e",
default=50,
help="Maximum number of possible format types to assign to files (default 50).",
type=int,
)
@click.option("--seed", default=0)
def main(
storage_services_to_create,
Expand All @@ -61,6 +68,7 @@ def main(
locations_max_aip_count,
aip_min_file_count,
aip_max_file_count,
number_of_format_types,
seed,
):
# Initialize Flask app context
Expand Down Expand Up @@ -89,6 +97,16 @@ def main(
fetch_job = data.create_fake_fetch_job(ss.id)
fetch_jobs[ss.id] = fetch_job.id

# Determine format types to use
format_types = data.format_types_from_csv(
"tools/data/generate-test-data/format_types.csv"
)

if number_of_format_types >= len(format_types):
number_of_format_types = len(format_types)

format_types = format_types[0:number_of_format_types]

# Populate storage service locations
ss_locations_to_create = (
storage_services_to_create * locations_per_storage_service
Expand Down Expand Up @@ -127,7 +145,11 @@ def main(
pipeline.id, ss_id, sl.id, fetch_jobs[ss.id]
)
data.create_fake_aip_files(
aip_min_file_count, aip_max_file_count, agents, aip
aip_min_file_count,
aip_max_file_count,
agents,
aip,
format_types,
)
aipcount += 1

Expand Down
27 changes: 20 additions & 7 deletions tools/helpers/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
from datetime import datetime, timedelta

from faker import Faker
import pandas as pd

from AIPscan import db
from AIPscan.models import (
Expand All @@ -29,6 +30,11 @@ def randint(start, end):
return fake.random.randint(start, end)


def format_types_from_csv(filepath):
    """Load format type records from a CSV file.

    Every column is read as text so that identifiers keep the exact
    spelling used in the CSV: a version of "1.10" is not coerced to the
    float 1.1, and "2" does not turn into 2.0 because another row in the
    same column happens to be blank.

    :param filepath: path (or file-like object) of the CSV to read; the
        first row supplies the column names.
    :returns: list of dicts, one per data row, keyed by column name.
    """
    # dtype=str switches off pandas' numeric type inference. Blank cells are
    # still reported with pandas' missing-value marker, exactly as before, so
    # callers that already tolerate missing values keep working.
    df = pd.read_csv(filepath, dtype=str)
    return df.to_dict(orient="records")


def create_or_fetch_fake_pipeline():
pipeline = db.session.query(Pipeline).first()

Expand Down Expand Up @@ -112,21 +118,28 @@ def create_fake_aip(pipeline_id, storage_service_id, storage_location_id, fetch_
return aip


def create_fake_aip_files(min_files, max_files, agents, aip):
def create_fake_aip_files(min_files, max_files, agents, aip, format_types):
for _ in range(1, randint(min_files, max_files)):
fake_puid = "fmt/" + str(randint(1, 21))
# Pick a format type and use it to create a fake filepath
format_type = random.choice(format_types)
filepath = fake.file_path()
filepath_obj = pathlib.Path(fake.file_path())

extension=str(format_type["extensions"]).split(",")[0]
filepath = str(filepath_obj.with_suffix("." + extension))
filename = filepath_obj.name

aipfile = File(
aip_id=aip.id,
name=fake.text(20)[:-1],
filepath=fake.file_path(),
name=filename,
filepath=filepath,
uuid=fake.uuid4(),
file_type="original",
size=randint(1000, 1_000_000),
date_created=aip.create_date,
puid=fake_puid,
file_format=fake.file_extension(),
format_version=fake.text(20)[:-1],
puid=format_type["puid"],
file_format=format_type["name"],
format_version=format_type["version"],
checksum_type=fake.text(20)[:-1],
checksum_value=fake.text(20)[:-1],
premis_object="",
Expand Down

0 comments on commit 0d846a2

Please sign in to comment.