Skip to content

Commit

Permalink
Add prefix to S3 ingest
Browse files Browse the repository at this point in the history
  • Loading branch information
jayvarner committed Dec 11, 2024
1 parent 0f9fa91 commit c0f8328
Show file tree
Hide file tree
Showing 5 changed files with 88 additions and 4 deletions.
22 changes: 22 additions & 0 deletions readux_ingest_ecds/migrations/0006_auto_20241211_1429.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 3.2.23 on 2024-12-11 14:29

from django.db import migrations, models


class Migration(migrations.Migration):

dependencies = [
('readux_ingest_ecds', '0005_local_warnings'),
]

operations = [
migrations.AlterModelOptions(
name='s3ingest',
options={'verbose_name_plural': 'S3 Ingests'},
),
migrations.AddField(
model_name='s3ingest',
name='prefix',
field=models.CharField(blank=True, help_text='Optional: The name of a subdirectory in the bucket. This will limit where the ingest will look for files.', max_length=255, null=True),
),
]
8 changes: 7 additions & 1 deletion readux_ingest_ecds/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -422,6 +422,12 @@ class S3Ingest(models.Model):
null=True,
related_name="ecds_ingest_created_s3",
)
prefix = models.CharField(
null=True,
blank=True,
max_length=255,
help_text="""Optional: The name of a subdirectory in the bucket. This will limit where the ingest will look for files.""",
)

class Meta:
verbose_name_plural = "S3 Ingests"
Expand Down Expand Up @@ -463,7 +469,7 @@ def ingest(self):

open(trigger_file, "a", encoding="utf-8").close()

image_files, _ = s3_copy(self.s3_bucket, pid)
image_files, _ = s3_copy(self.s3_bucket, pid, prefix=self.prefix)

for image_file in image_files:
with open(trigger_file, "a", encoding="utf-8") as t_file:
Expand Down
5 changes: 4 additions & 1 deletion readux_ingest_ecds/services/file_services.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ def canvas_dimensions(image_name):
return (0, 0)


def s3_copy(source, pid):
def s3_copy(source, pid, prefix=None):
"""Copy S3 objects to ingest
Args:
Expand All @@ -172,6 +172,9 @@ def s3_copy(source, pid):
if pid in obj.key and not str(obj.key).endswith("/")
]

if prefix is not None:
keys_to_copy = [key for key in keys_to_copy if prefix in key]

images = []
ocr = []
for key in keys_to_copy:
Expand Down
1 change: 1 addition & 0 deletions test_app/tests/factories.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@ class Meta:
image_server = SubFactory(ImageServerFactory)
creator = SubFactory(UserFactory)
s3_bucket = "source"
prefix = None


class BulkFactory(DjangoModelFactory):
Expand Down
56 changes: 54 additions & 2 deletions test_app/tests/test_s3.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,11 +48,13 @@ def teardown_class():
if item.endswith(".csv") or item.endswith(".zip"):
os.remove(os.path.join("./", item))

def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
def create_source_images(
self, pid=None, count=1, include_pid_in_file=True, prefix="tmp"
):
if pid is None:
raise Exception("You must supply a pid kwarg")

sub_dir = "tmp" if include_pid_in_file else pid
sub_dir = prefix if include_pid_in_file else pid
self.fs_storage = FileSystemStorage(
root_path=tempfile.gettempdir(),
rel_path=sub_dir,
Expand All @@ -62,6 +64,7 @@ def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
os.path.join(self.fs_storage.root_path, self.fs_storage.rel_path, "images"),
exist_ok=True,
)

os.makedirs(
os.path.join(self.fs_storage.root_path, self.fs_storage.rel_path, "ocr"),
exist_ok=True,
Expand Down Expand Up @@ -169,3 +172,52 @@ def test_s3_ingest_pid_in_filename(self):
assert Manifest.objects.get(pid=pid).canvas_set.count() == 3
assert len(ingested_images) == 3
assert len(ingested_ocr) == 3

def test_s3_ingest_with_prefix(self):
pids, pid_file = self.create_pids(pid_count=3, image_count=2)

for pid in pids:
self.create_source_images(pid=pid, count=2, prefix="emory")

upload_file = SimpleUploadedFile(
name=os.path.basename(pid_file),
content=open(pid_file, "rb").read(),
)

ingest = S3IngestFactory(metadata_spreadsheet=upload_file, prefix="emory")
# ingest.ingest()
s3_ingest_task(ingest.id)

destination_bucket = self.s3.Bucket(settings.INGEST_BUCKET)

source_files = [str(obj.key) for obj in self.s3.Bucket("source").objects.all()]

for pid in pids:
ingested_images = [
os.path.basename(str(obj.key))
for obj in destination_bucket.objects.all()
if str(obj.key).startswith(f"{settings.INGEST_STAGING_PREFIX}/{pid}_")
]

ingested_ocr = [
os.path.basename(str(obj.key))
for obj in destination_bucket.objects.all()
if str(obj.key).startswith(f"{settings.INGEST_OCR_PREFIX}/{pid}/{pid}_")
]

canvases = [
canvas.pid.replace(".tiff", ".jpg")
for canvas in Manifest.objects.get(pid=pid).canvas_set.all()
]

for source_file in source_files:
if pid in source_file and source_file.endswith("jpg"):
if "emory" in source_file:
assert os.path.basename(source_file) in canvases
if "emory" not in source_file:
assert os.path.basename(source_file) not in canvases

assert Manifest.objects.filter(pid=pid).exists()
assert Manifest.objects.get(pid=pid).canvas_set.count() == 2
assert len(ingested_images) == 2
assert len(ingested_ocr) == 2

0 comments on commit c0f8328

Please sign in to comment.