Add prefix to S3 ingest

ecds · Dec 11, 2024 · c0f8328 · c0f8328
1 parent 0f9fa91
commit c0f8328
Show file tree

Hide file tree

Showing 5 changed files with 88 additions and 4 deletions.
diff --git a/readux_ingest_ecds/migrations/0006_auto_20241211_1429.py b/readux_ingest_ecds/migrations/0006_auto_20241211_1429.py
@@ -0,0 +1,22 @@
+# Generated by Django 3.2.23 on 2024-12-11 14:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('readux_ingest_ecds', '0005_local_warnings'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='s3ingest',
+            options={'verbose_name_plural': 'S3 Ingests'},
+        ),
+        migrations.AddField(
+            model_name='s3ingest',
+            name='prefix',
+            field=models.CharField(blank=True, help_text='Optional: The name of a subdirectory in the bucket. This will limit where the ingest will look for files.', max_length=255, null=True),
+        ),
+    ]
diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py
@@ -422,6 +422,12 @@ class S3Ingest(models.Model):
         null=True,
         related_name="ecds_ingest_created_s3",
     )
+    prefix = models.CharField(
+        null=True,
+        blank=True,
+        max_length=255,
+        help_text="""Optional: The name of a subdirectory in the bucket. This will limit where the ingest will look for files.""",
+    )
 
     class Meta:
         verbose_name_plural = "S3 Ingests"
@@ -463,7 +469,7 @@ def ingest(self):
 
                 open(trigger_file, "a", encoding="utf-8").close()
 
-                image_files, _ = s3_copy(self.s3_bucket, pid)
+                image_files, _ = s3_copy(self.s3_bucket, pid, prefix=self.prefix)
 
                 for image_file in image_files:
                     with open(trigger_file, "a", encoding="utf-8") as t_file:

diff --git a/readux_ingest_ecds/services/file_services.py b/readux_ingest_ecds/services/file_services.py
@@ -152,7 +152,7 @@ def canvas_dimensions(image_name):
     return (0, 0)
 
 
-def s3_copy(source, pid):
+def s3_copy(source, pid, prefix=None):
     """Copy S3 objects to ingest
 
     Args:
@@ -172,6 +172,9 @@ def s3_copy(source, pid):
         if pid in obj.key and not str(obj.key).endswith("/")
     ]
 
+    if prefix is not None:
+        keys_to_copy = [key for key in keys_to_copy if prefix in key]
+
     images = []
     ocr = []
     for key in keys_to_copy:

diff --git a/test_app/tests/factories.py b/test_app/tests/factories.py
@@ -65,6 +65,7 @@ class Meta:
     image_server = SubFactory(ImageServerFactory)
     creator = SubFactory(UserFactory)
     s3_bucket = "source"
+    prefix = None
 
 
 class BulkFactory(DjangoModelFactory):

diff --git a/test_app/tests/test_s3.py b/test_app/tests/test_s3.py
@@ -48,11 +48,13 @@ def teardown_class():
             if item.endswith(".csv") or item.endswith(".zip"):
                 os.remove(os.path.join("./", item))
 
-    def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
+    def create_source_images(
+        self, pid=None, count=1, include_pid_in_file=True, prefix="tmp"
+    ):
         if pid is None:
             raise Exception("You must supply a pid kwarg")
 
-        sub_dir = "tmp" if include_pid_in_file else pid
+        sub_dir = prefix if include_pid_in_file else pid
         self.fs_storage = FileSystemStorage(
             root_path=tempfile.gettempdir(),
             rel_path=sub_dir,
@@ -62,6 +64,7 @@ def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
             os.path.join(self.fs_storage.root_path, self.fs_storage.rel_path, "images"),
             exist_ok=True,
         )
+
         os.makedirs(
             os.path.join(self.fs_storage.root_path, self.fs_storage.rel_path, "ocr"),
             exist_ok=True,
@@ -169,3 +172,52 @@ def test_s3_ingest_pid_in_filename(self):
             assert Manifest.objects.get(pid=pid).canvas_set.count() == 3
             assert len(ingested_images) == 3
             assert len(ingested_ocr) == 3
+
+    def test_s3_ingest_with_prefix(self):
+        pids, pid_file = self.create_pids(pid_count=3, image_count=2)
+
+        for pid in pids:
+            self.create_source_images(pid=pid, count=2, prefix="emory")
+
+        upload_file = SimpleUploadedFile(
+            name=os.path.basename(pid_file),
+            content=open(pid_file, "rb").read(),
+        )
+
+        ingest = S3IngestFactory(metadata_spreadsheet=upload_file, prefix="emory")
+        # ingest.ingest()
+        s3_ingest_task(ingest.id)
+
+        destination_bucket = self.s3.Bucket(settings.INGEST_BUCKET)
+
+        source_files = [str(obj.key) for obj in self.s3.Bucket("source").objects.all()]
+
+        for pid in pids:
+            ingested_images = [
+                os.path.basename(str(obj.key))
+                for obj in destination_bucket.objects.all()
+                if str(obj.key).startswith(f"{settings.INGEST_STAGING_PREFIX}/{pid}_")
+            ]
+
+            ingested_ocr = [
+                os.path.basename(str(obj.key))
+                for obj in destination_bucket.objects.all()
+                if str(obj.key).startswith(f"{settings.INGEST_OCR_PREFIX}/{pid}/{pid}_")
+            ]
+
+            canvases = [
+                canvas.pid.replace(".tiff", ".jpg")
+                for canvas in Manifest.objects.get(pid=pid).canvas_set.all()
+            ]
+
+            for source_file in source_files:
+                if pid in source_file and source_file.endswith("jpg"):
+                    if "emory" in source_file:
+                        assert os.path.basename(source_file) in canvases
+                    if "emory" not in source_file:
+                        assert os.path.basename(source_file) not in canvases
+
+            assert Manifest.objects.filter(pid=pid).exists()
+            assert Manifest.objects.get(pid=pid).canvas_set.count() == 2
+            assert len(ingested_images) == 2
+            assert len(ingested_ocr) == 2