diff --git a/readux_ingest_ecds/migrations/0006_auto_20241211_1429.py b/readux_ingest_ecds/migrations/0006_auto_20241211_1429.py
new file mode 100644
index 0000000..5516a83
--- /dev/null
+++ b/readux_ingest_ecds/migrations/0006_auto_20241211_1429.py
@@ -0,0 +1,22 @@
+# Generated by Django 3.2.23 on 2024-12-11 14:29
+
+from django.db import migrations, models
+
+
+class Migration(migrations.Migration):
+
+    dependencies = [
+        ('readux_ingest_ecds', '0005_local_warnings'),
+    ]
+
+    operations = [
+        migrations.AlterModelOptions(
+            name='s3ingest',
+            options={'verbose_name_plural': 'S3 Ingests'},
+        ),
+        migrations.AddField(
+            model_name='s3ingest',
+            name='prefix',
+            field=models.CharField(blank=True, help_text='Optional: The name of a subdirectory in the bucket. This will limit where the ingest will look for files.', max_length=255, null=True),
+        ),
+    ]
diff --git a/readux_ingest_ecds/models.py b/readux_ingest_ecds/models.py
index 57ca494..79bfd0a 100644
--- a/readux_ingest_ecds/models.py
+++ b/readux_ingest_ecds/models.py
@@ -422,6 +422,12 @@ class S3Ingest(models.Model):
         null=True,
         related_name="ecds_ingest_created_s3",
     )
+    prefix = models.CharField(
+        null=True,
+        blank=True,
+        max_length=255,
+        help_text="""Optional: The name of a subdirectory in the bucket. This will limit where the ingest will look for files.""",
+    )
 
     class Meta:
         verbose_name_plural = "S3 Ingests"
@@ -463,7 +469,7 @@ def ingest(self):
 
         open(trigger_file, "a", encoding="utf-8").close()
 
-        image_files, _ = s3_copy(self.s3_bucket, pid)
+        image_files, _ = s3_copy(self.s3_bucket, pid, prefix=self.prefix)
 
         for image_file in image_files:
             with open(trigger_file, "a", encoding="utf-8") as t_file:
diff --git a/readux_ingest_ecds/services/file_services.py b/readux_ingest_ecds/services/file_services.py
index 43bc353..41fe622 100644
--- a/readux_ingest_ecds/services/file_services.py
+++ b/readux_ingest_ecds/services/file_services.py
@@ -152,7 +152,7 @@ def canvas_dimensions(image_name):
     return (0, 0)
 
 
-def s3_copy(source, pid):
+def s3_copy(source, pid, prefix=None):
     """Copy S3 objects to ingest
 
     Args:
@@ -172,6 +172,9 @@
         if pid in obj.key and not str(obj.key).endswith("/")
     ]
 
+    if prefix is not None:
+        keys_to_copy = [key for key in keys_to_copy if prefix in key]
+
     images = []
     ocr = []
     for key in keys_to_copy:
diff --git a/test_app/tests/factories.py b/test_app/tests/factories.py
index 9cf3e64..cd7bd9f 100644
--- a/test_app/tests/factories.py
+++ b/test_app/tests/factories.py
@@ -65,6 +65,7 @@ class Meta:
     image_server = SubFactory(ImageServerFactory)
     creator = SubFactory(UserFactory)
     s3_bucket = "source"
+    prefix = None
 
 
 class BulkFactory(DjangoModelFactory):
diff --git a/test_app/tests/test_s3.py b/test_app/tests/test_s3.py
index b73a0d3..c97c78b 100644
--- a/test_app/tests/test_s3.py
+++ b/test_app/tests/test_s3.py
@@ -48,11 +48,13 @@ def teardown_class():
             if item.endswith(".csv") or item.endswith(".zip"):
                 os.remove(os.path.join("./", item))
 
-    def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
+    def create_source_images(
+        self, pid=None, count=1, include_pid_in_file=True, prefix="tmp"
+    ):
         if pid is None:
             raise Exception("You must supply a pid kwarg")
 
-        sub_dir = "tmp" if include_pid_in_file else pid
+        sub_dir = prefix if include_pid_in_file else pid
         self.fs_storage = FileSystemStorage(
             root_path=tempfile.gettempdir(),
             rel_path=sub_dir,
@@ -62,6 +64,7 @@ def create_source_images(self, pid=None, count=1, include_pid_in_file=True):
             os.path.join(self.fs_storage.root_path, self.fs_storage.rel_path, "images"),
             exist_ok=True,
         )
+
         os.makedirs(
             os.path.join(self.fs_storage.root_path, self.fs_storage.rel_path, "ocr"),
             exist_ok=True,
@@ -169,3 +172,52 @@ def test_s3_ingest_pid_in_filename(self):
         assert Manifest.objects.get(pid=pid).canvas_set.count() == 3
         assert len(ingested_images) == 3
         assert len(ingested_ocr) == 3
+
+    def test_s3_ingest_with_prefix(self):
+        pids, pid_file = self.create_pids(pid_count=3, image_count=2)
+
+        for pid in pids:
+            self.create_source_images(pid=pid, count=2, prefix="emory")
+
+        upload_file = SimpleUploadedFile(
+            name=os.path.basename(pid_file),
+            content=open(pid_file, "rb").read(),
+        )
+
+        ingest = S3IngestFactory(metadata_spreadsheet=upload_file, prefix="emory")
+        # ingest.ingest()
+        s3_ingest_task(ingest.id)
+
+        destination_bucket = self.s3.Bucket(settings.INGEST_BUCKET)
+
+        source_files = [str(obj.key) for obj in self.s3.Bucket("source").objects.all()]
+
+        for pid in pids:
+            ingested_images = [
+                os.path.basename(str(obj.key))
+                for obj in destination_bucket.objects.all()
+                if str(obj.key).startswith(f"{settings.INGEST_STAGING_PREFIX}/{pid}_")
+            ]
+
+            ingested_ocr = [
+                os.path.basename(str(obj.key))
+                for obj in destination_bucket.objects.all()
+                if str(obj.key).startswith(f"{settings.INGEST_OCR_PREFIX}/{pid}/{pid}_")
+            ]
+
+            canvases = [
+                canvas.pid.replace(".tiff", ".jpg")
+                for canvas in Manifest.objects.get(pid=pid).canvas_set.all()
+            ]
+
+            for source_file in source_files:
+                if pid in source_file and source_file.endswith("jpg"):
+                    if "emory" in source_file:
+                        assert os.path.basename(source_file) in canvases
+                    if "emory" not in source_file:
+                        assert os.path.basename(source_file) not in canvases
+
+            assert Manifest.objects.filter(pid=pid).exists()
+            assert Manifest.objects.get(pid=pid).canvas_set.count() == 2
+            assert len(ingested_images) == 2
+            assert len(ingested_ocr) == 2
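
Reviewer note: below is a minimal, self-contained sketch of what the new prefix filter in s3_copy does to the set of keys selected for staging. The bucket keys and pid are hypothetical and exist only for illustration; the real function iterates boto3 bucket objects as shown in the diff above. Note that the added check is substring containment (prefix in key), not key.startswith(prefix), so the value matches anywhere in the key path.

# Illustrative only: mirrors the filtering logic added to s3_copy in
# readux_ingest_ecds/services/file_services.py. Keys and pid are made up.
pid = "abc123"
prefix = "emory"

all_keys = [
    "emory/images/abc123_0001.tiff",  # matches pid and prefix -> kept
    "other/images/abc123_0001.tiff",  # matches pid, lacks prefix -> dropped
    "emory/images/xyz999_0001.tiff",  # wrong pid -> never selected
    "emory/images/abc123/",           # "directory" placeholder -> skipped
]

# Existing selection: key must contain the pid and must not be a "directory" key.
keys_to_copy = [key for key in all_keys if pid in key and not key.endswith("/")]

# New behavior: when a prefix is set on the ingest, keep only keys containing it.
if prefix is not None:
    keys_to_copy = [key for key in keys_to_copy if prefix in key]

print(keys_to_copy)  # ['emory/images/abc123_0001.tiff']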