spcl · octonawish-akcodes · Aug 25, 2024 · Aug 26, 2024 · Aug 28, 2024 · coderabbitai
diff --git a/benchmarks-data b/benchmarks-data
diff --git a/benchmarks/400.inference/421.image-captioning/config.json b/benchmarks/400.inference/421.image-captioning/config.json
@@ -0,0 +1,6 @@
+{
+    "timeout": 60,
+    "memory": 256,
+    "languages": ["python"]
+}
+
diff --git a/benchmarks/400.inference/421.image-captioning/input.py b/benchmarks/400.inference/421.image-captioning/input.py
@@ -0,0 +1,42 @@
+import glob
+import os
+
+def buckets_count():
+    '''
+    Report how many storage buckets this benchmark needs.
+
+    :return: Pair of counts; presumably (input buckets, output buckets) -
+        confirm against the caller of this hook.
+    '''
+    first_count, second_count = 1, 1
+    return (first_count, second_count)
+
+def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
+    '''
+    Generate test, small, and large workload for image captioning benchmark.
+
+    Every *.jpg, *.jpeg and *.png file found directly in data_dir is uploaded
+    with bucket index 0. The last file in sorted order becomes the object
+    named in the returned configuration.
+
+    :param data_dir: Directory where benchmark data is placed
+    :param size: Workload size (currently unused: every size yields the same input)
+    :param benchmarks_bucket: Storage container for the benchmark
+    :param input_paths: List of input paths
+    :param output_paths: List of output paths
+    :param upload_func: Upload function taking three params (bucket_idx, key, filepath)
+    :return: Input configuration dict with 'object' and 'bucket' entries
+    :raises ValueError: If data_dir contains no matching image files
+    '''
+    input_files = []
+    for ext in ('*.jpg', '*.jpeg', '*.png'):
+        input_files.extend(glob.glob(os.path.join(data_dir, ext)))
+
+    if not input_files:
+        raise ValueError("No input files found in the provided directory.")
+
+    # glob yields files in arbitrary, filesystem-dependent order; sort so the
+    # object selected below is the same on every machine and every run.
+    input_files.sort()
+
+    uploaded_keys = []
+    for path in input_files:
+        key = os.path.relpath(path, data_dir)
+        upload_func(0, key, path)
+        uploaded_keys.append(key)
+
+    input_config = {
+        'object': {
+            # Explicit selection instead of relying on a leaked loop variable.
+            'key': uploaded_keys[-1],
+            'width': 200,
+            'height': 200
+        },
+        'bucket': {
+            'bucket': benchmarks_bucket,
+            'input': input_paths[0],
+            'output': output_paths[0]
+        }
+    }
+
+    return input_config
-    input_files = []
-    for ext in ['*.jpg', '*.jpeg', '*.png']:
-        input_files.extend(glob.glob(os.path.join(data_dir, ext)))
-    
-    if not input_files:
-        raise ValueError("No input files found in the provided directory.")
-
-    for file in input_files:
-        img = os.path.relpath(file, data_dir)
-        upload_func(0, img, file)
-
-    input_config = {
-        'object': {
-            'key': img,
-            'width': 200,
-            'height': 200
-        },
-        'bucket': {
-            'bucket': benchmarks_bucket,
-            'input': input_paths[0],
-            'output': output_paths[0]
-        }
-    }
-    
-    return input_config
+def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
+    input_files = []
+    for ext in ['*.jpg', '*.jpeg', '*.png']:
+        input_files.extend(glob.glob(os.path.join(data_dir, ext)))
+    
+    if not input_files:
+        raise ValueError("No input files found in the provided directory.")
+    img = None  # Define img outside the loop to ensure it's available later
+    for file in input_files:
+        img = os.path.relpath(file, data_dir)
+        upload_func(0, img, file)
+    
+    if img is None:
+        raise ValueError("No valid image files processed.")
-    input_files = []
-    for ext in ['*.jpg', '*.jpeg', '*.png']:
-        input_files.extend(glob.glob(os.path.join(data_dir, ext)))
-    
-    if not input_files:
-        raise ValueError("No input files found in the provided directory.")
-
-    for file in input_files:
-        img = os.path.relpath(file, data_dir)
-        upload_func(0, img, file)
-
-    input_config = {
-        'object': {
-            'key': img,
-            'width': 200,
-            'height': 200
-        },
-        'bucket': {
-            'bucket': benchmarks_bucket,
-            'input': input_paths[0],
-            'output': output_paths[0]
-        }
-    }
-    
-    return input_config
+def generate_input(data_dir, size, benchmarks_bucket, input_paths, output_paths, upload_func):
+    input_files = []
+    for ext in ['*.jpg', '*.jpeg', '*.png']:
+        input_files.extend(glob.glob(os.path.join(data_dir, ext)))
+    
+    if not input_files:
+        raise ValueError("No input files found in the provided directory.")
+    img = None  # Define img outside the loop to ensure it's available later
+    for file in input_files:
+        img = os.path.relpath(file, data_dir)
+        upload_func(0, img, file)
+    
+    if img is None:
+        raise ValueError("No valid image files processed.")
diff --git a/benchmarks/400.inference/421.image-captioning/python/function.py b/benchmarks/400.inference/421.image-captioning/python/function.py
@@ -0,0 +1,53 @@
+import datetime
+import io
+import os
+from urllib.parse import unquote_plus
+
+import torch
+from PIL import Image
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+
+from . import storage
+
+# Pretrained checkpoint name shared by the model, image processor and tokenizer
+MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
+
+# Load the pre-trained ViT-GPT2 model with its image processor and tokenizer.
+# This runs once at module import, not per handler call.
+model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
+image_processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+# The model is only used for caption generation here, so put it in evaluation mode
+model.eval()
+
+# Storage client used by handler() to fetch the input image
+client = storage.storage.get_instance()
+
+def generate_caption(image_bytes):
+    '''
+    Caption an encoded image with the module-level ViT-GPT2 model.
+
+    :param image_bytes: Bytes-like encoded image; wrapped in io.BytesIO and opened with PIL
+    :return: Decoded text of the first generated sequence, special tokens removed
+    '''
+    # Decode the raw bytes and normalise to RGB before preprocessing
+    rgb_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    model_inputs = image_processor(images=rgb_image, return_tensors="pt")
+    pixel_values = model_inputs.pixel_values
+
+    # Generation only: no gradients are needed
+    with torch.no_grad():
+        output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
+        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    return caption
+
+def handler(event):
+    '''
+    Download the requested image, caption it, and report timings.
+
+    :param event: Dict with a 'bucket' entry (keys 'bucket' and 'input') and
+        an 'object' entry (key 'key')
+    :return: Dict holding the caption under 'result' and, under 'measurement',
+        the download and compute times in microseconds plus len() of the download
+    '''
+    bucket_info = event.get('bucket')
+    bucket = bucket_info.get('bucket')
+    input_prefix = bucket_info.get('input')
+    # The object key is passed through unquote_plus before use
+    key = unquote_plus(event.get('object').get('key'))
+
+    # Dividing a timedelta by this unit yields a duration in microseconds
+    one_microsecond = datetime.timedelta(microseconds=1)
+
+    download_begin = datetime.datetime.now()
+    img = client.download_stream(bucket, os.path.join(input_prefix, key))
+    download_end = datetime.datetime.now()
+
+    process_begin = datetime.datetime.now()
+    caption = generate_caption(img)
+    process_end = datetime.datetime.now()
+
+    measurement = {
+        'download_time': (download_end - download_begin) / one_microsecond,
+        'download_size': len(img),
+        'compute_time': (process_end - process_begin) / one_microsecond
+    }
+
+    return {
+        'result': {
+            'caption': caption,
+        },
+        'measurement': measurement
+    }
-import datetime
-import io
-from urllib.parse import unquote_plus
-from PIL import Image
-import torch
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
-from . import storage
-
-# Load the pre-trained ViT-GPT2 model
-model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-
-model.eval()
-
-client = storage.storage.get_instance()
-
-def generate_caption(image_bytes):
-    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
-
-    with torch.no_grad():
-        generated_ids = model.generate(pixel_values, max_length=16, num_beams=4)
-        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-
-    return generated_text
-
-def handler(event):
-    bucket = event.get('bucket').get('bucket')
-    input_prefix = event.get('bucket').get('input')
-    key = unquote_plus(event.get('object').get('key'))
-    
-    download_begin = datetime.datetime.now()
-    img = client.download_stream(bucket, os.path.join(input_prefix, key))
-    download_end = datetime.datetime.now()
-
-    process_begin = datetime.datetime.now()
-    caption = generate_caption(img)
-    process_end = datetime.datetime.now()
-
-    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
-    process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1)
-
-    return {
-        'result': {
-            'caption': caption,
-        },
-        'measurement': {
-            'download_time': download_time,
-            'download_size': len(img),
-            'compute_time': process_time
-        }
-    }
+import datetime
+import io
+from urllib.parse import unquote_plus
+from PIL import Image
+import torch
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from . import storage
+import os
+
+# Pretrained checkpoint name shared by the model, image processor and tokenizer
+MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
+
+# Load the pre-trained ViT-GPT2 model with its image processor and tokenizer.
+# This runs once at module import, not per handler call.
+model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
+image_processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+# The model is only used for caption generation here, so put it in evaluation mode
+model.eval()
+
+# Storage client used by handler() to fetch the input image
+client = storage.storage.get_instance()
+
+def generate_caption(image_bytes):
+    '''
+    Caption an encoded image with the module-level ViT-GPT2 model.
+
+    :param image_bytes: Bytes-like encoded image; wrapped in io.BytesIO and opened with PIL
+    :return: Decoded text of the first generated sequence, special tokens removed
+    '''
+    # Decode the raw bytes and normalise to RGB before preprocessing
+    rgb_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    model_inputs = image_processor(images=rgb_image, return_tensors="pt")
+    pixel_values = model_inputs.pixel_values
+
+    # Generation only: no gradients are needed
+    with torch.no_grad():
+        output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
+        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    return caption
+
+def handler(event):
+    '''
+    Download the requested image, caption it, and report timings.
+
+    :param event: Dict with a 'bucket' entry (keys 'bucket' and 'input') and
+        an 'object' entry (key 'key')
+    :return: Dict holding the caption under 'result' and, under 'measurement',
+        the download and compute times in microseconds plus len() of the download
+    '''
+    bucket_info = event.get('bucket')
+    bucket = bucket_info.get('bucket')
+    input_prefix = bucket_info.get('input')
+    # The object key is passed through unquote_plus before use
+    key = unquote_plus(event.get('object').get('key'))
+
+    # Dividing a timedelta by this unit yields a duration in microseconds
+    one_microsecond = datetime.timedelta(microseconds=1)
+
+    download_begin = datetime.datetime.now()
+    img = client.download_stream(bucket, os.path.join(input_prefix, key))
+    download_end = datetime.datetime.now()
+
+    process_begin = datetime.datetime.now()
+    caption = generate_caption(img)
+    process_end = datetime.datetime.now()
+
+    measurement = {
+        'download_time': (download_end - download_begin) / one_microsecond,
+        'download_size': len(img),
+        'compute_time': (process_end - process_begin) / one_microsecond
+    }
+
+    return {
+        'result': {
+            'caption': caption,
+        },
+        'measurement': measurement
+    }
-import datetime
-import io
-from urllib.parse import unquote_plus
-from PIL import Image
-import torch
-from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
-from . import storage
-
-# Load the pre-trained ViT-GPT2 model
-model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-image_processor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
-
-model.eval()
-
-client = storage.storage.get_instance()
-
-def generate_caption(image_bytes):
-    image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
-    pixel_values = image_processor(images=image, return_tensors="pt").pixel_values
-
-    with torch.no_grad():
-        generated_ids = model.generate(pixel_values, max_length=16, num_beams=4)
-        generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
-
-    return generated_text
-
-def handler(event):
-    bucket = event.get('bucket').get('bucket')
-    input_prefix = event.get('bucket').get('input')
-    key = unquote_plus(event.get('object').get('key'))
-    
-    download_begin = datetime.datetime.now()
-    img = client.download_stream(bucket, os.path.join(input_prefix, key))
-    download_end = datetime.datetime.now()
-
-    process_begin = datetime.datetime.now()
-    caption = generate_caption(img)
-    process_end = datetime.datetime.now()
-
-    download_time = (download_end - download_begin) / datetime.timedelta(microseconds=1)
-    process_time = (process_end - process_begin) / datetime.timedelta(microseconds=1)
-
-    return {
-        'result': {
-            'caption': caption,
-        },
-        'measurement': {
-            'download_time': download_time,
-            'download_size': len(img),
-            'compute_time': process_time
-        }
-    }
+import datetime
+import io
+from urllib.parse import unquote_plus
+from PIL import Image
+import torch
+from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
+from . import storage
+import os
+
+# Pretrained checkpoint name shared by the model, image processor and tokenizer
+MODEL_NAME = "nlpconnect/vit-gpt2-image-captioning"
+
+# Load the pre-trained ViT-GPT2 model with its image processor and tokenizer.
+# This runs once at module import, not per handler call.
+model = VisionEncoderDecoderModel.from_pretrained(MODEL_NAME)
+image_processor = ViTImageProcessor.from_pretrained(MODEL_NAME)
+tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
+
+# The model is only used for caption generation here, so put it in evaluation mode
+model.eval()
+
+# Storage client used by handler() to fetch the input image
+client = storage.storage.get_instance()
+
+def generate_caption(image_bytes):
+    '''
+    Caption an encoded image with the module-level ViT-GPT2 model.
+
+    :param image_bytes: Bytes-like encoded image; wrapped in io.BytesIO and opened with PIL
+    :return: Decoded text of the first generated sequence, special tokens removed
+    '''
+    # Decode the raw bytes and normalise to RGB before preprocessing
+    rgb_image = Image.open(io.BytesIO(image_bytes)).convert("RGB")
+    model_inputs = image_processor(images=rgb_image, return_tensors="pt")
+    pixel_values = model_inputs.pixel_values
+
+    # Generation only: no gradients are needed
+    with torch.no_grad():
+        output_ids = model.generate(pixel_values, max_length=16, num_beams=4)
+        caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
+
+    return caption
+
+def handler(event):
+    '''
+    Download the requested image, caption it, and report timings.
+
+    :param event: Dict with a 'bucket' entry (keys 'bucket' and 'input') and
+        an 'object' entry (key 'key')
+    :return: Dict holding the caption under 'result' and, under 'measurement',
+        the download and compute times in microseconds plus len() of the download
+    '''
+    bucket_info = event.get('bucket')
+    bucket = bucket_info.get('bucket')
+    input_prefix = bucket_info.get('input')
+    # The object key is passed through unquote_plus before use
+    key = unquote_plus(event.get('object').get('key'))
+
+    # Dividing a timedelta by this unit yields a duration in microseconds
+    one_microsecond = datetime.timedelta(microseconds=1)
+
+    download_begin = datetime.datetime.now()
+    img = client.download_stream(bucket, os.path.join(input_prefix, key))
+    download_end = datetime.datetime.now()
+
+    process_begin = datetime.datetime.now()
+    caption = generate_caption(img)
+    process_end = datetime.datetime.now()
+
+    measurement = {
+        'download_time': (download_end - download_begin) / one_microsecond,
+        'download_size': len(img),
+        'compute_time': (process_end - process_begin) / one_microsecond
+    }
+
+    return {
+        'result': {
+            'caption': caption,
+        },
+        'measurement': measurement
+    }
diff --git a/benchmarks/400.inference/421.image-captioning/python/requirements.txt b/benchmarks/400.inference/421.image-captioning/python/requirements.txt
@@ -0,0 +1,3 @@
+transformers==4.44.2
+torch==2.4.0
+pillow==10.4.0