Implement reading from pdf on s3 bucket

anna-lybid · Nov 26, 2023 · 6690336 · 6690336
1 parent b88fb36
commit 6690336
Show file tree

Hide file tree

Showing 6 changed files with 69 additions and 2 deletions.
diff --git a/Images/s3-bucket-with-cv.jpg b/Images/s3-bucket-with-cv.jpg
diff --git a/README.md b/README.md
@@ -1,2 +1,25 @@
 # ocr-textbook
 Optical Character Recognition using Python
+
+# Technologies
+
+- Python
+- Amazon S3 - AWS
+- Git
+- Tesseract OCR
+- Pillow
+- PyMuPDF
+
+# Installation
+
+Python 3.7+ is required.
+
+```
+git clone https://github.com/anna-lybid/planetarium-api-project.git
+cd planetarium-api-project
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+python manage.py migrate
+python manage.py runserver
+```
diff --git a/app/main.py b/app/main.py
diff --git a/main.py b/main.py
@@ -0,0 +1,20 @@
+import cv2 as cv
+from PIL import Image
+import pytesseract
+
+image_file = "Images/s3-bucket-with-cv.jpg"
+
+image = cv.imread(image_file)
+
+if image is not None:
+    cv.imshow("Image", image)
+    cv.waitKey(0)
+    cv.destroyAllWindows()
+else:
+    print("Image is not found or could not be opened.")
+
+img = Image.open(image_file)
+
+text = pytesseract.image_to_string(img)
+
+print(text)
diff --git a/read_cv.py b/read_cv.py
@@ -0,0 +1,26 @@
+import boto3
+import fitz
+import pytesseract
+from PIL import Image
+
+bucket_name = "anna-lybid-s3-demo"
+key = "My CV/CV. Anna Lybid. Python developer.pdf"
+
+s3 = boto3.client("s3")
+
+response = s3.get_object(Bucket=bucket_name, Key=key)
+
+pdf_data = response["Body"].read()
+
+pdf_document = fitz.open(stream=pdf_data, filetype='pdf')
+
+first_page = pdf_document.load_page(0)
+image = first_page.get_pixmap()
+
+image_path = "images/converted_from_pdf.png"
+image.save(image_path)
+
+img = Image.open(image_path)
+text = pytesseract.image_to_string(img)
+
+print(text)
diff --git a/requirements.txt b/requirements.txt