diff --git a/Images/s3-bucket-with-cv.jpg b/Images/s3-bucket-with-cv.jpg
new file mode 100644
index 0000000..835b4f0
Binary files /dev/null and b/Images/s3-bucket-with-cv.jpg differ
diff --git a/README.md b/README.md
index 34635af..ffa80cc 100644
--- a/README.md
+++ b/README.md
@@ -1,2 +1,25 @@
 # ocr-textbook
 Optical Character Recognition using Python
+
+# Technologies
+
+- Python
+- Amazon S3 - AWS
+- Git
+- Tesseract OCR
+- Pillow
+- PyMuPDF
+
+# Installation
+
+Python 3.7+ is required.
+
+```
+git clone https://github.com/anna-lybid/planetarium-api-project.git
+cd planetarium-api-project
+python -m venv venv
+source venv/bin/activate
+pip install -r requirements.txt
+python manage.py migrate
+python manage.py runserver
+```
diff --git a/app/main.py b/app/main.py
deleted file mode 100644
index 4b2e5dc..0000000
--- a/app/main.py
+++ /dev/null
@@ -1,2 +0,0 @@
-import cv2
-from PIL import Image
\ No newline at end of file
diff --git a/main.py b/main.py
new file mode 100644
index 0000000..10ec061
--- /dev/null
+++ b/main.py
@@ -0,0 +1,22 @@
+"""Show the sample image, then print the text Tesseract reads from it."""
+import cv2 as cv
+from PIL import Image
+import pytesseract
+
+image_file = "Images/s3-bucket-with-cv.jpg"
+
+image = cv.imread(image_file)
+
+# cv.imread returns None rather than raising when the file cannot be read.
+if image is not None:
+    cv.imshow("Image", image)
+    cv.waitKey(0)  # block until a key is pressed in the preview window
+    cv.destroyAllWindows()
+else:
+    print("Image is not found or could not be opened.")
+
+img = Image.open(image_file)
+
+text = pytesseract.image_to_string(img)
+
+print(text)
diff --git a/read_cv.py b/read_cv.py
new file mode 100644
index 0000000..158720c
--- /dev/null
+++ b/read_cv.py
@@ -0,0 +1,29 @@
+"""Fetch a PDF from S3, render its first page to PNG and print its OCR text."""
+import boto3
+import fitz
+import pytesseract
+from PIL import Image
+
+bucket_name = "anna-lybid-s3-demo"
+key = "My CV/CV. Anna Lybid. Python developer.pdf"
+
+s3 = boto3.client("s3")
+
+response = s3.get_object(Bucket=bucket_name, Key=key)
+
+pdf_data = response["Body"].read()
+
+pdf_document = fitz.open(stream=pdf_data, filetype='pdf')
+
+first_page = pdf_document.load_page(0)
+image = first_page.get_pixmap()
+
+# Save next to the sample image: the repository directory is "Images", and
+# a lower-case "images/" path does not exist on case-sensitive filesystems.
+image_path = "Images/converted_from_pdf.png"
+image.save(image_path)
+
+img = Image.open(image_path)
+text = pytesseract.image_to_string(img)
+
+print(text)
diff --git a/requirements.txt b/requirements.txt
index 18ccb1f..96a4418 100644
Binary files a/requirements.txt and b/requirements.txt differ