-
Notifications
You must be signed in to change notification settings - Fork 2
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Implement OCR
- Loading branch information
Showing
16 changed files
with
284 additions
and
7 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"recommendations": [ | ||
"rebornix.ruby", | ||
"oderwat.indent-rainbow", | ||
"ms-vsliveshare.vsliveshare" | ||
] | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
{ | ||
"editor.rulers": [ | ||
100, | ||
120 | ||
], | ||
"files.exclude": { | ||
"**/.git": true, | ||
"**/.svn": true, | ||
"**/.hg": true, | ||
"**/.DS_Store": true, | ||
"**/tmp": true | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,104 @@ | ||
# frozen_string_literal: true | ||
|
||
require "http" | ||
require "pdfservices/jwt_provider" | ||
require "pdfservices/ocr/result" | ||
require "yaml" | ||
|
||
module PdfServices | ||
module Ocr | ||
# Runs the Adobe PDF Services OCR flow for a local PDF: request a presigned
# upload URL, upload the file, start the OCR job, poll it until it finishes,
# download the OCR'd document and delete the remote assets.
#
# Failures reported by the service are returned as a Result carrying an
# error message (and a nil document body) rather than being raised.
class Operation
  PRESIGNED_URL_ENDPOINT = "https://pdf-services.adobe.io/assets"
  OCR_ENDPOINT = "https://pdf-services.adobe.io/operation/ocr"
  ASSETS_ENDPOINT = "https://pdf-services.adobe.io/assets"

  # @param credentials [#client_id] passed to JwtProvider.get_jwt and used
  #   for the x-api-key header
  # @param poll_interval [Numeric] seconds to wait before each status poll
  def initialize(credentials = nil, poll_interval: 1)
    @credentials = credentials
    @poll_interval = poll_interval
  end

  # Asks the service for an upload slot for a PDF asset.
  #
  # @return [Hash, Result] the parsed JSON response on HTTP 200, otherwise a
  #   failed Result describing the unexpected status
  def get_presigned_url
    response = api.post(PRESIGNED_URL_ENDPOINT, json: {mediaType: "application/pdf"})
    if response.status == 200
      JSON.parse(response.body.to_s)
    else
      Result.new(nil, "Unexpected response status from get presigned url: #{response.status}")
    end
  end

  # Uploads the PDF at +source_pdf+ to a freshly requested presigned URL.
  #
  # @param source_pdf [String] path of the PDF to upload
  # @return [String, Result] the asset id on success, otherwise a failed Result
  def upload_asset(source_pdf)
    presigned_url = get_presigned_url
    # A failed Result has no "uploadUri"/"assetID"; hand the failure back
    # to the caller instead of indexing into it.
    return presigned_url if presigned_url.is_a?(Result)

    upload_uri = presigned_url["uploadUri"]
    asset_id = presigned_url["assetID"]
    aws = HTTP.headers({"Content-Type": "application/pdf"})
    # Block form closes the file handle; "rb" keeps the PDF bytes untouched.
    response = File.open(source_pdf, "rb") { |file| aws.put(upload_uri, body: file) }
    if response.status == 200
      asset_id
    else
      Result.new(nil, "Unexpected response status from asset upload: #{response.status}")
    end
  end

  # Deletes a remote asset by id.
  def delete_the_asset(asset_id)
    api.delete("#{ASSETS_ENDPOINT}/#{asset_id}")
  end

  # Uploads +source_pdf+, runs OCR on it and waits for the outcome.
  #
  # @param source_pdf [String] path of the PDF to OCR
  # @return [Result] successful Result with the downloaded body, or a failed
  #   Result with an error message
  def execute(source_pdf)
    asset_id = upload_asset(source_pdf)
    # Do not start an OCR job with a failure object as the asset id.
    return asset_id if asset_id.is_a?(Result)

    response = api.post(OCR_ENDPOINT, json: {assetID: asset_id})
    if response.status == 201
      poll_document_result(response.headers["location"], asset_id)
    else
      Result.new(nil, "Unexpected response status from ocr endpoint: #{response.status}\nasset_id: #{asset_id}")
    end
  end

  private

  # Headers sent with every PDF Services API call.
  def api_headers
    {
      Authorization: "Bearer #{JwtProvider.get_jwt(@credentials)}",
      "x-api-key": @credentials.client_id,
      "Content-Type": "application/json"
    }
  end

  # Memoized HTTP client carrying the API headers (the token is fetched once).
  def api
    @api ||= HTTP.headers(api_headers)
  end

  # Deletes every non-nil asset id given.
  def delete_assets(*asset_ids)
    asset_ids.compact.each { |asset_id| delete_the_asset(asset_id) }
  end

  # Polls +url+ until the job leaves the "in progress" state.
  #
  # Implemented as a loop rather than recursion so long-running jobs cannot
  # exhaust the stack. The uploaded asset is deleted on every terminal path.
  #
  # @return [Result]
  def poll_document_result(url, original_asset_id)
    loop do
      sleep(@poll_interval)
      response = api.get(url)
      unless response.status == 200
        delete_assets(original_asset_id)
        # Report the HTTP status: no JSON body has been parsed on this path.
        return Result.new(nil, "Unexpected response status from polling: #{response.status}")
      end

      json_response = JSON.parse(response.body.to_s)
      case json_response["status"]
      when "in progress"
        next
      when "done"
        return fetch_finished_document(json_response, original_asset_id)
      when "failed"
        delete_assets(original_asset_id)
        return Result.new(nil, "OCR Failed")
      else
        delete_assets(original_asset_id)
        return Result.new(nil, "Unexpected status from polling: #{json_response["status"]}")
      end
    end
  end

  # Downloads the OCR'd document, then deletes the source and OCR'd assets.
  #
  # @return [Result] failed when the download does not answer with HTTP 200,
  #   so an error page is never mistaken for the document
  def fetch_finished_document(json_response, original_asset_id)
    ocr_asset_id = json_response.dig("asset", "assetID")
    download = HTTP.get(json_response["asset"]["downloadUri"])
    delete_assets(original_asset_id, ocr_asset_id)
    if download.status == 200
      Result.new(download.body, nil)
    else
      Result.new(nil, "Unexpected response status from asset download: #{download.status}")
    end
  end
end
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
require "json" | ||
require "multipart_parser/reader" | ||
|
||
module PdfServices | ||
module Ocr | ||
# Outcome of an OCR operation: either the OCR'd document body, or an error
# message when the operation failed.
class Result
  attr_accessor :document_body, :error

  # @param document_body [#to_s, nil] the OCR'd document, nil on failure
  # @param error [String, nil] failure description, nil on success
  def initialize(document_body, error)
    @document_body = document_body
    @error = error
  end

  # @return [Boolean] true when a document body is present
  def success?
    !@document_body.nil?
  end

  # Writes the document body to +file_path+.
  #
  # Relative paths are resolved against the current working directory and
  # absolute paths are used as given. The body is written in binary mode so
  # the PDF bytes are not altered by newline or encoding translation.
  #
  # @param file_path [String] destination path
  # @return [Integer] number of bytes written
  def save_as_file(file_path)
    location = File.absolute_path(file_path)
    File.binwrite(location, @document_body.to_s)
  end
end
end | ||
end |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,5 +1,5 @@ | ||
# frozen_string_literal: true | ||
|
||
module PdfServices
  # Gem version. Only the new value is kept; assigning the constant twice
  # (old and new value) would trigger an "already initialized constant" warning.
  VERSION = "0.1.2"
end
Binary file not shown.
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
{ | ||
"status": "in progress" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
{ | ||
"status": "done", | ||
"asset": { | ||
"assetID": "ocr'd:asset-id", | ||
"downloadUri": "https://ocr.file.url" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
{ | ||
"assetID": "urn:a-real-long-asset-asset-id", | ||
"uploadUri": "https://a.presigned.url" | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
# frozen_string_literal: true | ||
|
||
require "lib/pdfservices" | ||
|
||
# Build the credentials from environment variables.
credentials = ::PdfServices::CredentialsBuilder.new
  .with_client_id(ENV["PDF_SERVICES_CLIENT_ID"])
  .with_client_secret(ENV["PDF_SERVICES_CLIENT_SECRET"])
  .with_organization_id(ENV["PDF_SERVICES_ORGANIZATION_ID"])
  .with_account_id(ENV["PDF_SERVICES_ACCOUNT_ID"])
  .with_private_key(ENV["PDF_SERVICES_PRIVATE_KEY"])
  .build

operation = ::PdfServices::Ocr::Operation.new(credentials)

result = operation.execute("test/fixtures/files/not_yet_ocr.pdf")

# Only write the output when OCR succeeded; on failure report the error on
# stderr and exit non-zero instead of saving an empty file.
if result.success?
  result.save_as_file("tmp/ocr_result.pdf")
else
  abort(result.error.to_s)
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,88 @@ | ||
# frozen_string_literal: true | ||
|
||
require "test_helper" | ||
|
||
# Exercises PdfServices::Ocr::Operation end to end with every HTTP request
# stubbed: token exchange, presigned URL, upload, OCR job, polling, download
# and asset deletion.
class OcrTest < Minitest::Test
  def test_it_works
    stub_valid_response_sequence

    pdf_path = File.join(Dir.pwd, "test", "fixtures", "files", "not_yet_ocr.pdf")
    ocr = ::PdfServices::Ocr::Operation.new(valid_credentials)

    outcome = ocr.execute(pdf_path)

    assert outcome.success?
    assert_equal "this is fake ocr'd pdf\n", outcome.document_body.to_s
  end

  private

  # Registers one stub per request the operation performs, in call order.
  def stub_valid_response_sequence
    stub_jwt_exchange
    stub_presigned_url
    stub_source_upload
    stub_ocr_request
    stub_polling
    stub_download
    stub_asset_deletion("urn:a-real-long-asset-asset-id")
    stub_asset_deletion("ocr'd:asset-id")
  end

  # JWT used to authorize the API requests.
  def stub_jwt_exchange
    stub_request(:post, "https://ims-na1.adobelogin.com/ims/exchange/jwt/")
      .to_return(status: 200, body: json_fixture("valid_jwt_response"))
  end

  # Presigned URL for uploading the source PDF.
  def stub_presigned_url
    stub_request(:post, "https://pdf-services.adobe.io/assets")
      .with(headers: secured_headers)
      .to_return(
        status: 200,
        headers: json_headers,
        body: json_fixture("presigned_upload_url_response")
      )
  end

  # Upload of the source PDF to the presigned URL.
  def stub_source_upload
    stub_request(:put, "https://a.presigned.url").to_return(status: 200)
  end

  # OCR job creation; the polling URL comes back in the location header.
  def stub_ocr_request
    location = {"location" => "https://some.polling.url"}
    stub_request(:post, "https://pdf-services.adobe.io/operation/ocr")
      .with(headers: secured_headers)
      .to_return(status: 201, headers: location.merge(json_headers))
  end

  # Two "in progress" answers followed by "done".
  def stub_polling
    in_progress = {status: 200, headers: json_headers, body: json_fixture("ocr_in_progress")}
    done = {status: 200, headers: json_headers, body: json_fixture("ocr_done")}
    stub_request(:get, "https://some.polling.url")
      .with(headers: secured_headers)
      .to_return(in_progress, in_progress, done)
  end

  # Download of the OCR'd PDF.
  def stub_download
    stub_request(:get, "https://ocr.file.url")
      .to_return(status: 200, headers: pdf_headers, body: file_fixture("fake_ocr_done.pdf"))
  end

  # Deletion of a single asset (source or OCR'd).
  def stub_asset_deletion(asset_id)
    stub_request(:delete, "https://pdf-services.adobe.io/assets/#{asset_id}")
      .with(headers: secured_headers)
      .to_return(status: 200, body: "", headers: {})
  end

  def secured_headers
    {
      Authorization: "Bearer fake1.fake2.fake3",
      "Content-Type": "application/json",
      "X-Api-Key": "123someclientid"
    }
  end

  def json_headers
    {"Content-Type" => "application/json;charset=UTF-8"}
  end

  def pdf_headers
    {"Content-Type" => "application/pdf"}
  end
end