Skip to content

Commit

Permalink
Merge pull request #6 from ARPC/ocr
Browse files Browse the repository at this point in the history
Implement OCR
  • Loading branch information
Garrett Keefe authored Jan 24, 2023
2 parents 50b70e8 + ad3b729 commit df6d0d7
Show file tree
Hide file tree
Showing 16 changed files with 284 additions and 7 deletions.
7 changes: 7 additions & 0 deletions .vscode/extensions.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"recommendations": [
"rebornix.ruby",
"oderwat.indent-rainbow",
"ms-vsliveshare.vsliveshare"
]
}
13 changes: 13 additions & 0 deletions .vscode/settings.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
{
"editor.rulers": [
100,
120
],
"files.exclude": {
"**/.git": true,
"**/.svn": true,
"**/.hg": true,
"**/.DS_Store": true,
"**/tmp": true
}
}
8 changes: 4 additions & 4 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
PATH
remote: .
specs:
pdfservices (0.1.1)
pdfservices (0.1.2)
http (~> 5.1)
json (~> 2.6)
jwt (>= 1.0, < 3.0)
Expand All @@ -22,7 +22,7 @@ GEM
ffi (>= 1.0.0)
rake
hashdiff (1.0.1)
http (5.1.0)
http (5.1.1)
addressable (~> 2.8)
http-cookie (~> 1.0)
http-form_data (~> 2.2)
Expand All @@ -31,7 +31,7 @@ GEM
domain_name (~> 0.5)
http-form_data (2.3.0)
json (2.6.2)
jwt (2.5.0)
jwt (2.6.0)
llhttp-ffi (0.4.0)
ffi-compiler (~> 1.0)
rake (~> 13.0)
Expand Down Expand Up @@ -74,9 +74,9 @@ GEM
hashdiff (>= 0.4.0, < 2.0.0)

PLATFORMS
arm64-darwin-21
ruby
x86_64-linux
arm64-darwin-21

DEPENDENCIES
minitest (~> 5.0)
Expand Down
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,10 @@ Then you need to create a json file with your credentials:
}

```
### Supported API calls:

Right now the only supported API is document merge. See `test/pdf_services_sdk/test_integration.rb` for an example usage.
- Document merge. See `test/pdf_services_sdk/test_document_merge.rb` for an example usage.
- OCR. See `test/pdf_services_sdk/test_ocr.rb` for an example usage.

## Development

Expand Down
6 changes: 6 additions & 0 deletions lib/pdfservices.rb
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
require "jwt"
require "multipart_parser/reader"
require "yaml"
require "pdfservices/version"

module PdfServices
autoload :CredentialsBuilder, "pdfservices/credentials_builder"
Expand All @@ -15,4 +16,9 @@ module DocumentMerge
autoload :Operation, "pdfservices/document_merge/operation"
autoload :Result, "pdfservices/document_merge/result"
end

# OCR namespace. Operation and Result are registered with autoload, so their
# files under pdfservices/ocr/ are only loaded when the constants are first referenced.
module Ocr
autoload :Operation, "pdfservices/ocr/operation"
autoload :Result, "pdfservices/ocr/result"
end
end
104 changes: 104 additions & 0 deletions lib/pdfservices/ocr/operation.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
# frozen_string_literal: true

require "http"
require "pdfservices/jwt_provider"
require "pdfservices/ocr/result"
require "yaml"

module PdfServices
  module Ocr
    # Runs OCR on a PDF through the Adobe PDF Services REST API.
    #
    # The workflow is: request a presigned upload URL, upload the source PDF,
    # start the OCR job, poll the job until it finishes, download the OCR'd
    # document and delete the uploaded/generated assets.
    #
    # Every failure that is detected from an HTTP status or job status is
    # reported as a Result whose +error+ is set; it is never raised.
    class Operation
      PRESIGNED_URL_ENDPOINT = "https://pdf-services.adobe.io/assets"
      OCR_ENDPOINT = "https://pdf-services.adobe.io/operation/ocr"
      ASSETS_ENDPOINT = "https://pdf-services.adobe.io/assets"
      # Seconds to wait before each poll of the job status URL.
      POLL_INTERVAL_SECONDS = 1

      # @param credentials [Object, nil] object handed to JwtProvider.get_jwt;
      #   it must also respond to +client_id+.
      def initialize(credentials = nil)
        @credentials = credentials
      end

      # Asks the API for an upload slot for a PDF.
      #
      # @return [Hash, Result] the parsed JSON response (the callers read the
      #   "uploadUri" and "assetID" keys) or a failed Result on a non-200 status.
      def get_presigned_url
        response = api.post(PRESIGNED_URL_ENDPOINT, json: {mediaType: "application/pdf"})
        if response.status == 200
          JSON.parse(response.body.to_s)
        else
          Result.new(nil, "Unexpected response status from get presigned url: #{response.status}")
        end
      end

      # Uploads the PDF at +source_pdf+ to a freshly requested presigned URL.
      #
      # @param source_pdf [String] path of the PDF to upload
      # @return [String, Result] the asset id on success, otherwise a failed Result
      def upload_asset(source_pdf)
        presigned_url = get_presigned_url
        # A failed presign request comes back as a Result; pass it on instead of indexing into it.
        return presigned_url if presigned_url.is_a?(Result)

        upload_uri = presigned_url["uploadUri"]
        asset_id = presigned_url["assetID"]
        aws = HTTP.headers({"Content-Type": "application/pdf"})
        # Block form closes the file handle; "rb" keeps the PDF bytes untouched on every platform.
        response = File.open(source_pdf, "rb") { |file| aws.put(upload_uri, body: file) }
        if response.status == 200
          asset_id
        else
          Result.new(nil, "Unexpected response status from asset upload: #{response.status}")
        end
      end

      # Deletes an asset from the service.
      #
      # @param asset_id [String]
      # @return [Object] the HTTP response of the delete request
      def delete_the_asset(asset_id)
        api.delete("#{ASSETS_ENDPOINT}/#{asset_id}")
      end

      # Uploads +source_pdf+, runs OCR on it and waits for the outcome.
      #
      # @param source_pdf [String] path of the PDF to OCR
      # @return [Result] holds the downloaded document body on success, an error message otherwise
      def execute(source_pdf)
        asset_id = upload_asset(source_pdf)
        # Do not start an OCR job with a failure Result standing in for the asset id.
        return asset_id if asset_id.is_a?(Result)

        response = api.post(OCR_ENDPOINT, json: {assetID: asset_id})
        if response.status == 201
          poll_document_result(response.headers["location"], asset_id)
        else
          Result.new(nil, "Unexpected response status from ocr endpoint: #{response.status}\nasset_id: #{asset_id}")
        end
      end

      private

      # Headers sent with every authenticated API request.
      def api_headers
        {
          Authorization: "Bearer #{JwtProvider.get_jwt(@credentials)}",
          "x-api-key": @credentials.client_id,
          "Content-Type": "application/json"
        }
      end

      # Memoized HTTP client carrying the authenticated headers.
      def api
        @api ||= HTTP.headers(api_headers)
      end

      # Polls +url+ until the job leaves the "in progress" state.
      # Implemented as a loop so long-running jobs do not deepen the call stack.
      def poll_document_result(url, original_asset_id)
        loop do
          sleep(POLL_INTERVAL_SECONDS)
          response = api.get(url)
          unless response.status == 200
            delete_the_asset(original_asset_id) if original_asset_id
            # No body was parsed on this path, so report the HTTP status itself.
            return Result.new(nil, "Unexpected response status from polling: #{response.status}")
          end

          json_response = JSON.parse(response.body.to_s)
          next if json_response["status"] == "in progress"

          return finish_polling(json_response, original_asset_id)
        end
      end

      # Converts a terminal job status document into a Result and cleans up assets.
      def finish_polling(json_response, original_asset_id)
        case json_response["status"]
        when "done"
          download_result(json_response, original_asset_id)
        when "failed"
          delete_the_asset(original_asset_id) if original_asset_id
          Result.new(nil, "OCR Failed")
        else
          delete_the_asset(original_asset_id) if original_asset_id
          Result.new(nil, "Unexpected status from polling: #{json_response["status"]}")
        end
      end

      # Downloads the OCR'd document, then deletes the source and OCR'd assets.
      def download_result(json_response, original_asset_id)
        asset = json_response["asset"] || {}
        download_uri = asset["downloadUri"]
        ocr_asset_id = asset["assetID"]

        response = HTTP.get(download_uri) if download_uri
        delete_the_asset(original_asset_id) if original_asset_id
        delete_the_asset(ocr_asset_id) if ocr_asset_id

        return Result.new(nil, "OCR finished without a download URI") unless response
        unless response.status == 200
          return Result.new(nil, "Unexpected response status from asset download: #{response.status}")
        end

        Result.new(response.body, nil)
      end
    end
  end
end
24 changes: 24 additions & 0 deletions lib/pdfservices/ocr/result.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
require "json"
require "multipart_parser/reader"

module PdfServices
  module Ocr
    # Outcome of an OCR operation: either the OCR'd document body or an error message.
    class Result
      attr_accessor :document_body, :error

      # @param document_body [#to_s, nil] the OCR'd PDF content, nil when the operation failed
      # @param error [String, nil] description of the failure, nil on success
      def initialize(document_body, error)
        @document_body = document_body
        @error = error
      end

      # @return [Boolean] true when a document body is present
      def success?
        !@document_body.nil?
      end

      # Writes the document body to +file_path+.
      #
      # Relative paths are resolved against the current working directory;
      # absolute paths are used as given (joining them onto Dir.pwd would
      # produce a path under the working directory that usually does not exist).
      # The write is binary so PDF bytes are never newline-translated.
      #
      # @param file_path [String] relative or absolute destination path
      # @return [Integer] number of bytes written
      def save_as_file(file_path)
        location = File.absolute_path(file_path)
        File.binwrite(location, @document_body)
      end
    end
  end
end
2 changes: 1 addition & 1 deletion lib/pdfservices/version.rb
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# frozen_string_literal: true

module PdfServices
VERSION = "0.1.1"
VERSION = "0.1.2"
end
Binary file added test/fixtures/files/fake_ocr_done.pdf
Binary file not shown.
Binary file added test/fixtures/files/not_yet_ocr.pdf
Binary file not shown.
3 changes: 3 additions & 0 deletions test/fixtures/ocr_in_progress.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
{
"status": "in progress"
}
7 changes: 7 additions & 0 deletions test/fixtures/ocr_done.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
{
"status": "done",
"asset": {
"assetID": "ocr'd:asset-id",
"downloadUri": "https://ocr.file.url"
}
}
4 changes: 4 additions & 0 deletions test/fixtures/presigned_upload_url_response.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
{
"assetID": "urn:a-real-long-asset-asset-id",
"uploadUri": "https://a.presigned.url"
}
19 changes: 19 additions & 0 deletions test/integration_spikes/ocr.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
# frozen_string_literal: true

require "lib/pdfservices"

# Manual spike against the live service: OCRs the fixture PDF and stores the
# result under tmp/. ENV.fetch makes a missing credential fail immediately
# with the variable's name instead of passing nil into the builder.
credentials = ::PdfServices::CredentialsBuilder.new
  .with_client_id(ENV.fetch("PDF_SERVICES_CLIENT_ID"))
  .with_client_secret(ENV.fetch("PDF_SERVICES_CLIENT_SECRET"))
  .with_organization_id(ENV.fetch("PDF_SERVICES_ORGANIZATION_ID"))
  .with_account_id(ENV.fetch("PDF_SERVICES_ACCOUNT_ID"))
  .with_private_key(ENV.fetch("PDF_SERVICES_PRIVATE_KEY"))
  .build

operation = ::PdfServices::Ocr::Operation.new(credentials)

result = operation.execute("test/fixtures/files/not_yet_ocr.pdf")

# Stop with a non-zero exit status instead of writing an empty file when OCR failed.
abort(result.error.to_s) unless result.success?

output_path = "tmp/ocr_result.pdf"
# tmp/ may not exist in a fresh checkout.
Dir.mkdir("tmp") unless Dir.exist?("tmp")
result.save_as_file(output_path)
puts("OCR result written to #{output_path}")
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

require "test_helper"

class IntegrationTest < Minitest::Test
class DocumentMergeTest < Minitest::Test
def test_it_works
stub_valid_response_sequence

Expand Down
88 changes: 88 additions & 0 deletions test/pdf_services_sdk/test_ocr.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# frozen_string_literal: true

require "test_helper"

# End-to-end test of the OCR operation with every HTTP call stubbed.
class OcrTest < Minitest::Test
  def test_it_works
    stub_valid_response_sequence

    # Path of the PDF that is handed to the operation.
    pdf_to_scan = File.join(Dir.pwd, "test", "fixtures", "files", "not_yet_ocr.pdf")

    outcome = ::PdfServices::Ocr::Operation.new(valid_credentials).execute(pdf_to_scan)

    assert outcome.success?
    assert_equal "this is fake ocr'd pdf\n", outcome.document_body.to_s
  end

  private

  # Registers stubs for the whole request sequence of a successful OCR run.
  def stub_valid_response_sequence
    assets_url = "https://pdf-services.adobe.io/assets"
    polling_url = "https://some.polling.url"
    empty_ok = {status: 200, body: "", headers: {}}

    # Token exchange that provides the JWT used by the secured requests.
    stub_request(:post, "https://ims-na1.adobelogin.com/ims/exchange/jwt/")
      .to_return(status: 200, body: json_fixture("valid_jwt_response"))

    # Presigned URL request for the source PDF.
    stub_request(:post, assets_url)
      .with(headers: secured_headers)
      .to_return(status: 200, headers: json_headers, body: json_fixture("presigned_upload_url_response"))

    # Upload of the source PDF to the presigned URL.
    stub_request(:put, "https://a.presigned.url").to_return(status: 200)

    # Start of the OCR job; the location header is the polling URL.
    stub_request(:post, "https://pdf-services.adobe.io/operation/ocr")
      .with(headers: secured_headers)
      .to_return(status: 201, headers: {"location" => polling_url}.merge(json_headers))

    # Polling: two "in progress" fixture responses followed by the "done" fixture.
    still_running = {status: 200, headers: json_headers, body: json_fixture("ocr_in_progress")}
    finished = {status: 200, headers: json_headers, body: json_fixture("ocr_done")}
    stub_request(:get, polling_url)
      .with(headers: secured_headers)
      .to_return(still_running, still_running, finished)

    # Download of the OCR'd PDF.
    stub_request(:get, "https://ocr.file.url")
      .to_return(status: 200, headers: pdf_headers, body: file_fixture("fake_ocr_done.pdf"))

    # Deletion of the original asset and of the OCR'd asset.
    ["urn:a-real-long-asset-asset-id", "ocr'd:asset-id"].each do |asset_id|
      stub_request(:delete, "#{assets_url}/#{asset_id}")
        .with(headers: secured_headers)
        .to_return(empty_ok)
    end
  end

  # Headers expected on every authenticated API request.
  def secured_headers
    {
      Authorization: "Bearer fake1.fake2.fake3",
      "Content-Type": "application/json",
      "X-Api-Key": "123someclientid"
    }
  end

  # Response headers for JSON bodies.
  def json_headers
    {"Content-Type" => "application/json;charset=UTF-8"}
  end

  # Response headers for PDF bodies.
  def pdf_headers
    {"Content-Type" => "application/pdf"}
  end
end

0 comments on commit df6d0d7

Please sign in to comment.