Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Switch web framework from Cuba to Roda #260

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
2 changes: 1 addition & 1 deletion Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

source "https://rubygems.org"
platform :jruby do
gem "cuba"
gem "roda"
gem "rack"
gem "tilt"
gem "rufus-lru"
Expand Down
6 changes: 3 additions & 3 deletions Gemfile.lock
Original file line number Diff line number Diff line change
@@ -1,12 +1,12 @@
GEM
remote: https://rubygems.org/
specs:
cuba (3.3.0)
rack
jruby-jars (1.7.15)
jruby-rack (1.1.16)
rack (1.5.2)
rake (10.3.2)
roda (1.3.0)
rack
rubyzip (1.1.6)
rufus-lru (1.0.5)
tabula-extractor (0.7.6-java)
Expand All @@ -23,10 +23,10 @@ PLATFORMS
java

DEPENDENCIES
cuba
jruby-jars (= 1.7.15)
rack
rake
roda
rufus-lru
tabula-extractor (~> 0.7.6)
tilt
Expand Down
4 changes: 2 additions & 2 deletions config.ru
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
# encoding: UTF-8
require_relative './webapp/tabula_settings.rb'
require_relative './webapp/tabula_web.rb'
run Cuba
run Roda.app

if "#{$PROGRAM_NAME}".include?("tabula.jar")
# only do this if running as jar or app. (if "rackup", we don't
Expand All @@ -10,7 +10,7 @@ if "#{$PROGRAM_NAME}".include?("tabula.jar")
require 'java'

# don't do "java_import java.net.URI" -- it conflicts with Ruby URI and
# makes Cuba/Rack really really upset. just call "java.*" classes
# makes Roda/Rack really really upset. just call "java.*" classes
# directly.
port = java.lang.Integer.getInteger('jetty.port', 8080)
url = "http://127.0.0.1:#{port}"
Expand Down
144 changes: 61 additions & 83 deletions webapp/tabula_debug.rb
Original file line number Diff line number Diff line change
@@ -1,108 +1,86 @@
require 'json'

class TabulaDebug < Cuba
define do
class TabulaDebug < Roda
clear_middleware!

on ":file_id/characters" do |file_id|
par = JSON.load(req.params['coords']).first
route do
on :file_id, :method=>:get do |file_id|
par = JSON.load(request['coords']).first
page = par['page']

pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf')
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page])

text_elements = extractor.extract.next.get_text([par['y1'].to_f,
par['x1'].to_f,
par['y2'].to_f,
par['x2'].to_f])

res['Content-Type'] = 'application/json'
res.write text_elements.map { |te|
{ 'left' => te.left,
'top' => te.top,
'width' => te.width,
'height' => te.height,
'text' => te.text }
}.to_json
end

on ":file_id/text_chunks" do |file_id|
par = JSON.load(req.params['coords']).first
page = par['page']

pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf')
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page])

text_elements = extractor.extract.next.get_text([par['y1'].to_f,
par['x1'].to_f,
par['y2'].to_f,
par['x2'].to_f])

text_chunks = Tabula::TextElement.merge_words(text_elements)
is "characters" do |file_id|
text_elements = extractor.extract.next.get_text([par['y1'].to_f,
par['x1'].to_f,
par['y2'].to_f,
par['x2'].to_f])

text_elements.map { |te|
{ 'left' => te.left,
'top' => te.top,
'width' => te.width,
'height' => te.height,
'text' => te.text }
}
end

puts text_chunks.inspect
is "text_chunks" do |file_id|
text_elements = extractor.extract.next.get_text([par['y1'].to_f,
par['x1'].to_f,
par['y2'].to_f,
par['x2'].to_f])

res['Content-Type'] = 'application/json'
res.write text_chunks.map { |te|
{ 'left' => te.left,
'top' => te.top,
'width' => te.width,
'height' => te.height,
'text' => te.text }
}.to_json
end
text_chunks = Tabula::TextElement.merge_words(text_elements)

text_chunks.map { |te|
{ 'left' => te.left,
'top' => te.top,
'width' => te.width,
'height' => te.height,
'text' => te.text }
}
end

on ":file_id/clipping_paths" do |file_id|
par = JSON.load(req.params['coords']).first
page = par['page']

pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf')
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page])
extractor.debug_clipping_paths = true
is "clipping_paths" do |file_id|
extractor.debug_clipping_paths = true

extractor.extract.next
extractor.extract.next

res['Content-Type'] = 'application/json'
res.write extractor.clipping_paths.map { |cp|
{
'left' => cp.left,
'top' => cp.top,
'width' => cp.width,
'height' => cp.height
extractor.clipping_paths.map { |cp|
{
'left' => cp.left,
'top' => cp.top,
'width' => cp.width,
'height' => cp.height
}
}
}.to_json
end

on ":file_id/rulings" do |file_id|
par = JSON.load(req.params['coords']).first
page = par['page']
end

pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf')
extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page])
is "rulings" do |file_id|
# crop lines to area of interest
top, left, bottom, right = [par['y1'].to_f,
par['x1'].to_f,
par['y2'].to_f,
par['x2'].to_f]

# crop lines to area of interest
par = JSON.load(req.params['coords']).first
top, left, bottom, right = [par['y1'].to_f,
par['x1'].to_f,
par['y2'].to_f,
par['x2'].to_f]
area = Tabula::ZoneEntity.new(top, left,
right - left, bottom - top)

area = Tabula::ZoneEntity.new(top, left,
right - left, bottom - top)
page_obj = extractor.extract.next
page_area = page_obj.get_area(area)
rulings = page_area.ruling_lines

page_obj = extractor.extract.next
page_area = page_obj.get_area(area)
rulings = page_area.ruling_lines
intersections = {}
if request['show_intersections'] != 'false'
intersections = Tabula::Ruling.find_intersections(page_area.horizontal_ruling_lines,
page_area.vertical_ruling_lines)
end

intersections = {}
if req.params['show_intersections'] != 'false'
intersections = Tabula::Ruling.find_intersections(page_area.horizontal_ruling_lines,
page_area.vertical_ruling_lines)
{:rulings => rulings.uniq, :intersections => intersections.keys}
end

res['Content-Type'] = 'application/json'
res.write({:rulings => rulings.uniq, :intersections => intersections.keys }.to_json)
end

end
end
85 changes: 42 additions & 43 deletions webapp/tabula_job_progress.rb
Original file line number Diff line number Diff line change
@@ -1,52 +1,51 @@
require_relative '../lib/tabula_job_executor/executor.rb'

class TabulaJobProgress < Cuba
define do
on ":upload_id/json" do |batch_id|
# upload_id is the "job id" uuid that resque-status provides
batch = Tabula::Background::JobExecutor.get_by_batch(batch_id)
res['Content-Type'] = 'application/json'
message = {}
if batch.empty?
res.status = 404
message[:status] = "error"
message[:message] = "No such job"
message[:pct_complete] = 0
elsif batch.any? { |uuid, job| job.failed? }
message[:status] = "error"
message[:message] = "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again."
message[:pct_complete] = 99
res.write message.to_json
else
s = batch.find { |uuid, job| job.working? }
message[:status] = !s.nil? ? s.last.status['status'] : 'completed'
message[:message] = !s.nil? && !s.last.message.nil? ? s.last.message.first : ''
message[:pct_complete] = (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i
message[:file_id] = req.params['file_id']
message[:upload_id] = batch_id
res.write message.to_json
end
end
class TabulaJobProgress < Roda
clear_middleware!

on ":upload_id" do |batch_id|
route do
on :upload_id, :method=>:get do |batch_id|
# upload_id is the "job id" uuid that resque-status provides
batch = Tabula::Background::JobExecutor.get_by_batch(batch_id)

if batch.empty?
res.status = 404
res.write ""
res.write view("upload_error.html",
:message => "invalid upload_id (TODO: make this generic 404)")
elsif batch.any? { |uuid, job| job.failed? }
res.write view("upload_error.html",
:message => "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again.")
else
s = batch.find { |uuid, job| job.working? }
res.write view("upload_status.html",
:status => !s.nil? ? s.last.message : 'completed',
:pct_complete => (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i,
:upload_id => batch_id,
:file_id => req.params['file_id'])
is "json" do |batch_id|
message = {}
if batch.empty?
response.status = 404
message[:status] = "error"
message[:message] = "No such job"
message[:pct_complete] = 0
elsif batch.any? { |uuid, job| job.failed? }
message[:status] = "error"
message[:message] = "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again."
message[:pct_complete] = 99
else
s = batch.find { |uuid, job| job.working? }
message[:status] = !s.nil? ? s.last.status['status'] : 'completed'
message[:message] = !s.nil? && !s.last.message.nil? ? s.last.message.first : ''
message[:pct_complete] = (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i
message[:file_id] = request['file_id']
message[:upload_id] = batch_id
end
message
end

is do
if batch.empty?
response.status = 404
view("upload_error.html", :locals=>{
:message => "invalid upload_id (TODO: make this generic 404)"})
elsif batch.any? { |uuid, job| job.failed? }
view("upload_error.html", :locals=>{
:message => "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again."})
else
s = batch.find { |uuid, job| job.working? }
view("upload_status.html", :locals=>{
:status => !s.nil? ? s.last.message : 'completed',
:pct_complete => (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i,
:upload_id => batch_id,
:file_id => request['file_id']})
end
end
end
end
Expand Down
Loading