From 9054b29637ebb17ca3f80900b5bdc657991380cb Mon Sep 17 00:00:00 2001 From: Jeremy Evans Date: Tue, 10 Feb 2015 15:28:02 -0800 Subject: [PATCH] Switch web framework from Cuba to Roda Roda is a fork of Cuba. Here are the advantages I see to this change: 1) Uses terminal routes. One of the reasons behind the fork is that by default, routes in Cuba are not terminal. So the following requests are currently handled the same way by Tabula: POST /upload POST /upload/with/additional/stuff Basically, you can stick whatever you want at the end of a path, and Cuba will just ignore it. Roda has built-in support for terminal routes, so adding stuff at the end of the path will result in a 404, like practically all websites. 2) Comes with a JSON plugin. This simplifies JSON handling, and makes it so you don't have to set the Content-Type to application/json and call .to_json manually. You just have the route block return an array or hash, and Roda will automatically convert it to JSON for you. While I was converting the app from Cuba to Roda, I noticed there were some parts that could derive more benefit from the routing tree. One example is TabulaDebug, where I moved some code that was duplicated in every route up to the enclosing branch, so the behavior is shared by all of the routes under the branch. Some things get a little more verbose with the switch; for example, the second argument to Roda#view is a general options hash instead of being specific to local variables. In general, that could be made simpler by passing data to the views implicitly using instance variables instead of local variables. Some other changes: 1) require 'tilt/erb' explicitly. This avoids some warnings printed by tilt, which should be the case for both Roda and Cuba. 2) require tabula_debug and tabula_job_progress outside of the routing tree, so they aren't required on every request that uses them. 
--- Gemfile | 2 +- Gemfile.lock | 6 +- config.ru | 4 +- webapp/tabula_debug.rb | 144 ++++++++++++++-------------------- webapp/tabula_job_progress.rb | 85 ++++++++++---------- webapp/tabula_web.rb | 121 ++++++++++++++-------------- webapp/views/layout.erb | 4 +- 7 files changed, 173 insertions(+), 193 deletions(-) diff --git a/Gemfile b/Gemfile index 40dbaf7e..14eb6ae1 100644 --- a/Gemfile +++ b/Gemfile @@ -4,7 +4,7 @@ source "https://rubygems.org" platform :jruby do - gem "cuba" + gem "roda" gem "rack" gem "tilt" gem "rufus-lru" diff --git a/Gemfile.lock b/Gemfile.lock index 63333175..84eab59c 100644 --- a/Gemfile.lock +++ b/Gemfile.lock @@ -1,12 +1,12 @@ GEM remote: https://rubygems.org/ specs: - cuba (3.3.0) - rack jruby-jars (1.7.15) jruby-rack (1.1.16) rack (1.5.2) rake (10.3.2) + roda (1.3.0) + rack rubyzip (1.1.6) rufus-lru (1.0.5) tabula-extractor (0.7.6-java) @@ -23,10 +23,10 @@ PLATFORMS java DEPENDENCIES - cuba jruby-jars (= 1.7.15) rack rake + roda rufus-lru tabula-extractor (~> 0.7.6) tilt diff --git a/config.ru b/config.ru index 0d241877..12d743f5 100644 --- a/config.ru +++ b/config.ru @@ -1,7 +1,7 @@ # encoding: UTF-8 require_relative './webapp/tabula_settings.rb' require_relative './webapp/tabula_web.rb' -run Cuba +run Roda.app if "#{$PROGRAM_NAME}".include?("tabula.jar") # only do this if running as jar or app. (if "rackup", we don't @@ -10,7 +10,7 @@ if "#{$PROGRAM_NAME}".include?("tabula.jar") require 'java' # don't do "java_import java.net.URI" -- it conflicts with Ruby URI and - # makes Cuba/Rack really really upset. just call "java.*" classes + # makes Roda/Rack really really upset. just call "java.*" classes # directly. 
port = java.lang.Integer.getInteger('jetty.port', 8080) url = "http://127.0.0.1:#{port}" diff --git a/webapp/tabula_debug.rb b/webapp/tabula_debug.rb index ac6ca1cc..12ffdd96 100644 --- a/webapp/tabula_debug.rb +++ b/webapp/tabula_debug.rb @@ -1,108 +1,86 @@ require 'json' -class TabulaDebug < Cuba - define do +class TabulaDebug < Roda + clear_middleware! - on ":file_id/characters" do |file_id| - par = JSON.load(req.params['coords']).first + route do + on :file_id, :method=>:get do |file_id| + par = JSON.load(request['coords']).first page = par['page'] - - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) - - text_elements = extractor.extract.next.get_text([par['y1'].to_f, - par['x1'].to_f, - par['y2'].to_f, - par['x2'].to_f]) - - res['Content-Type'] = 'application/json' - res.write text_elements.map { |te| - { 'left' => te.left, - 'top' => te.top, - 'width' => te.width, - 'height' => te.height, - 'text' => te.text } - }.to_json - end - - on ":file_id/text_chunks" do |file_id| - par = JSON.load(req.params['coords']).first - page = par['page'] - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) - text_elements = extractor.extract.next.get_text([par['y1'].to_f, - par['x1'].to_f, - par['y2'].to_f, - par['x2'].to_f]) - - text_chunks = Tabula::TextElement.merge_words(text_elements) + is "characters" do |file_id| + text_elements = extractor.extract.next.get_text([par['y1'].to_f, + par['x1'].to_f, + par['y2'].to_f, + par['x2'].to_f]) + + text_elements.map { |te| + { 'left' => te.left, + 'top' => te.top, + 'width' => te.width, + 'height' => te.height, + 'text' => te.text } + } + end - puts text_chunks.inspect + is "text_chunks" do |file_id| + text_elements = extractor.extract.next.get_text([par['y1'].to_f, + par['x1'].to_f, + par['y2'].to_f, + par['x2'].to_f]) - 
res['Content-Type'] = 'application/json' - res.write text_chunks.map { |te| - { 'left' => te.left, - 'top' => te.top, - 'width' => te.width, - 'height' => te.height, - 'text' => te.text } - }.to_json - end + text_chunks = Tabula::TextElement.merge_words(text_elements) + text_chunks.map { |te| + { 'left' => te.left, + 'top' => te.top, + 'width' => te.width, + 'height' => te.height, + 'text' => te.text } + } + end - on ":file_id/clipping_paths" do |file_id| - par = JSON.load(req.params['coords']).first - page = par['page'] - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) - extractor.debug_clipping_paths = true + is "clipping_paths" do |file_id| + extractor.debug_clipping_paths = true - extractor.extract.next + extractor.extract.next - res['Content-Type'] = 'application/json' - res.write extractor.clipping_paths.map { |cp| - { - 'left' => cp.left, - 'top' => cp.top, - 'width' => cp.width, - 'height' => cp.height + extractor.clipping_paths.map { |cp| + { + 'left' => cp.left, + 'top' => cp.top, + 'width' => cp.width, + 'height' => cp.height + } } - }.to_json - end - - on ":file_id/rulings" do |file_id| - par = JSON.load(req.params['coords']).first - page = par['page'] + end - pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - extractor = Tabula::Extraction::ObjectExtractor.new(pdf_path, [page]) + is "rulings" do |file_id| + # crop lines to area of interest + top, left, bottom, right = [par['y1'].to_f, + par['x1'].to_f, + par['y2'].to_f, + par['x2'].to_f] - # crop lines to area of interest - par = JSON.load(req.params['coords']).first - top, left, bottom, right = [par['y1'].to_f, - par['x1'].to_f, - par['y2'].to_f, - par['x2'].to_f] + area = Tabula::ZoneEntity.new(top, left, + right - left, bottom - top) - area = Tabula::ZoneEntity.new(top, left, - right - left, bottom - top) + page_obj = extractor.extract.next + page_area = 
page_obj.get_area(area) + rulings = page_area.ruling_lines - page_obj = extractor.extract.next - page_area = page_obj.get_area(area) - rulings = page_area.ruling_lines + intersections = {} + if request['show_intersections'] != 'false' + intersections = Tabula::Ruling.find_intersections(page_area.horizontal_ruling_lines, + page_area.vertical_ruling_lines) + end - intersections = {} - if req.params['show_intersections'] != 'false' - intersections = Tabula::Ruling.find_intersections(page_area.horizontal_ruling_lines, - page_area.vertical_ruling_lines) + {:rulings => rulings.uniq, :intersections => intersections.keys} end - res['Content-Type'] = 'application/json' - res.write({:rulings => rulings.uniq, :intersections => intersections.keys }.to_json) end - end end diff --git a/webapp/tabula_job_progress.rb b/webapp/tabula_job_progress.rb index 13af32f9..f1985f39 100644 --- a/webapp/tabula_job_progress.rb +++ b/webapp/tabula_job_progress.rb @@ -1,52 +1,51 @@ require_relative '../lib/tabula_job_executor/executor.rb' -class TabulaJobProgress < Cuba - define do - on ":upload_id/json" do |batch_id| - # upload_id is the "job id" uuid that resque-status provides - batch = Tabula::Background::JobExecutor.get_by_batch(batch_id) - res['Content-Type'] = 'application/json' - message = {} - if batch.empty? - res.status = 404 - message[:status] = "error" - message[:message] = "No such job" - message[:pct_complete] = 0 - elsif batch.any? { |uuid, job| job.failed? } - message[:status] = "error" - message[:message] = "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again." - message[:pct_complete] = 99 - res.write message.to_json - else - s = batch.find { |uuid, job| job.working? } - message[:status] = !s.nil? ? s.last.status['status'] : 'completed' - message[:message] = !s.nil? && !s.last.message.nil? ? 
s.last.message.first : '' - message[:pct_complete] = (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i - message[:file_id] = req.params['file_id'] - message[:upload_id] = batch_id - res.write message.to_json - end - end +class TabulaJobProgress < Roda + clear_middleware! - on ":upload_id" do |batch_id| + route do + on :upload_id, :method=>:get do |batch_id| # upload_id is the "job id" uuid that resque-status provides batch = Tabula::Background::JobExecutor.get_by_batch(batch_id) - if batch.empty? - res.status = 404 - res.write "" - res.write view("upload_error.html", - :message => "invalid upload_id (TODO: make this generic 404)") - elsif batch.any? { |uuid, job| job.failed? } - res.write view("upload_error.html", - :message => "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again.") - else - s = batch.find { |uuid, job| job.working? } - res.write view("upload_status.html", - :status => !s.nil? ? s.last.message : 'completed', - :pct_complete => (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i, - :upload_id => batch_id, - :file_id => req.params['file_id']) + is "json" do |batch_id| + message = {} + if batch.empty? + response.status = 404 + message[:status] = "error" + message[:message] = "No such job" + message[:pct_complete] = 0 + elsif batch.any? { |uuid, job| job.failed? } + message[:status] = "error" + message[:message] = "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again." + message[:pct_complete] = 99 + else + s = batch.find { |uuid, job| job.working? } + message[:status] = !s.nil? ? s.last.status['status'] : 'completed' + message[:message] = !s.nil? && !s.last.message.nil? ? 
s.last.message.first : '' + message[:pct_complete] = (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i + message[:file_id] = request['file_id'] + message[:upload_id] = batch_id + end + message + end + + is do + if batch.empty? + response.status = 404 + view("upload_error.html", :locals=>{ + :message => "invalid upload_id (TODO: make this generic 404)"}) + elsif batch.any? { |uuid, job| job.failed? } + view("upload_error.html", :locals=>{ + :message => "Sorry, your file upload could not be processed. Please double-check that the file you uploaded is a valid PDF file and try again."}) + else + s = batch.find { |uuid, job| job.working? } + view("upload_status.html", :locals=>{ + :status => !s.nil? ? s.last.message : 'completed', + :pct_complete => (batch.inject(0.0) { |sum, (uuid, job)| sum + job.pct_complete } / batch.size).to_i, + :upload_id => batch_id, + :file_id => request['file_id']}) + end end end end diff --git a/webapp/tabula_web.rb b/webapp/tabula_web.rb index e9c93954..ba4fab9a 100644 --- a/webapp/tabula_web.rb +++ b/webapp/tabula_web.rb @@ -1,6 +1,6 @@ # -*- coding: utf-8 -*- -require 'cuba' -require 'cuba/render' +require 'roda' +require 'tilt/erb' require 'rufus-lru' require 'digest/sha1' @@ -44,12 +44,17 @@ def is_valid_pdf?(path) MAX_CACHE_ENTRIES = 10 -Cuba.plugin Cuba::Render -Cuba.settings[:render].store(:views, File.expand_path("views", File.dirname(__FILE__))) -Cuba.use Rack::MethodOverride -Cuba.use Rack::Static, root: STATIC_ROOT, urls: ["/css","/js", "/img", "/swf"] -Cuba.use Rack::ContentLength -Cuba.use Rack::Reloader +Roda.plugin :render, :views=>File.expand_path("views", File.dirname(__FILE__)), + :template_opts=>{:default_encoding=>'UTF-8'} +Roda.plugin :all_verbs +Roda.plugin :json +Roda.plugin :delegate +Roda.request_delegate :on, :is, :delete, :get, :put, :post, :root, :run +Roda.plugin :default_headers, 'Content-Type'=>"text/html; charset=utf-8" +Roda.use Rack::MethodOverride +Roda.use Rack::Static, 
root: STATIC_ROOT, urls: ["/css","/js", "/img", "/swf"] +Roda.use Rack::ContentLength +Roda.use Rack::Reloader if TabulaSettings::EXTRACTION_CACHE CACHE = Rufus::Lru::SynchronizedHash.new(MAX_CACHE_ENTRIES) @@ -70,24 +75,26 @@ def has_key?(k) CACHE = NoCache.new end -Cuba.define do +if TabulaSettings::ENABLE_DEBUG_METHODS + require_relative './tabula_debug.rb' +end +require_relative './tabula_job_progress.rb' + +Roda.route do if TabulaSettings::ENABLE_DEBUG_METHODS - require_relative './tabula_debug.rb' on 'debug' do run TabulaDebug end end - on 'queue' do - require_relative './tabula_job_progress.rb' run TabulaJobProgress end - on delete do + delete do - on 'pdf/:file_id/page/:page_number' do |file_id, page_number| + is 'pdf/:file_id/page/:page_number' do |file_id, page_number| index_fname = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'pages.json') @@ -97,7 +104,7 @@ def has_key?(k) end # delete an uploaded file - on 'pdf/:file_id' do |file_id| + is 'pdf/:file_id' do |file_id| workspace_file = File.join(TabulaSettings::DOCUMENTS_BASEPATH, 'workspace.json') raise if !File.exists?(workspace_file) @@ -118,14 +125,14 @@ def has_key?(k) end - on put do - on 'pdf/:file_id/page/:page_number' do |file_id, page_number| + put do + is 'pdf/:file_id/page/:page_number' do |file_id, page_number| # nothing yet end end - on get do - on root do + get do + root do workspace_file = File.join(TabulaSettings::DOCUMENTS_BASEPATH, 'workspace.json') workspace = if File.exists?(workspace_file) File.open(workspace_file) { |f| JSON.load(f) } @@ -133,8 +140,7 @@ def has_key?(k) [] end - res.write view("index.html", - workspace: workspace) + view("index.html", :locals=>{workspace: workspace}) end @@ -142,43 +148,42 @@ def has_key?(k) run Rack::File.new(TabulaSettings::DOCUMENTS_BASEPATH) end - on "pdf/:file_id" do |file_id| + is "pdf/:file_id" do |file_id| document_dir = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id) - unless File.directory?(document_dir) - res.status = 404 
- else - res.write view("pdf_view.html", + if File.directory?(document_dir) + view("pdf_view.html", :locals=>{ pages: File.open(File.join(document_dir, 'pages.json')) { |f| JSON.parse(f.read) }, - file_id: file_id) + file_id: file_id}) end end end # /get - on post do - on 'upload' do + post do + is 'upload' do + + tempfile_path = request['file'][:tempfile].path # Make sure this is a PDF, before doing anything - unless is_valid_pdf?(req.params['file'][:tempfile].path) - res.status = 400 - res.write view("upload_error.html", - :message => "Sorry, the file you uploaded was not detected as a PDF. You must upload a PDF file. Please try again.") - next # halt this handler + unless is_valid_pdf?(tempfile_path) + response.status = 400 + next view("upload_error.html", :locals=>{ + :message => "Sorry, the file you uploaded was not detected as a PDF. You must upload a PDF file. Please try again."}) end - original_filename = req.params['file'][:filename] + original_filename = request['file'][:filename] file_id = Digest::SHA1.hexdigest(Time.now.to_s) file_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id) FileUtils.mkdir(file_path) begin - FileUtils.mv(req.params['file'][:tempfile].path, + FileUtils.mv(tempfile_path, File.join(file_path, 'document.pdf')) rescue Errno::EACCES # move fails on windows sometimes - FileUtils.cp_r(req.params['file'][:tempfile].path, + FileUtils.cp_r(tempfile_path, File.join(file_path, 'document.pdf')) - FileUtils.rm_rf(req.params['file'][:tempfile].path) + FileUtils.rm_rf(tempfile_path) end @@ -192,7 +197,7 @@ def has_key?(k) :id => file_id, :batch => job_batch) - if req.params['autodetect-tables'] + if request['autodetect-tables'] DetectTablesJob.create(:filename => file, :output_dir => file_path, :batch => job_batch) @@ -208,21 +213,21 @@ def has_key?(k) :thumbnail_sizes => [560], :batch => job_batch) - res.redirect "/queue/#{job_batch}?file_id=#{file_id}" + request.redirect "/queue/#{job_batch}?file_id=#{file_id}" end - on 
"pdf/:file_id/data" do |file_id| + is "pdf/:file_id/data" do |file_id| pdf_path = File.join(TabulaSettings::DOCUMENTS_BASEPATH, file_id, 'document.pdf') - coords = JSON.load(req.params['coords']) + coords = JSON.load(request['coords']) coords.sort_by! do |coord_set| [ coord_set['page'], [coord_set['y1'], coord_set['y2']].min.to_i / 10, [coord_set['x1'], coord_set['x2']].min ] end - if ["guess", "spreadsheet", "original"].include?(req.params['extraction_method']) - extraction_method_requested = req.params['extraction_method'] + if ["guess", "spreadsheet", "original"].include?(request['extraction_method']) + extraction_method_requested = request['extraction_method'] else extraction_method_requested = "guess" end @@ -242,39 +247,37 @@ def has_key?(k) end end - case req.params['format'] + case request['format'] when 'csv' - res['Content-Type'] = 'text/csv' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.csv\"" + response['Content-Type'] = 'text/csv' + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.csv\"" tables = CACHE[coords_method_key].flatten(1) tables.each do |table| - res.write table.to_csv + response.write table.to_csv end when 'tsv' - res['Content-Type'] = 'text/tab-separated-values' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.tsv\"" + response['Content-Type'] = 'text/tab-separated-values' + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.tsv\"" tables = CACHE[coords_method_key].flatten(1) tables.each do |table| - res.write table.to_tsv + response.write table.to_tsv end when 'script' # Write shell script of tabula-extractor commands. $1 takes # the name of a file from the command line and passes it # to tabula-extractor so the script can be reused on similar pdfs. 
- res['Content-Type'] = 'application/x-sh' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.sh\"" + response['Content-Type'] = 'application/x-sh' + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.sh\"" coords.each do |c| - res.write "tabula -a #{c['y1']},#{c['x1']},#{c['y2']},#{c['x2']} -p #{c['page']} \"$1\" \n" + response.write "tabula -a #{c['y1']},#{c['x1']},#{c['y2']},#{c['x2']} -p #{c['page']} \"$1\" \n" end when 'bbox' # Write json representation of bounding boxes and pages for # use in OCR and other back ends. - res['Content-Type'] = 'application/json' - res['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.json\"" - res.write coords.to_json + response['Content-Disposition'] = "attachment; filename=\"tabula-#{file_id}.json\"" + coords else - res['Content-Type'] = 'application/json' - res.write CACHE[coords_method_key].flatten(1).to_json + CACHE[coords_method_key].flatten(1) end end end diff --git a/webapp/views/layout.erb b/webapp/views/layout.erb index dc29b55a..4430a066 100644 --- a/webapp/views/layout.erb +++ b/webapp/views/layout.erb @@ -12,7 +12,7 @@ <% if $TABULA_VERSION.start_with?('rev') %>
DEV mode
<% end %> - <% if req.path != "/" %> + <% if request.path != "/" %>