From 60575b872a63d618e33831fe107391b1e10f0d01 Mon Sep 17 00:00:00 2001 From: SimonBrazell Date: Fri, 7 Jun 2024 15:25:07 +1000 Subject: [PATCH 1/2] Add optional `encoding` argument to set output encoding --- README.md | 20 +++++++++++++++++--- lib/henkei.rb | 42 ++++++++++++++++++++++++++++++++---------- lib/henkei/version.rb | 2 +- spec/henkei_spec.rb | 20 ++++++++++++++++++++ 4 files changed, 70 insertions(+), 14 deletions(-) diff --git a/README.md b/README.md index d5711eb..1092ac7 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ been split up. To keep the gem size down Henkei will only include the client app call to Henkei, a new Java process will be started, run your command, then terminate. Another change is the metadata keys. A lot of duplicate keys have been removed in favour of a more standards -based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata) +based approach. A list of the old vs new key names can be found [here](https://cwiki.apache.org/confluence/display/TIKA/Migrating+to+Tika+2.0.0#MigratingtoTika2.0.0-Metadata) ## Usage @@ -111,12 +111,26 @@ henkei.mimetype.content_type #=> "application/vnd.openxmlformats-officedocument. henkei.mimetype.extensions #=> ['docx'] ``` +### Output text in a specific character encoding + +You can specify the output character encoding by passing in the optional `encoding` argument when calling to the +`text` or `html` instance methods, as well as the `read` class method. + +```ruby +henkei = Henkei.new 'sample.pages' +utf_8_text = henkei.text(encoding: 'UTF-8') +utf_16_html = henkei.html(encoding: 'UTF-16') + +data = File.read 'sample.pages' +utf_32_text = Henkei.read :text, data, encoding: 'UTF-32' +``` + ## Installation and Dependencies ### Java Runtime Henkei packages the Apache Tika application jar and requires a working JRE for it to work. -Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path. +Check that you either have the `JAVA_HOME` environment variable set, or that `java` is in your path. ### Gem @@ -131,7 +145,7 @@ And then execute: Or install it yourself as: $ gem install henkei - + ### Heroku Add the JVM Buildpack to your Heroku project: diff --git a/lib/henkei.rb b/lib/henkei.rb index 62ac116..717694c 100644 --- a/lib/henkei.rb +++ b/lib/henkei.rb @@ -47,8 +47,8 @@ def self.mimetype(content_type) # text = Henkei.read :text, data # metadata = Henkei.read :metadata, data # - def self.read(type, data, include_ocr: false) - result = client_read(type, data, include_ocr: include_ocr) + def self.read(type, data, include_ocr: false, encoding: nil) + result = client_read(type, data, include_ocr: include_ocr, encoding: encoding) case type when :text, :html then result @@ -96,10 +96,14 @@ def initialize(input) # # henkei.text(include_ocr: true) # - def text(include_ocr: false) + # Set the output character encoding (e.g. 'UTF-8') + # + # henkei.text(encoding: 'UTF-8') + # + def text(include_ocr: false, encoding: nil) return @text if defined? @text - @text = Henkei.read :text, data, include_ocr: include_ocr + @text = Henkei.read :text, data, include_ocr: include_ocr, encoding: encoding end # Returns the text content of the Henkei document in HTML. @@ -111,10 +115,14 @@ def text(include_ocr: false) # # henkei.html(include_ocr: true) # - def html(include_ocr: false) + # Set the output character encoding (e.g. 'UTF-8') + # + # henkei.text(encoding: 'UTF-8') + # + def html(include_ocr: false, encoding: nil) return @html if defined? @html - @html = Henkei.read :html, data, include_ocr: include_ocr + @html = Henkei.read :html, data, include_ocr: include_ocr, encoding: encoding end # Returns the metadata hash of the Henkei document. @@ -211,20 +219,34 @@ def self.java_path # Internal helper for calling to Tika library directly # - def self.client_read(type, data, include_ocr: false) - Open3.capture2(*tika_command(type, include_ocr: include_ocr), stdin_data: data, binmode: true).first + def self.client_read(type, data, include_ocr: false, encoding: nil) + unless encoding.nil? || Encoding.name_list.include?(encoding) + raise ArgumentError, "unsupported encoding - #{encoding}" + end + + Open3.popen2(*tika_command(type, include_ocr: include_ocr, encoding: encoding)) do |stdin, stdout| + stdin.binmode + stdout.binmode + stdout.set_encoding encoding unless encoding.nil? + + stdin.puts data + out_reader = Thread.new { stdout.read } + stdin.close + out_reader.value + end end private_class_method :client_read # Internal helper for building the Java command to call Tika # - def self.tika_command(type, include_ocr: false) + def self.tika_command(type, include_ocr: false, encoding: nil) [ java_path, '-Djava.awt.headless=true', '-jar', Henkei::JAR_PATH, - "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}" + "--config=#{include_ocr ? Henkei::CONFIG_PATH : Henkei::CONFIG_WITHOUT_OCR_PATH}", + *("--encoding=#{encoding}" unless encoding.nil?) ] + switch_for_type(type) end private_class_method :tika_command diff --git a/lib/henkei/version.rb b/lib/henkei/version.rb index dece6b0..d787743 100644 --- a/lib/henkei/version.rb +++ b/lib/henkei/version.rb @@ -1,5 +1,5 @@ # frozen_string_literal: true class Henkei - VERSION = '2.9.2.1' + VERSION = '2.9.2.2' end diff --git a/spec/henkei_spec.rb b/spec/henkei_spec.rb index 13cb1d1..dba94da 100644 --- a/spec/henkei_spec.rb +++ b/spec/henkei_spec.rb @@ -72,6 +72,26 @@ def ci? end end end + + context 'when a valid `encoding` value is provided' do + let(:encoding) { 'UTF-32' } + + it 'returns the parsed text in the specified encoding' do + text = described_class.read :text, data, encoding: encoding + + expect(text.encoding.name).to eq encoding + end + end + + context 'when an invalid `encoding` value is provided' do + let(:encoding) { 'Beef' } + + it 'raises an error' do + expect do + described_class.read :text, data, encoding: encoding + end.to raise_error(ArgumentError, "unsupported encoding - #{encoding}") + end + end end describe '.new' do From 24406e02bcee8886a0d04a03882edb13edec628e Mon Sep 17 00:00:00 2001 From: SimonBrazell Date: Tue, 11 Jun 2024 10:18:05 +1000 Subject: [PATCH 2/2] Ensure `Open3.popen2` more closely matches `capture2` source --- lib/henkei.rb | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/lib/henkei.rb b/lib/henkei.rb index 717694c..56030c7 100644 --- a/lib/henkei.rb +++ b/lib/henkei.rb @@ -229,9 +229,12 @@ def self.client_read(type, data, include_ocr: false, encoding: nil) stdout.binmode stdout.set_encoding encoding unless encoding.nil? - stdin.puts data out_reader = Thread.new { stdout.read } + + write_data_to_stdin(data, stdin) + stdin.close + out_reader.value end end @@ -262,4 +265,21 @@ def self.switch_for_type(type) }[type] end private_class_method :switch_for_type + + # Internal helper for writing the input data to stdin when calling Tika + # + def self.write_data_to_stdin(data, stdin) + return unless data + + begin + if data.respond_to? :readpartial + IO.copy_stream(data, stdin) + else + stdin.write data + end + rescue Errno::EPIPE + # Catch broken pipe. + end + end + private_class_method :write_data_to_stdin end