Skip to content

Commit

Permalink
Made detect language support underlying HTML mode.
Browse files Browse the repository at this point in the history
  • Loading branch information
rojotek committed May 16, 2012
1 parent 97bf5c5 commit 2056c31
Show file tree
Hide file tree
Showing 3 changed files with 19 additions and 5 deletions.
3 changes: 1 addition & 2 deletions ext/cld/thunk.cc
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,7 @@ typedef struct {
} RESULT;

extern "C" {
-RESULT detectLanguageThunkInt(const char * src) {
-bool is_plain_text = true;
+RESULT detectLanguageThunkInt(const char * src, bool is_plain_text) {
bool do_allow_extended_languages = true;
bool do_pick_summary_language = false;
bool do_remove_weak_matches = false;
Expand Down
6 changes: 3 additions & 3 deletions lib/cld.rb
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,8 @@
module CLD
extend FFI::Library

-def self.detect_language(text)
-result = detect_language_ext(text.to_s)
+def self.detect_language(text, is_plain_text=true)
+result = detect_language_ext(text.to_s, is_plain_text)
Hash[ result.members.map {|member| [member.to_sym, result[member]]} ]
end

Expand All @@ -17,5 +17,5 @@ class ReturnValue < FFI::Struct

GEM_ROOT = File.expand_path("../../", __FILE__)
ffi_lib "#{GEM_ROOT}/ext/cld/cld.so"
-attach_function "detect_language_ext","detectLanguageThunkInt", [:buffer_in], ReturnValue.by_value
+attach_function "detect_language_ext","detectLanguageThunkInt", [:buffer_in, :bool], ReturnValue.by_value
end
15 changes: 15 additions & 0 deletions spec/cld_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,21 @@
it { subject[:reliable].should be_true }
end

+context "French in HTML - using CLD html " do
+subject { CLD.detect_language("<html><head><body><script>A large amount of english in the script which should be ignored if using html in detect_language.</script><p>plus ça change, plus c'est la même chose</p></body></html>", false) }
+
+it { subject[:name].should eq("FRENCH") }
+it { subject[:code].should eq("fr") }
+
+end
+context "French in HTML - using CLD text " do
+subject { CLD.detect_language("<html><head><body><script>A large amount of english in the script which should be ignored if using html in detect_language.</script><p>plus ça change, plus c'est la même chose</p></body></html>", true) }
+
+it { subject[:name].should eq("ENGLISH") }
+it { subject[:code].should eq("en") }
+
+end
+
context "Simplified Chinese text" do
subject { CLD.detect_language("你好吗箭体") }

Expand Down

0 comments on commit 2056c31

Please sign in to comment.