Skip to content

Commit

Permalink
xapian: Don't stem words, and use stopper more strictly to reduce db …
Browse files Browse the repository at this point in the history
…size
  • Loading branch information
johnl committed Apr 5, 2020
1 parent 968a6e0 commit cdd97d8
Show file tree
Hide file tree
Showing 5 changed files with 74 additions and 15 deletions.
5 changes: 4 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -14,4 +14,7 @@ config/secrets.yml
.rspec
public/assets
tmp/
.DS_Store
.DS_Store
docker-mysql/
*.sql
*.lz4
5 changes: 3 additions & 2 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ gem "web-page-parser", :git => "https://github.com/johnl/web-page-parser.git", :
gem "diff-lcs", '~>1.2.5', :require => "diff/lcs"

# Requires xapian library
gem "xapian-fu", :git => "https://github.com/johnl/xapian-fu.git", :ref => 'additional-flags'
#gem "xapian-fu", "~> 1.6.0"
#gem "xapian-fu", :git => "https://github.com/johnl/xapian-fu.git", :ref => 'additional-flags'
gem "xapian-fu", "~> 1.7.0"
gem "xapian-ruby"

gem "nokogiri"
Expand All @@ -27,4 +27,5 @@ group :development do
gem "sqlite3", "~> 1.3.13"
gem "rspec-core"
gem "rspec-rails"
gem "ruby-prof"
end
14 changes: 5 additions & 9 deletions Gemfile.lock
Original file line number Diff line number Diff line change
Expand Up @@ -16,13 +16,6 @@ GIT
htmlentities (~> 4.3)
nokogiri (~> 1.9.1)

GIT
remote: https://github.com/johnl/xapian-fu.git
revision: 84fb6a37eecc90bcd8b353ccea70158fab0dcd6c
ref: additional-flags
specs:
xapian-fu (1.6.0)

GEM
remote: https://rubygems.org/
specs:
Expand Down Expand Up @@ -86,7 +79,7 @@ GEM
mini_mime (1.0.2)
mini_portile2 (2.4.0)
minitest (5.13.0)
mysql2 (0.5.2)
mysql2 (0.5.3)
nio4r (2.5.2)
nokogiri (1.9.1)
mini_portile2 (~> 2.4.0)
Expand Down Expand Up @@ -146,6 +139,7 @@ GEM
rspec-mocks (~> 3.9.0)
rspec-support (~> 3.9.0)
rspec-support (3.9.0)
ruby-prof (1.3.1)
sass (3.7.4)
sass-listen (~> 4.0.0)
sass-listen (4.0.0)
Expand All @@ -171,6 +165,7 @@ GEM
tzinfo (1.2.5)
thread_safe (~> 0.1)
will_paginate (3.2.1)
xapian-fu (1.7.0)
xapian-ruby (1.4.9)

PLATFORMS
Expand All @@ -188,11 +183,12 @@ DEPENDENCIES
responders (~> 2.0)
rspec-core
rspec-rails
ruby-prof
sass-rails (~> 5.0)
sqlite3 (~> 1.3.13)
web-page-parser!
will_paginate (~> 3.0)
xapian-fu!
xapian-fu (~> 1.7.0)
xapian-ruby

BUNDLED WITH
Expand Down
9 changes: 7 additions & 2 deletions app/models/news_article_version/xapian_indexing.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,13 +39,18 @@ def xapian_db
title: { type: String },
text: { type: String, index: :without_field_names }
}
@xapian_db = XapianFu::XapianDb.new(dir: xapian_db_path, create: true,
@xapian_db = XapianFu::XapianDb.new(dir: xapian_db_path, create: true, stopper_strategy: :all,
fields: fields, index_positions: false, spelling: false,
stemmer: false,
additional_flag: Xapian::DB_NO_SYNC)
end

# Sets the directory used for the Xapian database, overriding the default
# that xapian_db_path would otherwise return.
#
# path - filesystem path of the Xapian database directory.
def xapian_db_path=(path)
@xapian_db_path = path
end

# Returns the directory of the Xapian database: the value assigned through
# xapian_db_path= when one has been set, otherwise the default
# 'xapian/news_article_versions' directory under Rails.root.
def xapian_db_path
# NOTE(review): the result of this bare File.join is discarded; it appears
# to be the pre-change line of the diff rather than live code — confirm.
File.join(Rails.root, 'xapian/news_article_versions')
# Memoizes the default so an explicitly assigned path takes precedence.
@xapian_db_path ||= File.join(Rails.root, 'xapian/news_article_versions')
end

def xapian_rebuild(options = {})
Expand Down
56 changes: 55 additions & 1 deletion spec/models/news_article_version_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
describe NewsArticleVersion do

before(:each) do
NewsArticleVersion.xapian_db_path = File.join(Rails.root, 'xapian/news_article_versions-' + SecureRandom.uuid)

@valid_attributes = {
:title => 'PM Brown plays down expenses row',
:source => 'bbc',
Expand All @@ -13,11 +15,63 @@
@more_valid_attributes = @valid_attributes.merge({ :guid => '7984712' })
@expenses_row_article = @valid_attributes
end

# An article saved twice should carry version numbers 0 and 1.
it "should increment the version number for each new version" do
  article = a_news_article_with_two_versions
  version_numbers = article.versions.map(&:version)
  version_numbers.max.should == 1
  version_numbers.min.should == 0
end

# Specs for the Xapian full-text index built from article versions. Each
# example indexes the first version of a two-version article and then
# inspects either search results or the raw terms stored in the database.
describe "xapian index" do
  # Exercises the query syntax against a single indexed version: plain
  # terms, quoted phrases, the +/- operators and the title:/source: prefixes.
  it "should index article versions" do
    na = a_news_article_with_two_versions
    v = na.versions.first

    xapdoc = v.to_xapian_doc
    db = NewsArticleVersion.xapian_db
    xapdoc.db = db
    db << xapdoc
    db.flush # flush pending writes before running the searches below

    NewsArticleVersion.xapian_search("Gordon Brown").first.should eq v
    NewsArticleVersion.xapian_search("'Gordon Brown'").first.should eq v
    NewsArticleVersion.xapian_search("Gordon -Brown").first.should eq nil
    NewsArticleVersion.xapian_search("eternal washing").first.should eq nil
    NewsArticleVersion.xapian_search("title:transparency").first.should eq nil
    NewsArticleVersion.xapian_search("transparency").first.should eq v
    # NOTE(review): this line used to be annotated "stemmed"; confirm the
    # match does not rely on stemming if the index is built without a stemmer.
    NewsArticleVersion.xapian_search("residence").first.should eq v
    NewsArticleVersion.xapian_search("title:brown").first.should eq v
    NewsArticleVersion.xapian_search("+source:bbc brown").first.should eq v
    NewsArticleVersion.xapian_search("-source:bbc brown").first.should eq nil
  end

  # The source value must be stored as the exact prefixed term
  # ("XSOURCEbbc"), with no additional "Z"-prefixed variant of it.
  it "should not stem the source " do
    na = a_news_article_with_two_versions
    v = na.versions.first
    xapdoc = v.to_xapian_doc
    db = NewsArticleVersion.xapian_db
    xapdoc.db = db
    NewsArticleVersion.xapian_db << xapdoc

    terms = db.rw.allterms.map(&:term)
    terms.include?("XSOURCEbbc").should be true
    terms.include?("ZXSOURCEbbc").should be false
  end

  # The stop word "who" must not appear among the indexed terms.
  it "should stop all stop words" do
    na = a_news_article_with_two_versions
    v = na.versions.first
    xapdoc = v.to_xapian_doc
    db = NewsArticleVersion.xapian_db
    xapdoc.db = db
    NewsArticleVersion.xapian_db << xapdoc

    terms = db.rw.allterms.map(&:term)
    terms.include?("who").should be false
  end
end
end

0 comments on commit cdd97d8

Please sign in to comment.