Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Bug/#1767 ga import #1792

Closed
wants to merge 2 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .rubocop.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ AllCops:
- 'app/services/report.rb'
- 'app/services/work_loader.rb'
- 'app/services/hyrax/actor_factory.rb'
- 'app/services/hyrax/user_stat_importer.rb'
- 'spec/services/hyrax/actor_factory_spec.rb'
- 'db/**/*'
- 'lib/tasks/batch.rake'
Expand Down
152 changes: 152 additions & 0 deletions app/services/hyrax/user_stat_importer.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,152 @@
module Hyrax
  # Cache work view, file view & file download stats for all users.
  # This is called by 'rake hyrax:stats:user_stats'.
  class UserStatImporter
    # Lightweight snapshot of a user row so we don't hold full AR objects
    # while iterating; :last_stats_update drives the processing order.
    UserRecord = Struct.new("UserRecord", :id, :user_key, :last_stats_update)

    # @param options [Hash]
    # @option options [Boolean] :verbose also echo log output to STDOUT
    # @option options [Boolean] :logging write progress messages to the Rails log
    # @option options [#to_f] :delay_secs seconds to sleep between analytics calls (rate limiting)
    # @option options [#to_i] :number_of_retries attempts per analytics call before giving up
    def initialize(options = {})
      if options[:verbose]
        stdout_logger = Logger.new(STDOUT)
        stdout_logger.level = Logger::INFO
        Rails.logger.extend(ActiveSupport::Logger.broadcast(stdout_logger))
      end
      @logging = options[:logging]
      @delay_secs = options[:delay_secs].to_f
      @number_of_retries = options[:number_of_retries].to_i
    end

    delegate :depositor_field, to: DepositSearchBuilder

    # Walks every user (stalest cache first) and persists per-day view/download
    # totals for their files and works since the last cached date.
    def import
      log_message('Begin import of User stats.')

      sorted_users.each do |user|
        start_date = date_since_last_cache(user)
        # This user has already been processed today; continue without delay.
        next if start_date.to_date >= Time.zone.today

        stats = {}

        process_files(stats, user, start_date)
        process_works(stats, user, start_date)
        create_or_update_user_stats(stats, user)
      end
      log_message('User stats import complete.')
    end

    # Returns an array of users sorted by the date of their last stats update. Users that have not been recently updated
    # will be at the top of the array.
    def sorted_users
      users = []
      ::User.find_each do |user|
        users.push(UserRecord.new(user.id, user.user_key, date_since_last_cache(user)))
      end
      users.sort_by(&:last_stats_update)
    end

    private

    # Accumulates daily view and download counts for every FileSet the user
    # deposited into +stats+ (keyed by date string).
    def process_files(stats, user, start_date)
      file_ids_for_user(user).each do |file_id|
        file = ::FileSet.find(file_id)
        view_stats = rescue_and_retry("Retried FileViewStat on #{user} for file #{file_id} too many times.") { FileViewStat.statistics(file, start_date, user.id) }
        # rescue_and_retry returns nil when retries are exhausted; only tally real results.
        # (Was `unless (view_stats.blank? || view_stats)`, which is always true, so
        # stats were never tallied — every user got all-zero counts.)
        stats = tally_results(view_stats, :views, stats) unless view_stats.blank?
        delay
        dl_stats = rescue_and_retry("Retried FileDownloadStat on #{user} for file #{file_id} too many times.") { FileDownloadStat.statistics(file, start_date, user.id) }
        stats = tally_results(dl_stats, :downloads, stats) unless dl_stats.blank?
        delay
      end
    end

    # Accumulates daily view counts for every work the user deposited.
    def process_works(stats, user, start_date)
      work_ids_for_user(user).each do |work_id|
        work = Hyrax::WorkRelation.new.find(work_id)
        work_stats = rescue_and_retry("Retried WorkViewStat on #{user} for work #{work_id} too many times.") { WorkViewStat.statistics(work, start_date, user.id) }
        # Same fix as process_files: skip only blank results.
        stats = tally_results(work_stats, :work_views, stats) unless work_stats.blank?
        delay
      end
    end

    # Throttle between analytics calls to stay under API rate limits.
    def delay
      sleep @delay_secs
    end

    # Yields the block, retrying up to @number_of_retries total attempts with a
    # delay between them. Logs +fail_message+ and returns nil once exhausted.
    def rescue_and_retry(fail_message)
      retry_count = 0
      begin
        return yield
      rescue StandardError => e
        retry_count += 1
        if retry_count < @number_of_retries
          delay
          retry
        else
          log_message fail_message
          log_message "Last exception #{e}"
        end
      end
    end

    # Day after the user's most recent cached stat, or the configured
    # analytics start date when the user has never been cached.
    def date_since_last_cache(user)
      last_cached_stat = UserStat.where(user_id: user.id).order(date: :asc).last

      if last_cached_stat
        last_cached_stat.date + 1.day
      else
        Hyrax.config.analytic_start_date
      end
    end

    # Solr ids of all FileSets deposited by +user+.
    def file_ids_for_user(user)
      ids = []
      ::FileSet.search_in_batches("#{depositor_field}:\"#{user.user_key}\"", fl: "id") do |group|
        ids.concat group.map { |doc| doc["id"] }
      end
      ids
    end

    # Solr ids of all works deposited by +user+.
    def work_ids_for_user(user)
      ids = []
      Hyrax::WorkRelation.new.search_in_batches("#{depositor_field}:\"#{user.user_key}\"", fl: "id") do |group|
        ids.concat group.map { |doc| doc["id"] }
      end
      ids
    end

    # For each date, add the view and download counts for this file to the view & download sub-totals for that day.
    # The resulting hash will look something like this: {"2014-11-30 00:00:00 UTC" => {:views=>2, :downloads=>5},
    # "2014-12-01 00:00:00 UTC" => {:views=>4, :downloads=>4}}
    def tally_results(current_stats, stat_name, total_stats)
      current_stats.each do |stats|
        # Exclude the stats from today since it will only be a partial day's worth of data
        break if stats.date == Time.zone.today

        date_key = stats.date.to_s
        old_count = total_stats[date_key] ? total_stats[date_key].fetch(stat_name) { 0 } : 0
        new_count = old_count + stats.public_send(stat_name)

        old_values = total_stats[date_key] || {}
        total_stats.store(date_key, old_values)
        total_stats[date_key].store(stat_name, new_count)
      end
      total_stats
    end

    # Upserts one UserStat row per date in +stats+ for +user+.
    def create_or_update_user_stats(stats, user)
      stats.each do |date_string, data|
        date = Time.zone.parse(date_string)

        user_stat = UserStat.where(user_id: user.id, date: date).first_or_initialize(user_id: user.id, date: date)

        user_stat.file_views = data.fetch(:views, 0)
        user_stat.file_downloads = data.fetch(:downloads, 0)
        user_stat.work_views = data.fetch(:work_views, 0)
        user_stat.save!
      end
    end

    # Progress logging, enabled via options[:logging].
    def log_message(message)
      Rails.logger.info "#{self.class}: #{message}" if @logging
    end
  end
end