fixes to the analytics script and a new script to generate UA analytics for documentation
mdorf committed Nov 14, 2023
1 parent 7429289 commit e8fa020
Showing 4 changed files with 190 additions and 26 deletions.
6 changes: 6 additions & 0 deletions Gemfile
@@ -3,6 +3,12 @@ source 'https://rubygems.org'
gemspec

gem 'ffi'

# This is needed temporarily to pull the Google Universal Analytics (UA)
# data and store it in a file. See (bin/generate_ua_analytics_file.rb)
# The ability to pull this data from Google will cease on July 1, 2024
gem "google-apis-analytics_v3"

gem 'google-analytics-data'
gem 'mail', '2.6.6'
gem 'multi_json'
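As an aside, here is a minimal sketch of the NcboCron settings that the new UA script reads. The setting names mirror those referenced in bin/generate_ua_analytics_file.rb below; the NcboCron.config block form and every value shown are placeholders assumed for illustration, not part of this commit.

# Hypothetical config/config.rb excerpt -- placeholder values only.
NcboCron.config do |config|
  config.redis_host                              = "localhost"
  config.redis_port                              = 6379
  config.analytics_app_name                      = "bioportal-analytics"
  config.analytics_app_version                   = "1.0"
  config.analytics_profile_id                    = "ga:12345678"        # UA view (profile) ID
  config.analytics_start_date                    = "2013-10-01"
  config.analytics_filter_str                    = ""                   # optional extra ga:pagePath filter
  config.analytics_service_account_email_address = "svc@example.iam.gserviceaccount.com"
  config.analytics_path_to_ua_key_file           = "config/ua_key.p12"  # PKCS12 service-account key
  config.analytics_path_to_ua_data_file          = "data/ua_analytics.json"
end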
35 changes: 29 additions & 6 deletions Gemfile.lock
@@ -15,7 +15,7 @@ GIT

GIT
remote: https://github.com/ncbo/ncbo_annotator.git
revision: 067104ae94c0e9d058cfbf419364fbf03f34de43
revision: ebbb7a3c28ecde49c261290bec34ab082490a271
branch: develop
specs:
ncbo_annotator (0.0.1)
@@ -26,7 +26,7 @@

GIT
remote: https://github.com/ncbo/ontologies_linked_data.git
revision: ff10e5ff4103431da1aec3cbbaebc57547c0035c
revision: 5600020a8017cb4901e719f577032b0be6a14949
branch: develop
specs:
ontologies_linked_data (0.0.1)
@@ -76,14 +76,15 @@ GEM
multi_json (~> 1.0)
addressable (2.8.5)
public_suffix (>= 2.0.2, < 6.0)
base64 (0.1.1)
base64 (0.2.0)
bcrypt (3.1.19)
builder (3.2.4)
coderay (1.1.3)
concurrent-ruby (1.2.2)
connection_pool (2.4.1)
cube-ruby (0.0.3)
dante (0.2.0)
declarative (0.0.20)
docile (1.4.0)
domain_name (0.5.20190701)
unf (>= 0.0.5, < 1.0.0)
@@ -113,6 +114,17 @@ GEM
google-analytics-data-v1beta (0.9.0)
gapic-common (>= 0.20.0, < 2.a)
google-cloud-errors (~> 1.0)
google-apis-analytics_v3 (0.13.0)
google-apis-core (>= 0.11.0, < 2.a)
google-apis-core (0.11.2)
addressable (~> 2.5, >= 2.5.1)
googleauth (>= 0.16.2, < 2.a)
httpclient (>= 2.8.1, < 3.a)
mini_mime (~> 1.0)
representable (~> 3.0)
retriable (>= 2.0, < 4.a)
rexml
webrick
google-cloud-core (1.6.0)
google-cloud-env (~> 1.0)
google-cloud-errors (~> 1.0)
@@ -126,7 +138,7 @@ GEM
google-protobuf (~> 3.14)
googleapis-common-protos-types (~> 1.2)
grpc (~> 1.27)
googleapis-common-protos-types (1.9.0)
googleapis-common-protos-types (1.10.0)
google-protobuf (~> 3.18)
googleauth (1.8.1)
faraday (>= 0.17.3, < 3.a)
@@ -147,15 +159,16 @@ GEM
http-accept (1.7.0)
http-cookie (1.0.5)
domain_name (~> 0.5)
httpclient (2.8.3)
i18n (0.9.5)
concurrent-ruby (~> 1.0)
json (2.6.3)
json_pure (2.6.3)
jwt (2.7.1)
launchy (2.5.2)
addressable (~> 2.8)
libxml-ruby (4.1.1)
logger (1.5.3)
libxml-ruby (4.1.2)
logger (1.6.0)
macaddr (1.7.2)
systemu (~> 2.6.5)
mail (2.6.6)
@@ -164,6 +177,7 @@ GEM
mime-types (3.5.1)
mime-types-data (~> 3.2015)
mime-types-data (3.2023.1003)
mini_mime (1.1.5)
minitest (4.7.5)
mlanett-redis-lock (0.2.7)
redis
@@ -191,11 +205,16 @@ GEM
redis-client (>= 0.17.0)
redis-client (0.18.0)
connection_pool
representable (3.2.0)
declarative (< 0.1.0)
trailblazer-option (>= 0.1.1, < 0.2.0)
uber (< 0.2.0)
rest-client (2.1.0)
http-accept (>= 1.7.0, < 2.0)
http-cookie (>= 1.0.2, < 2.0)
mime-types (>= 1.16, < 4.0)
netrc (~> 0.8)
retriable (3.1.2)
rexml (3.2.6)
rsolr (2.5.0)
builder (>= 2.1.2)
@@ -224,13 +243,16 @@ GEM
systemu (2.6.5)
test-unit-minitest (0.9.1)
minitest (~> 4.7)
trailblazer-option (0.1.2)
tzinfo (2.0.6)
concurrent-ruby (~> 1.0)
uber (0.1.0)
unf (0.1.4)
unf_ext
unf_ext (0.0.8.2)
uuid (2.3.9)
macaddr (~> 1.0)
webrick (1.8.1)

PLATFORMS
ruby
@@ -244,6 +266,7 @@ DEPENDENCIES
ffi
goo!
google-analytics-data
google-apis-analytics_v3
mail (= 2.6.6)
minitest (< 5.0)
multi_json
126 changes: 126 additions & 0 deletions bin/generate_ua_analytics_file.rb
@@ -0,0 +1,126 @@
require 'logger'
require 'google/apis/analytics_v3'
require 'google/api_client/auth/key_utils'

module NcboCron
module Models

class OntologyAnalyticsUA

def initialize(logger)
@logger = logger
end

def run
redis = Redis.new(:host => NcboCron.settings.redis_host, :port => NcboCron.settings.redis_port)
ontology_analytics = fetch_ontology_analytics
File.open(NcboCron.settings.analytics_path_to_ua_data_file, 'w') do |f|
f.write(ontology_analytics.to_json)
end
end

def fetch_ontology_analytics
google_client = authenticate_google
aggregated_results = Hash.new
start_year = Date.parse(NcboCron.settings.analytics_start_date).year || 2013
ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym}
# ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"]
filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? "" : ";#{NcboCron.settings.analytics_filter_str}"

ont_acronyms.each do |acronym|
max_results = 10000
num_results = 10000
start_index = 1
results = nil

loop do
results = google_client.get_ga_data(
ids = NcboCron.settings.analytics_profile_id,
start_date = NcboCron.settings.analytics_start_date,
end_date = Date.today.to_s,
metrics = 'ga:pageviews',
{
dimensions: 'ga:pagePath,ga:year,ga:month',
filters: "ga:pagePath=~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}",
start_index: start_index,
max_results: max_results
}
)
results.rows ||= []
start_index += max_results
num_results = results.rows.length
@logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}"
@logger.flush

results.rows.each do |row|
if aggregated_results.has_key?(acronym)
# year
if aggregated_results[acronym].has_key?(row[1].to_i)
# month
if aggregated_results[acronym][row[1].to_i].has_key?(row[2].to_i)
aggregated_results[acronym][row[1].to_i][row[2].to_i] += row[3].to_i
else
aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i
end
else
aggregated_results[acronym][row[1].to_i] = Hash.new
aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i
end
else
aggregated_results[acronym] = Hash.new
aggregated_results[acronym][row[1].to_i] = Hash.new
aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i
end
end

if num_results < max_results
# fill in non-existent years
(start_year..Date.today.year).each do |y|
aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil?
aggregated_results[acronym][y] = Hash.new unless aggregated_results[acronym].has_key?(y)
end
# fill in non-existent months with zeros
(1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n] = 0 unless v.has_key?(n) } }
break
end
end
end

@logger.info "Completed Universal Analytics pull..."
@logger.flush

aggregated_results
end

def authenticate_google
Google::Apis::ClientOptions.default.application_name = NcboCron.settings.analytics_app_name
Google::Apis::ClientOptions.default.application_version = NcboCron.settings.analytics_app_version
# enable google api call retries in order to
# mitigate analytics processing failures due to occasional google api timeouts and other outages
Google::Apis::RequestOptions.default.retries = 5
# uncomment to enable logging for debugging purposes
# Google::Apis.logger.level = Logger::DEBUG
# Google::Apis.logger = @logger
client = Google::Apis::AnalyticsV3::AnalyticsService.new
key = Google::APIClient::KeyUtils::load_from_pkcs12(NcboCron.settings.analytics_path_to_ua_key_file, 'notasecret')
client.authorization = Signet::OAuth2::Client.new(
:token_credential_uri => 'https://accounts.google.com/o/oauth2/token',
:audience => 'https://accounts.google.com/o/oauth2/token',
:scope => 'https://www.googleapis.com/auth/analytics.readonly',
:issuer => NcboCron.settings.analytics_service_account_email_address,
:signing_key => key
).tap { |auth| auth.fetch_access_token! }
client
end
end
end
end

require 'ontologies_linked_data'
require 'goo'
require 'ncbo_annotator'
require 'ncbo_cron/config'
require_relative '../config/config'
ontology_analytics_log_path = File.join("logs", "ontology-analytics-ua.log")
ontology_analytics_logger = Logger.new(ontology_analytics_log_path)
NcboCron::Models::OntologyAnalyticsUA.new(ontology_analytics_logger).run
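For reference, a hedged sketch of how the JSON file written by this script could be consumed. The file path and acronym are placeholders; the shape (acronym => year => month => pageview count, with keys serialized as strings) follows the aggregation code above.

# Illustrative reader for the generated UA data file -- path and acronym are placeholders.
require 'json'

ua = JSON.parse(File.read('data/ua_analytics.json'))
# e.g. { "SNOMEDCT" => { "2022" => { "1" => 4210, ..., "12" => 3975 }, ... }, ... }
puts ua.fetch('SNOMEDCT', {}).fetch('2022', {}).fetch('6', 0)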
49 changes: 29 additions & 20 deletions lib/ncbo_cron/ontology_analytics.rb
@@ -38,7 +38,6 @@ def fetch_ontology_analytics
@logger.flush
ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym}
# ont_acronyms = ["NCIT", "SNOMEDCT", "MEDDRA"]

@logger.info "Authenticating with the Google Analytics Endpoint..."
@logger.flush
google_client = authenticate_google
@@ -137,39 +136,49 @@ def fetch_ontology_analytics
break if num_results < max_results
end # loop
end # ont_acronyms
@logger.info "Refresh complete, merging GA4 and UA data..."
@logger.flush
full_data = merge_ga4_ua_data(aggregated_results)
@logger.info "Merged"
@logger.info "Refresh complete"
@logger.flush
full_data = merge_and_fill_missing_data(aggregated_results)
end # Benchmark.realtime
@logger.info "Completed Google Analytics refresh in #{(time/60).round(1)} minutes."
@logger.flush
full_data
end

def merge_ga4_ua_data(ga4_data)
ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file)
ua_data = JSON.parse(ua_data_file)
ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s
ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s

# add up hits for June of 2023 (the only intersecting month between UA and GA4)
ua_data.each do |acronym, _|
if ga4_data.has_key?(acronym)
if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month)
ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] +=
ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month]
# delete data for June of 2023 from ga4_data to avoid overwriting when merging
ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month)
def merge_and_fill_missing_data(ga4_data)
ua_data = {}

if File.exist?(NcboCron.settings.analytics_path_to_ua_data_file) &&
!File.zero?(NcboCron.settings.analytics_path_to_ua_data_file)
@logger.info "Merging GA4 and UA data..."
@logger.flush
ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file)
ua_data = JSON.parse(ua_data_file)
ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s
ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s

# add up hits for June of 2023 (the only intersecting month between UA and GA4)
ua_data.each do |acronym, _|
if ga4_data.has_key?(acronym)
if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month)
ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] +=
ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month]
# delete data for June of 2023 from ga4_data to avoid overwriting when merging
ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month)
end
end
end
end

# merge ua and ga4 data
merged_data = ua_data.deep_merge(ga4_data)
# fill missing years and months
@logger.info "Filling in missing years data..."
@logger.flush
fill_missing_data(merged_data)
# sort acronyms, years and months
@logger.info "Sorting final data..."
@logger.flush
sort_ga_data(merged_data)
end

@@ -221,4 +230,4 @@ def deep_merge(second)
# # ontology_analytics_logger = Logger.new(ontology_analytics_log_path)
# ontology_analytics_logger = Logger.new(STDOUT)
# NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run
# # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *'
# ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *'
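To make the merge behavior above concrete, here is a small hedged illustration of the intended result for the single month present in both sources (June 2023). The figures are invented, and plain Hash#merge is used on the innermost hash so the snippet stays self-contained; the real code uses the deep_merge helper defined elsewhere in this file.

# Toy data only -- not real analytics figures.
ua_data  = { "NCIT" => { "2023" => { "5" => 120, "6" => 80 } } }  # Universal Analytics
ga4_data = { "NCIT" => { "2023" => { "6" => 40, "7" => 95 } } }   # Google Analytics 4

# June 2023 hits are summed into the UA hash and removed from the GA4 hash
# before merging, mirroring merge_and_fill_missing_data above.
ua_data["NCIT"]["2023"]["6"] += ga4_data["NCIT"]["2023"].delete("6")

merged = ua_data["NCIT"]["2023"].merge(ga4_data["NCIT"]["2023"])
# => { "5" => 120, "6" => 120, "7" => 95 }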
