diff --git a/.gitignore b/.gitignore
index 9170162c..ccf97ea0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -3,6 +3,8 @@
 config/config.rb
 config/config_*.rb
 config/*.p12
+config/*.json
+data/
 projectFilesBackup/
 .ruby-version
 repo*
diff --git a/Gemfile b/Gemfile
index e2696023..ea3a89ea 100644
--- a/Gemfile
+++ b/Gemfile
@@ -4,7 +4,7 @@ gemspec
 
 gem 'faraday', '~> 1.9'
 gem 'ffi'
-gem "google-apis-analytics_v3"
+gem 'google-analytics-data'
 gem 'mail', '2.6.6'
 gem 'multi_json'
 gem 'oj', '~> 2.0'
diff --git a/Gemfile.lock b/Gemfile.lock
index 76bf6034..05cb7504 100644
--- a/Gemfile.lock
+++ b/Gemfile.lock
@@ -15,7 +15,7 @@ GIT
 
 GIT
   remote: https://github.com/ncbo/ncbo_annotator.git
-  revision: 964f0680799421ab24eddc974d9f2995c6c88734
+  revision: f4aa1c394359500b50dbc6b0b636e9fd6da93275
   branch: master
   specs:
     ncbo_annotator (0.0.1)
@@ -60,7 +60,7 @@ PATH
     ncbo_cron (0.0.1)
       dante
       goo
-      google-apis-analytics_v3
+      google-analytics-data
       mlanett-redis-lock
       multi_json
       ncbo_annotator
@@ -83,7 +83,6 @@ GEM
     connection_pool (2.4.1)
     cube-ruby (0.0.3)
     dante (0.2.0)
-    declarative (0.0.20)
    docile (1.4.0)
     domain_name (0.5.20190701)
       unf (>= 0.0.5, < 1.0.0)
@@ -115,29 +114,55 @@ GEM
     faraday-rack (1.0.0)
     faraday-retry (1.0.3)
     ffi (1.15.5)
-    google-apis-analytics_v3 (0.13.0)
-      google-apis-core (>= 0.11.0, < 2.a)
-    google-apis-core (0.11.0)
-      addressable (~> 2.5, >= 2.5.1)
-      googleauth (>= 0.16.2, < 2.a)
-      httpclient (>= 2.8.1, < 3.a)
-      mini_mime (~> 1.0)
-      representable (~> 3.0)
-      retriable (>= 2.0, < 4.a)
-      rexml
-      webrick
-    googleauth (1.6.0)
+    gapic-common (0.19.1)
+      faraday (>= 1.9, < 3.a)
+      faraday-retry (>= 1.0, < 3.a)
+      google-protobuf (~> 3.14)
+      googleapis-common-protos (>= 1.3.12, < 2.a)
+      googleapis-common-protos-types (>= 1.3.1, < 2.a)
+      googleauth (~> 1.0)
+      grpc (~> 1.36)
+    google-analytics-data (0.4.0)
+      google-analytics-data-v1beta (>= 0.7, < 2.a)
+      google-cloud-core (~> 1.6)
+    google-analytics-data-v1beta (0.8.0)
+      gapic-common (>= 0.19.1, < 2.a)
+      google-cloud-errors (~> 1.0)
+    google-cloud-core (1.6.0)
+      google-cloud-env (~> 1.0)
+      google-cloud-errors (~> 1.0)
+    google-cloud-env (1.6.0)
+      faraday (>= 0.17.3, < 3.0)
+    google-cloud-errors (1.3.1)
+    google-protobuf (3.23.4)
+    google-protobuf (3.23.4-x86_64-darwin)
+    google-protobuf (3.23.4-x86_64-linux)
+    googleapis-common-protos (1.4.0)
+      google-protobuf (~> 3.14)
+      googleapis-common-protos-types (~> 1.2)
+      grpc (~> 1.27)
+    googleapis-common-protos-types (1.7.0)
+      google-protobuf (~> 3.14)
+    googleauth (1.7.0)
       faraday (>= 0.17.3, < 3.a)
       jwt (>= 1.4, < 3.0)
       memoist (~> 0.16)
       multi_json (~> 1.11)
       os (>= 0.9, < 2.0)
       signet (>= 0.16, < 2.a)
+    grpc (1.56.2)
+      google-protobuf (~> 3.23)
+      googleapis-common-protos-types (~> 1.0)
+    grpc (1.56.2-x86_64-darwin)
+      google-protobuf (~> 3.23)
+      googleapis-common-protos-types (~> 1.0)
+    grpc (1.56.2-x86_64-linux)
+      google-protobuf (~> 3.23)
+      googleapis-common-protos-types (~> 1.0)
     htmlentities (4.3.4)
     http-accept (1.7.0)
     http-cookie (1.0.5)
       domain_name (~> 0.5)
-    httpclient (2.8.3)
     i18n (0.9.5)
       concurrent-ruby (~> 1.0)
     json (2.6.3)
@@ -156,7 +181,6 @@ GEM
     mime-types (3.4.1)
       mime-types-data (~> 3.2015)
     mime-types-data (3.2023.0218.1)
-    mini_mime (1.1.2)
     minitest (4.7.5)
     mlanett-redis-lock (0.2.7)
       redis
@@ -174,7 +198,7 @@ GEM
     pry (0.14.2)
       coderay (~> 1.1)
       method_source (~> 1.0)
-    public_suffix (5.0.1)
+    public_suffix (5.0.3)
     rack (3.0.8)
     rack-test (2.1.0)
       rack (>= 1.3)
@@ -185,16 +209,11 @@ GEM
       redis-client (>= 0.9.0)
     redis-client (0.14.1)
       connection_pool
-    representable (3.2.0)
-      declarative (< 0.1.0)
-      trailblazer-option (>= 0.1.1, < 0.2.0)
-      uber (< 0.2.0)
     rest-client (2.1.0)
       http-accept (>= 1.7.0, < 2.0)
       http-cookie (>= 1.0.2, < 2.0)
       mime-types (>= 1.16, < 4.0)
       netrc (~> 0.8)
-    retriable (3.1.2)
     rexml (3.2.5)
     rsolr (2.5.0)
       builder (>= 2.1.2)
@@ -223,16 +242,13 @@ GEM
     systemu (2.6.5)
     test-unit-minitest (0.9.1)
       minitest (~> 4.7)
-    trailblazer-option (0.1.2)
     tzinfo (2.0.6)
       concurrent-ruby (~> 1.0)
-    uber (0.1.0)
     unf (0.1.4)
       unf_ext
     unf_ext (0.0.8.2)
     uuid (2.3.9)
       macaddr (~> 1.0)
-    webrick (1.8.1)
 
 PLATFORMS
   ruby
@@ -246,7 +262,7 @@ DEPENDENCIES
   faraday (~> 1.9)
   ffi
   goo!
-  google-apis-analytics_v3
+  google-analytics-data
   mail (= 2.6.6)
   minitest (< 5.0)
   multi_json
@@ -267,4 +283,4 @@ DEPENDENCIES
   test-unit-minitest
 
 BUNDLED WITH
-   2.4.9
+   2.4.17
diff --git a/config/config.rb.sample b/config/config.rb.sample
index 8d204311..668c7a0c 100644
--- a/config/config.rb.sample
+++ b/config/config.rb.sample
@@ -69,14 +69,14 @@ NcboCron.config do |config|
   config.search_index_all_url = "http://localhost:8983/solr/term_search_core2"
   config.property_search_index_all_url = "http://localhost:8983/solr/prop_search_core2"
 
-  # Google Analytics config
-  config.analytics_service_account_email_address = "123456789999-sikipho0wk8q0atflrmw62dj4kpwoj3c@developer.gserviceaccount.com"
-  config.analytics_path_to_key_file = "config/bioportal-analytics.p12"
-  config.analytics_profile_id = "ga:1234567"
-  config.analytics_app_name = "BioPortal"
-  config.analytics_app_version = "1.0.0"
-  config.analytics_start_date = "2013-10-01"
-  config.analytics_filter_str = "ga:networkLocation!@stanford;ga:networkLocation!@amazon"
+  # Google Analytics GA4 config
+  config.analytics_path_to_key_file = "config/your_analytics_key.json"
+  config.analytics_property_id = "123456789"
+  # path to the Universal Analytics data, which stopped collecting on June 1st, 2023
+  config.analytics_path_to_ua_data_file = "data/your_ua_data.json"
+  # path to the file that will hold your Google Analytics data
+  # this is in addition to storing it in Redis
+  config.analytics_path_to_ga_data_file = "data/your_ga_data.json"
 
   # this is a Base64.encode64 encoded personal access token
   # you need to run Base64.decode64 on it before using it in your code
diff --git a/lib/ncbo_cron/ontology_analytics.rb b/lib/ncbo_cron/ontology_analytics.rb
index 3e4076b4..334da43e 100644
--- a/lib/ncbo_cron/ontology_analytics.rb
+++ b/lib/ncbo_cron/ontology_analytics.rb
@@ -1,12 +1,16 @@
 require 'logger'
-require 'google/apis/analytics_v3'
-require 'google/api_client/auth/key_utils'
+require 'json'
+require 'benchmark'
+require 'google/analytics/data'
+
 
 module NcboCron
   module Models
 
     class OntologyAnalytics
-      ONTOLOGY_ANALYTICS_REDIS_FIELD = "ontology_analytics"
+      ONTOLOGY_ANALYTICS_REDIS_FIELD = 'ontology_analytics'
+      UA_START_DATE = '2013-10-01'
+      GA4_START_DATE = '2023-06-01'
 
       def initialize(logger)
         @logger = logger
@@ -15,103 +19,196 @@ def run
         redis = Redis.new(:host => NcboCron.settings.ontology_analytics_redis_host, :port => NcboCron.settings.ontology_analytics_redis_port)
         ontology_analytics = fetch_ontology_analytics
+        File.open(NcboCron.settings.analytics_path_to_ga_data_file, 'w') do |f|
+          f.write(ontology_analytics.to_json)
+        end
         redis.set(ONTOLOGY_ANALYTICS_REDIS_FIELD, Marshal.dump(ontology_analytics))
       end
 
       def fetch_ontology_analytics
-        google_client = authenticate_google
-        aggregated_results = Hash.new
-        start_year = Date.parse(NcboCron.settings.analytics_start_date).year || 2013
-        ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym}
-        # ont_acronyms = ["NCIT", "ONTOMA", "CMPO", "AEO", "SNOMEDCT"]
-        filter_str = (NcboCron.settings.analytics_filter_str.nil? || NcboCron.settings.analytics_filter_str.empty?) ? "" : ";#{NcboCron.settings.analytics_filter_str}"
-
-        ont_acronyms.each do |acronym|
+        @logger.info "Starting Google Analytics refresh..."
+        @logger.flush
+        full_data = nil
+
+        time = Benchmark.realtime do
           max_results = 10000
-          num_results = 10000
-          start_index = 1
-          results = nil
-
-          loop do
-            results = google_client.get_ga_data(
-              ids = NcboCron.settings.analytics_profile_id,
-              start_date = NcboCron.settings.analytics_start_date,
-              end_date = Date.today.to_s,
-              metrics = 'ga:pageviews',
-              {
-                dimensions: 'ga:pagePath,ga:year,ga:month',
-                filters: "ga:pagePath=~^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$#{filter_str}",
-                start_index: start_index,
-                max_results: max_results
-              }
-            )
-            results.rows ||= []
-            start_index += max_results
-            num_results = results.rows.length
-            @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}"
-            @logger.flush
-
-            results.rows.each do |row|
-              if aggregated_results.has_key?(acronym)
-                # year
-                if aggregated_results[acronym].has_key?(row[1].to_i)
-                  # month
-                  if aggregated_results[acronym][row[1].to_i].has_key?(row[2].to_i)
-                    aggregated_results[acronym][row[1].to_i][row[2].to_i] += row[3].to_i
+          aggregated_results = Hash.new
+
+          @logger.info "Fetching all ontology acronyms from backend..."
+          @logger.flush
+          ont_acronyms = LinkedData::Models::Ontology.where.include(:acronym).all.map {|o| o.acronym}
+          # ont_acronyms = ["NCIT", "SNOMEDCT", "MEDDRA"]
+
+          @logger.info "Authenticating with the Google Analytics Endpoint..."
+          @logger.flush
+          google_client = authenticate_google
+
+          date_range = Google::Analytics::Data::V1beta::DateRange.new(
+            start_date: GA4_START_DATE,
+            end_date: Date.today.to_s
+          )
+          metrics_page_views = Google::Analytics::Data::V1beta::Metric.new(
+            name: "screenPageViews"
+          )
+          dimension_path = Google::Analytics::Data::V1beta::Dimension.new(
+            name: "pagePath"
+          )
+          dimension_year = Google::Analytics::Data::V1beta::Dimension.new(
+            name: "year"
+          )
+          dimension_month = Google::Analytics::Data::V1beta::Dimension.new(
+            name: "month"
+          )
+          string_filter = Google::Analytics::Data::V1beta::Filter::StringFilter.new(
+            match_type: Google::Analytics::Data::V1beta::Filter::StringFilter::MatchType::FULL_REGEXP
+          )
+          filter = Google::Analytics::Data::V1beta::Filter.new(
+            field_name: "pagePath",
+            string_filter: string_filter
+          )
+          filter_expression = Google::Analytics::Data::V1beta::FilterExpression.new(
+            filter: filter
+          )
+          order_year = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new(
+            dimension_name: "year"
+          )
+          orderby_year = Google::Analytics::Data::V1beta::OrderBy.new(
+            desc: false,
+            dimension: order_year
+          )
+          order_month = Google::Analytics::Data::V1beta::OrderBy::DimensionOrderBy.new(
+            dimension_name: "month"
+          )
+          orderby_month = Google::Analytics::Data::V1beta::OrderBy.new(
+            desc: false,
+            dimension: order_month
+          )
+          @logger.info "Fetching GA4 analytics for all ontologies..."
+          @logger.flush
+
+          ont_acronyms.each do |acronym|
+            start_index = 0
+            string_filter.value = "^(\\/ontologies\\/#{acronym})(\\/?\\?{0}|\\/?\\?{1}.*)$"
+
+            loop do
+              request = Google::Analytics::Data::V1beta::RunReportRequest.new(
+                property: "properties/#{NcboCron.settings.analytics_property_id}",
+                metrics: [metrics_page_views],
+                dimension_filter: filter_expression,
+                dimensions: [dimension_path, dimension_year, dimension_month],
+                date_ranges: [date_range],
+                order_bys: [orderby_year, orderby_month],
+                offset: start_index,
+                limit: max_results
+              )
+              response = google_client.run_report request
+
+              response.rows ||= []
+              start_index += max_results
+              num_results = response.rows.length
+              @logger.info "Acronym: #{acronym}, Results: #{num_results}, Start Index: #{start_index}"
+              @logger.flush
+
+              response.rows.each do |row|
+                row_h = row.to_h
+                year_month_hits = row_h[:dimension_values].map.with_index {
+                  |v, i| i > 0 ? v[:value].to_i.to_s : row_h[:metric_values][0][:value].to_i
+                }.rotate(1)
+
+                if aggregated_results.has_key?(acronym)
+                  # year
+                  if aggregated_results[acronym].has_key?(year_month_hits[0])
+                    # month
+                    if aggregated_results[acronym][year_month_hits[0]].has_key?(year_month_hits[1])
+                      aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] += year_month_hits[2]
+                    else
+                      aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2]
+                    end
                   else
-                    aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i
+                    aggregated_results[acronym][year_month_hits[0]] = Hash.new
+                    aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2]
                   end
                 else
-                  aggregated_results[acronym][row[1].to_i] = Hash.new
-                  aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i
+                  aggregated_results[acronym] = Hash.new
+                  aggregated_results[acronym][year_month_hits[0]] = Hash.new
+                  aggregated_results[acronym][year_month_hits[0]][year_month_hits[1]] = year_month_hits[2]
                 end
-              else
-                aggregated_results[acronym] = Hash.new
-                aggregated_results[acronym][row[1].to_i] = Hash.new
-                aggregated_results[acronym][row[1].to_i][row[2].to_i] = row[3].to_i
               end
-            end
+              break if num_results < max_results
+            end # loop
+          end # ont_acronyms
+          @logger.info "Refresh complete, merging GA4 and UA data..."
+          @logger.flush
+          full_data = merge_ga4_ua_data(aggregated_results)
+          @logger.info "Merged"
+          @logger.flush
+        end # Benchmark.realtime
+        @logger.info "Completed Google Analytics refresh in #{(time/60).round(1)} minutes."
+        @logger.flush
+        full_data
+      end
 
-            if num_results < max_results
-              # fill up non existent years
-              (start_year..Date.today.year).each do |y|
-                aggregated_results[acronym] = Hash.new if aggregated_results[acronym].nil?
-                aggregated_results[acronym][y] = Hash.new unless aggregated_results[acronym].has_key?(y)
-              end
-              # fill up non existent months with zeros
-              (1..12).each { |n| aggregated_results[acronym].values.each { |v| v[n] = 0 unless v.has_key?(n) } }
-              break
+      def merge_ga4_ua_data(ga4_data)
+        ua_data_file = File.read(NcboCron.settings.analytics_path_to_ua_data_file)
+        ua_data = JSON.parse(ua_data_file)
+        ua_ga4_intersecting_year = Date.parse(GA4_START_DATE).year.to_s
+        ua_ga4_intersecting_month = Date.parse(GA4_START_DATE).month.to_s
+
+        # add up hits for June of 2023 (the only intersecting month between UA and GA4)
+        ua_data.each do |acronym, _|
+          if ga4_data.has_key?(acronym)
+            if ga4_data[acronym][ua_ga4_intersecting_year].has_key?(ua_ga4_intersecting_month)
+              ua_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month] +=
+                ga4_data[acronym][ua_ga4_intersecting_year][ua_ga4_intersecting_month]
+              # delete data for June of 2023 from ga4_data to avoid overwriting when merging
+              ga4_data[acronym][ua_ga4_intersecting_year].delete(ua_ga4_intersecting_month)
             end
           end
         end
+        # merge ua and ga4 data
+        merged_data = ua_data.deep_merge(ga4_data)
+        # fill missing years and months
+        fill_missing_data(merged_data)
+        # sort acronyms, years and months
+        sort_ga_data(merged_data)
+      end
 
-        @logger.info "Completed ontology analytics refresh..."
-        @logger.flush
+      def fill_missing_data(ga_data)
+        # fill up non existent years
+        start_year = Date.parse(UA_START_DATE).year
+
+        ga_data.each do |acronym, _|
+          (start_year..Date.today.year).each do |y|
+            ga_data[acronym] = Hash.new if ga_data[acronym].nil?
+            ga_data[acronym][y.to_s] = Hash.new unless ga_data[acronym].has_key?(y.to_s)
+          end
+          # fill up non existent months with zeros
+          (1..12).each { |n| ga_data[acronym].values.each { |v| v[n.to_s] = 0 unless v.has_key?(n.to_s) } }
+        end
+      end
 
-        aggregated_results
+      def sort_ga_data(ga_data)
+        ga_data.transform_values { |value|
+          value.transform_values { |val|
+            val.sort_by { |key, _| key.to_i }.to_h
+          }.sort_by { |k, _| k.to_i }.to_h
+        }.sort.to_h
       end
 
       def authenticate_google
-        Google::Apis::ClientOptions.default.application_name = NcboCron.settings.analytics_app_name
-        Google::Apis::ClientOptions.default.application_version = NcboCron.settings.analytics_app_version
-        # enable google api call retries in order to
-        # minigate analytics processing failure due to ocasional google api timeouts and other outages
-        Google::Apis::RequestOptions.default.retries = 5
-        # uncoment to enable logging for debugging purposes
-        # Google::Apis.logger.level = Logger::DEBUG
-        # Google::Apis.logger = @logger
-        client = Google::Apis::AnalyticsV3::AnalyticsService.new
-        key = Google::APIClient::KeyUtils::load_from_pkcs12(NcboCron.settings.analytics_path_to_key_file, 'notasecret')
-        client.authorization = Signet::OAuth2::Client.new(
-          :token_credential_uri => 'https://accounts.google.com/o/oauth2/token',
-          :audience => 'https://accounts.google.com/o/oauth2/token',
-          :scope => 'https://www.googleapis.com/auth/analytics.readonly',
-          :issuer => NcboCron.settings.analytics_service_account_email_address,
-          :signing_key => key
-        ).tap { |auth| auth.fetch_access_token! }
-        client
+        Google::Analytics::Data.analytics_data do |config|
+          config.credentials = NcboCron.settings.analytics_path_to_key_file
+        end
       end
-    end
+    end # class
+
+  end
+end
+
+class ::Hash
+  def deep_merge(second)
+    merger = proc { |key, v1, v2| Hash === v1 && Hash === v2 ? v1.merge(v2, &merger) : v2 }
+    self.merge(second, &merger)
   end
 end
@@ -124,4 +221,4 @@ def authenticate_google
 # # ontology_analytics_logger = Logger.new(ontology_analytics_log_path)
 # ontology_analytics_logger = Logger.new(STDOUT)
 # NcboCron::Models::OntologyAnalytics.new(ontology_analytics_logger).run
-# ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *'
+# # ./bin/ncbo_cron --disable-processing true --disable-pull true --disable-flush true --disable-warmq true --disable-ontologies-report true --disable-mapping-counts true --disable-spam-deletion true --ontology-analytics '14 * * * *'
diff --git a/ncbo_cron.gemspec b/ncbo_cron.gemspec
index ef21761f..c8faa03d 100644
--- a/ncbo_cron.gemspec
+++ b/ncbo_cron.gemspec
@@ -16,7 +16,7 @@ Gem::Specification.new do |gem|
 
   gem.add_dependency("dante")
   gem.add_dependency("goo")
-  gem.add_dependency("google-apis-analytics_v3")
+  gem.add_dependency("google-analytics-data")
  gem.add_dependency("mlanett-redis-lock")
   gem.add_dependency("multi_json")
   gem.add_dependency("ncbo_annotator")
diff --git a/rakelib/purl_management.rake b/rakelib/purl_management.rake
new file mode 100644
index 00000000..58cfadd7
--- /dev/null
+++ b/rakelib/purl_management.rake
@@ -0,0 +1,28 @@
+# Task for updating and adding missing purl for all ontologies
+#
+desc 'Purl Utilities'
+namespace :purl do
+  require 'bundler/setup'
+  # Configure the process for the current cron configuration.
+  require_relative '../lib/ncbo_cron'
+  config_exists = File.exist?(File.expand_path('../../config/config.rb', __FILE__))
+  abort('Please create a config/config.rb file using the config/config.rb.sample as a template') unless config_exists
+  require_relative '../config/config'
+
+  desc 'update purl for all ontologies'
+  task :update_all do
+    purl_client = LinkedData::Purl::Client.new
+    LinkedData::Models::Ontology.all.each do |ont|
+      ont.bring(:acronym)
+      acronym = ont.acronym
+
+      if purl_client.purl_exists(acronym)
+        puts "#{acronym} exists"
+        purl_client.fix_purl(acronym)
+      else
+        puts "#{acronym} DOES NOT exist"
+        purl_client.create_purl(acronym)
+      end
+    end
+  end
+end
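Note on the GA4 query pattern: the new fetch_ontology_analytics builds one RunReportRequest per ontology acronym and pages through results with offset/limit until a short page comes back. A minimal, standalone sketch of that pattern follows; it uses only calls that appear in the patch itself, and the property ID, key-file path, and date range are placeholders, not values from this patch.

# ga4_sketch.rb -- minimal GA4 Data API report with offset/limit paging,
# mirroring the loop in fetch_ontology_analytics above.
# Assumes: gem 'google-analytics-data', a GA4 property, and a service-account JSON key.
require 'date'
require 'google/analytics/data'

client = Google::Analytics::Data.analytics_data do |config|
  config.credentials = 'config/your_analytics_key.json' # placeholder key path
end

page_size = 10_000
offset = 0
all_rows = []

loop do
  request = Google::Analytics::Data::V1beta::RunReportRequest.new(
    property: 'properties/123456789', # placeholder GA4 property ID
    metrics: [Google::Analytics::Data::V1beta::Metric.new(name: 'screenPageViews')],
    dimensions: [Google::Analytics::Data::V1beta::Dimension.new(name: 'pagePath')],
    date_ranges: [Google::Analytics::Data::V1beta::DateRange.new(
      start_date: '2023-06-01', end_date: Date.today.to_s
    )],
    offset: offset,
    limit: page_size
  )
  response = client.run_report(request)
  all_rows.concat(response.rows.to_a)
  # A page shorter than the limit means the report is exhausted.
  break if response.rows.length < page_size
  offset += page_size
end

puts "fetched #{all_rows.length} rows"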
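The year_month_hits construction in the new rows loop is dense: GA4 returns each row's dimension_values as [pagePath, year, month] and metric_values as [pageviews], and the map.with_index/rotate(1) pair reshapes that into [year, month, hits]. A worked example with a hypothetical row:

# Illustrative input shaped like row.to_h in the patch (hypothetical values).
row_h = {
  dimension_values: [{ value: '/ontologies/NCIT' }, { value: '2023' }, { value: '07' }],
  metric_values:    [{ value: '42' }]
}

year_month_hits = row_h[:dimension_values].map.with_index { |v, i|
  # index 0 (pagePath) is replaced by the metric count; year/month are
  # normalized to integer-like strings ("07" -> "7")
  i > 0 ? v[:value].to_i.to_s : row_h[:metric_values][0][:value].to_i
}.rotate(1)

p year_month_hits # => ["2023", "7", 42]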
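The ::Hash#deep_merge monkey-patch at the bottom of ontology_analytics.rb merges nested hashes recursively, with the argument winning on conflicting leaves; that is why merge_ga4_ua_data first folds the overlapping June 2023 counts into ua_data and deletes them from ga4_data before merging. A small illustration with made-up counts:

# Same merger as the patch: recurse while both sides are hashes, otherwise take v2.
class ::Hash
  def deep_merge(second)
    merger = proc { |_key, v1, v2| Hash === v1 && Hash === v2 ? v1.merge(v2, &merger) : v2 }
    merge(second, &merger)
  end
end

ua  = { 'NCIT' => { '2023' => { '5' => 120, '6' => 80 } } }
ga4 = { 'NCIT' => { '2023' => { '6' => 45, '7' => 60 } } }

p ua.deep_merge(ga4)
# => {"NCIT"=>{"2023"=>{"5"=>120, "6"=>45, "7"=>60}}}
# On the shared "6" leaf the GA4 value wins outright; without the patch's
# add-and-delete step, June 2023 UA hits would be overwritten instead of summed.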