Skip to content

Commit

Permalink
Create and use new model to store nested retweets metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
peter-hank authored and thatandromeda committed Mar 24, 2020
1 parent 4b6ad81 commit 70577c2
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 38 deletions.
41 changes: 25 additions & 16 deletions app/models/data_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@

class DataSet < ApplicationRecord
belongs_to :cohort

store_accessor :top_retweets
has_many :retweets

attr_readonly :index_name
before_create :add_index_name
Expand Down Expand Up @@ -59,9 +58,17 @@ def update_aggregates
top_urls: MetadataHarvester.new(:urls, all_tweets).harvest,
top_words: MetadataHarvester.new(:words, all_tweets).harvest,
top_mentions: MetadataHarvester.new(:mentions, all_tweets).harvest,
top_sources: MetadataHarvester.new(:sources, all_tweets).harvest,
top_retweets: MetadataHarvester.new(:retweets, all_tweets).harvest
top_sources: MetadataHarvester.new(:sources, all_tweets).harvest
)
# Nested retweets goes to their own table
MetadataHarvester.new(:retweets, all_tweets).harvest.each do |text, retweet|
Retweet.create!(
data_set: self,
text: text,
count: retweet[:count],
link: retweet[:link]
)
end
end

def ingest_data
Expand Down Expand Up @@ -98,34 +105,28 @@ def store_data(tweets)
def self.aggregate(ids)
keys = %i[hashtags top_urls top_words top_mentions top_sources top_retweets]
data_sets = self.where(id: ids)

retval = {}

keys.each do |key|
# Keep only the data above our thresholds.
if key == :top_retweets
# Accumulate data from all datasets in scope.
data = data_sets.pluck(key)
.reduce ({}) do |first, second|
data = data_sets.map(&:top_retweets)
.flatten(1)
.reduce({}) do |first, second|
first.merge(second) do |_, a, b|
a = eval(a)
b = eval(b)
{ count: a[:count].to_i + b[:count].to_i, link: a[:link] }
end
end

data.each do |k, v|
if v.is_a?(String)
data[k] = eval(data[k])
data[k][:count] = data[k][:count].to_i
end
end
min_count = data.map(&:count).sort.last(Extractor::TOP_N)[0]
min_count = data.map { |_k, v| v[:count] }.sort.last(Extractor::TOP_N)[0]
data.reject! { |k, v| v[:count] < [min_count, Extractor::THRESHOLD].max }
else
# Accumulate data from all datasets in scope.
data = data_sets.pluck(key)
.map { |h| h.transform_values!(&:to_i) }
.reduce ({}) do |first, second|
.reduce({}) do |first, second|
first.merge(second) { |_, a, b| a + b }
end

Expand All @@ -139,6 +140,14 @@ def self.aggregate(ids)
retval
end

# Rebuilds the legacy top_retweets hash shape from the associated
# Retweet rows (which replaced the former hstore column), so existing
# callers and serializers keep working unchanged:
#   { tweet_text => { count: Integer, link: String } }
#
# Returns a Hash mapping retweet text to its count/link metadata.
def top_retweets
  # each_with_object replaces the original map-for-side-effects loop,
  # which discarded the mapped array and mutated an external hash.
  retweets.each_with_object({}) do |retweet, top|
    top[retweet.text] = { count: retweet.count, link: retweet.link }
  end
end

private

def all_tweets
Expand Down
20 changes: 20 additions & 0 deletions app/models/retweet.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# == Schema Information
#
# Table name: retweets
#
# id :bigint not null, primary key
# count :integer
# link :string
# text :text
# created_at :datetime not null
# updated_at :datetime not null
# data_set_id :bigint
#
# Indexes
#
# index_retweets_on_data_set_id (data_set_id)
#

# Stores one aggregated retweet belonging to a DataSet: the retweet text,
# its occurrence count, and a link to a representative status. These rows
# replace the nested hash previously stored on data_sets, and
# DataSet#top_retweets reassembles the legacy hash shape from them.
class Retweet < ApplicationRecord
  belongs_to :data_set
end
12 changes: 12 additions & 0 deletions db/migrate/20200320182736_create_retweets.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Creates the retweets table that stores per-DataSet retweet metadata
# (text, occurrence count, representative status link), replacing the
# nested top_retweets hash previously kept on data_sets.
class CreateRetweets < ActiveRecord::Migration[5.2]
  def change
    create_table :retweets do |t|
      t.text :text
      t.integer :count
      t.string :link
      # foreign_key: true adds a DB-level constraint so retweet rows
      # cannot reference a missing data_set (t.references alone only
      # creates the column and index, not the constraint).
      t.references :data_set, foreign_key: true

      t.timestamps
    end
  end
end
12 changes: 11 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2020_03_13_181803) do
ActiveRecord::Schema.define(version: 2020_03_20_182736) do

# These are extensions that must be enabled in order to support this database
enable_extension "hstore"
Expand Down Expand Up @@ -56,6 +56,16 @@
t.index ["cohort_id"], name: "index_data_sets_on_cohort_id"
end

create_table "retweets", force: :cascade do |t|
t.text "text"
t.integer "count"
t.string "link"
t.bigint "data_set_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["data_set_id"], name: "index_retweets_on_data_set_id"
end

create_table "search_queries", force: :cascade do |t|
t.boolean "active"
t.text "description"
Expand Down
24 changes: 23 additions & 1 deletion spec/factories.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
end
end

factory :retweet do
text { 'retweet text' }
count { 3 }
link { 'http://twitter.com/status/xxxxxx' }
end

factory :cohort do
twitter_ids { [14706139] } # @BKCHarvard's twitter id
description { 'Berkman Klein Center for Internet & Society' }
Expand Down Expand Up @@ -47,11 +53,27 @@
num_tweets { 200 }
num_retweets { 10 }
top_mentions { { 'plato'=>'5', 'aristotle'=>'7' } }
top_retweets { { 'first tweet test' => { count: 2, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 3, link: 'https://secondtweettext.com' }} }
top_sources { { 'godeysladysbook.com'=>'7', 'twitter.com'=>'4' } }
top_urls { { 'www.cnn.com/a_story'=>'4', 'http://bitly.com/98K8eH'=>'8'} }
top_words { { 'stopword'=>'5', 'moose'=>'74' } }
hashtags { { 'llamas'=>'7', 'octopodes'=>'24' } }

after(:create) do |data_set|
create(
:retweet,
data_set: data_set,
text: 'first tweet test',
count: 2,
link: 'https://firsttweettext.com'
)
create(
:retweet,
data_set: data_set,
text: 'second tweet text',
count: 3,
link: 'https://secondtweettext.com'
)
end
end

factory :search_query do
Expand Down
1 change: 1 addition & 0 deletions spec/features/api_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
after :all do
DataSet.destroy_all
Cohort.destroy_all
Retweet.destroy_all
end

it 'returns aggregated data' do
Expand Down
7 changes: 3 additions & 4 deletions spec/models/cohort_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"ruha9"=>"4",
})
expect(ds.top_retweets).to eq({
"If you haven't yet watched the video of @ruha9 speak at @BKCHarvard on The New Jim Code,take some time today to listen to her speak on the intersection of race and technology,carceral technoscience,&amp; liberatory imagination in everyday life. https://t.co/VUbrXxmYeD"=>"{:count=>2, :link=>\"https://twitter.com/farman/status/1227305335901302785\"}"
"If you haven't yet watched the video of @ruha9 speak at @BKCHarvard on The New Jim Code,take some time today to listen to her speak on the intersection of race and technology,carceral technoscience,&amp; liberatory imagination in everyday life. https://t.co/VUbrXxmYeD"=> {:count=>2, :link=>"https://twitter.com/farman/status/1227305335901302785"}
})
# These numbers are lower than you'll see grepping through the VCR
# cassette because 1) only the expanded_url field is considered and 2)
Expand Down Expand Up @@ -72,7 +72,6 @@
create(:data_set,
cohort: @cohorts.first,
top_mentions: { 'plato'=>'5', 'aristotle'=>'7' },
top_retweets: { 'first tweet test' => { count: '2', link: 'https://firsttweettext.com' }, 'second tweet text' => { count: '3', link: 'https://secondtweettext.com' } },
top_sources: { 'godeysladysbook.com'=>'7', 'twitter.com'=>'4' },
top_urls: { 'www.cnn.com/a_story'=>'4', 'http://bitly.com/98K8eH'=>'8'},
top_words: { 'stopword'=>'5', 'moose'=>'74' },
Expand All @@ -81,7 +80,6 @@
create(:data_set,
cohort: @cohorts.second,
top_mentions: { 'plato'=>'10', 'socrates'=>'7' },
top_retweets: { 'first tweet test' => { count: '1', link: 'https://firsttweettext.com' }, 'second tweet text' => { count: '1', link: 'https://secondtweettext.com' }},
top_sources: { 'twitter.com'=>'4', 'livejournal.com'=>'4' },
top_urls: { 'www.cnn.com/a_story'=>'1' },
top_words: { 'stopword'=>'5', 'bats'=>'7' },
Expand All @@ -92,6 +90,7 @@
after :all do
DataSet.destroy_all
Cohort.destroy_all
Retweet.destroy_all
end

it 'can aggregate data from multiple cohorts' do
Expand All @@ -100,7 +99,7 @@
'plato'=>15, 'aristotle'=>7, 'socrates'=>7
})
expect(aggs[:top_retweets]).to eq({
'first tweet test' => { count: 3, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 4, link: 'https://secondtweettext.com' }
'first tweet test' => { count: 4, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 6, link: 'https://secondtweettext.com' }
})
expect(aggs[:top_sources]).to eq({
'godeysladysbook.com'=>7, 'twitter.com'=>8, 'livejournal.com'=>4
Expand Down
25 changes: 12 additions & 13 deletions spec/models/data_set_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# num_tweets :integer
# num_users :integer
# top_mentions :hstore
# top_retweets :hstore
# top_sources :hstore
# top_urls :hstore
# top_words :hstore
Expand Down Expand Up @@ -67,7 +66,7 @@
end
end

context 'during data ingestion' do
context 'during data ingestion', elasticsearch: true do
it 'asks twitter for data on a user' do
VCR.use_cassette('data set spec') do
expect_any_instance_of(Twitter::REST::Client)
Expand Down Expand Up @@ -110,7 +109,7 @@
end
end

context 'data aggregation' do
context 'data aggregation', elasticsearch: true do
it 'sets aggregates appropriately' do
VCR.use_cassette('data aggregation') do
cohort = create(:cohort)
Expand Down Expand Up @@ -166,11 +165,11 @@

expect(ds_pipelined.top_mentions).to eq ({ "BKCHarvard"=>"5" })
expect(ds_pipelined.top_retweets).to eq({
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3" => "{:count=>1, :link=>\"https://twitter.com/rtushnet/status/1227619561412997124\"}",
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp" => "{:count=>1, :link=>\"https://twitter.com/EngageLab/status/1227585647856123904\"}",
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn" => "{:count=>1, :link=>\"https://twitter.com/evelyndouek/status/1227282185364918274\"}",
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV" => "{:count=>1, :link=>\"https://twitter.com/omertene/status/1227807251227910147\"}",
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR" => "{:count=>1, :link=>\"https://twitter.com/datasociety/status/1228009942420000768\"}",
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3" => {:count=>1, :link=>"https://twitter.com/rtushnet/status/1227619561412997124"},
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp" => {:count=>1, :link=>"https://twitter.com/EngageLab/status/1227585647856123904"},
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn" => {:count=>1, :link=>"https://twitter.com/evelyndouek/status/1227282185364918274"},
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV" => {:count=>1, :link=>"https://twitter.com/omertene/status/1227807251227910147"},
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR" => {:count=>1, :link=>"https://twitter.com/datasociety/status/1228009942420000768"},
})
expect(ds_pipelined.top_sources).to eq({
"bit.ly"=>"2", "medium.com"=>"2", "news.bloomberglaw.com"=>"2",
Expand Down Expand Up @@ -221,11 +220,11 @@

expect(aggs[:top_mentions]).to eq ({"BKCHarvard"=>10})
expect(aggs[:top_retweets]).to eq({
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR"=>{:count=>2, :link=>"https://twitter.com/datasociety/status/1228009942420000768"},
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV"=>{:count=>2, :link=>"https://twitter.com/omertene/status/1227807251227910147"},
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3"=>{:count=>2, :link=>"https://twitter.com/rtushnet/status/1227619561412997124"},
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn"=>{:count=>2, :link=>"https://twitter.com/evelyndouek/status/1227282185364918274"},
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp"=>{:count=>2, :link=>"https://twitter.com/EngageLab/status/1227585647856123904"}
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR"=>{:count=>1, :link=>"https://twitter.com/datasociety/status/1228009942420000768"},
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV"=>{:count=>1, :link=>"https://twitter.com/omertene/status/1227807251227910147"},
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3"=>{:count=>1, :link=>"https://twitter.com/rtushnet/status/1227619561412997124"},
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn"=>{:count=>1, :link=>"https://twitter.com/evelyndouek/status/1227282185364918274"},
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp"=>{:count=>1, :link=>"https://twitter.com/EngageLab/status/1227585647856123904"}
})
expect(aggs[:top_sources]).to eq({
"bit.ly"=>4, "medium.com"=>4, "news.bloomberglaw.com"=>4,
Expand Down
1 change: 1 addition & 0 deletions spec/models/extractor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
after :all do
DataSet.destroy_all
Cohort.destroy_all
Retweet.destroy_all
Rails.application.config.tweets_per_user = @tweets_per_user
end

Expand Down
16 changes: 16 additions & 0 deletions spec/models/source_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# == Schema Information
#
# Table name: sources
#
# id :bigint not null, primary key
# canonical_host :string
# variant_hosts :string is an Array
# created_at :datetime not null
# updated_at :datetime not null
#
# Indexes
#
# index_sources_on_canonical_host (canonical_host) UNIQUE
# index_sources_on_variant_hosts (variant_hosts) USING gin
#

require 'rails_helper'

describe Source do
Expand Down
4 changes: 2 additions & 2 deletions spec/serializers/cohort_serializer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

@ds1 = create(:data_set,
top_mentions: { 'squid'=>'3' },
top_retweets: { 'first tweet test' => { count: '2', link: 'https://firsttweettext.com' } },
top_sources: { 'twitter.com'=>'9', 'http://hasthelargehadroncolliderdestroyedtheworldyet.com/'=>'5'},
top_urls: { 'www.cnn.com/a_story'=>'10' },
cohort: @cohort1
Expand All @@ -34,6 +33,7 @@
@ds3.destroy
@cohort1.destroy
@cohort2.destroy
Retweet.destroy_all
end

it 'includes the description attribute' do
Expand Down Expand Up @@ -68,7 +68,7 @@
{ 'squid'=>3, 'plato'=>5, 'aristotle'=>7 }
)
expect(hsh[:aggregates][:top_retweets]).to eq(
{ 'first tweet test' => { count: 4, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 3, link: 'https://secondtweettext.com' }}
{ 'first tweet test' => { count: 4, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 6, link: 'https://secondtweettext.com' }}
)
expect(hsh[:aggregates][:top_sources]).to eq (
{ 'godeysladysbook.com'=>7, 'twitter.com'=>13,
Expand Down
4 changes: 3 additions & 1 deletion spec/serializers/data_set_serializer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
top_urls: { 'https://www.foo.com'=>'1', 'https://www.bar.com'=>'2'},
top_sources: { 'twitter.com'=>'91', 'www.cnn.com'=>'8' },
top_mentions: { 'BKCHarvard'=>'5' },
top_retweets: { 'tweet the first'=>'2', 'tweet the second'=>'3' }
retweets: [
Retweet.create(link: 'http://twit.com/123', count: 3, text: 'text')
]
) }

let(:serializer) { DataSetSerializer.new(ds) }
Expand Down

0 comments on commit 70577c2

Please sign in to comment.