Skip to content

Commit

Permalink
Create and use new model to store nested retweets metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
peter-hank authored and thatandromeda committed Mar 24, 2020
1 parent 4b6ad81 commit 70577c2
Show file tree
Hide file tree
Showing 12 changed files with 129 additions and 38 deletions.
41 changes: 25 additions & 16 deletions app/models/data_set.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,7 @@

class DataSet < ApplicationRecord
belongs_to :cohort

store_accessor :top_retweets
has_many :retweets

attr_readonly :index_name
before_create :add_index_name
Expand Down Expand Up @@ -59,9 +58,17 @@ def update_aggregates
top_urls: MetadataHarvester.new(:urls, all_tweets).harvest,
top_words: MetadataHarvester.new(:words, all_tweets).harvest,
top_mentions: MetadataHarvester.new(:mentions, all_tweets).harvest,
top_sources: MetadataHarvester.new(:sources, all_tweets).harvest,
top_retweets: MetadataHarvester.new(:retweets, all_tweets).harvest
top_sources: MetadataHarvester.new(:sources, all_tweets).harvest
)
# Nested retweets goes to their own table
MetadataHarvester.new(:retweets, all_tweets).harvest.each do |text, retweet|
Retweet.create!(
data_set: self,
text: text,
count: retweet[:count],
link: retweet[:link]
)
end
end

def ingest_data
Expand Down Expand Up @@ -98,34 +105,28 @@ def store_data(tweets)
def self.aggregate(ids)
keys = %i[hashtags top_urls top_words top_mentions top_sources top_retweets]
data_sets = self.where(id: ids)

retval = {}

keys.each do |key|
# Keep only the data above our thresholds.
if key == :top_retweets
# Accumulate data from all datasets in scope.
data = data_sets.pluck(key)
.reduce ({}) do |first, second|
data = data_sets.map(&:top_retweets)
.flatten(1)
.reduce({}) do |first, second|
first.merge(second) do |_, a, b|
a = eval(a)
b = eval(b)
{ count: a[:count].to_i + b[:count].to_i, link: a[:link] }
end
end

data.each do |k, v|
if v.is_a?(String)
data[k] = eval(data[k])
data[k][:count] = data[k][:count].to_i
end
end
min_count = data.map(&:count).sort.last(Extractor::TOP_N)[0]
min_count = data.map { |_k, v| v[:count] }.sort.last(Extractor::TOP_N)[0]
data.reject! { |k, v| v[:count] < [min_count, Extractor::THRESHOLD].max }
else
# Accumulate data from all datasets in scope.
data = data_sets.pluck(key)
.map { |h| h.transform_values!(&:to_i) }
.reduce ({}) do |first, second|
.reduce({}) do |first, second|
first.merge(second) { |_, a, b| a + b }
end

Expand All @@ -139,6 +140,14 @@ def self.aggregate(ids)
retval
end

# Rebuilds the legacy top_retweets hash shape from the associated
# Retweet rows (which replaced the former hstore column), so existing
# callers and serializers keep working unchanged:
#   { tweet_text => { count: Integer, link: String } }
#
# Returns a Hash mapping retweet text to its count/link metadata.
def top_retweets
  # each_with_object replaces the original map-for-side-effects loop,
  # which discarded the mapped array and mutated an external hash.
  retweets.each_with_object({}) do |retweet, top|
    top[retweet.text] = { count: retweet.count, link: retweet.link }
  end
end

private

def all_tweets
Expand Down
20 changes: 20 additions & 0 deletions app/models/retweet.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
# == Schema Information
#
# Table name: retweets
#
# id :bigint not null, primary key
# count :integer
# link :string
# text :text
# created_at :datetime not null
# updated_at :datetime not null
# data_set_id :bigint
#
# Indexes
#
# index_retweets_on_data_set_id (data_set_id)
#

# Stores one aggregated retweet belonging to a DataSet: the retweet text,
# its occurrence count, and a link to a representative status. These rows
# replace the nested hash previously stored on data_sets, and
# DataSet#top_retweets reassembles the legacy hash shape from them.
class Retweet < ApplicationRecord
  belongs_to :data_set
end
12 changes: 12 additions & 0 deletions db/migrate/20200320182736_create_retweets.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Creates the retweets table that stores per-DataSet retweet metadata
# (text, occurrence count, representative status link), replacing the
# nested top_retweets hash previously kept on data_sets.
class CreateRetweets < ActiveRecord::Migration[5.2]
  def change
    create_table :retweets do |t|
      t.text :text
      t.integer :count
      t.string :link
      # foreign_key: true adds a DB-level constraint so retweet rows
      # cannot reference a missing data_set (t.references alone only
      # creates the column and index, not the constraint).
      t.references :data_set, foreign_key: true

      t.timestamps
    end
  end
end
12 changes: 11 additions & 1 deletion db/schema.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
#
# It's strongly recommended that you check this file into your version control system.

ActiveRecord::Schema.define(version: 2020_03_13_181803) do
ActiveRecord::Schema.define(version: 2020_03_20_182736) do

# These are extensions that must be enabled in order to support this database
enable_extension "hstore"
Expand Down Expand Up @@ -56,6 +56,16 @@
t.index ["cohort_id"], name: "index_data_sets_on_cohort_id"
end

create_table "retweets", force: :cascade do |t|
t.text "text"
t.integer "count"
t.string "link"
t.bigint "data_set_id"
t.datetime "created_at", null: false
t.datetime "updated_at", null: false
t.index ["data_set_id"], name: "index_retweets_on_data_set_id"
end

create_table "search_queries", force: :cascade do |t|
t.boolean "active"
t.text "description"
Expand Down
24 changes: 23 additions & 1 deletion spec/factories.rb
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,12 @@
end
end

factory :retweet do
text { 'retweet text' }
count { 3 }
link { 'http://twitter.com/status/xxxxxx' }
end

factory :cohort do
twitter_ids { [14706139] } # @BKCHarvard's twitter id
description { 'Berkman Klein Center for Internet & Society' }
Expand Down Expand Up @@ -47,11 +53,27 @@
num_tweets { 200 }
num_retweets { 10 }
top_mentions { { 'plato'=>'5', 'aristotle'=>'7' } }
top_retweets { { 'first tweet test' => { count: 2, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 3, link: 'https://secondtweettext.com' }} }
top_sources { { 'godeysladysbook.com'=>'7', 'twitter.com'=>'4' } }
top_urls { { 'www.cnn.com/a_story'=>'4', 'http://bitly.com/98K8eH'=>'8'} }
top_words { { 'stopword'=>'5', 'moose'=>'74' } }
hashtags { { 'llamas'=>'7', 'octopodes'=>'24' } }

after(:create) do |data_set|
create(
:retweet,
data_set: data_set,
text: 'first tweet test',
count: 2,
link: 'https://firsttweettext.com'
)
create(
:retweet,
data_set: data_set,
text: 'second tweet text',
count: 3,
link: 'https://secondtweettext.com'
)
end
end

factory :search_query do
Expand Down
1 change: 1 addition & 0 deletions spec/features/api_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@
after :all do
DataSet.destroy_all
Cohort.destroy_all
Retweet.destroy_all
end

it 'returns aggregated data' do
Expand Down
7 changes: 3 additions & 4 deletions spec/models/cohort_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
"ruha9"=>"4",
})
expect(ds.top_retweets).to eq({
"If you haven't yet watched the video of @ruha9 speak at @BKCHarvard on The New Jim Code,take some time today to listen to her speak on the intersection of race and technology,carceral technoscience,&amp; liberatory imagination in everyday life. https://t.co/VUbrXxmYeD"=>"{:count=>2, :link=>\"https://twitter.com/farman/status/1227305335901302785\"}"
"If you haven't yet watched the video of @ruha9 speak at @BKCHarvard on The New Jim Code,take some time today to listen to her speak on the intersection of race and technology,carceral technoscience,&amp; liberatory imagination in everyday life. https://t.co/VUbrXxmYeD"=> {:count=>2, :link=>"https://twitter.com/farman/status/1227305335901302785"}
})
# These numbers are lower than you'll see grepping through the VCR
# cassette because 1) only the expanded_url field is considered and 2)
Expand Down Expand Up @@ -72,7 +72,6 @@
create(:data_set,
cohort: @cohorts.first,
top_mentions: { 'plato'=>'5', 'aristotle'=>'7' },
top_retweets: { 'first tweet test' => { count: '2', link: 'https://firsttweettext.com' }, 'second tweet text' => { count: '3', link: 'https://secondtweettext.com' } },
top_sources: { 'godeysladysbook.com'=>'7', 'twitter.com'=>'4' },
top_urls: { 'www.cnn.com/a_story'=>'4', 'http://bitly.com/98K8eH'=>'8'},
top_words: { 'stopword'=>'5', 'moose'=>'74' },
Expand All @@ -81,7 +80,6 @@
create(:data_set,
cohort: @cohorts.second,
top_mentions: { 'plato'=>'10', 'socrates'=>'7' },
top_retweets: { 'first tweet test' => { count: '1', link: 'https://firsttweettext.com' }, 'second tweet text' => { count: '1', link: 'https://secondtweettext.com' }},
top_sources: { 'twitter.com'=>'4', 'livejournal.com'=>'4' },
top_urls: { 'www.cnn.com/a_story'=>'1' },
top_words: { 'stopword'=>'5', 'bats'=>'7' },
Expand All @@ -92,6 +90,7 @@
after :all do
DataSet.destroy_all
Cohort.destroy_all
Retweet.destroy_all
end

it 'can aggregate data from multiple cohorts' do
Expand All @@ -100,7 +99,7 @@
'plato'=>15, 'aristotle'=>7, 'socrates'=>7
})
expect(aggs[:top_retweets]).to eq({
'first tweet test' => { count: 3, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 4, link: 'https://secondtweettext.com' }
'first tweet test' => { count: 4, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 6, link: 'https://secondtweettext.com' }
})
expect(aggs[:top_sources]).to eq({
'godeysladysbook.com'=>7, 'twitter.com'=>8, 'livejournal.com'=>4
Expand Down
25 changes: 12 additions & 13 deletions spec/models/data_set_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,6 @@
# num_tweets :integer
# num_users :integer
# top_mentions :hstore
# top_retweets :hstore
# top_sources :hstore
# top_urls :hstore
# top_words :hstore
Expand Down Expand Up @@ -67,7 +66,7 @@
end
end

context 'during data ingestion' do
context 'during data ingestion', elasticsearch: true do
it 'asks twitter for data on a user' do
VCR.use_cassette('data set spec') do
expect_any_instance_of(Twitter::REST::Client)
Expand Down Expand Up @@ -110,7 +109,7 @@
end
end

context 'data aggregation' do
context 'data aggregation', elasticsearch: true do
it 'sets aggregates appropriately' do
VCR.use_cassette('data aggregation') do
cohort = create(:cohort)
Expand Down Expand Up @@ -166,11 +165,11 @@

expect(ds_pipelined.top_mentions).to eq ({ "BKCHarvard"=>"5" })
expect(ds_pipelined.top_retweets).to eq({
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3" => "{:count=>1, :link=>\"https://twitter.com/rtushnet/status/1227619561412997124\"}",
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp" => "{:count=>1, :link=>\"https://twitter.com/EngageLab/status/1227585647856123904\"}",
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn" => "{:count=>1, :link=>\"https://twitter.com/evelyndouek/status/1227282185364918274\"}",
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV" => "{:count=>1, :link=>\"https://twitter.com/omertene/status/1227807251227910147\"}",
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR" => "{:count=>1, :link=>\"https://twitter.com/datasociety/status/1228009942420000768\"}",
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3" => {:count=>1, :link=>"https://twitter.com/rtushnet/status/1227619561412997124"},
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp" => {:count=>1, :link=>"https://twitter.com/EngageLab/status/1227585647856123904"},
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn" => {:count=>1, :link=>"https://twitter.com/evelyndouek/status/1227282185364918274"},
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV" => {:count=>1, :link=>"https://twitter.com/omertene/status/1227807251227910147"},
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR" => {:count=>1, :link=>"https://twitter.com/datasociety/status/1228009942420000768"},
})
expect(ds_pipelined.top_sources).to eq({
"bit.ly"=>"2", "medium.com"=>"2", "news.bloomberglaw.com"=>"2",
Expand Down Expand Up @@ -221,11 +220,11 @@

expect(aggs[:top_mentions]).to eq ({"BKCHarvard"=>10})
expect(aggs[:top_retweets]).to eq({
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR"=>{:count=>2, :link=>"https://twitter.com/datasociety/status/1228009942420000768"},
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV"=>{:count=>2, :link=>"https://twitter.com/omertene/status/1227807251227910147"},
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3"=>{:count=>2, :link=>"https://twitter.com/rtushnet/status/1227619561412997124"},
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn"=>{:count=>2, :link=>"https://twitter.com/evelyndouek/status/1227282185364918274"},
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp"=>{:count=>2, :link=>"https://twitter.com/EngageLab/status/1227585647856123904"}
"\"In a rush to apply technical solutions to urban problems regarding public health, we must consider who it’s working for, &amp; how to create more egalitarian spaces &amp; services.” — @draganakaurin for @BKCHarvard https://t.co/D39dG1HJMR"=>{:count=>1, :link=>"https://twitter.com/datasociety/status/1228009942420000768"},
"There are sooooo many attempts at codifying ethical principles for AI. This is a fantastic paper from @BKCHarvard @JessicaFjeld @ne8en et al organizing and mapping consensus. With great infographics. https://t.co/xEHD85Lj9C https://t.co/Ng4Cd2OdTV"=>{:count=>1, :link=>"https://twitter.com/omertene/status/1227807251227910147"},
"Amazon’s Judging of IP Claims Questioned in Seller Lawsuits (featuring comments from me) https://t.co/QuLXmtIWz3"=>{:count=>1, :link=>"https://twitter.com/rtushnet/status/1227619561412997124"},
"Excited to have this out in the world!! I've been slammed on all sides on this one which, despite the saying, I don't think means I am definitely doing anything right😛, but I do think means it's a conversation we need to be having. 1/ https://t.co/h9E0BOujCn"=>{:count=>1, :link=>"https://twitter.com/evelyndouek/status/1227282185364918274"},
"Check out this informative Q&amp;A by our friends at @BKCHarvard, combining aspects of two of our core initiatives, health advocacy and trust in the news, https://t.co/0ClD7Fx1mp"=>{:count=>1, :link=>"https://twitter.com/EngageLab/status/1227585647856123904"}
})
expect(aggs[:top_sources]).to eq({
"bit.ly"=>4, "medium.com"=>4, "news.bloomberglaw.com"=>4,
Expand Down
1 change: 1 addition & 0 deletions spec/models/extractor_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
after :all do
DataSet.destroy_all
Cohort.destroy_all
Retweet.destroy_all
Rails.application.config.tweets_per_user = @tweets_per_user
end

Expand Down
16 changes: 16 additions & 0 deletions spec/models/source_spec.rb
Original file line number Diff line number Diff line change
@@ -1,3 +1,19 @@
# == Schema Information
#
# Table name: sources
#
# id :bigint not null, primary key
# canonical_host :string
# variant_hosts :string is an Array
# created_at :datetime not null
# updated_at :datetime not null
#
# Indexes
#
# index_sources_on_canonical_host (canonical_host) UNIQUE
# index_sources_on_variant_hosts (variant_hosts) USING gin
#

require 'rails_helper'

describe Source do
Expand Down
4 changes: 2 additions & 2 deletions spec/serializers/cohort_serializer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,6 @@

@ds1 = create(:data_set,
top_mentions: { 'squid'=>'3' },
top_retweets: { 'first tweet test' => { count: '2', link: 'https://firsttweettext.com' } },
top_sources: { 'twitter.com'=>'9', 'http://hasthelargehadroncolliderdestroyedtheworldyet.com/'=>'5'},
top_urls: { 'www.cnn.com/a_story'=>'10' },
cohort: @cohort1
Expand All @@ -34,6 +33,7 @@
@ds3.destroy
@cohort1.destroy
@cohort2.destroy
Retweet.destroy_all
end

it 'includes the description attribute' do
Expand Down Expand Up @@ -68,7 +68,7 @@
{ 'squid'=>3, 'plato'=>5, 'aristotle'=>7 }
)
expect(hsh[:aggregates][:top_retweets]).to eq(
{ 'first tweet test' => { count: 4, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 3, link: 'https://secondtweettext.com' }}
{ 'first tweet test' => { count: 4, link: 'https://firsttweettext.com' }, 'second tweet text' => { count: 6, link: 'https://secondtweettext.com' }}
)
expect(hsh[:aggregates][:top_sources]).to eq (
{ 'godeysladysbook.com'=>7, 'twitter.com'=>13,
Expand Down
4 changes: 3 additions & 1 deletion spec/serializers/data_set_serializer_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,9 @@
top_urls: { 'https://www.foo.com'=>'1', 'https://www.bar.com'=>'2'},
top_sources: { 'twitter.com'=>'91', 'www.cnn.com'=>'8' },
top_mentions: { 'BKCHarvard'=>'5' },
top_retweets: { 'tweet the first'=>'2', 'tweet the second'=>'3' }
retweets: [
Retweet.create(link: 'http://twit.com/123', count: 3, text: 'text')
]
) }

let(:serializer) { DataSetSerializer.new(ds) }
Expand Down

0 comments on commit 70577c2

Please sign in to comment.