diff --git a/R/RCrowdTangle.R b/R/RCrowdTangle.R index d5b06e6..3035d07 100644 --- a/R/RCrowdTangle.R +++ b/R/RCrowdTangle.R @@ -1,10 +1,172 @@ +# ----- RCrowdTangle.R ----- +# +# Fork of cpbuschmann's RCrowdTangle library +# +# CC-By Jan Eggers + + require("jsonlite") require("dplyr") -ct_get_links <- function(x = "", platforms = "", count = 100, startDate = "", endDate = "", token = "") + +# ---- Definition section ---- + +# Versions as date strings +ctRVersion <- "2021-07-14" +apiVersion <- "2021" + +# Rate limits: +# Default rate limit is 6 calls/minute, except for /links calls, who are +# limited to 2 calls/minute. Every function waits for the appropriate time +# beforte trying again. If not needed, just set to NULL. + +ct_api_limit <- function(calls = 6, links=FALSE) { + if (calls > 0) { + if(links) { + apiWaitLinks <<- 60/calls + } else { + # Default wait + apiWait <<- 60/calls + } + } +} + +ct_api_limit(6) +ct_api_limit(2,links=TRUE) + +# Global variables for last access of API to determine wait +apiWaitTill <- NULL + +# Helper functions + +# Call before accessing the API. +ct_wait <- function(t = apiWait) { + if (!is.null(apiWaitTill)) { + # If before wait point, wait. + while(now() < apiWaitTill) { + Sys.sleep(1) + } + } + # Save new wait point + apiWaitTill <<- now()+t +} + +# Global variable to store token from environment +ct_token <- NULL + +# ---- Manage API token ---- TODO +# Takes token and stores it in the .Renviron file for future reference +require(stringr) + +ct_auth <- function(token = NULL, overwrite = FALSE) { + # Nicked large parts of this function from Bene Witzenberger's great + # datawRappr package. + # It just reads the .Renviron file, looks if the API key already exists, + # returns a warning if it does (or overwrites, if ordered to), + # or adds it, if necessary + + # Access global environment file: + filename <- paste0(Sys.getenv("HOME"), "/.Renviron") + + # If no token is given, read key from ./Renviron and proceed to trying out + # immediately. + if (is.null(token)) { + token <- Sys.getenv("CROWDTANGLE_API_TOKEN") + if (token=="") stop("No token set") + } else { + # Get on with it. Write key (or ignore if already exists and no overwrite) + hook_name <- "CROWDTANGLE_API_TOKEN" + + if (!file.exists(filename)) { # create .Renviron, if it doesn't exist + file.create(filename) + warning("No Renviron file found") + } + + # check if key already exists - if yes, check for overwrite = TRUE, else: write new key + if (Sys.getenv(token) != "") { + + if (overwrite == TRUE) { + + + # base R-solution: + txt_vector <- readLines(filename, n = -1) + lines_delete <- which(grepl(paste0("^",hook_name,".*$"), txt_vector)) + output_txt <- txt_vector[-lines_delete] + + # add new key: + new_key <- paste0(hook_name,' = ', token) + output_txt <- c(output_txt, new_key) + writeLines(output_txt, filename) + + # reload Renviron after changing it: + readRenviron(filename) + + } else if (overwrite == FALSE) { # if key exists, but overwrite is FALSE: throw warning and end function + # Query existing token + token <- Sys.getenv("CROWDTANGLE_API_TOKEN") + + warning(paste0("API key ", hook_name, " already exists on this system.\nSet `overwrite = TRUE` to delete it."), immediate. = TRUE) + + } + + + } else { + + # write new Key to to environment file + new_key <- paste0(hook_name,' = ', token) + write(new_key, file = filename, append = TRUE) + readRenviron(filename) + } + + } + # Try accessing the API with token; just a blank default query + endpoint.posts <- "https://api.crowdtangle.com/posts" + query.string <- paste0(endpoint.posts, "?token=", token) + ct_wait(apiWait) + response.json <- try(fromJSON(query.string), silent = TRUE) + # Check if query returned OK + if (response.json$status==200) { + # Set global variable to token + ct_token <<- token + return("OK") + } else { + stop(paste0("Token not valid - returns ",response.json$status)) + } +} + +# not_run +# ct_auth("your_api_key_here",overwrite=TRUE) +# Read with Sys.getenv("CROWDTANGLE_API_TOKEN") + + +# ---- API call: links ---- +# Retrieve a set of posts matching a certain link. This will return up to +# 1000 posts. + +ct_get_links <- function(x = "", platforms = "", count = 100, + startDate = "", endDate = "", + token = NULL) { + # if no token is given, try to retrieve token from environment + if (is.null(token)) { + # No token given? Try to read it from global variable. + if (is.null(ct_token)) { + # Call auth function without parameters - writes API token to + # global variable ct_token and stops if no API token is set + ct_auth() + } + token <- ct_token + token <- Sys.getenv("CROWDTANGLE_API_TOKEN") + } + # get on with it! endpoint.links <- "https://api.crowdtangle.com/links" - query.string <- paste0(endpoint.links, "?link=", x, "&platforms=", platforms, "&count=", count, "&startDate=", startDate, "&endDate=", endDate, "&token=", token) + query.string <- paste0(endpoint.links, + "?link=", x, + "&platforms=", platforms, + "&count=", count, + "&startDate=", startDate, "&endDate=", endDate, + "&token=", token) + ct_wait(apiWaitList) response.json <- try(fromJSON(query.string), silent = TRUE) if (!class(response.json) == "try-error") { @@ -17,29 +179,83 @@ ct_get_links <- function(x = "", platforms = "", count = 100, startDate = "", en if("expandedLinks" %in% colnames(posts)) posts <- select(posts, -expandedLinks) if("media" %in% colnames(posts)) posts <- select(posts, -media) posts <- jsonlite::flatten(posts) + # Wait a fraction of a second to stay below the API rate limit return(posts) } - else if (status == 429) - { - print("API rate limit hit, sleeping...") - Sys.sleep(60) - } } } -ct_get_posts <- function(x = "", searchTerm = "", language = "", types= "", minInteractions = 0, count = 100, startDate = "", endDate = "", token = "") +# ---- API call: posts ---- +# Retrieve a set of posts for the given parameters. + + +ct_get_posts <- function(x = "", searchTerm = "", + language = "", types= "", + minInteractions = 0, count = 100, + startDate = "", endDate = "", + token = NULL) { + # if no token is given, try to retrieve token from environment + if (is.null(token)) { + # No token given? Try to read it from global variable. + if (is.null(ct_token)) { + # Call auth function without parameters - writes API token to + # global variable ct_token and stops if no API token is set + ct_auth() + } + token <- ct_token + token <- Sys.getenv("CROWDTANGLE_API_TOKEN") + } + # get on with it! endpoint.posts <- "https://api.crowdtangle.com/posts" - query.string <- paste0(endpoint.posts, "?listIds=", x, "&searchTerm=", searchTerm, "&language=", language, "&types=", types, "&minInteractions=", minInteractions, "&count=", count, "&startDate=", startDate, "&endDate=", endDate, "&token=", token) + query.string <- paste0(endpoint.posts, + "?listIds=", x, + "&searchTerm=", searchTerm, + "&language=", language, + "&types=", types, + "&minInteractions=", minInteractions, + "&count=", count, + "&startDate=", startDate, + "&endDate=", endDate, + "&token=", token) + ct_wait(apiWait) response.json <- try(fromJSON(query.string), silent = TRUE) status <- response.json$status - nextpage <- response.json$result$pagination$nextPage - posts <- response.json$result$posts %>% select(-expandedLinks, -media) %>% flatten() - return(posts) + if (status == 200) + { + nextpage <- response.json$result$pagination$nextPage + posts <- response.json$result$posts %>% select(-expandedLinks, -media) %>% flatten() + return(posts) + } else { + # return error + } } -ct_search_posts <- function(x = "", and = "", not = "", inAccountIds = "", inListIds = "", notInAccountIds = "", notInListIds = "", notInTitle = "", platforms = "", types= "", minInteractions = 0, minSubscriberCount = 0, verifiedOnly = "false", count = 100, startDate = "", endDate = "", token = "") + +# ---- API call: posts-search ---- +# Retrieve a set of posts for the given parameters. + + +ct_search_posts <- function(x = "", and = "", not = "", + inAccountIds = "", inListIds = "", + notInAccountIds = "", notInListIds = "", + notInTitle = "", platforms = "", types= "", + minInteractions = 0, minSubscriberCount = 0, + verifiedOnly = "false", count = 100, + startDate = "", endDate = "", token = "") { + # if no token is given, try to retrieve token from environment + if (is.null(token)) { + # No token given? Try to read it from global variable. + if (is.null(ct_token)) { + # Call auth function without parameters - writes API token to + # global variable ct_token and stops if no API token is set + ct_auth() + } + token <- ct_token + token <- Sys.getenv("CROWDTANGLE_API_TOKEN") + } + # get on with it! endpoint.posts <- "https://api.crowdtangle.com/posts" query.string <- paste0(endpoint.posts, "?searchTerm=", x, @@ -51,7 +267,12 @@ ct_search_posts <- function(x = "", and = "", not = "", inAccountIds = "", inLis "&inAccountIds=", inAccountIds, "&inAccountIds=", inAccountIds, - "&language=", language, "&types=", types, "&minInteractions=", minInteractions, "&count=", count, "&startDate=", startDate, "&endDate=", endDate, "&token=", token) + "&language=", language, "&types=", + types, "&minInteractions=", minInteractions, + "&count=", count, + "&startDate=", startDate, "&endDate=", endDate, + "&token=", token) + ct_wait(apiWait) response.json <- try(fromJSON(query.string), silent = TRUE) status <- response.json$status nextpage <- response.json$result$pagination$nextPage @@ -59,3 +280,171 @@ ct_search_posts <- function(x = "", and = "", not = "", inAccountIds = "", inLis return(posts) } + +# ---- API call: /posts/:id ---- +# Retrieves a specific post. There are two versions of this endpoint, depending +# upon what you need. Both return the same data. Please note that you must use +# a dashboard token that corresponds to the post platform - i.e. an Instagram +# token for Instagram posts, and a Facebook token for Facebook posts. +# +# Please also note that the ID format for Facebook and Instagram are different. +# For Instagram, it's [post_id]_[page_id], while for Facebook, +# it's [page_id]_[post_id]. While Page and Post IDs can be found in Facebook +# post URLs, Instagram does not expose the IDs in its URLs. You can pull +# the necessary Instagram IDs from our API. + +ct_get_post_by_id <- function(id = NULL, + redditAccount = NULL, + includeHistory = FALSE, + token = NULL) +{ + # if no token is given, try to retrieve token from environment + if (is.null(token)) { + # No token given? Try to read it from global variable. + if (is.null(ct_token)) { + # Call auth function without parameters - writes API token to + # global variable ct_token and stops if no API token is set + ct_auth() + } + token <- ct_token + } + # get on with it! + # Error check: No valid ID? + if (is.null(id)) stop("No ID given") + + endpoint.posts <- "https://api.crowdtangle.com/post" + query.string <- paste0(endpoint.posts, "/",id, + "?", + ifelse(is.null(redditAccount),"", + paste0("redditAccount=", + redditAccount,"&")), + ifelse(includeHistory, + "includeHistory=1&",""), + "token=", token) + ct_wait(apiWait) + response.json <- try(fromJSON(query.string), silent = TRUE) + status <- response.json$status + if (status == 200) + { + nextpage <- response.json$result$pagination$nextPage + posts <- response.json$result$posts %>% select(-expandedLinks, -media) %>% flatten() + return(posts) + } else { + # return error + } +} + +# Wrapper for Facebook and Instagram posts: Return data for post by link +# If you call this function in a dplyr pipeline (e.g. with mutate()), +# use rowwise(). + +ct_get_fb_post <- function (link="", + includeHistory = FALSE, + token = NULL) { + if (link=="") stop("No Link") + # Zahl vor dem Wort "post" + page_id <- str_extract(link,"[0-9]+(?=\\/posts)") + # Zahl am Ende des Links + post_id <- str_extract(link,"[0-9]+$") + return(ct_get_post_by_id( + id = paste0(page_id,"_",post_id), + includeHistory = includeHistory, + token = token)) +} + + +# ---- Instagram post query ---- +# This is more than a simple wrapper for ct_get_post_by_id: +# To query the stats for an Instagram post, you have to grab +# the post ID from the source code, and the profile page of the source, +# and the page ID from that source. + +# Source URL: https://www.instagram.com/p/CQ1iXf_tJ37/ + +# Instagram page id of post can be grabbed from +# +# Instagram post id can be grabbed from +# +# or calculated from code, using the helper fn InstaToPostID() below. + +# Just to keep track of this: if you query +# https://www.instagram.com/p/CQ1iXf_tJ37/?__a=1 +# a post is returned as a JSON with all relevant info. + +# Helper function to convert the Instagram URL to a post ID, +# which you need to query Crowdtangle for a single Insta post. +# +# Makes use of the gmp library which is for handling, like, really big figures. +# It *does* work if you try to use integers but whether it works correctly +# may depend on how double and integer are represented on your very system - +# max number is 64^10-1. +# +# Are you sure you won't want to use that obscure gmp library? + +library(gmp) +library(stringr) +b64_str_split <- unlist( + strsplit("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_","")) +# +instaToPostID <- function(x = "x") { + # Function expects either a url, or the isolated code. + # If url, extract. + if (str_detect(x,"instagram\\.com")) { + # Extract instagram.com/p/ as well as instagram.com/tv/ + x <- str_extract(x,"(?<=instagram\\.com\\/p\\/)[a-zA-Z0-9_\\-]+|(?<=instagram\\.com\\/tv\\/)[a-zA-Z0-9_\\-]+") + if (is.na(x)) stop("Could not extract URL ID code") + } + # convert input string to vector of integers + t <- unlist(strsplit(x,"")) + lt <- length(t) + r = as.bigz(0L) + for (i in 1:lt) { + r <- r+((which(b64_str_split==t[i])-1)*as.bigz(64)^(lt-i)) + } + return(as.character(r)) +} + + +ct_get_insta_post <- function (link="", + includeHistory = FALSE, + token = NULL) { + if (link=="") stop("No Link") + # Zahl vor dem Wort "post" + page_id <- str_extract(link,"[0-9]+(?=\\/posts)") + # Zahl am Ende des Links + post_id <- str_extract(link,"[0-9]+$") + return(ct_get_post_by_id( + id = paste0(page_id,"_",post_id), + includeHistory = includeHistory, + token = token)) +} + + +# ---- API call: /posts/search ---- TODO +# ** Note: Access to the Search is restricted to a limited set customers** +# and usage requires prior approval by CrowdTangle. +# +# Retrieve a set of posts for the given parameters and search terms. +# This endpoint, unlike the main /posts endpoint, searches the entire, +# cross-platform CrowdTangle system of posts. It can be limited by lists and +# accounts, but by default will search beyond the dashboard the token is +# associated with. + + +# ---- API call: /leaderboard ---- TODO +# Retrieves leaderboard data for a certain list or set of accounts. + + +# ---- API call: /lists ---- TODO +# Retrieve the lists, saved searches and saved post lists of the dashboard +# associated with the token sent in. + + +# ---- API call: /lists/:listid/accounts ---- TODO +# Retrieve the accounts for a given list. Accounts may only be retrieved for +# lists of type LIST, as saved searches and saved posts do not have +# associated accounts. + +# ---- API call: /ctpost/:id ---- TODO +# Gives data on Crowdtangle post - only glimpsed from the demo json file at +# https://ct-staticfiles.s3-us-west-1.amazonaws.com/api/API-Demo-2020.postman_collection.json diff --git a/README.md b/README.md index 1bcca67..2a9fb95 100644 --- a/README.md +++ b/README.md @@ -3,6 +3,54 @@ A Wrapper To Retrieve Data From The CrowdTangle API This package provides programmatic accces to the [CrowdTangle API](https://help.crowdtangle.com/en/articles/1189612-crowdtangle-api) with R. Users need to have a [CrowdTangle](https://www.crowdtangle.com/) account in order to make API calls. +The package currently supports using the *links* and *posts* endpoints of the CrowdTangle API, as well as the *post/:id* query (Facebook only!) to retrieve data on specific URLs and posts as well as searching for posts. + +## Installing + +Download the RCrowdTangle.R file from the R folder, place it in your +working directory, and include it with ```source("RCrowdTangle.R")``` + +As it's not a proper R library *yet*, installing it with + +```devtools::install_github("untergeekDE/RCrowdTangle")``` + +will lead to errors. Yet. + +## Function calls + +- **ct_auth(token, overwrite=FALSE)** - Set Crowdtangle token as an environment variable +- **ct_get_links()** Call Links endpoint (consult [CT API documentation](https://github.com/CrowdTangle/API/wiki/Links) ) +- **ct_get_posts()** Call Posts endpoint (consult [CT API documentation](https://github.com/CrowdTangle/API/wiki/posts) ) +- **ct_search_posts** - Basically ct_get_posts() with a focus on search terms. +- **ct_get_post_by_id(id)** - Call Post by ID endpoint (consult [CT API doc](https://github.com/CrowdTangle/API/wiki/Posts#get-postid)) +- **ct_get_fb_post(url)** - Return information on single FB post + ## Examples -The package currently supports using the *links* and *posts* endpoints of the CrowdTangle API to retrieve data on specific URLs and posts as well as searching for posts. +(TODO) + +## Rate limit + +The default rate limit for CrowdTangle API calls is 6 per minute (with the exception +of the /links call which is limited to 2 calls per minute). The function calls +wait for a fraction of a second before returning. + +If you wish to change the rate limit to something lower, use + +- **ct_set_api_limit(n)** +- **ct_set_api_limit(n, links=TRUE)** + +to set the limit to n calls per minute. Whenever a query is done, a timer is set +via the + +## Todo + +- Examples and use cases +- Convert to a proper R library (anybody any advice how to do this?) +- /posts/search call (invitation only!) +- /leaderboard call +- /lists call +- /lists/:listid/accounts call +- /ctpost/:id call (hidden, possibly deprecated) +- clean up the rather messy parameter structure for the calls +- Error handling