From c9e1dc2bf4a9bcbc0d396e8d0129e92f59b6c4bc Mon Sep 17 00:00:00 2001
From: rachelmbrubaker <84355699+rachelmbrubaker@users.noreply.github.com>
Date: Thu, 23 Sep 2021 11:21:38 -0600
Subject: [PATCH] add tensorflow example (#42)

* first version from google docs

* in-progress effort on tensorflow. submitting for visibility of status

* accuracy tweak

* fix description

* small tweak

* save the link id

* fixing parsing of json article

* adding in rapidApi integration

* final working script, with minimal API usage

* final README.md

* fix formatting

* fix formatting

* removing duplicate import

* Update README.md

* adding line to main import

* fix url

Co-authored-by: margaretkennedy <82049573+margaretkennedy@users.noreply.github.com>
---
 README.md                |   2 +
 tensorflow/README.md     |  36 ++++
 tensorflow/tensorflow.py | 362 +++++++++++++++++++++++++++++++++++++++
 tensorflow/trainData.csv |   3 +
 4 files changed, 403 insertions(+)
 create mode 100644 tensorflow/README.md
 create mode 100644 tensorflow/tensorflow.py
 create mode 100755 tensorflow/trainData.csv

diff --git a/README.md b/README.md
index e08b6446..7c6ff531 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,8 @@ The following folders can be found in this repository:
 - **[`metriccentury`](https://github.com/deephaven/examples/tree/main/metriccentury)** - Data recorded from a 100 km bike ride
 - **[`pems`](https://pems.dot.ca.gov/)** - Traffic flow data collected near Davis, CA.
 - **[`taxi`](https://azure.microsoft.com/en-us/services/open-datasets/catalog/nyc-taxi-limousine-commission-yellow-taxi-trip-records/)** - Yellow Taxi trip records
+- **[`tensorflow`](https://www.tensorflow.org/)** - Statistically calculate positive/negative sentiment from a Seeking Alpha
+  RSS feed using machine-learning training mechanisms.
 - **[`fit`](https://www.strava.com/)** - Workout results in the proprietary fit format developed by Garmin. Downloadable from Strava.
 - **[`tickingHeartRate`]** - Simulated ticking heart rate data.

diff --git a/tensorflow/README.md b/tensorflow/README.md
new file mode 100644
index 00000000..f26db167
--- /dev/null
+++ b/tensorflow/README.md
@@ -0,0 +1,36 @@
# TensorFlow example using data from Seeking Alpha

Pull an RSS feed from Seeking Alpha, and statistically calculate positive/negative sentiment using machine-learning
training mechanisms.

## Table of contents

* `tensorflow.py` - The Python script to run.
* `trainData.csv` - The input data used to train the models.

## Steps to run

1. Install the required Python modules:
   `docker exec $(basename $(pwd))_grpc-api_1 pip install tensorflow tensorflow_hub sklearn spacy bs4 lxml`
   Note: please use this exact install mechanism, rather than variations
   from [How to install Python packages](https://deephaven.io/core/docs/how-to-guides/install-python-packages).
   The lxml installation is somewhat fragile: bs4 does not always detect that it has been installed.
1. Install the spaCy English model:
   `docker exec $(basename $(pwd))_grpc-api_1 pip install https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.0.0/en_core_web_sm-2.0.0.tar.gz`
   Alternatively, use another version from the [spacy-models releases page](https://github.com/explosion/spacy-models/releases).
1. Drag/drop the file `trainData.csv` onto the Deephaven console.
1. Get a (free) login at [RapidAPI](https://rapidapi.com) and subscribe to its Seeking Alpha API.
   * Note that every time you run the script, you will consume some of your API usage quota for this particular
     endpoint. This is kept minimal: a single API access per published article advertised by Seeking Alpha
     on any one day (tracked with the `knownLinks[]` variable within the script). However, to allow repeated
     iterations for debugging/troubleshooting, all variables are reset on a new script run, so another round of
     API calls is made on each run.
   * The number of API calls per day is usually small (~5-30), so provided the script is only run once per day, the free
     tier of 500 calls/month should be adequate for demonstration purposes.
   * API call usage can be seen on your RapidAPI developer dashboard.
1. Look at any of the endpoint examples, and **select+save** your unique endpoint API key. It is called `x-rapidapi-key`.
1. Import your key into Deephaven by running:
   `ra_sa_key='enter-your-key-here'` (avoiding any additional space/quote characters)
1. Run `tensorflow.py`.
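As an optional sanity check (a suggested addition, not one of the original steps), you can confirm that the
installed modules import cleanly inside the container before running the script:
   `docker exec $(basename $(pwd))_grpc-api_1 python -c "import tensorflow, tensorflow_hub, sklearn, spacy, bs4, lxml"`
If this command prints no errors, the install steps above succeeded.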
diff --git a/tensorflow/tensorflow.py b/tensorflow/tensorflow.py
new file mode 100644
index 00000000..86ffc060
--- /dev/null
+++ b/tensorflow/tensorflow.py
@@ -0,0 +1,362 @@
import tensorflow as tf
from tensorflow import keras
import tensorflow_hub as hub
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestCentroid
from sklearn.naive_bayes import BernoulliNB
import re
import spacy
import json
import deephaven.npy as inp
from deephaven.java_to_python import columnToNumpyArray
from deephaven import DynamicTableWriter, Types as dht
from deephaven.TableTools import timeTable

# Load training data to use:
if "trainData" not in globals():
    print("Data not available: please load trainData.csv")

# Ensure credentials are set up to log in to Seeking Alpha
if "ra_sa_key" not in globals():
    print("Please set the Rapid API key for Seeking Alpha: ra_sa_key='the-key'")


def cleanText(text):
    # to lowercase
    text = text.lower()
    # correct spaces (e.g. "End sentence.Begin another" becomes "End sentence. Begin another")
    # note: '?' and '!' are normalized to '.' in the process
    text = re.sub(r'\.([a-zA-Z])', r'. \1', text)
    text = re.sub(r'\?([a-zA-Z])', r'. \1', text)
    text = re.sub(r'\!([a-zA-Z])', r'. \1', text)
    # replace q1,2,3,4 with q
    text = re.sub("q[1-4]", "q", text)
    # replace 20xx with 2000
    text = re.sub("20[0-2][0-9]", "2000", text)
    # lemmatize and remove stop words and punctuation
    nlp = spacy.load('en_core_web_sm')
    doc = nlp(text)
    lemmatizedText = ""
    for token in doc:
        if not token.is_stop and not token.is_punct:
            lemma = token.lemma_
            if lemma == "-PRON-":
                lemma = "it"
            lemmatizedText += (lemma + " ")
    text = lemmatizedText
    return text
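
# Illustrative sketch only (not in the original script): roughly what cleanText
# produces. Exact output depends on the spaCy model's lemmas and stop-word list:
#   cleanText("Q3 2021 revenue grew.We beat guidance!")
#   -> approximately "q 2000 revenue grow beat guidance "
# ("q3" -> "q", "2021" -> "2000", "grew" lemmatized to "grow", and the stop word
# "we" plus all punctuation dropped)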

def shuffleTable(unshuffledTable):
    return unshuffledTable.update("__r=Math.random()").sort("__r").dropColumns("__r")


def centroid(trainTextVectorized, trainLabels, testTextVectorized):
    nc = NearestCentroid(metric='manhattan')
    nc.fit(trainTextVectorized, trainLabels)
    return nc.predict(testTextVectorized)


def naiveBayes(trainTextVectorized, trainLabels, testTextVectorized):
    nb = BernoulliNB(alpha=1)
    nb.fit(trainTextVectorized, trainLabels)
    return nb.predict(testTextVectorized)


def preTrainedEmbedding(trainText, trainLabels, evalText, valSize, trainEmb=True):
    # initialize training, validation, and testing data
    valText = trainText[-1 * valSize:]
    valLabels = trainLabels[-1 * valSize:]
    trainText = trainText[:-1 * valSize]
    trainLabels = trainLabels[:-1 * valSize]
    # create and run model
    hub_layer = hub.KerasLayer("https://tfhub.dev/google/tf2-preview/gnews-swivel-20dim-with-oov/1", output_shape=[20],
                               input_shape=[], dtype=tf.string, trainable=trainEmb)
    model = keras.Sequential(name="mymodel")
    model.add(hub_layer)
    model.add(keras.layers.Dense(16, activation='relu'))
    model.add(keras.layers.Dense(1, activation='sigmoid'))
    # the final layer is a sigmoid, so the loss must treat outputs as
    # probabilities, not logits
    model.compile(optimizer='adam',
                  loss=tf.keras.losses.BinaryCrossentropy(from_logits=False),
                  metrics=['accuracy'])
    model.fit(trainText,
              trainLabels,
              epochs=40,
              batch_size=4,
              validation_data=(valText, valLabels),
              verbose=0)
    # return the predicted probabilities so predict() below can use them
    return model.predict(evalText, verbose=1)


def predict(text, model):
    if model == 'c' or model == 'nb':
        textVectorized = vectorizer.transform([text])
        if model == 'c':
            return int(centroid(trainTextVectorized, trainLabels, textVectorized)[0])
        else:
            return int(naiveBayes(trainTextVectorized, trainLabels, textVectorized)[0])
    elif model == 'e':
        # the embedding model outputs a sigmoid probability; round it to a 0/1 label
        return int(round(float(preTrainedEmbedding(trainText, trainLabels, [text], 10)[0][0])))


from bs4 import BeautifulSoup
import requests


# gets only those xml items which represent earnings call transcripts
def getEarningsCalls(items):
    return [item for item in items if item.title.text.split()[-3:] == ["Earnings", "Call", "Transcript"]]


# gets the symbol and quarter/year out of an article's header
def parseHeader(header):
    leftParenIdx = header.rindex("(")
    rightParenIdx = header.rindex(")")
    symbol = header[leftParenIdx + 1:rightParenIdx]
    quarterIdx = re.search("Q[1-4] 20[0-2][0-9]", header).start()
    quarter = header[quarterIdx:quarterIdx + 7]
    return (symbol, quarter)


# find the index of the first paragraph that is equal to an item from the search list
# necessary for some functions below
def findIdx(paragraphs, searchList):
    idx = 0
    for paragraph in paragraphs:
        for title in searchList:
            if title == paragraph.text.lower():
                return idx
        idx += 1
    return idx
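
# Example with a hypothetical header (not from the live feed), showing what
# parseHeader extracts:
#   parseHeader("Acme Corp (ACME) Q3 2021 Results - Earnings Call Transcript")
#   -> ("ACME", "Q3 2021")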

# gets the names of all the company participants on the call
# this is useful in other functions below
def getNames(paragraphs):
    # find the indices of the company participants and conference call participants roll-call sections
    companyList = ["company participants", "corporate participants", "executives", "company representatives"]
    confList = ["conference call participants", "analysts"]
    startIdx = findIdx(paragraphs, companyList)
    endIdx = findIdx(paragraphs, confList)

    # record the name of each company participant
    idx = startIdx + 1
    names = []
    while idx < endIdx:
        paragraph = paragraphs[idx]
        text = paragraph.text.split()
        if len(text) < 2:
            break
        names.append(text[0] + " " + text[1])
        idx += 1
    return names


# removes the roll-call, operator announcement, and q&a sections of the call
def truncate(paragraphs, names):
    # find the indices of the operator and q&a sections
    operatorIdx = qaIdx = 0
    for paragraph in paragraphs:
        if paragraph.text.lower()[:len("operator")] == "operator":
            break
        operatorIdx += 1
    for paragraph in paragraphs:
        if "id" in paragraph.attrs.keys() and paragraph["id"].lower()[:len("question-answer-session")] == "question-answer-session":
            break
        qaIdx += 1

    # if there is an operator section, get the section between it and the q&a
    if operatorIdx < qaIdx:
        paragraphs = paragraphs[operatorIdx + 1:qaIdx - 1]
    # if there isn't an operator section, remove the company participant roll-call
    # this is necessary for the next step
    else:
        confList = ["conference call participants", "analysts"]
        confIdx = findIdx(paragraphs, confList)
        paragraphs = paragraphs[confIdx:qaIdx - 1]

    # find the index of the first company participant's speaking section
    # this represents the start of either the safe-harbor statement or the CEO presentation
    nameIdx = 0
    for paragraph in paragraphs:
        text = paragraph.text.split()
        if len(text) < 2:
            break
        name = text[0] + " " + text[1]
        if name in names:
            break
        nameIdx += 1

    # remove everything before the first company participant's speaking section
    # print("op:%d\nqa:%d\nname:%d" % (operatorIdx, qaIdx, nameIdx))
    return paragraphs[nameIdx:]


# check if the call has a safe-harbor section
def hasSafeHarborStatement(paragraphs):
    phrases = ["10-K", "forward-looking statements", "forward-looking information", "non-GAAP"]
    for paragraph in paragraphs:
        for phrase in phrases:
            if phrase in paragraph.text:
                return True
    return False


# remove the call's safe-harbor section with the assumption that it exists
def removeSafeHarborStatement(paragraphs, names):
    # find the indices of the first two company participant speaking sections
    # the first company speaker always says the safe-harbor statement, so his/her section must be removed
    i = startIdx = endIdx = 0
    first = True
    for paragraph in paragraphs:
        text = paragraph.text.split()
        if text[0].lower() == "presentation":
            i += 1
            continue
        name = text[0] + " " + text[1]
        if name in names:
            if first:
                # this is the first speaker's index
                startIdx = i
                first = False
            else:
                # this is the second speaker's index
                endIdx = i
                break
        i += 1

    # remove the section between the two indices, i.e. the first speaker's section
    return paragraphs[:startIdx] + paragraphs[endIdx + 1:]
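
# For orientation: the helpers above are applied in this order inside runRSS()
# below (this block is just a summary of that call sequence, not executed here):
#   names = getNames(paragraphs)
#   paragraphs = truncate(paragraphs, names)
#   if hasSafeHarborStatement(paragraphs):
#       paragraphs = removeSafeHarborStatement(paragraphs, names)
#   paragraphs = removeNames(paragraphs, names)
#   text = collate(paragraphs)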

# removes all company participant names/paragraphs, as each name has its own paragraph in the call
def removeNames(paragraphs, names):
    return [paragraph for paragraph in paragraphs if paragraph.text not in names]


# convert a list of paragraphs into a single text string
def collate(paragraphs):
    s = ""
    for paragraph in paragraphs:
        s += paragraph.text
    return s


def getArticle(articleId):
    url = "https://seeking-alpha.p.rapidapi.com/articles/get-details"

    querystring = {"id": articleId}

    headers = {
        'x-rapidapi-host': "seeking-alpha.p.rapidapi.com",
        'x-rapidapi-key': ra_sa_key
    }

    response = requests.request("GET", url, headers=headers, params=querystring)
    return response


def runRSS():
    # get the rss feed
    feed = requests.get("https://seekingalpha.com/sector/transcripts.xml").text
    soup = BeautifulSoup(feed, "xml")
    items = soup.find_all("item")
    items = getEarningsCalls(items)
    links = [item.link.text for item in items]

    # these store the data for articles where access is granted
    texts = []
    timestamps = []
    symbols = []
    quarters = []
    # only the newest link is processed, to keep API usage minimal
    for link in links[:1]:
        linkId = link[link.index('/article/') + len('/article/'): link.index('-')]

        # Note: every tick, the API request will be re-sent, either consuming quota
        # fast, or racking up bills fast. Skipping already-seen links minimizes API
        # usage to only new articles.
        if linkId in knownLinks:
            # print("Skipping lookup - already included: " + link)
            continue
        else:
            knownLinks.append(linkId)

        # get the transcript article
        source = json.loads(getArticle(linkId).text)

        try:
            # find the header, timestamp, and paragraphs of the article
            article = source["data"]["attributes"]["content"]
            header = source["data"]["attributes"]["title"]
            timestamp = source["data"]["attributes"]["publishOn"]
            paragraphs = BeautifulSoup(article, "lxml").find_all("p")

            # get symbol and quarter from the header
            symbol, quarter = parseHeader(header)

            # clean and collate the paragraphs
            names = getNames(paragraphs)
            paragraphs = truncate(paragraphs, names)
            if hasSafeHarborStatement(paragraphs):
                paragraphs = removeSafeHarborStatement(paragraphs, names)
            paragraphs = removeNames(paragraphs, names)
            text = collate(paragraphs)

            # store collected data
            texts.append(text)
            timestamps.append(timestamp)
            symbols.append(symbol)
            quarters.append(quarter)
        except Exception:
            # either bad article or access denied
            print("Warning: article skipped due to bad formatting or access denied.")

    if len(texts) == 0:
        return False

    # Known bug: https://github.com/deephaven/deephaven-core/issues/1309
    try:
        symCol = columnToNumpyArray(calls, "Sym")
    except Exception:
        symCol = []
    try:
        quarterCol = columnToNumpyArray(calls, "Quarter")
    except Exception:
        quarterCol = []
    containsNewCall = False
    for i in range(len(texts)):
        if symbols[i] not in symCol and quarters[i] not in quarterCol:
            containsNewCall = True
            tw.logRow(texts[i], timestamps[i], symbols[i], quarters[i])
    return containsNewCall


trainData = shuffleTable(trainData)
trainText = columnToNumpyArray(trainData, "Text")
trainLabels = inp.numpy_slice(trainData.view("Label"), 0, trainData.size(), dtype=np.int32)
trainLabels = np.reshape(trainLabels, -1)
vectorizer = TfidfVectorizer(max_features=1000, ngram_range=(1, 1), norm='l1')
trainTextVectorized = vectorizer.fit_transform(trainText)
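
# Optional smoke test (a suggested addition, not part of the original script):
# re-score the training set with the two classical models to confirm the
# vectorizer/label wiring. Accuracy on the training set is optimistic, so treat
# these numbers only as a sanity check, not an evaluation:
#   print((centroid(trainTextVectorized, trainLabels, trainTextVectorized) == trainLabels).mean())
#   print((naiveBayes(trainTextVectorized, trainLabels, trainTextVectorized) == trainLabels).mean())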
"Sym", "Quarter"] +types = [dht.string, dht.string, dht.string, dht.string] +tw = DynamicTableWriter(cols, types) +twt = tw.getTable() +calls = twt \ + .firstBy("Sym", "Quarter") \ + .update("Text = (String)cleanText.call(Text)", "PredictedLabel = (int)predict.call(Text, `c`)", "PredictedLabel = PredictedLabel==0 ? -1 : PredictedLabel") \ + .moveUpColumns("Sym", "Quarter", "RSSTimestamp", "PredictedLabel") + +tt = timeTable("'00:01:00'") \ + .sortDescending("Timestamp") \ + .update("ContainedNewCalls=(boolean)runRSS.call()") +callsSummary = calls.view("Sym", "Date=convertDate(RSSTimestamp.substring(0,10))", "PredictedLabel") diff --git a/tensorflow/trainData.csv b/tensorflow/trainData.csv new file mode 100755 index 00000000..e2bf05ff --- /dev/null +++ b/tensorflow/trainData.csv @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17661d1736102715c1889a42c3f6eb64d67fcda9cc1abf9bb01030c0d52cfe42 +size 606167