Skip to content

Commit

Permalink
feat(google_trends.py): add google trends data loader
Browse files Browse the repository at this point in the history
Closes #72
  • Loading branch information
davidpomerenke committed Apr 26, 2024
1 parent 575b8cb commit a60354b
Show file tree
Hide file tree
Showing 4 changed files with 232 additions and 1 deletion.
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
"""
Documentation of the Google Trends API:
- https://github.com/GeneralMills/pytrends
- https://searchanalysisguide.blogspot.com/2013/04/google-trends-what-is-partial-data.html
Regional data is also available, which is useful for synthetic control.
Daily resolution is only available for the last 90 days; longer timeframes are weekly.
"""

from time import sleep

from media_impact_monitor.util.cache import cache
from pytrends.request import TrendReq


@cache
def get_google_trends_counts(query: str):
    """Fetch Google Trends interest for a single search term in Germany.

    Requests the last five years ("today 5-y") for geo "DE" through pytrends,
    drops rows flagged as partial (incomplete periods), and renames the
    query column to "count".

    Args:
        query: The search term to look up.

    Returns:
        A DataFrame with a single "count" column, indexed as returned by
        pytrends. The frame is empty (but still has the "count" column) when
        Google Trends has no data for the query.
    """
    trends = TrendReq(hl="de-DE", tz=60)
    trends.build_payload([query], timeframe="today 5-y", geo="DE")
    df = trends.interest_over_time()
    if df.empty:
        # pytrends hands back a bare, column-less frame when there is no data;
        # indexing "isPartial" on it would raise KeyError, so normalise it to
        # the same single-column shape callers get in the non-empty case.
        result = df.reindex(columns=["count"])
    else:
        # The "isPartial" flag is not guaranteed to be present, so only
        # filter on it when pytrends actually delivered it.
        if "isPartial" in df.columns:
            df = df[~df["isPartial"]].drop(columns=["isPartial"])
        result = df.rename(columns={query: "count"})
    # Brief pause after each request to go easy on the rate limit. According to
    # https://github.com/GeneralMills/pytrends a 60 second wait is needed once
    # the rate limit has actually been reached.
    sleep(1)
    return result
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from media_impact_monitor.data_loaders.web.google_trends import get_google_trends_counts


def test_get_google_trends_counts():
    """Check the shape and value range of the Google Trends loader output.

    Runs the loader for the query "corona" and verifies that the result is a
    non-empty, date-indexed frame with a single integer "count" column whose
    values lie in the 0..100 range, with the peak at exactly 100.
    """
    df = get_google_trends_counts("corona")
    assert not df.empty
    # Compare as a plain list: `Index == list` is element-wise and would raise
    # instead of failing cleanly if the number of columns ever differed.
    assert list(df.columns) == ["count"]
    assert df.index.name == "date"
    assert df.index.is_monotonic_increasing
    counts = df["count"]
    # Check the dtype kind rather than `== int`, whose width is platform-dependent.
    assert counts.dtype.kind == "i"
    assert counts.min() >= 0
    assert counts.max() == 100
    assert 0 <= counts.sum() <= 100 * len(df)
Loading

0 comments on commit a60354b

Please sign in to comment.