-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
adding the ability to track daily updates
- Loading branch information
Peter Benzoni
committed
Aug 16, 2024
1 parent
331b5a6
commit b8f61ad
Showing
1 changed file
with
66 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,66 @@ | ||
import os | ||
import pandas as pd | ||
from sqlalchemy import create_engine, text | ||
from io import StringIO, BytesIO | ||
from flask import send_file | ||
from app import fetch_content_results | ||
from dotenv import load_dotenv | ||
load_dotenv() | ||
import zipfile | ||
|
||
from sqlalchemy import create_engine | ||
from sqlalchemy.engine import URL | ||
|
||
from sqlalchemy import create_engine | ||
|
||
# Connection settings are read from the process environment (load_dotenv()
# is called earlier in this module, before these lookups).

# Source database: the engine fetch_articles() reads from.
source_connection_string = os.environ.get('HAMILTON_DATABASE_URL')
source_engine = create_engine(source_connection_string)

# Target database: the engine save_results_to_target_db() writes to.
target_connection_string = os.environ.get('PRODUCTION_DATABASE_URL')
target_engine = create_engine(target_connection_string)

# SQL text used to retrieve the daily articles; adjust it to your schema.
articles_query = os.environ.get('ARTICLES_RETRIEVAL_QUERY_DAILY')
|
||
|
||
# Function to fetch articles | ||
def fetch_articles(limit=5):
    """Run the configured articles query and return its rows as a DataFrame.

    Args:
        limit: Accepted but not used by the body; the number of rows returned
            is determined entirely by ``articles_query``.

    Returns:
        A pandas DataFrame containing every row the query produced, with
        columns named after the result's keys.
    """
    statement = text(articles_query)
    with source_engine.connect() as connection:
        cursor = connection.execute(statement)
        rows = cursor.fetchall()
        frame = pd.DataFrame(rows, columns=cursor.keys())
    return frame
|
||
|
||
# Process each article through the laundromat | ||
def process_article(title, content, language, engines=None):
    """Look up one article via ``fetch_content_results``.

    Args:
        title: Article title to search for.
        content: Article text to search for.
        language: Language value forwarded to the search.
        engines: Names of the search engines to query. ``None`` (the default)
            selects the full built-in list below.

    Returns:
        Whatever ``fetch_content_results`` returns, unchanged.
    """
    if engines is None:
        # Build a fresh list on every call: a list literal in the signature
        # would be shared between calls and could be mutated by the callee.
        engines = [
            'google', 'google_news', 'bing', 'bing_news', 'yahoo',
            'duckduckgo', 'yandex', 'gdelt', 'copyscape',
        ]
    # NOTE(review): 'OR' and 'us' are fixed positional arguments, presumably
    # the term-combination operator and the country code; confirm against
    # app.fetch_content_results.
    return fetch_content_results(title, content, 'OR', language, 'us', engines=engines)
|
||
# Save results to content_queries_results_hamilton in the target database | ||
def save_results_to_target_db(results):
    """Append result rows to ``content_queries_results_hamilton``.

    The ``engine`` and ``source`` columns are dropped before writing. This is
    a best-effort save: any failure is reported on stdout instead of being
    raised, so one bad batch does not stop the caller.

    Args:
        results: pandas DataFrame of result rows. An empty frame is skipped.

    Returns:
        None.
    """
    try:
        # A frame built from an empty result list has no columns at all, so
        # dropping 'engine'/'source' would raise KeyError and be reported as
        # a save failure. There is nothing to write, so return quietly.
        if results.empty:
            return
        trimmed = results.drop(columns=['engine', 'source'])
        trimmed.to_sql(
            'content_queries_results_hamilton',
            target_engine,
            if_exists='append',
            index=False,
        )
    except Exception as e:
        # Deliberately broad: report and continue rather than abort the run.
        print(f"Error saving results to target database: {e}")
|
||
|
||
def main():
    """Fetch the daily articles, search for each one, and store the results.

    For every article row, the search results are tagged with the article's
    own fields (under ``searched_*`` keys) and then saved to the target
    database before moving on to the next article.
    """
    # Fetch articles
    articles = fetch_articles()

    # Process articles iteratively, saving after each one so that results
    # already gathered are persisted before the next search runs.
    for _, row in articles.iterrows():
        # The second element of the returned pair is not needed here.
        results, _csv_data = process_article(
            row['title'], row['excerpt'], row['langTranslated']
        )

        # Same for every result of this article, so build it once.
        searched_fields = {
            "searched_site": row['site'],
            'searched_published': row['published'],
            "searched_title": row['title'],
            "searched_excerpt": row['excerpt'],
            "searched_langtranslated": row['langTranslated'],
            'searched_titletranslated': row['titleTranslated'],
            'searched_excerpttranslated': row['excerptTranslated'],
            'searched_url': row['url'],
        }
        # Assumes each result supports dict-style update(); confirm against
        # what fetch_content_results returns.
        for result in results:
            result.update(searched_fields)

        save_results_to_target_db(pd.DataFrame(results))
        print(f"Processed article: {row}")
|
||
# Save results to content_queries_results_hamilton in the target database | ||
|
||
|
||
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()