diff --git a/.env.example b/.env.example index c502e26..8455c2b 100644 --- a/.env.example +++ b/.env.example @@ -1,6 +1,7 @@ +SUPABASE_URL="" +SUPABASE_KEY="" LINKEDIN_URL="" DISCORD_WEBHOOK="" -DB_URI="" DB_TABLE="opportunities_table" PALM_API_KEY="" GH_INTERN24_URL="https://github.com/pittcsc/Summer2024-Internships" diff --git a/.github/workflows/automate.yaml b/.github/workflows/automate.yaml index a9b6720..4d39a04 100644 --- a/.github/workflows/automate.yaml +++ b/.github/workflows/automate.yaml @@ -42,10 +42,11 @@ jobs: # fmt: off #LINT.IF .env.example env: + SUPABASE_URL: ${{ secrets.SUPABASE_URL }} + SUPABASE_KEY: ${{ secrets.SUPABASE_KEY }} + DB_TABLE_NAME: ${{ secrets.DB_TABLE_NAME}} LINKEDIN_URL: ${{ secrets.LINKEDIN_URL}} DISCORD_WEBHOOK: ${{ secrets.DISCORD_WEBHOOK}} - DB_URI: ${{ secrets.DB_URI}} - DB_TABLE: ${{ secrets.DB_TABLE}} PALM_API_KEY: ${{ secrets.PALM_API_KEY}} LINKEDIN_INTERN_URL: ${{ secrets.LINKEDIN_INTERN_URL}} GH_INTERN24_URL: ${{ secrets.GH_INTERN24_URL}} diff --git a/main.py b/main.py index d82adf9..f132836 100644 --- a/main.py +++ b/main.py @@ -13,6 +13,7 @@ request_linkedin_internship24_data, ) from utility.palm import gpt_job_analyze +from utility.error import ErrorMsg # Load and determine if all env variables are set @@ -70,7 +71,9 @@ async def execute_opportunities_webhook( if response.status_code == 204: print("Webhook message was sent sucessfully!") else: - print(f"Failed to send webhook message. Status Code: {response.status_code}") + print( + f"Failed to send webhook message. {ErrorMsg().status_code_failure(response.status_code)}" + ) async def main(): @@ -102,6 +105,7 @@ async def main(): job_opps, prompt_object["full_time"], ) + opps.ingest_opportunities(filtered_job_opps) # Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER. @@ -123,7 +127,9 @@ async def main(): # db.reset_processed_status() - internship_data_results = opps.list_opportunities(True, "internship", filtered=True) + internship_data_results = opps.list_opportunities( + False, "internship", filtered=True + ) job_data_results = opps.list_opportunities(True, "full_time", filtered=True) internship_formatted_message = opps.format_opportunities( diff --git a/requirements.txt b/requirements.txt index a3724f3..55d3f81 100644 Binary files a/requirements.txt and b/requirements.txt differ diff --git a/utility/db.py b/utility/db.py index 20fe0f6..7f974e9 100644 --- a/utility/db.py +++ b/utility/db.py @@ -1,66 +1,98 @@ -import psycopg2 import os +from supabase import create_client +from dataclasses import dataclass +from dotenv import load_dotenv +import requests +from utility.error import ErrorMsg +load_dotenv() +ERROR_MSG = ErrorMsg() -def instantiate_db_connection(): - """Returns the connection from the DB""" - db_uri = os.getenv("DB_URI") - return psycopg2.connect(db_uri) +@dataclass +class SupabaseConnection: + """Constructs Supabase connection.""" + CLIENT: any = None + SUPABASE_URL: str = os.getenv("SUPABASE_URL") + SUPABASE_KEY: str = os.getenv("SUPABASE_KEY") + SUPABASE_API_URL: str = f"{SUPABASE_URL}/rest/v1/rpc" -def create(TABLE_NAME: str) -> None: - """Creates the DB. Only needs to be called once.""" + def __post_init__(self): + self.CLIENT = self.create_supabase_client() - with instantiate_db_connection() as connection: - cursor = connection.cursor() + def create_supabase_client(self): + """Creates the Supabase client with the URL and key variables.""" - cursor.execute( - f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)""" - ) + return create_client(self.SUPABASE_URL, self.SUPABASE_KEY) - connection.commit() +SUPABASE_INSTANCE = SupabaseConnection().CLIENT -def add_column(column_name: str, data_type: str) -> None: - """Adds a column for adjustment to the table after the table has been created""" - with instantiate_db_connection() as connection: - cursor = connection.cursor() +def delete_all_opportunity_type(TABLE_NAME: str, opp_type: str) -> None: + """Deletes all opportunities of a specific type for testing purposes only.""" + + SUPABASE_INSTANCE.table(TABLE_NAME).delete().eq("type", opp_type).execute() + - cursor.execute(f"ALTER TABLE jobs_table ADD COLUMN {column_name} {data_type}") +def reset_processed_status(TABLE_NAME: str) -> None: + """Jobs status will be set to _processed = 0 for testing a debugging purposes""" - connection.commit() + SUPABASE_INSTANCE.table(TABLE_NAME).update({"processed": 0}).eq( + "processed", 1 + ).limit(5).execute() -def delete_all_opportunity_type(opp_type: str) -> None: - """Deletes all opportunities of a specific type for testing purposes only""" +def execute_sql(sql: str): + """Executes a raw SQL query using the Supabase HTTP API.""" + connection = SupabaseConnection() + headers = { + "apikey": connection.SUPABASE_KEY, + "Authorization": f"Bearer {connection.SUPABASE_KEY}", + "Content-Type": "application/json", + } - with instantiate_db_connection() as connection: - cursor = connection.cursor() + data = {"query": sql} + response = requests.post(connection.SUPABASE_API_URL, headers=headers, json=data) - cursor.execute("DELETE FROM jobs_table WHERE type = %s", (opp_type,)) - connection.commit() + response.raise_for_status() + return response -def reset_processed_status(TABLE_NAME: str) -> None: - """Jobs status will be set to _processed = 0 for testing a debugging purposes""" +def create_table(TABLE_NAME: str) -> None: + """Creates a new table in Supabase database. Only needs to be called once.""" - with instantiate_db_connection() as connection: - cursor = connection.cursor() + request = f""" + CREATE TABLE IF NOT EXISTS {TABLE_NAME} ( + company TEXT, + title TEXT, + link TEXT, + processed INTEGER DEFAULT 0, + type TEXT + ); + """ + + response = execute_sql(request) + + if response.status_code != 200: + return ERROR_MSG.status_code_failure(response.status_code) + + return "Request executed successfully." + + +def add_column(column_name: str, data_type: str) -> None: + """Adds a column for adjustment to the table after the table has been created""" - cursor.execute( - f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5" - ) + TABLE_NAME = os.getenv("DB_TABLE") - rows = cursor.fetchall() + request = f""" + ALTER TABLE {TABLE_NAME} ADD COLUMN {column_name} {data_type}; + """ - for row in rows: - company, title, location = row[:3] + response = execute_sql(request) - cursor.execute( - f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s", - (company, title, location), - ) + if response.status_code != 200: + return ERROR_MSG.status_code_failure(response.status_code) - connection.commit() + return "Request executed successfully." diff --git a/utility/error.py b/utility/error.py new file mode 100644 index 0000000..0a4d96d --- /dev/null +++ b/utility/error.py @@ -0,0 +1,24 @@ +from dataclasses import dataclass + + +@dataclass +class ErrorMsg: + """Custom error message data class.""" + + @staticmethod + def status_code_failure(status_code: str) -> str: + """A function returns an status code that is not 200.""" + + return f"Execution failure. Status code returned: {status_code}." + + @staticmethod + def date_difference_failure(error: str) -> str: + """Calculating the date difference is not possible.""" + + return f"Error calculating date difference: {error}." + + @staticmethod + def file_open_failure(file_path: str) -> str: + """Unable to open file path.""" + + return f"Unable to open/read file path: '{file_path}'." diff --git a/utility/opportunity.py b/utility/opportunity.py index 1385543..7cbaa7e 100644 --- a/utility/opportunity.py +++ b/utility/opportunity.py @@ -3,6 +3,8 @@ from typing import List import utility.db as db from enum import Enum +import uuid +from datetime import datetime import os load_dotenv() @@ -21,47 +23,49 @@ class OpportunityType(Enum): class Opportunity: """Struct to hold data for an opportunity""" + id: any company: str title: str location: str link: str processed: bool - type: OpportunityType + type_of_opportunity: OpportunityType -table_name = os.getenv("DB_TABLE") +TABLE_NAME = os.getenv("DB_TABLE_NAME") def ingest_opportunities(job_data: List[Opportunity]) -> None: """Inserts opportunities if and only if they do not already exist""" - with db.instantiate_db_connection() as connection: - cursor = connection.cursor() - for job in job_data: - cursor.execute( - f"SELECT * FROM {table_name} WHERE company = %(company)s AND title = %(title)s AND location = %(location)s AND type = %(type)s", + supabase = db.SupabaseConnection().CLIENT + for job in job_data: + + response = ( + supabase.table(TABLE_NAME) + .select("id, company, title, location, link, processed, type") + .eq("company", job.company) + .eq("title", job.title) + .eq("location", job.location) + .eq("link", job.link) + .eq("type", job.type_of_opportunity) + .execute() + ) + + if not response.data: + request = supabase.table(TABLE_NAME).insert( { + "id": str(uuid.uuid4()), "company": job.company, "title": job.title, "location": job.location, - "type": job.type, - }, + "link": job.link, + "processed": job.processed, + "type": job.type_of_opportunity, + } ) - row = cursor.fetchone() - - if row is None: - cursor.execute( - f"INSERT INTO {table_name} (company, title, location, link, processed, type) VALUES (%s, %s, %s, %s, %s, %s)", - ( - job.company, - job.title, - job.location, - job.link, - job.processed, - job.type, - ), - ) - connection.commit() + + response = request.execute() def list_opportunities( @@ -71,19 +75,20 @@ def list_opportunities( ) -> List[Opportunity]: """Lists all oppportunities in DB as well as returns them""" - with db.instantiate_db_connection() as connection: - cursor = connection.cursor() + supabase = db.SupabaseConnection().CLIENT - if filtered: - cursor.execute( - f"SELECT * FROM {table_name} WHERE processed = 0 AND type = '{opp_type}' LIMIT 15" - ) - else: - cursor.execute(f"SELECT * FROM {table_name}") - - rows = cursor.fetchall() + if filtered: + response = ( + supabase.table(TABLE_NAME) + .select("*") + .match({"processed": False, "type": opp_type}) + .limit(15) + .execute() + ) + else: + response = supabase.table(TABLE_NAME).select("*") - return read_all_opportunities(rows, debug) + return read_all_opportunities(response.data, debug) def read_all_opportunities(rows, debug_tool: bool) -> List[Opportunity]: @@ -91,18 +96,25 @@ def read_all_opportunities(rows, debug_tool: bool) -> List[Opportunity]: opportunities = [] for row in rows: - company, title, location, link, processed, type = row - if debug_tool: - print("Company:", company) - print("Title:", title) - print("Location:", location) - print("Link:", link) - print("Processed:", processed) - print("Type: ", type) + print("Id:", row.get("id")) + print("Company:", row.get("company")) + print("Title:", row.get("title")) + print("Location:", row.get("location")) + print("Link:", row.get("link")) + print("Processed:", row.get("processed")) + print("Type: ", row.get("type")) print(" ") - opportunity = Opportunity(company, title, location, link, processed, type) + opportunity = Opportunity( + row.get("id"), + row.get("company"), + row.get("title"), + row.get("location"), + row.get("link"), + row.get("processed"), + row.get("type"), + ) opportunities.append(opportunity) @@ -112,24 +124,18 @@ def read_all_opportunities(rows, debug_tool: bool) -> List[Opportunity]: def update_opportunities_status(data_results: List[Opportunity]) -> None: """Updates the status of the jobs to processed = 1 after it's been sent by the discord bot""" - with db.instantiate_db_connection() as connection: - cursor = connection.cursor() - - for data_block in data_results: - cursor.execute( - f"UPDATE {table_name} SET processed = %s WHERE company = %s AND title = %s AND location = %s", - ( - 1, - data_block.company, - data_block.title, - data_block.location, - ), - ) - - connection.commit() + supabase = db.SupabaseConnection().CLIENT + for data_block in data_results: + supabase.table(TABLE_NAME).update({"processed": 1}).match( + { + "company": data_block.company, + "title": data_block.title, + "location": data_block.location, + } + ).execute() -def format_opportunities(data_results: str, formatted_text: str) -> str: +def format_opportunities(data_results: List[Opportunity], formatted_text: str) -> str: """Receives data from list_filtered_opporunities() and returns a single string message""" formatted_string = "" diff --git a/utility/palm.py b/utility/palm.py index 0a66404..89b48a1 100644 --- a/utility/palm.py +++ b/utility/palm.py @@ -54,7 +54,7 @@ def filter_out_opportunities( ] print( - f"Length after GPT analyzed the {list_of_opps[0].type}: {len(structured_opps)}" + f"Length after GPT analyzed the {list_of_opps[0].type_of_opportunity}: {len(structured_opps)}" ) return structured_opps @@ -90,7 +90,7 @@ def gpt_job_analyze(list_of_opps: List[Opportunity], prompt: str) -> List[Opport """Analyzes each job opportunity before being inserted into the DB""" print( - f"The type '{list_of_opps[0].type}' original length before filtering: {len(list_of_opps)}" + f"The type '{list_of_opps[0].type_of_opportunity}' original length before filtering: {len(list_of_opps)}" ) for opp in list_of_opps: diff --git a/utility/scrape.py b/utility/scrape.py index 15fd870..85e4d98 100644 --- a/utility/scrape.py +++ b/utility/scrape.py @@ -5,6 +5,8 @@ from dotenv import load_dotenv import re import requests +import uuid +from time import sleep load_dotenv() utils.verify_set_env_variables() @@ -23,21 +25,21 @@ def request_github_internship24_data() -> List[Opportunity]: parse_content = utils.content_parser(url) td_elems = parse_content.find_all("tr") - for cell in td_elems[1:]: + for cell in td_elems[10:]: if len(github_list) <= MAX_OPPORTUNITY_LIST_LENGTH: elements = cell.find_all("td") - company = elements[0].text title = elements[1].text location = elements[2].text link = elements[3] if "🔒" not in link.text: opportunity = Opportunity( + uuid.uuid4(), company, title, location, link.find("a")["href"], - 0, + False, OpportunityType.INTERNSHIP.value, ) github_list.append(opportunity) @@ -104,7 +106,7 @@ def request_rapidapi_indeed_data() -> List[Opportunity]: title = elem["title"] location = elem["location"] link = f'https://www.indeed.com/viewjob?jk={elem["id"]}&locality=us' - processed = 0 + processed = False opportunity = Opportunity( company, @@ -126,16 +128,23 @@ def request_linkedin_data() -> List[Opportunity]: url = os.getenv("LINKEDIN_URL") parse_content = utils.content_parser(url) - linked_in_jobs = utils.blueprint_opportunity_formatter( - parse_content, - "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", - "hidden-nested-link", - "base-search-card__title", - "job-search-card__location", - "base-card__full-link", - True, - MAX_OPPORTUNITY_LIST_LENGTH, - OpportunityType.FULL_TIME.value, - ) + MAX_RETRY = 5 + + for _ in range(MAX_RETRY): + try: + linked_in_jobs = utils.blueprint_opportunity_formatter( + parse_content, + "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", + "hidden-nested-link", + "base-search-card__title", + "job-search-card__location", + "base-card__full-link", + True, + MAX_OPPORTUNITY_LIST_LENGTH, + OpportunityType.FULL_TIME.value, + ) + break + except ValueError: + sleep(0.5) return linked_in_jobs diff --git a/utility/utils.py b/utility/utils.py index ba2001c..62fe335 100644 --- a/utility/utils.py +++ b/utility/utils.py @@ -3,10 +3,12 @@ from typing import List import os import argparse +import uuid import json from bs4 import BeautifulSoup from utility.opportunity import Opportunity from utility.blocklist import BlockList +from utility.error import ErrorMsg # ----------------- FOR CLI LIBRARY COMMAND ----------------- @@ -40,10 +42,11 @@ def verify_set_env_variables() -> any: """Determines if the env variables are all set properly""" env_variables = [ + "SUPABASE_URL", + "SUPABASE_KEY", + "DB_TABLE_NAME", "LINKEDIN_URL", "DISCORD_WEBHOOK", - "DB_URI", - "DB_TABLE", "PALM_API_KEY", "GH_INTERN24_URL", "LINKEDIN_INTERN_URL", @@ -76,6 +79,12 @@ def calculate_day_difference(elem: datetime) -> int: return day_difference +from typing import List +from datetime import datetime +import re +import random + + def blueprint_opportunity_formatter( content, # Parsed content div_elem, # Class to traverse job elements @@ -91,25 +100,49 @@ def blueprint_opportunity_formatter( div = content.find_all("div", class_=div_elem) days_needed_command_value = extract_command_value().days_needed[0] + + # Validate and clean days_needed_command_value + try: + days_needed = int(re.sub(r"\D", "", days_needed_command_value)) + except ValueError: + days_needed = 0 # Default to 0 or handle as needed + internship_list = [] + for elem in div: company = elem.find(class_=company_elem).text.strip() if not BlockList().is_blacklisted_company(company): title = elem.find(class_=title_elem).text.strip() location = elem.find(class_=location_elem).text.strip() link = elem.find(class_=link_elem)["href"].split("?")[0] - processed = 0 + processed = False - date_difference = calculate_day_difference(elem) + try: + date_difference = calculate_day_difference(elem) + except Exception as e: + ErrorMsg().date_difference_failure(e) + continue # Skip this element if there's an issue if len(internship_list) < len_of_jobs: - if date_limit and int(days_needed_command_value) >= date_difference: + if date_limit and days_needed >= date_difference: opportunity = Opportunity( - company, title, location, link, processed, opp_type + id=str(uuid.uuid4()), + company=company, + title=title, + location=location, + link=link, + processed=processed, + type_of_opportunity=opp_type, ) else: opportunity = Opportunity( - company, title, location, link, processed, opp_type + id=str(uuid.uuid4()), + company=company, + title=title, + location=location, + link=link, + processed=processed, + type_of_opportunity=opp_type, ) internship_list.append(opportunity) @@ -148,7 +181,7 @@ def user_customization(file_paths: List[str]) -> dict: text = file.read() data.append(text) except OSError: - print(f"Unable to open/read file path: '{file_path}'") + ErrorMsg().file_open_failure(file_path) return {"customized_message": data[0], "customized_prompts": data[1]}