diff --git a/db.py b/db.py deleted file mode 100644 index b881aac..0000000 --- a/db.py +++ /dev/null @@ -1,30 +0,0 @@ -import psycopg2 -import os - - -def instantiate_db_connection(): - """Returns the connection from the DB""" - - db_uri = os.getenv("DB_URI") - return psycopg2.connect(db_uri) - - -def add_column(column_name: str, data_type: str) -> None: - """Adds a column for adjustment to the table after the table has been created""" - - with instantiate_db_connection() as connection: - cursor = connection.cursor() - - cursor.execute(f"ALTER TABLE jobs_table ADD COLUMN {column_name} {data_type}") - - connection.commit() - - -def delete_alL_opportunity_type(opp_type: str) -> None: - """Deletes all opportunities of a specific type for testing purposes only""" - - with instantiate_db_connection() as connection: - cursor = connection.cursor() - - cursor.execute("DELETE FROM jobs_table WHERE type = %s", (opp_type,)) - connection.commit() diff --git a/main.py b/main.py index 59c4b44..eb13b98 100644 --- a/main.py +++ b/main.py @@ -2,199 +2,27 @@ import os import json import asyncio -from typing import List -import re from datetime import date -import utility as utils -import db -import opportunity as opps -from opportunity import Opportunity, OpportunityType +import utility.utils as ut +import utility.db as db +import utility.opportunity as opps from dotenv import load_dotenv -from blocklist import BlockList +from utility.scrape import ( + request_github_internship24_data, + request_linkedin_data, + request_linkedin_internship24_data, +) +from utility.palm import gpt_job_analyze -load_dotenv() # To obtain keys from the .env file +# Load and determine if all env variables are set +load_dotenv() +ut.verify_set_env_variables() -# ----------------- POSTGRES ----------------- -TABLE_NAME = os.getenv("DB_TABLE") -MAX_LIST_LENGTH = 15 - - -def create(): - """Creates the DB. Only needs to be called once.""" - - with db.instantiate_db_connection() as connection: - cursor = connection.cursor() - - cursor.execute( - f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)""" - ) - - connection.commit() - - -# ----------------- INTERNSHIP DATA ----------------- - - -def request_github_internship24_data() -> List[Opportunity]: - """Scrapes Internship Data '24 from Github Repo""" - - url = os.getenv("GH_INTERN24_URL") - parse_content = utils.content_parser(url) - github_list = [] - td_elems = parse_content.find_all("tr") - - for cell in td_elems[1:]: - if len(github_list) <= MAX_LIST_LENGTH: - elements = cell.find_all("td") - - company = elements[0].text - if not BlockList().is_blacklisted_company(company): - title = elements[1].text - location = elements[2].text - link = elements[3] - if "🔒" not in link.text: - opportunity = Opportunity( - company, - title, - location, - link.find("a")["href"], - 0, - OpportunityType.INTERNSHIP.value, - ) - github_list.append(opportunity) - - return github_list - - -def request_linkedin_internship24_data() -> List[Opportunity]: - """Web scrapes Summer '24 Internship Opportunities using LinkedIn""" - - url = os.getenv("LINKEDIN_INTERN_URL") - - parse_content = utils.content_parser(url) - - linkedin_internship_opps = utils.blueprint_opportunity_formatter( - parse_content, - "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", - "hidden-nested-link", - "base-search-card__title", - "job-search-card__location", - "base-card__full-link", - True, - MAX_LIST_LENGTH, - OpportunityType.INTERNSHIP.value, - ) - - return linkedin_internship_opps - - -# ----------------- JOB DATA ----------------- - - -def request_rapidapi_indeed_data() -> List[Opportunity]: - """ - This API call retrieves a formatted response object - and returns a List[Opportunity] as the result - """ - - url = os.getenv("RAPID_API_URL") - rapid_api_key = os.getenv("RAPID_API_KEY") - - headers = { - "X-RapidAPI-Key": rapid_api_key, - "X-RapidAPI-Host": "indeed12.p.rapidapi.com", - } - - rapid_jobs = [] - response = requests.get(url, headers=headers).json() - - days_needed_command_value = utils.extract_command_value().days_needed[ - 0 - ] # Extracts command-line value - - for elem in response["hits"]: - time = elem["formatted_relative_time"] - - numeric = re.search(r"\d+", time) - formatted_time_integer = int(numeric.group()) if numeric else 0 - - if ( - len(rapid_jobs) < MAX_LIST_LENGTH - and int(days_needed_command_value) >= formatted_time_integer - ): - company = elem["company_name"] - title = elem["title"] - location = elem["location"] - link = f'https://www.indeed.com/viewjob?jk={elem["id"]}&locality=us' - processed = 0 - - opportunity = Opportunity( - company, - title, - location, - link, - processed, - OpportunityType.FULL_TIME.value, - ) - - rapid_jobs.append(opportunity) - - return rapid_jobs - - -def request_linkedin_data() -> List[Opportunity]: - """Returns a List[Opportunity] which contains web scraped job content""" - - url = os.getenv("LINKEDIN_URL") - parse_content = utils.content_parser(url) - - linked_in_jobs = utils.blueprint_opportunity_formatter( - parse_content, - "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", - "hidden-nested-link", - "base-search-card__title", - "job-search-card__location", - "base-card__full-link", - True, - MAX_LIST_LENGTH, - OpportunityType.FULL_TIME.value, - ) - - return linked_in_jobs - - -# ----------------- RESET FUNCTION (DEBUGGING PURPOSES) ----------------- - - -def reset_processed_status(TABLE_NAME): - """Jobs status will be set to _processed = 0 for testing a debugging purposes""" - - with db.instantiate_db_connection() as connection: - cursor = connection.cursor() - - cursor.execute( - f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5" - ) - - rows = cursor.fetchall() - - for row in rows: - company, title, location = row[:3] - - cursor.execute( - f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s", - (company, title, location), - ) - - connection.commit() - - -# ----------------- DISCORD BOT ----------------- - - -async def execute_opportunities_webhook(webhook_url, job_message, internship_message): +async def execute_opportunities_webhook( + webhook_url: str, job_message: str, internship_message: str +): """ Executes the message which receives the formatted message from the format_opportunities() function as well as the webhook @@ -247,39 +75,42 @@ async def execute_opportunities_webhook(webhook_url, job_message, internship_mes async def main(): # Creates table in database - with_create_table_command = utils.extract_command_value().create + with_create_table_command = ut.extract_command_value().create if with_create_table_command: - create() + TABLE_NAME = os.getenv("DB_TABLE") + + db.create(TABLE_NAME) + print(f"Sucessfully created {TABLE_NAME}!") exit() # Exit the main function to avoid calling other functions file_paths = [os.getenv("MESSAGE_PATH"), os.getenv("PROMPTS_PATH")] - customized_object = utils.user_customization(file_paths) + customized_object = ut.user_customization(file_paths) # Determines the customized prompts for PaLM - prompt_object = utils.determine_prompts(customized_object["customized_prompts"]) + prompt_object = ut.determine_prompts(customized_object["customized_prompts"]) # Determines the customized message for the webhook - finalized_message = utils.determine_customized_message( + finalized_message = ut.determine_customized_message( customized_object["customized_message"] ) # Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER. - job_opps = utils.merge_all_opportunity_data(request_linkedin_data()) + job_opps = ut.merge_all_opportunity_data(request_linkedin_data()) - filtered_job_opps = utils.gpt_job_analyze( + filtered_job_opps = gpt_job_analyze( job_opps, prompt_object["full_time"], ) opps.ingest_opportunities(filtered_job_opps) # Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER. - internship_opps = utils.merge_all_opportunity_data( + internship_opps = ut.merge_all_opportunity_data( request_linkedin_internship24_data(), request_github_internship24_data(), ) - filtered_internship_opps = utils.gpt_job_analyze( + filtered_internship_opps = gpt_job_analyze( internship_opps, prompt_object["internship"], ) @@ -291,7 +122,7 @@ async def main(): # To do so, please comment the function calls above this comment. # After, please uncomment the following line of code: - # reset_processed_status() + # db.reset_processed_status() internship_data_results = opps.list_opportunities(True, "internship", filtered=True) job_data_results = opps.list_opportunities(True, "full_time", filtered=True) diff --git a/blocklist.py b/utility/blocklist.py similarity index 66% rename from blocklist.py rename to utility/blocklist.py index cf9c7ea..be7052c 100644 --- a/blocklist.py +++ b/utility/blocklist.py @@ -2,7 +2,10 @@ class BlockList: """A class holding methods to determine if a company is blocklisted""" BLOCKLISTED_COMPANIES = set( - ["Pattern Learning AI - Career & Tech Recruitment Reimagined!"] + [ + "Pattern Learning AI - Career & Tech Recruitment Reimagined!", + "Patterned Learning AI - Tech Recruitment & Staffing", + ] ) def is_blacklisted_company(self, company: str) -> bool: diff --git a/utility/db.py b/utility/db.py new file mode 100644 index 0000000..20fe0f6 --- /dev/null +++ b/utility/db.py @@ -0,0 +1,66 @@ +import psycopg2 +import os + + +def instantiate_db_connection(): + """Returns the connection from the DB""" + + db_uri = os.getenv("DB_URI") + return psycopg2.connect(db_uri) + + +def create(TABLE_NAME: str) -> None: + """Creates the DB. Only needs to be called once.""" + + with instantiate_db_connection() as connection: + cursor = connection.cursor() + + cursor.execute( + f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)""" + ) + + connection.commit() + + +def add_column(column_name: str, data_type: str) -> None: + """Adds a column for adjustment to the table after the table has been created""" + + with instantiate_db_connection() as connection: + cursor = connection.cursor() + + cursor.execute(f"ALTER TABLE jobs_table ADD COLUMN {column_name} {data_type}") + + connection.commit() + + +def delete_all_opportunity_type(opp_type: str) -> None: + """Deletes all opportunities of a specific type for testing purposes only""" + + with instantiate_db_connection() as connection: + cursor = connection.cursor() + + cursor.execute("DELETE FROM jobs_table WHERE type = %s", (opp_type,)) + connection.commit() + + +def reset_processed_status(TABLE_NAME: str) -> None: + """Jobs status will be set to _processed = 0 for testing a debugging purposes""" + + with instantiate_db_connection() as connection: + cursor = connection.cursor() + + cursor.execute( + f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5" + ) + + rows = cursor.fetchall() + + for row in rows: + company, title, location = row[:3] + + cursor.execute( + f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s", + (company, title, location), + ) + + connection.commit() diff --git a/opportunity.py b/utility/opportunity.py similarity index 97% rename from opportunity.py rename to utility/opportunity.py index bbc90cf..1385543 100644 --- a/opportunity.py +++ b/utility/opportunity.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from dotenv import load_dotenv from typing import List -import db +import utility.db as db from enum import Enum import os @@ -32,7 +32,7 @@ class Opportunity: table_name = os.getenv("DB_TABLE") -def ingest_opportunities(job_data): +def ingest_opportunities(job_data: List[Opportunity]) -> None: """Inserts opportunities if and only if they do not already exist""" with db.instantiate_db_connection() as connection: cursor = connection.cursor() diff --git a/utility/palm.py b/utility/palm.py new file mode 100644 index 0000000..170b9de --- /dev/null +++ b/utility/palm.py @@ -0,0 +1,116 @@ +import google.generativeai as palm +from time import sleep +import os +import utility.utils as utils +from dotenv import load_dotenv +from typing import List +import json +from utility.opportunity import Opportunity + +load_dotenv() +utils.verify_set_env_variables() + + +MAX_RETRY = 5 # Max number of retrys +palm.configure(api_key=os.getenv("PALM_API_KEY")) + + +def current_model_inuse() -> any: + """Returns the model in use""" + + models = [ + m + for m in palm.list_models() + if "generateText" in m.supported_generation_methods + ] + + model = models[0].name + + return model + + +def parse_gpt_values(gpt_response: str) -> List[bool]: + """Helper function to parse the gpt response from a str -> List[bool]""" + + response: List[bool] + + for _ in range(MAX_RETRY): + try: + response = json.loads(gpt_response.lower()) + break + except AttributeError: + sleep(0.5) + + return response + + +def filter_out_opportunities( + list_of_opps: List[Opportunity], gpt_response: List[bool] +) -> List[Opportunity]: + """Helper function for gpt_job_analyzer() to filter the data""" + + structured_opps = [ + opp for opp, response in zip(list_of_opps, gpt_response) if response + ] + + print( + f"Length after GPT analyzed the {list_of_opps[0].type}: {len(structured_opps)}" + ) + return structured_opps + + +def get_parsed_values(prompt: str) -> List[bool]: + """Function which returns parsed values if the opportunity mathces with the clubs values""" + + defaults = { + "model": "models/text-bison-001", + "temperature": 0.0, + "candidate_count": 1, + "top_k": 100, + "top_p": 0.95, + "max_output_tokens": 3072, + "stop_sequences": [], + "safety_settings": [ + {"category": "HARM_CATEGORY_DEROGATORY", "threshold": 3}, + {"category": "HARM_CATEGORY_TOXICITY", "threshold": 3}, + {"category": "HARM_CATEGORY_VIOLENCE", "threshold": 3}, + {"category": "HARM_CATEGORY_SEXUAL", "threshold": 3}, + {"category": "HARM_CATEGORY_MEDICAL", "threshold": 3}, + {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 3}, + ], + } + + completion = palm.generate_text(**defaults, prompt=prompt) + + parsed_values = parse_gpt_values(completion.result) + return parsed_values + + +def gpt_job_analyze(list_of_opps: List[Opportunity], prompt: str) -> List[Opportunity]: + """Analyzes each job opportunity before being inserted into the DB""" + + print( + f"The type '{list_of_opps[0].type}' original length before filtering: {len(list_of_opps)}" + ) + + for opp in list_of_opps: + prompt += f"\nCompany: {opp.company}" + prompt += f"\nTitle: {opp.title}" + prompt += f"\nLocation: {opp.location}" + prompt += "\n" + + parsed_values = [] + for _ in range(MAX_RETRY): # Keep looping until a valid prompt is received + try: + parsed_values = get_parsed_values(prompt) + break + except ( + json.decoder.JSONDecodeError + ): # The type of error that would be received is type JSON + sleep(0.5) + + print(f" Below are the parsed values from GPT - {parsed_values}") + + return filter_out_opportunities( + list_of_opps, parsed_values + ) # Returns filtered out opportunities diff --git a/utility/scrape.py b/utility/scrape.py new file mode 100644 index 0000000..15fd870 --- /dev/null +++ b/utility/scrape.py @@ -0,0 +1,141 @@ +from utility.opportunity import Opportunity, OpportunityType +from typing import List +import utility.utils as utils +import os +from dotenv import load_dotenv +import re +import requests + +load_dotenv() +utils.verify_set_env_variables() + +MAX_OPPORTUNITY_LIST_LENGTH = 15 + +# ----------------- INTERNSHIP DATA ----------------- + + +def request_github_internship24_data() -> List[Opportunity]: + """Scrapes Internship Data '24 from Github Repo""" + + github_list = [] + + url = os.getenv("GH_INTERN24_URL") + parse_content = utils.content_parser(url) + td_elems = parse_content.find_all("tr") + + for cell in td_elems[1:]: + if len(github_list) <= MAX_OPPORTUNITY_LIST_LENGTH: + elements = cell.find_all("td") + + company = elements[0].text + title = elements[1].text + location = elements[2].text + link = elements[3] + if "🔒" not in link.text: + opportunity = Opportunity( + company, + title, + location, + link.find("a")["href"], + 0, + OpportunityType.INTERNSHIP.value, + ) + github_list.append(opportunity) + + return github_list + + +def request_linkedin_internship24_data() -> List[Opportunity]: + """Web scrapes Summer '24 Internship Opportunities using LinkedIn""" + + url = os.getenv("LINKEDIN_INTERN_URL") + parse_content = utils.content_parser(url) + + linkedin_internship_opps = utils.blueprint_opportunity_formatter( + parse_content, + "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", + "hidden-nested-link", + "base-search-card__title", + "job-search-card__location", + "base-card__full-link", + True, + MAX_OPPORTUNITY_LIST_LENGTH, + OpportunityType.INTERNSHIP.value, + ) + + return linkedin_internship_opps + + +# ----------------- JOB DATA ----------------- + + +def request_rapidapi_indeed_data() -> List[Opportunity]: + """ + This API call retrieves a formatted response object + and returns a List[Opportunity] as the result + """ + + url = os.getenv("RAPID_API_URL") + rapid_api_key = os.getenv("RAPID_API_KEY") + + headers = { + "X-RapidAPI-Key": rapid_api_key, + "X-RapidAPI-Host": "indeed12.p.rapidapi.com", + } + + rapid_jobs = [] + response = requests.get(url, headers=headers).json() + + days_needed_command_value = utils.extract_command_value().days_needed[ + 0 + ] # Extracts command-line value + + for elem in response["hits"]: + time = elem["formatted_relative_time"] + + numeric = re.search(r"\d+", time) + formatted_time_integer = int(numeric.group()) if numeric else 0 + + if ( + len(rapid_jobs) < MAX_OPPORTUNITY_LIST_LENGTH + and int(days_needed_command_value) >= formatted_time_integer + ): + company = elem["company_name"] + title = elem["title"] + location = elem["location"] + link = f'https://www.indeed.com/viewjob?jk={elem["id"]}&locality=us' + processed = 0 + + opportunity = Opportunity( + company, + title, + location, + link, + processed, + OpportunityType.FULL_TIME.value, + ) + + rapid_jobs.append(opportunity) + + return rapid_jobs + + +def request_linkedin_data() -> List[Opportunity]: + """Returns a List[Opportunity] which contains web scraped job content""" + + url = os.getenv("LINKEDIN_URL") + parse_content = utils.content_parser(url) + + linked_in_jobs = utils.blueprint_opportunity_formatter( + parse_content, + "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", + "hidden-nested-link", + "base-search-card__title", + "job-search-card__location", + "base-card__full-link", + True, + MAX_OPPORTUNITY_LIST_LENGTH, + OpportunityType.FULL_TIME.value, + ) + + return linked_in_jobs diff --git a/utility.py b/utility/utils.py similarity index 60% rename from utility.py rename to utility/utils.py index c75e983..ba2001c 100644 --- a/utility.py +++ b/utility/utils.py @@ -3,12 +3,10 @@ from typing import List import os import argparse -from time import sleep import json -import google.generativeai as palm from bs4 import BeautifulSoup -from opportunity import Opportunity -from blocklist import BlockList +from utility.opportunity import Opportunity +from utility.blocklist import BlockList # ----------------- FOR CLI LIBRARY COMMAND ----------------- @@ -38,6 +36,27 @@ def extract_command_value(): return arguments +def verify_set_env_variables() -> any: + """Determines if the env variables are all set properly""" + + env_variables = [ + "LINKEDIN_URL", + "DISCORD_WEBHOOK", + "DB_URI", + "DB_TABLE", + "PALM_API_KEY", + "GH_INTERN24_URL", + "LINKEDIN_INTERN_URL", + "PROMPTS_PATH", + "MESSAGE_PATH", + ] + + # Checks to see if the env variables in env_variables + # all exist in the current variables + if not set(os.environ).issuperset(env_variables): + raise EnvironmentError("One or more env variables are not set.") + + def calculate_day_difference(elem: datetime) -> int: """Calculates day difference for job posting times to the relevant day today""" @@ -153,106 +172,3 @@ def determine_customized_message(message: dict) -> str: file_message = json.loads(message)[0] return file_message["Message"] if file_message["Message"] else default - - -# ----------------- PALM API ----------------- - - -MAX_RETRY = 5 # Max number of retrys -palm.configure(api_key=os.getenv("PALM_API_KEY")) - - -def current_model_inuse() -> any: - """Returns the model in use""" - - models = [ - m - for m in palm.list_models() - if "generateText" in m.supported_generation_methods - ] - - model = models[0].name - - return model - - -def parse_gpt_values(gpt_response) -> List[bool]: - """Helper function to parse the gpt response from a str -> List[bool]""" - - response: List[bool] - - for _ in range(MAX_RETRY): - try: - response = json.loads(gpt_response.lower()) - break - except AttributeError: - sleep(0.5) - - return response - - -def filter_out_opportunities(list_of_opps, gpt_response) -> List[Opportunity]: - """Helper function for gpt_job_analyzer() to filter the data""" - - structured_opps = [ - opp for opp, response in zip(list_of_opps, gpt_response) if response - ] - - print(f"Length after GPT analyzed the jobs: {len(structured_opps)}") - return structured_opps - - -def get_parsed_values(prompt) -> List[bool]: - """Function which returns parsed values if the opportunity mathces with the clubs values""" - - defaults = { - "model": "models/text-bison-001", - "temperature": 0.0, - "candidate_count": 1, - "top_k": 100, - "top_p": 0.95, - "max_output_tokens": 3072, - "stop_sequences": [], - "safety_settings": [ - {"category": "HARM_CATEGORY_DEROGATORY", "threshold": 3}, - {"category": "HARM_CATEGORY_TOXICITY", "threshold": 3}, - {"category": "HARM_CATEGORY_VIOLENCE", "threshold": 3}, - {"category": "HARM_CATEGORY_SEXUAL", "threshold": 3}, - {"category": "HARM_CATEGORY_MEDICAL", "threshold": 3}, - {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 3}, - ], - } - - completion = palm.generate_text(**defaults, prompt=prompt) - - parsed_values = parse_gpt_values(completion.result) - return parsed_values - - -def gpt_job_analyze(list_of_opps: List[Opportunity], prompt: str) -> List[Opportunity]: - """Analyzes each job opportunity before being inserted into the DB""" - - print(f"The jobs original length before filtering: {len(list_of_opps)}") - - for opp in list_of_opps: - prompt += f"\nCompany: {opp.company}" - prompt += f"\nTitle: {opp.title}" - prompt += f"\nLocation: {opp.location}" - prompt += "\n" - - parsed_values = [] - for _ in range(MAX_RETRY): # Keep looping until a valid prompt is received - try: - parsed_values = get_parsed_values(prompt) - break - except ( - json.decoder.JSONDecodeError - ): # The type of error that would be received is type JSON - sleep(0.5) - - print(f" Below are the parsed values from GPT\n {parsed_values}") - print(parsed_values) # For debugging purposes - - return filter_out_opportunities( - list_of_opps, parsed_values - ) # Returns filtered out opportunities