From 939112e65e919bdf28a9c0408a77913abfdff88e Mon Sep 17 00:00:00 2001
From: Boushra Bettir <116927138+boushrabettir@users.noreply.github.com>
Date: Tue, 19 Sep 2023 14:49:47 -0700
Subject: [PATCH 1/5] Refactor code

- Placed extra files in the `utility` folder.
- Created `db.py` to store the basic database queries/functionality
- Created `palm.py` to hold all functions for using the PaLM API
- Created `scrap.py` to move all scraping functions into their own file
---
 db.py                                    |  30 ----
 main.py                                  | 211 +++----------------
 utility/db.py                            |  66 +++++++
 opportunity.py => utility/opportunity.py |   0
 utility/palm.py                          | 111 ++++++++++++
 utility/scrap.py                         | 141 +++++++++++++++
 utility.py => utility/utils.py           | 126 +++----------
 7 files changed, 360 insertions(+), 325 deletions(-)
 delete mode 100644 db.py
 create mode 100644 utility/db.py
 rename opportunity.py => utility/opportunity.py (100%)
 create mode 100644 utility/palm.py
 create mode 100644 utility/scrap.py
 rename utility.py => utility/utils.py (60%)

diff --git a/db.py b/db.py
deleted file mode 100644
index b881aac..0000000
--- a/db.py
+++ /dev/null
@@ -1,30 +0,0 @@
-import psycopg2
-import os
-
-
-def instantiate_db_connection():
-    """Returns the connection from the DB"""
-
-    db_uri = os.getenv("DB_URI")
-    return psycopg2.connect(db_uri)
-
-
-def add_column(column_name: str, data_type: str) -> None:
-    """Adds a column for adjustment to the table after the table has been created"""
-
-    with instantiate_db_connection() as connection:
-        cursor = connection.cursor()
-
-        cursor.execute(f"ALTER TABLE jobs_table ADD COLUMN {column_name} {data_type}")
-
-        connection.commit()
-
-
-def delete_alL_opportunity_type(opp_type: str) -> None:
-    """Deletes all opportunities of a specific type for testing purposes only"""
-
-    with instantiate_db_connection() as connection:
-        cursor = connection.cursor()
-
-        cursor.execute("DELETE FROM jobs_table WHERE type = %s", (opp_type,))
-        connection.commit()
diff --git a/main.py b/main.py
index 89785e6..39de0f2 100644
--- a/main.py
+++ b/main.py
@@ -2,194 +2,22 @@
 import os
 import json
 import asyncio
-from typing import List
-import re
 from datetime import date
-import utility as utils
-import db
-import opportunity as opps
-from opportunity import Opportunity, OpportunityType
+from utility import utils
+import utility.db as db
+import utility.opportunity as opps
 from dotenv import load_dotenv
+from utility.scrap import (
+    request_github_internship24_data,
+    request_linkedin_data,
+    request_linkedin_internship24_data,
+)
+from utility.palm import gpt_job_analyze
 
-load_dotenv()  # To obtain keys from the .env file
-
-# ----------------- POSTGRES -----------------
-
-TABLE_NAME = os.getenv("DB_TABLE")
-MAX_LIST_LENGTH = 13
-
-
-def create():
-    """Creates the DB. Only needs to be called once."""
-
-    with db.instantiate_db_connection() as connection:
-        cursor = connection.cursor()
-
-        cursor.execute(
-            f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)"""
-        )
-
-        connection.commit()
-
-
-# ----------------- INTERNSHIP DATA -----------------
-
-
-def request_github_internship24_data() -> List[Opportunity]:
-    """Scrapes Internship Data '24 from Github Repo"""
-
-    url = os.getenv("GH_INTERN24_URL")
-    parse_content = utils.content_parser(url)
-    github_list = []
-    td_elems = parse_content.find_all("tr")
-
-    for cell in td_elems[1:]:
-        if len(github_list) <= MAX_LIST_LENGTH:
-            elements = cell.find_all("td")
-
-            company = elements[0].text
-            title = elements[1].text
-            location = elements[2].text
-            link = elements[3]
-            if "🔒" not in link.text:
-                opportunity = Opportunity(
-                    company,
-                    title,
-                    location,
-                    link.find("a")["href"],
-                    0,
-                    OpportunityType.INTERNSHIP.value,
-                )
-                github_list.append(opportunity)
-
-    return github_list
-
-
-def request_linkedin_internship24_data() -> List[Opportunity]:
-    """Web scrapes Summer '24 Internship Opportunities using LinkedIn"""
-
-    url = os.getenv("LINKEDIN_INTERN_URL")
-
-    parse_content = utils.content_parser(url)
-
-    linkedin_internship_opps = utils.blueprint_opportunity_formatter(
-        parse_content,
-        "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
-        "hidden-nested-link",
-        "base-search-card__title",
-        "job-search-card__location",
-        "base-card__full-link",
-        True,
-        MAX_LIST_LENGTH,
-        OpportunityType.INTERNSHIP.value,
-    )
-
-    return linkedin_internship_opps
-
-
-# ----------------- JOB DATA -----------------
-
-
-def request_rapidapi_indeed_data() -> List[Opportunity]:
-    """
-    This API call retrieves a formatted response object
-    and returns a List[Opportunity] as the result
-    """
-
-    url = os.getenv("RAPID_API_URL")
-    rapid_api_key = os.getenv("RAPID_API_KEY")
-
-    headers = {
-        "X-RapidAPI-Key": rapid_api_key,
-        "X-RapidAPI-Host": "indeed12.p.rapidapi.com",
-    }
-
-    rapid_jobs = []
-    response = requests.get(url, headers=headers).json()
-
-    days_needed_command_value = utils.extract_command_value().days_needed[
-        0
-    ]  # Extracts command-line value
-
-    for elem in response["hits"]:
-        time = elem["formatted_relative_time"]
-
-        numeric = re.search(r"\d+", time)
-        formatted_time_integer = int(numeric.group()) if numeric else 0
-
-        if (
-            len(rapid_jobs) < MAX_LIST_LENGTH
-            and int(days_needed_command_value) >= formatted_time_integer
-        ):
-            company = elem["company_name"]
-            title = elem["title"]
-            location = elem["location"]
-            link = f'https://www.indeed.com/viewjob?jk={elem["id"]}&locality=us'
-            processed = 0
-
-            opportunity = Opportunity(
-                company,
-                title,
-                location,
-                link,
-                processed,
-                OpportunityType.FULL_TIME.value,
-            )
-
-            rapid_jobs.append(opportunity)
-
-    return rapid_jobs
-
-
-def request_linkedin_data() -> List[Opportunity]:
-    """Returns a List[Opportunity] which contains web scraped job content"""
-
-    url = os.getenv("LINKEDIN_URL")
-    parse_content = utils.content_parser(url)
-
-    linked_in_jobs = utils.blueprint_opportunity_formatter(
-        parse_content,
-        "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
-        "hidden-nested-link",
-        "base-search-card__title",
-        "job-search-card__location",
-        "base-card__full-link",
-        True,
-        MAX_LIST_LENGTH,
-        OpportunityType.FULL_TIME.value,
-    )
-
-    return linked_in_jobs
-
-
-# ----------------- RESET FUNCTION (DEBUGGING PURPOSES) -----------------
-
-
-def reset_processed_status(TABLE_NAME):
-    """Jobs status will be set to _processed = 0 for testing a debugging purposes"""
-
-    with db.instantiate_db_connection() as connection:
-        cursor = connection.cursor()
-
-        cursor.execute(
-            f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5"
-        )
-
-        rows = cursor.fetchall()
-
-        for row in rows:
-            company, title, location = row[:3]
-
-            cursor.execute(
-                f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s",
-                (company, title, location),
-            )
-
-        connection.commit()
-
-
-# ----------------- DISCORD BOT -----------------
+# Load and determine if all env variables are set
+load_dotenv()
+utils.verify_set_env_variables()
 
 
 async def execute_opportunities_webhook(webhook_url, job_message, internship_message):
@@ -247,7 +75,10 @@ async def main():
     # Creates table in database
     with_create_table_command = utils.extract_command_value().create
     if with_create_table_command:
-        create()
+        TABLE_NAME = os.getenv("DB_TABLE")
+
+        db.create(TABLE_NAME)
+        print(f"Successfully created {TABLE_NAME}!")
 
         exit()  # Exit the main function to avoid calling other functions
 
@@ -265,7 +96,7 @@
     # Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER.
     job_opps = utils.merge_all_opportunity_data(request_linkedin_data())
 
-    filtered_job_opps = utils.gpt_job_analyze(
+    filtered_job_opps = gpt_job_analyze(
         job_opps,
         prompt_object["full_time"],
     )
@@ -277,7 +108,7 @@
         request_github_internship24_data(),
     )
 
-    filtered_internship_opps = utils.gpt_job_analyze(
+    filtered_internship_opps = gpt_job_analyze(
        internship_opps,
        prompt_object["internship"],
    )
@@ -289,7 +120,7 @@
     # To do so, please comment the function calls above this comment.
     # After, please uncomment the following line of code:
 
-    # reset_processed_status()
+    # db.reset_processed_status()
 
     internship_data_results = opps.list_opportunities(True, "internship", filtered=True)
     job_data_results = opps.list_opportunities(True, "full_time", filtered=True)
@@ -311,5 +142,5 @@
     opps.update_opportunities_status(internship_data_results)
 
 
-if __name__ == "__main__":
-    asyncio.run(main())
+# if __name__ == "__main__":
+#     asyncio.run(main())
diff --git a/utility/db.py b/utility/db.py
new file mode 100644
index 0000000..7371f07
--- /dev/null
+++ b/utility/db.py
@@ -0,0 +1,66 @@
+import psycopg2
+import os
+
+
+def instantiate_db_connection():
+    """Returns the connection from the DB"""
+
+    db_uri = os.getenv("DB_URI")
+    return psycopg2.connect(db_uri)
+
+
+def create(TABLE_NAME: str):
+    """Creates the DB. Only needs to be called once."""
+
+    with instantiate_db_connection() as connection:
+        cursor = connection.cursor()
+
+        cursor.execute(
+            f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)"""
+        )
+
+        connection.commit()
+
+
+def add_column(column_name: str, data_type: str) -> None:
+    """Adds a column for adjustment to the table after the table has been created"""
+
+    with instantiate_db_connection() as connection:
+        cursor = connection.cursor()
+
+        cursor.execute(f"ALTER TABLE jobs_table ADD COLUMN {column_name} {data_type}")
+
+        connection.commit()
+
+
+def delete_all_opportunity_type(opp_type: str) -> None:
+    """Deletes all opportunities of a specific type for testing purposes only"""
+
+    with instantiate_db_connection() as connection:
+        cursor = connection.cursor()
+
+        cursor.execute("DELETE FROM jobs_table WHERE type = %s", (opp_type,))
+        connection.commit()
+
+
+def reset_processed_status(TABLE_NAME):
+    """Jobs status will be set to _processed = 0 for testing and debugging purposes"""
+
+    with instantiate_db_connection() as connection:
+        cursor = connection.cursor()
+
+        cursor.execute(
+            f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5"
+        )
+
+        rows = cursor.fetchall()
+
+        for row in rows:
+            company, title, location = row[:3]
+
+            cursor.execute(
+                f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s",
+                (company, title, location),
+            )
+
+        connection.commit()
diff --git a/opportunity.py b/utility/opportunity.py
similarity index 100%
rename from opportunity.py
rename to utility/opportunity.py
diff --git a/utility/palm.py b/utility/palm.py
new file mode 100644
index 0000000..f2e4606
--- /dev/null
+++ b/utility/palm.py
@@ -0,0 +1,111 @@
+import google.generativeai as palm
+from time import sleep
+import os
+import utils
+from dotenv import load_dotenv
+from typing import List
+import json
+from opportunity import Opportunity
+
+load_dotenv()
+utils.verify_set_env_variables()
+
+
+MAX_RETRY = 5  # Max number of retries
+palm.configure(api_key=os.getenv("PALM_API_KEY"))
+
+
+def current_model_inuse() -> any:
+    """Returns the model in use"""
+
+    models = [
+        m
+        for m in palm.list_models()
+        if "generateText" in m.supported_generation_methods
+    ]
+
+    model = models[0].name
+
+    return model
+
+
+def parse_gpt_values(gpt_response) -> List[bool]:
+    """Helper function to parse the gpt response from a str -> List[bool]"""
+
+    response: List[bool]
+
+    for _ in range(MAX_RETRY):
+        try:
+            response = json.loads(gpt_response.lower())
+            break
+        except AttributeError:
+            sleep(0.5)
+
+    return response
+
+
+def filter_out_opportunities(list_of_opps, gpt_response) -> List[Opportunity]:
+    """Helper function for gpt_job_analyzer() to filter the data"""
+
+    structured_opps = [
+        opp for opp, response in zip(list_of_opps, gpt_response) if response
+    ]
+
+    print(f"Length after GPT analyzed the jobs: {len(structured_opps)}")
+    return structured_opps
+
+
+def get_parsed_values(prompt) -> List[bool]:
+    """Function which returns parsed values if the opportunity matches with the club's values"""
+
+    defaults = {
+        "model": "models/text-bison-001",
+        "temperature": 0.0,
+        "candidate_count": 1,
+        "top_k": 100,
+        "top_p": 0.95,
+        "max_output_tokens": 3072,
+        "stop_sequences": [],
+        "safety_settings": [
+            {"category": "HARM_CATEGORY_DEROGATORY", "threshold": 3},
+            {"category": "HARM_CATEGORY_TOXICITY", "threshold": 3},
+            {"category": "HARM_CATEGORY_VIOLENCE", "threshold": 3},
{"category": "HARM_CATEGORY_SEXUAL", "threshold": 3}, + {"category": "HARM_CATEGORY_MEDICAL", "threshold": 3}, + {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 3}, + ], + } + + completion = palm.generate_text(**defaults, prompt=prompt) + + parsed_values = parse_gpt_values(completion.result) + return parsed_values + + +def gpt_job_analyze(list_of_opps: List[Opportunity], prompt: str) -> List[Opportunity]: + """Analyzes each job opportunity before being inserted into the DB""" + + print(f"The jobs original length before filtering: {len(list_of_opps)}") + + for opp in list_of_opps: + prompt += f"\nCompany: {opp.company}" + prompt += f"\nTitle: {opp.title}" + prompt += f"\nLocation: {opp.location}" + prompt += "\n" + + parsed_values = [] + for _ in range(MAX_RETRY): # Keep looping until a valid prompt is received + try: + parsed_values = get_parsed_values(prompt) + break + except ( + json.decoder.JSONDecodeError + ): # The type of error that would be received is type JSON + sleep(0.5) + + print(f" Below are the parsed values from GPT\n {parsed_values}") + print(parsed_values) # For debugging purposes + + return filter_out_opportunities( + list_of_opps, parsed_values + ) # Returns filtered out opportunities diff --git a/utility/scrap.py b/utility/scrap.py new file mode 100644 index 0000000..5df3e79 --- /dev/null +++ b/utility/scrap.py @@ -0,0 +1,141 @@ +from utility.opportunity import Opportunity, OpportunityType +from typing import List +import utils +import os +from dotenv import load_dotenv +import re +import requests + +load_dotenv() +utils.verify_set_env_variables() + +MAX_OPPORTUNITY_LIST_LENGTH = 13 + +# ----------------- INTERNSHIP DATA ----------------- + + +def request_github_internship24_data() -> List[Opportunity]: + """Scrapes Internship Data '24 from Github Repo""" + + github_list = [] + + url = os.getenv("GH_INTERN24_URL") + parse_content = utils.content_parser(url) + td_elems = parse_content.find_all("tr") + + for cell in td_elems[1:]: + if len(github_list) <= MAX_OPPORTUNITY_LIST_LENGTH: + elements = cell.find_all("td") + + company = elements[0].text + title = elements[1].text + location = elements[2].text + link = elements[3] + if "🔒" not in link.text: + opportunity = Opportunity( + company, + title, + location, + link.find("a")["href"], + 0, + OpportunityType.INTERNSHIP.value, + ) + github_list.append(opportunity) + + return github_list + + +def request_linkedin_internship24_data() -> List[Opportunity]: + """Web scrapes Summer '24 Internship Opportunities using LinkedIn""" + + url = os.getenv("LINKEDIN_INTERN_URL") + parse_content = utils.content_parser(url) + + linkedin_internship_opps = utils.blueprint_opportunity_formatter( + parse_content, + "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card", + "hidden-nested-link", + "base-search-card__title", + "job-search-card__location", + "base-card__full-link", + True, + MAX_OPPORTUNITY_LIST_LENGTH, + OpportunityType.INTERNSHIP.value, + ) + + return linkedin_internship_opps + + +# ----------------- JOB DATA ----------------- + + +def request_rapidapi_indeed_data() -> List[Opportunity]: + """ + This API call retrieves a formatted response object + and returns a List[Opportunity] as the result + """ + + url = os.getenv("RAPID_API_URL") + rapid_api_key = os.getenv("RAPID_API_KEY") + + headers = { + "X-RapidAPI-Key": rapid_api_key, + "X-RapidAPI-Host": "indeed12.p.rapidapi.com", + } + + rapid_jobs = [] + response = requests.get(url, 
+
+    days_needed_command_value = utils.extract_command_value().days_needed[
+        0
+    ]  # Extracts command-line value
+
+    for elem in response["hits"]:
+        time = elem["formatted_relative_time"]
+
+        numeric = re.search(r"\d+", time)
+        formatted_time_integer = int(numeric.group()) if numeric else 0
+
+        if (
+            len(rapid_jobs) < MAX_OPPORTUNITY_LIST_LENGTH
+            and int(days_needed_command_value) >= formatted_time_integer
+        ):
+            company = elem["company_name"]
+            title = elem["title"]
+            location = elem["location"]
+            link = f'https://www.indeed.com/viewjob?jk={elem["id"]}&locality=us'
+            processed = 0
+
+            opportunity = Opportunity(
+                company,
+                title,
+                location,
+                link,
+                processed,
+                OpportunityType.FULL_TIME.value,
+            )
+
+            rapid_jobs.append(opportunity)
+
+    return rapid_jobs
+
+
+def request_linkedin_data() -> List[Opportunity]:
+    """Returns a List[Opportunity] which contains web scraped job content"""
+
+    url = os.getenv("LINKEDIN_URL")
+    parse_content = utils.content_parser(url)
+
+    linked_in_jobs = utils.blueprint_opportunity_formatter(
+        parse_content,
+        "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
+        "hidden-nested-link",
+        "base-search-card__title",
+        "job-search-card__location",
+        "base-card__full-link",
+        True,
+        MAX_OPPORTUNITY_LIST_LENGTH,
+        OpportunityType.FULL_TIME.value,
+    )
+
+    return linked_in_jobs
diff --git a/utility.py b/utility/utils.py
similarity index 60%
rename from utility.py
rename to utility/utils.py
index e91d4da..698bfcb 100644
--- a/utility.py
+++ b/utility/utils.py
@@ -3,9 +3,7 @@
 from typing import List
 import os
 import argparse
-from time import sleep
 import json
-import google.generativeai as palm
 from bs4 import BeautifulSoup
 from opportunity import Opportunity
 from blocklist import BlockList
@@ -37,6 +35,27 @@ def extract_command_value():
     return arguments
 
 
+def verify_set_env_variables() -> any:
+    """Determines if the env variables are all set properly"""
+
+    env_variables = [
+        "LINKEDIN_URL",
+        "DISCORD_WEBHOOK",
+        "DB_URI",
+        "DB_TABLE",
+        "PALM_API_KEY",
+        "GH_INTERN24_URL",
+        "LINKEDIN_INTERN_URL",
+        "PROMPTS_PATH",
+        "MESSAGE_PATH",
+    ]
+
+    # Checks to see if the env variables in env_variables
+    # all exist in the current variables
+    if not set(os.environ).issuperset(env_variables):
+        raise EnvironmentError("One or more env variables are not set.")
+
+
 def calculate_day_difference(elem: datetime) -> int:
     """Calculates day difference for job posting times to the relevant day today"""
 
@@ -150,106 +169,3 @@ def determine_customized_message(message: dict) -> str:
 
     file_message = json.loads(message)[0]
     return file_message["Message"] if file_message["Message"] else default
-
-
-# ----------------- PALM API -----------------
-
-
-MAX_RETRY = 5  # Max number of retrys
-palm.configure(api_key=os.getenv("PALM_API_KEY"))
-
-
-def current_model_inuse() -> any:
-    """Returns the model in use"""
-
-    models = [
-        m
-        for m in palm.list_models()
-        if "generateText" in m.supported_generation_methods
-    ]
-
-    model = models[0].name
-
-    return model
-
-
-def parse_gpt_values(gpt_response) -> List[bool]:
-    """Helper function to parse the gpt response from a str -> List[bool]"""
-
-    response: List[bool]
-
-    for _ in range(MAX_RETRY):
-        try:
-            response = json.loads(gpt_response.lower())
-            break
-        except AttributeError:
-            sleep(0.5)
-
-    return response
-
-
-def filter_out_opportunities(list_of_opps, gpt_response) -> List[Opportunity]:
-    """Helper function for gpt_job_analyzer() to filter the data"""
data""" - - structured_opps = [ - opp for opp, response in zip(list_of_opps, gpt_response) if response - ] - - print(f"Length after GPT analyzed the jobs: {len(structured_opps)}") - return structured_opps - - -def get_parsed_values(prompt) -> List[bool]: - """Function which returns parsed values if the opportunity mathces with the clubs values""" - - defaults = { - "model": "models/text-bison-001", - "temperature": 0.0, - "candidate_count": 1, - "top_k": 100, - "top_p": 0.95, - "max_output_tokens": 3072, - "stop_sequences": [], - "safety_settings": [ - {"category": "HARM_CATEGORY_DEROGATORY", "threshold": 3}, - {"category": "HARM_CATEGORY_TOXICITY", "threshold": 3}, - {"category": "HARM_CATEGORY_VIOLENCE", "threshold": 3}, - {"category": "HARM_CATEGORY_SEXUAL", "threshold": 3}, - {"category": "HARM_CATEGORY_MEDICAL", "threshold": 3}, - {"category": "HARM_CATEGORY_DANGEROUS", "threshold": 3}, - ], - } - - completion = palm.generate_text(**defaults, prompt=prompt) - - parsed_values = parse_gpt_values(completion.result) - return parsed_values - - -def gpt_job_analyze(list_of_opps: List[Opportunity], prompt: str) -> List[Opportunity]: - """Analyzes each job opportunity before being inserted into the DB""" - - print(f"The jobs original length before filtering: {len(list_of_opps)}") - - for opp in list_of_opps: - prompt += f"\nCompany: {opp.company}" - prompt += f"\nTitle: {opp.title}" - prompt += f"\nLocation: {opp.location}" - prompt += "\n" - - parsed_values = [] - for _ in range(MAX_RETRY): # Keep looping until a valid prompt is received - try: - parsed_values = get_parsed_values(prompt) - break - except ( - json.decoder.JSONDecodeError - ): # The type of error that would be received is type JSON - sleep(0.5) - - print(f" Below are the parsed values from GPT\n {parsed_values}") - print(parsed_values) # For debugging purposes - - return filter_out_opportunities( - list_of_opps, parsed_values - ) # Returns filtered out opportunities From 133f201a50009e11e913ea086ea0c1d4f72c9a96 Mon Sep 17 00:00:00 2001 From: Boushra Bettir <116927138+boushrabettir@users.noreply.github.com> Date: Mon, 25 Sep 2023 12:31:55 -0700 Subject: [PATCH 2/5] Update scrap->scrape. 
---
 utility/{scrap.py => scrape.py} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename utility/{scrap.py => scrape.py} (99%)

diff --git a/utility/scrap.py b/utility/scrape.py
similarity index 99%
rename from utility/scrap.py
rename to utility/scrape.py
index 5df3e79..288b012 100644
--- a/utility/scrap.py
+++ b/utility/scrape.py
@@ -9,7 +9,7 @@
 load_dotenv()
 utils.verify_set_env_variables()
 
-MAX_OPPORTUNITY_LIST_LENGTH = 13
+MAX_OPPORTUNITY_LIST_LENGTH = 15
 
 # ----------------- INTERNSHIP DATA -----------------
 

From a6a7c59e06beec7ff7556613bb63fc0aaf2ee48e Mon Sep 17 00:00:00 2001
From: Boushra Bettir <116927138+boushrabettir@users.noreply.github.com>
Date: Mon, 25 Sep 2023 12:32:07 -0700
Subject: [PATCH 3/5] Update import

---
 main.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/main.py b/main.py
index 39de0f2..ce7074e 100644
--- a/main.py
+++ b/main.py
@@ -7,7 +7,7 @@
 import utility.db as db
 import utility.opportunity as opps
 from dotenv import load_dotenv
-from utility.scrap import (
+from utility.scrape import (
     request_github_internship24_data,
     request_linkedin_data,
     request_linkedin_internship24_data,

From 8c7e0162ca1eb92cb6bdaba18e24717e1aeacd80 Mon Sep 17 00:00:00 2001
From: Boushra Bettir <116927138+boushrabettir@users.noreply.github.com>
Date: Fri, 6 Oct 2023 22:31:45 -0700
Subject: [PATCH 4/5] Update imports

---
 main.py                              | 24 ++++++++++++------------
 blocklist.py => utility/blocklist.py |  5 ++++-
 utility/opportunity.py               |  2 +-
 utility/palm.py                      |  4 ++--
 utility/scrape.py                    |  2 +-
 utility/utils.py                     |  4 ++--
 6 files changed, 22 insertions(+), 19 deletions(-)
 rename blocklist.py => utility/blocklist.py (66%)

diff --git a/main.py b/main.py
index ce7074e..22cae04 100644
--- a/main.py
+++ b/main.py
@@ -3,7 +3,7 @@
 import json
 import asyncio
 from datetime import date
-from utility import utils
+import utility.utils as ut
 import utility.db as db
 import utility.opportunity as opps
 from dotenv import load_dotenv
@@ -17,7 +17,7 @@
 
 # Load and determine if all env variables are set
 load_dotenv()
-utils.verify_set_env_variables()
+ut.verify_set_env_variables()
 
 
 async def execute_opportunities_webhook(webhook_url, job_message, internship_message):
@@ -73,7 +73,7 @@ async def execute_opportunities_webhook(webhook_url, job_message, internship_mes
 
 async def main():
     # Creates table in database
-    with_create_table_command = utils.extract_command_value().create
+    with_create_table_command = ut.extract_command_value().create
     if with_create_table_command:
         TABLE_NAME = os.getenv("DB_TABLE")
 
@@ -83,18 +83,18 @@
         exit()  # Exit the main function to avoid calling other functions
 
     file_paths = [os.getenv("MESSAGE_PATH"), os.getenv("PROMPTS_PATH")]
-    customized_object = utils.user_customization(file_paths)
+    customized_object = ut.user_customization(file_paths)
 
     # Determines the customized prompts for PaLM
-    prompt_object = utils.determine_prompts(customized_object["customized_prompts"])
+    prompt_object = ut.determine_prompts(customized_object["customized_prompts"])
 
     # Determines the customized message for the webhook
-    finalized_message = utils.determine_customized_message(
+    finalized_message = ut.determine_customized_message(
         customized_object["customized_message"]
     )
 
     # Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER.
-    job_opps = utils.merge_all_opportunity_data(request_linkedin_data())
- job_opps = utils.merge_all_opportunity_data(request_linkedin_data()) + job_opps = ut.merge_all_opportunity_data(request_linkedin_data()) filtered_job_opps = gpt_job_analyze( job_opps, @@ -103,7 +103,7 @@ async def main(): opps.ingest_opportunities(filtered_job_opps) # Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER. - internship_opps = utils.merge_all_opportunity_data( + internship_opps = ut.merge_all_opportunity_data( request_linkedin_internship24_data(), request_github_internship24_data(), ) @@ -138,9 +138,9 @@ async def main(): discord_webhook, job_formatted_message, internship_formatted_message ) - opps.update_opportunities_status(job_data_results) - opps.update_opportunities_status(internship_data_results) + # opps.update_opportunities_status(job_data_results) + # opps.update_opportunities_status(internship_data_results) -# if __name__ == "__main__": -# asyncio.run(main()) +if __name__ == "__main__": + asyncio.run(main()) diff --git a/blocklist.py b/utility/blocklist.py similarity index 66% rename from blocklist.py rename to utility/blocklist.py index cf9c7ea..be7052c 100644 --- a/blocklist.py +++ b/utility/blocklist.py @@ -2,7 +2,10 @@ class BlockList: """A class holding methods to determine if a company is blocklisted""" BLOCKLISTED_COMPANIES = set( - ["Pattern Learning AI - Career & Tech Recruitment Reimagined!"] + [ + "Pattern Learning AI - Career & Tech Recruitment Reimagined!", + "Patterned Learning AI - Tech Recruitment & Staffing", + ] ) def is_blacklisted_company(self, company: str) -> bool: diff --git a/utility/opportunity.py b/utility/opportunity.py index bbc90cf..b86a1fa 100644 --- a/utility/opportunity.py +++ b/utility/opportunity.py @@ -1,7 +1,7 @@ from dataclasses import dataclass from dotenv import load_dotenv from typing import List -import db +import utility.db as db from enum import Enum import os diff --git a/utility/palm.py b/utility/palm.py index f2e4606..dc478a9 100644 --- a/utility/palm.py +++ b/utility/palm.py @@ -1,11 +1,11 @@ import google.generativeai as palm from time import sleep import os -import utils +import utility.utils as utils from dotenv import load_dotenv from typing import List import json -from opportunity import Opportunity +from utility.opportunity import Opportunity load_dotenv() utils.verify_set_env_variables() diff --git a/utility/scrape.py b/utility/scrape.py index 288b012..15fd870 100644 --- a/utility/scrape.py +++ b/utility/scrape.py @@ -1,6 +1,6 @@ from utility.opportunity import Opportunity, OpportunityType from typing import List -import utils +import utility.utils as utils import os from dotenv import load_dotenv import re diff --git a/utility/utils.py b/utility/utils.py index d74f6e9..ba2001c 100644 --- a/utility/utils.py +++ b/utility/utils.py @@ -5,8 +5,8 @@ import argparse import json from bs4 import BeautifulSoup -from opportunity import Opportunity -from blocklist import BlockList +from utility.opportunity import Opportunity +from utility.blocklist import BlockList # ----------------- FOR CLI LIBRARY COMMAND ----------------- From d21614b7b41d354c42a3a91fb4c9352e7887b89a Mon Sep 17 00:00:00 2001 From: Boushra Bettir <116927138+boushrabettir@users.noreply.github.com> Date: Sat, 7 Oct 2023 20:14:24 -0700 Subject: [PATCH 5/5] Update types + debug message. 
---
 main.py                | 8 +++++---
 utility/db.py          | 4 ++--
 utility/opportunity.py | 2 +-
 utility/palm.py        | 19 ++++++++++++-------
 4 files changed, 20 insertions(+), 13 deletions(-)

diff --git a/main.py b/main.py
index 22cae04..eb13b98 100644
--- a/main.py
+++ b/main.py
@@ -20,7 +20,9 @@
 ut.verify_set_env_variables()
 
 
-async def execute_opportunities_webhook(webhook_url, job_message, internship_message):
+async def execute_opportunities_webhook(
+    webhook_url: str, job_message: str, internship_message: str
+):
     """
     Executes the message which receives the formatted message
     from the format_opportunities() function as well as the webhook
@@ -138,8 +140,8 @@ async def main():
         discord_webhook, job_formatted_message, internship_formatted_message
     )
 
-    # opps.update_opportunities_status(job_data_results)
-    # opps.update_opportunities_status(internship_data_results)
+    opps.update_opportunities_status(job_data_results)
+    opps.update_opportunities_status(internship_data_results)
 
 
 if __name__ == "__main__":
diff --git a/utility/db.py b/utility/db.py
index 7371f07..20fe0f6 100644
--- a/utility/db.py
+++ b/utility/db.py
@@ -9,7 +9,7 @@ def instantiate_db_connection():
     return psycopg2.connect(db_uri)
 
 
-def create(TABLE_NAME: str):
+def create(TABLE_NAME: str) -> None:
     """Creates the DB. Only needs to be called once."""
 
     with instantiate_db_connection() as connection:
@@ -43,7 +43,7 @@ def delete_all_opportunity_type(opp_type: str) -> None:
         connection.commit()
 
 
-def reset_processed_status(TABLE_NAME):
+def reset_processed_status(TABLE_NAME: str) -> None:
     """Jobs status will be set to _processed = 0 for testing and debugging purposes"""
 
     with instantiate_db_connection() as connection:
diff --git a/utility/opportunity.py b/utility/opportunity.py
index b86a1fa..1385543 100644
--- a/utility/opportunity.py
+++ b/utility/opportunity.py
@@ -32,7 +32,7 @@ class Opportunity:
 table_name = os.getenv("DB_TABLE")
 
 
-def ingest_opportunities(job_data):
+def ingest_opportunities(job_data: List[Opportunity]) -> None:
     """Inserts opportunities if and only if they do not already exist"""
     with db.instantiate_db_connection() as connection:
         cursor = connection.cursor()
diff --git a/utility/palm.py b/utility/palm.py
index dc478a9..170b9de 100644
--- a/utility/palm.py
+++ b/utility/palm.py
@@ -29,7 +29,7 @@ def current_model_inuse() -> any:
     return model
 
 
-def parse_gpt_values(gpt_response) -> List[bool]:
+def parse_gpt_values(gpt_response: str) -> List[bool]:
     """Helper function to parse the gpt response from a str -> List[bool]"""
 
     response: List[bool]
@@ -44,18 +44,22 @@ def parse_gpt_values(gpt_response) -> List[bool]:
     return response
 
 
-def filter_out_opportunities(list_of_opps, gpt_response) -> List[Opportunity]:
+def filter_out_opportunities(
+    list_of_opps: List[Opportunity], gpt_response: List[bool]
+) -> List[Opportunity]:
     """Helper function for gpt_job_analyzer() to filter the data"""
 
     structured_opps = [
         opp for opp, response in zip(list_of_opps, gpt_response) if response
     ]
 
-    print(f"Length after GPT analyzed the jobs: {len(structured_opps)}")
+    print(
+        f"Length after GPT analyzed the {list_of_opps[0].type}: {len(structured_opps)}"
+    )
     return structured_opps
 
 
-def get_parsed_values(prompt) -> List[bool]:
+def get_parsed_values(prompt: str) -> List[bool]:
     """Function which returns parsed values if the opportunity matches with the club's values"""
 
     defaults = {
@@ -85,7 +89,9 @@ def get_parsed_values(prompt) -> List[bool]:
 def gpt_job_analyze(list_of_opps: List[Opportunity], prompt: str) -> List[Opportunity]:
     """Analyzes each job opportunity before being inserted into the DB"""
 
-    print(f"The jobs original length before filtering: {len(list_of_opps)}")
+    print(
+        f"The type '{list_of_opps[0].type}' original length before filtering: {len(list_of_opps)}"
+    )
 
     for opp in list_of_opps:
         prompt += f"\nCompany: {opp.company}"
@@ -103,8 +109,7 @@
         ):  # The error received would be a JSON decode error
             sleep(0.5)
 
-    print(f" Below are the parsed values from GPT\n {parsed_values}")
-    print(parsed_values)  # For debugging purposes
+    print(f" Below are the parsed values from GPT - {parsed_values}")
 
     return filter_out_opportunities(
         list_of_opps, parsed_values
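
After PATCH 5, every module lives under utility/ and main.py is the sole entry point. For anyone verifying the series locally, a minimal smoke-test sketch of the reorganized layout follows; it is illustrative only: it assumes the repository root is on PYTHONPATH and a populated .env, and the table name "jobs_table" and the prompt string are placeholders, not values defined by these patches.

    # Sketch: exercising the post-refactor package from a scratch script.
    from dotenv import load_dotenv

    import utility.utils as ut
    import utility.db as db
    from utility.scrape import request_linkedin_data
    from utility.palm import gpt_job_analyze

    # Mirror main.py's startup: load .env, then fail fast on missing variables.
    load_dotenv()
    ut.verify_set_env_variables()

    db.create("jobs_table")  # placeholder name; main.py reads DB_TABLE instead

    # Scrape LinkedIn postings and let PaLM filter them against a prompt.
    jobs = gpt_job_analyze(request_linkedin_data(), "example prompt")  # placeholder prompt
    print(f"{len(jobs)} opportunities kept after filtering")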