Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Refactor code #29

Merged
merged 6 commits into from
Oct 8, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 0 additions & 30 deletions db.py

This file was deleted.

225 changes: 28 additions & 197 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,199 +2,27 @@
import os
import json
import asyncio
from typing import List
import re
from datetime import date
import utility as utils
import db
import opportunity as opps
from opportunity import Opportunity, OpportunityType
import utility.utils as ut
import utility.db as db
import utility.opportunity as opps
from dotenv import load_dotenv
from blocklist import BlockList
from utility.scrape import (
request_github_internship24_data,
request_linkedin_data,
request_linkedin_internship24_data,
)
from utility.palm import gpt_job_analyze

load_dotenv() # To obtain keys from the .env file

# Load and determine if all env variables are set
load_dotenv()
ut.verify_set_env_variables()

# ----------------- POSTGRES -----------------

# Name of the database table, read from the DB_TABLE environment variable.
TABLE_NAME = os.getenv("DB_TABLE")
# Cap on how many opportunities each scraper in this module collects.
MAX_LIST_LENGTH = 15


def create():
    """Create the opportunities table if it does not already exist.

    The table name is taken from the module-level ``TABLE_NAME``. The
    statement uses ``IF NOT EXISTS``, so calling this more than once is
    harmless, but it only needs to be called once.
    """
    from contextlib import closing

    # A psycopg2 connection used as a context manager only commits or rolls
    # back the transaction on exit; it does not close the connection.
    # ``closing`` guarantees the connection is released afterwards.
    with closing(db.instantiate_db_connection()) as connection:
        with connection:  # commits on success, rolls back on error
            cursor = connection.cursor()
            cursor.execute(
                f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)"""
            )

# ----------------- INTERNSHIP DATA -----------------


def request_github_internship24_data() -> List[Opportunity]:
    """Scrape '24 internship postings from the GitHub repo page.

    Fetches the page at ``GH_INTERN24_URL``, walks its table rows and builds
    an ``Opportunity`` for every row whose company is not blocklisted and
    whose link cell does not contain the lock marker (🔒).

    Returns:
        A list of at most ``MAX_LIST_LENGTH`` internship opportunities.
    """
    url = os.getenv("GH_INTERN24_URL")
    parse_content = utils.content_parser(url)
    blocklist = BlockList()  # one instance is enough for every row
    github_list = []

    # The first <tr> is skipped (presumably the table header).
    for row in parse_content.find_all("tr")[1:]:
        # The previous `<=` check admitted MAX_LIST_LENGTH + 1 entries.
        if len(github_list) >= MAX_LIST_LENGTH:
            break

        cells = row.find_all("td")
        if len(cells) < 4:
            # Malformed or separator row: company/title/location/link missing.
            continue

        company = cells[0].text
        if blocklist.is_blacklisted_company(company):
            continue

        link = cells[3]
        if "🔒" in link.text:
            continue

        anchor = link.find("a")
        if anchor is None:
            # No hyperlink in the link cell, so there is nothing to post.
            continue

        github_list.append(
            Opportunity(
                company,
                cells[1].text,
                cells[2].text,
                anchor["href"],
                0,
                OpportunityType.INTERNSHIP.value,
            )
        )

    return github_list


def request_linkedin_internship24_data() -> List[Opportunity]:
    """Scrape Summer '24 internship listings from LinkedIn.

    Parses the page at ``LINKEDIN_INTERN_URL`` and hands it to
    ``utils.blueprint_opportunity_formatter`` to build the result list.

    Returns:
        Whatever the formatter produces for the internship opportunity type.
    """
    page = utils.content_parser(os.getenv("LINKEDIN_INTERN_URL"))

    # NOTE(review): the string arguments look like the CSS class names for the
    # card, company, title, location and link elements - confirm against
    # blueprint_opportunity_formatter.
    return utils.blueprint_opportunity_formatter(
        page,
        "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
        "hidden-nested-link",
        "base-search-card__title",
        "job-search-card__location",
        "base-card__full-link",
        True,
        MAX_LIST_LENGTH,
        OpportunityType.INTERNSHIP.value,
    )


# ----------------- JOB DATA -----------------


def request_rapidapi_indeed_data() -> List[Opportunity]:
    """Fetch full-time job postings from the RapidAPI Indeed endpoint.

    Calls ``RAPID_API_URL`` with the ``RAPID_API_KEY`` header, then keeps the
    hits whose relative posting time (the first number found in
    ``formatted_relative_time``, or 0 when there is none) does not exceed the
    ``days_needed`` command-line value.

    Returns:
        A list of at most ``MAX_LIST_LENGTH`` full-time opportunities.

    Raises:
        requests.exceptions.Timeout: if the API does not answer in time.
        KeyError: if the response payload has no ``"hits"`` entry.
    """
    url = os.getenv("RAPID_API_URL")
    headers = {
        "X-RapidAPI-Key": os.getenv("RAPID_API_KEY"),
        "X-RapidAPI-Host": "indeed12.p.rapidapi.com",
    }

    # Without a timeout, requests can wait forever on a stalled server.
    request_timeout_seconds = 30
    response = requests.get(
        url, headers=headers, timeout=request_timeout_seconds
    ).json()

    # Command-line value, converted once rather than on every iteration.
    max_age = int(utils.extract_command_value().days_needed[0])

    rapid_jobs = []
    for elem in response["hits"]:
        if len(rapid_jobs) >= MAX_LIST_LENGTH:
            break  # list is full; nothing further can be added

        numeric = re.search(r"\d+", elem["formatted_relative_time"])
        posting_age = int(numeric.group()) if numeric else 0
        if posting_age > max_age:
            continue

        rapid_jobs.append(
            Opportunity(
                elem["company_name"],
                elem["title"],
                elem["location"],
                f'https://www.indeed.com/viewjob?jk={elem["id"]}&locality=us',
                0,
                OpportunityType.FULL_TIME.value,
            )
        )

    return rapid_jobs


def request_linkedin_data() -> List[Opportunity]:
    """Scrape full-time job listings from LinkedIn.

    Parses the page at ``LINKEDIN_URL`` and hands it to
    ``utils.blueprint_opportunity_formatter`` to build the result list.

    Returns:
        Whatever the formatter produces for the full-time opportunity type.
    """
    page = utils.content_parser(os.getenv("LINKEDIN_URL"))

    # NOTE(review): the string arguments look like the CSS class names for the
    # card, company, title, location and link elements - confirm against
    # blueprint_opportunity_formatter.
    return utils.blueprint_opportunity_formatter(
        page,
        "base-card relative w-full hover:no-underline focus:no-underline base-card--link base-search-card base-search-card--link job-search-card",
        "hidden-nested-link",
        "base-search-card__title",
        "job-search-card__location",
        "base-card__full-link",
        True,
        MAX_LIST_LENGTH,
        OpportunityType.FULL_TIME.value,
    )


# ----------------- RESET FUNCTION (DEBUGGING PURPOSES) -----------------


def reset_processed_status(TABLE_NAME):
    """Set ``processed`` back to 0 for a handful of rows (testing/debugging).

    Selects up to five rows with ``processed = 1`` and then clears the flag on
    every row sharing the same company, title and location.

    Args:
        TABLE_NAME: Name of the table to update. It is interpolated into the
            SQL text, so pass only trusted values.
    """
    from contextlib import closing

    # A psycopg2 connection used as a context manager only commits or rolls
    # back the transaction on exit; it does not close the connection.
    # ``closing`` guarantees the connection is released afterwards.
    with closing(db.instantiate_db_connection()) as connection:
        with connection:  # commits on success, rolls back on error
            cursor = connection.cursor()

            cursor.execute(
                f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5"
            )

            for company, title, location in cursor.fetchall():
                cursor.execute(
                    f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s",
                    (company, title, location),
                )


# ----------------- DISCORD BOT -----------------


async def execute_opportunities_webhook(webhook_url, job_message, internship_message):
async def execute_opportunities_webhook(
webhook_url: str, job_message: str, internship_message: str
):
"""
Executes the message which receives the formatted message
from the format_opportunities() function as well as the webhook
Expand Down Expand Up @@ -247,39 +75,42 @@ async def execute_opportunities_webhook(webhook_url, job_message, internship_mes

async def main():
# Creates table in database
with_create_table_command = utils.extract_command_value().create
with_create_table_command = ut.extract_command_value().create
if with_create_table_command:
create()
TABLE_NAME = os.getenv("DB_TABLE")

db.create(TABLE_NAME)

print(f"Sucessfully created {TABLE_NAME}!")
exit() # Exit the main function to avoid calling other functions

file_paths = [os.getenv("MESSAGE_PATH"), os.getenv("PROMPTS_PATH")]
customized_object = utils.user_customization(file_paths)
customized_object = ut.user_customization(file_paths)

# Determines the customized prompts for PaLM
prompt_object = utils.determine_prompts(customized_object["customized_prompts"])
prompt_object = ut.determine_prompts(customized_object["customized_prompts"])

# Determines the customized message for the webhook
finalized_message = utils.determine_customized_message(
finalized_message = ut.determine_customized_message(
customized_object["customized_message"]
)

# Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER.
job_opps = utils.merge_all_opportunity_data(request_linkedin_data())
job_opps = ut.merge_all_opportunity_data(request_linkedin_data())

filtered_job_opps = utils.gpt_job_analyze(
filtered_job_opps = gpt_job_analyze(
job_opps,
prompt_object["full_time"],
)
opps.ingest_opportunities(filtered_job_opps)

# Consolidates all job-related opportunities into a comprehensive List[Opportunity], eliminating repetitive calls to the LLM SERVER.
internship_opps = utils.merge_all_opportunity_data(
internship_opps = ut.merge_all_opportunity_data(
request_linkedin_internship24_data(),
request_github_internship24_data(),
)

filtered_internship_opps = utils.gpt_job_analyze(
filtered_internship_opps = gpt_job_analyze(
internship_opps,
prompt_object["internship"],
)
Expand All @@ -291,7 +122,7 @@ async def main():
# To do so, please comment the function calls above this comment.
# After, please uncomment the following line of code:

# reset_processed_status()
# db.reset_processed_status()

internship_data_results = opps.list_opportunities(True, "internship", filtered=True)
job_data_results = opps.list_opportunities(True, "full_time", filtered=True)
Expand Down
5 changes: 4 additions & 1 deletion blocklist.py → utility/blocklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,10 @@ class BlockList:
"""A class holding methods to determine if a company is blocklisted"""

BLOCKLISTED_COMPANIES = set(
["Pattern Learning AI - Career & Tech Recruitment Reimagined!"]
[
"Pattern Learning AI - Career & Tech Recruitment Reimagined!",
"Patterned Learning AI - Tech Recruitment & Staffing",
]
)

def is_blacklisted_company(self, company: str) -> bool:
Expand Down
66 changes: 66 additions & 0 deletions utility/db.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import psycopg2
import os


def instantiate_db_connection():
    """Open and return a new psycopg2 connection using the ``DB_URI`` env variable."""
    return psycopg2.connect(os.getenv("DB_URI"))


def create(TABLE_NAME: str) -> None:
    """Create the opportunities table if it does not already exist.

    The statement uses ``IF NOT EXISTS``, so calling this more than once is
    harmless, but it only needs to be called once.

    Args:
        TABLE_NAME: Name of the table to create. It is interpolated into the
            SQL text, so pass only trusted values.
    """
    from contextlib import closing

    # A psycopg2 connection used as a context manager only commits or rolls
    # back the transaction on exit; it does not close the connection.
    # ``closing`` guarantees the connection is released afterwards.
    with closing(instantiate_db_connection()) as connection:
        with connection:  # commits on success, rolls back on error
            cursor = connection.cursor()
            cursor.execute(
                f"""CREATE TABLE IF NOT EXISTS {TABLE_NAME}(company TEXT, title TEXT, location TEXT, link TEXT, processed INTEGER DEFAULT 0)"""
            )


def add_column(
    column_name: str, data_type: str, TABLE_NAME: str = "jobs_table"
) -> None:
    """Add a column to an already-created table.

    Args:
        column_name: Name of the new column.
        data_type: SQL type of the new column (for example ``TEXT``).
        TABLE_NAME: Table to alter. Defaults to ``jobs_table``, the name this
            function previously had hard-coded.

    All three values are interpolated into the SQL text (DDL statements cannot
    take bound parameters), so pass only trusted values.
    """
    from contextlib import closing

    # A psycopg2 connection used as a context manager only commits or rolls
    # back the transaction on exit; it does not close the connection.
    # ``closing`` guarantees the connection is released afterwards.
    with closing(instantiate_db_connection()) as connection:
        with connection:  # commits on success, rolls back on error
            cursor = connection.cursor()
            cursor.execute(
                f"ALTER TABLE {TABLE_NAME} ADD COLUMN {column_name} {data_type}"
            )


def delete_all_opportunity_type(
    opp_type: str, TABLE_NAME: str = "jobs_table"
) -> None:
    """Delete every opportunity of the given type (for testing purposes only).

    Args:
        opp_type: Value of the ``type`` column whose rows are removed. Passed
            as a bound parameter.
        TABLE_NAME: Table to delete from. Defaults to ``jobs_table``, the name
            this function previously had hard-coded. It is interpolated into
            the SQL text, so pass only trusted values.
    """
    from contextlib import closing

    # A psycopg2 connection used as a context manager only commits or rolls
    # back the transaction on exit; it does not close the connection.
    # ``closing`` guarantees the connection is released afterwards.
    with closing(instantiate_db_connection()) as connection:
        with connection:  # commits on success, rolls back on error
            cursor = connection.cursor()
            cursor.execute(
                f"DELETE FROM {TABLE_NAME} WHERE type = %s", (opp_type,)
            )


def reset_processed_status(TABLE_NAME: str) -> None:
    """Set ``processed`` back to 0 for a handful of rows (testing/debugging).

    Selects up to five rows with ``processed = 1`` and then clears the flag on
    every row sharing the same company, title and location.

    Args:
        TABLE_NAME: Name of the table to update. It is interpolated into the
            SQL text, so pass only trusted values.
    """
    from contextlib import closing

    # A psycopg2 connection used as a context manager only commits or rolls
    # back the transaction on exit; it does not close the connection.
    # ``closing`` guarantees the connection is released afterwards.
    with closing(instantiate_db_connection()) as connection:
        with connection:  # commits on success, rolls back on error
            cursor = connection.cursor()

            cursor.execute(
                f"SELECT company, title, location FROM {TABLE_NAME} WHERE processed = 1 LIMIT 5"
            )

            for company, title, location in cursor.fetchall():
                cursor.execute(
                    f"UPDATE {TABLE_NAME} SET processed = 0 WHERE company = %s AND title = %s AND location = %s",
                    (company, title, location),
                )
Loading