feature/workday_scraper #67
Open
wants to merge 5 commits into base: master
4 changes: 3 additions & 1 deletion config/requirements.txt
@@ -2,4 +2,6 @@ beautifulsoup4==4.9.3
requests==2.25.1
openai
python-dotenv
windows-curses; sys_platform == 'win32'
windows-curses; sys_platform == 'win32'
selenium==4.25.0
webdriver-manager==4.0.2
5 changes: 5 additions & 0 deletions docker/Dockerfile
@@ -10,6 +10,11 @@ WORKDIR /commandjobs
# Install any needed packages specified in requirements.txt
RUN pip3 install --no-cache-dir -r config/requirements.txt

# Install required packages, including Chromium and ChromeDriver
Owner comment: Excellent :)

RUN apt-get update && \
apt-get install -y wget unzip chromium chromium-driver && \
apt-get clean

# Run menu.py when the container launches
CMD ["python3", "src/menu.py"]

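Note: since the image installs chromium and chromium-driver via apt, a possible simplification is to point Selenium at those system binaries instead of having webdriver-manager download a driver at runtime. A minimal sketch, assuming the usual Debian paths /usr/bin/chromium and /usr/bin/chromedriver (paths are an assumption, not taken from this PR):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service

options = Options()
options.add_argument("--headless")
options.add_argument("--no-sandbox")
options.binary_location = "/usr/bin/chromium"  # assumed path from the Debian chromium package
service = Service(executable_path="/usr/bin/chromedriver")  # assumed path from chromium-driver
driver = webdriver.Chrome(service=service, options=options)
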
6 changes: 5 additions & 1 deletion docker/docker-compose.yml
@@ -8,20 +8,24 @@ services:
build:
context: ..
dockerfile: docker/Dockerfile

# Set container name
container_name: commandjobs

# Set environment variables
environment:
- MENU_APP=src/menu.py
- PYTHONPATH=/commandjobs

# Mount the entire project into the docker container under /commandjobs
volumes:
- ../:/commandjobs
- ./docker/docker-entrypoint.sh:/commandjobs/docker-entrypoint.sh

# Use host network mode (may require changes depending on Docker environment)
network_mode: host

tty: true # Allocate a pseudo-TTY
stdin_open: true # Keep STDIN open

working_dir: /commandjobs
Empty file added job_scraper/__init__.py
Empty file.
Empty file.
2 changes: 0 additions & 2 deletions src/hn_scraper.py → job_scraper/hacker_news/scraper.py
@@ -1,8 +1,6 @@
import requests
from bs4 import BeautifulSoup
import sqlite3
import os
from queue import Queue

# Define a new exception for interrupting scraping
class ScrapingInterrupt(Exception):
Empty file.
9 changes: 9 additions & 0 deletions job_scraper/scraper_selectors/workday_selectors.py
@@ -0,0 +1,9 @@
from enum import StrEnum


class WorkDaySelectors(StrEnum):
JOB_LISTING_XPATH = '//li[@class="css-1q2dra3"]'
JOB_TITLE_XPATH = './/h3/a'
JOB_ID_XPATH = './/ul[@data-automation-id="subtitle"]/li'
POSTED_ON_XPATH = './/dd[@class="css-129m7dg"][preceding-sibling::dt[contains(text(),"posted on")]]'
JOB_DESCRIPTION_XPATH = '//div[@data-automation-id="jobPostingDescription"]'
14 changes: 14 additions & 0 deletions job_scraper/utils.py
@@ -0,0 +1,14 @@
def get_workday_company_urls() -> dict:
urls = {
'NVIDIA': 'https://nvidia.wd5.myworkdayjobs.com/NVIDIAExternalCareerSite?jobFamilyGroup=0c40f6bd1d8f10ae43ffaefd46dc7e78',
'SALESFORCE': 'https://salesforce.wd12.myworkdayjobs.com/en-US/External_Career_Site/details/Lead-Marketing-Cloud-Solution-Engineer_JR268932?jobFamilyGroup=14fa3452ec7c1011f90d0002a2100000',
# 'RED_HAT': 'https://redhat.wd5.myworkdayjobs.com/Jobs',
# 'CROWDSTRIKE': 'https://crowdstrike.wd5.myworkdayjobs.com/crowdstrikecareers'
}
return urls

def get_workday_post_time_range() -> list[str]:
return ['posted today']
# return ['posted today', 'posted yesterday', 'posted 2 days ago', 'posted 3 days ago',
# 'posted 4 days ago', 'posted 5 days ago', 'posted 6 days ago', 'posted 7 days ago']

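Note: the commented-out list suggests the intended window is the past week. A hedged way to build those labels without hard-coding each one (the exact Workday wording is assumed from the strings above):

def get_workday_post_time_range(days: int = 7) -> list[str]:
    # 'posted today', 'posted yesterday', then 'posted N days ago' up to the window size
    labels = ['posted today', 'posted yesterday']
    labels += [f'posted {n} days ago' for n in range(2, days + 1)]
    return labels
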
Empty file added job_scraper/workday/__init__.py
Empty file.
131 changes: 131 additions & 0 deletions job_scraper/workday/scraper.py
@@ -0,0 +1,131 @@
import sqlite3
import time
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, StaleElementReferenceException
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait

from job_scraper.scraper_selectors.workday_selectors import WorkDaySelectors
from job_scraper.utils import get_workday_post_time_range, get_workday_company_urls


class WorkdayScraper:
def __init__(self, db_path='job_listings.db', update_func=None, done_event=None, result_queue=None):
self.db_path = db_path
self.driver = webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=self.get_selenium_configs())
Owner comment: Very helpful 👍🏼

self.one_week_span_text = get_workday_post_time_range()
self.company_urls = get_workday_company_urls()
self.new_entries_count = 0
self.done_event = done_event
self.result_queue = result_queue
self.update_func = update_func
self.job_listings = []

@staticmethod
def get_selenium_configs() -> Options:
chrome_options = Options()
chrome_options.add_argument("--headless")
chrome_options.add_argument("--no-sandbox")
chrome_options.add_argument("--disable-dev-shm-usage")
chrome_options.add_argument("--disable-gpu")
return chrome_options

def save_to_database(self, original_text, original_html, source, external_id):
"""Save a job listing to the SQLite database."""
conn = sqlite3.connect(self.db_path)
c = conn.cursor()
c.execute("INSERT OR IGNORE INTO job_listings (original_text, original_html, source, external_id) VALUES (?, ?, ?, ?)",
(original_text, original_html, source, external_id))
conn.commit()
conn.close()
return c.rowcount > 0

def save_new_job_listing(self, job_description, job_description_html, job_url, job_id):
if not job_description:
return
if not job_description_html:
return
if not job_url:
return
if not job_id:
return
self.job_listings.append({
'original_text': job_description,
'original_html': job_description_html,
'source': job_url,
'external_id': job_id
})

def save_job_listings_to_db(self):
for job in self.job_listings:
inserted = self.save_to_database(
job['original_text'],
job['original_html'],
job['source'],
job['external_id']
)
if inserted:
self.new_entries_count += 1
if self.done_event:
self.result_queue.put(self.new_entries_count)
self.done_event.set()

def scrape(self):
for company_name, company_url in self.company_urls.items():
self.driver.get(company_url)
wait = WebDriverWait(self.driver, 10)
self.update_func(f"Scraping Workday companies:\t{", ".join(self.company_urls.keys())}")

posted_this_week = True
while posted_this_week:
try:
wait.until(EC.presence_of_element_located((By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)))
except TimeoutException:
self.update_func("Job Listing Element not found. Try again later")
break

job_elements = self.driver.find_elements(By.XPATH, WorkDaySelectors.JOB_LISTING_XPATH)
for job_element in job_elements:
try:
self.update_func(f"*{company_name}* \n {self.driver.current_url}")
job_title_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_TITLE_XPATH)
job_id_element = job_element.find_element(By.XPATH, WorkDaySelectors.JOB_ID_XPATH)
job_id = job_id_element.text
posted_on_element = job_element.find_element(By.XPATH, WorkDaySelectors.POSTED_ON_XPATH)
posted_on = posted_on_element.text

if posted_on.lower() in self.one_week_span_text:
job_url = job_title_element.get_attribute('href')
job_title_element.click()
job_description_element = wait.until(
EC.presence_of_element_located((By.XPATH, WorkDaySelectors.JOB_DESCRIPTION_XPATH))
)
job_description = job_description_element.text
job_description_html = job_description_element.get_attribute("innerHTML")
self.save_new_job_listing(job_description, job_description_html, job_url, job_id)
else:
posted_this_week = False
break
except StaleElementReferenceException:
self.update_func("Encountered an issue while fetching job list. Retrying...")
time.sleep(1)

if not posted_this_week:
break

try:
next_page_button = wait.until(
EC.element_to_be_clickable((By.XPATH, "//button[@data-uxi-element-id='next']"))
)
next_page_button.click()
except TimeoutException:
self.update_func("TimeoutException. Please try again later!")
break

self.save_job_listings_to_db()
self.update_func("Scraping completed for all companies.")
Empty file added src/__init__.py
Empty file.
34 changes: 25 additions & 9 deletions src/menu.py
@@ -1,8 +1,7 @@
import curses
import textwrap
import os
import time
from hn_scraper import HNScraper
from job_scraper.hacker_news.scraper import HNScraper
from display_table import draw_table
from database_manager import DatabaseManager
from display_matching_table import MatchingTableDisplay
@@ -15,6 +14,7 @@
from queue import Queue
from dotenv import load_dotenv

from job_scraper.workday.scraper import WorkdayScraper
from work_startup_scraper import WorkStartupScraper

DB_PATH='job_listings.db'
@@ -66,8 +66,10 @@ def __init__(self, stdscr, logger):
if self.total_ai_job_recommendations > 0:
ai_recommendations_menu = f"✅ {self.total_ai_job_recommendations} recommended listings, out of {total_processed}"

self.menu_items = [resume_menu, "🕸 Scrape \"Ask HN: Who's hiring?\"",
self.menu_items = [resume_menu,
"🕸 Scrape \"Ask HN: Who's hiring?\"",
"🕸 Scrape \"Work at a Startup jobs\"",
"🕸 Scrape \"Workday\"",
Owner comment: 😄

db_menu_item, find_best_matches_menu,
ai_recommendations_menu] # New menu option added
self.current_row = 0
Expand Down Expand Up @@ -217,9 +219,9 @@ def update_menu_items(self):

# Update the relevant menu items
self.menu_items[0] = resume_menu
self.menu_items[3] = db_menu_item
self.menu_items[4] = find_best_matches_menu
self.menu_items[5] = ai_recommendations_menu
self.menu_items[4] = db_menu_item
self.menu_items[5] = find_best_matches_menu
self.menu_items[6] = ai_recommendations_menu

# Redraw the menu to reflect the updated items
self.draw_menu()
@@ -236,11 +238,13 @@ def execute_menu_action(self):
self.start_scraping_with_status_updates()
elif self.current_row == 2: # Scrape Work at a Startup jobs
self.start_scraping_WaaS_with_status_updates()
elif self.current_row == 3: # Navigate jobs in local db
elif self.current_row == 3: # Scrape Workday
self.start_scraping_workday_with_status_updates()
elif self.current_row == 4: # Navigate jobs in local db
draw_table(self.stdscr, self.db_path)
elif self.current_row == 4: # "Process job listings with GPT" option
elif self.current_row == 5: # "Process job listings with GPT" option
exit_message = asyncio.run(self.process_with_gpt())
elif self.current_row == 5: # Index of the new menu option
elif self.current_row == 6: # Index of the new menu option
self.table_display.draw_table()
self.stdscr.clear()
self.update_menu_items()
Expand Down Expand Up @@ -356,6 +360,18 @@ def start_scraping_WaaS_with_status_updates(self):
time.sleep(3)
self.stdscr.clear()

def start_scraping_workday_with_status_updates(self):
result_queue = Queue()
self.scraper = WorkdayScraper(self.db_path, self.update_status_bar, self.scraping_done_event, result_queue)
scraping_thread = threading.Thread(target=self.scraper.scrape)
scraping_thread.start()
self.scraping_done_event.wait()
new_listings_count = result_queue.get()
self.update_status_bar(f"Scraping of Workday completed {new_listings_count} new listings added")
self.scraping_done_event.clear()
time.sleep(3)
self.stdscr.clear()


# Despite the name of the method, this currently
# is not handling scrolling 😅