Python Code

This is a Python script that scrapes job listings from the Indeed and CWJobs websites for a given job title and location. It uses requests, BeautifulSoup, and Selenium to extract job details (titles, companies, links, and dates listed) and saves the results to an Excel file. The script provides functions for initiating a web driver, running a job search, and extracting the relevant job details from the HTML content.
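For example, a minimal invocation of the Indeed path might look like this (the job title, location, and output filename here are illustrative, not part of the script):

find_jobs_from('Indeed', 'machine learning engineer', 'hong kong',
               ['titles', 'companies', 'links', 'date_listed'],
               filename='indeed_results.xlsx')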


#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Apr 28 11:35:04 2020

@author: chrislovejoy
"""

import os
import urllib.parse  # imported explicitly so urllib.parse.urlencode is available

import requests
import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver

def find_jobs_from(website, job_title, location, desired_characs,
                   filename="results.xlsx"):
    """Scrape job listings from the given website and save them to Excel."""
    # .xlsx (rather than .xls) so pandas can write the file via openpyxl
    if website == 'Indeed':
        job_soup = load_indeed_jobs_div(job_title, location)
        jobs_list, num_listings = extract_job_information_indeed(job_soup,
                                                                 desired_characs)
    elif website == 'CWjobs':
        location_of_driver = os.getcwd()
        driver = initiate_driver(location_of_driver, browser='chrome')
        job_soup = make_job_search(job_title, location, driver)
        jobs_list, num_listings = extract_job_information_cwjobs(job_soup,
                                                                 desired_characs)
    else:
        raise ValueError('Unsupported website: {}'.format(website))

    save_jobs_to_excel(jobs_list, filename)

    print('{} new job postings retrieved from {}. Stored in {}.'.format(
        num_listings, website, filename))

## ======================= GENERIC FUNCTIONS ======================= ##

def save_jobs_to_excel(jobs_list, filename):
    jobs = pd.DataFrame(jobs_list)
    jobs.to_excel(filename)

## ================== FUNCTIONS FOR INDEED.CO.UK =================== ##

def load_indeed_jobs_div(job_title, location):
    """Fetch the Indeed search-results page and return the results column."""
    getVars = {'q': job_title, 'l': location, 'fromage': 'last',
               'sort': 'date'}
    # the query string must follow the /jobs? search path
    url = ('https://hk.indeed.com/jobs?' + urllib.parse.urlencode(getVars))
    page = requests.get(url)
    soup = BeautifulSoup(page.content, "html.parser")
    job_soup = soup.find(id="resultsCol")
    return job_soup

def extract_job_information_indeed(job_soup, desired_characs):
    """Pull the requested characteristics out of each Indeed job card."""
    job_elems = job_soup.find_all('div', class_='jobsearch-SerpJobCard')

    cols = []
    extracted_info = []

    if 'titles' in desired_characs:
        titles = []
        cols.append('titles')
        for job_elem in job_elems:
            titles.append(extract_job_title_indeed(job_elem))
        extracted_info.append(titles)

    if 'companies' in desired_characs:
        companies = []
        cols.append('companies')
        for job_elem in job_elems:
            companies.append(extract_company_indeed(job_elem))
        extracted_info.append(companies)

    if 'links' in desired_characs:
        links = []
        cols.append('links')
        for job_elem in job_elems:
            links.append(extract_link_indeed(job_elem))
        extracted_info.append(links)

    if 'date_listed' in desired_characs:
        dates = []
        cols.append('date_listed')
        for job_elem in job_elems:
            dates.append(extract_date_indeed(job_elem))
        extracted_info.append(dates)

    jobs_list = {}

    for j in range(len(cols)):
        jobs_list[cols[j]] = extracted_info[j]

    # guard against an empty desired_characs list
    num_listings = len(extracted_info[0]) if extracted_info else 0

    return jobs_list, num_listings

def extract_job_title_indeed(job_elem):
    title_elem = job_elem.find('h2', class_='title')
    title = title_elem.text.strip()
    return title

def extract_company_indeed(job_elem):
    company_elem = job_elem.find('span', class_='company')
    company = company_elem.text.strip()
    return company

def extract_link_indeed(job_elem):
    # hrefs are site-relative, so prefix the scheme and the search domain
    link = job_elem.find('a')['href']
    link = 'https://hk.indeed.com' + link
    return link

def extract_date_indeed(job_elem):
    date_elem = job_elem.find('span', class_='date')
    date = date_elem.text.strip()
    return date

## ================== FUNCTIONS FOR CWJOBS.CO.UK =================== ##

def initiate_driver(location_of_driver, browser):
    """Start a Selenium driver; the driver binary is expected alongside the script."""
    if browser == 'chrome':
        driver = webdriver.Chrome(executable_path=(location_of_driver +
                                                   "/chromedriver"))
    elif browser == 'firefox':
        # Firefox's driver binary is named geckodriver
        driver = webdriver.Firefox(executable_path=(location_of_driver +
                                                    "/geckodriver"))
    elif browser == 'safari':
        # macOS ships safaridriver system-wide, so no path is needed
        driver = webdriver.Safari()
    elif browser == 'edge':
        # Edge's driver binary is named msedgedriver
        driver = webdriver.Edge(executable_path=(location_of_driver +
                                                 "/msedgedriver"))
    return driver

def make_job_search(job_title, location, driver):
    driver.get('https://www.cwjobs.co.uk/')

    # Select the job title box
    job_title_box = driver.find_element_by_name('Keywords')

    # Send job information
    job_title_box.send_keys(job_title)

    # Select the location box
    location_box = driver.find_element_by_id('location')

    # Send location information
    location_box.send_keys(location)

    # Find and click the search button
    search_button = driver.find_element_by_id('search-button')
    search_button.click()

    # Give the results page a moment to load
    driver.implicitly_wait(5)

    page_source = driver.page_source

    job_soup = BeautifulSoup(page_source, "html.parser")

    return job_soup

def extract_job_information_cwjobs(job_soup, desired_characs):
    """Pull the requested characteristics out of each CWJobs listing."""
    job_elems = job_soup.find_all('div', class_="job")

    cols = []
    extracted_info = []

    if 'titles' in desired_characs:
        titles = []
        cols.append('titles')
        for job_elem in job_elems:
            titles.append(extract_job_title_cwjobs(job_elem))
        extracted_info.append(titles)

    if 'companies' in desired_characs:
        companies = []
        cols.append('companies')
        for job_elem in job_elems:
            companies.append(extract_company_cwjobs(job_elem))
        extracted_info.append(companies)

    if 'links' in desired_characs:
        links = []
        cols.append('links')
        for job_elem in job_elems:
            links.append(extract_link_cwjobs(job_elem))
        extracted_info.append(links)

    if 'date_listed' in desired_characs:
        dates = []
        cols.append('date_listed')
        for job_elem in job_elems:
            dates.append(extract_date_cwjobs(job_elem))
        extracted_info.append(dates)

    jobs_list = {}

    for j in range(len(cols)):
        jobs_list[cols[j]] = extracted_info[j]

    # guard against an empty desired_characs list
    num_listings = len(extracted_info[0]) if extracted_info else 0

    return jobs_list, num_listings

def extract_job_title_cwjobs(job_elem):
    title_elem = job_elem.find('h2')
    title = title_elem.text.strip()
    return title

def extract_company_cwjobs(job_elem):
    company_elem = job_elem.find('h3')
    company = company_elem.text.strip()
    return company

def extract_link_cwjobs(job_elem):
    link = job_elem.find('a')['href']
    return link

def extract_date_cwjobs(job_elem):
    date_elem = job_elem.find('li', class_='date-posted')
    date = date_elem.text.strip()
    return date
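A minimal usage sketch for the Selenium path (the search terms below are illustrative, not part of the original script, and the CWjobs route assumes a chromedriver binary sits in the working directory):

if __name__ == '__main__':
    desired_characs = ['titles', 'companies', 'links', 'date_listed']
    find_jobs_from('CWjobs', 'data scientist', 'london', desired_characs,
                   filename='cwjobs_results.xlsx')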
