Main Code

The document outlines a Scrapy spider class named Act113Spider designed to scrape violation data from the Good Jobs First website, specifically targeting OFAC-related entries. It includes configurations for cookies, headers, and custom settings, as well as methods for making requests, parsing listings, and extracting detailed company information. The spider processes data to clean and format it, handling pagination and various data extraction tasks to compile a comprehensive dataset for export.
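For orientation, each exported row is a flat record keyed by the bold field labels scraped from a company detail page, plus a few derived columns (ID, source URL, aliases). The snippet below is a hypothetical illustration of one cleaned row, with invented values and only a subset of the columns the code can produce:

example_row = {
    'ID': 1,
    'source_url': 'https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=FTC',
    'PDP URL': 'https://violationtracker.goodjobsfirst.org/...',   # detail-page URL
    'Company': 'Example Company Inc',                 # punctuation-cleaned name
    'Alias 1': 'example co',                          # pulled from f/k/a, d/b/a, aka or parentheses
    'Date': '2023-01-15',                             # reformatted to YYYY-MM-DD
    'Current Parent Company name': 'Example Parent',
}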


import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.cmdline import execute

# Spider class for scraping violation data from Good Jobs First (OFAC-related entries)
class Act113Spider(scrapy.Spider):
    name = "EEOC"

    # Preset cookies required for session management and access
    cookies = {
        '_fbp': 'fb.1.1747201269861.315460754820040010',
        '_gid': 'GA1.2.38545643.1747749932',
        '_ga_Q4G4E8KT5J': 'GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0',
        'PHPSESSID': '454b72dc00e908524384cdb37fddb976',
        '_gat_UA-21812781-2': '1',
        '_ga_9VW1HCFL7C': 'GS2.1.s1747997430$o42$g1$t1748001505$j0$l0$h0',
        '_ga': 'GA1.1.1470230669.1747201268',
    }

    headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,hi;q=0.7',
        'cache-control': 'no-cache',
        'pragma': 'no-cache',
        'priority': 'u=0, i',
        'referer': 'https://www.google.com/',
        'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
        'sec-ch-ua-mobile': '?0',
        'sec-ch-ua-platform': '"Windows"',
        'sec-fetch-dest': 'document',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-site': 'cross-site',
        'sec-fetch-user': '?1',
        'upgrade-insecure-requests': '1',
        'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
        # 'cookie': '_fbp=fb.1.1747201269861.315460754820040010; _gid=GA1.2.38545643.1747749932; _ga_Q4G4E8KT5J=GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0; PHPSESSID=454b72dc00e908524384cdb37fddb976; _gat_UA-21812781-2=1; _ga_9VW1HCFL7C=GS2.1.s1747997430$o42$g1$t1748001505$j0$l0$h0; _ga=GA1.1.1470230669.1747201268',
    }

    custom_settings = {
        "DOWNLOAD_HANDLERS": {
            "http": "scrapy_impersonate.ImpersonateDownloadHandler",
            "https": "scrapy_impersonate.ImpersonateDownloadHandler",
        },
        "TWISTED_REACTOR": "twisted.internet.asyncioreactor.AsyncioSelectorReactor",
    }
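    # These settings follow the standard scrapy-impersonate setup: HTTP and HTTPS
    # downloads are routed through its ImpersonateDownloadHandler (built on curl_cffi),
    # which mimics real-browser TLS fingerprints, and the asyncio Twisted reactor it
    # requires is selected. The concrete browser profile is chosen per request via
    # meta['impersonate'] in start_requests() and parse_listing() below.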

    def __init__(self):
        # Store all parsed data in lists for final export
        self.data = []
        self.raw_data = []
        self.data_cleaned = []
        self.data_uncleaned = []
        self.excel = os.path.join(os.getcwd(), "exports")  # or specify your own path

        # Create the directory if it doesn't exist
        os.makedirs(self.excel, exist_ok=True)
        self.url = "https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=FTC"

    def start_requests(self):
        browsers = [
            "chrome110",
            "edge99",
            "safari15_5",
        ]
        meta = {}
        meta['impersonate'] = random.choice(browsers)
        meta['source_url'] = self.url

        # Send initial request with custom headers and cookies
        yield scrapy.Request(url=self.url, meta=meta, headers=self.headers,
                             cookies=self.cookies, callback=self.parse_listing)

    def parse_listing(self, response):
        company_links = response.xpath('//table[2]/tbody/tr/td[1]/a/@href').getall()
        source_url = response.meta.get('source_url')

        for link in company_links:
            browsers = [
                "chrome110",
                "edge99",
                "safari15_5",
            ]
            meta = {}
            meta['impersonate'] = random.choice(browsers)
            meta['source_url'] = source_url

            yield response.follow(link, meta=meta, headers=self.headers,
                                  cookies=self.cookies, callback=self.parse_details)
            # break  # Use during testing to limit pagination

        # Handle pagination by identifying total pages and iterating through all
        next_page = response.xpath('//a[contains(text(),">>")]/@href').get()
        if next_page:
            total_page = int(next_page.split('page=')[-1])
            for page in range(0, total_page + 1):
                paginated_url = f"{self.url}&page={page}"
                browsers = [
                    "chrome110",
                    "edge99",
                    "safari15_5",
                ]
                meta = {}
                meta['impersonate'] = random.choice(browsers)
                meta['source_url'] = paginated_url

                yield scrapy.Request(url=paginated_url, meta=meta, headers=self.headers,
                                     cookies=self.cookies, callback=self.parse_listing)
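        # Illustrative note: if the ">>" (last page) link ends in "page=12", total_page
        # is 12 and listing pages 0 through 12 are requested as f"{self.url}&page={page}";
        # Scrapy's default duplicate filter keeps the re-yielded listing requests from
        # being crawled more than once.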

    def parse_details(self, response):
        # Known countries list
        self.known_countries = ['india', 'usa', 'united states', 'uk', 'canada',
                                'australia', 'bahamas', 'singapore', 'germany',
                                'france', 'hong kong', 'north america', 'america']
        source_url = response.meta.get('source_url')

        # Extract all key-value details from the company profile page
        item = dict()
        raw_item = dict()  # For storing completely raw data

        item["PDP URL"] = response.url
        raw_item["PDP URL"] = response.url

        for path in response.xpath('//div[@id="contentResult"]//b'):
            item['source_url'] = source_url

            data_skip = False
            key = path.xpath('./text()').get(default='NA').strip()
            raw_value = ''.join(
                path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()
            ).split('\n:')[0].replace(':\xa0', '').strip()
            value = raw_value  # Initialize cleaned value with raw value

            # Store raw value first
            raw_item[key] = raw_value

            # Format date fields into YYYY-MM-DD format (only for cleaned data)
            if 'Date' in key:
                try:
                    date_obj = datetime.strptime(value, "%B %d, %Y")
                    value = date_obj.strftime("%Y-%m-%d")
                except Exception:
                    pass

            # Remove "Note:" prefix and clean whitespace if the key is "Notes"
            if key == 'Notes':
                value = re.sub(r'^Note:\s*', '', value).strip()

            # Special handling for Company field
            if key == 'Company':
                # Strip off "d/b/a", "D.B.A.", etc.
                # e.g. "d/b/a GR-Duratech Texas, Inc."
                # Cleanup value: remove leading "d/b/a" or similar
                value = re.sub(r'(?i)\b(d/?b/?a)\b\s*', '', value).strip()

                parens = re.findall(r'\(([^()]+)\)', value)

                aliases = []
                country = None

                # 1. Check inside () for country or special cases (Client, HK)
                for val in parens:
                    cleaned_val = val.strip().lower()
                    # Remove phrases like "now known as", "formerly", etc.
                    cleaned_val = re.sub(r'^(now known as|formerly known as|also known as)\s+', '',
                                         cleaned_val, flags=re.IGNORECASE)

                    # Check if it's a known country
                    if cleaned_val in self.known_countries:
                        # Preserve country in parentheses (no removal, no alias)
                        value = re.sub(r'\(' + re.escape(val) + r'\)', f' ({val})', value)
                        country = cleaned_val
                    else:
                        # Only add as alias if the parentheses is part of the main company name
                        # (not added during the split operation)
                        if f'({val})' in value:
                            aliases.append(cleaned_val)

                # 2. Remove () content only if not country
                for val in parens:
                    if val.lower() not in self.known_countries:
                        value = re.sub(r'\(' + re.escape(val) + r'\)', ' ', value)

                # Extract aliases from known keywords
                alias_keywords_regex = r'\b(?:f/k/a|fka|d/b/a/?|a/k/a|aka|dba)\b'
                parts = re.split(alias_keywords_regex, value, flags=re.IGNORECASE)
                parts = [p.strip().strip(' ,;') for p in parts if p.strip()]

                if parts:
                    company_name = parts[0]
                    extracted_aliases = parts[1:]
                    cleaned_aliases = [
                        re.sub(r'^[/\s]+', '', a.lower().strip())
                        for a in extracted_aliases
                        if len(a.strip()) >= 3 and a.lower().strip() not in ["hk"]
                    ]
                    aliases.extend(cleaned_aliases)
                else:
                    company_name = value.strip()

                # 4. Clean up company name (add space around punctuation and remove unwanted punctuation)
                company_name = self.clean_text_punct(company_name)  # Apply the custom clean function

                item[key] = company_name

                # 5. Assign Alias 1, Alias 2 (in order)
                for idx, alias_val in enumerate(aliases, start=1):
                    # Remove leading slashes/spaces
                    alias_val = re.sub(r'^[/\s]+', '', alias_val)

                    # Remove alias keywords (fka/dba/etc.) from start
                    remove_words = ['fka', 'f/k/a', 'dba', 'd/b/a', 'aka', 'a/k/a']
                    pattern = r'^(?:' + '|'.join(re.escape(word) for word in remove_words) + r')\s+'
                    alias_val = re.sub(pattern, '', alias_val, flags=re.IGNORECASE)

                    # Split on 'and' and process each part
                    split_aliases = re.split(r'\s+and\s+', alias_val, flags=re.IGNORECASE)

                    for i, part in enumerate(split_aliases, start=idx):
                        part = part.strip()
                        if part:  # Only add non-empty parts
                            # Handle cases where 'and' was merged (like 'andmercy' -> 'mercy')
                            part = re.sub(r'^and\s*', '', part, flags=re.IGNORECASE)
                            item[f'Alias {i}'] = part

                    # Update idx to avoid overwriting in next iteration
                    idx += len(split_aliases) - 1

                continue

            # Handle Current Parent Company (similar to Company but without alias/country extraction)
            if key == 'Current Parent Company':
                # Clean up punctuation (keeping - ' & /)
                text_no_punct = value.translate(str.maketrans('', '',
                                                              string.punctuation.replace('-', '')
                                                              .replace("'", '')
                                                              .replace('&', '')
                                                              .replace('/', '')))
                value = re.sub(r'\s+', ' ', text_no_punct).strip()
                item[key + " name"] = value
                item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                raw_item[key + " name"] = raw_value
                raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                continue

            # Handle special keys (Mega-Scandal, Source of Data)
            if 'Mega-Scandal' in key or 'Source of Data' in key:
                if 'Mega-Scandal' in key:
                    item[key + " name"] = value
                    item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key + " name"] = raw_value
                    raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
                else:
                    value = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key] = value
                    item[key] = value
                data_skip = True

            # Handle link-based values (e.g., "click here")
            if not data_skip:
                if "(click here)" in value.lower():
                    value = path.xpath('./following-sibling::a/@href').get(default='')
                    raw_item[key] = value
                item[key] = value  # All other fields keep original punctuation

        # Additional handling for specific URL columns
        url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
        for url_col in url_columns:
            if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
                url_value = response.xpath(
                    f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href'
                ).get(default='')
                item[url_col] = url_value
                raw_item[url_col] = url_value

        split_items = self.split_company_names(item)
        for new_item in split_items:
            self.data.append(new_item)
        self.raw_data.append(raw_item)

    def split_company_names(self, item):
        split_keywords = [
            # Common US suffixes
            'inc', 'inc ', 'incorporated', 'corp', 'corp ', 'corporation',
            'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities', 'center',
            'co', 'company', 'group'
        ]
        company_name = item['Company']

        # First extract all parentheses content and their positions
        paren_matches = list(re.finditer(r'\(([^()]+)\)', company_name))

        # Split on 'and' with surrounding whitespace
        parts = re.split(r'\s+(and)\s+', company_name, flags=re.IGNORECASE)

        # If no splitting occurred, return the original item in a list
        if len(parts) == 1:
            return [item]

        result_items = []
        current = parts[0].strip()
        i = 1

        while i < len(parts):
            sep = parts[i]
            next_part = parts[i + 1].strip()

            # Check if current ends with a keyword
            keyword_match = False
            for kw in split_keywords:
                if re.search(rf'\b{kw}\.?$', current, re.IGNORECASE):
                    keyword_match = True
                    break

            if keyword_match:
                # Create a new item with the current company name
                new_item = item.copy()
                # Remove any aliases from the item copy since we'll reprocess them
                for k in list(new_item.keys()):
                    if k.startswith('Alias '):
                        del new_item[k]

                new_item['Company'] = current
                result_items.append(new_item)
                current = next_part
            else:
                current += f" {sep} {next_part}"

            i += 2

        # Add the last remaining part
        new_item = item.copy()

        # Remove any aliases from the item copy since we'll reprocess them
        for k in list(new_item.keys()):
            if k.startswith('Alias '):
                del new_item[k]

        new_item['Company'] = current
        result_items.append(new_item)

        # Now reprocess each item to extract aliases specific to that company
        final_items = []
        for split_item in result_items:
            # Make a copy to avoid modifying the original
            processed_item = split_item.copy()
            company_name = processed_item['Company']

            # Extract and process aliases just for this company
            parens = re.findall(r'\(([^()]+)\)', company_name)
            aliases = []
            country = None

            # Check inside () for country or aliases
            for val in parens:
                cleaned_val = val.strip().lower()
                if cleaned_val.upper() != "N.A.":
                    # Remove phrases like "now known as", "formerly", etc.
                    cleaned_val = re.sub(r'^(now known as|formerly known as|also known as)\s+', '',
                                         cleaned_val, flags=re.IGNORECASE)

                    # Check if it's a known country
                    if cleaned_val in self.known_countries:
                        country = cleaned_val
                    else:
                        aliases.append(cleaned_val)

            # Remove () content from company name
            processed_item['Company'] = re.sub(r'\([^()]+\)', '', company_name).strip()

            # Assign aliases if any
            for idx, alias_val in enumerate(aliases[:2], start=1):
                processed_item[f'Alias {idx}'] = alias_val
            final_items.append(processed_item)

        return final_items
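    # Illustrative traces (hypothetical inputs, not taken from the site): the split only
    # happens when the text before " and " already ends in a corporate suffix, e.g.
    #   {'Company': 'Acme Corp and Beta LLC'}  ->  two items: 'Acme Corp', 'Beta LLC'
    #   {'Company': 'Johnson and Johnson'}     ->  one item, left unsplit (no suffix before 'and')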

    # def clean_text_punct(text):
    #     """Replace punctuation with space (except - ' & /), then normalize whitespace"""
    #     if not isinstance(text, str) or text.strip() == '':
    #         return text
    #     keep_chars = {"&", "-", "/", "'"}
    #     cleaned = ''.join(char if char not in string.punctuation or char in keep_chars else ' ' for char in text)
    #     return re.sub(r'\s+', ' ', cleaned).strip()

    def extract_name_alias(self, entry):
        """
        Extract country from a given company name string.
        Returns a tuple: (name, country)
        """
        try:
            if not isinstance(entry, str):
                return entry, None

            entry = entry.strip()
            country = None

            # Extract and remove country from parentheses
            parens = re.findall(r'\(([^()]+)\)', entry)
            for val in parens:
                if val.strip() in self.known_countries:
                    country = val.strip()
                    entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()

            return entry, country

        except Exception:
            return entry, None
    # Remove punctuation except & - / '
    def clean_text_punct(self, text):
        """Clean punctuation and whitespace for Company-related fields"""
        if not isinstance(text, str) or text.strip() == '':
            return text
        keep_chars = {"&", "-", "/", "'"}
        cleaned = []
        text = text.replace(".", " ")

        # Process the text and preserve country names inside parentheses
        parens = re.findall(r'\(([^()]+)\)', text)  # Extract all content inside parentheses
        if "HK" not in text:
            text = text.replace("(", "abcdef")
            text = text.replace(")", "fedcba")

        for char in text:
            if char in string.punctuation and char not in keep_chars:
                cleaned.append(' ')
            else:
                cleaned.append(char)
        cleaned = ''.join(cleaned)
        cleaned = cleaned.replace("abcdef", "(")
        cleaned = cleaned.replace("fedcba", ")")

        return re.sub(r'\s+', ' ', cleaned).strip()
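    # Illustrative traces (hypothetical inputs):
    #   clean_text_punct("Kraft Foods, Inc. (USA)")  ->  "Kraft Foods Inc (USA)"
    #     (parentheses are shielded with placeholder tokens, then restored)
    #   clean_text_punct("Acme (HK) Ltd.")           ->  "Acme HK Ltd"
    #     (when "HK" appears, parentheses are not shielded and are stripped out)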

    def normalize_whitespace(self, text):
        """Normalize whitespace (used for non-Company fields)"""
        if not isinstance(text, str) or text.strip() == '':
            return text
        return re.sub(r'\s+', ' ', text).strip()

    def close(self, reason):
        # Create DataFrames
        df_cleaned = pd.DataFrame(self.data)
        df_uncleaned = pd.DataFrame(self.raw_data)  # Use completely raw data

        # Add an ID column and move ID / source_url to the front of the cleaned DataFrame
        df_cleaned.insert(0, 'ID', range(1, 1 + len(df_cleaned)))
        df_cleaned = df_cleaned[['ID', 'source_url'] +
                                [col for col in df_cleaned.columns if col not in ['ID', 'source_url']]]

        # Only add ID to uncleaned data (without Source URL)
        df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
        # df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company url', 'Source of Data'])

        df_cleaned = df_cleaned.rename(columns={
            'Current Parent Company name': 'Current Parent Company'
        })

        # Save uncleaned file (completely raw data)
        timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
        filename_uncleaned = f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
        filepath_uncleaned = os.path.join(self.excel, filename_uncleaned)

        with pd.ExcelWriter(filepath_uncleaned, engine='xlsxwriter',
                            engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
            df_uncleaned.fillna("").to_excel(writer, index=False)

        # Process cleaned data
        protected_columns = ['ID', 'Source URL', 'PDP URL', 'Penalty', 'Date',
                             'Current Parent Company url', 'Archived Source',
                             'Mega-Scandal url']

        # 1. Replace N/A with blank except in protected columns
        for col in df_cleaned.columns:
            if col not in protected_columns:
                df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')

        # Columns that should have punctuation removed
        columns_to_clean = ['Company', 'Current Parent Company']

        # Apply per-column logic
        for col in df_cleaned.columns:
            if col in protected_columns:
                continue
            if df_cleaned[col].dtype == object:
                if col in columns_to_clean:
                    df_cleaned[col] = df_cleaned[col].apply(self.clean_text_punct)
                else:
                    df_cleaned[col] = df_cleaned[col].apply(self.normalize_whitespace)

        # Save cleaned file
        filename_cleaned = f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
        filepath_cleaned = os.path.join(self.excel, filename_cleaned)

        with pd.ExcelWriter(filepath_cleaned, engine='xlsxwriter',
                            engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
            df_cleaned.to_excel(writer, index=False)


if __name__ == '__main__':
    execute("scrapy crawl EEOC".split())
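A note on running the listing: it assumes scrapy, scrapy-impersonate, pandas and xlsxwriter are installed. The execute("scrapy crawl EEOC".split()) entry point works when this file sits inside a Scrapy project where the spider is discoverable; as a standalone file it can also be run with scrapy runspider.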
