import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.cmdline import execute
# Spider class for scraping violation data from Good Jobs First's Violation Tracker (FTC agency entries)
class Act113Spider(scrapy.Spider):
name = "EEOC"
# Preset cookies required for session management and access
cookies = {
'_fbp': 'fb.1.1747201269861.315460754820040010',
'_gid': 'GA1.2.38545643.1747749932',
'_ga_Q4G4E8KT5J': 'GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0',
'PHPSESSID': '454b72dc00e908524384cdb37fddb976',
'_gat_UA-21812781-2': '1',
'_ga_9VW1HCFL7C': 'GS2.1.s1747997430$o42$g1$t1748001505$j0$l0$h0',
'_ga': 'GA1.1.1470230669.1747201268',
}
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,hi;q=0.7',
'cache-control': 'no-cache',
'pragma': 'no-cache',
'priority': 'u=0, i',
'referer': 'https://www.google.com/',
'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'cross-site',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
# 'cookie': '_fbp=fb.1.1747201269861.315460754820040010; _gid=GA1.2.38545643.1747749932; _ga_Q4G4E8KT5J=GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0; PHPSESSID=454b72dc00e908524384cdb37fddb976; _gat_UA-21812781-2=1; _ga_9VW1HCFL7C=GS2.1.s1747997430$o42$g1$t1748001505$j0$l0$h0; _ga=GA1.1.1470230669.1747201268',
}
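# Note: the cookie and header values above were captured from a real browser session and will
# likely expire; scrapy-impersonate (configured below) handles the TLS/browser fingerprint, but
# session values such as PHPSESSID may need refreshing before a new crawl.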
custom_settings = {
"DOWNLOAD_HANDLERS": {
"http": "scrapy_impersonate.ImpersonateDownloadHandler",
"https": "scrapy_impersonate.ImpersonateDownloadHandler",
},
"TWISTED_REACTOR":
"twisted.internet.asyncioreactor.AsyncioSelectorReactor",
}
def __init__(self):
# Store all parsed data in a list for final export
self.data = []
self.raw_data=[]
self.data_cleaned = []
self.data_uncleaned = []
self.excel = os.path.join(os.getcwd(), "exports")  # or specify your own path
# Create the directory if it doesn't exist
os.makedirs(self.excel, exist_ok=True)
self.url = "https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=FTC"
def start_requests(self):
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
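# The 'impersonate' meta key is read by scrapy-impersonate; each identifier maps to a
# curl_cffi browser fingerprint (chrome110, edge99 and safari15_5 are among the supported targets).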
meta = {}
meta['impersonate'] = random.choice(browsers)
meta['source_url'] = self.url
# Send initial request with custom headers and cookies
yield scrapy.Request(url=self.url, meta=meta, headers=self.headers, cookies=self.cookies, callback=self.parse_listing)
def parse_listing(self, response):
company_links = response.xpath('//table[2]/tbody/tr/td[1]/a/@href').getall()
source_url = response.meta.get('source_url')
for link in company_links:
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
meta = {}
meta['impersonate'] = random.choice(browsers)
meta['source_url']=source_url
yield response.follow(link, meta=meta, headers=self.headers, cookies=self.cookies, callback=self.parse_details)
# break # Use during testing to limit pagination
# Handle pagination by identifying total pages and iterating through all
next_page = response.xpath('//a[contains(text(),">>")]/@href').get()
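# The code assumes the '>>' link points at the last results page, so its 'page' query
# parameter doubles as the total page count (e.g. a href ending in '&page=87' implies pages 0-87).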
if next_page:
total_page = int(next_page.split('page=')[-1])
for page in range(0, total_page+1):
paginated_url = f"{self.url}&page={page}"
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
meta = {}
meta['impersonate'] = random.choice(browsers)
meta['source_url']=paginated_url
yield scrapy.Request(url=paginated_url, meta=meta, headers=self.headers, cookies=self.cookies, callback=self.parse_listing)
def parse_details(self, response):
# Known countries list
self.known_countries = ['india', 'usa', 'united states', 'uk', 'canada',
'australia', 'bahamas', 'singapore', 'germany', 'france', 'hong kong',
'north america', 'america']
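# Country matching below is a lowercase exact match against this list, so parenthesized
# values such as '(Hong Kong)' or '(USA)' are kept as countries rather than treated as aliases.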
source_url = response.meta.get('source_url')
# Extract all key-value details from the company profile page
item = dict()
raw_item = dict() # For storing completely raw data
item["PDP URL"] = response.url
raw_item["PDP URL"] = response.url
for path in response.xpath('//div[@id="contentResult"]//b'):
item['source_url']=source_url
data_skip = False
key = path.xpath('./text()').get(default='NA').strip()
raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
value = raw_value # Initialize cleaned value with raw value
# Store raw value first
raw_item[key] = raw_value
# Format date fields into YYYY-MM-DD format (only for cleaned data)
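# e.g. a raw value like "January 5, 2021" parses with "%B %d, %Y" and is written back as "2021-01-05"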
if 'Date' in key:
try:
date_obj = datetime.strptime(value, "%B %d, %Y")
value = date_obj.strftime("%Y-%m-%d")
except Exception:
pass
# Remove "Note:" prefix and clean whitespace if key is "Note"
if key == 'Notes':
value = re.sub(r'^Note:\s*', '', value).strip()
# Special handling for Company field
if key == 'Company':
# Strip off "d/b/a", "D.B.A.", etc.
# d/b/a GR-Duratech Texas, Inc.
# Cleanup value: remove leading "d/b/a" or similar
value = re.sub(r'(?i)\b(d/?b/?a)\b\s*', '', value).strip()
parens = re.findall(r'\(([^()]+)\)', value)
aliases = []
country = None
# 1. Check inside () for country or special cases (Client, HK)
for val in parens:
cleaned_val = val.strip().lower()
# Remove phrases like "now known as", "formerly", etc.
cleaned_val = re.sub(r'^(now known as|formerly known as|also known as)\s+', '', cleaned_val, flags=re.IGNORECASE)
# Check if it's a known country
if cleaned_val in self.known_countries:
# Preserve country in parentheses (no removal, no alias)
value = re.sub(r'\(' + re.escape(val) + r'\)', f' ({val})', value)
country = cleaned_val
else:
# Only add as an alias if the parenthesized text is part of the main
# company name (not added during the split operation)
if f'({val})' in value:
aliases.append(cleaned_val)
# 2. Remove () content only if it is not a country
for val in parens:
if val.lower() not in self.known_countries:
value = re.sub(r'\(' + re.escape(val) + r'\)', ' ', value)
# Extract aliases from known keywords
alias_keywords_regex = r'\b(?:f/k/a|fka|d/b/a/?|a/k/a|aka|dba)\b'
parts = re.split(alias_keywords_regex, value, flags=re.IGNORECASE)
parts = [p.strip().strip(' ,;') for p in parts if p.strip()]
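# e.g. "Acme Corp f/k/a Beta Holdings LLC" splits into ["Acme Corp", "Beta Holdings LLC"]:
# the first part is kept as the company name, the rest become alias candidates.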
if parts:
company_name = parts[0]
extracted_aliases = parts[1:]
cleaned_aliases = [
re.sub(r'^[/\s]+', '', a.lower().strip())
for a in extracted_aliases
if len(a.strip()) >= 3 and a.lower().strip() not in ["hk"]
]
aliases.extend(cleaned_aliases)
else:
company_name = value.strip()
# 4. Clean up the company name (replace unwanted punctuation with spaces, normalize whitespace)
company_name = self.clean_text_punct(company_name)  # Apply the custom clean function
item[key] = company_name
# 5. Assign Alias 1, Alias 2 (in order)
for idx, alias_val in enumerate(aliases, start=1):
# Remove leading slashes/spaces
alias_val = re.sub(r'^[/\s]+', '', alias_val)
# Remove alias keywords (fka/dba/etc.) from start
remove_words = ['fka', 'f/k/a', 'dba', 'd/b/a', 'aka', 'a/k/a']
pattern = r'^(?:' + '|'.join(re.escape(word) for word in remove_words) + r')\s+'
alias_val = re.sub(pattern, '', alias_val, flags=re.IGNORECASE)
# Split on 'and' and process each part
split_aliases = re.split(r'\s+and\s+', alias_val, flags=re.IGNORECASE)
for i, part in enumerate(split_aliases, start=idx):
part = part.strip()
if part: # Only add non-empty parts
# Handle cases where 'and' was merged into the name (e.g. 'andmercy' -> 'mercy')
part = re.sub(r'^and\s*', '', part, flags=re.IGNORECASE)
item[f'Alias {i}'] = part
# Update idx to avoid overwriting in next iteration
idx += len(split_aliases) - 1
continue
# Handle Current Parent Company (similar to Company, but without alias/country extraction)
if key == 'Current Parent Company':
# Clean up punctuation (keeping - ' & /)
text_no_punct = value.translate(str.maketrans('', '',
string.punctuation.replace('-', '')
.replace("'", '')
.replace('&', '')
.replace('/', '')))
value = re.sub(r'\s+', ' ', text_no_punct).strip()
item[key + " name"] = value
item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key + " name"] = raw_value
raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
continue
# Handle special keys (Mega-Scandal, Source of Data)
if 'Mega-Scandal' in key or 'Source of Data' in key:
if 'Mega-Scandal' in key:
item[key + " name"] = value
item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key + " name"] = raw_value
raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
else:
value = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key] = value
item[key] = value
data_skip = True
# Handle link-based values (e.g., "click here")
if not data_skip:
if "(click here)" in value.lower():
value = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key] = value
item[key] = value # All other fields keep original punctuation
# Additional handling for specific URL columns
url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
for url_col in url_columns:
if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
url_value = response.xpath(
f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
item[url_col] = url_value
raw_item[url_col] = url_value
split_items = self.split_company_names(item)
for new_item in split_items:
self.data.append(new_item)
self.raw_data.append(raw_item)
def split_company_names(self, item):
split_keywords = [
# Common US suffixes
'inc', 'incorporated', 'corp', 'corporation',
'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities', 'center',
'co', 'company', 'group'
]
company_name = item['Company']
# First extract all parentheses content and their positions
paren_matches = list(re.finditer(r'\(([^()]+)\)', company_name))
# Split on ' and ' (with surrounding whitespace), keeping the separator via the capture group
parts = re.split(r'\s+(and)\s+', company_name, flags=re.IGNORECASE)
# If no splitting occurred, return original item in a list
if len(parts) == 1:
return [item]
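# An item is split into multiple records only when the text to the left of ' and ' ends with a
# corporate suffix, e.g. "Alpha Builders Inc and Beta Homes LLC" becomes two items, while
# "Research and Development Corp" stays as one.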
result_items = []
current = parts[0].strip()
i = 1
while i < len(parts):
sep = parts[i]
next_part = parts[i + 1].strip()
# Check if current ends with a keyword
keyword_match = False
for kw in split_keywords:
if re.search(rf'\b{kw}\.?$', current, re.IGNORECASE):
keyword_match = True
break
if keyword_match:
# Create a new item with the current company name
new_item = item.copy()
# Remove any aliases from the item copy since we'll reprocess them
for k in list(new_item.keys()):
if k.startswith('Alias '):
del new_item[k]
new_item['Company'] = current
result_items.append(new_item)
current = next_part
else:
current += f" {sep} {next_part}"
i += 2
# Add the last remaining part
new_item = item.copy()
# Remove any aliases from the item copy since we'll reprocess them
for k in list(new_item.keys()):
if k.startswith('Alias '):
del new_item[k]
new_item['Company'] = current
result_items.append(new_item)
# Now reprocess each item to extract aliases specific to that company
final_items = []
for split_item in result_items:
# Make a copy to avoid modifying the original
processed_item = split_item.copy()
company_name = processed_item['Company']
# Extract and process aliases just for this company
parens = re.findall(r'\(([^()]+)\)', company_name)
aliases = []
country = None
# Check inside () for country or aliases
for val in parens:
cleaned_val = val.strip().lower()
if cleaned_val.upper() != "N.A.":
# Remove phrases like "now known as", "formerly", etc.
cleaned_val = re.sub(r'^(now known as|formerly known as|also known as)\s+', '', cleaned_val, flags=re.IGNORECASE)
# Check if it's a known country
if cleaned_val in self.known_countries:
country = cleaned_val
else:
aliases.append(cleaned_val)
# Remove () content from company name
processed_item['Company'] = re.sub(r'\([^()]+\)', '', company_name).strip()
# Assign aliases if any
for idx, alias_val in enumerate(aliases[:2], start=1):
processed_item[f'Alias {idx}'] = alias_val
final_items.append(processed_item)
return final_items
# def clean_text_punct(text):
#     """Replace punctuation with space (except - ' & /), then normalize whitespace"""
#     if not isinstance(text, str) or text.strip() == '':
#         return text
#     keep_chars = {"&", "-", "/", "'"}
#     cleaned = ''.join(char if char not in string.punctuation or char in keep_chars else ' ' for char in text)
#     return re.sub(r'\s+', ' ', cleaned).strip()
def extract_name_alias(self, entry):
"""
Extract country from a given company name string.
Returns a tuple: (name, country)
"""
try:
if not isinstance(entry, str):
return entry, None
entry = entry.strip()
country = None
# Extract and remove country from parentheses
parens = re.findall(r'\(([^()]+)\)', entry)
for val in parens:
if val.strip().lower() in self.known_countries:
country = val.strip()
entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()
return entry, country
except Exception:
return entry, None
# 2. Remove punctuation except &-/'
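# Illustrative example: clean_text_punct("Acme, Inc. (USA)") would yield "Acme Inc (USA)"
# (periods and commas become spaces, parentheses are preserved, whitespace is collapsed).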
def clean_text_punct(self,text):
"""Clean punctuation and whitespace for Company-related fields"""
if not isinstance(text, str) or text.strip() == '':
return text
keep_chars = {"&", "-", "/", "'"}
cleaned = []
text = text.replace(".", " ")
# Process the text and preserve country names inside parentheses
parens = re.findall(r'\(([^()]+)\)', text)  # Extract all content inside parentheses
if "HK" not in text:
text = text.replace("(", "abcdef")
text = text.replace(")", "fedcba")
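# Parentheses were swapped for the sentinel tokens above so they survive the punctuation
# pass below and are restored afterwards; the swap is skipped when "HK" appears in the text,
# so those parentheses are stripped with the rest of the punctuation.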
for char in text:
if char in string.punctuation and char not in keep_chars:
cleaned.append(' ')
else:
cleaned.append(char)
cleaned = ''.join(cleaned)
cleaned = cleaned.replace("abcdef", "(")
cleaned = cleaned.replace("fedcba", ")")
return re.sub(r'\s+', ' ', cleaned).strip()
def normalize_whitespace(self,text):
"""Normalize whitespace (used for non-Company fields)"""
if not isinstance(text, str) or text.strip() == '':
return text
return re.sub(r'\s+', ' ', text).strip()
def close(self, reason):
# Create DataFrames
df_cleaned = pd.DataFrame(self.data)
df_uncleaned = pd.DataFrame(self.raw_data) # Use completely raw data
# Add an ID column and move source_url to the front of the cleaned DataFrame
df_cleaned.insert(0, 'ID', range(1, 1 + len(df_cleaned)))
df_cleaned = df_cleaned[['ID', 'source_url'] + [col for col in df_cleaned.columns if col not in ['ID', 'source_url']]]
# Only add ID to uncleaned data (without Source URL)
df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
# df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company url', 'Source of Data'])
df_cleaned = df_cleaned.rename(columns={'Current Parent Company name': 'Current Parent Company'})
# Save uncleaned file (completely raw data)
timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
filename_uncleaned = f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
filepath_uncleaned = os.path.join(self.excel, filename_uncleaned)
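# xlsxwriter's 'strings_to_numbers' option converts numeric-looking strings (e.g. penalty
# amounts) into real numbers in the exported workbook.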
with pd.ExcelWriter(filepath_uncleaned, engine='xlsxwriter', engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
df_uncleaned.fillna("").to_excel(writer, index=False)
# Process cleaned data
protected_columns = ['ID', 'source_url', 'PDP URL', 'Penalty', 'Date',
'Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
# 1. Replace N/A with blank except in protected columns
for col in df_cleaned.columns:
if col not in protected_columns:
df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')
columns_to_clean = ['Company', 'Current Parent Company']
# Apply per-column logic
for col in df_cleaned.columns:
if col in protected_columns:
continue
if df_cleaned[col].dtype == object:
if col in columns_to_clean:
df_cleaned[col] = df_cleaned[col].apply(self.clean_text_punct)
else:
df_cleaned[col] = df_cleaned[col].apply(self.normalize_whitespace)
# Save cleaned file
filename_cleaned = f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
filepath_cleaned = os.path.join(self.excel, filename_cleaned)
with pd.ExcelWriter(filepath_cleaned, engine='xlsxwriter', engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
df_cleaned.to_excel(writer, index=False)
if __name__ == '__main__':
execute("scrapy crawl EEOC".split())