import random
import re
import os
import string
import scrapy
import pandas as pd
from datetime import datetime
from scrapy.cmdline import execute
# Spider class for scraping violation data from Good Jobs First's Violation Tracker (FTC agency entries)
class Act113Spider(scrapy.Spider):
name = "EEOC"
# Preset cookies required for session management and access
cookies = {
'_fbp': 'fb.1.1747201269861.315460754820040010',
'_gid': 'GA1.2.38545643.1747749932',
'_ga_Q4G4E8KT5J': 'GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0',
'PHPSESSID': '454b72dc00e908524384cdb37fddb976',
'_gat_UA-21812781-2': '1',
'_ga_9VW1HCFL7C': 'GS2.1.s1747997430$o42$g1$t1748001505$j0$l0$h0',
'_ga': 'GA1.1.1470230669.1747201268',
}
headers = {
'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,hi;q=0.7',
'cache-control': 'no-cache',
'pragma': 'no-cache',
'priority': 'u=0, i',
'referer': 'https://www.google.com/',
'sec-ch-ua': '"Chromium";v="136", "Google Chrome";v="136", "Not.A/Brand";v="99"',
'sec-ch-ua-mobile': '?0',
'sec-ch-ua-platform': '"Windows"',
'sec-fetch-dest': 'document',
'sec-fetch-mode': 'navigate',
'sec-fetch-site': 'cross-site',
'sec-fetch-user': '?1',
'upgrade-insecure-requests': '1',
'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/136.0.0.0 Safari/537.36',
# 'cookie': '_fbp=fb.1.1747201269861.315460754820040010; _gid=GA1.2.38545643.1747749932; _ga_Q4G4E8KT5J=GS2.1.s1747827357$o2$g1$t1747827638$j0$l0$h0; PHPSESSID=454b72dc00e908524384cdb37fddb976; _gat_UA-21812781-2=1; _ga_9VW1HCFL7C=GS2.1.s1747997430$o42$g1$t1748001505$j0$l0$h0; _ga=GA1.1.1470230669.1747201268',
}
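# Note: the cookie and header values above were captured from a real browser session and will
# likely expire; scrapy-impersonate (configured below) handles the TLS/browser fingerprint, but
# session values such as PHPSESSID may need refreshing before a new crawl.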
custom_settings = {
"DOWNLOAD_HANDLERS": {
"http": "scrapy_impersonate.ImpersonateDownloadHandler",
"https": "scrapy_impersonate.ImpersonateDownloadHandler",
},
"TWISTED_REACTOR":
"twisted.internet.asyncioreactor.AsyncioSelectorReactor",
}
def __init__(self):
# Store all parsed data in a list for final export
self.data = []
self.raw_data=[]
self.data_cleaned = []
self.data_uncleaned = []
self.excel = os.path.join(os.getcwd(), "exports")  # or specify your own path
# Create the directory if it doesn't exist
os.makedirs(self.excel, exist_ok=True)
self.url = "https://violationtracker.goodjobsfirst.org/?company_op=starts&company=&offense_group=&agency_code=FTC"
def start_requests(self):
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
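# The 'impersonate' meta key is read by scrapy-impersonate; each identifier maps to a
# curl_cffi browser fingerprint (chrome110, edge99 and safari15_5 are among the supported targets).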
meta = {}
meta['impersonate'] = random.choice(browsers)
meta['source_url'] = self.url
# Send initial request with custom headers and cookies
yield scrapy.Request(url=self.url, meta=meta, headers=self.headers, cookies=self.cookies, callback=self.parse_listing)
def parse_listing(self, response):
company_links = response.xpath('//table[2]/tbody/tr/td[1]/a/@href').getall()
source_url = response.meta.get('source_url')
for link in company_links:
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
meta = {}
meta['impersonate'] = random.choice(browsers)
meta['source_url']=source_url
yield response.follow(link, meta=meta, headers=self.headers, cookies=self.cookies, callback=self.parse_details)
# break # Use during testing to limit pagination
# Handle pagination by identifying total pages and iterating through all
next_page = response.xpath('//a[contains(text(),">>")]/@href').get()
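# The code assumes the '>>' link points at the last results page, so its 'page' query
# parameter doubles as the total page count (e.g. a href ending in '&page=87' implies pages 0-87).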
if next_page:
total_page = int(next_page.split('page=')[-1])
for page in range(0, total_page+1):
paginated_url = f"{self.url}&page={page}"
browsers = [
"chrome110",
"edge99",
"safari15_5"
]
meta = {}
meta['impersonate'] = random.choice(browsers)
meta['source_url']=paginated_url
yield scrapy.Request(url=paginated_url, meta=meta, headers=self.headers, cookies=self.cookies, callback=self.parse_listing)
def parse_details(self, response):
# Known countries list
self.known_countries = ['india', 'usa', 'united states', 'uk', 'canada',
'australia', 'bahamas', 'singapore', 'germany', 'france', 'hong kong',
'north america', 'america']
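# Country matching below is a lowercase exact match against this list, so parenthesized
# values such as '(Hong Kong)' or '(USA)' are kept as countries rather than treated as aliases.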
source_url = response.meta.get('source_url')
# Extract all key-value details from the company profile page
item = dict()
raw_item = dict() # For storing completely raw data
item["PDP URL"] = response.url
raw_item["PDP URL"] = response.url
for path in response.xpath('//div[@id="contentResult"]//b'):
item['source_url']=source_url
data_skip = False
key = path.xpath('./text()').get(default='NA').strip()
raw_value = ''.join(path.xpath('./following-sibling::text() | ./following-sibling::a//text()').getall()).split('\n:')[0].replace(':\xa0', '').strip()
value = raw_value # Initialize cleaned value with raw value
# Store raw value first
raw_item[key] = raw_value
# Format date fields into YYYY-MM-DD format (only for cleaned data)
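# e.g. a raw value like "January 5, 2021" parses with "%B %d, %Y" and is written back as "2021-01-05"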
if 'Date' in key:
try:
date_obj = datetime.strptime(value, "%B %d, %Y")
value = date_obj.strftime("%Y-%m-%d")
except Exception:
pass
# Remove "Note:" prefix and clean whitespace if key is "Note"
if key == 'Notes':
value = re.sub(r'^Note:\s*', '', value).strip()
# Special handling for Company field
if key == 'Company':
# Strip off "d/b/a", "D.B.A.", etc.
# d/b/a GR-Duratech Texas, Inc.
# Cleanup value: remove leading "d/b/a" or similar
value = re.sub(r'(?i)\b(d/?b/?a)\b\s*', '', value).strip()
parens = re.findall(r'\(([^()]+)\)', value)
aliases = []
country = None
# 1. Check inside () for country or special cases (Client, HK)
for val in parens:
cleaned_val = val.strip().lower()
# Remove phrases like "now known as", "formerly", etc.
cleaned_val = re.sub(r'^(now known as|formerly known as|also known as)\s+', '', cleaned_val, flags=re.IGNORECASE)
# Check if it's a known country
if cleaned_val in self.known_countries:
# Preserve country in parentheses (no removal, no alias)
value = re.sub(r'\(' + re.escape(val) + r'\)', f' ({val})', value)
country = cleaned_val
else:
# Only add as an alias if the parenthesized text is part of the main
# company name (not added during the split operation)
if f'({val})' in value:
aliases.append(cleaned_val)
# 2. Remove () content only if it is not a country
for val in parens:
if val.lower() not in self.known_countries:
value = re.sub(r'\(' + re.escape(val) + r'\)', ' ', value)
# Extract aliases from known keywords
alias_keywords_regex = r'\b(?:f/k/a|fka|d/b/a/?|a/k/a|aka|dba)\b'
parts = re.split(alias_keywords_regex, value, flags=re.IGNORECASE)
parts = [p.strip().strip(' ,;') for p in parts if p.strip()]
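# e.g. "Acme Corp f/k/a Beta Holdings LLC" splits into ["Acme Corp", "Beta Holdings LLC"]:
# the first part is kept as the company name, the rest become alias candidates.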
if parts:
company_name = parts[0]
extracted_aliases = parts[1:]
cleaned_aliases = [
re.sub(r'^[/\s]+', '', a.lower().strip())
for a in extracted_aliases
if len(a.strip()) >= 3 and a.lower().strip() not in ["hk"]
]
aliases.extend(cleaned_aliases)
else:
company_name = value.strip()
# 4. Clean up the company name (replace unwanted punctuation with spaces, normalize whitespace)
company_name = self.clean_text_punct(company_name)  # Apply the custom clean function
item[key] = company_name
# 5. Assign Alias 1, Alias 2 (in order)
for idx, alias_val in enumerate(aliases, start=1):
# Remove leading slashes/spaces
alias_val = re.sub(r'^[/\s]+', '', alias_val)
# Remove alias keywords (fka/dba/etc.) from start
remove_words = ['fka', 'f/k/a', 'dba', 'd/b/a', 'aka', 'a/k/a']
pattern = r'^(?:' + '|'.join(re.escape(word) for word in remove_words) + r')\s+'
alias_val = re.sub(pattern, '', alias_val, flags=re.IGNORECASE)
# Split on 'and' and process each part
split_aliases = re.split(r'\s+and\s+', alias_val, flags=re.IGNORECASE)
for i, part in enumerate(split_aliases, start=idx):
part = part.strip()
if part: # Only add non-empty parts
# Handle cases where 'and' was merged into the name (e.g. 'andmercy' -> 'mercy')
part = re.sub(r'^and\s*', '', part, flags=re.IGNORECASE)
item[f'Alias {i}'] = part
# Update idx to avoid overwriting in next iteration
idx += len(split_aliases) - 1
continue
# Handle Current Parent Company (similar to Company, but without alias/country extraction)
if key == 'Current Parent Company':
# Clean up punctuation (keeping - ' & /)
text_no_punct = value.translate(str.maketrans('', '',
string.punctuation.replace('-', '')
.replace("'", '')
.replace('&', '')
.replace('/', '')))
value = re.sub(r'\s+', ' ', text_no_punct).strip()
item[key + " name"] = value
item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key + " name"] = raw_value
raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
continue
# Handle special keys (Mega-Scandal, Source of Data)
if 'Mega-Scandal' in key or 'Source of Data' in key:
if 'Mega-Scandal' in key:
item[key + " name"] = value
item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key + " name"] = raw_value
raw_item[key + " url"] = path.xpath('./following-sibling::a/@href').get(default='')
else:
value = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key] = value
item[key] = value
data_skip = True
# Handle link-based values (e.g., "click here")
if not data_skip:
if "(click here)" in value.lower():
value = path.xpath('./following-sibling::a/@href').get(default='')
raw_item[key] = value
item[key] = value # All other fields keep original punctuation
# Additional handling for specific URL columns
url_columns = ['Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
for url_col in url_columns:
if url_col in item and isinstance(item[url_col], str) and "(click here)" in item[url_col].lower():
url_value = response.xpath(
f'//b[contains(text(), "{url_col.replace(" url", "")}")]/following-sibling::a/@href').get(default='')
item[url_col] = url_value
raw_item[url_col] = url_value
split_items = self.split_company_names(item)
for new_item in split_items:
self.data.append(new_item)
self.raw_data.append(raw_item)
def split_company_names(self, item):
split_keywords = [
# Common US suffixes
'inc', 'incorporated', 'corp', 'corporation',
'llc', 'ltd', 'limited', 'lp', 'llp', 'plc', 'communities', 'center',
'co', 'company', 'group'
]
company_name = item['Company']
# First extract all parentheses content and their positions
paren_matches = list(re.finditer(r'\(([^()]+)\)', company_name))
# Split on ' and ' (with surrounding whitespace), keeping the separator via the capture group
parts = re.split(r'\s+(and)\s+', company_name, flags=re.IGNORECASE)
# If no splitting occurred, return original item in a list
if len(parts) == 1:
return [item]
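# An item is split into multiple records only when the text to the left of ' and ' ends with a
# corporate suffix, e.g. "Alpha Builders Inc and Beta Homes LLC" becomes two items, while
# "Research and Development Corp" stays as one.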
result_items = []
current = parts[0].strip()
i = 1
while i < len(parts):
sep = parts[i]
next_part = parts[i + 1].strip()
# Check if current ends with a keyword
keyword_match = False
for kw in split_keywords:
if re.search(rf'\b{kw}\.?$', current, re.IGNORECASE):
keyword_match = True
break
if keyword_match:
# Create a new item with the current company name
new_item = item.copy()
# Remove any aliases from the item copy since we'll reprocess them
for k in list(new_item.keys()):
if k.startswith('Alias '):
del new_item[k]
new_item['Company'] = current
result_items.append(new_item)
current = next_part
else:
current += f" {sep} {next_part}"
i += 2
# Add the last remaining part
new_item = item.copy()
# Remove any aliases from the item copy since we'll reprocess them
for k in list(new_item.keys()):
if k.startswith('Alias '):
del new_item[k]
new_item['Company'] = current
result_items.append(new_item)
# Now reprocess each item to extract aliases specific to that company
final_items = []
for split_item in result_items:
# Make a copy to avoid modifying the original
processed_item = split_item.copy()
company_name = processed_item['Company']
# Extract and process aliases just for this company
parens = re.findall(r'\(([^()]+)\)', company_name)
aliases = []
country = None
# Check inside () for country or aliases
for val in parens:
cleaned_val = val.strip().lower()
if cleaned_val.upper() != "N.A.":
# Remove phrases like "now known as", "formerly", etc.
cleaned_val = re.sub(r'^(now known as|formerly known as|also known as)\s+', '', cleaned_val, flags=re.IGNORECASE)
# Check if it's a known country
if cleaned_val in self.known_countries:
country = cleaned_val
else:
aliases.append(cleaned_val)
# Remove () content from company name
processed_item['Company'] = re.sub(r'\([^()]+\)', '', company_name).strip()
# Assign aliases if any
for idx, alias_val in enumerate(aliases[:2], start=1):
processed_item[f'Alias {idx}'] = alias_val
final_items.append(processed_item)
return final_items
# def clean_text_punct(text):
#     """Replace punctuation with space (except - ' & /), then normalize whitespace"""
#     if not isinstance(text, str) or text.strip() == '':
#         return text
#     keep_chars = {"&", "-", "/", "'"}
#     cleaned = ''.join(char if char not in string.punctuation or char in keep_chars else ' ' for char in text)
#     return re.sub(r'\s+', ' ', cleaned).strip()
def extract_name_alias(self, entry):
"""
Extract country from a given company name string.
Returns a tuple: (name, country)
"""
try:
if not isinstance(entry, str):
return entry, None
entry = entry.strip()
country = None
# Extract and remove country from parentheses
parens = re.findall(r'\(([^()]+)\)', entry)
for val in parens:
if val.strip().lower() in self.known_countries:
country = val.strip()
entry = re.sub(r'\(' + re.escape(val) + r'\)', '', entry).strip()
return entry, country
except Exception:
return entry, None
# 2. Remove punctuation except &-/'
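# Illustrative example: clean_text_punct("Acme, Inc. (USA)") would yield "Acme Inc (USA)"
# (periods and commas become spaces, parentheses are preserved, whitespace is collapsed).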
def clean_text_punct(self,text):
"""Clean punctuation and whitespace for Company-related fields"""
if not isinstance(text, str) or text.strip() == '':
return text
keep_chars = {"&", "-", "/", "'"}
cleaned = []
text = text.replace(".", " ")
# Process the text and preserve country names inside parentheses
parens = re.findall(r'\(([^()]+)\)', text)  # Extract all content inside parentheses
if "HK" not in text:
text = text.replace("(", "abcdef")
text = text.replace(")", "fedcba")
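# Parentheses were swapped for the sentinel tokens above so they survive the punctuation
# pass below and are restored afterwards; the swap is skipped when "HK" appears in the text,
# so those parentheses are stripped with the rest of the punctuation.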
for char in text:
if char in string.punctuation and char not in keep_chars:
cleaned.append(' ')
else:
cleaned.append(char)
cleaned = ''.join(cleaned)
cleaned = cleaned.replace("abcdef", "(")
cleaned = cleaned.replace("fedcba", ")")
return re.sub(r'\s+', ' ', cleaned).strip()
def normalize_whitespace(self,text):
"""Normalize whitespace (used for non-Company fields)"""
if not isinstance(text, str) or text.strip() == '':
return text
return re.sub(r'\s+', ' ', text).strip()
def close(self, reason):
# Create DataFrames
df_cleaned = pd.DataFrame(self.data)
df_uncleaned = pd.DataFrame(self.raw_data) # Use completely raw data
# Add an ID column and move source_url to the front of the cleaned DataFrame
df_cleaned.insert(0, 'ID', range(1, 1 + len(df_cleaned)))
df_cleaned = df_cleaned[['ID', 'source_url'] + [col for col in df_cleaned.columns if col not in ['ID', 'source_url']]]
# Only add ID to uncleaned data (without Source URL)
df_uncleaned.insert(0, 'ID', range(1, 1 + len(df_uncleaned)))
# df_uncleaned = df_uncleaned.drop(columns=['Current Parent Company url', 'Source of Data'])
df_cleaned = df_cleaned.rename(columns={'Current Parent Company name': 'Current Parent Company'})
# Save uncleaned file (completely raw data)
timestamp = datetime.now().strftime('%Y-%m-%d_%H%M%S')
filename_uncleaned = f"raw_data_violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
filepath_uncleaned = os.path.join(self.excel, filename_uncleaned)
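# xlsxwriter's 'strings_to_numbers' option converts numeric-looking strings (e.g. penalty
# amounts) into real numbers in the exported workbook.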
with pd.ExcelWriter(filepath_uncleaned, engine='xlsxwriter', engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
df_uncleaned.fillna("").to_excel(writer, index=False)
# Process cleaned data
protected_columns = ['ID', 'source_url', 'PDP URL', 'Penalty', 'Date',
'Current Parent Company url', 'Archived Source', 'Mega-Scandal url']
# 1. Replace N/A with blank except in protected columns
for col in df_cleaned.columns:
if col not in protected_columns:
df_cleaned[col] = df_cleaned[col].replace(['N/A', 'NA'], '')
columns_to_clean = ['Company', 'Current Parent Company']
# Apply per-column logic
for col in df_cleaned.columns:
if col in protected_columns:
continue
if df_cleaned[col].dtype == object:
if col in columns_to_clean:
df_cleaned[col] = df_cleaned[col].apply(self.clean_text_punct)
else:
df_cleaned[col] = df_cleaned[col].apply(self.normalize_whitespace)
# Save cleaned file
filename_cleaned = f"violationtracker_goodjobsfirst_org_{timestamp}.xlsx"
filepath_cleaned = os.path.join(self.excel, filename_cleaned)
with pd.ExcelWriter(filepath_cleaned, engine='xlsxwriter', engine_kwargs={'options': {'strings_to_numbers': True}}) as writer:
df_cleaned.to_excel(writer, index=False)
if __name__ == '__main__':
execute("scrapy crawl EEOC".split())