Fake Job Detection
Fake Job Detection
In [2]:
!pip install -U spacy
In [3]:
import re
import string
import numpy as np
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.base import TransformerMixin
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, confusion_matrix
from wordcloud import WordCloud
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.lang.en import English
In [4]:
df=pd.read_csv('fake_job_postings.csv')
In [5]:
df.head()
Out[5]: job_id title location department salary_range company_profile description requirements benefits telecommuting has_company_logo has_questions employment_type required_experien
We're Food52,
Food52, a fast- Experience with
Marketing US, NY, and we've
0 1 Marketing NaN growing, James content management NaN 0 1 0 Other Internsh
Intern New York created a
Beard Award-winn... systems a m...
groundbreaki...
What you
Customer 90 Seconds, the will get
Organised - Focused What we expect from
Service - Cloud NZ, , worlds Cloud from
1 2 Success NaN - Vibrant - you:Your key 0 1 0 Full-time Not Applicab
Video Auckland Video Production usThrough
Awesome!Do you... responsibilit...
Production ... being part
of...
Our
culture is
Account
Our passion for THE COMPANY: ESRI EDUCATION: Bachelor’s anything
Executive - US, DC,
3 4 Sales NaN improving quality – Environmental or Master’s in GIS, but 0 1 0 Full-time Mid-Senior lev
Washington Washington
of life thro... Systems Rese... busi... corporate
DC
—we have
...
SpotSource
JOB TITLE: QUALIFICATIONS:RN Full
Bill Review US, FL, Fort Solutions LLC is a
4 5 NaN NaN Itemization Review license in the State of Benefits 0 1 1 Full-time Mid-Senior lev
Manager Worth Global Human
ManagerLOCATION:... Texa... Offered
Cap...
In [6]:
df.shape
In [7]:
df.isnull().sum()
Out[7]: job_id 0
title 0
location 346
department 11547
salary_range 15012
company_profile 3308
description 1
requirements 2695
benefits 7210
telecommuting 0
has_company_logo 0
has_questions 0
employment_type 3471
required_experience 7050
required_education 8105
industry 4903
function 6455
fraudulent 0
dtype: int64
In [8]:
columns = ['job_id', 'telecommuting', 'has_company_logo', 'has_questions', 'salary_range', 'employment_type']
for colu in columns:
del df[colu]
In [9]:
df.head()
Out[9]: title location department company_profile description requirements benefits required_experience required_education industry function fraudulent
Customer Service - 90 Seconds, the worlds Organised - Focused - What you will get Marketing
NZ, , What we expect from Customer
1 Cloud Video Success Cloud Video Vibrant - Awesome!Do from usThrough Not Applicable NaN and 0
Auckland you:Your key responsibilit... Service
Production Production ... you... being part of... Advertising
Our culture is
Our passion for THE COMPANY: ESRI –
Account Executive - US, DC, EDUCATION: Bachelor’s or anything but Computer
3 Sales improving quality of Environmental Systems Mid-Senior level Bachelor's Degree Sales 0
Washington DC Washington Master’s in GIS, busi... corporate—we Software
life thro... Rese...
have ...
In [10]:
df.fillna('', inplace=True)
In [11]:
plt.figure(figsize=(15,5))
sns.countplot(y='fraudulent', data=df)
plt.show()
In [12]:
df.groupby('fraudulent')['fraudulent'].count()
Out[12]: fraudulent
0 17014
1 866
Name: fraudulent, dtype: int64
In [13]:
exp = dict(df.required_experience.value_counts())
del exp['']
In [14]:
exp
In [15]:
plt.figure(figsize=(10,5))
sns.set_theme(style='whitegrid')
plt.bar(exp.keys(), exp.values())
plt.title('No. of jobs with Experience', size=20)
plt.xlabel('Experience', size=10)
plt.ylabel('No. of jobs', size=10)
plt.xticks(rotation=30)
plt.show()
In [16]:
def split(location):
l = location.split(',')
return l[0]
df['country'] = df.location.apply(split)
In [17]:
df.head()
Out[17]: title location department company_profile description requirements benefits required_experience required_education industry function fraudulent country
Our culture is
Our passion for THE COMPANY: ESRI –
Account Executive US, DC, EDUCATION: Bachelor’s or anything but Computer
3 Sales improving quality of Environmental Systems Mid-Senior level Bachelor's Degree Sales 0 US
- Washington DC Washington Master’s in GIS, busi... corporate—we Software
life thro... Rese...
have ...
In [18]:
countr = dict(df.country.value_counts()[:14])
del countr['']
countr
In [19]:
plt.figure(figsize=(8,6))
plt.title('Country-wise Job Posting',size=20)
plt.bar(countr.keys(), countr.values())
plt.ylabel('No. of jobs', size=10)
plt.xlabel('Countries', size=10)
In [20]:
edu = dict(df.required_education.value_counts()[:7])
del edu['']
edu
In [21]:
plt.figure(figsize=(15,6))
plt.title('Job postings based on Education', size=20)
plt.bar(edu.keys(), edu.values())
plt.ylabel('No. of Jobs', size=10)
plt.xlabel('Education', size=10)
In [22]:
print(df[df.fraudulent==0].title.value_counts()[:10])
In [23]:
print(df[df.fraudulent==1].title.value_counts()[:10])
In [24]:
df['text']=df['title']+' '+df['company_profile']+' '+df['description']+' '+df['requirements']+' '+df['benefits']
del df['title']
del df['location']
del df['department']
del df['company_profile']
del df['description']
del df['requirements']
del df['benefits']
del df['required_experience']
del df['required_education']
del df['industry']
del df['function']
del df['country']
In [25]:
df.head()
In [26]:
fraudjobs_text = df[df.fraudulent==1].text
realjobs_text = df[df.fraudulent==0].text
In [27]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS
plt.figure(figsize=(16,14))
wc = WordCloud(min_font_size = 3, max_words = 3000, width = 1500, height = 800, stopwords= STOPWORDS).generate(str(" ".join(fraudjobs_text)))
plt.imshow(wc, interpolation = 'bilinear')
In [28]:
STOPWORDS = spacy.lang.en.stop_words.STOP_WORDS
plt.figure(figsize=(16,14))
wc = WordCloud(min_font_size = 3, max_words = 3000, width = 1500, height = 800, stopwords= STOPWORDS).generate(str(" ".join(realjobs_text)))
plt.imshow(wc, interpolation = 'bilinear')
In [29]:
!pip install spacy && python -m spacy download en
In [30]:
punctuations = string.punctuation
nlp = spacy.load("en_core_web_sm")
stop_words = spacy.lang.en.stop_words.STOP_WORDS
parser = English()
def spacy_tokenizer(sentence):
mytokens = parser(sentence)
mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]
return mytokens
In [31]:
df['text'] = df['text'].apply(clean_text)
In [32]:
cv = TfidfVectorizer(max_features = 100)
x = cv.fit_transform(df['text'])
df1 = pd.DataFrame(x.toarray(), columns = cv.get_feature_names())
df.drop(['text'], axis=1, inplace=True)
main_df = pd.concat([df1,df], axis=1)
In [33]:
main_df.head()
Out[33]: ability about all also amp an and are as at ... who will with work working world years you your fraudulent
0 0.000000 0.041120 0.000000 0.042424 0.036488 0.000000 0.755238 0.000000 0.078653 0.000000 ... 0.000000 0.000000 0.186067 0.051026 0.068029 0.000000 0.000000 0.000000 0.000000 0
1 0.021895 0.094183 0.035394 0.024292 0.041787 0.029771 0.490896 0.056626 0.060050 0.052431 ... 0.000000 0.078004 0.165735 0.043827 0.116862 0.099327 0.000000 0.204854 0.130452 0
2 0.000000 0.000000 0.176807 0.000000 0.041749 0.089231 0.397029 0.113149 0.000000 0.000000 ... 0.000000 0.062346 0.307512 0.058383 0.000000 0.000000 0.000000 0.094462 0.074476 0
3 0.023267 0.000000 0.018806 0.000000 0.000000 0.094909 0.695542 0.000000 0.031906 0.037144 ... 0.023132 0.049735 0.075480 0.046573 0.000000 0.105551 0.019806 0.050236 0.059411 0
4 0.000000 0.000000 0.068009 0.000000 0.040147 0.028602 0.606379 0.081605 0.115386 0.000000 ... 0.000000 0.000000 0.159230 0.028071 0.037425 0.000000 0.035814 0.030279 0.107427 0
In [34]:
Y = main_df.iloc[:, -1]
X = main_df.iloc[:, :-1]
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)
(12516, 100)
(12516,)
(5364, 100)
(5364,)
In [35]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier(n_jobs=3,oob_score=True,n_estimators=100,criterion="entropy")
model = rfc.fit(X_train,Y_train)
In [36]:
print(X_test)
In [37]:
pred = rfc.predict(X_test)
score = accuracy_score(Y_test, pred)
score
Out[37]: 0.9737136465324385
In [38]:
print('Classification_Report\n')
print(classification_report(Y_test, pred))
print('Confusion Matrix\n')
print(confusion_matrix(Y_test, pred))
Classification_Report
Confusion Matrix
[[5117 0]
[ 141 106]]
In [ ]: