0% found this document useful (0 votes)
16 views5 pages

Sample Discovery

asdfghjkl;'
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
16 views5 pages

Sample Discovery

asdfghjkl;'
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

Sample_Discovery

August 6, 2024

[1]: import pandas as pd

# Load the dataset


df = pd.read_csv('messy_employee_data.csv')

# Descriptive Statistics
descriptive_stats = df.describe()
print(descriptive_stats)

Employee_ID Phone_Number Total_Work_Hour_per_Month Salary_per_Month


count 100.000000 9.000000e+01 79.000000 74.000000
mean 5874.450000 5.014917e+09 2631.392405 24305.405405
std 2698.235527 2.946591e+09 4317.578213 37333.773655
min 1139.000000 2.058916e+08 -10.000000 -5000.000000
25% 3639.250000 2.417400e+09 160.000000 8000.000000
50% 6168.000000 4.735941e+09 180.000000 10800.000000
75% 8183.250000 7.711621e+09 5099.500000 12600.000000
max 9957.000000 9.871025e+09 9999.000000 100000.000000

[2]: # Completeness Metric


completeness = df.notnull().mean() * 100
print("Completeness Metric:\n", completeness)

Completeness Metric:
Employee_Name 90.0
Employee_ID 100.0
Job_Role 91.0
Phone_Number 90.0
Email_ID 90.0
Total_Work_Hour_per_Month 79.0
Salary_per_Month 74.0
dtype: float64

[3]: # Accuracy Metric for Salary_per_Month (ensure non-negative values)


accuracy_salary = (df['Salary_per_Month'] >= 0).mean() * 100
print(f"Accuracy for Salary per Month: {accuracy_salary}%")

Accuracy for Salary per Month: 59.0%

1
[4]: # Consistency Metric for Job_Role
valid_job_roles = ['Engineer', 'Data Scientist', 'Manager', 'Analyst',␣
,→'Developer']

consistency_job_role = df['Job_Role'].isin(valid_job_roles).mean() * 100


print(f"Consistency for Job Role: {consistency_job_role}%")

Consistency for Job Role: 91.0%

[5]: # Correlation between Salary and Total Work Hours


correlation = df[['Total_Work_Hour_per_Month', 'Salary_per_Month']].corr()
print("Correlation:\n", correlation)

Correlation:
Total_Work_Hour_per_Month Salary_per_Month
Total_Work_Hour_per_Month 1.000000 0.600796
Salary_per_Month 0.600796 1.000000

[6]: import seaborn as sns


import matplotlib.pyplot as plt

# Plot the heatmap


plt.figure(figsize=(10, 8))
sns.heatmap(correlation, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Correlation Matrix Heatmap')
plt.show()

2
[7]: # Missing Data Analysis
missing_values_count = df.isnull().sum()
#missing_data_pattern = df.isnull()
print("Missing Values Count:\n", missing_values_count)
#print("Missing Data Pattern:\n", missing_data_pattern)

Missing Values Count:


Employee_Name 10
Employee_ID 0
Job_Role 9
Phone_Number 10
Email_ID 10
Total_Work_Hour_per_Month 21
Salary_per_Month 26
dtype: int64

3
[14]: # Fill missing values
df['Employee_Name'].fillna('Unknown', inplace=True)
df['Employee_ID'].fillna('Unknown', inplace=True)
df['Phone_Number'].replace('unknown', None, inplace=True)
df['Phone_Number'].fillna('Unknown', inplace=True)
df['Total_Work_Hour_per_Month'].fillna(df['Total_Work_Hour_per_Month'].mean(),␣
,→inplace=True)

df['Salary_per_Month'].fillna(df['Salary_per_Month'].mean(), inplace=True)

# Correct data types


df['Employee_ID'] = df['Employee_ID'].astype(str)
df['Total_Work_Hour_per_Month'] = df['Total_Work_Hour_per_Month'].astype(float)
df['Salary_per_Month'] = df['Salary_per_Month'].astype(float)
#df['Phone_Number']= df['Phone_Number'].astype(str)
# Normalize Email IDs
df['Email_ID'] = df['Email_ID'].apply(lambda x: x if pd.isna(x) or '@' in x else␣
,→x + '@example.com')

# Ensure consistent job role naming


df['Job_Role'] = df['Job_Role'].str.title()

# Format phone numbers (dummy formatting for demonstration)


df['Phone_Number'] = df['Phone_Number'].apply(lambda x: x if pd.isna(x) or x ==␣
,→'Unknown' else x.replace('-', ''))

df['Phone_Number'] = df['Phone_Number'].astype(str)
# Ensure Employee Names are title case
df['Employee_Name'] = df['Employee_Name'].str.title()

# Remove duplicate rows (if any)


df.drop_duplicates(inplace=True)

print("Structured and Formatted Dataset:\n", df)

Structured and Formatted Dataset:


Employee_Name Employee_ID Job_Role Phone_Number Email_ID
\
0 Ospjpqptpe 9516 Manager 4733377351.0 [email protected]
1 Arwarmgzmo 8444 Manager Unknown [email protected]
2 Qwxbncqkag 2420 Manager Unknown [email protected]
3 Akmmthjndy 3445 Developer 4901793467.0 [email protected]
4 Croxaopkbi 9378 Manager 8679802795.0 [email protected]
.. ... ... ... ... ...
95 Nsgdvhcolz 3727 Analyst 3957223288.0 [email protected]
96 Unknown 3857 Data Scientist 7050739609.0 [email protected]
97 Vbowqqbmye 2467 Data Scientist 8921949055.0 [email protected]
98 Kukowpctzv 3553 Developer 9197584574.0 NaN

4
99 Pvvicpbxnk 9638 Analyst 3176133724.0 [email protected]

Total_Work_Hour_per_Month Salary_per_Month
0 -10.000000 24305.405405
1 9999.000000 -5000.000000
2 180.000000 10800.000000
3 200.000000 12000.000000
4 180.000000 10800.000000
.. ... ...
95 200.000000 10000.000000
96 160.000000 11200.000000
97 160.000000 8000.000000
98 2631.392405 12000.000000
99 180.000000 10800.000000

[100 rows x 7 columns]

[17]: df['Employee_Name'].dtype

[17]: dtype('O')

[ ]:

You might also like