310503: Statistics and Machine Learning
For a payroll dataset, compute the measures of central tendency and the corresponding measures of
dispersion for statistical analysis of the given data.
Program:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/payroll dataset.csv')
# Descriptive Statistics
# Measures of Central Tendency
mean_salary = data['Salary'].mean()
median_salary = data['Salary'].median()
mode_salary = data['Salary'].mode()[0]
mid_range_salary = (data['Salary'].max() + data['Salary'].min()) / 2
# Measures of Dispersion
range_salary = data['Salary'].max() - data['Salary'].min()
variance_salary = data['Salary'].var()
mean_deviation_salary = np.mean(np.abs(data['Salary'] - mean_salary))
std_deviation_salary = data['Salary'].std()
# Tabulate the results
results = {
    'Measure': ['Mean', 'Median', 'Mode', 'Mid-Range', 'Range',
                'Variance', 'Mean Deviation', 'Standard Deviation'],
    'Value': [mean_salary, median_salary, mode_salary, mid_range_salary,
              range_salary, variance_salary, mean_deviation_salary,
              std_deviation_salary]
}
results_df = pd.DataFrame(results)
print("Descriptive Statistics:")
print(results_df)
# Plot the Salary distribution
plt.figure(figsize=(10, 6))
plt.hist(data['Salary'], bins=20, color='blue', alpha=0.7, edgecolor='black')
plt.axvline(mean_salary, color='red', linestyle='dashed', linewidth=1,
            label=f'Mean: {mean_salary:.2f}')
plt.axvline(median_salary, color='green', linestyle='dashed', linewidth=1,
            label=f'Median: {median_salary:.2f}')
plt.axvline(mode_salary, color='purple', linestyle='dashed', linewidth=1,
            label=f'Mode: {mode_salary:.2f}')
plt.xlabel('Salary')
plt.ylabel('Frequency')
plt.title('Salary Distribution with Central Tendency Measures')
plt.legend()
plt.show()
# Boxplot to visualize dispersion
plt.figure(figsize=(8, 6))
plt.boxplot(data['Salary'], vert=False, patch_artist=True,
            boxprops=dict(facecolor='lightblue'))
plt.title('Boxplot of Salary (Measures of Dispersion)')
plt.xlabel('Salary')
plt.show()
# Importance of Statistical Inference in Machine Learning
print("\nImportance of Statistical Inference in Machine Learning:")
print("""
1. **Descriptive Statistics**: Helps in understanding the data
distribution and summarizing the main features.
2. **Inferential Statistics**: Allows making predictions or inferences
about a population based on sample data.
3. **Model Evaluation**: Statistical inference is crucial for
evaluating model performance, understanding uncertainty, and making
data-driven decisions.
4. **Hypothesis Testing**: Used to validate assumptions and test the
significance of features in machine learning models.
5. **Confidence Intervals**: Provide a range of values within which the
true population parameter is expected to lie.
""")
Descriptive Statistics:
Measure Value
0 Mean 2.059147e+06
1 Median 2.500000e+05
2 Mode 2.500000e+04
3 Mid-Range 5.001500e+06
4 Range 9.997000e+06
5 Variance 1.004968e+13
6 Mean Deviation 2.610499e+06
7 Standard Deviation 3.170124e+06
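The output above shows a mean (about 2.06e+06) far larger than the median (2.50e+05), which indicates a strongly right-skewed salary distribution. As a minimal sketch of point 5 in the list printed by the program (confidence intervals), the snippet below estimates a 95% confidence interval for the mean salary. It is an optional illustration, not part of the assignment output, and it assumes the same DataFrame data with a 'Salary' column as loaded above, plus scipy.stats, which is available in Colab.
# Sketch (illustrative only): 95% confidence interval for the mean salary
from scipy import stats
salary = data['Salary'].dropna()
n = len(salary)
sample_mean = salary.mean()
standard_error = salary.std(ddof=1) / np.sqrt(n)
# t-based interval; for large n this is close to the normal approximation
ci_low, ci_high = stats.t.interval(0.95, n - 1, loc=sample_mean, scale=standard_error)
print(f"95% CI for mean salary: ({ci_low:,.2f}, {ci_high:,.2f})")
A wide interval reflects greater uncertainty about the population mean; with a larger sample the interval narrows around the sample mean.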
Create a probabilistic model for credit card fraud detection.
Program:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix,
                             roc_auc_score, roc_curve)
import matplotlib.pyplot as plt
from google.colab import drive
drive.mount('/content/drive')
# Load the dataset
data = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/csv/Credit Card Fraud Detection.csv')
# Display the first few rows of the dataset
print(data.head())
# Check for missing values
print("\nMissing values in the dataset:")
print(data.isnull().sum())
# Dataset information
print("\nDataset information:")
print(data.info())
# Class distribution (fraud vs non-fraud)
print("\nClass distribution:")
print(data['Class'].value_counts())
# Separate features (X) and target (y)
X = data.drop('Class', axis=1)
y = data['Class']
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y)
# Standardize the features (important for Logistic Regression)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
# Train a Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)
model.fit(X_train, y_train)
# Predict probabilities on the test set
y_pred_prob = model.predict_proba(X_test)[:, 1]  # Probability of fraud (Class 1)
# Predict classes on the test set
y_pred = model.predict(X_test)
# Evaluate the model
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("\nClassification Report:")
print(classification_report(y_test, y_pred))
# ROC-AUC Score
roc_auc = roc_auc_score(y_test, y_pred_prob)
print(f"\nROC-AUC Score: {roc_auc:.4f}")
# Plot ROC Curve
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)
plt.figure(figsize=(8, 6))
plt.plot(fpr, tpr, label=f'ROC Curve (AUC = {roc_auc:.2f})')
plt.plot([0, 1], [0, 1], 'k--')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for Fraud Detection')
plt.legend()
plt.show()
# Example: Predict fraud probability for a new transaction
new_transaction = np.array([[0, -1.359807, -0.072781, 2.536347, 1.378155,
                             -0.338321, 0.462388, 0.239599, 0.098698, 0.363787,
                             0.090794, -0.551600, -0.617801, -0.991390, -0.311169,
                             1.468177, -0.470401, 0.207971, 0.025791, 0.403993,
                             0.251412, -0.018307, 0.277838, -0.110474, 0.066928,
                             0.128539, -0.189115, 0.133558, -0.021053, 149.62]])
new_transaction_scaled = scaler.transform(new_transaction)
fraud_probability = model.predict_proba(new_transaction_scaled)[0, 1]
print(f"\nFraud Probability for New Transaction:
{fraud_probability:.4f}")
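Credit card fraud data is highly imbalanced (far more legitimate transactions than frauds), so the default 0.5 decision threshold used by predict can miss many fraud cases. The sketch below is one possible extension, not part of the original program: it re-trains the logistic regression with class_weight='balanced' and applies a custom threshold to the predicted fraud probabilities. It assumes the X_train, X_test, y_train, y_test arrays and imports from above, and the 0.10 threshold is only an illustrative choice, not a recommended value.
# Optional extension (illustrative only): class weighting plus a custom
# decision threshold on the predicted fraud probabilities.
weighted_model = LogisticRegression(random_state=42, max_iter=1000,
                                    class_weight='balanced')
weighted_model.fit(X_train, y_train)
fraud_prob = weighted_model.predict_proba(X_test)[:, 1]
threshold = 0.10  # flag a transaction as fraud if P(fraud) exceeds this value
y_pred_threshold = (fraud_prob >= threshold).astype(int)
print(classification_report(y_test, y_pred_threshold))
Lowering the threshold trades more false positives for higher recall on the minority fraud class, which is usually the less costly error in fraud screening.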