!pip install tldextract
Collecting tldextract
Downloading [Link] (11 kB)
Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (
Requirement already satisfied: requests>=2.1.0 in /usr/local/lib/python3.11/dist
Collecting requests-file>=1.4 (from tldextract)
Downloading requests_file-[Link] (1.7 kB)
Requirement already satisfied: filelock>=3.0.8 in /usr/local/lib/python3.11/dist
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/d
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/d
Downloading [Link] (107 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 107.4/107.4 kB 4.1 MB/s eta 0
Downloading requests_file-[Link] (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.3.0
!pip install scikit-learn imbalanced-learn matplotlib seaborn
import numpy as np
import pandas as pd
import re
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import RandomOverSampler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (Input, Embedding, Conv1D, MaxPooling1D,
                                     Bidirectional, LSTM, Dense, Dropout,
                                     BatchNormalization, GlobalMaxPooling1D,
                                     Concatenate)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
# Load dataset from Google Drive (Colab mount) and keep only the two columns used.
df = pd.read_csv('/content/drive/MyDrive/kaggle_datasets/email_dataset/spam_Emails_data.csv')
# Drop rows where either the email body or the label is missing.
df = df[['text', 'label']].dropna()
# Preprocessing pipeline
def preprocess_text(text):
    """Normalize raw email text for tokenization.

    Lowercases, strips HTML, replaces URLs/emails/currency amounts with
    placeholder tokens, de-obfuscates common spam words, and collapses
    whitespace. Returns the cleaned string.
    """
    text = str(text).lower()
    # Remove HTML tags
    text = re.sub(r'<[^>]+>', '', text)
    # Replace URLs with 'URL' (placeholder survives lowercasing since it is inserted after)
    text = re.sub(r'https?://\S+|www\.\S+', 'URL', text)
    # Replace email addresses with 'EMAIL'
    text = re.sub(r'\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b', 'EMAIL', text)
    # Replace currency amounts (symbol followed by digits) with 'CURRENCY'
    text = re.sub(r'[$€£¥]\d+\.?\d*', 'CURRENCY', text)
    # Normalize common obfuscations (e.g. "v1agra", "fr3e")
    text = re.sub(r'v[i1!][a@]gr[a@]', 'viagra', text)
    text = re.sub(r'fr[e3][e3]', 'free', text)
    # Collapse runs of whitespace to a single space
    text = re.sub(r'\s+', ' ', text).strip()
    return text
# Clean every email body, then binarize the labels (Ham -> 0, Spam -> 1).
df['text'] = df['text'].apply(preprocess_text)
label_codes = {'Ham': 0, 'Spam': 1}
df['label'] = df['label'].map(label_codes)
# Expanded keyword lists used as hand-crafted lexical features.
# (Truncated lines reconstructed: closing quotes/commas restored on
# 'discount', 'confirm', 'million'.)
SPAM_KEYWORDS = ['free', 'win', 'prize', 'offer', 'lottery', 'claim', 'exclusive', 'discount',
                 'deal', 'bonus', 'gift', 'reward', 'limited', 'special', 'cash', 'money',
                 'save', 'buy', 'shop']
URGENCY_KEYWORDS = ['urgent', 'now', 'immediately', 'act', 'last', 'expire', 'deadline',
                    'final', 'today', 'quick', 'hurry']
PHISHING_KEYWORDS = ['verify', 'login', 'account', 'password', 'secure', 'update', 'confirm',
                     'alert', 'suspended']
SCAM_KEYWORDS = ['inheritance', 'bank', 'transfer', 'funds', 'payment', 'deposit', 'million',
                 'billion']
CALL_TO_ACTION = ['click here', 'visit now', 'call now', 'apply now', 'get now']
# Feature extraction
def extract_features(text):
    """Compute a 20-dimensional hand-crafted feature vector for one email.

    Mixes surface statistics (length, punctuation counts), keyword counts
    against the module-level keyword lists, character-class ratios, and
    pattern counts. Returns a numpy array of shape (20,).
    """
    features = np.zeros(20)  # Increased to accommodate new features
    text = str(text)
    lowered = text.lower()  # hoist: reused by several keyword features below
    words = text.split()    # hoist: reused by word-count features below
    # Basic features
    features[0] = len(text)
    features[1] = text.count('!')
    features[2] = text.count('?')
    features[3] = text.count('$')
    features[4] = text.count('@')
    # Keyword counts (substring counts, so e.g. 'act' also matches inside words)
    features[5] = sum(lowered.count(kw) for kw in SPAM_KEYWORDS)
    features[6] = sum(lowered.count(kw) for kw in URGENCY_KEYWORDS)
    # Character-class ratios (max(1, ...) guards empty input)
    features[7] = sum(c.isupper() for c in text) / max(1, len(text))
    features[8] = sum(c.isdigit() for c in text) / max(1, len(text))
    # Placeholder tokens inserted by preprocess_text
    features[9] = len(re.findall(r'URL', text))
    features[10] = len(re.findall(r'EMAIL', text))
    features[11] = len(re.findall(r'\b\d{5,}\b', text))  # Long numbers
    features[12] = len(words)  # Word count
    features[13] = len(set(words)) / max(1, len(words))  # Unique word ratio
    features[14] = 1 if 'attachment' in lowered else 0
    # New features
    features[15] = len(re.findall(r'[\U0001F600-\U0001F64F\U0001F300-\U0001F5FF]', text))  # Emoji count
    features[16] = 1 if any(kw in lowered for kw in ['noreply', 'admin', 'support']) else 0
    features[17] = features[5] / max(1, len(words))  # Spam keyword density
    features[18] = len(re.findall(r'[*#~\^]', text)) / max(1, len(text))  # Special character density
    features[19] = sum(lowered.count(phrase) for phrase in CALL_TO_ACTION)  # Call-to-action count
    return features
# Build the numeric feature matrix and the label vector.
X_num = np.array([extract_features(text) for text in df['text']])
y = df['label'].values
# Scale features to zero mean / unit variance (scaler is saved later for inference).
scaler = StandardScaler()
X_num = scaler.fit_transform(X_num)
# Tokenization
max_words = 20000
max_len = 256  # Optimized for email length
tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
tokenizer.fit_on_texts(df['text'])
sequences = tokenizer.texts_to_sequences(df['text'])
X_text = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
# Class balancing: oversample the minority class, then mirror the sampler's
# row selection onto the padded text sequences so both inputs stay aligned.
sampler = RandomOverSampler(random_state=42)
X_num, y = sampler.fit_resample(X_num, y)
X_text = np.array([X_text[i] for i in sampler.sample_indices_])
# Split data: hold out 20% for testing, then carve 20% of the remaining
# training rows into a validation set (64/16/20 overall, fixed seed).
split_opts = dict(test_size=0.2, random_state=42)
X_text_train, X_text_test, X_num_train, X_num_test, y_train, y_test = train_test_split(
    X_text, X_num, y, **split_opts
)
X_text_train, X_text_val, X_num_train, X_num_val, y_train, y_val = train_test_split(
    X_text_train, X_num_train, y_train, **split_opts
)
# Input layers
text_input = Input(shape=(max_len,), name='text_input')
num_input = Input(shape=(X_num.shape[1],), name='num_input')
# Text processing branch: embed -> two Conv/pool stages -> BiLSTM -> global max pool
x = Embedding(max_words, 128)(text_input)
x = Conv1D(64, 5, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = MaxPooling1D(2)(x)
x = Conv1D(128, 3, activation='relu', padding='same')(x)
x = BatchNormalization()(x)
x = MaxPooling1D(2)(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x = GlobalMaxPooling1D()(x)
x = Dropout(0.5)(x)
# Numerical features branch.
# NOTE: renamed from `y` — the original shadowed the label array `y` defined above.
num_branch = Dense(64, activation='relu')(num_input)
num_branch = BatchNormalization()(num_branch)
num_branch = Dropout(0.3)(num_branch)
# Combined model: concatenate both branches, one dense head, sigmoid output.
combined = Concatenate()([x, num_branch])
z = Dense(64, activation='relu')(combined)
z = Dropout(0.3)(z)
output = Dense(1, activation='sigmoid')(z)
model = Model(inputs=[text_input, num_input], outputs=output)
# Compile with binary cross-entropy; track precision/recall/AUC alongside accuracy.
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=['accuracy',
             tf.keras.metrics.Precision(name='precision'),
             tf.keras.metrics.Recall(name='recall'),
             tf.keras.metrics.AUC(name='auc')]
)
# Training callbacks: stop early on best validation AUC, decay LR on loss
# plateau, and checkpoint the best-AUC model. `mode='max'` is required when
# monitoring val_auc (the training log's "improved from -inf" confirms it).
callbacks = [
    EarlyStopping(monitor='val_auc', patience=5, mode='max', restore_best_weights=True),
    ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=2, min_lr=1e-6),
    ModelCheckpoint(filepath='best_gmail_spam_model.h5', monitor='val_auc',
                    save_best_only=True, mode='max'),
]
# Train the model; `history` keeps per-epoch metrics for later inspection.
history = model.fit(
    [X_text_train, X_num_train], y_train,
    validation_data=([X_text_val, X_num_val], y_val),
    epochs=10,       # Increased for better convergence
    batch_size=64,   # Slightly smaller for better gradient updates
    callbacks=callbacks,
    verbose=1
)
# Predictions: keep the raw sigmoid scores, then threshold at 0.5 for labels.
y_prob = model.predict([X_text_test, X_num_test])
y_pred = (y_prob > 0.5).astype(int)
# Metrics. FIX: roc_auc_score must receive the probability scores, not the
# thresholded 0/1 labels — passing hard labels understates the true AUC.
print(classification_report(y_test, y_pred, target_names=['Ham', 'Spam']))
print(f"AUC-ROC: {roc_auc_score(y_test, y_prob):.4f}")
# Confusion matrix heatmap
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=['Ham', 'Spam'], yticklabels=['Ham', 'Spam'])
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
# Persist the model and both preprocessing artifacts needed at inference time.
model.save('gmail_spam_model.h5')
with open('tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
# Prediction script
def predict_email(text):
    """Predict whether an email is spam or ham.

    Returns a dict with a truncated text sample, the 'SPAM'/'HAM' label,
    the spam probability, a coarse confidence level, and (when any fire)
    a list of human-readable suspicious factors.
    """
    PREDICTION_THRESHOLD = 0.5
    # Preprocess text exactly as at training time
    cleaned_text = preprocess_text(text)
    # Prepare text features (max_len matches the training pad length)
    sequence = tokenizer.texts_to_sequences([cleaned_text])
    padded_sequence = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')
    # Prepare numerical features.
    # FIX: extract from the CLEANED text, matching training (training extracted
    # features after preprocess_text was applied to df['text']); using raw text
    # here caused train/serve skew — e.g. the URL-placeholder feature never fired.
    num_features = extract_features(cleaned_text)
    scaled_features = scaler.transform([num_features])
    # Make prediction
    prediction_prob = model.predict([padded_sequence, scaled_features])[0][0]
    is_spam = prediction_prob > PREDICTION_THRESHOLD
    # Generate human-readable result
    result = {
        'text_sample': text[:100] + "..." if len(text) > 100 else text,
        'prediction': 'SPAM' if is_spam else 'HAM',
        'probability': float(prediction_prob),
        'confidence': 'HIGH' if (prediction_prob > 0.8 or prediction_prob < 0.2) else 'MEDIUM'
    }
    # Explain key factors (indices follow the extract_features layout)
    suspicious_factors = []
    if num_features[1] > 3:
        suspicious_factors.append(f"Multiple (!) marks ({int(num_features[1])})")
    if num_features[3] > 0:
        suspicious_factors.append("Contains dollar signs")
    if num_features[5] > 0:
        suspicious_factors.append(f"Spam keywords ({int(num_features[5])})")
    if num_features[9] > 0:
        suspicious_factors.append(f"Contains URLs ({int(num_features[9])})")
    if num_features[7] > 0.3:
        suspicious_factors.append(f"Excessive uppercase ({num_features[7]:.1%})")
    if num_features[15] > 0:
        suspicious_factors.append(f"Contains emojis ({int(num_features[15])})")
    if num_features[16] > 0:
        suspicious_factors.append("Suspicious sender pattern")
    if num_features[19] > 0:
        suspicious_factors.append(f"Call-to-action phrases ({int(num_features[19])})")
    if suspicious_factors:
        result['suspicious_factors'] = suspicious_factors
    return result
# Example usage
# NOTE(review): the URLs in the first and fifth samples were garbled in the
# source ("[Link]"); placeholder example.com links substituted.
test_emails = [
    "WIN A FREE iPhone! Click now: http://example.com/win",
    "Hi John, just following up about our meeting tomorrow",
    "URGENT: Your account has been compromised. Verify now! $$$",
    "Please find attached the quarterly report",
    "Congratulations! You've won a $1000 gift card! Click: http://example.com/gift",
    "Dear Sir, you have inherited $5 million from a relative. Contact us immediately!"
]
print("=" * 60)
print("GMAIL SPAM CLASSIFIER PREDICTIONS")
print("=" * 60)
for email in test_emails:
    prediction = predict_email(email)
    print(f"\nEmail: {prediction['text_sample']}")
    # Output log shows percentages like "81.72%", i.e. the :.2% format spec.
    print(f"Prediction: {prediction['prediction']} (Probability: {prediction['probability']:.2%})")
    print(f"Confidence: {prediction['confidence']}")
    if 'suspicious_factors' in prediction:
        print("Suspicious factors detected:")
        for factor in prediction['suspicious_factors']:
            print(f"- {factor}")
    print("-" * 60)
Requirement already satisfied: scikit-learn in /usr/local/lib/python3.11/dist-pa
Requirement already satisfied: imbalanced-learn in /usr/local/lib/python3.11/dis
Requirement already satisfied: matplotlib in /usr/local/lib/python3.11/dist-pack
Requirement already satisfied: seaborn in /usr/local/lib/python3.11/dist-package
Requirement already satisfied: numpy>=1.19.5 in /usr/local/lib/python3.11/dist-p
Requirement already satisfied: scipy>=1.6.0 in /usr/local/lib/python3.11/dist-pa
Requirement already satisfied: joblib>=1.2.0 in /usr/local/lib/python3.11/dist-p
Requirement already satisfied: threadpoolctl>=3.1.0 in /usr/local/lib/python3.11
Requirement already satisfied: sklearn-compat<1,>=0.1 in /usr/local/lib/python3.
Requirement already satisfied: contourpy>=1.0.1 in /usr/local/lib/python3.11/dis
Requirement already satisfied: cycler>=0.10 in /usr/local/lib/python3.11/dist-pa
Requirement already satisfied: fonttools>=4.22.0 in /usr/local/lib/python3.11/di
Requirement already satisfied: kiwisolver>=1.3.1 in /usr/local/lib/python3.11/di
Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist
Requirement already satisfied: pillow>=8 in /usr/local/lib/python3.11/dist-packag
Requirement already satisfied: pyparsing>=2.3.1 in /usr/local/lib/python3.11/dis
Requirement already satisfied: python-dateutil>=2.7 in /usr/local/lib/python3.11
Requirement already satisfied: pandas>=1.2 in /usr/local/lib/python3.11/dist-pac
Requirement already satisfied: pytz>=2020.1 in /usr/local/lib/python3.11/dist-pa
Requirement already satisfied: tzdata>=2022.7 in /usr/local/lib/python3.11/dist-
Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.11/dist-packag
Epoch 1/10
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 0s 188ms/step - accuracy: 0.9242 - auc: 0.9702
Epoch 1: val_auc improved from -inf to 0.99826, saving model to best_gmail_spam_m
WARNING:absl:You are saving your model as an HDF5 file via `[Link]()` or `ke
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 412s 199ms/step - accuracy: 0.9242 - auc: 0.97
Epoch 2/10
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 0s 189ms/step - accuracy: 0.9895 - auc: 0.9987
Epoch 2: val_auc did not improve from 0.99826
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 463s 210ms/step - accuracy: 0.9895 - auc: 0.99
Epoch 3/10
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 0s 189ms/step - accuracy: 0.9938 - auc: 0.9993
Epoch 3: val_auc did not improve from 0.99826
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 423s 200ms/step - accuracy: 0.9938 - auc: 0.99
Epoch 4/10
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 0s 189ms/step - accuracy: 0.9966 - auc: 0.9996
Epoch 4: val_auc did not improve from 0.99826
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 441s 200ms/step - accuracy: 0.9966 - auc: 0.99
Epoch 5/10
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 0s 189ms/step - accuracy: 0.9984 - auc: 0.9998
Epoch 5: val_auc did not improve from 0.99826
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 459s 209ms/step - accuracy: 0.9984 - auc: 0.99
Epoch 6/10
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 0s 189ms/step - accuracy: 0.9995 - auc: 0.9999
Epoch 6: val_auc did not improve from 0.99826
2044/2044 ━━━━━━━━━━━━━━━━━━━━ 442s 209ms/step - accuracy: 0.9995 - auc: 0.99
1277/1277 ━━━━━━━━━━━━━━━━━━━━ 32s 25ms/step
precision recall f1-score support
Ham 0.98 0.99 0.99 20581
Spam 0.99 0.98 0.99 20283
accuracy 0.99 40864
macro avg 0.99 0.99 0.99 40864
weighted avg 0.99 0.99 0.99 40864
AUC-ROC: 0.9862
WARNING:absl:You are saving your model as an HDF5 file via `[Link]()` or `ke
============================================================
GMAIL SPAM CLASSIFIER PREDICTIONS
============================================================
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 323ms/step
Email: WIN A FREE iPhone! Click now: [Link]
Prediction: SPAM (Probability: 81.72%)
Confidence: HIGH
Suspicious factors detected:
- Multiple (!) marks (4)
- Spam keywords (4)
------------------------------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 30ms/step
Email: Hi John, just following up about our meeting tomorrow
Prediction: HAM (Probability: 0.79%)
Confidence: HIGH
Suspicious factors detected:
- Spam keywords (1)
------------------------------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step
Email: URGENT: Your account has been compromised. Verify now! $$$
Prediction: SPAM (Probability: 98.70%)
Confidence: HIGH
Suspicious factors detected:
- Contains dollar signs
------------------------------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 29ms/step
Email: Please find attached the quarterly report
Prediction: HAM (Probability: 0.54%)
Confidence: HIGH
------------------------------------------------------------
1/1 ━━━━━━━━━━━━━━━━━━━━ 0s 32ms/step
Email: Congratulations! You've won a $1000 gift card! Click: [Link]
Prediction: HAM (Probability: 43.60%)
Confidence: MEDIUM
Suspicious factors detected:
- Contains dollar signs
- Spam keywords (2)