!pip install tldextract
Collecting tldextract
Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages (
Requirement already satisfied: requests>=2.1.0 in /usr/local/lib/python3.11/dist
Collecting requests-file>=1.4 (from tldextract)
Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Requirement already satisfied: filelock>=3.0.8 in /usr/local/lib/python3.11/dist
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/d
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/d
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 107.4/107.4 kB 3.2 MB/s eta 0
Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.3.0
import numpy as np
import pandas as pd
from urllib.parse import urlparse, unquote
import re
import tldextract
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import os
import logging
from datetime import datetime
import warnings
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score, precision_recall_curve
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, Conv1D, MaxPooling1D, Bidirectional, LSTM, Attention, GlobalMaxPooling1D, Dense, Dropout, BatchNormalization, Concatenate
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau, ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.losses import BinaryCrossentropy
# Configure environment
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
tf.config.optimizer.set_jit(True)
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
np.random.seed(42)
tf.random.set_seed(42)
# Define patterns, TLDs, keywords, whitelist, and blacklist
patterns_ip = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
patterns_shortener = re.compile(r'(bit\.ly|goo\.gl|tinyurl|t\.co|ow\.ly|buff\.ly|adf\.ly|shorte\.st)')
patterns_executable = re.compile(r'\.(exe|zip|rar|dmg|apk|msi|bat|cmd|com|scr|jar|js|vbs|wsf)')
patterns_double_extension = re.compile(r'\.(jpg|png|gif|pdf|doc|docx|xls|xlsx|ppt|pptx|txt)\.(exe|scr|bat|cmd|js|vbs)')  # e.g. "invoice.pdf.exe"
patterns_hex = re.compile(r'%[0-9a-fA-F]{2}')
patterns_non_standard_port = re.compile(r':(80|443|8080|8443|21|22|23|25|110|143|3389|5900|3306)')
suspicious_tlds = {'tk', 'gq', 'ml', 'xyz', 'top', 'cf', 'ga', 'pw', 'cc', 'club', 'loan'}
# Expanded whitelist and blacklist
whitelist_domains = {
    'google.com', 'microsoft.com', 'github.com', 'python.org', 'amazon.com', 'facebook.com',
    'wikipedia.org', 'youtube.com', 'apple.com', 'oracle.com', 'ibm.com', 'netflix.com', 'adobe.com',
    'reddit.com', 'bbc.com', 'cnn.com', 'nytimes.com', 'gov.uk', 'edu.au', 'harvard.edu', 'mit.edu',
    'nasa.gov', 'cdc.gov', 'who.int', 'un.org', 'europa.eu', 'ox.ac.uk', 'cam.ac.uk', 'ucla.edu',
    'forbes.com', 'bloomberg.com', 'wsj.com', 'reuters.com', 'theguardian.com', 'whitehouse.gov'
}
blacklist_domains = {
'malicious-site.tk', 'fake-bank.gq', 'scam-site.top', 'evil-site.cf', 'phish-site.ml',
    '000webhostapp.com', 'beget.tech', 'duckdns.org', 'no-ip.org', 'zapto.org', 'hopto.org',
    'webredirect.org', 'mixh.jp', 'fbsacc.com', 'hejnnet.pl', 'dol-esa.gov', 'malwareathome.com',
'blob.core.windows.net', 'poostipay.masterset.redirect00.com', 'spam.com', 'phish.net',
'darkweb.to', 'malicious.site', 'phishingpage.xyz', 'trojan.download', 'ransomware.win'
}
keywords_security = ['login', 'signin', 'verify', 'account', 'update', 'secure', 'password']
keywords_download = ['download', 'install', 'update', 'plugin', 'flash', 'java', 'runtime']
keywords_hacking = ['hacked', 'defaced', 'by', 'admin', 'wp-content', 'shell', 'root', 'exploit']
keywords_scams = ['free', 'win', 'prize', 'offer', 'click', 'deal', 'limited', 'bonus', 'reward']
keywords_brands = ['paypal', 'ebay', 'amazon', 'apple', 'google', 'microsoft', 'facebook']
keywords_admin = ['wp-admin', 'administrator', 'cpanel', 'phpmyadmin', 'dbadmin', 'whm', 'plesk']
keywords_injection = ['cmd', 'exec', 'eval', 'union', 'select', 'from', 'where', 'script']
# Load and preprocess data
logging.info("Loading and preprocessing data...")
df = pd.read_csv('/content/drive/MyDrive/Dataset /benign_vs_malicious_213k1.csv')
# Handle missing values
df = df.dropna(subset=['url', 'type'])
df = df[df['url'].str.strip() != '']
# URL cleaning
df['url'] = df['url'].astype(str).apply(unquote).apply(unquote)
df['url'] = df['url'].str.encode('ascii', errors='ignore').str.decode('ascii')
df['url'] = df['url'].str.strip().str.replace(r'\s+', '', regex=True)
df['url'] = df['url'].str.replace(r'[^\x00-\x7F]+', '', regex=True)
df['url'] = np.where(
df['url'].str.contains(r'^https?://', case=False, regex=True),
df['url'],
'http://' + df['url']
)
df = df[df['url'].str.contains(r'\.|localhost', regex=True)]
df = df[~df['url'].str.contains(r'[\s<>"\'{}|\\^~\[\]]', regex=True, na=False)]
# Handle duplicates and labels
df['type'] = df.groupby('url')['type'].transform(lambda x: x.mode()[0] if len(x.mode()) == 1 else x.iloc[0])
df = df.drop_duplicates(subset=['url'])
df['label'] = (~df['type'].str.contains('benign', case=False)).astype(int)  # 1 = malicious, 0 = benign
# Balance classes by undersampling
min_count = df['label'].value_counts().min()
df = df.groupby('label').sample(n=min_count, random_state=42).sample(frac=1, random_state=42)
# Reset index to avoid issues
df = df.reset_index(drop=True)
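# Optional sanity check (not part of the original run): confirm the undersampling
# left the two classes balanced before feature extraction.
# logging.info("Class counts after balancing: %s", df['label'].value_counts().to_dict())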
# Feature extraction (expanded to 75 features)
feature_vectors = np.zeros((len(df), 75), dtype=np.float32)
for idx in range(len(df)):
url = df['url'].iloc[idx]
features = np.zeros(75, dtype=np.float32)
try:
if not isinstance(url, str) or not url.strip():
feature_vectors[idx] = features
continue
parsed = urlparse(url)
tld = tldextract.extract(url)
path = parsed.path.lower()
query = parsed.query.lower()
netloc = parsed.netloc.lower()
domain = tld.domain.lower()
url_lower = url.lower()
# Basic features
features[0] = len(url)
features[1] = len(netloc)
features[2] = len(tld.domain)
features[3] = 1 if tld.subdomain else 0
features[4] = len(tld.subdomain.split('.')) if tld.subdomain else 0
features[5] = len(parsed.path)
features[6] = parsed.path.count('/')
features[7] = 1 if '.php' in parsed.path else 0
features[8] = 1 if '.html' in parsed.path else 0
features[9] = len(parsed.query)
features[10] = parsed.query.count('&')
features[11] = 1 if '=' in parsed.query else 0
# Character counts
char_counts_at = url.count('@')
char_counts_dash = url.count('-')
char_counts_underscore = url.count('_')
char_counts_question = url.count('?')
char_counts_equal = url.count('=')
char_counts_dot = url.count('.')
char_counts_comma = url.count(',')
char_counts_double_slash = url.count('//')
features[12] = char_counts_at
features[13] = char_counts_dash
features[14] = char_counts_underscore
features[15] = char_counts_question
features[16] = char_counts_equal
features[17] = char_counts_dot
features[18] = char_counts_comma
features[19] = char_counts_double_slash
# Pattern matching
features[20] = 1 if patterns_ip.search(url) else 0
features[21] = 2 if patterns_shortener.search(netloc) else 0
features[22] = 1 if re.search(r'(https?://)?(www\.)?\w+\.\w+\.\w+', url) else 0
# Entropy calculations
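        # Shannon entropy over hostname characters: H = -sum(p_c * log2(p_c)), where
        # p_c is each character's relative frequency; random-looking hosts (e.g., DGA
        # domains) tend to score higher than dictionary-word hosts.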
if parsed.netloc:
freq = Counter(parsed.netloc)
entropy = 0
for f in freq.values():
p = f / len(parsed.netloc)
entropy -= p * np.log2(p + 1e-10)
features[23] = entropy
# Character distributions
total_chars = len(url)
if total_chars > 0:
alpha = sum(1 for c in url if c.isalpha())
digits = sum(1 for c in url if c.isdigit())
specials = sum(1 for c in url if not c.isalnum())
upper = sum(1 for c in url if c.isupper())
features[24] = digits / total_chars
features[25] = alpha / total_chars
features[26] = specials / total_chars
features[27] = upper / total_chars
freq_url = Counter(url)
p = np.array(list(freq_url.values())) / total_chars
features[28] = -np.sum(p * np.log2(p + 1e-10))
if netloc:
freq_netloc = Counter(netloc)
p_netloc = np.array(list(freq_netloc.values())) / len(netloc)
features[29] = -np.sum(p_netloc * np.log2(p_netloc + 1e-10))
# Keyword matching (weighted higher for malicious indicators)
features[30] = 1.5 * sum(1 for kw in keywords_security if kw in url_lower)
features[31] = sum(1 for kw in keywords_download if kw in url_lower)
features[32] = 1.5 * sum(1 for kw in keywords_hacking if kw in url_lower)
features[33] = 1.5 * sum(1 for kw in keywords_scams if kw in url_lower)
features[34] = sum(1 for kw in keywords_brands if kw in url_lower)
features[35] = 1.5 * sum(1 for kw in keywords_admin if kw in url_lower)
features[36] = 1.5 * sum(1 for kw in keywords_injection if kw in url_lower)
# Security features
features[37] = 2 if patterns_shortener.search(netloc) else 0
features[38] = 2 if patterns_executable.search(url_lower) else 0
features[39] = 2 if patterns_double_extension.search(url_lower) else 0
features[40] = 2 if tld.suffix in suspicious_tlds else 0
features[41] = int(len(netloc.split('.')) > 3)
features[42] = int(len(domain) > 15 and '-' in domain)
features[43] = -1 if parsed.scheme == 'https' else 0
features[44] = 1 if parsed.scheme == 'http' else 0
features[45] = 1 if bool(patterns_hex.search(url)) else 0
features[46] = 1 if len(parsed.fragment) > 20 else 0
features[47] = int(any(brand in path for brand in keywords_brands))
        features[48] = 1.5 * int(any(hint in path for hint in ['admin', 'login', 'signup', 'secure']))
# Additional features (existing)
features[49] = char_counts_dot
features[50] = char_counts_dash
features[51] = char_counts_underscore
features[52] = url.count('/')
features[53] = char_counts_question
features[54] = specials
features[55] = digits
features[56] = alpha
features[57] = features[10]
features[58] = 1 if 'php' in url_lower else 0
# Existing features (whitelist, blacklist, etc.)
features[59] = -2 if any(netloc.endswith(d) for d in whitelist_domains) else 0
features[60] = 3 if any(netloc.endswith(d) for d in blacklist_domains) else 0
features[61] = len(tld.suffix)
features[62] = 1 if tld.domain in keywords_brands else 0
features[63] = sum(1 for c in domain if c == '-')
features[64] = 1 if len(domain) > 20 else 0
# New features
features[65] = parsed.netloc.count('.') # Number of subdomains
        features[66] = 1 if patterns_non_standard_port.search(url) else 0  # Non-standard port
        features[67] = sum(url_lower.count(kw) for kw in keywords_security)  # Keyword repetition (security)
        features[68] = sum(url_lower.count(kw) for kw in keywords_hacking)  # Keyword repetition (hacking)
        features[69] = sum(url_lower.count(kw) for kw in keywords_scams)  # Keyword repetition (scams)
        features[70] = parsed.query.count('=')  # Number of key-value pairs in query
        features[71] = len(parsed.query.split('&')) if parsed.query else 0  # Number of query parameters
        features[72] = 1 if 'www' in netloc else 0  # Presence of 'www'
        features[73] = 1 if tld.subdomain.count('.') > 1 else 0  # Multiple subdomains
        features[74] = 1 if len(tld.domain) < 4 else 0  # Very short domain name (e.g., "abc")
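        # Note: the standalone prediction script later in this notebook repeats this
        # 75-feature extraction; any change here must be mirrored there so the feature
        # layout stays aligned with the fitted StandardScaler.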
feature_vectors[idx] = features
except Exception as e:
logging.warning(f"Feature extraction error: {str(e)[:100]}")
# Prepare features
X_num = feature_vectors
y = df['label'].values
# Scale numerical features
scaler = StandardScaler()
X_num_scaled = scaler.fit_transform(X_num)
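# Note: the scaler is fit on the full dataset before the train/test split, which leaks
# test-set statistics into training. A stricter (hypothetical) variant would fit the
# scaler on X_num_train only and use it to transform the validation and test splits.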
# Preprocess text features
max_words = 20000
max_len = 200
tokenizer = Tokenizer(
num_words=max_words,
char_level=True,
filters='',
lower=True,
oov_token='<OOV>'
)
tokenizer.fit_on_texts(df['url'])
sequences = tokenizer.texts_to_sequences(df['url'])
X_text = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')
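# Note: with char_level=True the tokenizer's vocabulary is just the distinct characters
# seen (typically well under 100), so num_words=20000 and the matching Embedding
# input_dim mostly reserve unused rows. A tighter, hypothetical sizing would be:
# vocab_size = len(tokenizer.word_index) + 1  # +1 for the padding index 0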
# Split data (include URLs for post-processing)
X_num_train, X_num_test, X_text_train, X_text_test, y_train, y_test, urls_train, urls_test = train_test_split(
X_num_scaled, X_text, y, df['url'].values,
test_size=0.2, random_state=42, stratify=y
)
X_num_train, X_num_val, X_text_train, X_text_val, y_train, y_val, urls_train, urls_val = train_test_split(
X_num_train, X_text_train, y_train, urls_train,
test_size=0.1, random_state=42, stratify=y_train
)
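# Resulting split proportions: 0.8 * 0.9 = 72% train, 0.8 * 0.1 = 8% validation, 20% test.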
# Build improved model architecture
input_text = Input(shape=(max_len,), name='text_input')
embedding = Embedding(input_dim=max_words, output_dim=64)(input_text)
conv1 = Conv1D(filters=32, kernel_size=3, padding='same', activation='relu')(embedding)
conv1 = BatchNormalization()(conv1)
conv1 = MaxPooling1D(pool_size=2)(conv1)
conv2 = Conv1D(filters=64, kernel_size=5, padding='same', activation='relu')(conv1)
conv2 = BatchNormalization()(conv2)
conv2 = MaxPooling1D(pool_size=2)(conv2)
conv3 = Conv1D(filters=128, kernel_size=3, padding='same', activation='relu')(conv2)  # Added third convolutional block
conv3 = BatchNormalization()(conv3)
conv3 = MaxPooling1D(pool_size=2)(conv3)
lstm = Bidirectional(LSTM(64, return_sequences=True))(conv3) # Increased LSTM units
attention = Attention()([lstm, lstm]) # Attention mechanism
pool_text = GlobalMaxPooling1D()(attention)
dropout_text = Dropout(0.4)(pool_text) # Increased dropout
input_num = Input(shape=(X_num_scaled.shape[1],), name='num_input')
dense_num = Dense(128, activation='relu')(input_num) # Increased units
dense_num = BatchNormalization()(dense_num)
dense_num = Dropout(0.4)(dense_num)
dense_num2 = Dense(64, activation='relu')(dense_num)
dense_num2 = BatchNormalization()(dense_num2)
dropout_num = Dropout(0.4)(dense_num2)
concat = Concatenate()([dropout_text, dropout_num])
dense = Dense(128, activation='relu')(concat)
dense = BatchNormalization()(dense)
dense = Dropout(0.4)(dense)
output = Dense(1, activation='sigmoid')(dense)
model = Model(inputs=[input_text, input_num], outputs=output)
# Compile model
optimizer = Adam(learning_rate=0.001)
model.compile(
optimizer=optimizer,
loss=BinaryCrossentropy(),
metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
)
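# Note: the Precision/Recall metrics above use Keras' default 0.5 decision threshold,
# whereas the evaluation further below applies a 0.3 cut-off, so those numbers are not
# directly comparable.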
# Define callbacks
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.1, patience=3, min_lr=1e-6)  # More aggressive LR reduction
checkpoint = ModelCheckpoint(filepath='best_urlmodel_improved.h5', monitor='val_loss', save_best_only=True)
callbacks = [early_stopping, reduce_lr, checkpoint]
# Train model
logging.info("Training model...")
history = model.fit(
[X_text_train, X_num_train], y_train,
validation_data=([X_text_val, X_num_val], y_val),
epochs=20, # Increased epochs
batch_size=128,
callbacks=callbacks,
verbose=1
)
Epoch 1/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 254ms/step - accuracy: 0.8843 - loss: 0.288
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 611s 261ms/step - accuracy: 0.8843 - loss: 0.2
Epoch 2/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 654s 275ms/step - accuracy: 0.9434 - loss: 0.1
Epoch 3/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 262ms/step - accuracy: 0.9499 - loss: 0.130
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 668s 269ms/step - accuracy: 0.9499 - loss: 0.1
Epoch 4/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 263ms/step - accuracy: 0.9549 - loss: 0.119
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 630s 272ms/step - accuracy: 0.9549 - loss: 0.1
Epoch 5/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 679s 271ms/step - accuracy: 0.9575 - loss: 0.1
Epoch 6/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 260ms/step - accuracy: 0.9602 - loss: 0.103
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 620s 269ms/step - accuracy: 0.9602 - loss: 0.1
Epoch 7/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 263ms/step - accuracy: 0.9632 - loss: 0.096
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 629s 272ms/step - accuracy: 0.9632 - loss: 0.0
Epoch 8/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 704s 282ms/step - accuracy: 0.9640 - loss: 0.0
Epoch 9/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 263ms/step - accuracy: 0.9661 - loss: 0.088
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 659s 272ms/step - accuracy: 0.9661 - loss: 0.0
Epoch 10/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 644s 256ms/step - accuracy: 0.9684 - loss: 0.0
Epoch 11/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 0s 259ms/step - accuracy: 0.9703 - loss: 0.078
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 643s 265ms/step - accuracy: 0.9703 - loss: 0.0
Epoch 12/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 568s 241ms/step - accuracy: 0.9720 - loss: 0.0
Epoch 13/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 588s 252ms/step - accuracy: 0.9730 - loss: 0.0
Epoch 14/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 607s 246ms/step - accuracy: 0.9735 - loss: 0.0
Epoch 15/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 531s 231ms/step - accuracy: 0.9785 - loss: 0.0
Epoch 16/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 630s 260ms/step - accuracy: 0.9831 - loss: 0.0
Epoch 17/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 613s 266ms/step - accuracy: 0.9860 - loss: 0.0
Epoch 18/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 618s 269ms/step - accuracy: 0.9878 - loss: 0.0
Epoch 19/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 647s 280ms/step - accuracy: 0.9888 - loss: 0.0
Epoch 20/20
2300/2300 ━━━━━━━━━━━━━━━━━━━━ 683s 280ms/step - accuracy: 0.9888 - loss: 0.0
# Evaluate model
logging.info("Evaluating model...")
y_pred_proba = model.predict([X_text_test, X_num_test], batch_size=128)
y_pred = (y_pred_proba > 0.3).astype(int)
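# The 0.3 cut-off is a fixed choice. A hedged alternative (sketch only, assuming the
# precision_recall_curve import and the validation split defined above) is to pick the
# threshold that maximises F1 on the validation set:
# val_proba = model.predict([X_text_val, X_num_val], batch_size=128).ravel()
# prec, rec, thr = precision_recall_curve(y_val, val_proba)
# f1 = 2 * prec[:-1] * rec[:-1] / np.clip(prec[:-1] + rec[:-1], 1e-12, None)
# threshold = thr[np.argmax(f1)]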
# Post-processing with whitelist/blacklist
final_predictions = y_pred.copy()
final_probabilities = y_pred_proba.copy()
for i in range(len(urls_test)):
url = urls_test[i]
parsed = urlparse(url)
netloc = parsed.netloc.lower()
if any(netloc.endswith(d) for d in whitelist_domains):
final_predictions[i] = 0
final_probabilities[i] = 0.0
elif any(netloc.endswith(d) for d in blacklist_domains):
final_predictions[i] = 1
final_probabilities[i] = 1.0
y_pred = final_predictions
y_pred_proba = final_probabilities
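# Note: `netloc.endswith(d)` also matches lookalike hosts such as "evilgoogle.com".
# A stricter check (hypothetical sketch, not used in the run above) would be:
# def domain_matches(netloc, d):
#     return netloc == d or netloc.endswith('.' + d)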
# Evaluation metrics
print(classification_report(y_test, y_pred, target_names=['Benign', 'Malicious']))
roc_auc = roc_auc_score(y_test, y_pred_proba)
print(f"ROC-AUC Score: {roc_auc:.4f}")
# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('True')
plt.savefig('confusion_matrix_improved.png')
639/639 ━━━━━━━━━━━━━━━━━━━━ 46s 71ms/step
              precision    recall  f1-score   support

      Benign       0.87      0.93      0.90     40881
   Malicious       0.92      0.87      0.89     40881

    accuracy                           0.90     81762
   macro avg       0.90      0.90      0.90     81762
weighted avg       0.90      0.90      0.90     81762
ROC-AUC Score: 0.8617
model.save('final_urlmodel_improved.h5')
with open('scaler_improved.pkl', 'wb') as f:
pickle.dump(scaler, f)
with open('tokenizer_improved.pkl', 'wb') as f:
pickle.dump(tokenizer, f)
WARNING:absl:You are saving your model as an HDF5 file via `model.save()` or `ke
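# The HDF5 warning above is informational; recent Keras releases also accept the native
# format, which avoids it (sketch, not part of the original run):
# model.save('final_urlmodel_improved.keras')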
import numpy as np
import pandas as pd
from urllib.parse import urlparse, unquote
import re
import tldextract
from collections import Counter
import pickle
import logging
import warnings
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Configure environment
warnings.filterwarnings('ignore')
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
np.random.seed(42)
# Define patterns, TLDs, keywords, whitelist, and blacklist
patterns_ip = re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
patterns_shortener = re.compile(r'(bit\.ly|goo\.gl|tinyurl|t\.co|ow\.ly|buff\.ly|adf\.ly|shorte\.st)')
patterns_executable = re.compile(r'\.(exe|zip|rar|dmg|apk|msi|bat|cmd|com|scr|jar|js|vbs|wsf)')
patterns_double_extension = re.compile(r'\.(jpg|png|gif|pdf|doc|docx|xls|xlsx|ppt|pptx|txt)\.(exe|scr|bat|cmd|js|vbs)')  # e.g. "invoice.pdf.exe"
patterns_hex = re.compile(r'%[0-9a-fA-F]{2}')
patterns_non_standard_port = re.compile(r':(80|443|8080|8443|21|22|23|25|110|143|3389|5900|3306)')
suspicious_tlds = {'tk', 'gq', 'ml', 'xyz', 'top', 'cf', 'ga', 'pw', 'cc', 'club', 'loan'}
whitelist_domains = {
    'google.com', 'microsoft.com', 'github.com', 'python.org', 'amazon.com', 'facebook.com',
    'wikipedia.org', 'youtube.com', 'apple.com', 'oracle.com', 'ibm.com', 'netflix.com', 'adobe.com',
    'reddit.com', 'bbc.com', 'cnn.com', 'nytimes.com', 'gov.uk', 'edu.au', 'harvard.edu', 'mit.edu',
    'nasa.gov', 'cdc.gov', 'who.int', 'un.org', 'europa.eu', 'ox.ac.uk', 'cam.ac.uk', 'ucla.edu',
    'forbes.com', 'bloomberg.com', 'wsj.com', 'reuters.com', 'theguardian.com', 'whitehouse.gov'
}
blacklist_domains = {
'malicious-site.tk', 'fake-bank.gq', 'scam-site.top', 'evil-site.cf', 'phish-site.ml',
    '000webhostapp.com', 'beget.tech', 'duckdns.org', 'no-ip.org', 'zapto.org', 'hopto.org',
    'webredirect.org', 'mixh.jp', 'fbsacc.com', 'hejnnet.pl', 'dol-esa.gov', 'malwareathome.com',
'blob.core.windows.net', 'poostipay.masterset.redirect00.com', 'spam.com', 'phish.net',
'darkweb.to', 'malicious.site', 'phishingpage.xyz', 'trojan.download', 'ransomware.win'
}
keywords_security = ['login', 'signin', 'verify', 'account', 'update', 'secure', 'password']
keywords_download = ['download', 'install', 'update', 'plugin', 'flash', 'java', 'runtime']
keywords_hacking = ['hacked', 'defaced', 'by', 'admin', 'wp-content', 'shell', 'root', 'exploit']
keywords_scams = ['free', 'win', 'prize', 'offer', 'click', 'deal', 'limited', 'bonus', 'reward']
keywords_brands = ['paypal', 'ebay', 'amazon', 'apple', 'google', 'microsoft', 'facebook']
keywords_admin = ['wp-admin', 'administrator', 'cpanel', 'phpmyadmin', 'dbadmin', 'whm', 'plesk']
keywords_injection = ['cmd', 'exec', 'eval', 'union', 'select', 'from', 'where', 'script']
# Load model, scaler, and tokenizer
logging.info("Loading model and artifacts...")
model = load_model('final_urlmodel_improved.h5')
with open('scaler_improved.pkl', 'rb') as f:
scaler = pickle.load(f)
with open('tokenizer_improved.pkl', 'rb') as f:
tokenizer = pickle.load(f)
# Prediction logic for a list of URLs
test_urls = [
"http://update-java-required.com/download/latest.exe",
"http://paypal-security-alert.com/login",
"http://confirm-your-account.ru/secure/login",
"http://secure-verification-bank.tk/account-update",
"http://192.168.0.100:8080/malware.exe",
"http://bit.ly/free-robux-generator",
"http://login.facebook.com.account-verification.io",
"http://steam-login.ru/giftcard",
"http://microsoft-security-alert.net/update",
"http://apple-id-verification.com/login",
"http://get-free-giftcards-now.biz/claim",
"http://dropbox-fileshare.xyz/download.php?file=trojan",
"http://hacked-site.org/malicious.js",
"http://clickmeforprize.online/win.exe",
"http://phishing-login-page.top/login",
"http://malware-distribution-site.net/download.exe",
"http://yourbank-verification.tk/security",
"http://amazon-login-check.gq/account",
"http://secure-installer-updates.com/patch.exe",
"http://win-an-iphone-now.co/fakeform"
]
max_len = 200
print("\nPredictions for test URLs:")
print("+------------------------------------------------+--------------+--------------+")
print("| URL | Prediction | Confidence |")
print("+================================================+==============+==============+")
for url in test_urls:
# Clean URL
url_cleaned = unquote(unquote(url))
url_cleaned = url_cleaned.encode('ascii', errors='ignore').decode('ascii')
    # Strip whitespace and non-ASCII characters with regex, mirroring the training-time cleaning
    url_cleaned = re.sub(r'\s+', '', url_cleaned.strip())
    url_cleaned = re.sub(r'[^\x00-\x7F]+', '', url_cleaned)
if not re.match(r'^https?://', url_cleaned, re.IGNORECASE):
url_cleaned = 'http://' + url_cleaned
# Extract features (75 features as in the improved model)
features = np.zeros(75, dtype=np.float32)
parsed = urlparse(url_cleaned)
tld = tldextract.extract(url_cleaned)
path = parsed.path.lower()
query = parsed.query.lower()
netloc = parsed.netloc.lower()
domain = tld.domain.lower()
url_lower = url_cleaned.lower()
features[0] = len(url_cleaned)
features[1] = len(netloc)
features[2] = len(tld.domain)
features[3] = 1 if tld.subdomain else 0
features[4] = len(tld.subdomain.split('.')) if tld.subdomain else 0
features[5] = len(parsed.path)
features[6] = parsed.path.count('/')
features[7] = 1 if '.php' in parsed.path else 0
features[8] = 1 if '.html' in parsed.path else 0
features[9] = len(parsed.query)
features[10] = parsed.query.count('&')
features[11] = 1 if '=' in parsed.query else 0
char_counts_at = url_cleaned.count('@')
char_counts_dash = url_cleaned.count('-')
char_counts_underscore = url_cleaned.count('_')
char_counts_question = url_cleaned.count('?')
char_counts_equal = url_cleaned.count('=')
char_counts_dot = url_cleaned.count('.')
char_counts_comma = url_cleaned.count(',')
char_counts_double_slash = url_cleaned.count('//')
features[12] = char_counts_at
features[13] = char_counts_dash
features[14] = char_counts_underscore
features[15] = char_counts_question
features[16] = char_counts_equal
features[17] = char_counts_dot
features[18] = char_counts_comma
features[19] = char_counts_double_slash
features[20] = 1 if patterns_ip.search(url_cleaned) else 0
features[21] = 2 if patterns_shortener.search(netloc) else 0
features[22] = 1 if re.search(r'(https?://)?(www\.)?\w+\.\w+\.\w+', url_cleaned) else 0
if parsed.netloc:
freq = Counter(parsed.netloc)
entropy = 0
for f in freq.values():
p = f / len(parsed.netloc)
entropy -= p * np.log2(p + 1e-10)
features[23] = entropy
total_chars = len(url_cleaned)
if total_chars > 0:
alpha = sum(1 for c in url_cleaned if c.isalpha())
digits = sum(1 for c in url_cleaned if c.isdigit())
specials = sum(1 for c in url_cleaned if not c.isalnum())
upper = sum(1 for c in url_cleaned if c.isupper())
features[24] = digits / total_chars
features[25] = alpha / total_chars
features[26] = specials / total_chars
features[27] = upper / total_chars
freq_url = Counter(url_cleaned)
p = np.array(list(freq_url.values())) / total_chars
features[28] = -np.sum(p * np.log2(p + 1e-10))
if netloc:
freq_netloc = Counter(netloc)
p_netloc = np.array(list(freq_netloc.values())) / len(netloc)
features[29] = -np.sum(p_netloc * np.log2(p_netloc + 1e-10))
features[30] = 1.5 * sum(1 for kw in keywords_security if kw in url_lower)
features[31] = sum(1 for kw in keywords_download if kw in url_lower)
features[32] = 1.5 * sum(1 for kw in keywords_hacking if kw in url_lower)
features[33] = 1.5 * sum(1 for kw in keywords_scams if kw in url_lower)
features[34] = sum(1 for kw in keywords_brands if kw in url_lower)
features[35] = 1.5 * sum(1 for kw in keywords_admin if kw in url_lower)
features[36] = 1.5 * sum(1 for kw in keywords_injection if kw in url_lower)
features[37] = 2 if patterns_shortener.search(netloc) else 0
features[38] = 2 if patterns_executable.search(url_lower) else 0
features[39] = 2 if patterns_double_extension.search(url_lower) else 0
features[40] = 2 if tld.suffix in suspicious_tlds else 0
features[41] = int(len(netloc.split('.')) > 3)
features[42] = int(len(domain) > 15 and '-' in domain)
features[43] = -1 if parsed.scheme == 'https' else 0
features[44] = 1 if parsed.scheme == 'http' else 0
features[45] = 1 if bool(patterns_hex.search(url_cleaned)) else 0
features[46] = 1 if len(parsed.fragment) > 20 else 0
features[47] = int(any(brand in path for brand in keywords_brands))
    features[48] = 1.5 * int(any(hint in path for hint in ['admin', 'login', 'signup', 'secure']))
features[49] = char_counts_dot
features[50] = char_counts_dash
features[51] = char_counts_underscore
features[52] = url_cleaned.count('/')
features[53] = char_counts_question
features[54] = specials
features[55] = digits
features[56] = alpha
features[57] = features[10]
features[58] = 1 if 'php' in url_lower else 0
features[59] = -2 if any(netloc.endswith(d) for d in whitelist_domains) else 0
features[60] = 3 if any(netloc.endswith(d) for d in blacklist_domains) else 0
features[61] = len(tld.suffix)
features[62] = 1 if tld.domain in keywords_brands else 0
features[63] = sum(1 for c in domain if c == '-')
features[64] = 1 if len(domain) > 20 else 0
features[65] = parsed.netloc.count('.')
features[66] = 1 if patterns_non_standard_port.search(url_cleaned) else 0
features[67] = sum(url_lower.count(kw) for kw in keywords_security)
features[68] = sum(url_lower.count(kw) for kw in keywords_hacking)
features[69] = sum(url_lower.count(kw) for kw in keywords_scams)
features[70] = parsed.query.count('=')
features[71] = len(parsed.query.split('&')) if parsed.query else 0
features[72] = 1 if 'www' in netloc else 0
features[73] = 1 if tld.subdomain.count('.') > 1 else 0
features[74] = 1 if len(tld.domain) < 4 else 0
num_features = scaler.transform([features])
# Preprocess text
sequence = tokenizer.texts_to_sequences([url_cleaned])
    text_features = pad_sequences(sequence, maxlen=max_len, padding='post', truncating='post')
# Predict
prob = model.predict([text_features, num_features], verbose=0)[0][0]
pred = 1 if prob > 0.3 else 0
# Post-process with whitelist/blacklist
if any(netloc.endswith(d) for d in whitelist_domains):
pred, prob = 0, 0.0
elif any(netloc.endswith(d) for d in blacklist_domains):
pred, prob = 1, 1.0
label = 'Malicious' if pred == 1 else 'Benign'
confidence = prob * 100 if pred == 1 else (1 - prob) * 100
print(f"| {url:<46} | {label:<12} | {confidence:>5.2f}% |")
print("+------------------------------------------------+--------------+--------------+")
WARNING:absl:Compiled the loaded model, but the compiled metrics have yet to be
Predictions for test URLs:
+------------------------------------------------+--------------+--------------+
| URL | Prediction | Confidence |
+================================================+==============+==============+
| http://update-java-required.com/download/latest.exe | Benign | 99.77% |
| http://paypal-security-alert.com/login | Malicious | 33.77% |
| http://confirm-your-account.ru/secure/login | Benign | 96.18% |
| http://secure-verification-bank.tk/account-update | Benign | 99.95% |
| http://192.168.0.100:8080/malware.exe | Benign | 99.83% |
| http://bit.ly/free-robux-generator | Malicious | 53.98% |
| http://login.facebook.com.account-verification.io | Benign | 99.97% |
| http://steam-login.ru/giftcard | Benign | 90.85% |
| http://microsoft-security-alert.net/update | Benign | 99.16% |
| http://apple-id-verification.com/login | Malicious | 30.64% |
| http://get-free-giftcards-now.biz/claim | Malicious | 75.22% |
| http://dropbox-fileshare.xyz/download.php?file=trojan | Benign | 98.68% |
| http://hacked-site.org/malicious.js | Malicious | 89.82% |
| http://clickmeforprize.online/win.exe | Benign | 99.99% |
| http://phishing-login-page.top/login | Benign | 90.69% |
| http://malware-distribution-site.net/download.exe | Benign | 98.44% |
| http://yourbank-verification.tk/security | Benign | 92.72% |
| http://amazon-login-check.gq/account | Benign | 99.70% |
| http://secure-installer-updates.com/patch.exe | Benign | 99.80% |
| http://win-an-iphone-now.co/fakeform | Malicious | 93.67% |
+------------------------------------------------+--------------+--------------+
import shutil
import os
from google.colab import drive
# Step 2: Define paths
# Replace this with the actual path to your model and pkl files
model_file = '/content/best_urlmodel_improved.h5'
pkl_file = '/content/tokenizer_improved.pkl'
# Replace this with your desired directory inside Google Drive
drive_dir = '/content/drive/MyDrive/Url Model Folder '
# Step 3: Create directory in Drive if it doesn't exist
os.makedirs(drive_dir, exist_ok=True)
# Step 4: Copy files using shutil
shutil.copy(model_file, drive_dir)
shutil.copy(pkl_file, drive_dir)
print(f"Files saved to: {drive_dir}")
Files saved to: /content/drive/MyDrive/Url Model Folder
import os
import pickle
# Create a directory to save the datasets
os.makedirs('saved_data', exist_ok=True)
# Save training set
with open('saved_data/X_text_train.pkl', 'wb') as f:
pickle.dump(X_text_train, f)
with open('saved_data/X_num_train.pkl', 'wb') as f:
pickle.dump(X_num_train, f)
with open('saved_data/y_train.pkl', 'wb') as f:
pickle.dump(y_train, f)
# Save validation set
with open('saved_data/X_text_val.pkl', 'wb') as f:
pickle.dump(X_text_val, f)
with open('saved_data/X_num_val.pkl', 'wb') as f:
pickle.dump(X_num_val, f)
with open('saved_data/y_val.pkl', 'wb') as f:
pickle.dump(y_val, f)
# Save test set
with open('saved_data/X_text_test.pkl', 'wb') as f:
pickle.dump(X_text_test, f)
with open('saved_data/X_num_test.pkl', 'wb') as f:
pickle.dump(X_num_test, f)
with open('saved_data/y_test.pkl', 'wb') as f:
pickle.dump(y_test, f)
print("Train, validation, and test datasets saved successfully in 'saved_data/' directory."
Train, validation, and test datasets saved successfully in 'saved_data/' directo
import shutil
source = '/content/saved_data'
destination = '/content/drive/MyDrive/Url Model Folder'
# This will overwrite contents if destination exists (Python 3.8+)
shutil.copytree(source, destination, dirs_exist_ok=True)
print(f"Copied '{source}' to '{destination}'")
Copied '/content/saved_data' to '/content/drive/MyDrive/Url Model Folder'