File Model.ipynb - Colab

The document outlines a Python notebook that installs the required libraries, loads a PE-file dataset, and performs data preprocessing, including duplicate removal and missing-value imputation. It conducts exploratory data analysis (EDA) with visualizations of class distribution and file-extension breakdowns, and extracts enhanced features from files, including byte sequences, entropy measures, and string patterns. It defines regex patterns and keyword lists for malicious content, then trains and evaluates a two-branch CNN+LSTM model on the combined byte-sequence and structural features.


!pip install tldextract

Collecting tldextract
  Downloading tldextract-5.3.0-py3-none-any.whl.metadata (11 kB)
Requirement already satisfied: idna in /usr/local/lib/python3.11/dist-packages
Requirement already satisfied: requests>=2.1.0 in /usr/local/lib/python3.11/dist-packages
Collecting requests-file>=1.4 (from tldextract)
  Downloading requests_file-2.1.0-py2.py3-none-any.whl.metadata (1.7 kB)
Requirement already satisfied: filelock>=3.0.8 in /usr/local/lib/python3.11/dist-packages
Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages
Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages
Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages
Downloading tldextract-5.3.0-py3-none-any.whl (107 kB)
Downloading requests_file-2.1.0-py2.py3-none-any.whl (4.2 kB)
Installing collected packages: requests-file, tldextract
Successfully installed requests-file-2.1.0 tldextract-5.3.0

!pip install python-magic


!pip install lief
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, roc_auc_score
from imblearn.over_sampling import SMOTE
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Conv1D, LSTM, Dense, Dropout, GlobalMaxPooling1D, BatchNormalization, MaxPooling1D, concatenate
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
import pickle
import magic
import hashlib
import lief
import re
import os
import zlib
from datetime import datetime
import stat

# Cell 2: Load Data and Handle Duplicates


# Load Data
df = pd.read_csv('/content/drive/MyDrive/Original file.csv')

# Handle duplicates
df = df.drop_duplicates(subset=[col for col in df.columns if col not in ['Name', 'md5']])
hashes = []
duplicate_indices = []
for index, row in df.iterrows():
    filepath = row['Name']
    if os.path.exists(filepath):
        with open(filepath, 'rb') as f:
            sha256 = hashlib.sha256(f.read()).hexdigest()
        if sha256 in hashes:
            duplicate_indices.append(index)
        else:
            hashes.append(sha256)
df = df.drop(duplicate_indices).reset_index(drop=True)
print(f"After removing duplicates: {df.shape}")

# Load dataset
df_eda = pd.read_csv('/content/drive/MyDrive/Original file.csv')
df_eda = df_eda[df_eda['Name'].notna()].copy()

# Dataset Info
print("Dataset Info:")
print(df_eda.info())
print("\nMissing Values:")
print(df_eda.isnull().sum())
print("\nClass Distribution:")
print(df_eda['legitimate'].value_counts())

# Class Distribution Visualization


plt.figure(figsize=(8, 4))
sns.countplot(x='legitimate', data=df_eda, palette='Set2')
plt.title("Class Distribution (0: Malicious, 1: Benign)")
plt.xlabel("Class")
plt.ylabel("Count")
plt.xticks([0, 1], ['Malicious', 'Benign'])
plt.show()

# File Extension Analysis


df_eda['file_extension'] = df_eda['Name'].apply(lambda x: os.path.splitext(x)[1].lower() if os.path.splitext(x)[1] else '.unknown')
top_extensions = df_eda.groupby('file_extension')['legitimate'].value_counts().unstack().fillna(0)
top_extensions['total'] = top_extensions.sum(axis=1)
top_extensions = top_extensions.sort_values('total', ascending=False).head(10)
print("\nTop File Extensions by Class:")
print(top_extensions)

top_extensions[[0, 1]].plot(kind='bar', stacked=True, figsize=(10, 6), colormap='Set2')


plt.title("Top File Extensions by Class")
plt.xlabel("File Extension")
plt.ylabel("Count")
plt.legend(['Malicious', 'Benign'])
plt.show()

# Categorical Features Distribution


categorical_cols = ['Machine', 'Subsystem']
for col in categorical_cols:
    top_values = df_eda[col].value_counts().index[:5]
    temp_df = df_eda[df_eda[col].isin(top_values)]
    plt.figure(figsize=(10, 6))
    sns.countplot(x=col, hue='legitimate', data=temp_df, palette='Set2')
    plt.title(f"Distribution of {col} by Class (Top 5)")
    plt.xlabel(col)
    plt.ylabel("Count")
    plt.legend(['Malicious', 'Benign'])
    plt.xticks(rotation=45)
    plt.show()

# Correlation Matrix
numerical_cols = df_eda.select_dtypes(include=['int64', 'float64']).columns
corr_matrix = df_eda[numerical_cols].corr()
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm', fmt='.2f')
plt.title("Correlation Matrix of Numerical Features")
plt.show()

# String Patterns (URLs and IPs)


patterns = {
    'url': re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
    'ip': re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}')
}
df_eda['url_count'] = 0
df_eda['ip_count'] = 0
for idx, row in df_eda.iterrows():
    filepath = row['Name']
    if os.path.exists(filepath):
        try:
            with open(filepath, 'rb') as f:
                content = f.read(1024).decode('ascii', errors='ignore')
            df_eda.at[idx, 'url_count'] = len(patterns['url'].findall(content))
            df_eda.at[idx, 'ip_count'] = len(patterns['ip'].findall(content))
        except Exception:
            pass
for feature in ['url_count', 'ip_count']:
print(f"\n{feature} Distribution by Class:")
print(df_eda.groupby('legitimate')[feature].mean())
plt.figure(figsize=(8, 4))
sns.boxplot(x='legitimate', y=feature, data=df_eda, palette='Set2')
plt.title(f"Distribution of {feature} by Class")
plt.xlabel("Class")
plt.ylabel(feature)
plt.xticks([0, 1], ['Malicious', 'Benign'])
plt.show()

# Cell 3: Handle Missing Values


# Handle missing values
numerical_cols = df.select_dtypes(include=['int64', 'float64']).columns
for col in numerical_cols:
    df[col] = df[col].fillna(df[col].median())
categorical_cols = ['Machine', 'SizeOfOptionalHeader', 'SectionAlignment', 'FileAlignment',
                    'MajorOperatingSystemVersion', 'MinorOperatingSystemVersion',
                    'MajorSubsystemVersion', 'MinorSubsystemVersion', 'Subsystem', 'LoaderFlags']
for col in categorical_cols:
    df[col] = df[col].fillna(df[col].mode()[0])
print("Missing values imputed.")

# Cell 4: Define Enhanced Patterns, Extensions, and Keywords


# Define enhanced regex patterns for string patterns
patterns = {
    'url': re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'),
    'ip': re.compile(r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}'),
    'registry': re.compile(r'HKEY_', re.IGNORECASE),
    'cmd': re.compile(r'cmd\.exe|powershell|net\s+user|reg\s+add|taskkill|schtasks|wmic|mshta', re.IGNORECASE),
    'script': re.compile(r'javascript|vbscript|eval\(|base64|powershell|python|perl|ruby', re.IGNORECASE),
    'crypto': re.compile(r'bitcoin|wallet|crypto|monero|ethereum|blockchain|publickey|privatekey', re.IGNORECASE),
    'obfuscation': re.compile(r'xor|packer|obfuscate|encode|decode|encrypt|decrypt|shellcode', re.IGNORECASE)
}
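
Before these patterns are applied inside the extraction loop in Cell 5, a small helper can sanity-check them against a decoded buffer. A minimal sketch follows; the helper `count_pattern_hits` and the sample buffer are illustrative and not part of the original notebook.

# Sketch: count regex hits per pattern in a decoded file prefix (helper name is illustrative).
def count_pattern_hits(raw_bytes, compiled_patterns):
    text = raw_bytes.decode('ascii', errors='ignore')
    return {name: len(rx.findall(text)) for name, rx in compiled_patterns.items()}

# Example on a small synthetic buffer:
sample = b'powershell -enc aGVsbG8= http://198.51.100.7/payload HKEY_LOCAL_MACHINE'
print(count_pattern_hits(sample, patterns))
# Expect non-zero counts for keys such as 'url', 'ip', 'registry', 'cmd', 'script';
# exact values depend on the compiled patterns.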

# Define suspicious file extensions


suspicious_extensions = {
'.vbs', '.scr', '.js', '.ps1', '.bat', '.cmd', '.com', '.pif', '.wsf', '.jse',
'.exe', '.dll', '.sys', '.msi', '.hta', '.msc', '.lnk', '.cpl', '.ocx', '.jar'
}

# Define keywords dictionary for malicious content


keywords = {
    'security': ['login', 'password', 'credential', 'auth', 'verify', 'secure', 'certificate'],
    'hacking': ['exploit', 'backdoor', 'trojan', 'worm', 'virus', 'ransomware', 'spyware'],
    'scams': ['free', 'win', 'prize', 'lottery', 'gift', 'bonus', 'reward', 'promo', 'million'],
    'injection': ['cmd', 'exec', 'eval', 'script', 'iframe', 'shell', 'sql', 'xss', 'csrf']
}

# Cell 5: Extract Enhanced Features and Byte Sequences


# Extract enhanced features and byte sequences
max_len = 1024
file_features = []
byte_sequences = []
file_type_detector = magic.Magic()
valid_indices = []

for index, row in df.iterrows():
    filepath = row['Name']
    try:
        # File metadata
        file_type = file_type_detector.from_file(filepath) if os.path.exists(filepath) else ''
        is_pe = 1 if 'PE32' in file_type or 'MS-DOS' in file_type else 0
        file_ext = os.path.splitext(filepath)[1].lower() if os.path.exists(filepath) else ''
        is_suspicious_ext = 1 if file_ext in suspicious_extensions else 0
        file_size = os.path.getsize(filepath) if os.path.exists(filepath) else 0
        mod_time = os.path.getmtime(filepath) if os.path.exists(filepath) else 0
        mod_time_days = (datetime.now().timestamp() - mod_time) / (24 * 3600) if mod_time else 0
        permissions = os.stat(filepath).st_mode if os.path.exists(filepath) else 0
        is_executable = 1 if permissions & stat.S_IXUSR else 0

        # Read bytes once for both features and sequences
        with open(filepath, 'rb') as f:
            raw_data = f.read(max_len)
        bytes_data = np.frombuffer(raw_data, dtype=np.uint8)
        if len(bytes_data) < max_len:
            bytes_data = np.pad(bytes_data, (0, max_len - len(bytes_data)))
        else:
            bytes_data = bytes_data[:max_len]
        byte_seq = bytes_data / 255.0

        # Byte-level features
        byte_mean = np.mean(byte_seq)
        byte_entropy = -np.sum([(c/len(byte_seq)) * np.log2(c/len(byte_seq) + 1e-10)
                                for c in np.bincount((byte_seq * 255).astype(int), minlength=256)])
        byte_var = np.var(byte_seq)
        null_bytes = np.sum(byte_seq == 0)
        printable_ratio = np.sum((byte_seq >= 0x20/255) & (byte_seq <= 0x7E/255)) / len(byte_seq)
        control_chars = np.sum((byte_seq < 0x20/255) | (byte_seq == 0x7F/255))
        byte_hist_var = np.var(np.histogram(byte_seq * 255, bins=256, range=(0, 255))[0])
        compressed_data = zlib.compress(bytes_data.tobytes())
        compression_ratio = len(compressed_data) / (len(bytes_data) + 1e-10)

        # String patterns
        content_str = bytes_data.tobytes().decode('ascii', errors='ignore')
        url_count = len(re.findall(patterns['url'], content_str))
        ip_count = len(re.findall(patterns['ip'], content_str))
        registry_count = len(re.findall(patterns['registry'], content_str))
        cmd_count = len(re.findall(patterns['cmd'], content_str))
        script_count = len(re.findall(patterns['script'], content_str))
        crypto_count = len(re.findall(patterns['crypto'], content_str))
        obfuscation_count = len(re.findall(patterns['obfuscation'], content_str))
        signature = bytes_data[:4].tobytes().hex() if len(bytes_data) >= 4 else '00000000'

        # Keyword counts
        security_keywords = sum(content_str.lower().count(kw) for kw in keywords['security'])
        hacking_keywords = sum(content_str.lower().count(kw) for kw in keywords['hacking'])
        scam_keywords = sum(content_str.lower().count(kw) for kw in keywords['scams'])
        injection_keywords = sum(content_str.lower().count(kw) for kw in keywords['injection'])

        # High-entropy regions
        window_size = 256
        high_entropy_count = 0
        for i in range(0, len(bytes_data) - window_size + 1, window_size // 2):
            window = bytes_data[i:i+window_size]
            entropy = -np.sum([(c/len(window)) * np.log2(c/len(window) + 1e-10)
                               for c in np.bincount(window, minlength=256)])
            if entropy > 7:
                high_entropy_count += 1

        # PE-specific features
        header_entropy = 0
        section_entropy_diff = 0
        imports_entropy = 0
        api_call_count = 0
        resource_size = 0
        section_count = 0
        metadata_size = 0
        if is_pe and os.path.exists(filepath):
            binary = lief.parse(filepath)
            if binary:
                header_bytes = bytes(binary.header)
                header_entropy = -np.sum([(c/len(header_bytes)) * np.log2(c/len(header_bytes) + 1e-10)
                                          for c in np.bincount(np.frombuffer(header_bytes, dtype=np.uint8), minlength=256)])
                sections = binary.sections
                section_entropies = [(-np.sum([(c/len(s.content)) * np.log2(c/len(s.content) + 1e-10)
                                               for c in np.bincount(np.frombuffer(s.content, dtype=np.uint8), minlength=256)]))
                                     for s in sections if len(s.content) > 0]
                section_entropy_diff = max(section_entropies) - min(section_entropies) if section_entropies else 0
                imports = binary.imports
                import_bytes = b''.join([imp.name.encode() for imp in imports]) if imports else b''
                imports_entropy = -np.sum([(c/len(import_bytes)) * np.log2(c/len(import_bytes) + 1e-10)
                                           for c in np.bincount(np.frombuffer(import_bytes, dtype=np.uint8), minlength=256)]) if import_bytes else 0
                api_call_count = len([entry for imp in imports for entry in imp.entries])
                resources = binary.resources
                resource_size = len(bytes(resources)) if resources else 0
                section_count = len(sections)
        else:
            section_count = len([i for i in range(0, len(bytes_data) - window_size + 1, window_size)
                                 if -np.sum([(c/len(bytes_data[i:i+window_size])) * np.log2(c/len(bytes_data[i:i+window_size]) + 1e-10)
                                             for c in np.bincount(bytes_data[i:i+window_size], minlength=256)]) > 7])
            metadata_size = len(content_str.encode('ascii', errors='ignore')) / (file_size + 1e-10)

        file_features.append([
            byte_mean, byte_entropy, byte_var, null_bytes, printable_ratio,
            control_chars, url_count, ip_count, registry_count, cmd_count,
            script_count, crypto_count, obfuscation_count, security_keywords,
            hacking_keywords, scam_keywords, injection_keywords, file_size,
            header_entropy, section_entropy_diff, imports_entropy, api_call_count,
            resource_size, section_count, metadata_size, compression_ratio,
            high_entropy_count, is_pe, mod_time_days, is_executable, is_suspicious_ext
        ])
        byte_sequences.append(byte_seq)
        valid_indices.append(index)
    except Exception:
        file_features.append([0] * 31)
        byte_sequences.append(np.zeros(max_len))
        valid_indices.append(index)

# Filter df to valid indices


df = df.loc[valid_indices].reset_index(drop=True)
new_features = pd.DataFrame(file_features, columns=[
'byte_mean', 'byte_entropy', 'byte_var', 'null_bytes', 'printable_ratio',
'control_chars', 'url_count', 'ip_count', 'registry_count', 'cmd_count',
'script_count', 'crypto_count', 'obfuscation_count', 'security_keywords',
'hacking_keywords', 'scam_keywords', 'injection_keywords', 'file_size',
'header_entropy', 'section_entropy_diff', 'imports_entropy', 'api_call_count',
'resource_size', 'section_count', 'metadata_size', 'compression_ratio',
'high_entropy_count', 'is_pe', 'mod_time_days', 'is_executable', 'is_suspicious_ext'
])
byte_sequences = np.array(byte_sequences).reshape(-1, max_len, 1)
print(f"Extracted features shape: {new_features.shape}, Byte sequences shape: {byte_sequence

# Cell 6: Combine Features and Preprocess Data


# Combine original and new features
X = df.drop(['Name', 'md5', 'legitimate'], axis=1)
X = pd.concat([X, new_features], axis=1)
y = df['legitimate']
print(f"Combined features shape: {X.shape}, Target shape: {y.shape}")

# Preprocess the data


categorical_cols = ['Machine', 'SizeOfOptionalHeader', 'SectionAlignment', 'FileAlignment',
                    'MajorOperatingSystemVersion', 'MinorOperatingSystemVersion',
                    'MajorSubsystemVersion', 'MinorSubsystemVersion', 'Subsystem', 'LoaderFlags']
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)
file_extensions = [os.path.splitext(row['Name'])[1].lower() if os.path.exists(row['Name']) else ''
                   for _, row in df.iterrows()]
extension_df = pd.get_dummies(file_extensions, prefix='ext')
X = pd.concat([X, extension_df], axis=1)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print(f"After preprocessing: {X_scaled.shape}, Byte sequences: {byte_sequences.shape}")

# Verify sample consistency


if X_scaled.shape[0] != y.shape[0] or X_scaled.shape[0] != byte_sequences.shape[0]:
    raise ValueError(f"Inconsistent sample counts: X_scaled={X_scaled.shape[0]}, y={y.shape[0]}, byte_sequences={byte_sequences.shape[0]}")

# Cell 7: Handle Imbalance and Split Data


# Handle data imbalance with SMOTE
X_train, X_temp, y_train, y_temp, byte_train, byte_temp = train_test_split(
X_scaled, y, byte_sequences, test_size=0.3, random_state=42, stratify=y
)
X_val, X_test, y_val, y_test, byte_val, byte_test = train_test_split(
X_temp, y_temp, byte_temp, test_size=0.5, random_state=42, stratify=y_temp
)
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)
# Align SMOTE samples with byte sequences
byte_train_smote = np.zeros((X_train_smote.shape[0], max_len, 1))
for i in range(X_train.shape[0]):
    byte_train_smote[i] = byte_train[i]
for i in range(X_train.shape[0], X_train_smote.shape[0]):
    # Synthetic samples reuse a randomly selected real byte sequence
    idx = np.random.randint(0, byte_train.shape[0])
    byte_train_smote[i] = byte_train[idx]
print(f"Train shape: {X_train_smote.shape}, Validation shape: {X_val.shape}, Test shape: {X_
print(f"Class distribution after SMOTE: {pd.Series(y_train_smote).value_counts()}")

# Cell 8: Build CNN+LSTM Model


# Build the CNN+LSTM model
byte_input = Input(shape=(max_len, 1), name='byte_input')
struct_input = Input(shape=(X_scaled.shape[1],), name='struct_input')
x_byte = Conv1D(64, 3, activation='relu', padding='same')(byte_input)
x_byte = BatchNormalization()(x_byte)
x_byte = MaxPooling1D(2)(x_byte)
x_byte = Dropout(0.2)(x_byte)
x_byte = Conv1D(128, 5, activation='relu', padding='same')(x_byte)
x_byte = BatchNormalization()(x_byte)
x_byte = MaxPooling1D(2)(x_byte)
x_byte = Dropout(0.2)(x_byte)
x_byte = LSTM(64, return_sequences=False)(x_byte)
x_struct = Dense(128, activation='relu')(struct_input)
x_struct = BatchNormalization()(x_struct)
x_struct = Dropout(0.3)(x_struct)
x_struct = Dense(64, activation='relu')(x_struct)
x_struct = BatchNormalization()(x_struct)
x_struct = Dropout(0.3)(x_struct)
x = concatenate([x_byte, x_struct])
x = Dense(256, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)
x = Dense(128, activation='relu')(x)
x = BatchNormalization()(x)
x = Dropout(0.4)(x)
output = Dense(1, activation='sigmoid')(x)
model = Model(inputs=[byte_input, struct_input], outputs=output)

# Cell 9: Compile and Train Model


# Compile the model
model.compile(
optimizer=tf.keras.optimizers.Adam(learning_rate=0.0005),
loss=tf.keras.losses.BinaryFocalCrossentropy(gamma=2.0),
    metrics=['accuracy', tf.keras.metrics.AUC(name='auc'), tf.keras.metrics.Precision(name='precision')]
)
# Define callbacks
early_stopping = EarlyStopping(monitor='val_auc', patience=5, mode='max', restore_best_weights=True)
model_checkpoint = ModelCheckpoint('best_model.h5', monitor='val_auc', save_best_only=True, mode='max')

# Calculate class weights


from sklearn.utils.class_weight import compute_class_weight
classes = np.unique(y_train_smote)
weights = compute_class_weight('balanced', classes=classes, y=y_train_smote)
class_weights = dict(zip(classes, weights))
# Train the model
history = model.fit(
[byte_train_smote, X_train_smote], y_train_smote,
validation_data=([byte_val, X_val], y_val),
epochs=10,
batch_size=32,
callbacks=[early_stopping, model_checkpoint],
class_weight=class_weights
)
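
To check whether the early-stopping criterion on `val_auc` behaves sensibly, the returned `history` object can be plotted. A minimal sketch; the key names assume the metric names set in `model.compile` above.

# Sketch: plot training vs. validation loss and AUC from the Keras History object.
import matplotlib.pyplot as plt

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
axes[0].plot(history.history['loss'], label='train loss')
axes[0].plot(history.history['val_loss'], label='val loss')
axes[0].set_title('Focal loss per epoch')
axes[0].legend()
axes[1].plot(history.history['auc'], label='train AUC')
axes[1].plot(history.history['val_auc'], label='val AUC')
axes[1].set_title('AUC per epoch')
axes[1].legend()
plt.show()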

# Cell 10: Evaluate Model


# Evaluate the model
y_pred = model.predict([byte_test, X_test])
y_pred_class = (y_pred > 0.5).astype(int)
print("Classification Report:")
print(classification_report(y_test, y_pred_class, target_names=['Malicious', 'Benign']))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred):.4f}")
cm = confusion_matrix(y_test, y_pred_class)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Save preprocessing artifacts


import pickle
from sklearn.preprocessing import StandardScaler

# Assuming X is the preprocessed feature matrix after dummy encoding


scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
with open('scaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
print("Scaler saved to scaler.pkl")
Collecting python-magic
  Downloading python_magic-0.4.27-py2.py3-none-any.whl.metadata (5.8 kB)
Downloading python_magic-0.4.27-py2.py3-none-any.whl (13 kB)
Installing collected packages: python-magic
Successfully installed python-magic-0.4.27
Collecting lief
  Downloading lief-0.16.5-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (2.4 kB)
Downloading lief-0.16.5-cp311-cp311-manylinux_2_28_x86_64.whl (3.0 MB)
Installing collected packages: lief
Successfully installed lief-0.16.5
After removing duplicates: (104589, 57)
Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 138047 entries, 0 to 138046
Data columns (total 57 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Name 138047 non-null object
1 md5 138047 non-null object
2 Machine 138047 non-null int64
3 SizeOfOptionalHeader 138047 non-null int64
4 Characteristics 138047 non-null int64
5 MajorLinkerVersion 138047 non-null int64
6 MinorLinkerVersion 138047 non-null int64
7 SizeOfCode 138047 non-null int64
8 SizeOfInitializedData 138047 non-null int64
9 SizeOfUninitializedData 138047 non-null int64
10 AddressOfEntryPoint 138047 non-null int64
11 BaseOfCode 138047 non-null int64
12 BaseOfData 138047 non-null int64
13 ImageBase 138047 non-null float64
14 SectionAlignment 138047 non-null int64
15 FileAlignment 138047 non-null int64
16 MajorOperatingSystemVersion 138047 non-null int64
17 MinorOperatingSystemVersion 138047 non-null int64
18 MajorImageVersion 138047 non-null int64
19 MinorImageVersion 138047 non-null int64
20 MajorSubsystemVersion 138047 non-null int64
21 MinorSubsystemVersion 138047 non-null int64
22 SizeOfImage 138047 non-null int64
23 SizeOfHeaders 138047 non-null int64
24 CheckSum 138047 non-null int64
25 Subsystem 138047 non-null int64
26 DllCharacteristics 138047 non-null int64
27 SizeOfStackReserve 138047 non-null int64
28 SizeOfStackCommit 138047 non-null int64
29 SizeOfHeapReserve 138047 non-null int64
30 SizeOfHeapCommit 138047 non-null int64
31 LoaderFlags 138047 non-null int64
32 NumberOfRvaAndSizes 138047 non-null int64
33 SectionsNb 138047 non-null int64
34 SectionsMeanEntropy 138047 non-null float64
35 SectionsMinEntropy 138047 non-null float64
36 SectionsMaxEntropy 138047 non-null float64
37 SectionsMeanRawsize 138047 non-null float64
38 SectionsMinRawsize 138047 non-null int64
39 SectionMaxRawsize 138047 non-null int64
40 SectionsMeanVirtualsize 138047 non-null float64
41 SectionsMinVirtualsize 138047 non-null int64
42 SectionMaxVirtualsize 138047 non-null int64
43 ImportsNbDLL 138047 non-null int64
44 ImportsNb 138047 non-null int64
45 ImportsNbOrdinal 138047 non-null int64
46 ExportNb 138047 non-null int64
47 ResourcesNb 138047 non-null int64
48 ResourcesMeanEntropy 138047 non-null float64
49 ResourcesMinEntropy 138047 non-null float64
50 ResourcesMaxEntropy 138047 non-null float64
51 ResourcesMeanSize 138047 non-null float64
52 ResourcesMinSize 138047 non-null int64
53 ResourcesMaxSize 138047 non-null int64
54 LoadConfigurationSize 138047 non-null int64
55 VersionInformationSize 138047 non-null int64
56 legitimate 138047 non-null int64
dtypes: float64(10), int64(45), object(2)
memory usage: 60.0+ MB
None

Missing Values:
Name 0
md5 0
Machine 0
SizeOfOptionalHeader 0
Characteristics 0
MajorLinkerVersion 0
MinorLinkerVersion 0
SizeOfCode 0
SizeOfInitializedData 0
SizeOfUninitializedData 0
AddressOfEntryPoint 0
BaseOfCode 0
BaseOfData 0
ImageBase 0
SectionAlignment 0
FileAlignment 0
MajorOperatingSystemVersion 0
MinorOperatingSystemVersion 0
MajorImageVersion 0
MinorImageVersion 0
MajorSubsystemVersion 0
MinorSubsystemVersion 0
SizeOfImage 0
SizeOfHeaders 0
CheckSum 0
Subsystem 0
DllCharacteristics 0
SizeOfStackReserve 0
SizeOfStackCommit 0
SizeOfHeapReserve 0
SizeOfHeapCommit 0
LoaderFlags 0
NumberOfRvaAndSizes 0
SectionsNb 0
SectionsMeanEntropy 0
SectionsMinEntropy 0
SectionsMaxEntropy 0
SectionsMeanRawsize 0
SectionsMinRawsize 0
SectionMaxRawsize 0
SectionsMeanVirtualsize 0
SectionsMinVirtualsize 0
SectionMaxVirtualsize 0
ImportsNbDLL 0
ImportsNb 0
ImportsNbOrdinal 0
ExportNb 0
ResourcesNb 0
ResourcesMeanEntropy 0
ResourcesMinEntropy 0
ResourcesMaxEntropy 0
ResourcesMeanSize 0
ResourcesMinSize 0
ResourcesMaxSize 0
LoadConfigurationSize 0
VersionInformationSize 0
legitimate 0
dtype: int64

Class Distribution:
legitimate
0 96724
1 41323
Name: count, dtype: int64
<ipython-input-2-4155c6684167>:59: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0.

sns.countplot(x='legitimate', data=df_eda, palette='Set2')


Top File Extensions by Class:
legitimate 0 1 total
file_extension
.unknown 96724 41323 138047
url_count Distribution by Class:
legitimate
0 0.0
1 0.0
Name: url_count, dtype: float64
<ipython-input-2-4155c6684167>:124: FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0.

sns.boxplot(x='legitimate', y=feature, data=df_eda, palette='Set2')

ip_count Distribution by Class:


legitimate
