Q1)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Weight' and relevant features like 'Length', 'Height', etc.
data = pd.read_csv('fish_data.csv')
# Separate features and target variable
X = data.drop(columns=['Weight'])  # Input features
y = data['Weight']  # Target variable
# Define train-test split ratios
split_ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]
# Store results
results = []
# Train and evaluate the model for each split ratio
for train_ratio, test_ratio in split_ratios:
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)
    # Initialize and train the model
    model = LinearRegression()
    model.fit(X_train, y_train)
    # Make predictions
    y_pred = model.predict(X_test)
    # Evaluate performance
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    # Store the results
    results.append({
        'Train Ratio': train_ratio,
        'Test Ratio': test_ratio,
        'MSE': mse,
        'R^2': r2
    })
# Display results
results_df = pd.DataFrame(results)
print(results_df)
# Compare and explain results
def explain_results(results):
print("\nModel Performance Explanation:")
for result in results:
train_ratio = result[’Train Ratio’]
test_ratio = result[’Test Ratio’]
mse = result[’MSE’]
r2 = result[’R^2’]
print(f"Train-Test Split ({train_ratio*100}% - {test_ratio*100}%):")
print(f" Mean Squared Error (MSE): {mse:.2f}")
print(f" R-squared (R^2): {r2:.2f}\n")
explain_results(results)
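# Note: an optional, hedged adjustment. If the same Fish market CSV as in Q3 is used, it also
# contains a categorical 'Species' column, so X = data.drop(columns=['Weight']) above would
# include non-numeric data and LinearRegression would raise an error. One fix, applied where
# the feature matrix is first built, is to one-hot encode that column:
X = pd.get_dummies(data.drop(columns=['Weight']), columns=['Species'], drop_first=True)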
Q2)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score
# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Weight' and relevant features like 'Length', 'Height', etc.
data = pd.read_csv('fish_data.csv')
# Separate features and target variable
X = data.drop(columns=['Weight'])  # Input features
y = data['Weight']  # Target variable
# Define train-test split ratios
split_ratios = [(0.8, 0.2), (0.7, 0.3), (0.6, 0.4)]
# Store results
results = []
# Train and evaluate the models for each split ratio
for train_ratio, test_ratio in split_ratios:
    # Split the dataset
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_ratio, random_state=42)
    # Simple Linear Regression
    linear_model = LinearRegression()
    linear_model.fit(X_train, y_train)
    y_pred_linear = linear_model.predict(X_test)
    mse_linear = mean_squared_error(y_test, y_pred_linear)
    r2_linear = r2_score(y_test, y_pred_linear)
    # Polynomial Regression (degree 2)
    poly_pipeline = Pipeline([
        ('poly_features', PolynomialFeatures(degree=2)),
        ('linear_model', LinearRegression())
    ])
    poly_pipeline.fit(X_train, y_train)
    y_pred_poly = poly_pipeline.predict(X_test)
    mse_poly = mean_squared_error(y_test, y_pred_poly)
    r2_poly = r2_score(y_test, y_pred_poly)
    # Store the results
    results.append({
        'Train Ratio': train_ratio,
        'Test Ratio': test_ratio,
        'Linear MSE': mse_linear,
        'Linear R^2': r2_linear,
        'Poly MSE': mse_poly,
        'Poly R^2': r2_poly
    })
# Display results
results_df = pd.DataFrame(results)
print(results_df)
# Compare and explain results
def explain_results(results):
print("\nModel Performance Comparison:")
for result in results:
train_ratio = result[’Train Ratio’]
test_ratio = result[’Test Ratio’]
mse_linear = result[’Linear MSE’]
r2_linear = result[’Linear R^2’]
mse_poly = result[’Poly MSE’]
r2_poly = result[’Poly R^2’]
print(f"Train-Test Split ({train_ratio*100}% - {test_ratio*100}%):")
print(f" Linear Regression - MSE: {mse_linear:.2f}, R^2: {r2_linear:.2f}")
print(f" Polynomial Regression - MSE: {mse_poly:.2f}, R^2: {r2_poly:.2f}\n")
explain_results(results)
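# Note: an optional, hedged sketch. Degree 2 is an assumption; the same pipeline can be looped
# over a few degrees (reusing the last split above) to see where the polynomial model starts
# to overfit the test data:
for degree in [1, 2, 3]:
    pipe = Pipeline([('poly_features', PolynomialFeatures(degree=degree)),
                     ('linear_model', LinearRegression())])
    pipe.fit(X_train, y_train)
    print(f"Degree {degree}: test R^2 = {r2_score(y_test, pipe.predict(X_test)):.3f}")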
Q3)
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
# Load dataset
# Replace 'fish_data.csv' with the actual file path
# Ensure that the dataset contains columns including 'Species', 'Length1', 'Length2', 'Length3', 'Height', and 'Width'
data = pd.read_csv('fish_data.csv')
# Encode the Species column into numerical values
label_encoder = LabelEncoder()
data['Species'] = label_encoder.fit_transform(data['Species'])
# Separate features and target variable
X = data[['Length1', 'Length2', 'Length3', 'Height', 'Width']]  # Input features
y = data['Species']  # Target variable
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Define SVM kernels to evaluate
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
results = []
# Train and evaluate the SVM model for each kernel
for kernel in kernels:
    # Initialize and train the model
    svm_model = SVC(kernel=kernel, random_state=42)
    svm_model.fit(X_train, y_train)
    # Make predictions
    y_pred = svm_model.predict(X_test)
    # Evaluate performance
    accuracy = accuracy_score(y_test, y_pred)
    conf_matrix = confusion_matrix(y_test, y_pred)
    # Store the results
    results.append({
        'Kernel': kernel,
        'Accuracy': accuracy,
        'Confusion Matrix': conf_matrix
    })
    # Print confusion matrix for the current kernel
    print(f"\nKernel: {kernel}")
    print(f"Accuracy: {accuracy:.2f}")
    print("Confusion Matrix:")
    print(conf_matrix)
    # Plot confusion matrix
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
                xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
    plt.title(f"Confusion Matrix - Kernel: {kernel}")
    plt.xlabel('Predicted')
    plt.ylabel('Actual')
    plt.show()
# Identify the best-performing kernel
best_result = max(results, key=lambda x: x['Accuracy'])
print(f"\nBest Kernel: {best_result['Kernel']}")
print(f"Accuracy: {best_result['Accuracy']:.2f}")
# Explanation of findings
def explain_findings(best_result):
print("\nExplanation of Findings:")
print(f"The best-performing kernel is ’{best_result[’Kernel’]}’ with an accuracy of
{best_result[’Accuracy’]:.2f}. ")
print("This suggests that the ’{best_result[’Kernel’]}’ kernel is most effective at capturing the relationship
between the features and the target variable.")
explain_findings(best_result)
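# Note: an optional, hedged refinement. SVM kernels such as 'rbf' and 'sigmoid' are sensitive to
# feature scale, so the accuracies above may change noticeably if the features are standardized
# first; a Pipeline keeps the scaler fit on the training split only:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

for kernel in kernels:
    scaled_svm = make_pipeline(StandardScaler(), SVC(kernel=kernel, random_state=42))
    scaled_svm.fit(X_train, y_train)
    print(f"Kernel: {kernel}, accuracy with scaling: {scaled_svm.score(X_test, y_test):.2f}")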
Q4)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Define scaling techniques
scalers = {
    'StandardScaler': StandardScaler(),
    'MinMaxScaler': MinMaxScaler(),
    'RobustScaler': RobustScaler()
}
# Define SVM kernels
kernels = ['linear', 'poly', 'rbf', 'sigmoid']
# Store results
results = []
for scaler_name, scaler in scalers.items():
    # Scale the data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    for kernel in kernels:
        # Train SVM model
        svm = SVC(kernel=kernel, random_state=42)
        svm.fit(X_train_scaled, y_train)
        # Make predictions
        y_pred = svm.predict(X_test_scaled)
        # Evaluate performance
        accuracy = accuracy_score(y_test, y_pred)
        cm = confusion_matrix(y_test, y_pred)
        results.append({
            'Scaler': scaler_name,
            'Kernel': kernel,
            'Accuracy': accuracy,
            'Confusion Matrix': cm
        })
# Display results
best_result = max(results, key=lambda x: x['Accuracy'])
print("Best Result:")
print(f"Scaler: {best_result['Scaler']}")
print(f"Kernel: {best_result['Kernel']}")
print(f"Accuracy: {best_result['Accuracy']:.4f}")
print("Confusion Matrix:")
print(best_result['Confusion Matrix'])
# Visualize confusion matrix of the best result
sns.heatmap(best_result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f"Best Combination: {best_result['Scaler']} + {best_result['Kernel']}\n"
          f"Accuracy: {best_result['Accuracy']:.4f}")
plt.show()
# Summary of findings
print("\nSummary of Findings:")
print(f"The best performance was achieved with the {best_result[’Scaler’]} scaling method and the
{best_result[’Kernel’]} kernel. This scaling method is likely best suited for the dataset because it effectively
handles feature distributions and scales them appropriately for the SVM kernel.")
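# Note: an optional sketch. To compare every scaler/kernel combination rather than only the best
# one, the stored results can be tabulated (the confusion matrices are dropped here only for
# readability):
results_df = pd.DataFrame([{k: v for k, v in r.items() if k != 'Confusion Matrix'} for r in results])
print(results_df.sort_values('Accuracy', ascending=False))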
Q5)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier, OneVsOneClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt
# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Scale the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Train logistic regression using One-vs-Rest (OvR) strategy
ovr_model = OneVsRestClassifier(LogisticRegression(random_state=42, max_iter=10000))
ovr_model.fit(X_train_scaled, y_train)
ovr_preds = ovr_model.predict(X_test_scaled)
# Train logistic regression using One-vs-One (OvO) strategy
ovo_model = OneVsOneClassifier(LogisticRegression(random_state=42, max_iter=10000))
ovo_model.fit(X_train_scaled, y_train)
ovo_preds = ovo_model.predict(X_test_scaled)
# Evaluate models
metrics = {}
# One-vs-Rest (OvR) metrics
metrics['OvR'] = {
    'Accuracy': accuracy_score(y_test, ovr_preds),
    'Precision': precision_score(y_test, ovr_preds, average='weighted'),
    'Recall': recall_score(y_test, ovr_preds, average='weighted'),
    'Confusion Matrix': confusion_matrix(y_test, ovr_preds)
}
# One-vs-One (OvO) metrics
metrics['OvO'] = {
    'Accuracy': accuracy_score(y_test, ovo_preds),
    'Precision': precision_score(y_test, ovo_preds, average='weighted'),
    'Recall': recall_score(y_test, ovo_preds, average='weighted'),
    'Confusion Matrix': confusion_matrix(y_test, ovo_preds)
}
# Display results
for strategy, result in metrics.items():
print(f"\n{strategy} Strategy:")
print(f"Accuracy: {result[’Accuracy’]:.4f}")
print(f"Precision: {result[’Precision’]:.4f}")
print(f"Recall: {result[’Recall’]:.4f}")
print("Confusion Matrix:")
print(result[’Confusion Matrix’])
sns.heatmap(result[’Confusion Matrix’], annot=True, fmt=’d’, cmap=’Blues’,
xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel(’Predicted’)
plt.ylabel(’Actual’)
plt.title(f"{strategy} Strategy Confusion Matrix")
plt.show()
# Summary of findings
if metrics['OvR']['Accuracy'] > metrics['OvO']['Accuracy']:
    better_strategy = 'One-vs-Rest (OvR)'
else:
    better_strategy = 'One-vs-One (OvO)'
print("\nSummary of Findings:")
print(f"The better-performing strategy is {better_strategy}. OvR trains one binary classifier per class and "
      f"is generally simpler, whereas OvO trains one classifier per pair of classes, which can be more "
      f"computationally intensive but may better capture pairwise class distinctions.")
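# Note: an optional illustration of the cost difference mentioned above. OvR fits one binary
# classifier per class, OvO one per pair of classes, which the fitted models expose directly:
print(f"OvR binary classifiers fitted: {len(ovr_model.estimators_)}")  # one per class
print(f"OvO binary classifiers fitted: {len(ovo_model.estimators_)}")  # one per unordered class pair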
Q6)
import numpy as np
import pandas as pd
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Load the Wine dataset
data = load_wine()
X = data.data
y = data.target
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Initialize results storage
results = []
# Try different numbers of principal components
for n_components in range(1, X.shape[1] + 1):
    # Apply PCA
    pca = PCA(n_components=n_components)
    X_train_pca = pca.fit_transform(X_train_scaled)
    X_test_pca = pca.transform(X_test_scaled)
    # Train Logistic Regression model
    model = LogisticRegression(random_state=42, max_iter=10000)
    model.fit(X_train_pca, y_train)
    # Evaluate the model
    y_pred = model.predict(X_test_pca)
    accuracy = accuracy_score(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    # Store results
    results.append({
        'n_components': n_components,
        'Accuracy': accuracy,
        'Confusion Matrix': cm
    })
# Find the best result
best_result = max(results, key=lambda x: x['Accuracy'])
# Print results
print("Optimal Number of Principal Components:")
print(f"Number of Components: {best_result['n_components']}")
print(f"Accuracy: {best_result['Accuracy']:.4f}")
print("Confusion Matrix:")
print(best_result['Confusion Matrix'])
# Plot accuracy vs. number of principal components
n_components = [result['n_components'] for result in results]
accuracies = [result['Accuracy'] for result in results]
plt.plot(n_components, accuracies, marker='o')
plt.title('Accuracy vs. Number of Principal Components')
plt.xlabel('Number of Principal Components')
plt.ylabel('Accuracy')
plt.grid()
plt.show()
# Visualize the confusion matrix for the best result
sns.heatmap(best_result['Confusion Matrix'], annot=True, fmt='d', cmap='Blues',
            xticklabels=data.target_names, yticklabels=data.target_names)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title(f"Confusion Matrix (n_components={best_result['n_components']})")
plt.show()
# Summary of findings
print("\nSummary of Findings:")
print(f"The optimal number of principal components is {best_result[’n_components’]}, achieving an
accuracy of {best_result[’Accuracy’]:.4f}. PCA helps reduce the dimensionality of the dataset while
retaining essential information, improving model performance or interpretability.")
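# Note: an optional, hedged check. The cumulative explained variance ratio is another common way
# to justify the chosen number of components, using a single PCA fit on the scaled training data:
pca_full = PCA().fit(X_train_scaled)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
for i, var in enumerate(cumulative_variance, start=1):
    print(f"{i} components explain {var:.2%} of the variance")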
Q7)
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Load dataset
# Replace 'your_dataset.csv' with the actual file path
data = pd.read_csv('your_dataset.csv')
# Inspect the dataset
print("Dataset Head:")
print(data.head())
print("\nDataset Info:")
print(data.info())
print("\nSummary Statistics:")
print(data.describe())
# Visualize the distribution of key features
features_to_plot = ['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area', 'Heating Load', 'Cooling Load']
for feature in features_to_plot:
    plt.figure(figsize=(6, 4))
    sns.histplot(data[feature], kde=True, bins=20)
    plt.title(f'Distribution of {feature}')
    plt.show()
# Pair plot for relationships between features
sns.pairplot(data[features_to_plot], diag_kind='kde')
plt.show()
# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())
# Handle missing values (if any)
data = data.dropna()
# Normalize the features for clustering
scaler = StandardScaler()
normalized_data = scaler.fit_transform(data[features_to_plot])
# Perform K-Means clustering with 3 and 4 clusters
clusters = [3, 4]
kmeans_results = {}
for k in clusters:
    kmeans = KMeans(n_clusters=k, random_state=42)
    labels = kmeans.fit_predict(normalized_data)
    kmeans_results[k] = kmeans  # keep the fitted model so each k's cluster centers are available later
    data[f'Cluster_{k}'] = labels
    # Visualize clusters using PCA (reduce to 2D for visualization)
    pca = PCA(n_components=2)
    reduced_data = pca.fit_transform(normalized_data)
    plt.figure(figsize=(8, 6))
    sns.scatterplot(
        x=reduced_data[:, 0], y=reduced_data[:, 1], hue=labels, palette='Set1', s=50
    )
    plt.title(f'K-Means Clustering with {k} Clusters (PCA Reduced)')
    plt.xlabel('PCA Component 1')
    plt.ylabel('PCA Component 2')
    plt.legend(title='Cluster')
    plt.show()
# Interpretation of clusters
for k in clusters:
print(f"\nCluster Centers for K={k}:")
centers = pd.DataFrame(
scaler.inverse_transform(kmeans.cluster_centers_),
columns=features_to_plot
)
print(centers)
# Save the results to a CSV file
data.to_csv('clustered_data.csv', index=False)
print("\nClustering completed. Results saved to 'clustered_data.csv'.")
Q8)
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import DBSCAN
from sklearn.decomposition import PCA
# Load the dataset
data = pd.read_csv('building_energy_efficiency.csv')
# Display basic information about the dataset
print("Dataset Info:")
print(data.info())
# Display the first few rows of the dataset
print("\nFirst 5 Rows:")
print(data.head())
# Display summary statistics
print("\nSummary Statistics:")
print(data.describe())
# Check for missing values
print("\nMissing Values:")
print(data.isnull().sum())
# Data Visualization
# Histogram for each numerical feature
data.hist(figsize=(12, 8))
plt.tight_layout()
plt.show()
# Scatter plot matrix
sns.pairplot(data)
plt.show()
# Box plots for target variables
plt.figure(figsize=(10, 6))
sns.boxplot(data=data[['Relative Compactness', 'Surface Area', 'Wall Area', 'Roof Area', 'Heating Load', 'Cooling Load']])
plt.title('Box plots of features')
plt.show()
# Normalize the features
scaler = StandardScaler()
scaled_data = scaler.fit_transform(data)
# Apply DBSCAN clustering
dbscan = DBSCAN(eps=0.3, min_samples=5)
clusters = dbscan.fit_predict(scaled_data)
# Add cluster labels to the dataset
data['Cluster'] = clusters
# Visualize the clusters using PCA for dimensionality reduction
pca = PCA(n_components=2)
pca_data = pca.fit_transform(scaled_data)
plt.figure(figsize=(8, 6))
plt.scatter(pca_data[:, 0], pca_data[:, 1], c=clusters, cmap='viridis', s=50, alpha=0.6)
plt.title('DBSCAN Clustering Results')
plt.xlabel('PCA Component 1')
plt.ylabel('PCA Component 2')
plt.colorbar(label='Cluster Label')
plt.show()
# Interpretation of results
print("\nDBSCAN Clustering Results:")
unique_clusters = np.unique(clusters)
print(f"Number of clusters: {len(unique_clusters) - (1 if -1 in clusters else 0)}")
print(f"Noise points: {np.sum(clusters == -1)}")
# Provide interpretation based on clusters
for cluster in unique_clusters:
    if cluster != -1:
        print(f"Cluster {cluster}:")
        cluster_data = data[data['Cluster'] == cluster]
        print(cluster_data.describe())
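# Note: an optional, hedged sketch. eps=0.3 above is essentially a guess; a common way to choose
# eps is a k-distance plot: sort every point's distance to its min_samples-th nearest neighbour
# and look for the "knee" of the curve.
from sklearn.neighbors import NearestNeighbors
neighbors = NearestNeighbors(n_neighbors=5).fit(scaled_data)
distances, _ = neighbors.kneighbors(scaled_data)
plt.plot(np.sort(distances[:, -1]))
plt.title('k-distance plot (k=5) for choosing DBSCAN eps')
plt.xlabel('Points sorted by distance')
plt.ylabel('Distance to 5th nearest neighbour')
plt.show()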
Q9)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the dataset
df = pd.read_csv('path_to_your_dataset.csv')
# Handle missing values (fill with median for numerical features and mode for categorical features)
imputer_num = SimpleImputer(strategy='median')
imputer_cat = SimpleImputer(strategy='most_frequent')
# Double brackets keep the 2-D shape that SimpleImputer returns
df[['age']] = imputer_num.fit_transform(df[['age']])
df[['education']] = imputer_cat.fit_transform(df[['education']])
df[['occupation']] = imputer_cat.fit_transform(df[['occupation']])
# Encode categorical features
label_enc = LabelEncoder()
df['education'] = label_enc.fit_transform(df['education'])
df['occupation'] = label_enc.fit_transform(df['occupation'])
# Separate features and target variable
X = df.drop('income', axis=1)
y = df['income']
# Normalize the numeric features
# (assumes all remaining feature columns are numeric after encoding; non-numeric columns are dropped)
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X.select_dtypes(include=['int64', 'float64']))
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=42)
# Train Logistic Regression model
log_reg = LogisticRegression(max_iter=1000)  # higher max_iter to avoid convergence warnings
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
# Train Decision Tree model
decision_tree = DecisionTreeClassifier()
decision_tree.fit(X_train, y_train)
y_pred_tree = decision_tree.predict(X_test)
# Evaluate models
def evaluate_model(y_true, y_pred):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label='>50K')
    recall = recall_score(y_true, y_pred, pos_label='>50K')
    f1 = f1_score(y_true, y_pred, pos_label='>50K')
    return accuracy, precision, recall, f1
log_reg_metrics = evaluate_model(y_test, y_pred_log)
tree_metrics = evaluate_model(y_test, y_pred_tree)
print("Logistic Regression Metrics:")
print(f"Accuracy: {log_reg_metrics[0]}, Precision: {log_reg_metrics[1]}, Recall: {log_reg_metrics[2]}, F1
Score: {log_reg_metrics[3]}")
print("\nDecision Tree Metrics:")
print(f"Accuracy: {tree_metrics[0]}, Precision: {tree_metrics[1]}, Recall: {tree_metrics[2]}, F1 Score:
{tree_metrics[3]}")
# Comparison of models
if log_reg_metrics[3] > tree_metrics[3]:
best_model = "Logistic Regression"
best_metrics = log_reg_metrics
else:
best_model = "Decision Tree"
best_metrics = tree_metrics
print(f"\nBest model: {best_model} with F1 Score: {best_metrics[3]}")
Q10)
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Load the dataset
data = pd.read_csv('car_evaluation.csv')
# Encode categorical features
# Define categorical columns
categorical_columns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety']
# Use One-Hot Encoding
encoder = OneHotEncoder(drop='first', sparse_output=False)  # Drop one category to avoid multicollinearity
encoded_features = encoder.fit_transform(data[categorical_columns])
# Convert the encoded array back to a DataFrame
# (on scikit-learn < 1.2 use sparse=False and get_feature_names instead)
encoded_df = pd.DataFrame(encoded_features,
                          columns=encoder.get_feature_names_out(categorical_columns))
# Drop original categorical columns and concatenate with encoded features
data = data.drop(categorical_columns, axis=1)
data = pd.concat([data, encoded_df], axis=1)
# Split the data into features (X) and target (y)
X = data.drop('class', axis=1)
y = data['class']
# Split data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Train a Logistic Regression model
model = LogisticRegression(random_state=42, max_iter=1000)  # higher max_iter to avoid convergence warnings
model.fit(X_train, y_train)
# Predict on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
# Print the evaluation metrics
print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")