#Exp-1:/ Create histograms, box plots, and detect outliers
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California Housing dataset
data = fetch_california_housing(as_frame=True)
housing_df = data.frame

# Step 2: Select the numerical features
numerical_features = housing_df.select_dtypes(include=[np.number]).columns

# Step 3: Plot histograms
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.histplot(housing_df[feature], kde=True, bins=30, color='blue')
    plt.title(f'Distribution of {feature}')
plt.tight_layout()
plt.show()

# Step 4: Generate box plots for numerical features
plt.figure(figsize=(15, 10))
for i, feature in enumerate(numerical_features):
    plt.subplot(3, 3, i + 1)
    sns.boxplot(x=housing_df[feature], color='orange')
    plt.title(f'Box Plot of {feature}')
plt.tight_layout()
plt.show()

# Step 5: Optional: print the outlier count per feature (IQR rule)
print("Outliers Detection:")
outliers_summary = {}
for feature in numerical_features:
    Q1 = housing_df[feature].quantile(0.25)
    Q3 = housing_df[feature].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = housing_df[(housing_df[feature] < lower_bound) |
                          (housing_df[feature] > upper_bound)]
    outliers_summary[feature] = len(outliers)
    print(f"{feature}: {len(outliers)} outliers")

# Step 6: Optional: print a summary of the dataset
print("\nDataset Summary:")
print(housing_df.describe())
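Optional extension, not part of the original listing: the outlier counts collected above can also be shown as a sorted table, which makes it easier to see which features are most affected.
outliers_series = pd.Series(outliers_summary).sort_values(ascending=False)
print("\nOutlier counts per feature (descending):")
print(outliers_series)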
=========================================
#Exp-2:/ Correlation matrix
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing

# Step 1: Load the California Housing dataset
california_data = fetch_california_housing(as_frame=True)

# Step 2: Convert to a DataFrame
data = california_data.frame

# Step 3: Compute the correlation matrix
correlation_matrix = data.corr()

# Step 4: Visualize the correlation matrix using a heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', linewidths=0.5)
plt.title('Correlation Matrix of California Housing Features')
plt.show()

# Step 5: Create a pair plot to visualize pairwise relationships
sns.pairplot(data, diag_kind='kde', plot_kws={'alpha': 0.5})
plt.suptitle('Pair Plot of California Housing Features', y=1.02)
plt.show()
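Optional extension, not part of the original listing: since the full heatmap can be hard to read, the correlations of each feature with the target can be printed directly. This assumes the target column is named MedHouseVal, which is the name used by fetch_california_housing(as_frame=True) in recent scikit-learn versions.
print(correlation_matrix['MedHouseVal'].sort_values(ascending=False))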
=========================================
#EXP-3:/ Principal Component Analysis
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt

# Step 1: Load the Iris dataset
iris = load_iris()
data = iris.data
labels = iris.target
label_names = iris.target_names

# Step 2: Convert to a DataFrame for better visualization
iris_df = pd.DataFrame(data, columns=iris.feature_names)

# Step 3: Perform PCA to reduce dimensionality to 2 components
pca = PCA(n_components=2)
data_reduced = pca.fit_transform(data)

# Step 4: Create a DataFrame for the reduced data
reduced_df = pd.DataFrame(data_reduced, columns=['Principal Component 1', 'Principal Component 2'])
reduced_df['Label'] = labels

# Step 5: Visualize the 2D PCA result
plt.figure(figsize=(8, 6))
colors = ['r', 'g', 'b']
for i, label in enumerate(np.unique(labels)):
    plt.scatter(
        reduced_df[reduced_df['Label'] == label]['Principal Component 1'],
        reduced_df[reduced_df['Label'] == label]['Principal Component 2'],
        label=label_names[label],
        color=colors[i]
    )
plt.title('PCA on Iris Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.legend()
plt.grid()
plt.show()
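A small optional check, not part of the original steps: fitted PCA objects in scikit-learn expose explained_variance_ratio_, which shows how much of the total variance the two components retain.
print("Explained variance ratio:", pca.explained_variance_ratio_)
print("Total variance retained:", pca.explained_variance_ratio_.sum())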
=========================================
#EXP-4/Find-S algorithm
import pandas as pd

def find_s_algorithm(data_file):
    # Step 1: Load training data from CSV
    data = pd.read_csv(data_file)

    # Step 2: Print the training data
    print("data_file:")
    print(data)

    # Step 3: Identify the feature columns and the target attribute
    # Assuming the last column is the target
    target_column = data.columns[-1]
    # All columns except the target
    features = data.columns[:-1]

    # Step 4: Keep only the positive examples
    # Assuming 'Yes' represents positive examples
    positive_data = data[data[target_column] == 'Yes']

    # Step 5: Initialize the hypothesis with the first positive example
    hypothesis = positive_data.iloc[0, :-1].values

    # Step 6: Update the hypothesis based on all positive examples
    for i, example in positive_data.iterrows():
        # Compare and generalize the hypothesis if necessary
        for j in range(len(hypothesis)):
            if hypothesis[j] != example[features[j]]:
                # Use '?' to generalize when mismatches occur
                hypothesis[j] = '?'

    # Step 7: Output the final hypothesis
    return hypothesis

# Step 8: Path to the CSV file
data_file = r'C:/Users/91943/Desktop/KITM/ML LAB/training_data.csv'

# Step 9: Run the algorithm
final_hypothesis = find_s_algorithm(data_file)

# Step 10: Print the final hypothesis
print("\nFinal Hypothesis:", final_hypothesis)
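The CSV path above points to a local file that is not included here. As a purely hypothetical illustration (not the actual lab file), a training_data.csv in the classic "EnjoySport" style could look like this, with the last column as the target and 'Yes' marking positive examples:
Sky,AirTemp,Humidity,Wind,Water,Forecast,EnjoySport
Sunny,Warm,Normal,Strong,Warm,Same,Yes
Sunny,Warm,High,Strong,Warm,Same,Yes
Rainy,Cold,High,Strong,Warm,Change,No
Sunny,Warm,High,Strong,Cool,Change,Yes
For this sample, Find-S would return the hypothesis ['Sunny' 'Warm' '?' 'Strong' '?' '?'].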
=========================================
#EXP-5:/ k-Nearest Neighbour algorithm
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
import matplotlib.pyplot as plt

# Step 1: Generate 100 random x values in the range [0, 1]
x = np.random.rand(100).reshape(-1, 1)

# Step 2: Assign labels to the first 50 values
y = np.array(['Class 1' if xi <= 0.5 else 'Class 2' for xi in x[:50].flatten()])

# Step 3: Prepare training and test data
X_train = x[:50]
X_test = x[50:]

# Step 4: Test for different k values
k_values = [1, 2, 3, 4, 5, 20, 30]
results = {}

# Step 5: Apply KNN for each k
for k in k_values:
    knn = KNeighborsClassifier(n_neighbors=k)
    knn.fit(X_train, y)
    y_pred = knn.predict(X_test)
    # Dictionary mapping each k to the predicted classes for x51..x100
    results[k] = y_pred

# Step 6: Display results
for k in k_values:
    print(f"\nK = {k} classification results:")
    for i, pred in enumerate(results[k]):
        print(f"x{51 + i}: {X_test[i][0]:.3f} -> {pred}")

# Step 7: Plot for visual understanding (Optional)
plt.figure(figsize=(10, 6))
plt.scatter(X_train, [0]*50, c=['red' if yi == 'Class 1' else 'blue' for yi in y],
            label='Training Data')
for k in k_values:
    y_pred_k = results[k]
    plt.scatter(X_test, [k]*50, c=['red' if yi == 'Class 1' else 'blue' for yi in y_pred_k],
                marker='x', label=f'Test (k={k})')
plt.yticks(k_values + [0])
plt.xlabel("x value")
plt.ylabel("k (for visualization only)")
plt.legend()
plt.title("KNN Classification of x values")
plt.grid(True)
plt.show()
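The experiment only prints the predicted class of each test point. As an optional, hedged check (assuming the same 0.5 threshold that generated the training labels also defines the true class of the test points), the accuracy for each k could be computed as follows:
y_test_true = np.array(['Class 1' if xi <= 0.5 else 'Class 2' for xi in X_test.flatten()])
for k in k_values:
    acc = np.mean(results[k] == y_test_true)
    print(f"k = {k}: accuracy = {acc:.2f}")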
=========================================
#EXP-6/Locally Weighted Regression algorithm
import numpy as np
import matplotlib.pyplot as plt

# Step 1: Generate a sample nonlinear dataset
np.random.seed(42)
X = np.linspace(-3, 3, 100).reshape(-1, 1)
y = np.sin(X).ravel() + np.random.normal(scale=0.1, size=X.shape[0])

# Step 2: Locally Weighted Regression function
def locally_weighted_regression(x_query, X_train, y_train, tau):
    # Step 3: Compute weights using a Gaussian kernel
    W = np.exp(-((X_train - x_query) ** 2) / (2 * tau ** 2))

    # Step 4: Add a bias (intercept) term for linear regression
    X_bias = np.c_[np.ones_like(X_train), X_train]

    # Step 5: Weighted linear regression: θ = (X^T W X)^(-1) X^T W y
    theta = (np.linalg.pinv(X_bias.T @ np.diag(W.ravel()) @ X_bias)
             @ X_bias.T @ np.diag(W.ravel()) @ y_train)

    # Predict y for x_query
    return np.array([1, x_query]) @ theta

# Step 6: Prediction over a dense grid
X_test = np.linspace(-3, 3, 100)
y_pred = np.array([locally_weighted_regression(x, X, y, tau=0.5) for x in X_test])

# Step 7: Plot the result
plt.scatter(X, y, color="gray", alpha=0.5, label="Training Data")
plt.plot(X_test, y_pred, color="red", linewidth=2, label="LWR Fit (τ=0.5)")
plt.legend()
plt.show()
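Optional extension, not in the original listing: the effect of the bandwidth parameter τ can be visualized by repeating the fit for a few values and plotting the resulting curves together.
plt.scatter(X, y, color="gray", alpha=0.5, label="Training Data")
for tau in [0.1, 0.5, 1.0]:
    y_pred_tau = np.array([locally_weighted_regression(x, X, y, tau=tau) for x in X_test])
    plt.plot(X_test, y_pred_tau, linewidth=2, label=f"tau = {tau}")
plt.legend()
plt.title("LWR fits for different bandwidths")
plt.show()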
=========================================
#EXP-7: Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Step 1: Load the Boston Housing dataset
boston_df = pd.read_csv(r"C:\Users\91943\Downloads\boston_housing_data.csv")
print("Linear Regression on Boston Housing Dataset")

# Step 2: Choose one feature for visualization
# Using 'RM' (average number of rooms) as the feature
X = boston_df[['RM']]

# Step 3: Assign the target variable
y = boston_df['MEDV']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 5: Create and train the Linear Regression model
LR_model = LinearRegression()
LR_model.fit(X_train, y_train)

# Step 6: Make predictions
y_pred = LR_model.predict(X_test)

# Step 7: Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse:.4f}")
print(f"R^2 Score: {r2:.4f}")

# Step 8: Plot the results
plt.scatter(X_test, y_test, color='green', label='Actual')
plt.plot(X_test, y_pred, color='red', label='Predicted')
plt.xlabel('Average Number of Rooms (RM)')
plt.ylabel('House Price (MEDV)')
plt.title('Linear Regression on Boston Housing Dataset')
plt.legend()
plt.show()
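A small optional addition, not part of the original listing: the fitted slope and intercept, and a prediction for a hypothetical 6-room dwelling (an illustrative value, not from the lab data), can be read straight from the trained model.
print(f"Slope (price change per room): {LR_model.coef_[0]:.3f}")
print(f"Intercept: {LR_model.intercept_:.3f}")
print(f"Predicted MEDV for RM = 6: {LR_model.predict(pd.DataFrame({'RM': [6.0]}))[0]:.2f}")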
=========================================
#EXP-7: Polynomial Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Step 1: Load the Auto MPG dataset
auto_mpg = pd.read_csv(r"C:\Users\91943\Downloads\auto-mpg.csv")
auto_mpg.dropna(inplace=True)

# Step 2: Remove missing values coded as '?' and convert horsepower to float
auto_mpg = auto_mpg[auto_mpg['horsepower'] != '?']
auto_mpg['horsepower'] = auto_mpg['horsepower'].astype(float)

# Step 3: Predictor and target
X_auto = auto_mpg[['horsepower']]
y_auto = auto_mpg['mpg']

# Step 4: Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_auto, y_auto, test_size=0.2, random_state=42)

# Step 5: Create the polynomial features (degree 2)
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

# Step 6: Create an instance of LinearRegression from scikit-learn
# and fit the linear model to the transformed training data
poly_reg = LinearRegression()
poly_reg.fit(X_train_poly, y_train)

# Step 7: Predict the target variable (MPG) on the test data
y_pred_poly = poly_reg.predict(X_test_poly)

# Step 8: Plot the results
plt.scatter(X_test, y_test, color="blue", label="Actual Data", alpha=0.5)
plt.scatter(X_test, y_pred_poly, color="red", label="Predicted Data", alpha=0.5)
plt.xlabel("Horsepower")
plt.ylabel("MPG")
plt.legend()
plt.title("Polynomial Regression on Auto MPG Dataset")
plt.show()
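mean_squared_error is imported above but never used; an optional evaluation step (a sketch, not part of the original listing) would be:
mse_poly = mean_squared_error(y_test, y_pred_poly)
print(f"Polynomial Regression MSE: {mse_poly:.4f}")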
=========================================
#EXP-8/Decision tree algorithm
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Step 1: Load the dataset
data = load_breast_cancer()

# Step 2: Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(data.data, data.target,
                                                    test_size=0.2, random_state=42)

# Step 3: Train the Decision Tree classifier
model = DecisionTreeClassifier(max_depth=4, random_state=42)
model.fit(X_train, y_train)

# Step 4: Evaluate the model
print(f"Accuracy: {accuracy_score(y_test, model.predict(X_test)):.2f}")

# Step 5: Visualize the decision tree
plot_tree(model, filled=True, feature_names=data.feature_names,
          class_names=data.target_names)
plt.show()

# Step 6: Classify a new sample
sample = [[15.3, 20.5, 85.2, 521, 0.1, 0.1, 0.08, 0.18, 0.19, 0.06,
           0.32, 0.43, 2.5, 20.3, 0.007, 0.018, 0.016, 0.003, 0.015, 0.002,
           17.5, 25.0, 110.5, 900, 0.14, 0.25, 0.18, 0.25, 0.27, 0.09]]

# Step 7: Print the predicted class
print("Predicted Class:", data.target_names[model.predict(sample)[0]])
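An optional look at the fitted model, not part of the original steps: scikit-learn tree classifiers expose feature_importances_, which shows which measurements drive the splits (this reuses numpy, data, and model from the block above).
top = np.argsort(model.feature_importances_)[::-1][:5]
for idx in top:
    print(f"{data.feature_names[idx]}: {model.feature_importances_[idx]:.3f}")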
=========================================
#EXP-9/Naive Bayesian classifier
from sklearn.datasets import fetch_olivetti_faces
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Step 1: Load the Olivetti face dataset
data = fetch_olivetti_faces()
X = data.data    # Each image is flattened (64x64 = 4096 pixels)
y = data.target  # Labels: person IDs (0–39)

# Step 2: Split data into training and test sets
# Use 7 images per person for training, 3 for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# Step 3: Train the Naive Bayes classifier
nb_classifier = GaussianNB()
nb_classifier.fit(X_train, y_train)

# Step 4: Predict on the test set
y_pred = nb_classifier.predict(X_test)

# Step 5: Print the accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f"Naive Bayes Classifier Accuracy: {accuracy * 100:.2f}%")

# Step 6: Show a few test faces with predictions (Optional)
plt.figure(figsize=(10, 4))
for i in range(8):
    plt.subplot(2, 4, i+1)
    plt.imshow(X_test[i].reshape(64, 64), cmap='gray')
    plt.title(f"Pred: {y_pred[i]}\nTrue: {y_test[i]}")
    plt.axis('off')
plt.tight_layout()
plt.show()
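Optional and not part of the original steps: scikit-learn's classification_report gives a per-person precision/recall breakdown (with 40 subjects the output is long).
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))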
=========================================
#EXP-10/k-means clustering
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import load_breast_cancer
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix, classification_report

# Step 1: Load the dataset
data = load_breast_cancer()
X = data.data
y = data.target

# Step 2: Standardize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Step 3: Perform K-Means clustering
kmeans = KMeans(n_clusters=2, random_state=42)
y_kmeans = kmeans.fit_predict(X_scaled)

# Step 4: Print the confusion matrix and report (optional)
print("Confusion Matrix:")
print(confusion_matrix(y, y_kmeans))
print("\nClassification Report:")
print(classification_report(y, y_kmeans))

# Step 5: Reduce dimensions for visualization using PCA
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)

# Step 6: Create a DataFrame for plotting
df = pd.DataFrame(X_pca, columns=['PC1', 'PC2'])
df['Cluster'] = y_kmeans
df['True Label'] = y

# Step 7: Plot clustering results
plt.figure(figsize=(12, 5))

# Clustering result
plt.subplot(1, 2, 1)
sns.scatterplot(data=df, x='PC1', y='PC2', hue='Cluster', palette='Set2')
plt.title('K-Means Clustering Result')

# Step 8: Actual labels
plt.subplot(1, 2, 2)
sns.scatterplot(data=df, x='PC1', y='PC2', hue='True Label', palette='Set1')
plt.title('Actual Cancer Types')
plt.tight_layout()
plt.show()
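Because K-Means cluster IDs are arbitrary (cluster 0 is not guaranteed to correspond to the malignant class), the confusion matrix and classification report above can look poor even for a good clustering. An optional, label-permutation-invariant check, not part of the original steps, is the adjusted Rand index:
from sklearn.metrics import adjusted_rand_score
print("Adjusted Rand Index:", adjusted_rand_score(y, y_kmeans))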
***** Note: the lines shown in red in the original document (marked "Optional" in the listings above) are optional.