1. Basic Data Preprocessing
a. Installation of Python environment/Anaconda IDE for machine learning and installing Python modules/packages like scikit-learn, Keras, and TensorFlow.
b. Programs involving the pandas, NumPy, and SciPy libraries.
a. Installation of Python environment/Anaconda IDE for machine learning and installing Python modules/packages like scikit-learn, Keras, and TensorFlow.
Program:
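The environment is set up by installing the Anaconda distribution (or a plain Python interpreter) and then adding the packages, typically with pip install scikit-learn keras tensorflow pandas numpy scipy or the equivalent conda install command. As a minimal check that the installation succeeded (assuming the packages above were installed), the following program imports each library and prints its version:

import sklearn
import keras
import tensorflow
import pandas
import numpy
import scipy

# Print each package's version to confirm the installation
print("scikit-learn:", sklearn.__version__)
print("Keras:", keras.__version__)
print("TensorFlow:", tensorflow.__version__)
print("pandas:", pandas.__version__)
print("NumPy:", numpy.__version__)
print("SciPy:", scipy.__version__)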
Output:
b. Programs involving the pandas, NumPy, and SciPy libraries
i. Pandas
Program:
import pandas as pd

# Create a sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, None, 30, 35, 40],
    'Gender': ['Female', 'Male', None, 'Male', 'Female'],
    'Salary': [50000, 60000, None, 80000, 70000]
}
df = pd.DataFrame(data)

# Fill missing values (plain assignment is used instead of inplace=True,
# which triggers a chained-assignment warning in recent pandas versions)
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(0)

# Encode categorical data
df['Gender'] = df['Gender'].fillna('Unknown')  # Fill missing Gender with 'Unknown'
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1, 'Unknown': 2})  # Map Gender values

# Add a new column: Salary converted to lakhs
df['Salary_in_Lakhs'] = df['Salary'] / 100000

print("\nPreprocessed Dataset:")
print(df)
Output:
ii. NumPy
Program:
import numpy as np

# Example data with missing values and a categorical feature.
# Mixing numbers and strings gives a string array, so np.nan is stored
# as the string 'nan'; astype(float) below converts it back to NaN.
data = np.array([
    [25, 5.5, 60, 'male'],
    [30, 6.0, 75, 'female'],
    [35, 5.9, np.nan, 'male'],
    [40, 6.1, 85, 'female'],
    [45, 5.8, 95, 'male'],
])

# Replace missing values (NaN) with the column mean
def handle_missing_data(data):
    data_float = data[:, :-1].astype(float)  # Ignore the categorical column
    column_means = np.nanmean(data_float, axis=0)
    inds = np.where(np.isnan(data_float))
    data_float[inds] = np.take(column_means, inds[1])
    return data_float

# Normalize the features (scale them to the range 0-1)
def normalize_data(data):
    min_vals = data.min(axis=0)
    max_vals = data.max(axis=0)
    return (data - min_vals) / (max_vals - min_vals)

# Encode categorical data (convert 'male'/'female' to 0/1)
def encode_categorical(data):
    gender = data[:, -1]  # Extract the last column (gender)
    gender_encoded = np.where(gender == 'male', 0, 1)
    return gender_encoded

# Data preprocessing steps
processed_data = handle_missing_data(data)
normalized_data = normalize_data(processed_data)
encoded_gender = encode_categorical(data)

# Final processed data
print("Processed Data (Missing Values Handled):\n", processed_data)
print("\nNormalized Data (Scaled between 0 and 1):\n", normalized_data)
print("\nEncoded Gender Data (0: Male, 1: Female):\n", encoded_gender)
Output:
iii. SciPy
Program:
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# Example data with missing values and a categorical feature
data = np.array([
    [25, 5.5, 60, 'male'],
    [30, 6.0, 75, 'female'],
    [35, 5.9, np.nan, 'male'],
    [40, 6.1, 85, 'female'],
    [45, 5.8, 95, 'male'],
])

# Replace missing values (NaN) with the column mean
def handle_missing_data(data):
    data_float = data[:, :-1].astype(float)  # Exclude the categorical column
    col_means = np.nanmean(data_float, axis=0)
    inds = np.where(np.isnan(data_float))
    data_float[inds] = np.take(col_means, inds[1])
    return data_float

# Standardize the features (Z-score normalization)
def standardize_data(data):
    return stats.zscore(data, axis=0)

# Encode categorical data; LabelEncoder assigns codes alphabetically,
# so 'female' -> 0 and 'male' -> 1
def encode_categorical(data):
    gender = data[:, -1]  # Extract the last column (gender)
    label_encoder = LabelEncoder()
    gender_encoded = label_encoder.fit_transform(gender)
    return gender_encoded

# Data preprocessing steps
processed_data = handle_missing_data(data)
standardized_data = standardize_data(processed_data)
encoded_gender = encode_categorical(data)

# Final processed data
print("Processed Data (Missing Values Handled):\n", processed_data)
print("\nStandardized Data (Z-score normalization):\n", standardized_data)
print("\nEncoded Gender Data (0: Female, 1: Male):\n", encoded_gender)
Output:
2. Programs for classification
a. Build models using linear regression and logistic regression and apply them to classify a new instance.
b. Write a program to demonstrate the following classifiers. Use an appropriate dataset for building the model. Apply the model to classify a new instance.
i. Decision tree
ii. K-Nearest Neighbour
iii. Naïve Bayes
iv. Support Vector Machine
a. Build models using linear regression and logistic regression and apply them to classify a new instance.
i. Linear regression
Program:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the iris dataset
iris = load_iris()

# For linear regression, predict petal length (column 2) from the other
# three features; petal length is removed from X so that the target does
# not leak into the inputs
X = np.delete(iris.data, 2, axis=1)  # sepal length, sepal width, petal width
y_continuous = iris.data[:, 2]       # petal length

X_train, X_test, y_train, y_test = train_test_split(
    X, y_continuous, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)
y_pred = lr.predict(X_test)

print("Linear Regression (Iris Dataset - Petal Length):")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Predict for a new instance (sepal length, sepal width, petal width)
new_instance = [[5.1, 3.5, 0.2]]
predicted_petal_length = lr.predict(new_instance)
print("Predicted Petal Length for New Instance:", predicted_petal_length)
Output:
ii. Logistic regression
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
iris = load_iris()
X = iris.data
y = iris.target
# Use the original target for Logistic Regression (species classification)
y_classification = iris.target # Species labels (0, 1, 2 for setosa, versicolor, virginica)
X_train, X_test, y_train, y_test = train_test_split(
    X, y_classification, test_size=0.2, random_state=42)
log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)
y_pred_log = log_reg.predict(X_test)
print("\nLogistic Regression (Iris Dataset - Species Classification):")
print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Classification Report:\n", classification_report(y_test, y_pred_log))
# Predict for a new instance
new_instance = [[5.1, 3.5, 1.4, 0.2]] # Example feature values
predicted_species = log_reg.predict(new_instance)
species = iris.target_names[predicted_species][0]
print("Predicted Species for New Instance:", species)
Output:
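Note: besides the predicted label, the model's confidence can also be inspected. A short optional check, reusing the log_reg model and new_instance defined above:

# Probability of each species (setosa, versicolor, virginica) for the new instance
print("Class Probabilities:", log_reg.predict_proba(new_instance))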
b. Write a program to demonstrate the following classifiers. Use an appropriate dataset for building the model. Apply the model to classify a new instance.
i. Decision Tree
Program:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target # Target – classes of wine
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)
print("Decision Tree Classifier (Wine Dataset):")
print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))
# Predict for a new instance
new_instance = [[13.4, 2.5, 2.6, 19.4, 100.0, 2.9, 2.5, 0.3, 1.3, 3.0, 1.0, 2.2, 680]]
dt_prediction = clf.predict(new_instance)
print("Predicted Wine Class with Decision Tree:", wine.target_names[dt_prediction][0])
Output:
ii. K-Nearest Neighbour
Program:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load the Digits dataset
digits = load_digits()
X = digits.data # Features (flattened pixel values)
y = digits.target # Target (digits 0-9)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)
print("K-Nearest Neighbor Classifier (Digits Dataset):")
print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))
# Predict for a new instance with 64 features (flattened 8x8 pixel values)
new_instance = [[ 0,  0,  6, 15, 14,  3,  0,  0,
                  0,  0,  7, 16, 13,  5,  0,  0,
                  0,  9, 16, 14,  5,  0,  0, 11,
                 14,  7,  0,  0,  2, 14, 15,  0,
                  0,  0,  9, 16, 10,  3,  0,  0,
                  0,  0,  7, 14, 13,  4,  0,  0,
                  1, 12, 13,  0,  0,  0, 12, 12,
                  0,  0,  1, 15, 16,  0,  0,  0]]
knn_prediction = clf.predict(new_instance)
print("Predicted Digit with K-Nearest Neighbor:", knn_prediction[0])
Output:
iii. Naïve Bayes
Program:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
# Load the Breast Cancer dataset
cancer = load_breast_cancer()
X = cancer.data # Features (30 features)
y = cancer.target # Target (benign or malignant)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = GaussianNB()
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)
print("Naïve Bayes Classifier (Breast Cancer Dataset):")
print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))
# Predict for a new instance (30 features)
new_instance = [[15.0, 10.0, 110.0, 0.15, 0.1, 0.08, 0.3, 0.25, 0.7, 0.5, 0.8, 1.2, 0.5, 1.0, 0.6,
0.8, 1.0, 0.3, 1.2, 1.0, 0.4, 0.6, 0.9, 0.5, 0.7, 14.0, 1.2, 0.4, 0.1, 0.3]]
nb_prediction = clf.predict(new_instance)
print("Predicted Class with Naïve Bayes:", "Malignant" if nb_prediction[0] == 1 else
"Benign")
Output:
iv. Support Vector Machine
Program:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
wine = load_wine()
X = wine.data
y = wine.target # Target - classes of wine
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)
y_pred_clf = clf.predict(X_test)
print("Support Vector Machine Classifier (Wine Dataset):")
print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))
# Predict for a new instance
new_instance = [[13.4, 2.5, 2.6, 19.4, 100.0, 2.9, 2.5, 0.3, 1.3, 3.0, 1.0, 2.2, 680]]
svm_prediction = clf.predict(new_instance)
print("Predicted Wine Class with SVM:", wine.target_names[svm_prediction][0])
Output:
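Note: SVMs are sensitive to the scale of the input features, and the wine features span very different ranges. A minimal variant, reusing the same train/test split and imports as above, standardizes the features in a pipeline before fitting:

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# Standardize features to zero mean and unit variance before the SVM
clf_scaled = make_pipeline(StandardScaler(), SVC(kernel='linear', random_state=42))
clf_scaled.fit(X_train, y_train)
print("Accuracy with scaling:", accuracy_score(y_test, clf_scaled.predict(X_test)))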
3. Demonstration of clustering algorithms using
a. K-means
b. Hierarchical algorithms
a. K-means Algorithm
Program:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
iris = load_iris()
X = iris.data
y = iris.target
# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42) # We know there are 3 species
kmeans.fit(X)
# Predict the clusters
y_kmeans = kmeans.predict(X)
# Visualizing the clusters
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))
# Plotting using the first two features for simplicity
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis')
centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='x') # centroids
plt.title('K-Means Clustering on Iris Dataset')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.show()
# Evaluation (comparing with actual labels)
df = pd.DataFrame({'True Label': y, 'Cluster Label': y_kmeans})
print(df.head())
Output:
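Note: since the true species labels are available here, the agreement between clusters and species can also be summarized as a single score. A short sketch using scikit-learn's adjusted Rand index (1.0 means a perfect match, values near 0 mean essentially random labelling):

from sklearn.metrics import adjusted_rand_score

# Compare the K-means cluster assignments with the true species labels
print("Adjusted Rand Index:", adjusted_rand_score(y, y_kmeans))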
b. Hierarchical algorithm – Agglomerative Clustering
Program:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
X = iris.data
y = iris.target
# Apply Agglomerative Clustering
agg_clust = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')  # metric= requires scikit-learn >= 1.2 (older versions use affinity=)
agg_labels = agg_clust.fit_predict(X)
# Visualizing the Hierarchical Clustering using Dendrogram
plt.figure(figsize=(10, 6))
sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()
# Evaluating the clustering results
df_agg = pd.DataFrame({'True Label': y, 'Agglomerative Label': agg_labels})
print(df_agg.head())
Output:
4. Demonstrate ensemble techniques like boosting, bagging, and random forests.
i. Boosting
Program:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create an AdaBoost classifier with a Decision Tree as the base classifier
adaboost_classifier = AdaBoostClassifier(
    estimator=DecisionTreeClassifier(max_depth=1),  # estimator= requires scikit-learn >= 1.2 (older versions use base_estimator=)
    n_estimators=50, random_state=42)
adaboost_classifier.fit(X_train, y_train)
y_pred_adaboost = adaboost_classifier.predict(X_test)
# Evaluate the model
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
print(f'AdaBoost Classifier Accuracy: {accuracy_adaboost:.2f}')
Output:
ii. Bagging
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a Bagging classifier with a Decision Tree as the base classifier
bagging_classifier = BaggingClassifier(
    estimator=DecisionTreeClassifier(),  # estimator= requires scikit-learn >= 1.2 (older versions use base_estimator=)
    n_estimators=50, random_state=42)
bagging_classifier.fit(X_train, y_train)
y_pred = bagging_classifier.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Bagging Classifier Accuracy: {accuracy:.2f}')
Output:
iii. Random Forests
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier.fit(X_train, y_train)
y_pred_rf = rf_classifier.predict(X_test)
# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Classifier Accuracy: {accuracy_rf:.2f}')
Output:
5. Build a classifier and compare its performance with an ensemble technique like random forest.
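A minimal sketch, assuming a single decision tree as the baseline classifier and the iris dataset (any of the classifiers and datasets from the earlier experiments could be substituted):
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline: a single decision tree
dt_classifier = DecisionTreeClassifier(random_state=42)
dt_classifier.fit(X_train, y_train)
accuracy_dt = accuracy_score(y_test, dt_classifier.predict(X_test))

# Ensemble: a random forest of 50 trees trained on the same split
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)
rf_classifier.fit(X_train, y_train)
accuracy_rf = accuracy_score(y_test, rf_classifier.predict(X_test))

print(f'Decision Tree Accuracy: {accuracy_dt:.2f}')
print(f'Random Forest Accuracy: {accuracy_rf:.2f}')
Output: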