
1. Basic Data Preprocessing

a. Installation of a Python environment/Anaconda IDE for machine learning and
installation of Python modules/packages like scikit-learn, Keras and TensorFlow.
b. Programs involving the pandas, NumPy and SciPy libraries.

a. Installation of a Python environment/Anaconda IDE for machine learning and
installation of Python modules/packages like scikit-learn, Keras and TensorFlow.
Program:
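Installation itself is done through the Anaconda installer/Navigator or from the command line with conda or pip (for example, pip install scikit-learn keras tensorflow). As a minimal verification sketch (assuming the packages installed successfully), the following program imports each package and prints its version:

# Verify the installation: import each package and print its version
# (assumes scikit-learn, TensorFlow, Keras, NumPy, SciPy and pandas are already installed)
import sklearn
import tensorflow as tf
import keras
import numpy
import scipy
import pandas

print("scikit-learn:", sklearn.__version__)
print("TensorFlow:", tf.__version__)
print("Keras:", keras.__version__)
print("NumPy:", numpy.__version__)
print("SciPy:", scipy.__version__)
print("pandas:", pandas.__version__)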

Output:
b. Programs involving the pandas, NumPy and SciPy libraries
i. Pandas
Program:
import pandas as pd

# Create a sample dataset with missing values
data = {
    'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'Age': [25, None, 30, 35, 40],
    'Gender': ['Female', 'Male', None, 'Male', 'Female'],
    'Salary': [50000, 60000, None, 80000, 70000]
}

df = pd.DataFrame(data)

# Fill missing values
df['Age'] = df['Age'].fillna(df['Age'].mean())
df['Salary'] = df['Salary'].fillna(0)

# Encode categorical data
df['Gender'] = df['Gender'].fillna('Unknown')  # Fill missing Gender with 'Unknown'
df['Gender'] = df['Gender'].map({'Female': 0, 'Male': 1, 'Unknown': 2})  # Map Gender values to integers
df['Salary_in_Lakhs'] = df['Salary'] / 100000  # Add a new column: Salary converted to lakhs

print("\nPreprocessed Dataset:")
print(df)

Output:
ii. Numpy
Program:
import numpy as np

# Example data with missing values and a categorical feature
data = np.array([
    [25, 5.5, 60, 'male'],
    [30, 6.0, 75, 'female'],
    [35, 5.9, np.nan, 'male'],
    [40, 6.1, 85, 'female'],
    [45, 5.8, 95, 'male'],
])

def handle_missing_data(data):
    # Convert the data to a float array for processing (ignore the categorical column)
    data_float = data[:, :-1].astype(float)
    # Replace missing values (np.nan) with the column mean
    column_means = np.nanmean(data_float, axis=0)
    inds = np.where(np.isnan(data_float))
    data_float[inds] = np.take(column_means, inds[1])
    return data_float

# Normalize the features (scale them to a range of 0-1)
def normalize_data(data):
    min_vals = data.min(axis=0)
    max_vals = data.max(axis=0)
    return (data - min_vals) / (max_vals - min_vals)

# Encode categorical data (convert 'male'/'female' to 0/1)
def encode_categorical(data):
    gender = data[:, -1]  # Extract the last column (gender)
    gender_encoded = np.where(gender == 'male', 0, 1)
    return gender_encoded

# Data preprocessing steps
processed_data = handle_missing_data(data)
normalized_data = normalize_data(processed_data)
encoded_gender = encode_categorical(data)

# Final processed data
print("Processed Data (Missing Values Handled):\n", processed_data)
print("\nNormalized Data (Scaled between 0 and 1):\n", normalized_data)
print("\nEncoded Gender Data (0: Male, 1: Female):\n", encoded_gender)
Output:
iii. Scipy
Program:
import numpy as np
from scipy import stats
from sklearn.preprocessing import LabelEncoder

# Example data with missing values and a categorical feature
data = np.array([
    [25, 5.5, 60, 'male'],
    [30, 6.0, 75, 'female'],
    [35, 5.9, np.nan, 'male'],
    [40, 6.1, 85, 'female'],
    [45, 5.8, 95, 'male'],
])

def handle_missing_data(data):
    data_float = data[:, :-1].astype(float)  # Exclude the categorical column
    col_means = np.nanmean(data_float, axis=0)
    # Replace missing values (np.nan) with the column mean
    inds = np.where(np.isnan(data_float))
    data_float[inds] = np.take(col_means, inds[1])
    return data_float

# Standardize (Z-score normalization) the features
def standardize_data(data):
    return stats.zscore(data, axis=0)

# Encode categorical data (LabelEncoder assigns 0 to 'female' and 1 to 'male')
def encode_categorical(data):
    gender = data[:, -1]  # Extract the last column (gender)
    label_encoder = LabelEncoder()
    gender_encoded = label_encoder.fit_transform(gender)
    return gender_encoded

# Data preprocessing steps
processed_data = handle_missing_data(data)
standardized_data = standardize_data(processed_data)
encoded_gender = encode_categorical(data)

# Final processed data
print("Processed Data (Missing Values Handled):\n", processed_data)
print("\nStandardized Data (Z-score normalization):\n", standardized_data)
print("\nEncoded Gender Data (0: Female, 1: Male):\n", encoded_gender)
Output:
2. Programs for classification
a. Build models using linear regression and logistic regression and apply them
to classify a new instance.
b. Write a program to demonstrate the following classifiers. Use an
appropriate dataset for building the model. Apply the model to classify
a new instance.
i. Decision tree
ii. K-Nearest Neighbour
iii. Naïve Bayes
iv. Support Vector Machine

a. Build models using linear regression and logistic regression and apply them to
classify a new instance.
i. Linear regression
Program:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Load the iris dataset
iris = load_iris()

# For linear regression, predict petal length (a continuous value) from the remaining features
y_continuous = iris.data[:, 2]       # Petal length is at index 2
X = np.delete(iris.data, 2, axis=1)  # Sepal length, sepal width, petal width

X_train, X_test, y_train, y_test = train_test_split(X, y_continuous, test_size=0.2, random_state=42)

lr = LinearRegression()
lr.fit(X_train, y_train)

y_pred = lr.predict(X_test)

print("Linear Regression (Iris Dataset - Petal Length):")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))

# Predict for a new instance (sepal length, sepal width, petal width)
new_instance = [[5.1, 3.5, 0.2]]  # Example feature values
predicted_petal_length = lr.predict(new_instance)
print("Predicted Petal Length for New Instance:", predicted_petal_length)
Output:

ii. Logistic regression
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

iris = load_iris()
X = iris.data
y = iris.target

# Use the original target for logistic regression (species classification)
y_classification = iris.target  # Species labels: 0 = setosa, 1 = versicolor, 2 = virginica

X_train, X_test, y_train, y_test = train_test_split(X, y_classification, test_size=0.2, random_state=42)

log_reg = LogisticRegression(max_iter=200)
log_reg.fit(X_train, y_train)

y_pred_log = log_reg.predict(X_test)

print("\nLogistic Regression (Iris Dataset - Species Classification):")


print("Accuracy:", accuracy_score(y_test, y_pred_log))
print("Classification Report:\n", classification_report(y_test, y_pred_log))

# Predict for a new instance
new_instance = [[5.1, 3.5, 1.4, 0.2]] # Example feature values
predicted_species = log_reg.predict(new_instance)
species = iris.target_names[predicted_species][0]
print("Predicted Species for New Instance:", species)
Output:
b. Write a program to demonstrate the following classifiers. Use an appropriate
dataset for building the model. Apply the model to classify a new instance.
i. Decision Tree
Program:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the wine dataset
wine = load_wine()
X = wine.data
y = wine.target # Target – classes of wine

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = DecisionTreeClassifier(random_state=42)
clf.fit(X_train, y_train)

y_pred_clf = clf.predict(X_test)

print("Decision Tree Classifier (Wine Dataset):")


print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))

# Predict for a new instance


new_instance = [[13.4, 2.5, 2.6, 19.4, 100.0, 2.9, 2.5, 0.3, 1.3, 3.0, 1.0, 2.2, 680]]
dt_prediction = clf.predict(new_instance)
print("Predicted Wine Class with Decision Tree:", wine.target_names[dt_prediction][0])

Output:
ii. K-Nearest Neighbour
Program:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load the Digits dataset
digits = load_digits()
X = digits.data # Features (flattened pixel values)
y = digits.target # Target (digits 0-9)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train, y_train)

y_pred_clf = clf.predict(X_test)

print("K-Nearest Neighbor Classifier (Digits Dataset):")


print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))

# Predict for a new instance with 64 features (flattened 8x8 pixel values)
new_instance = [[0, 0, 6, 15, 14, 3, 0, 0, 0, 0, 7, 16, 13, 5, 0, 0, 0, 9, 16, 14, 5, 0, 0, 11, 14, 7,
0, 0, 2, 14, 15, 0, 0, 0, 9, 16, 10, 3, 0, 0, 0, 0, 7, 14, 13, 4, 0, 0, 1, 12, 13, 0, 0, 0, 12, 12, 0, 0,
1, 15, 16, 0, 0, 0]]
knn_prediction = clf.predict(new_instance)
print("Predicted Digit with K-Nearest Neighbor:", knn_prediction[0])
Output:
iii. Naïve Bayes
Program:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report

# Load the Breast Cancer dataset
cancer = load_breast_cancer()
X = cancer.data # Features (30 features)
y = cancer.target # Target (benign or malignant)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = GaussianNB()
clf.fit(X_train, y_train)

y_pred_clf = clf.predict(X_test)

print("Naïve Bayes Classifier (Breast Cancer Dataset):")


print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))

# Predict for a new instance (30 features)


new_instance = [[15.0, 10.0, 110.0, 0.15, 0.1, 0.08, 0.3, 0.25, 0.7, 0.5, 0.8, 1.2, 0.5, 1.0, 0.6,
0.8, 1.0, 0.3, 1.2, 1.0, 0.4, 0.6, 0.9, 0.5, 0.7, 14.0, 1.2, 0.4, 0.1, 0.3]]
nb_prediction = clf.predict(new_instance)
print("Predicted Class with Naïve Bayes:", "Malignant" if nb_prediction[0] == 1 else
"Benign")

Output:
iv. Support Vector Machine
Program:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

wine = load_wine()
X = wine.data
y = wine.target # Target - classes of wine

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = SVC(kernel='linear', random_state=42)
clf.fit(X_train, y_train)

y_pred_clf = clf.predict(X_test)

print("Support Vector Machine Classifier (Wine Dataset):")


print("Accuracy:", accuracy_score(y_test, y_pred_clf))
print("Classification Report:\n", classification_report(y_test, y_pred_clf))

# Predict for a new instance


new_instance = [[13.4, 2.5, 2.6, 19.4, 100.0, 2.9, 2.5, 0.3, 1.3, 3.0, 1.0, 2.2, 680]]
svm_prediction = clf.predict(new_instance)
print("Predicted Wine Class with SVM:", wine.target_names[svm_prediction][0])

Output:
3. Demonstration of clustering algorithms using
a. K-means
b. Hierarchical algorithms

a. K-means Algorithm
Program:
from sklearn.datasets import load_iris
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

iris = load_iris()
X = iris.data
y = iris.target

# Apply K-means clustering
kmeans = KMeans(n_clusters=3, random_state=42)  # We know there are 3 species
kmeans.fit(X)

# Predict the clusters
y_kmeans = kmeans.predict(X)

# Visualize the clusters
sns.set(style="whitegrid")
plt.figure(figsize=(10, 6))

# Plot using the first two features for simplicity
plt.scatter(X[:, 0], X[:, 1], c=y_kmeans, cmap='viridis')

centers = kmeans.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1], c='red', s=200, marker='x') # centroids
plt.title('K-Means Clustering on Iris Dataset')
plt.xlabel('Sepal Length')
plt.ylabel('Sepal Width')
plt.show()

# Evaluation (comparing with actual labels)
df = pd.DataFrame({'True Label': y, 'Cluster Label': y_kmeans})
print(df.head())
Output:
b. Hierarchical algorithm – Agglomerative Clustering
Program:
from sklearn.cluster import AgglomerativeClustering
import scipy.cluster.hierarchy as sch
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.datasets import load_iris

iris = load_iris()
X = iris.data
y = iris.target

# Apply Agglomerative Clustering
agg_clust = AgglomerativeClustering(n_clusters=3, metric='euclidean', linkage='ward')
agg_labels = agg_clust.fit_predict(X)

# Visualize the hierarchical clustering using a dendrogram
plt.figure(figsize=(10, 6))
sch.dendrogram(sch.linkage(X, method='ward'))
plt.title('Hierarchical Clustering Dendrogram')
plt.xlabel('Samples')
plt.ylabel('Distance')
plt.show()

# Evaluate the clustering results
df_agg = pd.DataFrame({'True Label': y, 'Agglomerative Label': agg_labels})
print(df_agg.head())

Output:
4. Demonstrate ensemble techniques like boosting, bagging, and random
forests.

i. Boosting
Program:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create an AdaBoost classifier with a Decision Tree as the base classifier
adaboost_classifier = AdaBoostClassifier(estimator=DecisionTreeClassifier(max_depth=1),
n_estimators=50, random_state=42)

adaboost_classifier.fit(X_train, y_train)

y_pred_adaboost = adaboost_classifier.predict(X_test)

# Evaluate the model
accuracy_adaboost = accuracy_score(y_test, y_pred_adaboost)
print(f'AdaBoost Classifier Accuracy: {accuracy_adaboost:.2f}')

Output:
ii. Bagging
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Bagging classifier with a Decision Tree as the base classifier
bagging_classifier = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=50,
random_state=42)

bagging_classifier.fit(X_train, y_train)

y_pred = bagging_classifier.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f'Bagging Classifier Accuracy: {accuracy:.2f}')

Output:
iii. Random Forests
Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Create a Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=50, random_state=42)

rf_classifier.fit(X_train, y_train)

y_pred_rf = rf_classifier.predict(X_test)

# Evaluate the model
accuracy_rf = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Classifier Accuracy: {accuracy_rf:.2f}')

Output:
5. Build a classifier and compare its performance with an ensemble technique
like random forest.
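
One possible approach is sketched below (the iris dataset, the decision tree baseline and the specific hyperparameters are illustrative choices, not prescribed by the exercise): train a single decision tree and a random forest on the same train/test split and compare their test accuracies.

Program:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

iris = load_iris()
X = iris.data
y = iris.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Baseline classifier: a single decision tree
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train, y_train)
dt_accuracy = accuracy_score(y_test, dt.predict(X_test))

# Ensemble technique: a random forest of 50 trees
rf = RandomForestClassifier(n_estimators=50, random_state=42)
rf.fit(X_train, y_train)
rf_accuracy = accuracy_score(y_test, rf.predict(X_test))

print(f'Decision Tree Accuracy: {dt_accuracy:.2f}')
print(f'Random Forest Accuracy: {rf_accuracy:.2f}')

Output: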
