# Machine Learning Lab Manual
#1 Install and set up Python and the essential libraries listed below,
# and display the version of each:
#   - NumPy
#   - pandas
#   - scikit-learn
#   - Matplotlib
#   - seaborn
!pip install numpy
!pip install pandas
!pip install -U scikit-learn
!pip install matplotlib
import numpy
import pandas
import sklearn
import matplotlib
#version
print(numpy.__version__)
print(pandas.__version__)
print(sklearn.__version__)
print(matplotlib.__version__)
#2 Introduce scikit-learn as a machine learning library
# Install scikit-learn, upgrading it (-U) when an older version is present.
# The leading "!" is notebook shell syntax, so run this line in a notebook cell.
!pip install -U scikit-learn
#3 Write a program to visualize the dataset to gain insights using Matplotlib or
# seaborn by plotting scatter plots and bar charts
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
def visualize_dataset(file_path):
    """Load a CSV file and plot a pairplot and a bar chart of its first column.

    Args:
        file_path: Path of the CSV file; passed straight to ``pd.read_csv``.

    The bar chart is drawn only when the first column has dtype ``object``;
    otherwise a message is printed instead. Nothing is returned.
    """
    # Load the dataset into a pandas DataFrame
    df = pd.read_csv(file_path)

    # Plot scatter plots (pairplot)
    sns.pairplot(df)
    # A pairplot is a grid of subplots, so plt.title() would label only the
    # current subplot. suptitle() titles the whole figure; y > 1 lifts the
    # text clear of the top row of plots.
    plt.suptitle("Pairplot of the Dataset", y=1.02)
    plt.show()

    # Plot bar chart for categorical column (assuming the first column is
    # categorical)
    if df.iloc[:, 0].dtype == 'object':
        sns.countplot(x=df.columns[0], data=df)
        plt.title("Bar Chart of Categorical Column")
        plt.xlabel(df.columns[0])
        plt.ylabel("Count")
        plt.show()
    else:
        print("No categorical column found to plot bar chart.")
# Example usage
# NOTE(review): this is an absolute Windows path on one particular machine;
# point it at a local copy of Iris.csv before running.
file_path = "C:\\Users\\NDC43\\Downloads\\Iris.csv"
visualize_dataset(file_path)
#4 Write a program to handle missing data, encode categorical variables, and
# perform feature scaling
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
# Build a DataFrame from the Iris data: one column per feature, with the
# class labels appended as the final 'target' column.
iris = load_iris()
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
def preprocess_dataset(df):
    """Simulate, then impute, missing values and standardize the features.

    Steps:
      1. Blank out every 10th value of the first column to simulate gaps.
      2. Replace missing values with the mean of their column.
      3. Standardize every column except the last one, which is treated as
         the 'target' column.

    Args:
        df: DataFrame whose last column is the target. Mean imputation is
            applied to every column, so all columns are expected to be
            numeric.

    Returns:
        A new preprocessed DataFrame; the DataFrame passed in is not modified.
    """
    # Work on a copy so the caller's DataFrame is not overwritten in place.
    df = df.copy()

    # Handle missing data (Iris dataset doesn't have missing values, but we'll
    # simulate some)
    df.iloc[::10, 0] = float('NaN')

    # Impute missing values
    imputer = SimpleImputer(strategy='mean')
    df[df.columns] = imputer.fit_transform(df[df.columns])

    # Encode categorical variables (if applicable)
    # Since the Iris dataset doesn't have categorical variables, we'll skip
    # this step

    # Perform feature scaling (excluding the 'target' column)
    scaler = StandardScaler()
    feature_columns = df.columns[:-1]
    df[feature_columns] = scaler.fit_transform(df[feature_columns])
    return df
# Run the preprocessing pipeline on the Iris frame and show the first rows
# of the result under a heading.
preprocessed_df = preprocess_dataset(iris_df)
print("Preprocessed dataset:", preprocessed_df.head(), sep="\n")
#5 Write a program to implement a k-Nearest Neighbours (k-NN) classifier using
# scikit-learn, train the classifier on the dataset, and evaluate its performance
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
# Load Iris dataset
iris = load_iris()
X = iris.data  # Features (independent variables)
y = iris.target  # Target labels (dependent variable)

# Split the dataset into training and testing sets (20% held out for testing;
# random_state fixes the shuffle so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize the k-NN classifier
k = 3  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report. The attribute is `target_names` (lowercase);
# `iris. Target_names` raised AttributeError.
print("\n Classification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
#6 Write a Python program to:
# 1. Load the Iris dataset
# 2. Convert it to a DataFrame
# 3. Display the full dataset and the first 5 rows
# 4. Show dataset info
# 5. Check for missing values
import pandas as pd
from sklearn.datasets import load_iris
# Steps 1-2: load the Iris dataset and wrap its feature matrix in a DataFrame.
iris = load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Step 3: the full dataset, followed by only its first five rows.
print(df, df.head(), sep="\n")

# Step 4: dataset info. df.info() writes its summary itself and returns None,
# so wrapping it in print() also emits a trailing "None" line.
print(df.info())

# Step 5: number of missing values in each column.
print(df.isna().sum())
output:
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
.. ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8
[150 rows x 4 columns]
sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepal length (cm) 150 non-null float64
1 sepal width (cm) 150 non-null float64
2 petal length (cm) 150 non-null float64
3 petal width (cm) 150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB
None
sepal length (cm) 0
sepal width (cm) 0
petal length (cm) 0
petal width (cm) 0
dtype: int64
#7 Write a program to load and explore datasets from CSV and Excel files using
# pandas
import pandas as pd
# Input files to explore.
# NOTE(review): absolute paths on one particular lab machine; adjust before running.
csv_file_path = "C:\\Users\\NDC-LAB1-30\\Desktop\\data.csv"
excel_file_path = "C:\\Users\\NDC-LAB1-30\\Desktop\\data2.xlsx"

# Read and display the CSV file first, then the Excel workbook.
data_csv = pd.read_csv(csv_file_path)
print("CSV file data:", data_csv, sep="\n")

data_excel = pd.read_excel(excel_file_path)
print("\nExcel file data:", data_excel, sep="\n")

# Summary statistics for each dataset.
print("\n CSV Data Description:")
print(data_csv.describe())
print("\n excel Data Description:")
print(data_excel.describe())

# Data type of every column in each dataset.
print("\n Datatypes in CSV file", data_csv.dtypes, sep="\n")
print("\n Datatypes in excel file", data_excel.dtypes, sep="\n")
#8 Write a program to implement a linear regression model for regression
# tasks and train the model on a dataset with a continuous target variable
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
# California Housing data: feature matrix X and continuous target y.
california = fetch_california_housing()
X, y = california.data, california.target

# Tabular view of the same data (features plus a 'target' column) for easier
# inspection; the model below is trained on the raw arrays.
california_df = pd.DataFrame(X, columns=california.feature_names)
california_df['target'] = y

# 80% of the rows for training, 20% for testing; random_state fixes the split.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Create the linear regression model and fit it on the training split.
linear_regression = LinearRegression()
linear_regression.fit(X_train, y_train)

# Predict on the held-out rows and score the predictions.
y_pred = linear_regression.predict(X_test)
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)

# Report both metrics.
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)
#9 Write a program to implement a decision tree classifier using scikit-learn,
# visualize the decision tree, and understand its splits.
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt
# Load Iris dataset
iris = load_iris()
X = iris.data  # Features
y = iris.target  # Target labels

# Split the dataset into training and testing sets (20% held out for testing;
# random_state fixes the shuffle so the split is reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialize Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the classifier
decision_tree.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = decision_tree.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Visualize the trained decision tree. iris.target_names is a NumPy array;
# it is converted to a plain list because some scikit-learn releases validate
# class_names strictly as a list and reject an array.
plt.figure(figsize=(12, 8))
plot_tree(
    decision_tree,
    feature_names=iris.feature_names,
    class_names=list(iris.target_names),
    filled=True,
)
plt.title("Decision Tree for Iris Dataset")
plt.show()
#10 Write a program to implement K-means clustering and visualize the clusters.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
# Synthetic sample data: 500 points scattered around 4 blob centres.
X, y = make_blobs(n_samples=500, centers=4, cluster_std=0.8, random_state=42)

# Fit a 4-cluster K-Means model and read back the cluster index of each point.
kmeans = KMeans(n_clusters=4, random_state=42)
kmeans.fit(X)
labels = kmeans.labels_
centroids = kmeans.cluster_centers_

plt.figure(figsize=(8, 6))
# Data points, coloured by the cluster they were assigned to.
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')
# Cluster centres, drawn larger and in red so they stand out.
plt.scatter(centroids[:, 0], centroids[:, 1], s=100, c='red', label='Centroids')

# Title, axis labels and legend.
plt.title('K-Means Clustering')
plt.xlabel('X')
plt.ylabel('Y')
plt.legend()

# Show the plot
plt.show()