
Machine Learning Lab Manual

1 Install and set up Python and essential libraries, and display their versions
# listed below:
# NumPy
# pandas
# scikit-learn
# Matplotlib
# seaborn

!pip install numpy
!pip install pandas
!pip install -U scikit-learn
!pip install matplotlib
!pip install seaborn

import numpy
import pandas
import sklearn
import matplotlib
import seaborn

# versions
print(numpy.__version__)
print(pandas.__version__)
print(sklearn.__version__)
print(matplotlib.__version__)
print(seaborn.__version__)

#2 Introduce scikit-learn as a machine learning library

!pip install -U scikit-learn
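
All of the exercises below rely on scikit-learn's unified estimator interface: every model exposes fit(), predict(), and score(). A minimal illustrative sketch (not part of the original exercise) using logistic regression on the Iris dataset:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
model = LogisticRegression(max_iter=200)    # max_iter raised so the solver converges
model.fit(iris.data, iris.target)           # learn from features and labels
print(model.predict(iris.data[:5]))         # predict labels for the first 5 samples
print(model.score(iris.data, iris.target))  # mean accuracy on the training data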

#3 Write a program to visualize the dataset to gain insights using Matplotlib or seaborn by plotting scatter plots and bar charts

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def visualize_dataset(file_path):
    # Load the dataset into a pandas DataFrame
    df = pd.read_csv(file_path)

    # Plot scatter plots of every pair of numeric columns (pairplot);
    # suptitle is used because plt.title would label only the last subplot
    grid = sns.pairplot(df)
    grid.fig.suptitle("Pairplot of the Dataset", y=1.02)
    plt.show()

    # Plot a bar chart for a categorical column (assuming the first column is categorical)
    if df.iloc[:, 0].dtype == 'object':
        sns.countplot(x=df.columns[0], data=df)
        plt.title("Bar Chart of Categorical Column")
        plt.xlabel(df.columns[0])
        plt.ylabel("Count")
        plt.show()
    else:
        print("No categorical column found to plot bar chart.")

# Example usage
file_path = "C:\\Users\\NDC43\\Downloads\\Iris.csv"
visualize_dataset(file_path)
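
In addition to the pairplot, a single scatter plot of two chosen columns is often useful. A small sketch that could be added inside visualize_dataset; it assumes the file has at least two numeric columns:

# Scatter plot of the first two numeric columns (illustrative sketch)
numeric_cols = df.select_dtypes(include='number').columns
if len(numeric_cols) >= 2:
    plt.scatter(df[numeric_cols[0]], df[numeric_cols[1]])
    plt.xlabel(numeric_cols[0])
    plt.ylabel(numeric_cols[1])
    plt.title("Scatter Plot of Two Numeric Columns")
    plt.show()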

#4 Write a program to handle missing data, encode categorical variables, and perform feature scaling
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Load Iris dataset
iris = load_iris()
iris_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
iris_df['target'] = iris.target

def preprocess_dataset(df):
    # Handle missing data (Iris has no missing values, so we simulate some
    # by blanking every 10th entry of the first column)
    df.iloc[::10, 0] = float('NaN')

    # Impute missing values with the column mean
    imputer = SimpleImputer(strategy='mean')
    df[df.columns] = imputer.fit_transform(df[df.columns])

    # Encode categorical variables (if applicable)
    # The Iris dataset has no categorical variables, so this step is skipped

    # Perform feature scaling (excluding the 'target' column)
    scaler = StandardScaler()
    df[df.columns[:-1]] = scaler.fit_transform(df[df.columns[:-1]])

    return df

# Preprocess Iris dataset
preprocessed_df = preprocess_dataset(iris_df)

# Display preprocessed dataset
print("Preprocessed dataset:")
print(preprocessed_df.head())
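
Since the Iris frame has no categorical columns, the encoding step above is skipped. For datasets that do have string-valued columns, one-hot encoding with pandas looks like this (a standalone sketch; the 'city' column and its values are hypothetical):

import pandas as pd

# Hypothetical example frame with one categorical column
demo = pd.DataFrame({'city': ['Pune', 'Delhi', 'Pune'], 'value': [1, 2, 3]})
encoded = pd.get_dummies(demo, columns=['city'])  # one column per category, 0/1 values
print(encoded)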
#5 Write a program to implement a k-Nearest Neighbours (k-NN) classifier using scikit-learn, train the classifier on the dataset, and evaluate its performance
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Load Iris dataset
iris = load_iris()
X = iris.data    # Features (independent variables)
y = iris.target  # Target labels (dependent variable)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the k-NN classifier
k = 3  # Number of neighbors
knn_classifier = KNeighborsClassifier(n_neighbors=k)

# Train the classifier
knn_classifier.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = knn_classifier.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))
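
The choice k = 3 above is fixed by hand. An optional extension (not in the original exercise) is to compare several values of k with 5-fold cross-validation before choosing:

from sklearn.model_selection import cross_val_score

# Mean cross-validated accuracy for a few candidate values of k
for k in [1, 3, 5, 7, 9]:
    scores = cross_val_score(KNeighborsClassifier(n_neighbors=k), X, y, cv=5)
    print(f"k={k}: mean CV accuracy = {scores.mean():.3f}")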

#6 Write a Python program to:

1. Load the Iris dataset

2. Convert it to a DataFrame

3. Display the full dataset and first 5 rows

4. Show dataset info

5. Check for missing values

import pandas as pd
from sklearn.datasets import load_iris
iris = load_iris()
df = pd.DataFrame(iris.data, columns=iris.feature_names)

print(df)
print(df.head())
print(df.info())
print(df.isnull().sum())

Output:

sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
.. ... ... ... ...
145 6.7 3.0 5.2 2.3
146 6.3 2.5 5.0 1.9
147 6.5 3.0 5.2 2.0
148 6.2 3.4 5.4 2.3
149 5.9 3.0 5.1 1.8

[150 rows x 4 columns]


sepal length (cm) sepal width (cm) petal length (cm) petal width (cm)
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 4 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 sepal length (cm) 150 non-null float64
1 sepal width (cm) 150 non-null float64
2 petal length (cm) 150 non-null float64
3 petal width (cm) 150 non-null float64
dtypes: float64(4)
memory usage: 4.8 KB
None
sepal length (cm) 0
sepal width (cm) 0
petal length (cm) 0
petal width (cm) 0
dtype: int64
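
The DataFrame above holds only the four measurements. If the class label is wanted as well, it can be attached as follows (a small extension of the exercise):

# Attach the numeric target and human-readable species names
df['target'] = iris.target
df['species'] = df['target'].map(dict(enumerate(iris.target_names)))
print(df.head())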

#7 Write a program to load and explore datasets from .csv and Excel files using pandas
import pandas as pd

csv_file_path ="C:\\Users\\NDC-LAB1-30\\Desktop\\data.csv"
excel_file_path ="C:\\Users\\NDC-LAB1-30\\Desktop\\data2.xlsx"

data_csv = pd.read_csv(csv_file_path)
print("CSV file data:")
print(data_csv)

data_excel = pd.read_excel(excel_file_path)  # requires the openpyxl engine for .xlsx files
print("\nExcel file data:")
print(data_excel)

print("\n CSV Data Description:")


print(data_csv.describe())

print("\n excel Data Description:")


print(data_excel.describe())

print("\n Datatypes in CSV file")


print(data_csv.dtypes)

print("\n Datatypes in excel file")


print(data_excel.dtypes)
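
By default read_excel loads the first sheet; a specific sheet can be selected with the sheet_name parameter (a sketch; 'Sheet2' is a hypothetical sheet name):

data_sheet2 = pd.read_excel(excel_file_path, sheet_name='Sheet2')
print(data_sheet2.head())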

#8 Write a program to implement a linear regression model for regression tasks and train the model on a dataset with a continuous target variable

import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

# Load California Housing dataset
california = fetch_california_housing()
X = california.data    # Features (independent variables)
y = california.target  # Target (dependent variable)

# Convert the data to a pandas DataFrame for easier manipulation
california_df = pd.DataFrame(data=X, columns=california.feature_names)
california_df['target'] = y

# Split the dataset into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Linear Regression model
linear_regression = LinearRegression()

# Train the model
linear_regression.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = linear_regression.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print performance metrics
print("Mean Squared Error:", mse)
print("R-squared Score:", r2)

#9 Write a program to implement a decision tree classifier using scikit-learn and visualize the decision tree to understand its splits

import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.metrics import accuracy_score, classification_report
import matplotlib.pyplot as plt

# Load Iris dataset
iris = load_iris()
X = iris.data    # Features
y = iris.target  # Target labels

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize Decision Tree classifier
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the classifier
decision_tree.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = decision_tree.predict(X_test)

# Evaluate the classifier's performance
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Display classification report
print("\nClassification Report:")
print(classification_report(y_test, y_pred, target_names=iris.target_names))

# Visualize the trained decision tree
plt.figure(figsize=(12, 8))
plot_tree(decision_tree, feature_names=iris.feature_names,
          class_names=iris.target_names, filled=True)
plt.title("Decision Tree for Iris Dataset")
plt.show()
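
To study the splits without the plot, the same tree can be printed as indented text (an optional extension using scikit-learn's export_text):

from sklearn.tree import export_text

# Each line shows a split condition; leaves show the predicted class
print(export_text(decision_tree, feature_names=list(iris.feature_names)))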
#10 Write a program to implement K-means clustering and visualize the clusters.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans

# Generate sample data
X, y = make_blobs(n_samples=500, centers=4, cluster_std=0.8, random_state=42)

# Create a K-Means clusterer with 4 clusters
# (n_init set explicitly for consistent behaviour across scikit-learn versions)
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10)

# Fit the data
kmeans.fit(X)

# Get cluster labels
labels = kmeans.labels_

# Plot the data with cluster labels
plt.figure(figsize=(8, 6))
plt.scatter(X[:, 0], X[:, 1], c=labels, cmap='viridis')

# Plot the centroids
plt.scatter(kmeans.cluster_centers_[:, 0], kmeans.cluster_centers_[:, 1],
            s=100, c='red', label='Centroids')

# Add title and labels
plt.title('K-Means Clustering')
plt.xlabel('X')
plt.ylabel('Y')

# Add legend
plt.legend()

# Show the plot
plt.show()
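
Here the number of clusters (4) is known because the data was generated with 4 centers. For real data, a common heuristic is the elbow method: plot the within-cluster sum of squares (inertia_) for several k and look for the bend (an optional extension):

# Fit K-Means for k = 1..8 and record the inertia of each fit
inertias = []
ks = range(1, 9)
for k in ks:
    km = KMeans(n_clusters=k, random_state=42, n_init=10).fit(X)
    inertias.append(km.inertia_)

plt.plot(ks, inertias, marker='o')
plt.xlabel('Number of clusters k')
plt.ylabel('Inertia (within-cluster SSE)')
plt.title('Elbow Method for Choosing k')
plt.show()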
