Name: Snehal Kotkar
Div: A
Roll No.: 46
Practical No.: 2
Problem Statement: Build a machine learning model using the k-Nearest
Neighbors algorithm to predict whether the patients in the "Pima Indians
Diabetes Dataset" have diabetes or not.
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Use the 'ggplot' style for every figure drawn below.
plt.style.use('ggplot')

# Mount Google Drive so the dataset can be read from /content/drive.
# This import is only available inside a Google Colab runtime.
from google.colab import drive
drive.mount('/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly
remount, call drive.mount("/content/drive", force_remount=True).
# Load the dataset from the mounted Drive into a DataFrame.
# NOTE(review): the CSV file name in this path was replaced by the "[Link]"
# placeholder when the notebook was exported; restore the real file name
# before running.
df = pd.read_csv('/content/drive/MyDrive/ML /[Link]')
# Preview the first rows of the data.
df.head()
{"summary":"{\n \"name\": \"df\",\n \"rows\": 768,\n \"fields\": [\
n {\n \"column\": \"Pregnancies\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 3,\n \"min\": 0,\n
\"max\": 17,\n \"num_unique_values\": 17,\n \"samples\":
[\n 6,\n 1,\n 3\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Glucose\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 31,\n
\"min\": 0,\n \"max\": 199,\n \"num_unique_values\":
136,\n \"samples\": [\n 151,\n 101,\n
112\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"BloodPressure\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 19,\n \"min\": 0,\n
\"max\": 122,\n \"num_unique_values\": 47,\n
\"samples\": [\n 86,\n 46,\n 85\
n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"SkinThickness\",\n \"properties\": {\n \"dtype\":
\"number\",\n \"std\": 15,\n \"min\": 0,\n
\"max\": 99,\n \"num_unique_values\": 51,\n \"samples\":
[\n 7,\n 12,\n 48\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Insulin\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 115,\n
\"min\": 0,\n \"max\": 846,\n \"num_unique_values\":
186,\n \"samples\": [\n 52,\n 41,\n
183\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"BMI\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 7.884160320375446,\n \"min\": 0.0,\n \"max\":
67.1,\n \"num_unique_values\": 248,\n \"samples\": [\n
19.9,\n 31.0,\n 38.1\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"DiabetesPedigreeFunction\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0.3313285950127749,\n \"min\": 0.078,\n \"max\": 2.42,\n
\"num_unique_values\": 517,\n \"samples\": [\n 1.731,\
n 0.426,\n 0.138\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Age\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 11,\n \"min\": 21,\n
\"max\": 81,\n \"num_unique_values\": 52,\n \"samples\":
[\n 60,\n 47,\n 72\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Outcome\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 0,\n
\"min\": 0,\n \"max\": 1,\n \"num_unique_values\": 2,\n
\"samples\": [\n 0,\n 1\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n }\n ]\n}","type":"dataframe","variable_name":"df"}
# (rows, columns) of the loaded DataFrame.
df.shape
(768, 9)
# Count missing (NaN) entries in each column.
df.isnull().sum()
Pregnancies 0
Glucose 0
BloodPressure 0
SkinThickness 0
Insulin 0
BMI 0
DiabetesPedigreeFunction 0
Age 0
Outcome 0
dtype: int64
# Features: every column except the label; target: the 'Outcome' column.
X = df.drop('Outcome', axis=1).values
y = df['Outcome'].values

from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. stratify=y keeps the class
# proportions the same in both splits; random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42, stratify=y)
# Import KNeighborsClassifier
from sklearn.neighbors import KNeighborsClassifier

# Set up arrays to store training and test accuracies for k = 1..14
neighbors = np.arange(1, 15)
train_accuracy = np.empty(len(neighbors))
test_accuracy = np.empty(len(neighbors))

for i, k in enumerate(neighbors):
    # Set up a k-NN classifier with k neighbors
    knn = KNeighborsClassifier(n_neighbors=k)

    # Fit the model
    knn.fit(X_train, y_train)

    # Compute accuracy on the training set
    train_accuracy[i] = knn.score(X_train, y_train)

    # Compute accuracy on the test set
    test_accuracy[i] = knn.score(X_test, y_test)
# Plot test and training accuracy against the number of neighbors
plt.title('k-NN Varying number of neighbors')
plt.plot(neighbors, test_accuracy, label='Testing Accuracy')
plt.plot(neighbors, train_accuracy, label='Training accuracy')
plt.legend()
plt.xlabel('Number of neighbors')
plt.ylabel('Accuracy')
plt.show()
# Set up a k-NN classifier with 4 neighbors
knn = KNeighborsClassifier(n_neighbors=4)
# Fit the model on the training split
knn.fit(X_train, y_train)
KNeighborsClassifier(n_neighbors=4)
# Get accuracy. Note: for classification algorithms, the score method
# represents accuracy.
knn.score(X_test, y_test)
0.7291666666666666
# Get the predictions for the test set using the classifier fitted above
y_pred = knn.predict(X_test)
y_pred
array([0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
0,
0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
0,
1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0,
0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0,
0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1,
0,
1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0,
1,
0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0])