0% found this document useful (0 votes)
15 views3 pages

KNN Py

Uploaded by

Fahad Nasim
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
15 views3 pages

KNN Py

Uploaded by

Fahad Nasim
Copyright
© © All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as TXT, PDF, TXT or read online on Scribd

#!/usr/bin/env python
# coding: utf-8

# In[487]:

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
# Use matplotlib's interactive notebook backend (only works inside IPython/Jupyter).
get_ipython().run_line_magic('matplotlib', 'notebook')

# In[488]:

# Load the Social Network Ads data set from a local CSV file.
df = pd.read_csv(r'C:\Users\DELL\Downloads\Social_Network_Ads.csv')

# In[508]:

# Shuffle all rows (fixed seed for reproducibility) and renumber the index from 0.
df = df.sample(frac=1, random_state=42).reset_index(drop=True)

# In[509]:

# pd.factorize returns a tuple (codes, uniques): the first element is the array
# of integers assigned to the string values, the second is the array of the
# actual string values. Only the integer codes are kept.
df['Gender'] = pd.factorize(df['Gender'])[0]

# In[524]:

# Work on a copy so that in-place changes made to `dfc` later on (feature
# scaling) leave the original `df` untouched.
dfc = df.copy()

# In[527]:

def feature_scaling(dfc):
    """Min-max scale the 'Age' and 'EstimatedSalary' columns of *dfc* in place.

    After scaling, each column ranges from 0 to 1. A column whose values are
    all identical has zero range; it is mapped to all zeros instead of the
    NaN values a 0/0 division would produce.

    Args:
        dfc: Data frame with numeric 'Age' and 'EstimatedSalary' columns.
            It is modified in place.

    Returns:
        A tuple ``(dfc['Age'], dfc['EstimatedSalary'])`` of the scaled columns.
    """
    for column in ('Age', 'EstimatedSalary'):
        values = dfc[column]
        lowest = values.min()
        value_range = values.max() - lowest
        # Guard against a constant column: dividing by 1 keeps the result at 0.
        divisor = value_range if value_range != 0 else 1
        dfc[column] = (values - lowest) / divisor

    return (dfc['Age'], dfc['EstimatedSalary'])


# Scale 'Age' and 'EstimatedSalary' of dfc in place; the function has no effect
# on the data set unless it is actually called. The return value is not needed.
# NOTE(review): scaling happens before the train/test split, so the min/max
# used for scaling are computed over the test rows as well.
feature_scaling(dfc)

# In[537]:
# Split the data into two parts: the first 320 rows are used for training,
# the remaining rows for testing.
training_data = dfc[:320]
test_data = dfc[320:]

# In[538]:

# Convert the data-set columns to NumPy arrays.
# Columns 1, 2 and 3 are the features (presumably Gender, Age and
# EstimatedSalary -- confirm against the CSV header); the last column is the label.

x = training_data.iloc[:, [1, 2, 3]].values  # training features: 2-D, one row per sample, 3 columns
y = training_data.iloc[:, -1].values         # training labels: 1-D, one entry per training row

a = test_data.iloc[:, [1, 2, 3]].values      # test features: 2-D, one row per sample, 3 columns
b = test_data.iloc[:, -1].values             # test labels: 1-D, one entry per test row

# In[544]:

import plotly.express as px

# 3-D scatter plot of the training features, one point per training row.
# Label 0 is tagged 'blue', every other label 'red'. Plotly Express treats
# these strings as category names, so the map below pins each category to the
# colour of the same name instead of relying on the default colour sequence.
fig = px.scatter_3d(
    x=x[:, 0], y=x[:, 1], z=x[:, 2],
    color=['blue' if label == 0 else 'red' for label in y],
    color_discrete_map={'blue': 'blue', 'red': 'red'},
)
fig.update_layout(width=1000, height=600)
fig.update_traces(marker=dict(size=2))  # marker size 1 works as well
fig.show()

# In[560]:

# Same plot for the test features and test labels.
fig = px.scatter_3d(
    x=a[:, 0], y=a[:, 1], z=a[:, 2],
    color=['blue' if label == 0 else 'red' for label in b],
    color_discrete_map={'blue': 'blue', 'red': 'red'},
)
fig.update_layout(width=1000, height=500)
fig.update_traces(marker=dict(size=2))  # marker size 1 works as well
fig.show()

# In[554]:

def KNN(x, query_point, K, labels=None):
    """Classify *query_point* by majority vote among its K nearest training rows.

    Distances are Euclidean. When several labels are equally frequent among
    the K neighbours, the smallest label value wins (np.unique returns the
    labels sorted and argmax picks the first maximum).

    Args:
        x: 2-D array of training features, one row per sample.
        query_point: 1-D array with the same number of features as a row of x.
        K: Number of nearest neighbours that vote; must be at least 1.
        labels: 1-D array of training labels aligned with the rows of x.
            When omitted, the module-level array ``y`` is used, which is what
            the original three-argument call relied on.

    Returns:
        1 if the most frequent label among the K nearest neighbours equals 1
        (purchased), otherwise 0 (not purchased).

    Raises:
        ValueError: If K is smaller than 1, or if the number of labels does
            not match the number of rows in x.
    """
    if labels is None:
        labels = y  # backward compatibility: fall back to the global training labels
    if K < 1:
        raise ValueError("K must be at least 1")

    x = np.asarray(x, dtype=float)
    labels = np.asarray(labels)
    if len(labels) != len(x):
        raise ValueError("x and labels must contain the same number of rows")

    # Euclidean distance from the query point to every training row.
    dist = np.sqrt(((x - np.asarray(query_point, dtype=float)) ** 2).sum(axis=1))

    # Indices of the K training rows closest to the query point.
    nearest = np.argsort(dist)[:K]

    # np.unique returns the distinct labels and how many times each occurred.
    values, counts = np.unique(labels[nearest], return_counts=True)

    # The label with the highest count among the neighbours decides the class.
    purchased_or_not = values[counts.argmax()]

    return (1 if purchased_or_not == 1 else 0)

# In[555]:

# Error array: one entry per test point, 0 if the K=5 prediction matches the
# true label (no error) and 1 if it does not (error).
err_arr = [0 if KNN(x, query, 5) == truth else 1 for query, truth in zip(a, b)]

# Total cost, i.e. the percentage of test points that were misclassified.
total_cost = (sum(err_arr) / len(a)) * 100

# Author's note: for the test data set the minimum error observed was 7.5% with K=5.
print(total_cost, '%')

# In[ ]:

You might also like