new_LP-III_LR_FR - Jupyter Notebook
In [1]:
import pandas as pd
In [2]:
df=pd.read_csv('C:/shubhangi/2023-24/LP-III_ML/Assignment 1/[Link]')
In [3]:
df.head()
Out[3]:
   Unnamed: 0       key                       fare_amount  pickup_datetime    pickup_longitude  pickup_latitude  dropoff_longitude  dropoff_latitude  passenger_count
0    24238194  2015-05-07 ….0000003                   7.5  2015-05-07 … UTC         -73.999817        40.738354         -73.999512         40.723217                1
1    27835199  2009-07-17 ….0000002                   7.7  2009-07-17 … UTC         -73.994355        40.728225         -73.994710         40.750325                1
2    44984355  2009-08-24 ….00000061                 12.9  2009-08-24 … UTC         -74.005043        40.740770         -73.962565         40.772647                1
3    25894730  2009-06-26 ….0000001                   5.3  2009-06-26 … UTC         -73.976124        40.790844         -73.965316         40.803349                3
4    17610152  2014-08-28 ….000000188                16.0  2014-08-28 … UTC         -73.925023        40.744085         -73.973082         40.761247                5
In [4]:
df = df.drop(['Unnamed: 0','key','pickup_datetime'], axis=1)
In [5]:
df.shape
Out[5]:
(200000, 6)
In [6]:
df.dtypes
Out[6]:
fare_amount float64
pickup_longitude float64
pickup_latitude float64
dropoff_longitude float64
dropoff_latitude float64
passenger_count int64
dtype: object
In [7]:
set(df.dtypes)
Out[7]:
{dtype('int64'), dtype('float64')}
In [8]:
df
Out[8]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count
0 7.5 -73.999817 40.738354 -73.999512 40.723217 1
1 7.7 -73.994355 40.728225 -73.994710 40.750325 1
2 12.9 -74.005043 40.740770 -73.962565 40.772647 1
3 5.3 -73.976124 40.790844 -73.965316 40.803349 3
4 16.0 -73.925023 40.744085 -73.973082 40.761247 5
... ... ... ... ... ... ...
199995 3.0 -73.987042 40.739367 -73.986525 40.740297 1
199996 7.5 -73.984722 40.736837 -74.006672 40.739620 1
199997 30.9 -73.986017 40.756487 -73.858957 40.692588 2
199998 14.5 -73.997124 40.725452 -73.983215 40.695415 1
199999 14.1 -73.984395 40.720077 -73.985508 40.768793 1
200000 rows × 6 columns
In [9]:
df.isnull().sum()
Out[9]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 1
dropoff_latitude 1
passenger_count 0
dtype: int64
In [10]:
df['dropoff_longitude'].fillna(value=df['dropoff_longitude'].median(),inplace=True)
In [11]:
df['dropoff_latitude'].fillna(value=df['dropoff_latitude'].mean(),inplace=True)
In [12]:
df.isnull().sum()
Out[12]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dtype: int64
In [13]:
import plotly.express as px
In [14]:
fig = px.box(df, y='fare_amount')
In [15]:
fig.show()
[Box plot of fare_amount; y-axis ticks from 100 to 500]
In [16]:
x = df.drop(['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude'], axis=1)
In [17]:
df.describe()[['fare_amount', 'passenger_count']]
Out[17]:
fare_amount passenger_count
count 200000.000000 200000.000000
mean 11.359955 1.684535
std 9.901776 1.385997
min -52.000000 0.000000
25% 6.000000 1.000000
50% 8.500000 1.000000
75% 12.500000 2.000000
max 499.000000 208.000000
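The summary shows a minimum fare of -52.00 and a maximum passenger_count of 208, so the data clearly contains invalid values and outliers. A quick sanity check could count how many rows are affected (a sketch, not part of the original notebook; the thresholds are illustrative assumptions):

# count obviously suspect rows (sketch; cut-offs are illustrative, not from the notebook)
print((df['fare_amount'] <= 0).sum())       # non-positive fares
print((df['passenger_count'] > 6).sum())    # implausibly large passenger counts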
In [47]:
import numpy as np
In [48]:
def remove_outlier(df1, col):
    # cap values outside the 1.5*IQR whiskers instead of dropping them
    Q1 = df1[col].quantile(0.25)
    Q3 = df1[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_whisker = Q1 - 1.5*IQR
    upper_whisker = Q3 + 1.5*IQR
    df1[col] = np.clip(df1[col], lower_whisker, upper_whisker)
    return df1
In [49]:
def treat_outliers_all(df1, col_list):
    # apply the IQR capping to every column in col_list
    for c in col_list:
        df1 = remove_outlier(df1, c)
    return df1
In [50]:
df = treat_outliers_all(df, df.iloc[:, 0::])   # iterating a DataFrame yields its column names
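As a worked illustration of the IQR rule used above (a sketch on toy data, not from the notebook): values are capped at Q1 - 1.5*IQR and Q3 + 1.5*IQR rather than removed.

# toy series: the outlier 100 is capped at the upper whisker
s = pd.Series([1, 2, 3, 4, 100])
q1, q3 = s.quantile(0.25), s.quantile(0.75)     # 2.0 and 4.0
iqr = q3 - q1                                   # 2.0
low, high = q1 - 1.5*iqr, q3 + 1.5*iqr          # -1.0 and 7.0
print(np.clip(s, low, high))                    # 100 becomes 7.0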
In [52]:
import matplotlib.pyplot as plt
In [53]:
[Link](kind = "box",subplots = True,layout = (7,2),figsize=(15,20))
Out[53]:
fare_amount Axes(0.125,0.786098;0.352273x0.0939024)
pickup_longitude Axes(0.547727,0.786098;0.352273x0.0939024)
pickup_latitude Axes(0.125,0.673415;0.352273x0.0939024)
dropoff_longitude Axes(0.547727,0.673415;0.352273x0.0939024)
dropoff_latitude Axes(0.125,0.560732;0.352273x0.0939024)
passenger_count Axes(0.547727,0.560732;0.352273x0.0939024)
dtype: object
In [54]:
pip install haversine
Requirement already satisfied: haversine in c:\programdata\anaconda3\lib\site-packages (2.8.0)
Note: you may need to restart the kernel to use updated packages.
In [56]:
import haversine as hs
In [57]:
travel_dist = []
for pos in range(len(df['pickup_longitude'])):
    # unpack pickup and dropoff coordinates for this row
    long1, lati1, long2, lati2 = [df['pickup_longitude'][pos], df['pickup_latitude'][pos],
                                  df['dropoff_longitude'][pos], df['dropoff_latitude'][pos]]
    loc1 = (lati1, long1)
    loc2 = (lati2, long2)
    c = hs.haversine(loc1, loc2)   # great-circle distance in km
    travel_dist.append(c)
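As a quick check of the loop (a sketch; coordinates taken from row 0 of the data), a single call to hs.haversine with (latitude, longitude) tuples returns the great-circle distance in kilometres:

# pickup and dropoff of row 0; the result should be close to the 1.683325 km shown in the next cell
print(hs.haversine((40.738354, -73.999817), (40.723217, -73.999512)))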
In [58]:
print(travel_dist)
df['dist_travel_km'] = travel_dist
df.head()
IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.
Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)
Out[58]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km
0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325
1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593
2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384
3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686
4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088
In [59]:
# Uber trips don't normally exceed 130 km, so restrict the distance range
df = df.loc[(df.dist_travel_km >= 1) | (df.dist_travel_km <= 130)]
print("Remaining observations in the dataset:", df.shape)
Remaining observations in the dataset: (200000, 7)
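Note that the condition above joins the two bounds with |, so every row satisfies at least one of them and nothing is actually removed (the shape stays (200000, 7)). If the intent is to keep only trips between 1 km and 130 km, the bounds must be combined with &; a sketch (this would change the row count and is not what the notebook ran):

# keep only trips whose distance lies within both bounds (sketch)
df_filtered = df.loc[(df.dist_travel_km >= 1) & (df.dist_travel_km <= 130)]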
In [60]:
# Finding incorrect latitudes (less than -90 or greater than 90) and longitudes (less than -180 or greater than 180)
incorrect_coordinates = df.loc[(df.pickup_latitude > 90) | (df.pickup_latitude < -90) | (df.dropoff_latitude > 90) | (df.dropoff_latitude < -90) |
                               (df.pickup_longitude > 180) | (df.pickup_longitude < -180) | (df.dropoff_longitude > 180) | (df.dropoff_longitude < -180)]
In [61]:
df.drop(incorrect_coordinates.index, inplace=True, errors='ignore')   # drop the flagged rows by index
df.head()
Out[61]:
fare_amount pickup_longitude pickup_latitude dropoff_longitude dropoff_latitude passenger_count dist_travel_km
0 7.5 -73.999817 40.738354 -73.999512 40.723217 1.0 1.683325
1 7.7 -73.994355 40.728225 -73.994710 40.750325 1.0 2.457593
2 12.9 -74.005043 40.740770 -73.962565 40.772647 1.0 5.036384
3 5.3 -73.976124 40.790844 -73.965316 40.803349 3.0 1.661686
4 16.0 -73.929786 40.744085 -73.973082 40.761247 3.5 4.116088
In [62]:
df.isnull().sum()
Out[62]:
fare_amount 0
pickup_longitude 0
pickup_latitude 0
dropoff_longitude 0
dropoff_latitude 0
passenger_count 0
dist_travel_km 0
dtype: int64
In [63]:
import seaborn as sns
In [64]:
sns.heatmap(df.isnull())   # visual check that no null values remain
Out[64]:
<Axes: >
In [65]:
corr = df.corr()   # pairwise correlation matrix
print(corr)
fare_amount pickup_longitude pickup_latitude \
fare_amount 1.000000 0.154069 -0.110842
pickup_longitude 0.154069 1.000000 0.259497
pickup_latitude -0.110842 0.259497 1.000000
dropoff_longitude 0.218675 0.425619 0.048889
dropoff_latitude -0.125898 0.073290 0.515714
passenger_count 0.015778 -0.013213 -0.012889
dist_travel_km 0.786385 0.048446 -0.073362
dropoff_longitude dropoff_latitude passenger_count \
fare_amount 0.218675 -0.125898 0.015778
pickup_longitude 0.425619 0.073290 -0.013213
pickup_latitude 0.048889 0.515714 -0.012889
dropoff_longitude 1.000000 0.245667 -0.009303
dropoff_latitude 0.245667 1.000000 -0.006308
passenger_count -0.009303 -0.006308 1.000000
dist_travel_km 0.155191 -0.052701 0.009884
dist_travel_km
fare_amount 0.786385
pickup_longitude 0.048446
pickup_latitude -0.073362
dropoff_longitude 0.155191
dropoff_latitude -0.052701
passenger_count 0.009884
dist_travel_km 1.000000
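The matrix is easier to read for the target alone: dist_travel_km has by far the strongest correlation with fare_amount (about 0.79), while passenger_count has almost none. A sketch, not in the original notebook:

# correlations with the target, strongest first
print(corr['fare_amount'].sort_values(ascending=False))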
In [66]:
sns.heatmap(df.corr(), annot=True)
Out[66]:
<Axes: >
In [67]:
x = df[['pickup_longitude','pickup_latitude','dropoff_longitude','dropoff_latitude','passenger_count','dist_travel_km']]
y = df['fare_amount']
In [68]:
from sklearn.model_selection import train_test_split
In [69]:
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size = 0.33)
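No random_state is passed, so the split (and every metric below) changes from run to run. Fixing the seed makes the results reproducible; a sketch, assuming any fixed seed value is acceptable:

# reproducible 67/33 split (sketch; the seed value 42 is arbitrary)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.33, random_state=42)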
In [70]:
from sklearn.linear_model import LinearRegression
regression = LinearRegression()
In [71]:
regression.fit(X_train, y_train)
Out[71]:
▾ LinearRegression
LinearRegression()
In [72]:
regression.intercept_
Out[72]:
4461.8731571535045
In [73]:
regression.coef_
Out[73]:
array([ 26.29632195, -7.60159329, 19.73368384, -18.21120668,
0.05898655, 1.8490378 ])
In [74]:
prediction = regression.predict(X_test) #To predict the target values
print(prediction)
[ 6.49105246 6.92068004 5.82905968 ... 13.55261447 7.52776996
7.4194044 ]
In [75]:
y_test
from sklearn.metrics import r2_score
In [76]:
r2_score(y_test,prediction)
Out[76]:
0.6475045527243914
In [77]:
from sklearn.metrics import mean_squared_error
MSE = mean_squared_error(y_test,prediction)
print(MSE)
10.429294359791001
In [78]:
RMSE = np.sqrt(MSE)
print(RMSE)
3.229441803128058
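The same number can be obtained straight from the definition, RMSE = sqrt(mean((y_test - prediction)^2)); a sketch:

# RMSE computed from its definition; should match np.sqrt(MSE) above
print(np.sqrt(np.mean((y_test - prediction) ** 2)))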
In [79]:
from sklearn.ensemble import RandomForestRegressor
In [80]:
rf = RandomForestRegressor(n_estimators=100)
In [81]:
rf.fit(X_train, y_train)
Out[81]:
▾ RandomForestRegressor
RandomForestRegressor()
In [84]:
y_pred = rf.predict(X_test)
y_pred
Out[84]:
array([ 6.209, 6.919, 4.642, ..., 15.599, 8.569, 5.437])
In [85]:
R2_Random = r2_score(y_test,y_pred)
R2_Random
Out[85]:
0.7612178302829902
In [86]:
MSE_Random = mean_squared_error(y_test,y_pred)
In [87]:
print(MSE_Random)
7.064855887063792
In [88]:
RMSE_Random = [Link](MSE_Random)
print(RMSE_Random)
2.657979662650524
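A side-by-side view of the two models, built from the variables already computed above (a sketch, not a cell from the original notebook): the random forest improves R² from about 0.65 to 0.76 and lowers RMSE from about 3.23 to 2.66 on this test split.

# compare the two models on the same test split
print(pd.DataFrame({'R2':   [r2_score(y_test, prediction), R2_Random],
                    'RMSE': [RMSE, RMSE_Random]},
                   index=['LinearRegression', 'RandomForestRegressor']))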
In [89]:
print("OK")
OK
In [ ]: