Naan Mudhalvan - Project
Project Title: Applied Data Science
Project Title: A-Reliable-Energy-Consumption-Analysis-
System-For-Energy-Efficient-Appliances
Team ID : NM2023TMID01842
Team Size : 4
Team Leader : SANJAY K
Team member : DHAYANITHI P
Team member : MATHIVANAN N
Team member : HEYRAM T
College Name:Adhiparasakthi college of engineering.
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# The data file is ';'-delimited: read with the default ',' separator the
# whole header collapses into one column and every df['<column>'] lookup
# raises KeyError. '?' is treated as missing, following the published
# description of this dataset -- TODO confirm against the local file.
df = pd.read_csv(
    'household_power_consumption.txt',
    sep=';',
    na_values='?',
)
[Link]()
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity
[Link]
(930143, 1)
Univariate Analysis
Histogram
[Link](df['Date'])
----------------------------------------------------------------------
-----
KeyError Traceback (most recent call
last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
KeyError: 'Date'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call
last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key,
_check_indexing_error will raise
KeyError: 'Date'
[Link](df['Date'])
----------------------------------------------------------------------
-----
NameError Traceback (most recent call
last)
<ipython-input-160-f96959050710> in <cell line: 1>()
----> 1 [Link](data['Date'])
NameError: name 'data' is not defined
SEARCH STACK OVERFLOW
x = df['Date'].value_counts()
[Link]([Link],
labels=[Link],
autopct='%1.1f%%')
[Link]()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.hashtable.PyObjectHashTable.get_item()
Bivariate Analysis
[Link](x=data['Voltage'],
y=data['Date'])
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-166-f93113986f18> in <cell line: 1>()
----> 1 [Link](x=data['Voltage'],
2 y=data['Date'])
NameError: name 'data' is not defined
SEARCH STACK OVERFLOW
import [Link] as plt
[Link](figsize=(15, 5))
[Link](x=data['Date'], y=data['Voltage'])
[Link](rotation='90')
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-163-46fdf18b99e8> in <cell line: 3>()
1 import [Link] as plt
2 [Link](figsize=(15, 5))
----> 3 [Link](x=data['Date'], y=data['Voltage'])
4 [Link](rotation='90')
NameError: name 'data' is not defined
SEARCH STACK OVERFLOW
<Figure size 1500x500 with 0 Axes>
[Link](df,hue="Global_reactive_power").map([Link],"Global_reactive_power","c
Multivariate Analysis
from sklearn import datasets, decomposition
iris = datasets.load_iris()
X = [Link]
y = [Link]
pca = [Link](n_components=2)
X = pca.fit_transform(X)
[Link](x=X[:, 0], y=X[:, 1], hue=y)
<Axes: >
[Link]([Link](), annot=True)
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-165-b699050ce883> in <cell line: 1>()
----> 1 [Link]([Link](), annot=True)
NameError: name 'data' is not defined
SEARCH STACK OVERFLOW
[Link](df,hue="Voltage")
Descriptive Statistics
[Link]()
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inten
count
unique
top
freq
df['Date'].value_counts()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
KeyError: 'Date'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
df['Voltage'].value_counts().to_frame()
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will
raise
KeyError: 'Date'
SEARCH STACK OVERFLOW
df['Global_active_power'].value_counts()
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
KeyError: 'Global_active_power'
'Voltage'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
-> 3804 raise KeyError(key) from err
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will
Handling missing values
raise
KeyError: 'Global_active_power'
[Link]()
SEARCH STACK OVERFLOW
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte
[Link]().sum()
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_me
0
dtype: int64
[Link]().sum().sum()
filling null values
df2=[Link](value=0)
df
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte
...
930138
930139
930140
930141
930142
930143 rows × 1 columns
[Link]().sum().sum()
0
#filling null values with previous value
df4=[Link](method = 'pad')
df4
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte
...
930138
930139
930140
930141
930142
930143 rows × 1 columns
[Link]().sum()
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;Sub_me
0
dtype: int64
df5=[Link]({'Global_reactive_power':'4.216','Global_reactive_power':'0.418','Date':'1
df5
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte
...
930138
Finding the outliers
import pandas as pd
import numpy as np
import [Link] as px
Histogram
fig = [Link](df, x='Global_active_power')
[Link]()
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-141-b195d9dee47c> in <cell line: 1>()
----> 1 fig = [Link](df, x='Global_active_power')
2
**Box plot 3 [Link]()
3 frames
/usr/local/lib/python3.10/dist-packages/plotly/express/_core.py in
fig = [Link](df, y='Global_active_power')
process_args_into_dataframe(args, wide_mode, var_name, value_name)
[Link]()1206 if argument == "index":
1207 err_msg += "\n To use the index, pass it in
directly as `[Link]`."
-> 1208 raise ValueError(err_msg)
1209 elif length and len(df_input[argument]) != length:
*scatterplot
1210* raise ValueError(
ValueError: Value of 'x' is not the name of a column in 'data_frame'. Expected
fig = [Link](x=df['Global_active_power'], y=df['Global_reactive_power'])
one of
['Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_intensity;
[Link]()
but received: Global_active_power
SEARCH STACK OVERFLOW
def find_outliers_IQR(df):
    """Select the values of *df* that fall outside the 1.5 * IQR fences.

    Args:
        df: A numeric pandas Series or DataFrame.

    Returns:
        ``df`` indexed by the boolean outlier mask. For a Series only the
        outlying entries are kept; for a DataFrame the shape is kept and
        every non-outlying cell becomes NaN.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1
    # A value is an outlier when it is below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
    outliers = df[(df < (q1 - 1.5 * IQR)) | (df > (q3 + 1.5 * IQR))]
    return outliers
outliers = find_outliers_IQR(df["Global_active_power"])
print("number of outliers: "+ str(len(outliers)))
print("max outlier value: "+ str([Link]()))
print("min outlier value: "+ str([Link]()))
outliers
outliers = find_outliers_IQR(df[["Global_active_power","Global_reactive_power"]])
outliers
**Drop the outliers
def drop_outliers_IQR(df):
    """Remove the entries of *df* that fall outside the 1.5 * IQR fences.

    Args:
        df: A numeric pandas Series or DataFrame.

    Returns:
        A DataFrame produced by ``reset_index()`` on the retained data, so
        the original index is preserved as a regular column. For DataFrame
        input, any row containing an outlier in at least one column is
        dropped.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1
    not_outliers = df[~((df < (q1 - 1.5 * IQR)) | (df > (q3 + 1.5 * IQR)))]
    # Masking a DataFrame leaves NaN where outliers were; dropna() removes
    # those rows. It must run on the filtered data computed just above.
    outliers_dropped = not_outliers.dropna().reset_index()
    return outliers_dropped
**Cap the outliers
# Cap Global_active_power at mean +/- 3 standard deviations: values beyond a
# limit are replaced by that limit, everything else is left unchanged.
upper_limit = df['Global_active_power'].mean() + 3 * df['Global_active_power'].std()
print(upper_limit)
lower_limit = df['Global_active_power'].mean() - 3 * df['Global_active_power'].std()
print(lower_limit)
df['Global_active_power'] = np.where(
    df['Global_active_power'] > upper_limit,
    upper_limit,
    np.where(
        df['Global_active_power'] < lower_limit,
        lower_limit,
        df['Global_active_power'],
    ),
)
[Link]()[['Global_active_power']]
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-137-db5887173d2e> in <cell line: 1>()
----> 1 [Link]()[['Global_active_power']]
2 frames
**Replace outliers using imputation as if they were missing values
def impute_outliers_IQR(df):
    """Replace values outside the 1.5 * IQR fences with the mean of *df*.

    Args:
        df: A numeric pandas Series (or DataFrame).

    Returns:
        A NumPy array (the result of ``np.where``) with the same shape as
        ``df`` in which each outlier has been replaced by ``df.mean()``.
        The mean is taken over the unfiltered input, outliers included.
    """
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    IQR = q3 - q1
    # Largest / smallest values that are still inside the fences.
    upper = df[~(df > (q3 + 1.5 * IQR))].max()
    lower = df[~(df < (q1 - 1.5 * IQR))].min()
    df = np.where(
        df > upper,
        df.mean(),
        np.where(
            df < lower,
            df.mean(),
            df,
        ),
    )
    return df


# Notebook error output that the text extraction had interleaved with the
# lines of the function above (kept verbatim):
#   /usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
#   _raise_if_missing(self, key, indexer, axis_name)
#   6128 if use_interval_msg:
#   6129 key = list(key)
#   -> 6130 raise KeyError(f"None of [{key}] are in the [{axis_name}]")
#   6131
#   6132 not_found = list(ensure_index(key)[missing_mask.nonzero()[0]].unique())
#   KeyError: "None of [Index(['Global_active_power'], dtype='object')] are in the [columns]"
#   SEARCH STACK OVERFLOW
df['Global_active_power'] = impute_outliers_IQR(df['Global_active_power'])
[Link]()['Global_active_power']
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self, key, method, tolerance)
3801 try:
-> 3802 return self._engine.get_loc(casted_key)
3803 except KeyError as err:
4 frames
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
categorical column
pandas/_libs/hashtable_class_helper.pxi in
pandas._libs.[Link].get_item()
# Reload the raw file. It is ';'-delimited, so the separator must be given
# explicitly or the whole header is read as a single column. '?' is treated
# as missing, following the published description of this dataset -- TODO
# confirm against the local file.
df = pd.read_csv('household_power_consumption.txt', sep=';', na_values='?')
df5
KeyError: 'Global_active_power'
The above exception was the direct cause of the following exception:
Date;Time;Global_active_power;Global_reactive_power;Voltage;Global_inte
KeyError Traceback (most recent call last)
0
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
get_loc(self,
1 key, method, tolerance)
3802 return self._engine.get_loc(casted_key)
3803
2 except KeyError as err:
-> 3804 raise KeyError(key) from err
3
3805 except TypeError:
3806 # If we have a listlike key, _check_indexing_error will
4
raise
...
KeyError: 'Global_active_power'
930138
SEARCH STACK OVERFLOW
930139
930140
930141
930142
930143 rows × 1 columns
df_numeric = df[['Global_active_power','Global_reactive_power','Date','Time']];
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
<ipython-input-135-24092e91a73d> in <cell line: 1>()
----> 1 df_numeric =
df[['Global_active_power','Global_reactive_power','Date','Time']];
2 frames
/usr/local/lib/python3.10/dist-packages/pandas/core/indexes/[Link] in
_raise_if_missing(self, key, indexer, axis_name)
6128 if use_interval_msg:
6129 key = list(key)
-> 6130
df_numeric.head(); raise KeyError(f"None of [{key}] are in the
[{axis_name}]")
6131
---------------------------------------------------------------------------
6132
NameError not_found = list(ensure_index(key)[missing_mask.nonzero()
Traceback (most recent call last)
[0]].unique())
<ipython-input-134-2f4b77347dae> in <cell line: 1>()
----> 1 df_numeric.head();
KeyError: "None of [Index(['Global_active_power', 'Global_reactive_power',
'Date', 'Time'],
NameError: dtype='object')]
name 'df_numeric' are
is not in the [columns]"
defined
SEARCH
SEARCH STACK
STACK OVERFLOW
OVERFLOW
print(df['Global_active_power'].unique())
print(df['Global_reactive_power'].unique())
print(df['Date'].unique())
print(df['Time'].unique())
print(df['Voltage'].unique())
**Encode the labels using label encoding
from [Link] import LabelEncoder
species_encoder = LabelEncoder()
species_encoder.fit(df_categorical['species'])
species_values = species_encoder.transform(df_categorical['species'])
print("Before Encoding:", list(df_categorical['species'][-10:]))
print("After Encoding:", species_values[-10:])
print("The inverse from the encoding result:", species_encoder.inverse_transform(species
island_encoder = LabelEncoder()
island_values =island_encoder.fit_transform(df_categorical['island'])
print("Before Encoding:", list(df_categorical['island'][:5]))
print("After Encoding:",island_values[:5])
print("The inverse from the encoding result:", island_encoder.inverse_transform(island_v
**Splitting into dependent and independent variables
X = [Link][:, :-1].values
print(X)
Y = [Link][:, -1].values
print(Y)
**Split into dependent and independent variables
x=df['Date']
y=[Link](columns=['Voltage'],axis=1)
[Link]()
[Link]()
from [Link] import scale
x=df['Time']
y=[Link](columns=['Global_active_power'],axis=1)
[Link]()
x=scale(x)
x
**Scale the data
from [Link] import scale
x=df['Date']
y=[Link](columns=['Voltage'],axis=1)
[Link]()
x=scale(x)
x
[Link]()
[Link]()
**Perform any of the clustering algorithms
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Scale only numeric, complete rows: StandardScaler cannot convert text
# columns (e.g. Date / Time) or handle missing readings.
numeric_df = df.select_dtypes(include="number").dropna()
scaled_df = StandardScaler().fit_transform(numeric_df)
print(scaled_df[:5])

kmeans_kwargs = {
    "init": "random",
    "n_init": 10,
    "random_state": 1,
}

# Elbow method: fit k-means for k = 1..10 and record the within-cluster
# sum of squared errors (inertia) for each k.
sse = []
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
    kmeans.fit(scaled_df)
    sse.append(kmeans.inertia_)

plt.plot(range(1, 11), sse)
plt.xticks(range(1, 11))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.show()

# Final model with 3 clusters.
kmeans = KMeans(init="random", n_clusters=3, n_init=10, random_state=1)
kmeans.fit(scaled_df)
kmeans.labels_
# Assign labels by index so rows excluded from clustering stay NaN instead
# of causing a length mismatch.
df.loc[numeric_df.index, 'cluster'] = kmeans.labels_
print(df)
**Split the data into training and testing
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test=train_test_split(x,y,test_size=0.2)
x_train
x_train.shape
x_test
x_test.shape
y_train
y_train.shape
y_test
y_test.shape
**Build the model
from sklearn.linear_model import LinearRegression
lin_reg=LinearRegression()
**Train the model
import numpy
import matplotlib.pyplot as plt

# Fixed seed so the synthetic sample is reproducible.
numpy.random.seed(2)
x = numpy.random.normal(3, 1, 100)
y = numpy.random.normal(150, 40, 100) / x

# First 80 points for training, remaining 20 for testing.
train_x = x[:80]
train_y = y[:80]
test_x = x[80:]
test_y = y[80:]

# Degree-4 polynomial fitted to the training points.
mymodel = numpy.poly1d(numpy.polyfit(train_x, train_y, 4))
myline = numpy.linspace(0, 6, 100)

plt.scatter(train_x, train_y)
plt.show()
**Test the model
# Draw the fitted polynomial `mymodel` evaluated at the points in `myline`.
plt.plot(myline, mymodel(myline))
plt.show()
**Performance using Evaluation Metrics.
from sklearn import metrics
from [Link] import mean_squared_error
For classification problems
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import log_loss

# Toy binary example: true labels and hard 0/1 predictions.
X_actual = [1, 1, 0, 1, 0, 0, 1, 0, 0, 0]
Y_predic = [1, 0, 1, 1, 1, 0, 1, 1, 0, 0]
results = confusion_matrix(X_actual, Y_predic)
print ('Confusion Matrix :')
print(results)
print ('Accuracy Score is',accuracy_score(X_actual, Y_predic))
print ('Classification Report : ')
print (classification_report(X_actual, Y_predic))
# NOTE(review): AUC-ROC and log loss are computed here from hard 0/1
# predictions rather than predicted probabilities.
print('AUC-ROC:',roc_auc_score(X_actual, Y_predic))
print('LOGLOSS Value is',log_loss(X_actual, Y_predic))
For regression problems
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Toy example: true values and model predictions.
X_actual = [5, -1, 2, 10]
Y_predic = [3.5, -0.9, 2, 9.9]
print ('R Squared =',r2_score(X_actual, Y_predic))
print ('MAE =',mean_absolute_error(X_actual, Y_predic))
print ('MSE =',mean_squared_error(X_actual, Y_predic))