import numpy as np
import pandas as pd
# Widen pandas' console display so that all columns of a DataFrame are
# printed on one line, which makes the output easier to read and understand.
pd.options.display.width = 1000
pd.options.display.max_rows = 500
pd.options.display.max_columns = 500
# Mount Google Drive into the Colab runtime at /content/drive.
from google.colab import drive

drive.mount(mountpoint='/content/drive')
Drive already mounted at /content/drive; to attempt to forcibly
remount, call drive.mount("/content/drive", force_remount=True).
# Load the Pokemon dataset from the mounted Drive. The path must stay on a
# single physical line: a quoted string literal cannot span a line break.
df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/pokemon_data.csv')

# Keep the first 10 rows whose 'Legendary' flag is True as the working sample.
sample = df[df['Legendary'] == True].head(10)  # noqa: E712 (element-wise compare)
print(sample)

# Last expression of the cell: the notebook displays the Attack column.
sample['Attack']
# Name Type 1 Type 2 HP Attack
Defense Sp. Atk Sp. Def Speed Generation Legendary
156 144 Articuno Ice Flying 90 85
100 95 125 85 1 True
157 145 Zapdos Electric Flying 90 90
85 125 90 100 1 True
158 146 Moltres Fire Flying 90 100
90 125 85 90 1 True
162 150 Mewtwo Psychic NaN 106 110
90 154 90 130 1 True
163 150 MewtwoMega Mewtwo X Psychic Fighting 106 190
100 154 100 130 1 True
164 150 MewtwoMega Mewtwo Y Psychic NaN 106 150
70 194 120 140 1 True
262 243 Raikou Electric NaN 90 85
75 115 100 115 2 True
263 244 Entei Fire NaN 115 115
85 90 75 100 2 True
264 245 Suicune Water NaN 100 75
115 90 115 85 2 True
269 249 Lugia Psychic Flying 106 90
130 90 154 110 2 True
156 85
157 90
158 100
162 110
163 190
164 150
262 85
263 115
264 75
269 90
Name: Attack, dtype: int64
# Positive class: rows of the sample whose Attack is strictly greater than 100.
p = sample.loc[sample['Attack'] > 100]

# Bare expression so the notebook renders the resulting frame.
p
{"summary":"{\n \"name\": \"p\",\n \"rows\": 4,\n \"fields\": [\n
{\n \"column\": \"#\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 47,\n \"min\": 150,\n
\"max\": 244,\n \"num_unique_values\": 2,\n \"samples\":
[\n 244,\n 150\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Name\",\n \"properties\": {\n
\"dtype\": \"string\",\n \"num_unique_values\": 4,\n
\"samples\": [\n \"MewtwoMega Mewtwo X\",\n
\"Entei\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Type 1\",\n \"properties\": {\n \"dtype\": \"string\",\n
\"num_unique_values\": 2,\n \"samples\": [\n
\"Fire\",\n \"Psychic\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Type 2\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
1,\n \"samples\": [\n \"Fighting\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"HP\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 4,\n \"min\": 106,\n
\"max\": 115,\n \"num_unique_values\": 2,\n \"samples\":
[\n 115\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Attack\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 37,\n \"min\": 110,\n \"max\": 190,\n
\"num_unique_values\": 4,\n \"samples\": [\n 190\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Defense\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
12,\n \"min\": 70,\n \"max\": 100,\n
\"num_unique_values\": 4,\n \"samples\": [\n 100\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Sp. Atk\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
43,\n \"min\": 90,\n \"max\": 194,\n
\"num_unique_values\": 3,\n \"samples\": [\n 154\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Sp. Def\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
18,\n \"min\": 75,\n \"max\": 120,\n
\"num_unique_values\": 4,\n \"samples\": [\n 100\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Speed\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 17,\n
\"min\": 100,\n \"max\": 140,\n \"num_unique_values\":
3,\n \"samples\": [\n 130\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Generation\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 1,\n \"max\": 2,\n
\"num_unique_values\": 2,\n \"samples\": [\n 2\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Legendary\",\n
\"properties\": {\n \"dtype\": \"boolean\",\n
\"num_unique_values\": 1,\n \"samples\": [\n true\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n }\n ]\n}","type":"dataframe","variable_name":"p"}
# Negative class: rows of the sample whose Attack is at most 100.
# Of the 10 sampled rows, 6 are negative and 4 are positive.
n = sample.loc[sample['Attack'] <= 100]

# Bare expression so the notebook renders the resulting frame.
n
{"summary":"{\n \"name\": \"n\",\n \"rows\": 6,\n \"fields\": [\n
{\n \"column\": \"#\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 55,\n \"min\": 144,\n
\"max\": 249,\n \"num_unique_values\": 6,\n \"samples\":
[\n 144,\n 145,\n 249\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Name\",\n \"properties\": {\n
\"dtype\": \"string\",\n \"num_unique_values\": 6,\n
\"samples\": [\n \"Articuno\",\n \"Zapdos\",\n
\"Lugia\"\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Type 1\",\n \"properties\": {\n \"dtype\": \"string\",\n
\"num_unique_values\": 5,\n \"samples\": [\n
\"Electric\",\n \"Psychic\",\n \"Fire\"\n ],\
n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Type 2\",\n \"properties\":
{\n \"dtype\": \"category\",\n \"num_unique_values\":
1,\n \"samples\": [\n \"Flying\"\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"HP\",\n \"properties\": {\n
\"dtype\": \"number\",\n \"std\": 6,\n \"min\": 90,\n
\"max\": 106,\n \"num_unique_values\": 3,\n \"samples\":
[\n 90\n ],\n \"semantic_type\": \"\",\n
\"description\": \"\"\n }\n },\n {\n \"column\":
\"Attack\",\n \"properties\": {\n \"dtype\": \"number\",\n
\"std\": 8,\n \"min\": 75,\n \"max\": 100,\n
\"num_unique_values\": 4,\n \"samples\": [\n 90\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Defense\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
20,\n \"min\": 75,\n \"max\": 130,\n
\"num_unique_values\": 6,\n \"samples\": [\n 100\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Sp. Atk\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
16,\n \"min\": 90,\n \"max\": 125,\n
\"num_unique_values\": 4,\n \"samples\": [\n 125\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Sp. Def\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
25,\n \"min\": 85,\n \"max\": 154,\n
\"num_unique_values\": 6,\n \"samples\": [\n 125\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Speed\",\n \"properties\":
{\n \"dtype\": \"number\",\n \"std\": 12,\n
\"min\": 85,\n \"max\": 115,\n \"num_unique_values\":
5,\n \"samples\": [\n 100\n ],\n
\"semantic_type\": \"\",\n \"description\": \"\"\n }\
n },\n {\n \"column\": \"Generation\",\n
\"properties\": {\n \"dtype\": \"number\",\n \"std\":
0,\n \"min\": 1,\n \"max\": 2,\n
\"num_unique_values\": 2,\n \"samples\": [\n 2\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n },\n {\n \"column\": \"Legendary\",\n
\"properties\": {\n \"dtype\": \"boolean\",\n
\"num_unique_values\": 1,\n \"samples\": [\n true\n
],\n \"semantic_type\": \"\",\n \"description\": \"\"\n
}\n }\n ]\n}","type":"dataframe","variable_name":"n"}
# Entropy of the root node, which holds 4 positive and 6 negative samples
# (10 in total). With p positives and n negatives out of `total`:
#   H = -(p/total)*log2(p/total) - (n/total)*log2(n/total)
# The formula is kept as a comment only: `p` and `n` are DataFrames in this
# notebook and `total` is not defined, so it must not be executed as code.
enR = -(4/10)*np.log2(4/10) - (6/10)*np.log2(6/10)
print(enR)
0.9709505944546686
# The root node holds 4 positive and 6 negative samples.
# Now build the left and right child nodes by splitting on the
# 'Generation' column.
# Left node: the Generation 1 rows, stored as a (negatives, positives)
# tuple of DataFrames.
LN = n[n['Generation'] == 1], p[p['Generation'] == 1]
print(LN)
# 6 rows in total: 3 negative and 3 positive
( # Name Type 1 Type 2 HP Attack Defense Sp. Atk
Sp. Def Speed Generation Legendary
156 144 Articuno Ice Flying 90 85 100 95
125 85 1 True
157 145 Zapdos Electric Flying 90 90 85 125
90 100 1 True
158 146 Moltres Fire Flying 90 100 90 125
85 90 1 True, # Name Type
1 Type 2 HP Attack Defense Sp. Atk Sp. Def Speed Generation
Legendary
162 150 Mewtwo Psychic NaN 106 110 90
154 90 130 1 True
163 150 MewtwoMega Mewtwo X Psychic Fighting 106 190 100
154 100 130 1 True
164 150 MewtwoMega Mewtwo Y Psychic NaN 106 150 70
194 120 140 1 True)
# Right node: the Generation 2 rows, stored as a (negatives, positives)
# tuple of DataFrames.
RN = (n.loc[n['Generation'] == 2], p.loc[p['Generation'] == 2])
print(RN)
# 4 rows in total: 3 negative and 1 positive
( # Name Type 1 Type 2 HP Attack Defense Sp. Atk
Sp. Def Speed Generation Legendary
262 243 Raikou Electric NaN 90 85 75 115
100 115 2 True
264 245 Suicune Water NaN 100 75 115 90
115 85 2 True
269 249 Lugia Psychic Flying 106 90 130 90
154 110 2 True, # Name Type 1 Type 2 HP
Attack Defense Sp. Atk Sp. Def Speed Generation Legendary
263 244 Entei Fire NaN 115 115 85 90 75
100 2 True)
# Entropy of the left node: it holds 3 positive and 3 negative samples
# (6 in total), drawn from a root sample of 4 positive and 6 negative.
# An even 50/50 split is maximally impure, so the entropy is exactly 1.
entropyLN = -(3/6)*np.log2(3/6) - (3/6)*np.log2(3/6)
print(entropyLN)
1.0
# Entropy of the right node: 1 positive and 3 negative samples, 4 in total,
# i.e. class proportions of 0.25 and 0.75.
entropyRN = -(0.25 * np.log2(0.25) + 0.75 * np.log2(0.75))
print(entropyRN)
0.8112781244591328
# Information gain of a split is the parent's entropy minus the
# size-weighted average entropy of the child nodes:
#   IG = E(root) - sum(|child| / |root| * E(child))
# The left node entropy is 1.0, the right node entropy is ~0.8113 and the
# root node entropy is ~0.9710.

# Per-child entropy differences. These are NOT information gain on their
# own (the left one is negative); they are kept for comparison only.
print('for left node')
IG_left = enR - entropyLN
print(IG_left)
print('\n\t')
print('for right node')
IG_right = enR - entropyRN
print(IG_right)

# Actual information gain of the 'Generation' split: the left node holds
# 6 of the 10 root samples and the right node holds the remaining 4.
weighted_child_entropy = (6 / 10) * entropyLN + (4 / 10) * entropyRN
IG = enR - weighted_child_entropy
print('\n\t')
print('information gain of the Generation split')
print(IG)
for left node
-0.02904940554533142
for right node
0.15967246999553575
# Wrap the sample in a DataFrame and show it.
K = pd.DataFrame(sample)
print(K)

# Correlation between the Attack and HP columns of the sample.
CorrelationSample = K['Attack'].corr(K['HP'])

# One print call emits the spacer, the caption and the value, each on its
# own line, exactly as three separate print calls would.
print(
    '\n\t',
    'Here is the correlation b/w Attack & HP\n',
    CorrelationSample,
    sep='\n',
)
# Name Type 1 Type 2 HP Attack
Defense Sp. Atk Sp. Def Speed Generation Legendary
156 144 Articuno Ice Flying 90 85
100 95 125 85 1 True
157 145 Zapdos Electric Flying 90 90
85 125 90 100 1 True
158 146 Moltres Fire Flying 90 100
90 125 85 90 1 True
162 150 Mewtwo Psychic NaN 106 110
90 154 90 130 1 True
163 150 MewtwoMega Mewtwo X Psychic Fighting 106 190
100 154 100 130 1 True
164 150 MewtwoMega Mewtwo Y Psychic NaN 106 150
70 194 120 140 1 True
262 243 Raikou Electric NaN 90 85
75 115 100 115 2 True
263 244 Entei Fire NaN 115 115
85 90 75 100 2 True
264 245 Suicune Water NaN 100 75
115 90 115 85 2 True
269 249 Lugia Psychic Flying 106 90
130 90 154 110 2 True
Here is the correlation b/w Attack & HP
0.4980818208834152