In [14]: import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
In [2]: list1 = [1,4,7,9,20]
# Sample values for a first line plot.
In [3]: plt.plot(list1)
# Only y-values are passed, so they are plotted against their positions 0..4.
Out[3]: [<matplotlib.lines.Line2D at 0x1c36fee99d0>]
In [4]: cars = pd.read_excel("cars.xlsx")
# The path is relative, so cars.xlsx must be in the notebook's working directory.
In [5]: cars.head()
# First five rows, to confirm the columns loaded as expected.
Out[5]: Model Mileage Cylinders Displacement Horsepower Weight Acceleration Year Origin
0 chevrolet chevelle malibu 18.0 8 307.0 130.0 3504 12.0 70 US
1 buick skylark 320 15.0 8 350.0 165.0 3693 11.5 70 US
2 plymouth satellite 18.0 8 318.0 150.0 3436 11.0 70 US
3 amc rebel sst 16.0 8 304.0 150.0 3433 12.0 70 US
4 ford torino 17.0 8 302.0 140.0 3449 10.5 70 US
In [6]: cars['Mileage'].plot.hist()
# Distribution of Mileage with the default binning.
# The highest number of cars fall in the 12-15 mileage range.
# NOTE(review): the original note also said "highest" for 38-43; both ranges cannot be
# the tallest bar, so 38-43 is presumably the range with the *lowest* number of cars -
# confirm against the plot.
Out[6]: <Axes: ylabel='Frequency'>
In [7]: cars['Mileage'].plot.hist(bins=5)
# Same histogram with only 5 bins, for a coarser view of the distribution.
Out[7]: <Axes: ylabel='Frequency'>
In [9]: cars['Horsepower'].plot.box()
# Reading the box plot of Horsepower:
# - Q2 (the median) is around 90, i.e. 50% of the data is less than 90.
# - Q1 is around 75, i.e. 25% of the data is less than 75.
# - There are outliers above 200.
Out[9]: <Axes: >
In [10]: cars['Mileage'].plot.box()
# Box plot of Mileage: median, quartiles and any outliers at a glance.
Out[10]: <Axes: >
In [12]: cars['Year'].value_counts().plot.bar()
# Number of cars per Year. value_counts() sorts by count (most frequent first), so the
# bars are ordered by frequency rather than chronologically; chain .sort_index() before
# plotting if year order is wanted.
Out[12]: <Axes: xlabel='Year'>
In [13]: cars.info()
# Schema and null check: Horsepower shows 392 non-null out of 394 rows, i.e. 2 missing values.
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 394 entries, 0 to 393
Data columns (total 9 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 Model 394 non-null object
1 Mileage 394 non-null float64
2 Cylinders 394 non-null int64
3 Displacement 394 non-null float64
4 Horsepower 392 non-null float64
5 Weight 394 non-null int64
6 Acceleration 394 non-null float64
7 Year 394 non-null int64
8 Origin 394 non-null object
dtypes: float64(4), int64(3), object(2)
memory usage: 27.8+ KB
In [23]: sns.countplot(y='Origin',data=cars)
Out[23]: <Axes: xlabel='count', ylabel='Origin'>
In [55]: sns.boxplot(y='Mileage',data=cars)
Out[55]: <Axes: ylabel='Mileage'>
In [25]: #bvivariate
# cat num
# Mileage of each country
sns.boxplot(x='Origin',y='Mileage',data=cars)
#median mileage of Japan > Europe > US
Out[25]: <Axes: xlabel='Origin', ylabel='Mileage'>
In [28]: # cat and cat
#Origin and Cylinders
sns.countplot(x='Origin',hue='Cylinders',data=cars)
Out[28]: <Axes: xlabel='Origin', ylabel='count'>
In [29]: plt.scatter(cars['Mileage'],cars['Horsepower'])
Out[29]: <matplotlib.collections.PathCollection at 0x1c37bb92b90>
In [31]: sns.scatterplot(x='Weight',y='Horsepower',data=cars)
# as one variable increase other variable decrease due to negative correlation
Out[31]: <Axes: xlabel='Weight', ylabel='Horsepower'>
In [32]: titanic=pd.read_excel("titanic.xlsx")
# Second dataset; the relative path means titanic.xlsx must be in the working directory.
In [33]: titanic.head()
# First five rows, to see the available columns and their values.
Out[33]: PassengerId Gender Gender_Category Age Fare Class Embarked_Town Accompany_Status Status
0 1 Male Man 22.0 7.2500 Third Southampton Accompanied Died
1 2 Female Woman 38.0 71.2833 First Cherbourg Accompanied Survived
2 3 Female Woman 26.0 7.9250 Third Southampton Alone Survived
3 4 Female Woman 35.0 53.1000 First Southampton Accompanied Survived
4 5 Male Man 35.0 8.0500 Third Southampton Alone Died
In [35]: # How many died, how many survived?
sns.countplot(x='Status',data=titanic)
Out[35]: <Axes: xlabel='Status', ylabel='count'>
In [36]: titanic['Status'].value_counts(normalize=True).plot.bar()
# normalize=True converts the counts to proportions, so bar heights are shares of passengers.
# About 60% died.
Out[36]: <Axes: xlabel='Status'>
In [37]: titanic['Age'].plot.hist()
# Distribution of passenger ages.
Out[37]: <Axes: ylabel='Frequency'>
In [40]: # Q3 number of fare
sns.boxplot(y='Age',data=titanic)
Out[40]: <Axes: ylabel='Age'>
In [41]: #which gender survived more?
sns.countplot(x='Gender',hue='Status',data=titanic)
Out[41]: <Axes: xlabel='Gender', ylabel='count'>
In [43]: # Younger or old passengers died more?
sns.boxplot(y='Age',x='Status',data=titanic)
Out[43]: <Axes: xlabel='Status', ylabel='Age'>
In [44]: # add gender as third variable
sns.boxplot(y='Age',x='Gender',hue='Status',data=titanic)
# in females older age group survived more than younger, on an avg
Out[44]: <Axes: xlabel='Gender', ylabel='Age'>
In [45]: cars.head()
# Re-display the cars columns before selecting the numeric ones for correlation.
Out[45]: Model Mileage Cylinders Displacement Horsepower Weight Acceleration Year Origin
0 chevrolet chevelle malibu 18.0 8 307.0 130.0 3504 12.0 70 US
1 buick skylark 320 15.0 8 350.0 165.0 3693 11.5 70 US
2 plymouth satellite 18.0 8 318.0 150.0 3436 11.0 70 US
3 amc rebel sst 16.0 8 304.0 150.0 3433 12.0 70 US
4 ford torino 17.0 8 302.0 140.0 3449 10.5 70 US
In [47]: cars_num=cars[['Mileage','Displacement','Horsepower','Weight','Acceleration']]
# Keep only the continuous numeric columns for the correlation analysis.
In [48]: cars_num.corr()
# Pairwise correlation matrix. corr() excludes NA/null values, so the 2 missing
# Horsepower entries reported by cars.info() do not raise here.
Out[48]: Mileage Displacement Horsepower Weight Acceleration
Mileage 1.000000 -0.805249 -0.778427 -0.832279 0.420574
Displacement -0.805249 1.000000 0.897257 0.932813 -0.542985
Horsepower -0.778427 0.897257 1.000000 0.864538 -0.689196
Weight -0.832279 0.932813 0.864538 1.000000 -0.414675
Acceleration 0.420574 -0.542985 -0.689196 -0.414675 1.000000
In [52]: sns.heatmap(cars_num.corr(),annot=True)
Out[52]: <Axes: >
In [ ]: