Ex. No: 01 WORKING WITH NUMPY ARRAYS
Program:
# NumPy array basics: creating, indexing and slicing 1D/2D/3D arrays.
import numpy as np

#Creating 1D array
flat_values = [1, 2, 3, 4, 5]
arr = np.array(flat_values)
print(arr)

#Creating 2D array
table_rows = [[1, 2, 3], [4, 5, 6]]
arr = np.array(table_rows)
print(arr)

#Creating 3D array
# The same 2x3 table is stacked twice, giving a 2x2x3 array.
plane = [[1, 2, 3], [4, 5, 6]]
arr = np.array([plane, plane])
print(arr)

#Accessing 1D array
arr = np.array([1, 2, 3, 4])
first_element = arr[0]
print(first_element)

#Accessing 2D array
arr = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
row, col = 0, 1
print('2nd element on 1st dim: ', arr[row, col])

#Accessing 3D array
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[7, 8, 9], [10, 11, 12]]])
# Index order is (block, row, column).
print(arr[0, 1, 2])

#Slicing 1D array
arr = np.array([11, 12, 13, 14, 15, 16, 17])
# Elements at positions 1..4 (the stop index 5 is excluded).
print(arr[slice(1, 5)])

#Slicing 2D array
arr = np.array([[1, 2, 3, 4, 5], [6, 7, 8, 9, 10]])
# Second row, columns 1..3.
second_row = arr[1]
print(second_row[1:4])
Output:
#Creating 1D array
[1 2 3 4 5]
#Creating 2D array
[[1 2 3]
[4 5 6]]
#Creating 3D array
[[[1 2 3]
[4 5 6]]
[[1 2 3]
[4 5 6]]]
#Accessing 1D array
1
#Accessing 2D array
2nd element on 1st dim:  2
#Accessing 3D array
6
#Slicing 1D array
[12 13 14 15]
#Slicing 2D array
[7 8 9]
Ex. No: 2 WORKING WITH PANDAS
PROGRAM
# pandas DataFrame basics: several ways to construct a DataFrame.
import pandas as pd

# Creating DataFrame from list
# Each inner list is one row; column names are supplied separately.
rows = [['tom', 10], ['nick', 15], ['juli', 14]]
df = pd.DataFrame(rows, columns=['Name', 'Age'])
print(df)

# Creating DataFrame from dictionary
# Each key becomes a column and its list holds that column's values.
columns_by_name = {
    'Name': ['Tom', 'nick', 'krish', 'jack'],
    'Age': [20, 21, 19, 18],
}
df = pd.DataFrame(columns_by_name)
print(df)

# Creating DataFrame with explicit index
row_labels = ['rank1', 'rank2', 'rank3', 'rank4']
columns_by_name = {
    'Name': ['Tom', 'Jack', 'nick', 'juli'],
    'marks': [99, 98, 95, 90],
}
df = pd.DataFrame(columns_by_name, index=row_labels)
print(df)

# Creating DataFrame from list of dicts
# Each dict is one row; its keys name the columns.
records = [
    {'a': 1, 'b': 2, 'c': 3},
    {'a': 10, 'b': 20, 'c': 30},
]
df = pd.DataFrame(records)
print(df)

# Add records to dataframe using the .loc function
df = pd.DataFrame(columns=['year', 'make', 'model'])
new_rows = [
    [2014, "toyota", "corolla"],
    [2018, "honda", "civic"],
]
# Assigning to a row label that does not exist yet appends that row.
for label, values in enumerate(new_rows):
    df.loc[label] = values
print(df)
Output:
# Creating DataFrame from list
Name Age
0 tom 10
1 nick 15
2 juli 14
# Creating DataFrame from dictionary
Name Age
0 Tom 20
1 nick 21
2 krish 19
3 jack 18
# Creating data frame with explicit index
Name marks
rank1 Tom 99
rank2 Jack 98
rank3 nick 95
rank4 juli 90
# Creating DataFrame from list of dicts
a b c
0 1 2 3
1 10 20 30
# Add records to dataframe using the .loc function
year make model
0 2014 toyota corolla
1 2018 honda civic
Ex. No: 3 BASIC PLOTS USING MATPLOTLIB
PROGRAM:
# Basic matplotlib plots: line, histogram, scatter, pie and bar charts.
# Each section draws one figure and displays it with plt.show().
import matplotlib.pyplot as plt
import numpy as np

#Line plot
x = [10, 20, 30, 40]
y = [20, 30, 40, 50]
plt.plot(x, y)
plt.title("Simple Plot")
plt.ylabel("y-axis")
plt.xlabel("x-axis")
plt.show()

#Histogram
x = [1, 2, 3, 4, 5, 6, 7, 4]
# Explicit bin edges: one bin per integer interval from 1 to 7.
plt.hist(x, bins=[1, 2, 3, 4, 5, 6, 7])
plt.title("Histogram")
plt.legend(["bar"])
plt.show()

#Scatter plot
x = [3, 1, 3, 12, 2, 4, 4]
y = [3, 2, 1, 4, 5, 6, 7]
plt.scatter(x, y)
# Pass the labels as a list: a bare string is iterated character by
# character, which only happens to work for a one-letter label.
plt.legend(["A"])
plt.title("Scatter chart")
plt.show()

#Pie chart
y = np.array([35, 25, 25, 15])
mylabels = ["Apples", "Bananas", "Cherries", "Dates"]
plt.pie(y, labels=mylabels)
plt.show()

#Bar chart
x = np.array(["A", "B", "C", "D"])
y = np.array([3, 8, 1, 10])
plt.bar(x, y)
plt.show()
Output:
#Line plot
#Histogram
#Scatter plot
#Pie chart
#Bar chart
Ex. No: 4.a FREQUENCY DISTRIBUTION, AVERAGES, VARIABILITY
Program:
# Frequency distributions drawn as histograms with matplotlib and pandas.
import pandas as pd
import matplotlib.pyplot as plt


def _label_marks_axes_and_show():
    # Shared finishing steps for the marks histograms.
    plt.xlabel('Marks')
    plt.ylabel('Count')
    plt.show()


# Marks table used by every pandas-based histogram below; it is only read,
# never modified, so it is built once.
d = {
    'Maths': [90, 75, 68, 66, 72, 50, 45],
    'English': [95, 74, 60, 63, 79, 80, 55],
    'Science': [60, 85, 58, 76, 52, 70, 65],
    'Names': ['Avni', 'Bharathi', 'Dadlin', 'Irfan', 'Karan', 'Mano', 'Ranjit'],
}
df = pd.DataFrame(d)

#Frequency distribution for marks
english_marks = df['English']
plt.hist(english_marks)
_label_marks_axes_and_show()

#Frequency distribution with bins
x = [1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 8, 10, 10, 15, 16, 17, 18,
     20, 25, 28, 30, 35, 35, 37, 40]
bin_edges = [0, 10, 20, 30, 40, 50]
plt.hist(x, bins=bin_edges)
plt.show()

#Frequency distribution with colors
df['Maths'].plot.hist(bins=[40, 60, 80, 100], color='brown')
_label_marks_axes_and_show()

#Frequency distribution with edge color
df['Maths'].plot.hist(bins=4, color='red', edgecolor='black', linewidth=2)
_label_marks_axes_and_show()
Output:
#Frequency distribution for marks
#Frequency distribution with bins
#Frequency distribution with colors
#Frequency distribution with edge color
Ex. No: 4.b AVERAGES – MEAN, MEDIAN AND MODE
Program:
# Measures of central tendency for one sample: mean, median and mode.
import numpy as np
import scipy.stats as st

# The same sample is used for all three measures.
s = [2, 4, 55, 6, 7, 7, 7, 6, 78]

#Mean
x = np.mean(s)
print(x)

#Median
x = np.median(s)
print(x)

#Mode
# st.mode returns a result object carrying the modal value and its count.
x = st.mode(s)
print(x)
Output:
19.11111111111111
7.0
ModeResult(mode=array([7]), count=array([3]))
Ex. No: 5 NORMAL CURVES, CORRELATION AND SCATTER PLOTS, CORRELATION COEFFICIENT
PROGRAM:
# Plot the probability density of a normal distribution.
import math

import matplotlib.pyplot as plt
import numpy as np
import scipy.stats as stats

# Distribution parameters; sigma is derived from the variance.
mu = 0
variance = 1
sigma = math.sqrt(variance)

# 100 evenly spaced points spanning three standard deviations either side
# of the mean.
x = np.linspace(mu - 3*sigma, mu + 3*sigma, 100)
plt.plot(x, stats.norm.pdf(x, mu, sigma))
plt.show()
OUTPUT:
PROGRAM:
# Pearson correlation on the scikit-learn diabetes dataset.
import matplotlib.pyplot as plt
import numpy as np  # needed for np.corrcoef below
import pandas as pd
import seaborn as sns
from sklearn.datasets import load_diabetes

# Load the dataset and keep the DataFrame it exposes as `.frame`.
df = load_diabetes(as_frame=True)
df = df.frame

# Pairwise Pearson correlation between all columns; print the first 7 rows
# (a bare `corr.head(7)` shows nothing when run as a script).
corr = df.corr(method='pearson')
print(corr.head(7))

# 2x2 correlation matrix for the 'age' and 'sex' columns.
c = np.corrcoef(df['age'], df['sex'])
print('Correlations between age and sex\n', c)
OUTPUT:
PROGRAM:
# Scatter plot of two series with a fitted straight line, plus their
# correlation coefficient.
import sklearn
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

y = pd.Series([1, 2, 3, 4, 3, 5, 4])
x = pd.Series([1, 2, 3, 4, 5, 6, 7])

# Correlation between the two series (Series.corr defaults to Pearson).
correlation = y.corr(x)
print('Correlation coefficient:', correlation)

plt.title('Correlation')
plt.scatter(x, y)
# Degree-1 polynomial fit evaluated at the distinct x values and drawn in
# red over the scatter points.
unique_x = np.unique(x)
fitted_line = np.poly1d(np.polyfit(x, y, 1))
plt.plot(unique_x, fitted_line(unique_x), color='red')
plt.xlabel('x axis')
plt.ylabel('y axis')
# Without show() nothing is displayed when run as a script.
plt.show()
OUTPUT:
Ex. No: 6 REGRESSION
PROGRAM
import numpy as np
import matplotlib.pyplot as plt
def estimate_coef(x, y):
    """Estimate the coefficients of the line y = b_0 + b_1 * x.

    Parameters
    ----------
    x, y : array-like of numbers
        Observations of the predictor and the response. Lists are accepted;
        both are converted to float arrays so element-wise products work and
        large integer inputs cannot overflow.

    Returns
    -------
    tuple
        ``(b_0, b_1)``: the intercept and the slope.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)

    # number of observations/points
    n = np.size(x)

    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)

    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y*x) - n*m_y*m_x
    SS_xx = np.sum(x*x) - n*m_x*m_x

    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1*m_x

    return (b_0, b_1)
def plot_regression_line(x, y, b):
    """Show the observations as a scatter plot with the regression line.

    Parameters
    ----------
    x, y : array-like of numbers
        Observed predictor and response values.
    b : sequence
        ``b[0]`` is the intercept and ``b[1]`` the slope of the line to draw.
    """
    # Convert so that ``b[1]*x`` is element-wise even when x is a plain list.
    x = np.asarray(x)

    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)

    # predicted response vector
    y_pred = b[0] + b[1]*x

    # plotting the regression line
    plt.plot(x, y_pred, color="g")

    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')

    # function to show plot
    plt.show()
def main():
    """Fit a regression line to the sample data, print it and plot it."""
    # observations / data
    x = np.array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])
    y = np.array([1, 3, 2, 5, 7, 8, 8, 9, 10, 12])

    # estimating coefficients
    b = estimate_coef(x, y)
    print("Estimated coefficients:\nb_0 = {} \nb_1 = {}".format(b[0], b[1]))

    # plotting regression line
    plot_regression_line(x, y, b)


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
OUTPUT:
22
Ex. No: 7 Z-TEST
Program:-
# One-sample z-test on the "selling_price" column of a CSV file.
import pandas as pd
import numpy as np
from statsmodels.stats.weightstats import ztest

# NOTE: absolute Windows path; adjust it to where the CSV file is stored.
data = pd.read_csv(r"D:\kp\CAR DETAILS FROM CAR DEKHO.csv")

selling_price = data["selling_price"]
selling_price_mean = np.mean(selling_price)
print("Mean of selling price:", selling_price_mean)

# `value` is the mean assumed under the null hypothesis.
ztest_score, pval = ztest(data['selling_price'], value=484120)
print("Pval:", pval)

# Decision at the 5% significance level.
if pval > 0.05:
    print("Accepting the null hypothesis")
else:
    print("Rejecting the null hypothesis")
OUTPUT:
Mean of selling price: 504127.3117511521
Pval: 0.022714099677937877
Rejecting the null hypothesis
Ex.No:08 T-TEST
PROGRAM:
# Two-sample t-test comparing the "selling_price" column of two CSV files.
import pandas as pd
from scipy import stats

print('NULL HYPOTHESIS: Both datasets are from same population')
print('ALTERNATE HYPOTHESIS: Both datasets are not from same population')

# NOTE: absolute Windows paths; adjust them to where the CSV files are stored.
data1 = pd.read_csv(r"D:\kp\CAR DETAILS FROM CAR DEKHO1.csv")
data2 = pd.read_csv(r"D:\kp\FINAL_SPINNY_900.csv")

selling_price1 = data1["selling_price"]
selling_price2 = data2["selling_price"]

# Independent two-sample t-test on the two price columns.
ttest, p_value = stats.ttest_ind(selling_price1, selling_price2)
print('Test statistic is %f' % ttest)
print('p-value for two tailed test is %f' % p_value)

# Significance level for the decision.
alpha = 0.05
if p_value <= alpha:
    print('''Rejecting null hypothesis
Both datasets are not from same population''')
else:
    print("""Accepting null hypothesis.
Both datasets are from same population""")
OUTPUT:
NULL HYPOTHESIS: Both datasets are from same population
ALTERNATE HYPOTHESIS: Both datasets are not from same population
Test statistic is -1.243571
p-value for two tailed test is 0.213977
Accepting null hypothesis.
Both datasets are from same population
Ex.No:9 ANOVA
PROGRAM:
# Importing libraries
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.formula.api import ols

# Create a dataframe
# 30 height observations: the first 15 rows are labelled 'daily' and the
# last 15 'weekly' in both factor columns.
# NOTE(review): 'Fertilizer' and 'Watering' are built from the same
# np.repeat expression, so the two factors are identical in this data -
# confirm whether that is intended.
dataframe = pd.DataFrame({
    'Fertilizer': np.repeat(['daily', 'weekly'], 15),
    'Watering': np.repeat(['daily', 'weekly'], 15),
    'height': [14, 16, 15, 15, 16, 13, 12, 11, 14, 15, 16, 16, 17, 18, 14,
               13, 14, 14, 14, 15, 16, 16, 17, 18, 14, 13, 14, 14, 14, 15],
})

# Performing two-way ANOVA
# Model with both main effects and their interaction.
model = ols(
    'height ~ C(Fertilizer) + C(Watering) + C(Fertilizer):C(Watering)',
    data=dataframe,
).fit()

# anova_lm reads the keyword `typ`; the previously passed `type=2` is not a
# recognised keyword, so the table shown in this manual (df, sum_sq,
# mean_sq, F, PR(>F)) is the default Type I table. `typ=1` states that
# explicitly; use `typ=2` to request Type II sums of squares instead.
result = sm.stats.anova_lm(model, typ=1)

# Print the result
print(result)
OUTPUT:
C(Fertilizer) 1.0 0.033333 0.033333 0.012069 0.913305
C(Watering) 1.0 0.000369 0.000369 0.000133 0.990865
C(Fertilizer):C(Watering) 1.0 0.040866 0.040866 0.014796 0.904053
Residual 28.0 77.333333 2.761905 NaN NaN
Ex No:10 BUILDING AND VALIDATING LINEAR MODELS
PROGRAM:
# Fit a straight line to (x, y) with scipy and draw it over the data.
import matplotlib.pyplot as plt
from scipy import stats

x = [89, 43, 36, 36, 95, 10, 66, 34, 38, 20, 26, 29, 48, 64, 6, 5, 36, 66, 72, 40]
y = [21, 46, 3, 35, 67, 95, 53, 72, 58, 10, 26, 34, 90, 33, 38, 20, 56, 2, 47, 15]

# linregress returns the slope and intercept of the fitted line together
# with the correlation coefficient r, the p-value and the standard error.
slope, intercept, r, p, std_err = stats.linregress(x, y)


def myfunc(x):
    """Return the fitted line's value ``slope * x + intercept`` at *x*."""
    return slope * x + intercept


# Predicted y for every observed x.
mymodel = list(map(myfunc, x))

plt.scatter(x, y)
plt.plot(x, mymodel)
plt.show()
OUTPUT:
Ex No:11 BUILDING AND VALIDATING LOGISTIC MODEL
PROGRAM:
import numpy
from sklearn import linear_model
# Twelve samples of a single feature, reshaped into one column because the
# estimator is fitted on a 2-D array.
feature_values = [3.78, 2.44, 2.09, 0.14, 1.72, 1.65,
                  4.92, 4.37, 4.96, 4.52, 3.69, 5.88]
X = numpy.array(feature_values).reshape(-1, 1)

# Binary class label (0 or 1) for each sample, in the same order as X.
y = numpy.array([0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1])

# fit() returns the estimator itself, so creation and fitting can be chained.
logr = linear_model.LogisticRegression().fit(X, y)
def logit2prob(logr, X):
    """Convert a fitted model's log-odds for *X* into probabilities.

    Parameters
    ----------
    logr : object
        Fitted model exposing ``coef_`` and ``intercept_``.
    X : numpy array
        Feature values; combined with ``coef_`` by element-wise
        multiplication (with broadcasting).

    Returns
    -------
    numpy array
        ``1 / (1 + exp(-log_odds))`` for each entry, which equals
        ``odds / (1 + odds)``.
    """
    log_odds = logr.coef_ * X + logr.intercept_
    # The sigmoid form avoids the inf / inf = nan that ``odds / (1 + odds)``
    # produces once exp(log_odds) overflows for large positive log-odds.
    probability = 1.0 / (1.0 + numpy.exp(-log_odds))
    return probability
# Probability computed from the fitted model for every training sample.
probabilities = logit2prob(logr, X)
print(probabilities)
OUTPUT:
[[0.60749955]
[0.19268876]
[0.12775886]
[0.00955221]
[0.08038616]
[0.07345637]
[0.88362743]
[0.77901378]
[0.88924409]
[0.81293497]
[0.57719129]
[0.96664243]]
Ex No:12 TIME SERIES ANALYSIS
PROGRAM:
# Load a stock-price CSV indexed by date and plot its 'Volume' column.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# reading the dataset using read_csv
# The "Date" column becomes the index and is parsed as dates.
df = pd.read_csv("stock_data.csv",
                 parse_dates=True,
                 index_col="Date")

# displaying the first five rows of dataset
# (a bare `df.head()` shows nothing when run as a script)
print(df.head())

# deleting column
# drop() returns a new DataFrame, so the result must be assigned back;
# otherwise the column is never removed.
df = df.drop(columns='Unnamed: 0')

df['Volume'].plot()
plt.show()
OUTPUT: