DATA ANALYTICS LAB (BIT - 651)
Department of Information Technology
G. L. Bajaj Institute of Technology and Management,
Greater Noida
SESSION: 2024-25
Course: - B. Tech
Year: 3rd /6th Sem
SUBMITTED TO: Ms. Kavya Goswami, Ms. Pooja Tomar (Designation: Assistant Professor)
SUBMITTED BY: Amitesh Pandey (Roll no: 2201920130025)
INDEX
Execution Submission
S. No. Program Name Signature Remarks
Date Date
To get the input from the user and perform
numerical operations (MAX,MIN,
1 AVG,SUM, SQRT, ROUND) using
R/python.
To perform data import/export (.CSV, .XLS,
2 .TXT) operations using data frames in
R/Python
To get the input matrix from the user and
perform Matrix addition,subtraction,
3 multiplication, inverse transpose and division
operations using vector concept in Python.
To perform statistical operations (Mean,
4 Median, Mode and Standard
deviation) using Python.
To perform data pre-processing operations
5 i) Handling Missing data ii) Min-Max
normalization.
6 To perform dimensionality reduction
operation using PCA for Houses Data Set.
7 To perform Simple Linear Regression with
Python.
8 To perform K-Means clustering operation
and visualize for iris data set.
Write a Python script to diagnose any
9 disease using KNN classification and plot the
results.
10 To perform market basket analysis using
Association Rules (Apriori).
11 To perform Multiple Linear Regression
with Python
Program -1
Objective: To get the input from the user and perform numerical operations
(MAX, MIN, AVG, SUM, SQRT, ROUND) using Python.
Code:
import pandas as pd
import numpy as np
import seaborn as sns

# Load the iris sample dataset and select one numeric column to analyze.
df = sns.load_dataset('iris')
numeric = df.select_dtypes(include=[np.number]).columns.tolist()
col = "sepal_length"
data = df[col].values

# Preview the data and the numeric columns that were detected.
print(df.head())
print(numeric)

# Aggregate statistics over the selected column.
print(np.max(data))   # maximum value
print(np.min(data))   # minimum value
print(np.mean(data))  # average value
print(np.sum(data))   # total

# Element-wise operations on small slices of the column.
print(np.sqrt(data[:6]))   # square roots of the first six values
print(np.round(data[:5]))  # first five values rounded to the nearest integer
Output:
Program -2
Objective: To perform data import/export (.CSV, .XLS, .TXT) operations using data frames
in R/Python
Implementation:
Import the necessary library:
import pandas as pd

# ---------- Importing data ----------

# 1. Import a .CSV file.
df_csv = pd.read_csv('data.csv')
print(df_csv.head())

# 2. Import a .XLS / .XLSX (Excel) file; sheet_name is optional.
df_excel = pd.read_excel('data.xlsx', sheet_name='Sheet1')
print(df_excel.head())

# 3. Import a .TXT file (tab-separated here; use ',' or another delimiter if needed).
df_txt = pd.read_csv('data.txt', delimiter='\t')
print(df_txt.head())

# ---------- Exporting data ----------

# 1. Export to .CSV (index=False drops the pandas row index from the file).
df_csv.to_csv('output.csv', index=False)

# 2. Export to .XLSX.
df_excel.to_excel('output.xlsx', index=False)

# 3. Export to .TXT (tab-separated).
df_txt.to_csv('output.txt', sep='\t', index=False)
Program -3
Objective: To get the input matrix from the user and perform matrix addition, subtraction,
multiplication, inverse, transpose and division operations using the vector concept in Python.
Code:
import numpy as np
# Input matrices from user
def get_matrix(name):
    """Read an r x c matrix from standard input.

    Prompts for the number of rows and columns for *name*, then reads all
    r*c elements from a single whitespace-separated line, row-wise.
    Returns a NumPy array of shape (r, c); raises ValueError if the number
    of elements entered does not match r*c.
    """
    r = int(input(f"Enter number of rows for {name}: "))
    c = int(input(f"Enter number of columns for {name}: "))
    print(f"Enter the elements for {name} row-wise:")
    elements = list(map(float, input().split()))
    matrix = np.array(elements).reshape(r, c)
    return matrix
# Get matrices A and B from the user.
print("Matrix A:")
A = get_matrix("Matrix A")
print("\nMatrix B:")
B = get_matrix("Matrix B")

print("\n--- Matrix Operations ---")

# Addition: only defined when the shapes match exactly.
if A.shape == B.shape:
    print("\nAddition (A + B):")
    print(A + B)
else:
    print("\nAddition not possible (shape mismatch)")

# Subtraction: same shape requirement as addition.
if A.shape == B.shape:
    print("\nSubtraction (A - B):")
    print(A - B)
else:
    print("\nSubtraction not possible (shape mismatch)")

# Matrix multiplication: columns of A must equal rows of B.
if A.shape[1] == B.shape[0]:
    print("\nMultiplication (A * B):")
    print(np.dot(A, B))
else:
    print("\nMultiplication not possible (columns of A != rows of B)")

# Transpose is always defined.
print("\nTranspose of A:")
print(A.T)
print("\nTranspose of B:")
print(B.T)

# Inverse: only square, non-singular matrices have one.
try:
    if A.shape[0] == A.shape[1]:
        print("\nInverse of A:")
        print(np.linalg.inv(A))
    else:
        print("\nInverse of A not possible (A is not square)")
    if B.shape[0] == B.shape[1]:
        print("\nInverse of B:")
        print(np.linalg.inv(B))
    else:
        print("\nInverse of B not possible (B is not square)")
except np.linalg.LinAlgError:
    print("\nInverse not possible (matrix is singular)")

# Matrix "division" A / B implemented as A * inv(B).
try:
    if B.shape[0] == B.shape[1] and A.shape[1] == B.shape[0]:
        print("\nDivision (A * inv(B)):")
        B_inv = np.linalg.inv(B)
        print(np.dot(A, B_inv))
    else:
        print("\nDivision not possible (shape mismatch or B not square)")
except np.linalg.LinAlgError:
    print("\nDivision not possible (B is singular)")
Output:
Program - 4
Objective: To perform statistical operations (Mean, Median, Mode and Standard
deviation) using Python.
Code:
import csv
import statistics
def read_data_from_csv(file_path):
    """Read the 'value' column of a CSV file into a list of floats.

    Rows whose 'value' entry is not numeric are skipped with a warning.
    A missing file is reported and yields an empty list instead of raising.
    """
    data = []
    try:
        with open(file_path, mode='r') as file:
            csv_reader = csv.DictReader(file)
            for row in csv_reader:
                try:
                    value = float(row['value'])  # ensure numeric data
                    data.append(value)
                except ValueError:
                    print(f"Skipping invalid data: {row['value']}")
    except FileNotFoundError:
        print(f"File not found: {file_path}")
    return data
def compute_statistics(data):
    """Print mean, median, mode and sample standard deviation of *data*.

    Empty input prints a notice and returns early. StatisticsError (e.g.
    stdev of a single value) is caught and reported rather than raised.
    """
    if not data:
        print("No valid data to compute statistics.")
        return
    try:
        mean = statistics.mean(data)
        median = statistics.median(data)
        mode = statistics.mode(data)
        std_dev = statistics.stdev(data)
        print("\nStatistics Summary:")
        print(f"Mean: {mean}")
        print(f"Median: {median}")
        print(f"Mode: {mode}")
        print(f"Standard Deviation: {std_dev:.2f}")
    except statistics.StatisticsError as e:
        print(f"Statistics Error: {e}")
def main():
    """Entry point: load data.csv and report its descriptive statistics."""
    file_path = 'data.csv'
    data = read_data_from_csv(file_path)
    compute_statistics(data)


if __name__ == "__main__":
    main()
Output:
Program - 5
Objective: To perform data pre-processing operations i) Handling Missing data ii) Min-Max
normalization.
Code:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
def handle_missing_data(df, column):
    """Replace missing values in *column* with the column mean; return df.

    Mutates the passed DataFrame in place and also returns it for chaining.
    Columns with no missing values are left untouched (and nothing printed).
    """
    if df[column].isnull().any():
        mean_val = df[column].mean()
        # Assign back rather than calling fillna(inplace=True) on the column:
        # in-place fill on a column selection is deprecated chained assignment
        # in modern pandas and may silently not update the frame.
        df[column] = df[column].fillna(mean_val)
        print(f"Missing values in '{column}' replaced with mean: {mean_val}")
    return df
def min_max_normalize(df, column):
    """Add a '<column>_normalized' column scaled to [0, 1]; return df.

    Uses the standard min-max formula (x - min) / (max - min), computed
    directly with pandas — equivalent to sklearn's MinMaxScaler with its
    default feature_range, including the constant-column case (all zeros),
    but without needing sklearn at all.
    """
    col = df[column]
    col_min = col.min()
    col_range = col.max() - col_min
    if col_range == 0:
        # Constant column: MinMaxScaler maps everything to 0 here too.
        df[column + "_normalized"] = 0.0
    else:
        df[column + "_normalized"] = (col - col_min) / col_range
    print(f"Min-Max normalization applied on '{column}'")
    return df
def main():
    """Entry point: load the CSV, fill missing 'value' entries with the mean,
    then append a min-max-normalized copy of the column."""
    file_path = 'data_preprocessing.csv'
    try:
        df = pd.read_csv(file_path)
        print("Original Data:\n", df)
        # Handle missing data.
        df = handle_missing_data(df, 'value')
        # Apply Min-Max normalization.
        df = min_max_normalize(df, 'value')
        print("\nPreprocessed Data:\n", df)
    except FileNotFoundError:
        print(f"File '{file_path}' not found.")


if __name__ == "__main__":
    main()
Output:
Program - 6
Objective: To perform dimensionality reduction operation using PCA for Houses Data Set.
Code:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Step 1: Load the California Housing dataset (the "houses" data set).
california = fetch_california_housing(as_frame=True)
df = california.frame
print(df)

# Step 2: Standardize the features — PCA is sensitive to scale.
features = df.drop('MedHouseVal', axis=1)  # remove the target column
scaler = StandardScaler()
features_scaled = scaler.fit_transform(features)

# Step 3: Apply PCA, reducing the feature space to 2 dimensions.
pca = PCA(n_components=2)
principal_components = pca.fit_transform(features_scaled)

# Step 4: Wrap the PCA result in a DataFrame.
pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])

# Step 5: Plot the 2-D projection, colored by the target value.
plt.figure(figsize=(8, 6))
plt.scatter(pca_df['PC1'], pca_df['PC2'], alpha=0.5,
            c=df['MedHouseVal'], cmap='viridis')
plt.colorbar(label='Median House Value ($100,000s)')
plt.title('PCA on California Housing Dataset')
plt.xlabel('Principal Component 1')
plt.ylabel('Principal Component 2')
plt.grid(True)
plt.show()

# Step 6: Report how much variance each principal component explains.
print("Explained Variance Ratio:", pca.explained_variance_ratio_)
Output:
Program - 7
Objective: To perform Simple Linear Regression with Python.
Code:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
# Load the salary dataset.
sd = pd.read_csv("salary_data.csv")
# BUG FIX: original called wd.head() — 'wd' was never defined (NameError);
# the DataFrame is named 'sd'.
sd.head()
sd.columns
x = sd['YearsExperience']
y = sd['Salary']

# Plot the raw (x, y) points.
plt.scatter(x, y)
plt.xlabel('YearsExperience')
plt.ylabel('Salary')

# Simple Linear Regression model: x must be a 2-D column for sklearn.
from sklearn.linear_model import LinearRegression
x = sd['YearsExperience'].values.reshape(-1, 1)
y = sd['Salary']
lr_model = LinearRegression()
lr_model.fit(x, y)
y_pred = lr_model.predict(x)
y_pred

# Plot the fitted regression line over the points.
plt.scatter(x, y)
plt.xlabel('YearsExperience')
plt.ylabel('Salary')
plt.plot(x, y_pred)

# Model parameters: intercept (theta_0) and slope (theta_1).
theta_0 = lr_model.intercept_
theta_1 = lr_model.coef_
theta_0, theta_1

# Predict the salary for a new value (17 years of experience).
y_pred = lr_model.predict(np.array([17]).reshape(1, 1))
y_pred
Output : Successfully performed the simple linear regression in Python.
Program - 8
Objective: To perform K-Means clustering operation and visualize for iris data set.
Code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans
import seaborn as sns
# The 'seaborn' style sheet was removed in Matplotlib 3.6; use its renamed
# successor when available so the script runs on modern Matplotlib too.
if 'seaborn-v0_8' in plt.style.available:
    plt.style.use('seaborn-v0_8')
else:
    plt.style.use('seaborn')

# Read the iris data from the .csv file.
data = pd.read_csv('iris.csv')
print(data)

# Dimensions of the dataset — expected (150, 6).
print(data.shape)

# Summary of missing values per column.
print(data.isnull().sum())

# The 'Id' column has no relevance to clustering, so drop it.
data.drop('Id', axis=1, inplace=True)
print(data.head())

# Cluster on the four numeric feature columns only.
clustering_data = data.iloc[:, [0, 1, 2, 3]]
print(clustering_data.head())

# K-Means with 3 clusters (iris has 3 species) and k-means++ seeding.
# n_init=10 pins the historical default (newer sklearn defaults to 'auto').
kms = KMeans(n_clusters=3, init='k-means++', n_init=10)
clusters = clustering_data.copy()
# fit_predict both fits the model and assigns each sample a cluster label
# (the original fitted twice: fit() followed by fit_predict()).
clusters['Cluster_Prediction'] = kms.fit_predict(clustering_data)
print(clusters.head())

# Scatter plot: one color per predicted cluster, plus the red centroids.
fig, ax = plt.subplots(figsize=(10, 7))
cluster_styles = [(0, 'lime', 'Iris-versicolor'),
                  (1, 'teal', 'Iris-setosa'),
                  (2, 'magenta', 'Iris-virginica')]
for label, color, species in cluster_styles:
    subset = clusters[clusters['Cluster_Prediction'] == label]
    plt.scatter(x=subset['SepalLengthCm'], y=subset['SepalWidthCm'],
                s=70, edgecolor=color, linewidth=0.3, c=color, label=species)
plt.scatter(x=kms.cluster_centers_[:, 0], y=kms.cluster_centers_[:, 1],
            s=170, c='red', label='Centroids', edgecolor='black', linewidth=0.3)
plt.legend(loc='upper right')
plt.xlim(4, 8)
plt.ylim(1.8, 4.5)
ax.set_ylabel('Sepal Width (in cm)')
ax.set_xlabel('Sepal Length (in cm)')
plt.title('Clusters', fontsize=20)
plt.show()
Output:
Program - 9
Objective: Write a Python script to diagnose any disease using KNN classification and plot
the results.
Code:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix
from sklearn import metrics
# Load the heart-disease dataset.
data = pd.read_csv('heart.csv')
print(data.shape)
print(data.head())

# Class balance of the diagnosis target and the sex column.
sns.countplot(x="target", data=data, palette="bwr")
plt.show()
sns.countplot(x='sex', data=data, palette="mako_r")
plt.xlabel("Sex (0 = female, 1= male)")
plt.show()

# Age vs. maximum heart rate, split by diagnosis.
plt.scatter(x=data.age[data.target == 1], y=data.thalach[(data.target == 1)], c="red")
plt.scatter(x=data.age[data.target == 0], y=data.thalach[(data.target == 0)], c='black')
plt.legend(["Disease", "Not Disease"])
plt.xlabel("Age")
plt.ylabel("Maximum Heart Rate")
plt.show()

# Features: every column but the last; target: column 13.
X = data.iloc[:, :-1].values
y = data.iloc[:, 13].values
# NOTE(review): the source line was truncated at "random_stat"; restored
# as random_state=0 — confirm against the original notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

# Standardize: fit the scaler on the training split only, apply to both.
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

# Evaluate KNN (minkowski with p=2, i.e. Euclidean distance) for several k;
# the original duplicated this block four times for k = 5, 6, 7, 8.
for k in (5, 6, 7, 8):
    classifier = KNeighborsClassifier(n_neighbors=k, metric='minkowski', p=2)
    classifier = classifier.fit(X_train, y_train)
    y_pred = classifier.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, y_pred)
    print('Accuracy: {:.2f}'.format(accuracy))
Conclusion: Diagnose heart disease using KNN classification and plotted the results.
Program - 10
Objective: To perform market basket analysis using Association Rules (Apriori).
Code:
import pandas as pd
import numpy as np
from apyori import apriori
# Load the transactions; the file has no header row.
store_data = pd.read_csv("Market_Basket_Optimisation.csv", header=None)
num_record = len(store_data)
print(num_record)  # 7501 transactions in this dataset

# Convert each transaction row into a list of item-name strings
# (up to 20 items per row; empty cells become the string 'nan').
records = []
for i in range(0, num_record):
    records.append([str(store_data.values[i, j]) for j in range(0, 20)])

# Mine association rules with Apriori using the chosen thresholds.
association_rules = apriori(records, min_support=0.0053,
                            min_confidence=0.20, min_lift=3, min_length=2)
association_results = list(association_rules)
print(len(association_results))  # 32 rules at these thresholds
print(association_results[0])

# Flatten each rule into (item1, item2, support, confidence, lift) strings,
# truncated to 7 characters for compact display.
results = []
for item in association_results:
    pair = item[0]
    items = [x for x in pair]
    value0 = str(items[0])
    value1 = str(items[1])
    value2 = str(item[1])[:7]
    value3 = str(item[2][0][2])[:7]
    value4 = str(item[2][0][3])[:7]
    rows = (value0, value1, value2, value3, value4)
    results.append(rows)

Label = ['Item1', 'Item2', 'Support_count', 'Confidence', 'Lift']
store_suggestions = pd.DataFrame.from_records(results, columns=Label)
print(store_suggestions)
Output:
Program - 11
Objective: To perform Multiple Linear Regression with Python.
Code:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns
# BUG FIX: pandas is imported as 'pd'; 'pandas.read_csv' raised NameError.
df = pd.read_csv("Student_Performance.csv")
df.columns

# Predictors and target for the multiple regression.
X1 = df[["Previous Scores", "Hours Studied",
         "Sleep Hours", "Sample Question Papers Practiced"]]
y1 = df["Performance Index"]

# NOTE(review): the source line was truncated at "random_"; restored as
# random_state=42 — confirm against the original notebook.
X1_train, X1_test, y1_train, y1_test = train_test_split(X1, y1, test_size=0.3, random_state=42)

lin_reg1 = LinearRegression()
model1 = lin_reg1.fit(X1_train, y1_train)
predictions1 = lin_reg1.predict(X1_test)
print("Rscore (r): ", model1.score(X1_test, y1_test))

# Side-by-side table of actual vs. predicted performance
# (the original's pd.DataFrame({...}) literal was left unterminated).
df1 = pd.DataFrame({'Actual Performance': y1_test,
                    'Predicted Performance': predictions1})
print(df1)

# One count plot per column; x tick labels hidden (values are near-continuous).
columns = df1[['Actual Performance', 'Predicted Performance']]
for column in columns:
    plt.figure(figsize=(7, 5))
    sns.countplot(x=column, data=df1, palette="colorblind", alpha=0.8)
    plt.xticks([])
    plt.title(f'{column} Distribution')
    plt.show()
Output: