List of Experiments:
1. Data Preprocessing:
● Handling missing values.
● Noise detection and removal.
● Identifying and eliminating data redundancy
2. Implementing an imputation model
3. Implement Linear Regression.
4. Implement Logistic Regression.
5. Implement Decision Tree Induction for classification.
6. Implement Random Forest Classifier.
7. Apply ARIMA on Time Series data.
8. Perform object segmentation using hierarchical methods
9. Perform visualization techniques (e.g., bar, column, line, scatter plots, 3D cubes).
10. Conduct descriptive analytics on healthcare data.
11. Perform predictive analytics on product sales data.
12. Apply predictive analytics for weather forecasting.
Creating our own dataset and filling in missing values
import numpy as np
import pandas as pd

# Build a small 5-row frame in which columns A, B and C each contain
# exactly one NaN, while column D is complete.
data = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# Show the raw frame, NaN entries included.
print(df)
output:
A B C D
0 1.0 NaN 1.0 1
1 2.0 2.0 2.0 2
2 NaN 3.0 3.0 3
3 4.0 4.0 NaN 4
4 5.0 5.0 5.0 5
A. Ignoring missing values
import numpy as np
import pandas as pd

# Same sample frame as before: one NaN in each of A, B and C.
data = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# "Ignore" missing data by discarding every row that holds at least one NaN.
# The frame is modified in place, so only fully populated rows remain in df.
df.dropna(axis=0, how="any", inplace=True)

print(df)
output:
A B C D
1 2.0 2.0 2.0 2
4 5.0 5.0 5.0 5
Replacing missing values
import numpy as np
import pandas as pd

# Sample frame with one NaN in each of A, B and C.
data = {
    "A": [1, 2, np.nan, 4, 5],
    "B": [np.nan, 2, 3, 4, 5],
    "C": [1, 2, 3, np.nan, 5],
    "D": [1, 2, 3, 4, 5],
}
df = pd.DataFrame(data)

# Substitute the constant 0 for every NaN, updating the frame in place.
df.fillna(0, inplace=True)

print(df)
Output:
A B C D
0 1.0 0.0 1.0 1
1 2.0 2.0 2.0 2
2 0.0 3.0 3.0 3
3 4.0 4.0 0.0 4
4 5.0 5.0 5.0 5
B. Creating a dataset and Identifying data redundancy and elimination
import pandas as pd

# Student records in which (1, Alice) and (3, Charlie) each appear twice.
data = {
    "ID": [1, 2, 3, 4, 1, 3, 5],
    "Name": ["Alice", "Bob", "Charlie", "David", "Alice", "Charlie", "Eve"],
}
df = pd.DataFrame(data)

print("Original Dataset:")
print(df)

# duplicated() flags every repeat of an earlier row; the first occurrence
# of each record is not flagged.
repeat_mask = df.duplicated()
duplicates = df[repeat_mask]
print("\nDuplicate Records:")
print(duplicates)

# Keep only the first occurrence of each record.
df_cleaned = df.drop_duplicates(keep="first")
print("\nDataset after removing duplicates:")
print(df_cleaned)
Output:
Original Dataset:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie
3 4 David
4 1 Alice
5 3 Charlie
6 5 Eve
Duplicate Records:
ID Name
4 1 Alice
5 3 Charlie
Dataset after removing duplicates:
ID Name
0 1 Alice
1 2 Bob
2 3 Charlie
3 4 David
6 5 Eve
C. Noise detection and removal
import numpy as np
import pandas as pd
from scipy import stats

# Student ages containing noisy entries (200 and -5 are the obvious ones).
data = {"Age": [18, 20, 19, 21, 22, 200, 10, -5]}
df = pd.DataFrame(data)

# Rule-based cleaning first: an age must be positive.
is_positive = df["Age"] > 0
df = df[is_positive]

# Statistical cleaning second: absolute Z-score of each remaining age.
z_scores = np.abs(stats.zscore(df["Age"]))

# Rows whose |Z| reaches this cut-off are treated as outliers.
threshold = 2
keep_rows = z_scores < threshold
df_clean = df[keep_rows]

# Report the (positive-only) data, the scores, and the final result.
report = (
    ("Dataset with Noise:", df),
    ("\nZ-Scores:", z_scores),
    ("\nCleaned Dataset using Z-score method:", df_clean),
)
for heading, content in report:
    print(heading)
    print(content)
OUTPUT:
Dataset with Noise:
Age
0 18
1 20
2 19
3 21
4 22
5 200
6 10
Z-Scores:
0 0.412811
1 0.381402
2 0.397107
3 0.365697
4 0.349992
5 2.445459
6 0.538450
Cleaned Dataset using Z-score method:
Age
0 18
1 20
2 19
3 21
4 22
6 10
2.Implement an Imputation Model
Objective: Use an imputation model to fill missing values.
import pandas as pd
import numpy as np

# Create a DataFrame with one missing value in each of columns A, B and C
data = {
    'A': [1, 2, np.nan, 4, 5],
    'B': [np.nan, 2, 3, 4, 5],
    'C': [1, 2, 3, np.nan, 5],
    'D': [1, 2, 3, 4, 5]
}
df = pd.DataFrame(data)

# Each column is imputed with a different statistic. The result is assigned
# back to the column rather than calling df[col].fillna(..., inplace=True):
# the in-place form operates on a selection of the frame, which recent pandas
# versions warn about and which does not update df under Copy-on-Write.

# Replace missing values with mean
df['A'] = df['A'].fillna(df['A'].mean())

# Replace missing values with median
df['B'] = df['B'].fillna(df['B'].median())

# Replace missing values with mode (most frequent value).
# mode() can return several values when there is a tie; [0] takes the first.
df['C'] = df['C'].fillna(df['C'].mode()[0])

# Print the cleaned DataFrame
print(df)
OUTPUT:
A B C D
0 1.0 3.5 1.0 1
1 2.0 2.0 2.0 2
2 3.0 3.0 3.0 3
3 4.0 4.0 1.0 4
4 5.0 5.0 5.0 5
3. Implementation of linear regression in Python
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression

# Five houses: floor area and the corresponding sale price.
data = {
    "Area (sq ft)": [2600, 3000, 3200, 3600, 4000],
    "Price ($)": [550000, 565000, 610000, 630000, 725000],
}
df = pd.DataFrame(data)
print("Data:")
print(df)

# Plot the observed points.
plt.scatter(df["Area (sq ft)"], df["Price ($)"], color="blue", label="Actual Data")
plt.xlabel("Area (sq ft)")
plt.ylabel("Price ($)")
plt.title("House Prices vs. Area")

# Fit price as a linear function of area. scikit-learn expects a 2-D
# feature matrix, hence the single-column selection converted to an array.
X = df[["Area (sq ft)"]].to_numpy()
y = df["Price ($)"].to_numpy()
model = LinearRegression()
model.fit(X, y)

# Evaluate the fitted line on an evenly spaced grid of areas and overlay it.
area_range = np.linspace(2500, 4200, 100).reshape(-1, 1)
predicted_prices = model.predict(area_range)
plt.plot(
    area_range,
    predicted_prices,
    color="red",
    linestyle="dashed",
    label="Regression Line",
)
plt.legend()
plt.show()

# Example: point prediction for a 3500 sq ft house.
predicted_price = model.predict([[3500]])[0]
print(f"Predicted price for 3500 sq ft: ${predicted_price:,.2f}")
output:
Data:
Area (sq ft) Price ($)
0 2600 550000
1 3000 565000
2 3200 610000
3 3600 630000
4 4000 725000
Predicted price for 3500 sq ft: $642,859.59
4. Implementation of logistic regression in Python
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression

# Sample Data: [Hours Studied, Passed (1) / Failed (0)]
X = np.array([[1], [2], [3], [4], [5], [6], [7], [8], [9], [10]])  # Hours studied
y = np.array([0, 0, 0, 0, 1, 1, 1, 1, 1, 1])  # 0 = Fail, 1 = Pass

# Train Logistic Regression Model.
# The script defines no train/test split, so the model is fitted on the
# full sample X, y (the previous X_train / y_train names were undefined).
model = LogisticRegression()
model.fit(X, y)

# Predict for a new student who studied 4.5 hours
new_data = np.array([[4.5]])
prediction = model.predict(new_data)
# predict_proba returns one row per sample; column 1 is the "pass" class
probability = model.predict_proba(new_data)
print("Predicted Class (Pass=1, Fail=0):", prediction[0])
print("Probability of Passing:", probability[0][1])

# Plotting the results: observed labels plus the fitted probability curve
plt.scatter(X, y, color='blue', label="Actual Data")
plt.plot(X, model.predict_proba(X)[:, 1], color='red', linestyle="--", label="Logistic Curve")
plt.xlabel("Hours Studied")
plt.ylabel("Probability of Passing")
plt.title("Logistic Regression: Study Hours vs Pass/Fail")
plt.legend()
plt.show()
output:
Predicted Class (Pass=1, Fail=0): 1
Probability of Passing: 0.5019075444554663
Implement an ARIMA model on time series data
import matplotlib.pyplot as plt
import pandas as pd
from statsmodels.tsa.arima.model import ARIMA

# Step 1: ten daily blood-sugar readings with a steady upward trend,
# indexed by day number.
data = {
    "Day": list(range(1, 11)),
    "Blood_Sugar": [90, 92, 95, 97, 99, 100, 102, 105, 107, 110],
}
df = pd.DataFrame(data).set_index("Day")

# Step 2: plot the observed series.
plt.plot(df, marker="o", label="Actual Blood Sugar")
plt.xlabel("Day")
plt.ylabel("Blood Sugar Level (mg/dL)")
plt.title("Blood Sugar Time Series")
plt.legend()
plt.show()

# Step 3: fit ARIMA with order (p, d, q) = (1, 1, 1).
arima_order = (1, 1, 1)
model = ARIMA(df["Blood_Sugar"], order=arima_order)
model_fit = model.fit()

# Step 4: forecast the five days that follow the observed data.
forecast = model_fit.forecast(steps=5)

# Step 5: plot the history together with the forecast for days 11-15.
forecast_days = range(11, 16)
plt.plot(df, marker="o", label="Actual Blood Sugar")
plt.plot(
    forecast_days,
    forecast,
    marker="o",
    linestyle="dashed",
    color="red",
    label="Forecast",
)
plt.xlabel("Day")
plt.ylabel("Blood Sugar Level (mg/dL)")
plt.title("Blood Sugar Forecast using ARIMA")
plt.legend()
plt.show()
10. Write a Python script to perform descriptive analytics on healthcare data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Creating the dataset: four patients, with one missing Age and one
# missing Glucose value
data = {
    "PatientID": [101, 102, 103, 104],
    "Age": [45, 54, np.nan, 60],
    "BloodPressure": [120, 135, 110, 125],
    "Glucose": [85, 95, 100, np.nan],
    "DiseaseStatus": ["Diabetic", "Non-Diabetic", "Diabetic", "Non-Diabetic"]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Checking for missing values
print("\nMissing Values Before Handling:")
print(df.isnull().sum())

# Filling missing values (Age -> median, Glucose -> mean).
# The result is assigned back to the column: calling
# df[col].fillna(..., inplace=True) works on a selection of the frame, which
# recent pandas versions warn about and which leaves df unchanged under
# Copy-on-Write.
df["Age"] = df["Age"].fillna(df["Age"].median())
df["Glucose"] = df["Glucose"].fillna(df["Glucose"].mean())

# Checking for missing values after handling
print("\nMissing Values After Handling:")
print(df.isnull().sum())

# Display the updated DataFrame
print("\nUpdated Healthcare Dataset:")
print(df)

# Basic statistical summary
print("\nDescriptive Statistics:")
print(df.describe())

# Grouping by DiseaseStatus (numeric columns only)
print("\nMean Values by Disease Status:")
print(df.groupby("DiseaseStatus").mean(numeric_only=True))

# Distribution of Disease Status
print("\nDisease Status Counts:")
print(df["DiseaseStatus"].value_counts())

# 1. Age Distribution
plt.figure(figsize=(6,4))
sns.histplot(df["Age"], bins=10, kde=True, color="blue")
plt.title("Age Distribution")
plt.xlabel("Age")
plt.ylabel("Frequency")
plt.show()

# 2. Blood Pressure Distribution by Disease Status
plt.figure(figsize=(6,4))
sns.boxplot(x="DiseaseStatus", y="BloodPressure", data=df, palette="Set2")
plt.title("Blood Pressure by Disease Status")
plt.show()

# 3. Glucose Levels by Disease Status
# NOTE(review): newer seaborn releases appear to deprecate `ci` in favour of
# `errorbar` - confirm against the installed seaborn version.
plt.figure(figsize=(6,4))
sns.barplot(x="DiseaseStatus", y="Glucose", data=df, palette="pastel", estimator=np.mean,
            ci=None)
plt.title("Average Glucose Levels by Disease Status")
plt.show()

# 4. Scatter Plot - Age vs. Blood Pressure
plt.figure(figsize=(6,4))
sns.scatterplot(x="Age", y="BloodPressure", hue="DiseaseStatus", data=df, s=100,
                edgecolor="black")
plt.title("Age vs Blood Pressure")
plt.xlabel("Age")
plt.ylabel("Blood Pressure")
plt.show()
Output:
Missing Values Before Handling:
PatientID 0
Age 1
BloodPressure 0
Glucose 1
DiseaseStatus 0
dtype: int64
Missing Values After Handling:
PatientID 0
Age 0
BloodPressure 0
Glucose 0
DiseaseStatus 0
dtype: int64
Updated Healthcare Dataset:
PatientID Age BloodPressure Glucose DiseaseStatus
0 101 45.0 120 85.000000 Diabetic
1 102 54.0 135 95.000000 Non-Diabetic
2 103 54.0 110 100.000000 Diabetic
3 104 60.0 125 93.333333 Non-Diabetic
Descriptive Statistics:
PatientID Age BloodPressure Glucose
count 4.000000 4.000000 4.00000 4.000000
mean 102.500000 53.250000 122.50000 93.333333
std 1.290994 6.184658 10.40833 6.236096
min 101.000000 45.000000 110.00000 85.000000
25% 101.750000 51.750000 117.50000 91.250000
50% 102.500000 54.000000 122.50000 94.166667
75% 103.250000 55.500000 127.50000 96.250000
max 104.000000 60.000000 135.00000 100.000000
Mean Values by Disease Status:
PatientID Age BloodPressure Glucose
DiseaseStatus
Diabetic 102.0 49.5 115.0 92.500000
Non-Diabetic 103.0 57.0 130.0 94.166667
Disease Status Counts:
DiseaseStatus
Diabetic 2
Non-Diabetic 2
Perform predictive analytics on product sales data.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

# Step 1: Create a synthetic product sales dataset
data = {
    "Month": np.arange(1, 13),  # Months 1 to 12
    "Quantity Sold": [50, 65, 80, 95, 110, 130, 150, 160, 175, 190, 210, 230],
    "Revenue": [5000, 6500, 8000, 9500, 11000, 13000, 15000, 16000, 17500, 19000, 21000,
                23000],
}  # closing brace was missing, which made the whole script a SyntaxError

# Convert to DataFrame
df = pd.DataFrame(data)

# Step 2: Split data into training and testing sets
X = df[["Month"]]
y = df["Quantity Sold"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Step 3: Train Linear Regression Model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 4: Predict sales for the next 3 months.
# The model was fitted on a DataFrame with a "Month" column, so the future
# inputs are supplied with the same column name.
future_months = pd.DataFrame({"Month": [13, 14, 15]})
future_sales_pred = model.predict(future_months)

# Display Predictions
predicted_sales = dict(zip(["Month 13", "Month 14", "Month 15"], future_sales_pred))
print("Predicted Sales for Future Months:", predicted_sales)

# Step 5: Plot Actual vs Predicted Sales
plt.figure(figsize=(8, 5))
plt.scatter(df["Month"], df["Quantity Sold"], color="blue", label="Actual Sales")
plt.plot(df["Month"], model.predict(X), color="red", linestyle="--", label="Regression Line")
# The label string was previously broken across two lines (unterminated literal).
plt.scatter(future_months["Month"], future_sales_pred, color="green", marker="o",
            label="Predicted Sales")
plt.xlabel("Month")
plt.ylabel("Quantity Sold")
plt.title("Product Sales Prediction")
plt.legend()
plt.grid(True)
plt.show()
output:
Apply predictive analytics for weather forecasting
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from prophet import Prophet

# Generate synthetic weather data (seeded so the run is reproducible)
np.random.seed(42)
days = pd.date_range(start="2024-01-01", periods=100, freq='D')  # 100 days
temperature = np.random.uniform(0, 40, 100)  # Random temperatures in °C

# Create DataFrame with the column names used by Prophet in this script:
# 'ds' = Date, 'y' = Target Variable (Temperature).
# The wrapped tail of this comment used to sit on its own line as the bare
# expression "(Temperature)", which raised NameError at runtime.
df = pd.DataFrame({'ds': days, 'y': temperature})

# Initialize and train Prophet model
model = Prophet()
model.fit(df)

# Create future dates for prediction
future = model.make_future_dataframe(periods=30)  # Predict next 30 days

# Predict
forecast = model.predict(future)

# Print results
print(forecast[['ds', 'yhat']].tail(10))  # Show last 10 predictions

# Plot results; show the figure explicitly so it also appears when the
# code is run as a plain script, like the other experiments do.
fig = model.plot(forecast)
plt.show()
output:
ds yhat
120 2024-04-30 14.443973
121 2024-05-01 13.656203
122 2024-05-02 18.774803
123 2024-05-03 19.789734
124 2024-05-04 14.318090
125 2024-05-05 18.549194
126 2024-05-06 11.512856
127 2024-05-07 14.114318
128 2024-05-08 13.326549
129 2024-05-09 18.445149
Create a simple dataset in Python and apply visualization techniques
(chart types: bar, column, line, scatter, 3D cubes)
import matplotlib.pyplot as plt
import numpy as np

# Sample data: five categories with two value series.
categories = ["A", "B", "C", "D", "E"]
values1 = [10, 25, 35, 20, 15]
values2 = [30, 40, 20, 10, 25]

# Bar chart of the first series (plt.bar).
plt.bar(categories, values1, color="blue")
plt.title("Bar Chart")
plt.show()

# Chart titled "Column Chart": the same series drawn with plt.barh.
plt.barh(categories, values1, color="green")
plt.title("Column Chart")
plt.show()

# Line chart comparing both series across the categories.
line_specs = (
    (values1, "o", "-", "red", "Value1"),
    (values2, "s", "--", "blue", "Value2"),
)
for series, marker, linestyle, color, label in line_specs:
    plt.plot(categories, series, marker=marker, linestyle=linestyle, color=color, label=label)
plt.title("Line Chart")
plt.legend()
plt.show()

# Scatter plot of the first series against the second.
plt.scatter(values1, values2, color="purple")
plt.title("Scatter Plot")
plt.xlabel("Value1")
plt.ylabel("Value2")
plt.show()

# 3D scatter plot: the category position (0..4) is used as the z value.
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection="3d")
z_positions = np.arange(len(categories))
ax.scatter(values1, values2, z_positions, color="brown")
ax.set_title("3D Scatter Plot")
plt.show()
Output:
Implement Decision Tree Induction for classification.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.tree import DecisionTreeClassifier, plot_tree

# Three-row toy dataset: weather and temperature decide whether to play.
data = {
    "Weather": ["Sunny", "Rainy", "Overcast"],
    "Temperature": ["Hot", "Cool", "Mild"],
    "Play": ["No", "Yes", "Yes"],
}
df = pd.DataFrame(data)

# Integer codes for every categorical column, applied column by column.
encodings = {
    "Weather": {"Sunny": 0, "Rainy": 1, "Overcast": 2},
    "Temperature": {"Cool": 0, "Mild": 1, "Hot": 2},
    "Play": {"No": 0, "Yes": 1},
}
for column, codes in encodings.items():
    df[column] = df[column].map(codes)

# Features and target.
feature_columns = ["Weather", "Temperature"]
X = df[feature_columns]
y = df["Play"]

# Gini-based tree limited to depth 2, with a fixed seed.
clf = DecisionTreeClassifier(criterion="gini", max_depth=2, random_state=42)
clf.fit(X, y)

# Draw the fitted tree.
plt.figure(figsize=(6, 4))
plot_tree(
    clf,
    filled=True,
    feature_names=feature_columns,
    class_names=["No", "Yes"],
)
plt.show()

# Classify a new day: Sunny (0) and Cool (0).
new_data = [[0, 0]]
prediction = clf.predict(new_data)
answer = "Yes" if prediction[0] == 1 else "No"
print(f"Will they play outside? {answer}")
output:
Will they play outside? No
Implement Random Forest Classifier.
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import plot_tree

# Dataset: three days described by weather and temperature
data = {
    "Weather": ["Sunny", "Rainy", "Overcast"],
    "Temperature": ["Hot", "Cool", "Mild"],
    "Play": ["No", "Yes", "Yes"]
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Encode categorical variables as integers
df["Weather"] = df["Weather"].map({"Sunny": 0, "Rainy": 1, "Overcast": 2})
df["Temperature"] = df["Temperature"].map({"Cool": 0, "Mild": 1, "Hot": 2})
df["Play"] = df["Play"].map({"No": 0, "Yes": 1})

# Features and target
X = df[["Weather", "Temperature"]]
y = df["Play"]

# Train Random Forest Classifier with three trees
clf = RandomForestClassifier(n_estimators=3, criterion="gini", random_state=42)
clf.fit(X, y)

# Visualizing the first 3 trees, one figure per tree.
# The loop body must be indented; without indentation the script fails with
# an IndentationError before anything runs.
for i, tree in enumerate(clf.estimators_[:3]):
    plt.figure(figsize=(6, 4))
    plot_tree(tree, filled=True, feature_names=["Weather", "Temperature"],
              class_names=["No", "Yes"])
    plt.title(f"Decision Tree {i+1}")
    plt.show()
Output:
Perform object segmentation using hierarchical methods.
import numpy as np
import matplotlib.pyplot as plt
from skimage.segmentation import felzenszwalb
from skimage.io import imread
from skimage.color import rgb2gray

# Read the input picture from disk.
image_path = r"C:\Users\VAAAG\Downloads\apple.jpg"  # Replace with your image path
image = imread(image_path)

# Segment the picture with Felzenszwalb's method; the result is an array of
# region labels that is displayed below with a colour map.
segments = felzenszwalb(image, scale=100, sigma=0.5, min_size=50)

# Show the input and the label map side by side, without axes.
fig, ax = plt.subplots(1, 2, figsize=(12, 6))
panels = (
    (image, "Original Image", {}),
    (segments, "Segmented Image", {"cmap": "nipy_spectral"}),
)
for axis, (picture, heading, options) in zip(ax, panels):
    axis.imshow(picture, **options)
    axis.set_title(heading)
    axis.axis("off")
plt.show()
Output: