1. Data Preprocessing
a. Handling missing values
c. Identifying data redundancy and elimination
import pandas as pd
# Load the CSV file
df = pd.read_csv("C:/Users/nares/OneDrive/Desktop/student.csv")
# Display the number of missing values per column
print(df.isnull().sum())
# Forward fill missing values (propagate the last valid value down each column)
df.ffill(inplace=True)
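# A quick redundancy check before step (c): count fully duplicated rows
print("Duplicate rows:", df.duplicated().sum())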
# Eliminate redundant (duplicate) rows
df.drop_duplicates(inplace=True)
# Optionally, display the DataFrame to see the filled data
print(df)
b. Noise detection and removal
import pandas as pd
# Load your dataset
df = pd.read_csv("C:/Users/nares/OneDrive/Desktop/student.csv")
# Function to remove outliers using the IQR rule
def remove_outliers_iqr(df):
    # Quantiles are only defined for numeric columns
    num = df.select_dtypes(include='number')
    # Calculate the 25th and 75th percentiles (Q1 and Q3)
    Q1 = num.quantile(0.25)
    Q3 = num.quantile(0.75)
    # Calculate the Interquartile Range (IQR)
    IQR = Q3 - Q1
    # Define the bounds for detecting outliers
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    # Filter the DataFrame to exclude rows with any out-of-bounds numeric value
    df_no_outliers = df[~((num < lower_bound) | (num > upper_bound)).any(axis=1)]
    return df_no_outliers
# Apply the function to remove outliers
df_no_outliers = remove_outliers_iqr(df)
# Print the shape of the original and cleaned DataFrames
print("Original DataFrame shape:", df.shape)
print("After outlier removal:", df_no_outliers.shape)
2. Implement any one imputation model
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
# Sample DataFrame with missing values
data = {
    'Age': [25, np.nan, 30, np.nan, 22],
    'Salary': [50000, 55000, np.nan, 48000, 51000],
    'Gender': ['Male', 'Female', 'Female', np.nan, 'Male']
}
df = pd.DataFrame(data)
# Simple Imputer for numerical columns (mean imputation)
imputer = SimpleImputer(strategy='mean')
# Impute numerical columns (Age and Salary)
df[['Age', 'Salary']] = imputer.fit_transform(df[['Age', 'Salary']])
# Simple Imputer for categorical columns (mode imputation)
imputer_categorical = SimpleImputer(strategy='most_frequent')
# Reshape the output to 1D before assigning
df['Gender'] = imputer_categorical.fit_transform(df[['Gender']]).ravel()
print("Data after Simple Imputation:")
print(df)
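SimpleImputer fills each column independently. As an alternative sketch (KNNImputer is also part of sklearn.impute), a numeric gap can instead be filled from the rows most similar on the other features:

from sklearn.impute import KNNImputer

df_knn = pd.DataFrame(data)  # start again from the raw data with gaps
# KNNImputer handles numeric columns only, so impute Age and Salary together
knn_imputer = KNNImputer(n_neighbors=2)
df_knn[['Age', 'Salary']] = knn_imputer.fit_transform(df_knn[['Age', 'Salary']])
print("Data after KNN Imputation (numeric columns):")
print(df_knn)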
3. Implement Linear Regression
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
# Sample dataset
data = {
    'Experience': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],  # Independent variable (X)
    'Salary': [40000, 45000, 50000, 55000, 60000, 65000, 70000, 75000, 80000, 85000]  # Dependent variable (y)
}
df = pd.DataFrame(data)
# Independent and dependent variables
X = df[['Experience']] # Features (1 feature in this case)
y = df['Salary'] # Target variable
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a Linear Regression model
model = LinearRegression()
# Train the model
model.fit(X_train, y_train)
# Make predictions
y_pred = model.predict(X_test)
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")
# Plotting the results
plt.scatter(X, y, color='blue') # Original data points
plt.plot(X, model.predict(X), color='red') # Fitted line
plt.title('Single Variable Linear Regression (Experience vs Salary)')
plt.xlabel('Experience (Years)')
plt.ylabel('Salary')
plt.show()
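Since this is a single-variable model, the fitted line can be read directly off the estimator's standard attributes:

# Inspect the fitted line: Salary = intercept + slope * Experience
print(f"Intercept: {model.intercept_:.2f}")
print(f"Slope: {model.coef_[0]:.2f}")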
4. Implement Logistic Regression
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
# Sample data: Hours studied vs. Passed (1 = Passed, 0 = Failed)
data = {
    'Hours_Studied': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Passed': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
}
df = pd.DataFrame(data)
# Independent variable (X) - Hours studied
X = df[['Hours_Studied']]
# Dependent variable (y) - Passed (0 or 1)
y = df['Passed']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a Logistic Regression model
model = LogisticRegression()
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))
# Plotting the fitted probability curve
plt.scatter(X, y, color='blue', label='Data Points')
plt.plot(X, model.predict_proba(X)[:, 1], color='red', label='Logistic Regression Curve')
plt.title('Logistic Regression: Hours Studied vs. Passed')
plt.xlabel('Hours Studied')
plt.ylabel('Probability of Passing')
plt.legend()
plt.show()
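To apply the trained classifier to a new student, predict_proba returns the class probabilities; the 4.5 hours below is purely illustrative:

# Probability of passing for a hypothetical new student
new_student = pd.DataFrame({'Hours_Studied': [4.5]})
print(f"P(pass | 4.5 hours studied) = {model.predict_proba(new_student)[0, 1]:.2f}")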
5. Implement Decision Tree Induction for classification
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn import tree
import matplotlib.pyplot as plt
# Sample dataset: Hours studied vs. Passed (1 = Passed, 0 = Failed)
data = {
    'Hours_Studied': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Passed': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
}
df = pd.DataFrame(data)
# Independent variable (X) - Hours studied
X = df[['Hours_Studied']]
# Dependent variable (y) - Passed (0 or 1)
y = df['Passed']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a Decision Tree Classifier model
model = DecisionTreeClassifier(random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))
# Visualize the decision tree
plt.figure(figsize=(10, 8))
tree.plot_tree(model, filled=True, feature_names=['Hours_Studied'],
               class_names=['Failed (0)', 'Passed (1)'], rounded=True)
plt.title('Decision Tree for Classification')
plt.show()
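The same tree can also be dumped as plain-text rules via sklearn's export_text, which is often easier to read than the plot:

from sklearn.tree import export_text

# Print the learned decision rules as text
print(export_text(model, feature_names=['Hours_Studied']))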
6. Implement Random Forest Classifier
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
# Sample dataset: Hours studied vs. Passed (1 = Passed, 0 = Failed)
data = {
    'Hours_Studied': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
    'Passed': [0, 0, 0, 0, 1, 1, 1, 1, 1, 1]
}
df = pd.DataFrame(data)
# Independent variable (X) - Hours studied
X = df[['Hours_Studied']]
# Dependent variable (y) - Passed (0 or 1)
y = df['Passed']
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
# Create a Random Forest Classifier model
model = RandomForestClassifier(n_estimators=100, random_state=42)
# Train the model
model.fit(X_train, y_train)
# Make predictions on the test set
y_pred = model.predict(X_test)
# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")
# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
# Classification Report
print("Classification Report:")
print(classification_report(y_test, y_pred))
# Feature importance (for visualization)
importances = model.feature_importances_
print(f"Feature importances: {importances}")
# Visualize the feature importance
plt.barh(X.columns, importances)
plt.xlabel("Feature Importance")
plt.title("Feature Importance for Random Forest Classifier")
plt.show()
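With only ten samples, a single 70/30 split gives a fragile accuracy estimate. A hedged sketch of cross-validation (cv=2 here, since each class has too few members for more folds):

from sklearn.model_selection import cross_val_score

scores = cross_val_score(RandomForestClassifier(n_estimators=100, random_state=42), X, y, cv=2)
print(f"Cross-validated accuracy: {scores.mean() * 100:.2f}% (+/- {scores.std() * 100:.2f})")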
7. Implement ARIMA on Time Series data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from sklearn.metrics import mean_squared_error
# Generate sample time series data
# Let's generate a time series with monthly data for 3 years
date_range = pd.date_range(start='2015-01-01', periods=36, freq='M')
data = {
    'Date': date_range,
    'Value': np.sin(np.linspace(0, 10, 36)) + np.random.normal(0, 0.1, 36)  # Sine wave with noise
}
df = pd.DataFrame(data)
df.set_index('Date', inplace=True)
# Visualize the data
plt.figure(figsize=(10, 6))
plt.plot(df.index, df['Value'], label='Observed')
plt.title('Time Series Data')
plt.xlabel('Date')
plt.ylabel('Value')
plt.legend()
plt.show()
# Step 1: Check for stationarity (plot ACF and PACF)
plot_acf(df['Value'])
plot_pacf(df['Value'])
plt.show()
# Step 2: Make the series stationary if needed
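# A hedged check (sketch): quantify stationarity with the augmented
# Dickey-Fuller test from statsmodels before committing to differencing
from statsmodels.tsa.stattools import adfuller
adf_stat, p_value = adfuller(df['Value'])[:2]
print(f"ADF statistic: {adf_stat:.3f}, p-value: {p_value:.3f}")
# A p-value above 0.05 suggests non-stationarity, i.e. differencing is warranted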
# If the series is not stationary, we difference the data
df_diff = df['Value'].diff().dropna()
# Step 3: Fit the ARIMA model
# Here we use (p=1, d=1, q=1) based on analysis or experimentation
model = ARIMA(df['Value'], order=(1, 1, 1))
model_fit = model.fit()
print(model_fit.summary())
# Step 4: Forecast the next 6 periods and plot against the observed series
forecast = model_fit.forecast(steps=6)
plt.figure(figsize=(10, 6))
plt.plot(df.index, df['Value'], label='Observed')
plt.plot(forecast.index, forecast, color='red', label='Forecast')
plt.title('ARIMA(1, 1, 1) Forecast')
plt.legend()
plt.show()
8. Object segmentation using hierarchical-based methods
import numpy as np
import cv2
from sklearn.cluster import AgglomerativeClustering
import matplotlib.pyplot as plt
# Load the image (change the path to your image)
# Make sure 'image.jpg' is in the current directory or provide the full path
image = cv2.imread('image.jpg')
# Check if the image was loaded successfully
if image is None:
    print("Error: Could not load image. Please check the file path.")
else:
    # Convert to RGB (from BGR, which is OpenCV's default)
    image_rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
    # Downsample first: agglomerative clustering needs memory quadratic in the
    # number of pixels, so a full-size image is intractable
    image_rgb = cv2.resize(image_rgb, (64, 64))
    # Reshape the image to a 2D array (pixels as rows, color channels as columns)
    pixels = image_rgb.reshape((-1, 3))
    # Apply Agglomerative Clustering for segmentation
    # (metric= replaces the older affinity= argument in scikit-learn >= 1.2)
    n_clusters = 5  # Number of segments to split the image into
    agg_clustering = AgglomerativeClustering(n_clusters=n_clusters,
                                             metric='euclidean', linkage='ward')
    # Fit the model to the pixels
    labels = agg_clustering.fit_predict(pixels)
    # Reshape the labels back to the image's height and width
    segmented_image = labels.reshape(image_rgb.shape[0], image_rgb.shape[1])
    # Visualize the segmented image
    plt.figure(figsize=(10, 6))
    plt.imshow(segmented_image, cmap='nipy_spectral')
    plt.title('Hierarchical Object Segmentation Using Agglomerative Clustering')
    plt.axis('off')  # Hide axes
    plt.show()
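As a follow-up sketch (assuming the clustering above ran, so pixels, labels, n_clusters and image_rgb are still in scope), each segment can be painted with its mean color to give a color-quantized view of the segmentation:

# Paint every pixel with the mean color of its segment
recolored = np.zeros_like(pixels)
for k in range(n_clusters):
    recolored[labels == k] = pixels[labels == k].mean(axis=0).astype(np.uint8)
plt.imshow(recolored.reshape(image_rgb.shape))
plt.title('Segments Painted with Mean Colors')
plt.axis('off')
plt.show()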
9. Perform Visualization techniques (types of maps - Bar, Column, Line, Scatter, 3D Cubes etc.)
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D
# Sample Data
x = np.arange(1, 11)
y = np.random.randint(10, 100, 10)
z = np.random.randint(1, 10, 10)
# Bar Plot
plt.figure(figsize=(8, 6))
plt.bar(x, y, color='skyblue')
plt.title('Bar Plot')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.show()
# Column Plot (using barh for horizontal bars)
plt.figure(figsize=(8, 6))
plt.barh(x, y, color='lightgreen')
plt.title('Column Plot')
plt.xlabel('Y Axis')
plt.ylabel('X Axis')
plt.show()
# Line Plot
plt.figure(figsize=(8, 6))
plt.plot(x, y, marker='o', linestyle='-', color='purple', label='Line Plot')
plt.title('Line Plot')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.legend()
plt.show()
# Scatter Plot
plt.figure(figsize=(8, 6))
plt.scatter(x, y, color='orange', label='Scatter Plot')
plt.title('Scatter Plot')
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
plt.legend()
plt.show()
# 3D Cube Plot
fig = plt.figure(figsize=(10, 8))
ax = fig.add_subplot(111, projection='3d')
ax.scatter(x, y, z, c='red', marker='o')
ax.set_title('3D Cube Plot')
ax.set_xlabel('X Axis')
ax.set_ylabel('Y Axis')
ax.set_zlabel('Z Axis')
plt.show()
10. Perform Descriptive analytics on healthcare data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Load the healthcare dataset (you can replace this with your actual data file)
# For demonstration purposes, we'll create a hypothetical dataset
data = {
    'Age': [45, 50, 65, 70, 80, 55, 60, 45, 50, 90],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Female', 'Male', 'Male', 'Female'],
    'Disease': ['Diabetes', 'Hypertension', 'Diabetes', 'Cancer', 'Hypertension', 'Cancer', 'Diabetes',
                'Cancer', 'Hypertension', 'Diabetes'],
    'Treatment': ['Insulin', 'Medications', 'Insulin', 'Chemotherapy', 'Medications', 'Chemotherapy',
                  'Insulin', 'Chemotherapy', 'Medications', 'Insulin'],
    'Treatment Cost': [200, 150, 210, 300, 180, 320, 230, 310, 200, 220],
    'Length of Stay (days)': [7, 5, 10, 15, 8, 20, 9, 16, 7, 10]
}
# Create a DataFrame
df = pd.DataFrame(data)
# Display first 5 rows of the dataframe
print("First 5 Rows of the Healthcare Dataset:")
print(df.head())
# Descriptive statistics for numeric data
print("\nDescriptive Statistics for Numeric Data:")
print(df.describe())
# Mode for categorical variables
print("\nMode for Categorical Variables:")
print("Gender:", df['Gender'].mode()[0])
print("Disease:", df['Disease'].mode()[0])
print("Treatment:", df['Treatment'].mode()[0])
# Value counts for categorical variables
print("\nValue Counts for Categorical Variables:")
print(df['Gender'].value_counts())
print(df['Disease'].value_counts())
print(df['Treatment'].value_counts())
# Visualize the distribution of Age
plt.figure(figsize=(8, 6))
sns.histplot(df['Age'], kde=True, bins=10, color='blue')
plt.title('Age Distribution of Patients')
plt.xlabel('Age')
plt.ylabel('Frequency')
plt.show()
# Gender distribution (Bar plot)
plt.figure(figsize=(8, 6))
sns.countplot(data=df, x='Gender', palette='Set1')
plt.title('Gender Distribution of Patients')
plt.xlabel('Gender')
plt.ylabel('Count')
plt.show()
# Disease distribution (Pie chart)
disease_counts = df['Disease'].value_counts()
plt.figure(figsize=(8, 6))
plt.pie(disease_counts, labels=disease_counts.index, autopct='%1.1f%%', startangle=90,
colors=sns.color_palette("Set3", len(disease_counts)))
plt.title('Disease Distribution')
plt.show()
# Treatment cost vs Length of Stay (Scatter plot)
plt.figure(figsize=(8, 6))
sns.scatterplot(data=df, x='Treatment Cost', y='Length of Stay (days)', hue='Disease', palette='Set2')
plt.title('Treatment Cost vs Length of Stay')
plt.xlabel('Treatment Cost')
plt.ylabel('Length of Stay (days)')
plt.show()
# Box plot for treatment cost by disease
plt.figure(figsize=(8, 6))
sns.boxplot(data=df, x='Disease', y='Treatment Cost')
plt.title('Treatment Cost by Disease')
plt.xlabel('Disease')
plt.ylabel('Treatment Cost')
plt.show()
# Correlation heatmap for numeric columns
plt.figure(figsize=(8, 6))
correlation_matrix = df[['Age', 'Treatment Cost', 'Length of Stay (days)']].corr()
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt='.2f', cbar=True)
plt.title('Correlation Heatmap')
plt.show()
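Descriptive analytics often ends with a grouped summary; a short extension of the same DataFrame averages the numeric measures per disease:

# Average treatment cost and length of stay per disease
print("\nAverage Cost and Stay by Disease:")
print(df.groupby('Disease')[['Treatment Cost', 'Length of Stay (days)']].mean())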
11. Perform Predictive analytics on Product Sales data
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
# 1. Load and prepare the sample product sales data
data = {
    'Date': pd.date_range(start='2020-01-01', periods=100, freq='D'),
    'Product_ID': np.random.choice([1, 2, 3, 4], size=100),
    'Sales': np.random.randint(50, 200, size=100),
    'Price': np.random.randint(10, 50, size=100),
    'Marketing_Spend': np.random.randint(1000, 5000, size=100)
}
df = pd.DataFrame(data)
# Convert 'Date' column to datetime type
df['Date'] = pd.to_datetime(df['Date'])
# 2. Exploratory Data Analysis (EDA)
print("First 5 Rows of the Product Sales Data:")
print(df.head())
# 3. Feature Engineering: Add date-related features
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
df['DayOfYear'] = df['Date'].dt.dayofyear
# Create lag features (previous day's sales)
df['Lag_Sales'] = df['Sales'].shift(1)
# Drop missing values
df.dropna(inplace=True)
# Display the first few rows after feature engineering
print("\nData After Feature Engineering:")
print(df.head())
# 4. Split the data into training and testing sets (Linear Regression)
X = df[['Price', 'Marketing_Spend', 'Month', 'DayOfWeek', 'Lag_Sales']]
y = df['Sales']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
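# Note (a hedged aside): because Lag_Sales carries the previous day's value, a
# random split lets future information leak into training; for a stricter
# evaluation, split chronologically instead:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)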
# 5. Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Make predictions on the test set
y_pred_lr = lr_model.predict(X_test)
# Evaluate the Linear Regression model
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("\nLinear Regression Model Evaluation:")
print(f'Mean Absolute Error (MAE): {mae_lr}')
print(f'Mean Squared Error (MSE): {mse_lr}')
print(f'R-squared: {r2_lr}')
# 6. Time Series Forecasting with ARIMA (Product Sales as Time Series)
df_time_series = df[['Date', 'Sales']].copy()  # copy to avoid SettingWithCopyWarning
# Set Date column as index for time series modeling
df_time_series.set_index('Date', inplace=True)
# Plot the time series data
plt.figure(figsize=(10,6))
plt.plot(df_time_series['Sales'])
plt.title('Product Sales Over Time')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.show()
# Fit an ARIMA model (p=5, d=1, q=0 as an example)
model_arima = sm.tsa.ARIMA(df_time_series['Sales'], order=(5, 1, 0))
model_arima_fit = model_arima.fit()
# Forecast the next 10 days
forecast_steps = 10
forecast_arima = model_arima_fit.forecast(steps=forecast_steps)
# Visualize the forecasted data
plt.figure(figsize=(10,6))
plt.plot(df_time_series['Sales'], label='Historical Sales')
plt.plot(pd.date_range(df_time_series.index[-1] + pd.Timedelta(days=1), periods=forecast_steps, freq='D'),
         forecast_arima, label='Forecast', color='red')
plt.title('Product Sales Forecast (ARIMA)')
plt.xlabel('Date')
plt.ylabel('Sales')
plt.legend()
plt.show()
# 7. Evaluation of ARIMA Model
print("\nARIMA Model Summary:")
print(model_arima_fit.summary())
12. Apply Predictive analytics for Weather forecasting.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import statsmodels.api as sm
# 1. Load the sample weather data (or use real weather data in CSV)
# Here, we simulate a sample dataset for demonstration purposes.
data = {
    'Date': pd.date_range(start='2020-01-01', periods=1000, freq='D'),
    'Temperature': np.random.normal(20, 5, 1000),  # Simulating daily temperature data
    'Humidity': np.random.normal(60, 10, 1000),    # Simulating humidity data
    'WindSpeed': np.random.normal(15, 5, 1000),    # Simulating wind speed data
    'Pressure': np.random.normal(1013, 5, 1000),   # Simulating pressure data
    'Rainfall': np.random.normal(2, 1, 1000)       # Simulating daily rainfall data
}
df = pd.DataFrame(data)
# 2. Convert 'Date' to datetime type
df['Date'] = pd.to_datetime(df['Date'])
# 3. Feature Engineering: Extract date features
df['Month'] = df['Date'].dt.month
df['DayOfWeek'] = df['Date'].dt.dayofweek
# Display the first few rows of the dataset
print(df.head())
# 4. Exploratory Data Analysis (EDA)
# Plot the distribution of temperature
plt.figure(figsize=(10,6))
sns.histplot(df['Temperature'], kde=True)
plt.title('Temperature Distribution')
plt.xlabel('Temperature (°C)')
plt.ylabel('Frequency')
plt.show()
# Plot the relationship between humidity and temperature
plt.figure(figsize=(10,6))
sns.scatterplot(x=df['Humidity'], y=df['Temperature'])
plt.title('Temperature vs Humidity')
plt.xlabel('Humidity (%)')
plt.ylabel('Temperature (°C)')
plt.show()
# 5. Split the data into training and testing sets
X = df[['Humidity', 'WindSpeed', 'Pressure', 'Rainfall', 'Month', 'DayOfWeek']]
y = df['Temperature']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# 6. Build a model (Linear Regression or Random Forest)
# Linear Regression Model
lr_model = LinearRegression()
lr_model.fit(X_train, y_train)
# Predict on the test set
y_pred_lr = lr_model.predict(X_test)
# Evaluate the Linear Regression model
mae_lr = mean_absolute_error(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)
r2_lr = r2_score(y_test, y_pred_lr)
print("\nLinear Regression Model Evaluation:")
print(f'Mean Absolute Error (MAE): {mae_lr}')
print(f'Mean Squared Error (MSE): {mse_lr}')
print(f'R-squared: {r2_lr}')
# Random Forest Model
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
# Predict on the test set
y_pred_rf = rf_model.predict(X_test)
# Evaluate the Random Forest model
mae_rf = mean_absolute_error(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)
r2_rf = r2_score(y_test, y_pred_rf)
print("\nRandom Forest Model Evaluation:")
print(f'Mean Absolute Error (MAE): {mae_rf}')
print(f'Mean Squared Error (MSE): {mse_rf}')
print(f'R-squared: {r2_rf}')
# 7. Forecasting future temperatures using Linear Regression
future_data = {
    'Humidity': [60, 65, 70],
    'WindSpeed': [10, 12, 15],
    'Pressure': [1010, 1015, 1020],
    'Rainfall': [0, 1, 0],
    'Month': [3, 3, 3],      # March (for example)
    'DayOfWeek': [0, 1, 2]   # Monday, Tuesday, Wednesday
}
future_df = pd.DataFrame(future_data)
# Predict future temperatures using the trained Linear Regression model
future_predictions_lr = lr_model.predict(future_df)
print("\nPredicted Future Temperatures (Linear Regression):")
print(future_predictions_lr)
# Forecasting using Random Forest
future_predictions_rf = rf_model.predict(future_df)
print("\nPredicted Future Temperatures (Random Forest):")
print(future_predictions_rf)
# Visualize the future predictions
plt.figure(figsize=(10,6))
plt.plot(future_df['DayOfWeek'], future_predictions_lr, label='Linear Regression Predictions',
         color='blue', marker='o')
plt.plot(future_df['DayOfWeek'], future_predictions_rf, label='Random Forest Predictions',
         color='red', marker='x')
plt.title('Future Temperature Predictions')
plt.xlabel('Day of Week')
plt.ylabel('Predicted Temperature (°C)')
plt.legend()
plt.show()
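Because the Random Forest exposes feature importances, a short follow-up shows which weather inputs drive its temperature predictions:

# Inspect which inputs the Random Forest relies on most
for name, importance in zip(X.columns, rf_model.feature_importances_):
    print(f"{name}: {importance:.3f}")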