import matplotlib.pyplot as plt
import pandas as pd
# Student performance dataset: load, clean, save, and visualise.

# Input workbook.
# NOTE(review): the original path literal was lost to a "[Link]" extraction
# artifact; this name is inferred from the output file name below. Confirm it
# and update with the correct path if necessary.
file_path = 'StudentPerformanceFactors.xlsx'

# Destination workbook for the cleaned data.
cleaned_file_path = 'Cleaned_StudentPerformanceFactors.xlsx'

# Columns that must hold numeric data; adjust to match your dataset columns.
numeric_columns = [
    'Hours_Studied', 'Attendance', 'Sleep_Hours', 'Previous_Scores',
    'Tutoring_Sessions', 'Physical_Activity', 'Exam_Score',
]

# Attendance bands used by the bar chart (edges and their display labels).
attendance_bins = [0, 50, 75, 100]
attendance_labels = ['Low (0-50%)', 'Medium (50-75%)', 'High (75-100%)']


def load_first_sheet(path):
    """Return the first sheet of the Excel workbook at *path* as a DataFrame."""
    # The context manager closes the workbook handle once the sheet is parsed.
    with pd.ExcelFile(path) as workbook:
        return workbook.parse(workbook.sheet_names[0])


def clean_dataset(frame, columns=None):
    """Return a cleaned copy of *frame*; the input is left untouched.

    The listed *columns* (default: ``numeric_columns``) are converted with
    ``pd.to_numeric(errors='coerce')``, then every row that contains a
    missing value is dropped.

    Raises:
        KeyError: if one of *columns* is not present in *frame*.
    """
    columns = list(numeric_columns if columns is None else columns)
    cleaned = frame.copy()
    if columns:
        cleaned[columns] = cleaned[columns].apply(pd.to_numeric, errors='coerce')
    # Drop rows only after coercion: unparseable values become NaN during the
    # conversion, and dropping first would leave those NaNs in the output.
    cleaned.dropna(inplace=True)
    return cleaned


def add_attendance_level(frame):
    """Add an ``Attendance_Level`` column to *frame* in place and return it."""
    frame['Attendance_Level'] = pd.cut(
        frame['Attendance'],
        bins=attendance_bins,
        labels=attendance_labels,
        # Without this, an attendance of exactly 0 falls outside the first
        # (0, 50] interval and is left unlabelled.
        include_lowest=True,
    )
    return frame


def average_exam_score_by_attendance(frame):
    """Return the mean ``Exam_Score`` for each ``Attendance_Level``."""
    # observed=False keeps every attendance band in the result, including
    # bands that contain no rows (their mean is NaN).
    grouped = frame.groupby('Attendance_Level', observed=False)
    return grouped['Exam_Score'].mean()


def plot_average_exam_score(avg_scores):
    """Show a bar chart of the average exam score per attendance level."""
    plt.figure(figsize=(8, 5))
    avg_scores.plot(kind='bar', color='skyblue', edgecolor='black')
    plt.title('Average Exam Score by Attendance Level')
    plt.xlabel('Attendance Level')
    plt.ylabel('Average Exam Score')
    plt.xticks(rotation=45)
    plt.show()


def plot_hours_studied(frame):
    """Show a histogram of the ``Hours_Studied`` column."""
    plt.figure(figsize=(8, 5))
    plt.hist(frame['Hours_Studied'], bins=10, color='lightgreen',
             edgecolor='black')
    plt.title('Distribution of Hours Studied')
    plt.xlabel('Hours Studied')
    plt.ylabel('Frequency')
    plt.show()


def main():
    """Load the spreadsheet, clean it, save the result, and draw the charts."""
    df = load_first_sheet(file_path)

    # DataFrame.info() prints its report itself and returns None, so it is
    # called directly instead of being wrapped in print().
    print("Initial dataset info:")
    df.info()

    df = clean_dataset(df)

    # Drop irrelevant columns here if necessary (customise as needed), e.g.:
    # df.drop(columns=['Unnecessary_Column_Name'], inplace=True)

    print("\nDataset info after cleaning:")
    df.info()

    # Save the cleaned data before any plotting-only columns are added.
    df.to_excel(cleaned_file_path, index=False)
    print(f"Cleaned dataset saved as '{cleaned_file_path}'")

    # Bar chart: average exam score by attendance level.
    add_attendance_level(df)
    plot_average_exam_score(average_exam_score_by_attendance(df))

    # Histogram: distribution of hours studied.
    plot_hours_studied(df)


if __name__ == "__main__":
    main()