# Program 1: load the IPL 2022 batters CSV, summarise it, and plot runs against matches.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
# Raw string: the Windows path contains "\M" and "\I", which are invalid
# escape sequences in a normal string literal (a warning today, an error in
# future Python versions). The resulting path value is unchanged.
file_path = r"S:\ML\IPL 2022 Batters.csv"
data = pd.read_csv(file_path)

print ("First 5 rows of the dataset: ")
print(data.head())

print("\nData set Info: ")
# DataFrame.info() writes its report itself and returns None, so it is called
# directly instead of being wrapped in print() (which would emit "None").
data.info()

print("\nStatistical Summary: ")
print(data.describe())

# Scatter plot of the 'Runs' column (x axis) against the 'Mat' column (y axis).
x = data['Runs']
y = data['Mat']  # labelled "Matches" on the plot
plt.scatter(x, y, color='red')
plt.xlabel("Runs")
plt.ylabel("Matches")
plt.show()
# Program 2: preprocessing demo (imputation, one-hot encoding, feature scaling).
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Sample data: 'Age' and 'Salary' contain np.nan entries so that the
# imputation step below has something to fill in.
data = {
    'Age': [25, 30, np.nan, 45, 22, 38, 50, np.nan],
    'Salary': [50000, 60000, 75000, np.nan, 48000, 8000, 9000, 65000],
    'Gender': ['Male', 'Female', 'Male', 'Female', 'Male', 'Female', 'Male', 'Female'],
    'city': ['New York', 'London', 'Paris', 'New York', 'London', 'Paris', 'New York',
             'London'],
    'Purchase': ['No', 'Yes', 'No', 'Yes', 'No', 'Yes', 'No', 'Yes'],
}
df = pd.DataFrame(data)
print("Original DataFrame: ")
print(df)
print("\n")

# Define imputers: column mean for numeric features, most frequent value for
# categorical features.
imputer_numerical = SimpleImputer(strategy='mean')
imputer_categorical = SimpleImputer(strategy='most_frequent')
numerical_features = ['Age', 'Salary']
categorical_features = ['Gender', 'city']

# Step 1: Imputation
preprocessor = ColumnTransformer(
    transformers=[
        ('num_imputer', imputer_numerical, numerical_features),
        ('cat_imputer', imputer_categorical, categorical_features),
    ],
    remainder='passthrough',
)
# ColumnTransformer outputs the transformed columns in the order the
# transformers are listed, followed by the passthrough columns, so the
# column labels are rebuilt in that same order.
df_imputed = pd.DataFrame(
    preprocessor.fit_transform(df),
    columns=numerical_features + categorical_features + ['Purchase'],
)
print("DataFrame after Imputation: ")
print(df_imputed)
print("\n")

# Step 2: One-Hot Encoding
# scikit-learn 1.2 renamed the `sparse` keyword to `sparse_output` and later
# removed the old name; try the new spelling first and fall back for older
# installations.
try:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
except TypeError:
    encoder = OneHotEncoder(handle_unknown='ignore', sparse=False)
categorical_transformer = Pipeline(steps=[('onehot', encoder)])
preprocessor_encoding = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder='passthrough',
)
df_encoded_array = preprocessor_encoding.fit_transform(df_imputed)
# Look up the fitted encoder inside the fitted pipeline to get the generated
# one-hot column names (kept as a single expression; splitting it across two
# lines turns the second half into a separate, failing statement).
fitted_onehot = preprocessor_encoding.named_transformers_['cat']['onehot']
encoded_feature_names = fitted_onehot.get_feature_names_out(categorical_features)
remaining_features = [
    col for col in df_imputed.columns if col not in categorical_features
]
df_encoded = pd.DataFrame(
    df_encoded_array,
    columns=list(encoded_feature_names) + remaining_features,
)
print("DataFrame after One-Hot Encoding: ")
print(df_encoded)
print("\n")

# Step 3: Feature Scaling
scaler = StandardScaler()
numerical_transformer = Pipeline(steps=[('scaler', scaler)])
preprocessor_scaling = ColumnTransformer(
    transformers=[('num', numerical_transformer, numerical_features)],
    remainder='passthrough',
)
df_scaled_array = preprocessor_scaling.fit_transform(df_encoded)
# The scaled numeric columns come first in the output and the passthrough
# columns follow, so reusing df_encoded.columns unchanged would attach the
# wrong label to every column. Rebuild the labels in the actual output order.
scaled_columns = numerical_features + [
    col for col in df_encoded.columns if col not in numerical_features
]
df_scaled = pd.DataFrame(df_scaled_array, columns=scaled_columns)
print("DataFrame after Feature Scaling (Standardization): ")
print(df_scaled)
# Program 3: exploratory data analysis and plots for the iris dataset.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import datasets # Fixed: 'dataset' → 'datasets'
# Load the iris dataset into a DataFrame with one column per feature.
iris = datasets.load_iris()
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)

# Add species column and map target numbers to names
df['species'] = iris.target
df['species'] = df['species'].map({i: name for i, name in enumerate(iris.target_names)})

# Print the first 5 rows
print("First 5 rows of the dataset:")
print(df.head())

# Dataset info
print("\nDataset Info:")
# DataFrame.info() prints its own report and returns None, so it is not
# wrapped in print() (which would add a stray "None" line).
df.info()

# Summary statistics
print("\nSummary statistics:")
print(df.describe())

# Class distribution
print("\nClass distribution:")
print(df['species'].value_counts())

# Pairplot
sns.pairplot(df, hue='species', palette='Set2')
plt.suptitle("Pairplot of Iris Features", y=1.02)
plt.show()

# Boxplot (the last column, 'species', is excluded: numeric columns only)
plt.figure(figsize=(10, 6))
sns.boxplot(data=df.iloc[:, :-1], orient="h", palette="Set3")
plt.title("Boxplot of Iris Features")
plt.show()

# Correlation heatmap of the numeric feature columns
plt.figure(figsize=(8, 6))
sns.heatmap(df.iloc[:, :-1].corr(), annot=True, cmap="coolwarm", fmt=".2f")
plt.title("Features Correlation Heatmap")
plt.show()

# Violin plots: one subplot per feature on a 2x2 grid, grouped by species
plt.figure(figsize=(12, 8))
for i, col in enumerate(df.columns[:-1]):
    plt.subplot(2, 2, i + 1)
    sns.violinplot(x='species', y=col, data=df, palette='pastel')
# Adjust spacing and display once, after all subplots have been drawn.
plt.tight_layout()
plt.show()