import pandas as pd
import numpy as np
from scipy.stats import norm
# Step 1: Extract
def extract_data():
    """Build the example employee dataset and echo it to stdout.

    Returns:
        A new ``pandas.DataFrame`` with the columns ``Name``, ``Age`` and
        ``Salary`` holding five hard-coded sample rows.
    """
    # Example data kept in memory as a column -> values mapping.
    data = {
        'Name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
        'Age': [25, 30, 35, 40, 29],
        'Salary': [50000, 60000, 70000, 80000, 55000],
    }
    df = pd.DataFrame(data)
    print("Data Extracted:")
    print(df)
    return df
# Step 2: Transform
def transform_data(df):
    """Add derived ``Bonus`` and ``Age_Normalized`` columns to *df*.

    The frame is modified in place and also returned, so both
    ``transform_data(df)`` and ``df = transform_data(df)`` work.

    Args:
        df: DataFrame with numeric ``Salary`` and ``Age`` columns.

    Returns:
        The same DataFrame object, with two extra columns:
        ``Bonus`` (10% of ``Salary``) and ``Age_Normalized`` (min-max
        scaling of ``Age``; all zeros when every age is identical).
    """
    # Adding a column for Bonus (10% of Salary)
    df['Bonus'] = df['Salary'] * 0.1

    # Normalizing Age column (min-max scaling)
    age_min = df['Age'].min()
    age_range = df['Age'].max() - age_min
    if age_range == 0:
        # A constant column would otherwise produce 0/0 = NaN for every row.
        # Multiplying the (all-zero) offsets by 0.0 yields float zeros while
        # still leaving missing ages as NaN.
        df['Age_Normalized'] = (df['Age'] - age_min) * 0.0
    else:
        df['Age_Normalized'] = (df['Age'] - age_min) / age_range

    print("\nData Transformed:")
    print(df)
    return df
# Step 3: Load
def load_data(df, output_file="transformed_data.csv"):
    """Write *df* to a CSV file without the index column.

    Args:
        df: DataFrame to persist.
        output_file: Destination path; defaults to ``transformed_data.csv``
            in the current working directory.
    """
    # Save transformed data to a CSV file
    df.to_csv(output_file, index=False)
    print(f"\nData Loaded to {output_file}")
# Statistical Functions
def statistical_functions(df):
    """Print summary statistics for ``Salary`` and a fitted normal density.

    Side effect: adds a ``Normal_Distribution`` column to *df* in place,
    holding the normal PDF evaluated at each salary.

    Args:
        df: DataFrame with a numeric ``Salary`` column.

    Returns:
        A dict with the keys ``mean``, ``median`` and ``std`` (population
        standard deviation) of ``Salary``, as plain floats.
    """
    salaries = df['Salary']

    # Mean and Median
    mean_salary = np.mean(salaries)
    median_salary = np.median(salaries)

    # Normal Distribution Example
    # ddof=0 spells out numpy's default (population standard deviation).
    # NOTE: if every salary is identical sigma is 0 and the density is NaN.
    mu, sigma = mean_salary, np.std(salaries, ddof=0)
    normal_dist = norm.pdf(salaries, mu, sigma)
    df['Normal_Distribution'] = normal_dist

    print("\nStatistical Analysis:")
    print(f"Mean Salary: {mean_salary}")
    print(f"Median Salary: {median_salary}")
    print("\nNormal Distribution (Probability Density Function):")
    print(df[['Salary', 'Normal_Distribution']])

    return {
        'mean': float(mean_salary),
        'median': float(median_salary),
        'std': float(sigma),
    }
# Modeling (Linear Regression Example)
def simple_model(df):
    """Fit a one-feature linear regression of ``Salary`` on ``Age``.

    The rows are split 80/20 into train and test sets with a fixed seed
    (``random_state=42``), the model is fitted on the training part and
    its mean squared error on the held-out part is printed.

    Args:
        df: DataFrame with numeric ``Age`` and ``Salary`` columns.

    Returns:
        A dict with the keys ``coefficient``, ``intercept`` and ``mse``
        as plain floats (the same values that are printed).
    """
    # Imported here so scikit-learn is only needed when modeling is run.
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_squared_error

    # Independent variable: Age, Dependent variable: Salary
    X = df[['Age']]
    y = df['Salary']

    # Splitting data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Model Training
    model = LinearRegression()
    model.fit(X_train, y_train)

    # Prediction and Evaluation
    y_pred = model.predict(X_test)
    mse = mean_squared_error(y_test, y_pred)

    print("\nSimple Linear Regression Model:")
    print(f"Coefficient: {model.coef_[0]}")
    print(f"Intercept: {model.intercept_}")
    print(f"Mean Squared Error: {mse}")

    return {
        'coefficient': float(model.coef_[0]),
        'intercept': float(model.intercept_),
        'mse': float(mse),
    }
# Main Function to Execute the Steps
def main():
    """Run the demo pipeline: extract, transform, load, then analyse.

    The transformed frame is written to disk before the statistics step,
    so the saved CSV does not contain the column that step adds.
    """
    # ETL Process
    df = extract_data()
    df = transform_data(df)
    load_data(df)

    # Statistical Analysis
    statistical_functions(df)

    # Simple Modeling
    simple_model(df)
# Run the main function
if __name__ == "__main__":
    # Only run the pipeline when executed as a script, not on import.
    main()