# Use this cell to write your code for Task 2
# Imported here so this cell does not depend on a later cell's import.
import pandas as pd

# NOTE(review): this re-reads the CSV from disk; if an earlier cell already
# built a cleaned `clean_data`, this assignment replaces it -- confirm intent.
clean_data = pd.read_csv('loyalty.csv')

# Mean and variance of spend for each loyalty_years group, rounded to two
# decimal places. 'var' is the pandas default (sample variance, ddof=1).
# Rounding happens before reset_index so only the two aggregate columns are
# rounded, never the group key.
spend_by_years = (
    clean_data.groupby('loyalty_years')
    .agg(
        avg_spend=('spend', 'mean'),
        var_spend=('spend', 'var'),
    )
    .round(2)
    .reset_index()
)

# Output the resulting DataFrame
print(spend_by_years)
# ....
# Use this cell to write your code for Task 3
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Load the training data and split it into features and target.
# 'spend' is the value being predicted; 'customer_id' is dropped from the
# features and only used later to label the predictions.
train_data = pd.read_csv('train.csv')
X_train = train_data.drop(columns=['customer_id', 'spend'])
y_train = train_data['spend']

# Columns that are one-hot encoded vs. columns passed to the model unchanged.
categorical_features = ['region', 'loyalty_years', 'joining_month', 'promotion']
numerical_features = ['first_month', 'items_in_first_month']

# Preprocessing shared by the models in this notebook.
# handle_unknown='ignore' makes a category that was not present in the
# training data encode as all zeros at predict time; with the default
# ('error') a single unseen value in test.csv would make predict() raise.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', 'passthrough', numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ])

# Baseline: preprocessing followed by ordinary least-squares regression.
baseline_model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])
baseline_model.fit(X_train, y_train)

# Load the test data and predict spend for every row.
test_data = pd.read_csv('test.csv')
X_test = test_data.drop(columns=['customer_id'])
predicted_spend = baseline_model.predict(X_test)

# One row per test customer: the id and the predicted spend.
base_result = pd.DataFrame({
    'customer_id': test_data['customer_id'],
    'spend': predicted_spend,
})

# Output the resulting DataFrame
print(base_result)
# ....
# Use this cell to write your code for Task 4
from sklearn.base import clone

# Comparison model: the same preprocessing followed by a Random Forest.
# The preprocessor is cloned so this pipeline owns its own unfitted copy.
# Pipeline.fit fits its steps in place, so passing the same object that
# baseline_model holds would re-fit the baseline's transformer as a side
# effect of fitting this model.
comparison_model = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    # Fixed seed so repeated runs build the same forest.
    ('regressor', RandomForestRegressor(random_state=42)),
])
comparison_model.fit(X_train, y_train)

# Predict spend for the same test rows used by the baseline model.
predicted_spend_compare = comparison_model.predict(X_test)

# One row per test customer: the id and the predicted spend.
compare_result = pd.DataFrame({
    'customer_id': test_data['customer_id'],
    'spend': predicted_spend_compare,
})

# Output the resulting DataFrame
print(compare_result)