4-Hour "Full Stack Data Science" Project Cheat Sheet
Step 1: Get Data (The SQL-in-Python Part)
Imports:
import pandas as pd
import numpy as np
import sqlite3 # <-- This is for running SQL
import re # For text cleaning
from datetime import datetime
# --- Modeling Imports ---
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier # (Good default choice)
from sklearn.linear_model import LogisticRegression # (Simpler choice)
from sklearn.metrics import accuracy_score, classification_report
Connect & Load Data
# 1. Create a connection
conn = sqlite3.connect('the_database_file_name.db')
# 2. (Optional) See what tables are in the database:
tables_query = "SELECT name FROM sqlite_master WHERE type='table';"
tables = pd.read_sql_query(tables_query, conn)
print("Tables in the DB:")
print(tables)
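# 2b. (Optional) Peek at one table's columns before writing the query.
#     (A quick sketch; 'customers' is the example table name here.)
cols = pd.read_sql_query("PRAGMA table_info(customers);", conn)
print(cols[['name', 'type']])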
# 3. Write your SQL query (as a multi-line string)
# (Use SQL to do heavy joins/filtering *before* Python)
sql_query = """
SELECT
c.customer_id,
c.age,
c.signup_date,
COUNT(t.transaction_id) as purchase_count,
SUM(t.amount) as total_spend
FROM
customers c
LEFT JOIN
transactions t ON c.customer_id = t.customer_id
GROUP BY
c.customer_id
"""
# 4. Load SQL query *directly* into a pandas DataFrame
df = pd.read_sql_query(sql_query, conn)
# 5. Close the connection
conn.close()
# (If they also give a CSV, just use this):
# df_csv = pd.read_csv('another_file.csv')
Step 2: Clean & Explore Data (EDA)
First Look (Find Problems)
df.info() # <-- MOST IMPORTANT. Check for nulls & wrong data types
print(df.head()) # See what the data looks like
print(df.describe()) # Get stats (mean, median, etc.)
print(df.isnull().sum()) # See null counts per column
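Two more quick checks that often pay off (a sketch; 'category' is an example column):
print(df.duplicated().sum()) # Count fully-duplicated rows
print(df['category'].value_counts()) # Spot typos / rare categories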
Common Cleaning
# --- Fill Missing Numbers (e.g., 'age') ---
median_val = df['age'].median()
df['age'] = df['age'].fillna(median_val)
# --- Fill Missing Text (e.g., 'category') ---
mode_val = df['category'].mode()[0]
df['category'] = df['category'].fillna(mode_val)
# --- Fix 'object' columns that should be numbers ---
# e.g., "$1,250.75" -> 1250.75
df['price'] = df['price'].str.replace(r'[$,]', '', regex=True).astype(float)
# --- Fix 'object' columns that should be dates ---
df['signup_date'] = pd.to_datetime(df['signup_date'])
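If some dates are malformed, pd.to_datetime raises an error. A hedged variant that converts bad values to NaT so you can fill or drop them:
df['signup_date'] = pd.to_datetime(df['signup_date'], errors='coerce')
print(df['signup_date'].isnull().sum()) # How many failed to parse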
Step 3: Feature Engineering (How You Win)
From Dates (after pd.to_datetime)
df['signup_year'] = df['signup_date'].dt.year
df['signup_month'] = df['signup_date'].dt.month
df['signup_day_of_week'] = df['signup_date'].dt.dayofweek # (Mon=0, Sun=6)
df['is_weekend'] = df['signup_day_of_week'].isin([5, 6]).astype(int)
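One more date feature that often helps churn-style targets: account tenure. A sketch, assuming a fixed snapshot date (keeps results reproducible, unlike datetime.now()):
snapshot = pd.Timestamp('2024-01-01') # Assumed cutoff; pick one that fits the data
df['tenure_days'] = (snapshot - df['signup_date']).dt.days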
From Text (using re)
# e.g., Extract area code from ' (555) 123-4567'
df['area_code'] = df['phone'].str.extract(r'\((\d{3})\)')
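The same str.extract trick works on other text columns, e.g., pulling the domain out of a hypothetical 'email' column:
df['email_domain'] = df['email'].str.extract(r'@([\w.]+)$')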
From Numbers (Binning)
# Group ages into categories
def age_group(age):
    if age < 30: return '18-29'
    elif age < 50: return '30-49'
    else: return '50+'

df['age_group'] = df['age'].apply(age_group)
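The same binning in one vectorized call with pd.cut (equivalent to the function above; right=False makes each bin left-inclusive):
df['age_group'] = pd.cut(df['age'], bins=[-np.inf, 30, 50, np.inf], labels=['18-29', '30-49', '50+'], right=False)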
Turn Categories into Dummies (for Modeling)
# Creates new 0/1 columns for each category
df = pd.get_dummies(df, columns=['age_group', 'region'], drop_first=True)
Step 4: Modeling (Train & Evaluate)
1. Define X (Features) and y (Target)
# 'y' is the one column you want to predict
y = df['churn']
# 'X' is all the features. Drop the target AND any ID/non-numeric columns!
X = df.drop(['churn', 'customer_id', 'name', 'signup_date'], axis=1)
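If you're not sure you caught every non-numeric column, a defensive sketch that keeps only numeric features:
X = df.drop(columns=['churn']).select_dtypes(include=[np.number])
X = X.drop(columns=['customer_id'], errors='ignore') # IDs are numeric but not features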
2. Split Data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
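If the target is imbalanced (churn usually is), stratify so both splits keep the same class ratio; a drop-in variant:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)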
3. Scale Features (Vital for LogisticRegression; RandomForest doesn't need it, but it doesn't hurt)
scaler = StandardScaler()
# Fit on train data
X_train_scaled = scaler.fit_transform(X_train)
# ONLY transform on test data
X_test_scaled = scaler.transform(X_test)
4. Train Model
# Use a good, all-around model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_scaled, y_train)
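Short on time, or want a sanity-check baseline? The simpler import from Step 1 drops straight in (and this one genuinely needs the scaled features):
baseline = LogisticRegression(max_iter=1000, random_state=42)
baseline.fit(X_train_scaled, y_train)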
5. Evaluate Model (on your test set)
predictions = model.predict(X_test_scaled)
print(f"Accuracy: {accuracy_score(y_test, predictions):.4f}")
print("\nClassification Report:")
print(classification_report(y_test, predictions))
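For the write-up, RandomForest's feature importances show which engineered features actually mattered (a quick sketch):
importances = pd.Series(model.feature_importances_, index=X.columns)
print(importances.sort_values(ascending=False).head(10))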
Step 5: Final Submission (The Deliverable)
You will be given a new "test" file with no target column. Apply your full pipeline to it: the same cleaning, the same features, the same dummies, and the scaler you already fitted on training.
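The snippet below starts at step 5, so here is a hedged sketch of the implied steps 1-4. The file name 'final_test.csv' and the exact calls are assumptions; mirror your own Steps 2-3, reusing values fitted on training (median_val, X.columns, scaler):
# 1. Load the held-out test file (file name assumed)
df_final_test = pd.read_csv('final_test.csv')
# 2. Repeat the SAME cleaning + feature engineering as Steps 2-3,
#    reusing TRAINING statistics, not the test file's own
df_final_test['age'] = df_final_test['age'].fillna(median_val)
# ...repeat date features, binning, get_dummies from Steps 2-3 here...
# 3. Drop the same non-feature columns and align with training
X_final_test = df_final_test.drop(['customer_id', 'name', 'signup_date'], axis=1)
X_final_test = X_final_test.reindex(columns=X.columns, fill_value=0)
# 4. Scale with the scaler already fitted on training (transform only!)
X_final_test_scaled = scaler.transform(X_final_test)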
# 5. Make Final Predictions
final_predictions = model.predict(X_final_test_scaled)
# 6. Create Submission File
submission = pd.DataFrame({
    'customer_id': df_final_test['customer_id'], # Get ID from the test file
    'prediction': final_predictions
})
# 7. Save to CSV (index=False is VITAL)
submission.to_csv('my_submission.csv', index=False)
print("Submission file created successfully!")