0% found this document useful (0 votes)
11 views · 13 pages

Predictive Modeling

The document presents an analysis of a dataset containing information about visitors, ad impressions, and content views from a streaming platform. It includes data cleaning, exploratory data analysis with visualizations, and the development of predictive models using linear regression and random forest regression. Key findings include a strong correlation between trailer views and content views, as well as the impact of various features on visitor counts.

Uploaded by

anuragsingh0406
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd
0% found this document useful (0 votes)
11 views · 13 pages

Predictive Modeling

The document presents an analysis of a dataset containing information about visitors, ad impressions, and content views from a streaming platform. It includes data cleaning, exploratory data analysis with visualizations, and the development of predictive models using linear regression and random forest regression. Key findings include a strong correlation between trailer views and content views, as well as the impact of various features on visitor counts.

Uploaded by

anuragsingh0406
Copyright
© All Rights Reserved
We take content rights seriously. If you suspect this is your content, claim it here.
Available Formats
Download as PDF, TXT or read online on Scribd

import numpy as np

import pandas as pd

# Load the OTT dataset from the working directory into the notebook-wide frame.
df = pd.read_csv(filepath_or_buffer="ottdata.csv")

# Preview the first five rows (five is also the default for head()).
df.head(5)

visitors ad_impressions major_sports_event genre dayofweek


season \
0 1.67 1113.81 0 Horror Wednesday
Spring
1 1.46 1498.41 1 Thriller Friday
Fall
2 1.47 1079.19 1 Thriller Wednesday
Fall
3 1.85 1342.77 1 Sci-Fi Friday
Fall
4 1.46 1498.41 0 Sci-Fi Sunday
Winter

views_trailer views_content
0 56.70 0.51
1 52.69 0.32
2 48.74 0.39
3 49.81 0.44
4 55.83 0.46

# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

visitors ad_impressions major_sports_event views_trailer


\
count 1000.000000 1000.000000 1000.000000 1000.00000

mean 1.704290 1434.712290 0.400000 66.91559

std 0.231973 289.534834 0.490143 35.00108

min 1.250000 1010.870000 0.000000 30.08000

25% 1.550000 1210.330000 0.000000 50.94750

50% 1.700000 1383.580000 0.000000 53.96000

75% 1.830000 1623.670000 1.000000 57.75500

max 2.340000 2424.200000 1.000000 199.92000

views_content
count 1000.000000
mean 0.473400
std 0.105914
min 0.220000
25% 0.400000
50% 0.450000
75% 0.520000
max 0.890000

# Per-column dtype and non-null count, plus the frame's memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 visitors 1000 non-null float64
1 ad_impressions 1000 non-null float64
2 major_sports_event 1000 non-null int64
3 genre 1000 non-null object
4 dayofweek 1000 non-null object
5 season 1000 non-null object
6 views_trailer 1000 non-null float64
7 views_content 1000 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 62.6+ KB

import matplotlib.pyplot as plt


import seaborn as sns

# Apply the white-grid seaborn theme to every figure drawn from here on.
sns.set(style="whitegrid")

# Histogram of content views (30 bins) with a kernel-density overlay.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(data=df, x='views_content', bins=30, kde=True, color='red', ax=ax)
ax.set_title('Distribution of Content Views')
ax.set_xlabel('Views on Content')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()
# Bar chart of how many titles fall in each genre, most frequent genre first.
plt.figure(figsize=(10, 6))
genre_order = df['genre'].value_counts().index
sns.countplot(
    data=df,
    x='genre',
    # Passing `palette` without `hue` is deprecated in seaborn; mapping hue to
    # the x variable (same level order, legend suppressed) gives the same
    # per-bar colouring without the FutureWarning.
    hue='genre',
    order=genre_order,
    hue_order=genre_order,
    palette='Set2',
    legend=False,
)
plt.title('Distribution of Content Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

C:\Users\anura\AppData\Local\Temp\ipykernel_2388\2618020269.py:2:
FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be


removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.

sns.countplot(data=df, x='genre',
order=df['genre'].value_counts().index, palette='Set2')
# Mean content views per release day, sorted ascending so bars grow left to right.
# The groupby/selection/aggregation chain must stay one expression: a line break
# before ['views_content'] would turn it into a separate list literal.
day_avg_views = (
    df.groupby('dayofweek')['views_content']
    .mean()
    .sort_values()
)

plt.figure(figsize=(8, 5))
sns.barplot(
    x=day_avg_views.index,
    y=day_avg_views.values,
    # `palette` without `hue` is deprecated in seaborn; colour by the x levels
    # and hide the redundant legend instead.
    hue=day_avg_views.index,
    palette='coolwarm',
    legend=False,
)
plt.title("Average Content Views by Day of Release")
plt.xlabel("Day of the Week")
plt.ylabel("Average Content Views")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

C:\Users\anura\AppData\Local\Temp\ipykernel_2388\1698395269.py:4:
FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be


removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.

sns.barplot(x=day_avg_views.index,y=day_avg_views.values,palette='cool
warm')
# Distribution of content views per season, one fixed colour per season.
plt.figure(figsize=(5, 5))
custom_palette = {
    'Spring': 'orange',
    'Fall': 'blue',
    'Summer': 'green',
    'Winter': 'red',
}
# Draw the boxplot once. An earlier un-coloured boxplot on the same axes was
# simply painted over by this one, so it is not needed. `hue` is set to the x
# variable because seaborn deprecates `palette` without `hue`.
sns.boxplot(
    data=df,
    x='season',
    y='views_content',
    hue='season',
    palette=custom_palette,
    legend=False,
)

C:\Users\anura\AppData\Local\Temp\ipykernel_2388\2208172424.py:4:
FutureWarning:

Passing `palette` without assigning `hue` is deprecated and will be


removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.

sns.boxplot(data=df,x='season',
y='views_content',palette=custom_palette)

<Axes: xlabel='season', ylabel='views_content'>


# Pearson correlation (pandas' default method) between trailer and content views.
correlation = df['views_trailer'].corr(df['views_content'])
# The message must be a single-line literal; a plain f-string cannot span lines.
print(f"Correlation between trailer views and content views: {correlation:.2f}")

Correlation between trailer views and content views: 0.75

# Scatter of trailer views against content views with a red fitted regression line.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='views_trailer', y='views_content', ax=ax)
# scatter=False: draw only the regression line, the points are already plotted.
sns.regplot(
    data=df,
    x='views_trailer',
    y='views_content',
    scatter=False,
    color='red',
    ax=ax,
)
ax.set_title('Correlation between Trailer Views and Content Views')
ax.set_xlabel('Trailer Views')
ax.set_ylabel('Content Views')
plt.show()
# Rows that exactly repeat an earlier row (first occurrences are not flagged).
duplicates = df.loc[df.duplicated()]
print(f"Number of duplicate rows: {len(duplicates)}")

Number of duplicate rows: 0

# Missing-value count per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

visitors 0
ad_impressions 0
major_sports_event 0
genre 0
dayofweek 0
season 0
views_trailer 0
views_content 0
dtype: int64

# Horizontal boxplot of the target to eyeball outliers.
ax = sns.boxplot(x=df['views_content'])
ax.set_title('Boxplot of Content Views')
ax.set_xlabel('Content Views')
plt.show()
from sklearn.model_selection import train_test_split

# 80/20 hold-out split on the raw (not yet encoded) frame, used here only to
# confirm the partition sizes.
# The target is selected by name rather than by position so the split keeps
# pointing at `views_content` even if columns are later added or reordered.
X = df.drop(columns='views_content')
y = df['views_content']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

print("X_train shape:", X_train.shape)
print("X_test shape:", X_test.shape)
print("y_train shape:", y_train.shape)
print("y_test shape:", y_test.shape)

X_train shape: (800, 7)


X_test shape: (200, 7)
y_train shape: (800,)
y_test shape: (200,)

from sklearn.linear_model import LinearRegression

# One-hot encode the categorical columns (dropping the first level of each to
# avoid perfectly collinear dummies).
df_encoded = pd.get_dummies(df, drop_first=True)

# get_dummies appends the dummy columns on the right, so the LAST column of the
# encoded frame is a season dummy, not `views_content`. Slicing positionally
# (iloc[:, -1]) would train on that dummy and leak `views_content` in as a
# feature, so the target is selected by name.
# NOTE: any previously recorded coefficients that list `views_content` as a
# feature came from the positional split and must be regenerated.
X = df_encoded.drop(columns='views_content')
y = df_encoded['views_content']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Ordinary least-squares fit on the training partition.
model = LinearRegression()
model.fit(X_train, y_train)

LinearRegression()

# Report the fitted intercept followed by one "name: value" line per feature.
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    # The loop body must be indented; at column 0 it is an IndentationError.
    print(f"{feature}: {coef}")

Intercept: 0.18811987016589798
Coefficients:
visitors: -0.04440734743739965
ad_impressions: 8.990939399706324e-06
major_sports_event: 0.07516397750507174
views_trailer: -0.0027899014914202452
views_content: 1.3118786110793876
genre_Comedy: 0.011095989825962885
genre_Drama: -0.0016560167230568895
genre_Horror: -0.08656635482699294
genre_Others: -0.056001381566065864
genre_Romance: -0.04919131479908598
genre_Sci-Fi: 0.02188836518732013
genre_Thriller: -0.08216761016780921
dayofweek_Monday: -0.16879363232769098
dayofweek_Saturday: -0.03442416007668321
dayofweek_Sunday: -0.04048380937273572
dayofweek_Thursday: 0.05764282958222824
dayofweek_Tuesday: -0.1028157961476516
dayofweek_Wednesday: -0.08262541105422154
season_Spring: -0.5195187305634899
season_Summer: -0.55614695920083

# Tabulate each feature name next to its fitted coefficient.
coefficients = pd.DataFrame(
    list(zip(X.columns, model.coef_)),
    columns=['Feature', 'Coefficient'],
)

print("\nModel Coefficients:")
print(coefficients)

Model Coefficients:
Feature Coefficient
0 visitors -0.044407
1 ad_impressions 0.000009
2 major_sports_event 0.075164
3 views_trailer -0.002790
4 views_content 1.311879
5 genre_Comedy 0.011096
6 genre_Drama -0.001656
7 genre_Horror -0.086566
8 genre_Others -0.056001
9 genre_Romance -0.049191
10 genre_Sci-Fi 0.021888
11 genre_Thriller -0.082168
12 dayofweek_Monday -0.168794
13 dayofweek_Saturday -0.034424
14 dayofweek_Sunday -0.040484
15 dayofweek_Thursday 0.057643
16 dayofweek_Tuesday -0.102816
17 dayofweek_Wednesday -0.082625
18 season_Spring -0.519519
19 season_Summer -0.556147

from sklearn.linear_model import LinearRegression


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import statsmodels.api as sm

# Statsmodels OLS of `visitors` on all other columns, for residual diagnostics.
X = df.drop(columns='visitors')
y = df['visitors']

categorical_features = ['genre', 'dayofweek', 'season']
numerical_features = ['ad_impressions', 'major_sports_event',
                      'views_trailer', 'views_content']

preprocessor = ColumnTransformer(
    transformers=[
        # First level of every categorical is dropped to avoid collinearity
        # with the intercept added below.
        ('cat', OneHotEncoder(drop='first'), categorical_features),
        # Numeric columns are forwarded unchanged, after the dummies.
        ('num', 'passthrough', numerical_features),
    ],
    # Always return a dense array: with the default threshold the transformer
    # may hand back a scipy sparse matrix, which sm.add_constant cannot handle.
    sparse_threshold=0,
)

X_processed = preprocessor.fit_transform(X)
X_processed = sm.add_constant(X_processed)  # Add intercept
model = sm.OLS(y, X_processed).fit()

# Kept for the diagnostic plots that follow.
residuals = model.resid
fitted = model.fittedvalues

# Open one large figure that hosts a 2x2 grid of regression diagnostics; the
# following cells draw into this same current figure.
plt.figure(figsize=(16, 12))

# Panel 1: observed target against the OLS fitted values.
plt.subplot(2, 2, 1)
sns.scatterplot(x=fitted, y=y)
# Red identity line (fitted vs fitted): points lying on it are predicted exactly.
plt.plot(fitted, fitted, color='red')
plt.title("Linearity: Fitted vs Actual")
Text(0.5, 1.0, 'Linearity: Fitted vs Actual')

# Panel 2: residuals against fitted values, drawn into the current figure.
plt.subplot(2, 2, 2)
sns.scatterplot(x=fitted, y=residuals)
# Dashed zero line as the reference the residuals should scatter around.
plt.axhline(0, color='red', linestyle='--')
plt.title("Homoscedasticity: Residuals vs Fitted")

Text(0.5, 1.0, 'Homoscedasticity: Residuals vs Fitted')


# Panel 3: histogram of the residuals with a KDE overlay, in the current figure.
plt.subplot(2, 2, 3)
sns.histplot(residuals, kde=True)
plt.title("Normality of Residuals")

Text(0.5, 1.0, 'Normality of Residuals')


from sklearn.ensemble import RandomForestRegressor
# A multi-name import that spans lines must be parenthesised.
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)
from sklearn.model_selection import train_test_split

# Drop incomplete rows, then one-hot encode the three categorical columns.
df_clean = df.dropna()

df_encoded = pd.get_dummies(
    df_clean,
    columns=['genre', 'dayofweek', 'season'],
    drop_first=True,
)

# Predict visitor counts from every other column.
X = df_encoded.drop('visitors', axis=1)
y = df_encoded['visitors']

# Re-split after redefining X and y. Without this, X_train/y_train still hold
# the partition built for the earlier model, so the forest would be trained and
# scored on a different target than `visitors`.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

# Score the fitted model on the held-out partition.
y_pred = model.predict(X_test)

# MAE, MSE and R^2 all share the (y_true, y_pred) signature, so compute them in one pass.
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)
# RMSE is the square root of the MSE.
rmse = np.sqrt(mse)

mae, mse, rmse, r2

(0.26805, 0.1555445, 0.3943913031495497, 0.18123700486906158)

You might also like