import numpy as np
import pandas as pd
# Read the OTT dataset from the CSV file in the working directory.
df = pd.read_csv("ottdata.csv")

# Preview the first five rows (5 is the default for head()).
df.head(5)
visitors ad_impressions major_sports_event genre dayofweek
season \
0 1.67 1113.81 0 Horror Wednesday
Spring
1 1.46 1498.41 1 Thriller Friday
Fall
2 1.47 1079.19 1 Thriller Wednesday
Fall
3 1.85 1342.77 1 Sci-Fi Friday
Fall
4 1.46 1498.41 0 Sci-Fi Sunday
Winter
views_trailer views_content
0 56.70 0.51
1 52.69 0.32
2 48.74 0.39
3 49.81 0.44
4 55.83 0.46
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()
visitors ad_impressions major_sports_event views_trailer
\
count 1000.000000 1000.000000 1000.000000 1000.00000
mean 1.704290 1434.712290 0.400000 66.91559
std 0.231973 289.534834 0.490143 35.00108
min 1.250000 1010.870000 0.000000 30.08000
25% 1.550000 1210.330000 0.000000 50.94750
50% 1.700000 1383.580000 0.000000 53.96000
75% 1.830000 1623.670000 1.000000 57.75500
max 2.340000 2424.200000 1.000000 199.92000
views_content
count 1000.000000
mean 0.473400
std 0.105914
min 0.220000
25% 0.400000
50% 0.450000
75% 0.520000
max 0.890000
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 8 columns):
# Column Non-Null Count Dtype
--- ------ -------------- -----
0 visitors 1000 non-null float64
1 ad_impressions 1000 non-null float64
2 major_sports_event 1000 non-null int64
3 genre 1000 non-null object
4 dayofweek 1000 non-null object
5 season 1000 non-null object
6 views_trailer 1000 non-null float64
7 views_content 1000 non-null float64
dtypes: float64(4), int64(1), object(3)
memory usage: 62.6+ KB
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's white-grid theme to this and all following plots.
sns.set(style="whitegrid")

# Histogram of content views (30 bins) with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(df['views_content'], bins=30, kde=True, color='red', ax=ax)
ax.set_title('Distribution of Content Views')
ax.set_xlabel('Views on Content')
ax.set_ylabel('Frequency')
fig.tight_layout()
plt.show()
# Bar chart of how many rows fall into each genre, most frequent genre first.
genre_order = df['genre'].value_counts().index

plt.figure(figsize=(10, 6))
# seaborn deprecates `palette` without `hue`, so the x variable is also
# assigned to `hue`. `hue_order` matches `order` so the palette colours are
# assigned left-to-right exactly as before, and the redundant legend is
# suppressed.
sns.countplot(
    data=df,
    x='genre',
    hue='genre',
    order=genre_order,
    hue_order=genre_order,
    palette='Set2',
    legend=False,
)
plt.title('Distribution of Content Genres')
plt.xlabel('Genre')
plt.ylabel('Count')
plt.xticks(rotation=45)  # genre labels overlap when drawn horizontally
plt.tight_layout()
plt.show()
C:\Users\anura\AppData\Local\Temp\ipykernel_2388\2618020269.py:2:
FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be
removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.
sns.countplot(data=df, x='genre',
order=df['genre'].value_counts().index, palette='Set2')
# Mean content views per release day, sorted ascending so the bars rise
# from left to right.
# The whole chain must stay in one expression: splitting it after
# groupby(...) leaves a bare list literal on the next line.
day_avg_views = (
    df.groupby('dayofweek')['views_content']
    .mean()
    .sort_values()
)

plt.figure(figsize=(8, 5))
# `hue` mirrors `x` so the palette can be applied without seaborn's
# "palette without hue" deprecation warning; the legend would only repeat
# the x tick labels, so it is turned off.
sns.barplot(
    x=day_avg_views.index,
    y=day_avg_views.values,
    hue=day_avg_views.index,
    palette='coolwarm',
    legend=False,
)
plt.title("Average Content Views by Day of Release")
plt.xlabel("Day of the Week")
plt.ylabel("Average Content Views")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()
C:\Users\anura\AppData\Local\Temp\ipykernel_2388\1698395269.py:4:
FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be
removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.
sns.barplot(x=day_avg_views.index,y=day_avg_views.values,palette='cool
warm')
# Box plot of content views for each season, with one fixed colour per season.
plt.figure(figsize=(5, 5))

custom_palette = {
    'Spring': 'orange',
    'Fall': 'blue',
    'Summer': 'green',
    'Winter': 'red',
}

# A single draw is enough: an earlier un-coloured boxplot on the same axes
# was completely overdrawn by this one. `hue` mirrors `x` so the palette is
# applied without seaborn's deprecation warning; the legend is redundant
# with the x tick labels.
sns.boxplot(
    data=df,
    x='season',
    y='views_content',
    hue='season',
    palette=custom_palette,
    legend=False,
)
C:\Users\anura\AppData\Local\Temp\ipykernel_2388\2208172424.py:4:
FutureWarning:
Passing `palette` without assigning `hue` is deprecated and will be
removed in v0.14.0. Assign the `x` variable to `hue` and set
`legend=False` for the same effect.
sns.boxplot(data=df,x='season',
y='views_content',palette=custom_palette)
<Axes: xlabel='season', ylabel='views_content'>
# Correlation between trailer views and content views
# (Series.corr defaults to Pearson).
correlation = df['views_trailer'].corr(df['views_content'])

# The message is built from adjacent literals so the line can be wrapped
# without breaking the string itself.
print(
    "Correlation between trailer views and content views: "
    f"{correlation:.2f}"
)
Correlation between trailer views and content views: 0.75
# Scatter of trailer views against content views, with a fitted regression
# line drawn in red on the same axes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.scatterplot(data=df, x='views_trailer', y='views_content', ax=ax)
sns.regplot(
    data=df,
    x='views_trailer',
    y='views_content',
    scatter=False,  # points are already drawn by scatterplot above
    color='red',
    ax=ax,
)
ax.set_title('Correlation between Trailer Views and Content Views')
ax.set_xlabel('Trailer Views')
ax.set_ylabel('Content Views')
plt.show()
# Keep the rows that duplicated() flags as repeats of an earlier row,
# then report how many there are.
duplicates = df.loc[df.duplicated()]
print(f"Number of duplicate rows: {len(duplicates)}")
Number of duplicate rows: 0
# Number of missing values in each column.
df.isnull().sum()
visitors 0
ad_impressions 0
major_sports_event 0
genre 0
dayofweek 0
season 0
views_trailer 0
views_content 0
dtype: int64
# Horizontal box plot of content views, used to eyeball outliers.
ax = sns.boxplot(x=df['views_content'])
ax.set_title('Boxplot of Content Views')
ax.set_xlabel('Content Views')
plt.show()
from sklearn.model_selection import train_test_split
# Features are every column except the last; the target is the last column
# of the raw (un-encoded) frame.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the shape of each partition.
for label, part in (
    ("X_train", X_train),
    ("X_test", X_test),
    ("y_train", y_train),
    ("y_test", y_test),
):
    print(f"{label} shape:", part.shape)
X_train shape: (800, 7)
X_test shape: (200, 7)
y_train shape: (800,)
y_test shape: (200,)
from sklearn.linear_model import LinearRegression
# One-hot encode the categorical columns; drop_first removes one level per
# category to avoid perfectly collinear dummies.
df_encoded = pd.get_dummies(df, drop_first=True)

# Select the target by name. get_dummies appends the dummy columns after
# the numeric ones, so the last column of df_encoded is a season dummy and
# positional slicing (iloc[:, -1]) would both pick the wrong target and
# leave views_content among the features.
y = df_encoded['views_content']
X = df_encoded.drop(columns='views_content')

# 80/20 split with a fixed seed so the partition is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = LinearRegression()
model.fit(X_train, y_train)
LinearRegression()
# Report the fitted intercept, then one "feature: coefficient" line per
# column of X (coef_ is paired with X's columns by position).
print("Intercept:", model.intercept_)
print("Coefficients:")
for feature, coef in zip(X.columns, model.coef_):
    # The loop body must be indented; at column 0 it is an IndentationError.
    print(f"{feature}: {coef}")
Intercept: 0.18811987016589798
Coefficients:
visitors: -0.04440734743739965
ad_impressions: 8.990939399706324e-06
major_sports_event: 0.07516397750507174
views_trailer: -0.0027899014914202452
views_content: 1.3118786110793876
genre_Comedy: 0.011095989825962885
genre_Drama: -0.0016560167230568895
genre_Horror: -0.08656635482699294
genre_Others: -0.056001381566065864
genre_Romance: -0.04919131479908598
genre_Sci-Fi: 0.02188836518732013
genre_Thriller: -0.08216761016780921
dayofweek_Monday: -0.16879363232769098
dayofweek_Saturday: -0.03442416007668321
dayofweek_Sunday: -0.04048380937273572
dayofweek_Thursday: 0.05764282958222824
dayofweek_Tuesday: -0.1028157961476516
dayofweek_Wednesday: -0.08262541105422154
season_Spring: -0.5195187305634899
season_Summer: -0.55614695920083
# Two-column table pairing each feature name with its fitted coefficient.
coefficients = pd.DataFrame(
    data=dict(Feature=X.columns, Coefficient=model.coef_)
)

# Header (preceded by a blank line) followed by the table itself.
print("\nModel Coefficients:", coefficients, sep="\n")
Model Coefficients:
Feature Coefficient
0 visitors -0.044407
1 ad_impressions 0.000009
2 major_sports_event 0.075164
3 views_trailer -0.002790
4 views_content 1.311879
5 genre_Comedy 0.011096
6 genre_Drama -0.001656
7 genre_Horror -0.086566
8 genre_Others -0.056001
9 genre_Romance -0.049191
10 genre_Sci-Fi 0.021888
11 genre_Thriller -0.082168
12 dayofweek_Monday -0.168794
13 dayofweek_Saturday -0.034424
14 dayofweek_Sunday -0.040484
15 dayofweek_Thursday 0.057643
16 dayofweek_Tuesday -0.102816
17 dayofweek_Wednesday -0.082625
18 season_Spring -0.519519
19 season_Summer -0.556147
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
import statsmodels.api as sm
# Fit an OLS model with statsmodels so residuals and fitted values are
# available for the assumption checks plotted afterwards.
# NOTE(review): the target here is 'visitors', whereas the exploratory
# plots in this file focus on 'views_content' - confirm this is the
# intended response variable.
X = df.drop(columns='visitors')
y = df['visitors']

categorical_features = ['genre', 'dayofweek', 'season']
# Not referenced below: the numeric columns reach the design matrix through
# remainder='passthrough'. Kept for reference.
numerical_features = [
    'ad_impressions',
    'major_sports_event',
    'views_trailer',
    'views_content',
]

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(drop='first'), categorical_features),
    ],
    remainder='passthrough',
    # OneHotEncoder emits a sparse matrix, and ColumnTransformer keeps the
    # stacked result sparse when its density is below the threshold.
    # statsmodels needs a dense array, so always request dense output.
    sparse_threshold=0,
)

X_processed = preprocessor.fit_transform(X)
X_processed = sm.add_constant(X_processed)  # Add intercept

model = sm.OLS(y, X_processed).fit()
residuals = model.resid
fitted = model.fittedvalues
# Start the 2x2 diagnostics figure. Panel 1: actual values against fitted
# values, with the red y = x line marking perfect agreement.
plt.figure(figsize=(16, 12))
ax_linearity = plt.subplot(2, 2, 1)
sns.scatterplot(x=fitted, y=y, ax=ax_linearity)
ax_linearity.plot(fitted, fitted, color='red')
ax_linearity.set_title("Linearity: Fitted vs Actual")
Text(0.5, 1.0, 'Linearity: Fitted vs Actual')
# Panel 2: residuals against fitted values, with a dashed red reference
# line at zero.
ax_resid = plt.subplot(2, 2, 2)
sns.scatterplot(x=fitted, y=residuals, ax=ax_resid)
ax_resid.axhline(0, color='red', linestyle='--')
ax_resid.set_title("Homoscedasticity: Residuals vs Fitted")
Text(0.5, 1.0, 'Homoscedasticity: Residuals vs Fitted')
# Panel 3: histogram of the residuals with a KDE curve overlaid.
ax_normality = plt.subplot(2, 2, 3)
sns.histplot(residuals, kde=True, ax=ax_normality)
ax_normality.set_title("Normality of Residuals")
Text(0.5, 1.0, 'Normality of Residuals')
from sklearn.ensemble import RandomForestRegressor
# Parenthesised so the import list can span several lines; a bare trailing
# comma followed by a newline is a syntax error.
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
)

# Drop rows with missing values, then one-hot encode the categorical
# columns (drop_first removes one level per category).
df_clean = df.dropna()
df_encoded = pd.get_dummies(
    df_clean,
    columns=['genre', 'dayofweek', 'season'],
    drop_first=True,
)

# NOTE(review): the target here is 'visitors', whereas the exploratory
# plots in this file focus on 'views_content' - confirm this is the
# intended response variable.
X = df_encoded.drop('visitors', axis=1)
y = df_encoded['visitors']

# Re-split after rebuilding X and y. Without this the forest would be
# trained and scored on X_train / y_train / X_test / y_test left over from
# an earlier cell, which hold a different feature set and target.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
RandomForestRegressor(random_state=42)
# Score the fitted model on the held-out partition.
y_pred = model.predict(X_test)

# Squared-error metrics first; RMSE is the square root of MSE.
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)

# Absolute error and coefficient of determination.
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Echo all four metrics as the cell result.
(mae, mse, rmse, r2)
(0.26805, 0.1555445, 0.3943913031495497, 0.18123700486906158)