8/16/24, 5:39 AM GamingBehaviourProject.
ipynb - Colab
import pandas as pd
import numpy as np
# Load the raw player-behaviour table from the Colab workspace.
DATASET_CSV = "/content/online_gaming_behavior_dataset.csv"
df = pd.read_csv(DATASET_CSV)
# Explore the dataset: a bare expression makes the notebook render the frame.
df
PlayerID Age Gender Location GameGenre PlayTimeHours InGamePurchases GameDifficulty SessionsPerWeek AvgSessionDurationMin
0 9000 43 Male Other Strategy 16.271119 0 Medium 6
1 9001 29 Female USA Strategy 5.525961 0 Medium 5
2 9002 22 Female USA Sports 8.223755 0 Easy 16
3 9003 35 Male USA Action 5.265351 1 Easy 9
4 9004 33 Male Europe Action 15.531945 0 Medium 2
... ... ... ... ... ... ... ... ... ...
40029 49029 32 Male USA Strategy 20.619662 0 Easy 4
40030 49030 44 Female Other Simulation 13.539280 0 Hard 19
40031 49031 15 Female USA RPG 0.240057 1 Easy 10
40032 49032 34 Male USA Sports 14.017818 1 Medium 3
40033 49033 19 Male USA Sports 10.083804 0 Easy 13
40034 rows × 13 columns
Next steps: Generate code with df
toggle_off View recommended plots New interactive sheet
# Count the missing values in each column.
df.isna().sum()
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 1/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
PlayerID 0
Age 0
Gender 0
Location 0
GameGenre 0
PlayTimeHours 0
InGamePurchases 0
GameDifficulty 0
SessionsPerWeek 0
AvgSessionDurationMinutes 0
PlayerLevel 0
AchievementsUnlocked 0
EngagementLevel 0
dtype: int64
# Derived feature 'avg_session_length': PlayTimeHours divided by SessionsPerWeek.
# SessionsPerWeek contains zeros, and dividing by them yields +/-inf (or NaN for
# 0/0), which later triggers RuntimeWarnings in describe() and breaks the
# scalers. Masking the zero denominators turns those rows into plain NaN, which
# pandas statistics skip and dropna() removes.
sessions_nonzero = df['SessionsPerWeek'].replace(0, np.nan)
df['avg_session_length'] = df['PlayTimeHours'] / sessions_nonzero
# NOTE(review): 'avg_session_length_per_day' has always been computed with
# exactly the same formula, so it is a duplicate of the column above. It is
# kept so that code referencing it keeps working -- confirm the intended
# per-day formula before using it as a distinct feature or target.
df['avg_session_length_per_day'] = df['avg_session_length'].copy()
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 2/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
# Distribution of Variables:
import matplotlib.pyplot as plt
import seaborn as sns

df['EngagementLevel'] = df['EngagementLevel'].astype(str)

# EngagementLevel is a categorical label, not a numeric score, so a 50-bin
# histogram is meaningless. Draw one bar per category instead.
engagement_counts = df['EngagementLevel'].value_counts()
# Show Low/Medium/High in their natural order when those labels are present;
# any other labels keep their value_counts() order after them.
preferred_order = ['Low', 'Medium', 'High']
ordered_levels = [lvl for lvl in preferred_order if lvl in engagement_counts.index]
ordered_levels += [lvl for lvl in engagement_counts.index if lvl not in preferred_order]
engagement_counts = engagement_counts.reindex(ordered_levels)
plt.bar(engagement_counts.index, engagement_counts.values)
plt.title('Distribution of Engagement Level')
plt.xlabel('Engagement Level')
plt.ylabel('Frequency')
plt.show()

# density plot of average session length
# The ratio can be non-finite where SessionsPerWeek is 0; a kernel density
# estimate is only defined on finite values, so drop the rest explicitly.
finite_session_length = (
    df['avg_session_length'].replace([np.inf, -np.inf], np.nan).dropna()
)
sns.kdeplot(finite_session_length)
plt.title('Distribution of Average Session Length')
plt.xlabel('Average Session Length')
plt.ylabel('Density')
plt.show()  # This will help us understand the shape of each variable's distribution.
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 3/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 4/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
# Correlation Analysis:
import seaborn as sns
import matplotlib.pyplot as plt

# Restrict the frame to numeric dtypes, then compute pairwise correlations
# (DataFrame.corr() with its default method).
numeric_df = df.select_dtypes(include=['number'])
corr_matrix = numeric_df.corr()
print(corr_matrix)

# Draw the matrix as an annotated heatmap and label it through the Axes
# object that seaborn returns.
heatmap_ax = sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', square=True)
heatmap_ax.set_title('Correlation Matrix')
heatmap_ax.set_xlabel('Variables')
heatmap_ax.set_ylabel('Variables')
plt.show()  # This will help us identify relationships between variables.
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 5/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
PlayerID Age PlayTimeHours \
PlayerID 1.000000 -0.003044 0.000923
Age -0.003044 1.000000 0.002462
PlayTimeHours 0.000923 0.002462 1.000000
InGamePurchases 0.002321 -0.000186 -0.006067
SessionsPerWeek -0.005944 0.008777 -0.003655
AvgSessionDurationMinutes -0.001801 -0.002269 -0.001925
PlayerLevel -0.001769 0.001353 -0.005152
AchievementsUnlocked 0.003190 -0.001100 0.003913
avg_session_length 0.004388 -0.003976 0.388983
avg_session_length_per_day 0.004388 -0.003976 0.388983
InGamePurchases SessionsPerWeek \
PlayerID 0.002321 -0.005944
Age -0.000186 0.008777
PlayTimeHours -0.006067 -0.003655
InGamePurchases 1.000000 0.005132
SessionsPerWeek 0.005132 1.000000
AvgSessionDurationMinutes -0.003059 -0.000620
PlayerLevel 0.006524 0.003257
AchievementsUnlocked 0.000098 0.003187
avg_session_length -0.006731 -0.571809
avg_session_length_per_day -0.006731 -0.571809
AvgSessionDurationMinutes PlayerLevel \
PlayerID -0.001801 -0.001769
Age -0.002269 0.001353
PlayTimeHours -0.001925 -0.005152
InGamePurchases -0.003059 0.006524
SessionsPerWeek -0.000620 0.003257
AvgSessionDurationMinutes 1.000000 0.001368
PlayerLevel 0.001368 1.000000
AchievementsUnlocked -0.002227 0.006343
avg_session_length -0.003486 -0.010218
avg_session_length_per_day -0.003486 -0.010218
AchievementsUnlocked avg_session_length \
PlayerID 0.003190 0.004388
Age -0.001100 -0.003976
PlayTimeHours 0.003913 0.388983
InGamePurchases 0.000098 -0.006731
SessionsPerWeek 0.003187 -0.571809
AvgSessionDurationMinutes -0.002227 -0.003486
PlayerLevel 0.006343 -0.010218
AchievementsUnlocked 1.000000 0.001011
avg_session_length 0.001011 1.000000
avg_session_length_per_day 0.001011 1.000000
i l th d
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 6/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
avg_session_length_per_day
PlayerID 0.004388
Age -0.003976
PlayTimeHours 0.388983
InGamePurchases -0.006731
SessionsPerWeek -0.571809
AvgSessionDurationMinutes -0.003486
PlayerLevel -0.010218
AchievementsUnlocked 0.001011
avg_session_length 1.000000
avg_session_length_per_day 1.000000
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 7/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
# Visualization:
import matplotlib.pyplot as plt

# scatter plot of engagement level vs. average session length
plt.scatter(df['avg_session_length'], df['EngagementLevel'])
plt.title('Engagement Score vs. Average Session Length')
plt.xlabel('Average Session Length')
plt.ylabel('Engagement Score')
plt.show()

# Check if there are any columns related to levels completed
print(df.columns)

# Convert 'GameDifficulty' to string before plotting
df['GameDifficulty'] = df['GameDifficulty'].astype(str)

# bar chart of mean PlayerLevel by difficulty level
# groupby() returns its groups in sorted key order, whereas unique() returns
# values in order of first appearance. Taking labels from unique() and heights
# from groupby() pairs bars with the wrong difficulty, so both the labels and
# the heights are read from the same grouped Series.
mean_level_by_difficulty = df.groupby('GameDifficulty')['PlayerLevel'].mean()
plt.bar(mean_level_by_difficulty.index, mean_level_by_difficulty.values)
plt.title('Levels Completed by Difficulty Level')
plt.xlabel('Difficulty Level')
plt.ylabel('Average Levels Completed')
plt.show()  # This will help us identify patterns and relationships between variables.
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 8/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
Index(['PlayerID', 'Age', 'Gender', 'Location', 'GameGenre', 'PlayTimeHours',
'InGamePurchases', 'GameDifficulty', 'SessionsPerWeek',
'AvgSessionDurationMinutes', 'PlayerLevel', 'AchievementsUnlocked',
'EngagementLevel', 'avg_session_length', 'avg_session_length_per_day'],
dtype='object')
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 9/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
# Number of rows for each distinct value of 'Gender'.
df['Gender'].value_counts()
count
Gender
Male 23959
Female 16075
dtype: int64
# Number of rows for each distinct value of 'Location'.
df['Location'].value_counts()
count
Location
USA 16000
Europe 12004
Asia 8095
Other 3935
dtype: int64
# Number of rows for each distinct value of 'GameGenre'.
df['GameGenre'].value_counts()
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 10/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
count
GameGenre
Sports 8048
Action 8039
Strategy 8012
Simulation 7983
RPG 7952
dtype: int64
# Number of rows for each distinct value of 'GameDifficulty'.
df['GameDifficulty'].value_counts()
count
GameDifficulty
Easy 20015
Medium 12011
Hard 8008
dtype: int64
# Number of rows for each distinct value of 'EngagementLevel'.
df['EngagementLevel'].value_counts()
count
EngagementLevel
Medium 19374
High 10336
Low 10324
dtype: int64
from sklearn.preprocessing import LabelEncoder
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 11/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
# Integer-encode the categorical columns, keeping one fitted LabelEncoder per
# column. Re-fitting a single shared encoder overwrites its learned classes on
# every call, so only the last column could ever be decoded again; with
# `encoders[col].inverse_transform(...)` every column stays reversible.
# NOTE(review): LabelEncoder assigns codes from the sorted class labels, so the
# codes of 'GameDifficulty' and 'EngagementLevel' do not follow their natural
# low-to-high order -- confirm that is acceptable for downstream models.
categorical_columns = ['Gender', 'Location', 'GameGenre', 'GameDifficulty', 'EngagementLevel']
encoders = {}
for column in categorical_columns:
    encoders[column] = LabelEncoder()
    df[column] = encoders[column].fit_transform(df[column])

# Backward compatibility: `lb` previously ended up fitted on the last column.
lb = encoders['EngagementLevel']
df.head(3)
/usr/local/lib/python3.10/dist-packages/pandas/core/nanops.py:1010: RuntimeWarning: invalid value encountered in subtract
sqr = _ensure_numeric((avg - values) ** 2)
PlayerID Age Gender Location GameGenre PlayTimeHours InGamePurchases GameDifficulty SessionsPerWeek AvgSessionDurationMinutes
0 9000 43 1 2 4 16.271119 0 2 6 108
1 9001 29 0 3 4 5.525961 0 2 5 144
2 9002 22 0 3 3 8.223755 0 0 16 142
from sklearn.model_selection import train_test_split

# Target: the derived 'avg_session_length_per_day' column.
# Features: every other column of df.
# NOTE(review): x still contains 'avg_session_length', which is computed with
# the same formula as the target -- verify this is intended.
y = df['avg_session_length_per_day']
x = df.drop(columns=['avg_session_length_per_day'])

# 80/20 split with a fixed seed so the partition is reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Report the shape of the full frame and of every piece of the split.
shape_report = [
    ("Total data shape : ", df),
    ("Independent data shape : ", x),
    ("Dependent data shape : ", y),
    ("x_train data shape : ", x_train),
    ("x_test data shape : ", x_test),
    ("y_train data shape : ", y_train),
    ("y_test data shape : ", y_test),
]
for caption, data in shape_report:
    print(caption, data.shape)
Total data shape : (40034, 15)
Independent data shape : (40034, 14)
Dependent data shape : (40034,)
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 12/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
x_train data shape : (32027, 14)
x_test data shape : (8007, 14)
y_train data shape : (32027,)
y_test data shape : (8007,)
# Summary statistics of the training features, rounded to whole numbers.
x_train.describe().round(0)
/usr/local/lib/python3.10/dist-packages/pandas/core/nanops.py:1010: RuntimeWarning: invalid value encountered in subtract
sqr = _ensure_numeric((avg - values) ** 2)
/usr/local/lib/python3.10/dist-packages/pandas/core/nanops.py:1010: RuntimeWarning: invalid value encountered in subtract
sqr = _ensure_numeric((avg - values) ** 2)
PlayerID Age Gender Location GameGenre PlayTimeHours InGamePurchases GameDifficulty SessionsPerWeek AvgSessionDuratio
count 32027.0 32027.0 32027.0 32027.0 32027.0 32027.0 32027.0 32027.0 32027.0
mean 29023.0 32.0 1.0 2.0 2.0 12.0 0.0 1.0 9.0
std 11534.0 10.0 0.0 1.0 1.0 7.0 0.0 1.0 6.0
min 9000.0 15.0 0.0 0.0 0.0 0.0 0.0 0.0 0.0
25% 19070.0 23.0 0.0 1.0 1.0 6.0 0.0 0.0 4.0
50% 28994.0 32.0 1.0 1.0 2.0 12.0 0.0 1.0 9.0
75% 38978.0 41.0 1.0 3.0 3.0 18.0 0.0 2.0 14.0
max 49033.0 49.0 1.0 3.0 4.0 24.0 1.0 2.0 19.0
# STANDARDIZATION
from sklearn.preprocessing import StandardScaler

sc = StandardScaler()

# Treat +/-inf as missing and drop the affected rows. This is done by
# reassignment rather than with inplace=True, so the frame returned by
# train_test_split is never mutated behind pandas' back.
x_train = x_train.replace([np.inf, -np.inf], np.nan).dropna()
# Rows were removed from the features, so remove the same rows from the target;
# otherwise x_train and y_train no longer describe the same samples.
y_train = y_train.loc[x_train.index]

# Fit the scaler on the cleaned training features and scale them to zero mean
# and unit variance.
x_train_sc = sc.fit_transform(x_train)
# Keep the original row labels so the scaled frame can still be matched
# against y_train by index.
x_train_new = pd.DataFrame(x_train_sc, columns=x_train.columns, index=x_train.index)
np.round(x_train_new.describe(), 2)
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 13/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
PlayerID Age Gender Location GameGenre PlayTimeHours InGamePurchases GameDifficulty SessionsPerWeek AvgSessionDurat
count 30470.00 30470.00 30470.00 30470.00 30470.00 30470.00 30470.0 30470.00 30470.00
mean -0.00 -0.00 0.00 -0.00 0.00 -0.00 -0.0 -0.00 0.00
std 1.00 1.00 1.00 1.00 1.00 1.00 1.0 1.00 1.00
min -1.74 -1.69 -1.22 -1.43 -1.42 -1.74 -0.5 -0.92 -1.64
25% -0.86 -0.89 -1.22 -0.59 -0.71 -0.86 -0.5 -0.92 -0.91
50% -0.00 0.00 0.82 -0.59 -0.00 -0.00 -0.5 0.23 0.01
75% 0.86 0.90 0.82 1.09 0.70 0.86 -0.5 1.38 0.92
max 1.73 1.70 0.82 1.09 1.41 1.73 2.0 1.38 1.65
# NORMALIZATION
from sklearn.preprocessing import MinMaxScaler

# Learn each column's min/max from x_train, then rescale x_train with them.
mn = MinMaxScaler()
x_train_mn = mn.fit(x_train).transform(x_train)
# x_train_new is rebound here to the min-max scaled features.
x_train_new = pd.DataFrame(data=x_train_mn, columns=x.columns)
x_train_new.describe().round(2)
PlayerID Age Gender Location GameGenre PlayTimeHours InGamePurchases GameDifficulty SessionsPerWeek AvgSessionDurat
count 30470.00 30470.00 30470.00 30470.00 30470.00 30470.00 30470.0 30470.00 30470.00
mean 0.50 0.50 0.60 0.57 0.50 0.50 0.2 0.40 0.50
std 0.29 0.30 0.49 0.40 0.35 0.29 0.4 0.44 0.30
min 0.00 0.00 0.00 0.00 0.00 0.00 0.0 0.00 0.00
25% 0.25 0.24 0.00 0.33 0.25 0.25 0.0 0.00 0.22
50% 0.50 0.50 1.00 0.33 0.50 0.50 0.0 0.50 0.50
75% 0.75 0.76 1.00 1.00 0.75 0.75 0.0 1.00 0.78
max 1.00 1.00 1.00 1.00 1.00 1.00 1.0 1.00 1.00
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 14/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
RandomForestRegressor
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

# Clean once, before splitting: the derived ratio columns are +/-inf or NaN
# wherever SessionsPerWeek is 0. Dropping those rows from the whole frame keeps
# features and target aligned by construction, instead of cleaning x and y
# separately and re-pairing them by position.
model_df = df.replace([np.inf, -np.inf], np.nan).dropna()

# 'avg_session_length' is computed with exactly the same formula as the target,
# so leaving it among the features would leak the answer to the model.
x = model_df.drop(columns=['avg_session_length_per_day', 'avg_session_length'])
y = model_df['avg_session_length_per_day']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training split only, then apply the SAME fitted scaler
# to the test split. Training on scaled features while predicting on raw ones
# (as this cell previously did) puts the two sets on different scales and
# yields a strongly negative R^2.
feature_scaler = MinMaxScaler()
x_train_new_cleaned = pd.DataFrame(
    feature_scaler.fit_transform(x_train), columns=x.columns, index=x_train.index
)
x_test_cleaned = pd.DataFrame(
    feature_scaler.transform(x_test), columns=x.columns, index=x_test.index
)
# The targets are already finite; these names are kept for any code that
# refers to them.
y_train_cleaned = y_train
y_test_cleaned = y_test

# A fixed seed makes the forest, and therefore the reported score, reproducible.
rf = RandomForestRegressor(random_state=42)
rf.fit(x_train_new_cleaned, y_train_cleaned)

y_pred = rf.predict(x_test_cleaned)
r2_score(y_test_cleaned, y_pred)
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 15/16
8/16/24, 5:39 AM GamingBehaviourProject.ipynb - Colab
-28.70649279143312
https://colab.research.google.com/drive/1loZ1r8IYhLYDL8oPOVtKQcR0VXAQYGty#scrollTo=eSR2Pks1lBQ5&printMode=true 16/16