#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import seaborn as sns
# # Data Analysis
# In[2]:
# load data
books = pd.read_csv('Books.csv')
users = pd.read_csv('Users.csv')
ratings = pd.read_csv('Ratings.csv')
# In[3]:
books.head()
# In[4]:
users.head()
# In[5]:
ratings.head()
# In[6]:
# Number of unique book titles
books.drop_duplicates(subset=['Book-Title']).shape
# In[7]:
# Get shapes of dataframes
print(books.shape)
print(ratings.shape)
print(users.shape)
# In[8]:
# check for null values
books.isnull().sum()
# In[9]:
# check for null values
users.isnull().sum()
# In[10]:
# check for null values
ratings.isnull().sum()
# In[11]:
# check for any duplicates in dataframes
books.duplicated().sum(), ratings.duplicated().sum(), users.duplicated().sum()
# In[12]:
# Ratings Stats
print('min rating:', ratings['Book-Rating'].min())
print('max rating:', ratings['Book-Rating'].max())
ratings['Book-Rating'].unique()
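# If a rating of 0 marks an implicit "no score" entry rather than the lowest
# explicit rating (a common convention for ratings files like this one, and an
# assumption worth verifying), the share of explicit ratings can be checked
# with a quick sketch:
# In[ ]:
explicit = ratings[ratings['Book-Rating'] > 0]
print('explicit ratings:', len(explicit), 'of', len(ratings))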
# In[13]:
# Merge dataframes
merged_df = pd.merge(ratings, books, on="ISBN")
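# pd.merge defaults to an inner join, so ratings whose ISBN has no match in
# Books.csv are silently dropped. A quick check of how many rows the merge loses:
# In[ ]:
print('ratings:', ratings.shape[0])
print('ratings with book info:', merged_df.shape[0])
print('dropped by the merge:', ratings.shape[0] - merged_df.shape[0])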
# In[14]:
# Display basic statistics
print(merged_df.describe())
print()
# Check for missing values
print(merged_df.isnull().sum())
# In[15]:
# Visualize rating distribution
sns.countplot(x='Book-Rating', data=merged_df)
plt.title('Distribution of Book Ratings')
plt.show()
# In[16]:
# Number of ratings and average rating per book
num_rating_df = merged_df.groupby('Book-Title')['Book-Rating'].count().reset_index()
num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)
avg_rating_df = merged_df.groupby('Book-Title')['Book-Rating'].mean().reset_index()
avg_rating_df.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)
# In[ ]:
# Calculate average rating for each book
average_ratings = (merged_df.groupby('Book-Title')['Book-Rating'].mean()
                   .reset_index()
                   .sort_values(by='Book-Rating', ascending=False))
# Display top-rated books
print("Top Rated Books:")
print(average_ratings.head(10))
# In[ ]:
average_ratings.head()
# In[ ]:
# Select the top N books for visualization
top_n = 10
top_rated_books = average_ratings.head(top_n)
# Plotting
plt.figure(figsize=(10, 6))
plt.barh(top_rated_books['Book-Title'], top_rated_books['Book-Rating'], color='skyblue')
plt.xlabel('Average Rating')
plt.title(f'Top {top_n} Books by Average Rating')
plt.gca().invert_yaxis()  # Invert y-axis for better readability
plt.tight_layout()
plt.show()
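# A raw average can put a book with a single perfect rating at the top of this
# chart. A minimal sketch of a count-aware ranking, reusing the num_rating_df
# and avg_rating_df frames built earlier (the 250-rating cutoff mirrors the one
# used by the recommender class below; rating_stats and well_rated are
# illustrative names):
# In[ ]:
rating_stats = num_rating_df.merge(avg_rating_df, on='Book-Title')
well_rated = rating_stats[rating_stats['num_ratings'] >= 250]
print(well_rated.sort_values('avg_rating', ascending=False).head(10))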
# # Recommender System
# In[ ]:
class RecommendationSystem:
    # Constructor
    def __init__(self, books_path: str = 'Books.csv', ratings_path: str = 'Ratings.csv'):
        # Load books and ratings data
        self.books = pd.read_csv(books_path)
        self.ratings = pd.read_csv(ratings_path)
        # Initialize matrices and dataframes
        self.interaction_matrix = None
        self.popular_df = None
        self.similarity_scores_matrix = None

    # Method to train the recommendation system
    def train(self):
        # Merge ratings with book information
        ratings_with_name = self.ratings.merge(self.books, on='ISBN')
        # Compute number of ratings and average rating per book
        num_rating_df = ratings_with_name.groupby('Book-Title')['Book-Rating'].count().reset_index()
        num_rating_df.rename(columns={'Book-Rating': 'num_ratings'}, inplace=True)
        avg_rating_df = ratings_with_name.groupby('Book-Title')['Book-Rating'].mean().reset_index()
        avg_rating_df.rename(columns={'Book-Rating': 'avg_rating'}, inplace=True)
        # Merge rating statistics with the books dataset
        stats_df = num_rating_df.merge(avg_rating_df, on='Book-Title')
        self.books = self.books.merge(stats_df, on='Book-Title')
        self.books = self.books.drop_duplicates(subset=['Book-Title'])
        # Filter out less popular books (fewer than 250 ratings) and keep the top 50
        self.popular_df = (stats_df[stats_df['num_ratings'] >= 250]
                           .sort_values('avg_rating', ascending=False)
                           .head(50))
        self.popular_df = self.popular_df.merge(self.books, on='Book-Title').drop_duplicates('Book-Title')
        self.popular_df = self.popular_df[['Book-Title', 'Book-Author', 'num_ratings_x', 'avg_rating_x']]
        self.popular_df.rename(columns={'num_ratings_x': 'num_ratings',
                                        'avg_rating_x': 'avg_rating'}, inplace=True)
        # Identify active users (more than 200 ratings) and filter ratings
        x = ratings_with_name.groupby('User-ID')['Book-Rating'].count() > 200
        active_users = x[x].index
        filtered_rating = ratings_with_name[ratings_with_name['User-ID'].isin(active_users)]
        # Identify famous books (at least 50 ratings) and filter ratings
        y = filtered_rating.groupby('Book-Title')['Book-Rating'].count() >= 50
        famous_books = y[y].index
        final_ratings = filtered_rating[filtered_rating['Book-Title'].isin(famous_books)]
        # Create the interaction matrix and fill missing values with 0
        self.interaction_matrix = final_ratings.pivot_table(index='Book-Title',
                                                            columns='User-ID',
                                                            values='Book-Rating')
        self.interaction_matrix.fillna(0, inplace=True)
        # Compute the cosine similarity matrix
        self.similarity_scores_matrix = self.cosine_similarity()
        print("--> Training Complete <--")

    # Method to compute cosine similarity using numpy
    def cosine_similarity(self):
        matrix = np.array(self.interaction_matrix)
        dot_product = np.dot(matrix, matrix.T)
        norm = np.linalg.norm(matrix, axis=1)
        self.similarity_scores_matrix = dot_product / (norm[:, None] * norm)
        return self.similarity_scores_matrix

    # Alternative method to compute cosine similarity manually
    def cosine_similarity_manual(self):
        matrix = np.array(self.interaction_matrix)
        num_of_books = matrix.shape[0]
        num_of_users = matrix.shape[1]
        self.similarity_scores_matrix = np.zeros((num_of_books, num_of_books))
        for i in range(num_of_books):
            for j in range(num_of_books):
                dot_product = sum(matrix[i][k] * matrix[j][k] for k in range(num_of_users))
                norm_i = math.sqrt(sum(val ** 2 for val in matrix[i]))
                norm_j = math.sqrt(sum(val ** 2 for val in matrix[j]))
                self.similarity_scores_matrix[i][j] = dot_product / (norm_i * norm_j)
        return self.similarity_scores_matrix

    # Method to print popular recommendations
    def get_popular_recommendations(self):
        for _, row in self.popular_df.iterrows():
            for col, value in row.items():
                print(f"{col}: {value}")
            print()

    # Method to get recommendations for a given book
    def get_recommendations(self, book_name):
        index = np.where(self.interaction_matrix.index == book_name)[0][0]
        similar_items = sorted(enumerate(self.similarity_scores_matrix[index]),
                               key=lambda x: x[1], reverse=True)[1:6]
        recommendations = []
        print(" --> RECOMMENDATIONS <--\n")
        for i in similar_items:
            book_to_recommend = self.books[self.books['Book-Title'] == self.interaction_matrix.index[i[0]]]
            title = book_to_recommend['Book-Title'].values[0]
            print("Book Title:", title)
            author = book_to_recommend['Book-Author'].values[0]
            print("Author:", author)
            num_rating = book_to_recommend['num_ratings'].values[0]
            print("Number of ratings received:", num_rating)
            avg_rating = book_to_recommend['avg_rating'].values[0]
            print("Average rating:", avg_rating)
            print()
            recommendations.append((title, author))
        return recommendations
# In[ ]:
# Initialize an instance of RecommendationSystem
recommender = RecommendationSystem()
# In[ ]:
# Train the recommender
recommender.train()
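# Optional sanity check: the numpy-based cosine_similarity method should agree
# with scikit-learn's implementation (scikit-learn is not otherwise used in this
# notebook, so skip this cell if it is not installed):
# In[ ]:
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity
sk_scores = sk_cosine_similarity(recommender.interaction_matrix)
print(np.allclose(recommender.similarity_scores_matrix, sk_scores))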
# In[ ]:
# Get the top 5 recommendations for a book
recommended_books = recommender.get_recommendations('1984')
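# get_recommendations looks the title up in the interaction matrix, so a title
# outside the filtered set of "famous books" raises an IndexError. A minimal
# guarded lookup (the title below is only an illustrative example):
# In[ ]:
title = 'The Da Vinci Code'
if title in recommender.interaction_matrix.index:
    recommender.get_recommendations(title)
else:
    print(f"'{title}' is not in the interaction matrix")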
# In[ ]:
# Get the 50 most popular books in the dataset
recommender.get_popular_recommendations()