# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# load the dataset
df = pd.read_excel('/content/Skilled nursing facilities dataset.xlsx', skiprows=1)
# Display the first few rows of the dataframe to understand its structure, and summarize
# the dataset's data types and missing values
df.info()   # info() prints its summary and returns None, so call it separately from head()
df.head()
# 1. Descriptive Statistics of the data
descriptive_statistics = df.describe(include='all')  # 'datetime_is_numeric=True' was removed in pandas 2.0; datetime columns are summarized numerically by default
# 2. Finding out missing values if any exist
missing_values = df.isnull().sum()
descriptive_statistics,missing_values
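# Supplementary sketch (not in the original analysis): list only the columns that contain
# missing values, with their share of the records, to motivate the imputation choices below.
missing_share = (df.isnull().mean() * 100).round(2)
missing_share[missing_share > 0].sort_values(ascending=False)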
# Dropping redundant facility-level columns that duplicate the SNF-specific fields retained later
duplicate_columns = ['Total Days Title XVIII', 'Total Days Title XIX', 'Total Days Other',
                     'Total Days Total', 'Number of Beds', 'Total Bed Days Available',
                     'Total Discharges Title XVIII', 'Total Discharges Title XIX',
                     'Total Discharges Title Other', 'Total Discharges Total']
data_cleaned = df.drop(columns=duplicate_columns)
data_cleaned
# Handling Missing Values
# Given the varied nature of missing data across columns, we'll proceed with a simple mean
# imputation for the numerical columns and exclude the categorical ones from this step.
numerical_data = data_cleaned.select_dtypes(include=[np.number])
imputer = SimpleImputer(strategy='mean')
numerical_data_imputed = pd.DataFrame(imputer.fit_transform(numerical_data),
                                      columns=numerical_data.columns,
                                      index=numerical_data.index)  # keep the original index so the later index-based merge stays aligned
# Handling missing values in the categorical columns: they account for only about 1.5% of
# the records, so dropping those rows loses little information.
data_cleaned = data_cleaned.dropna(subset=['Street Address', 'Fiscal Year Begin Date',
                                           'Fiscal Year End Date', 'Rural versus Urban'])
data_cleaned.info()
# Defining categorical data
categorical_data = data_cleaned.select_dtypes(include=['object', 'category'])
categorical_data.info()
# Merging the categorical data with the imputed numerical data on the index; the result is data_combined.
data_combined = pd.merge(categorical_data, numerical_data_imputed,
                         left_index=True, right_index=True, how='inner')
data_combined.info()
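# Quick sanity check (illustrative; assumes the imputed frame kept the original index):
# the inner merge on the index should retain exactly the rows that survived the dropna above.
assert len(data_combined) == len(categorical_data)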
# The dataset contains identifier columns; keep only the columns relevant to the analysis
# and store the result as snf_data.
snf_data = data_combined[['Facility Name', 'Street Address', 'City', 'State Code', 'Zip Code',
                          'County', 'Rural versus Urban',
                          'SNF Average Length of Stay Title XVIII',
                          'SNF Average Length of Stay Title XIX',
                          'SNF Average Length of Stay Total',
                          'SNF Admissions Title XVIII', 'SNF Admissions Title XIX',
                          'SNF Admissions Other', 'SNF Admissions Total',
                          'SNF Days Title XVIII', 'SNF Days Title XIX',
                          'SNF Days Other', 'SNF Days Total',
                          'SNF Number of Beds', 'SNF Bed Days Available',
                          'SNF Discharges Title XVIII', 'SNF Discharges Title XIX',
                          'SNF Discharges Title Other', 'SNF Discharges Total']]
snf_data.info()
snf_data.describe()
snf_data.hist(figsize=(20, 15), bins=20)
plt.tight_layout() # Adjusts subplots to fit into the figure area.
plt.show()
# Define categories and values to create the pie chart
categories = ['Rural', 'Urban']
values = [10952, 3882]  # hard-coded counts of rural and urban SNFs
# Create pie chart to show the percentage of SNFs in rural versus urban areas
explode = (0, 0.1)
colors = ['limegreen', 'lightblue']
plt.figure(figsize=(8, 8))
plt.pie(values, labels=categories, autopct='%1.1f%%', startangle=90,
        explode=explode, colors=colors)
plt.title('SNFs in Rural versus Urban Areas')
plt.show()
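# Alternative sketch: derive the same counts directly from the cleaned data instead of
# hard-coding them, so the chart stays in sync with snf_data; the label order simply follows
# whatever values appear in the 'Rural versus Urban' column.
rural_urban_counts = snf_data['Rural versus Urban'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(rural_urban_counts.values, labels=rural_urban_counts.index,
        autopct='%1.1f%%', startangle=90, colors=colors)
plt.title('SNFs in Rural versus Urban Areas (data-driven counts)')
plt.show()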
# Categorical Data Exploration: Focusing on 'State Code' and 'Rural versus Urban' variables
state_code_distribution = snf_data['State Code'].value_counts()
rural_urban_distribution = snf_data['Rural versus Urban'].value_counts()
state_code_distribution, rural_urban_distribution
import plotly.express as px
snf_counts_by_state = data_combined['State Code'].value_counts().reset_index()
snf_counts_by_state.columns = ['State Code', 'Count']
# 'snf_counts_by_state' has 'State Code' and 'Count'
fig = px.bar(snf_counts_by_state, x='State Code', y='Count', text='Count')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()
# Correlation Analysis of the numerical data
correlations = snf_data.corr(numeric_only=True)
# Plotting the correlation matrix
plt.figure(figsize=(12, 12))
sns.heatmap(correlations, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
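# Supplementary sketch: rank the most strongly correlated variable pairs from the matrix
# above, which is easier to scan than the full heatmap (the cutoff of 10 pairs is arbitrary).
upper_triangle = correlations.where(np.triu(np.ones(correlations.shape, dtype=bool), k=1))
top_pairs = upper_triangle.stack().sort_values(key=lambda s: s.abs(), ascending=False)
top_pairs.head(10)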
# Exploring distributions of selected key variables using box plots
columns_to_plot = ['SNF Average Length of Stay Total', 'SNF Admissions Total',
                   'SNF Discharges Total', 'SNF Number of Beds', 'SNF Days Total']
for column in columns_to_plot:
    plt.figure()  # Creates a new figure for each plot
    sns.boxplot(y=snf_data[column])
    plt.title(f'Box plot of {column}')
    plt.ylabel('Value')
    plt.show()
# Removing outliers using the interquartile range (IQR) rule
def remove_outliers(snf_data, column):
    """Keep only rows whose value in `column` lies within 1.5 * IQR of the quartiles."""
    Q1 = snf_data[column].quantile(0.25)
    Q3 = snf_data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return snf_data[(snf_data[column] >= lower_bound) & (snf_data[column] <= upper_bound)]
# Removing outliers from the selected columns and re-plotting the box plots
columns_to_plot = ['SNF Average Length of Stay Total', 'SNF Admissions Total',
                   'SNF Discharges Total', 'SNF Number of Beds', 'SNF Days Total']
for column in columns_to_plot:
    snf_data = remove_outliers(snf_data, column)
    plt.figure()
    sns.boxplot(y=snf_data[column])
    plt.title(f'Box plot of {column} (outliers removed)')
    plt.show()
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
# Identifying numerical columns (excluding identifiers like rpt_rec_num and Provider CCN)
numerical_cols = numerical_data_imputed.select_dtypes(include=['number']).columns.drop(
    ['rpt_rec_num', 'Provider CCN'])
numerical_data = numerical_data_imputed[numerical_cols]
# Standardizing the numerical data
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numerical_data)
# Performing PCA
pca = PCA()
pca.fit(standardized_data)
# Getting the variance ratios of the principal components
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio
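# Scree-plot sketch (the 90% line is an assumed threshold, not part of the original analysis):
# plot the cumulative explained variance to judge how many components are worth keeping.
cumulative_variance = np.cumsum(explained_variance_ratio)
plt.figure(figsize=(10, 5))
plt.plot(range(1, len(cumulative_variance) + 1), cumulative_variance, marker='o')
plt.axhline(y=0.9, color='red', linestyle='--', label='90% of variance (assumed cutoff)')
plt.xlabel('Number of principal components')
plt.ylabel('Cumulative explained variance')
plt.title('Scree Plot of Cumulative Explained Variance')
plt.legend()
plt.show()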
# Extracting the component loadings (i.e., the correlations between the original variables
# and the principal components)
loadings = pd.DataFrame(pca.components_.T,
                        columns=[f'PC{i}' for i in range(1, pca.components_.shape[0] + 1)],
                        index=numerical_cols)
# Displaying the component loadings
loadings
# Visualizing the loadings of the first two principal components
plt.figure(figsize=(10, 6))
sns.heatmap(loadings[['PC1', 'PC2']])
plt.title('Loadings of PC1 and PC2')
plt.xlabel('Principal Components')
plt.ylabel('Original Variables')
plt.show()
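# Optional follow-up sketch (purely illustrative, not part of the original write-up): project
# the standardized records onto the first two components to see how facilities spread along
# PC1 and PC2.
scores = pca.transform(standardized_data)
plt.figure(figsize=(8, 6))
plt.scatter(scores[:, 0], scores[:, 1], s=5, alpha=0.3)
plt.xlabel('PC1')
plt.ylabel('PC2')
plt.title('Facilities projected onto the first two principal components')
plt.show()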