A.I Code

The document outlines a data analysis process for a dataset related to skilled nursing facilities, including data cleaning, handling missing values, and exploratory data analysis. Key steps include dropping duplicate columns, imputing missing numerical data, and visualizing distributions and correlations using various plots. The analysis also involves standardizing numerical data and performing Principal Component Analysis (PCA) to understand the variance in the dataset.

# Import necessary libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
df = pd.read_excel('/content/Skilled nursing facilities dataset.xlsx', skiprows=1)

# Display the first few rows of the dataframe to understand its structure and provide a
# summary of the dataset, including data types and missing values
df.info(), df.head()

# 1. Descriptive statistics of the data
# (the datetime_is_numeric argument was removed in pandas 2.0, where datetime
# columns are treated as numeric by default, so it is omitted here)
descriptive_statistics = df.describe(include='all')

# 2. Finding out missing values, if any exist
missing_values = df.isnull().sum()
descriptive_statistics, missing_values
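
# Supplementary sketch (not in the original): express missingness as a share of
# rows rather than raw counts; this puts the numbers above in context and backs
# the "about 1.5%" figure cited later when rows with missing categorical values
# are dropped.
missing_pct = df.isnull().mean().sort_values(ascending=False) * 100
print(missing_pct[missing_pct > 0].round(2))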
# Dropping duplicate columns from the dataset
duplicate_columns = ['Total Days Title XVIII', 'Total Days Title XIX', 'Total Days Other',
                     'Total Days Total', 'Number of Beds', 'Total Bed Days Available',
                     'Total Discharges Title XVIII', 'Total Discharges Title XIX',
                     'Total Discharges Title Other', 'Total Discharges Total']  # Replace with actual column names
data_cleaned = df.drop(columns=duplicate_columns)
data_cleaned

# Handling missing values
# Given the varied nature of missing data across columns, we'll proceed with a simple
# mean imputation for numerical columns and exclude categorical ones from this step.
numerical_data = data_cleaned.select_dtypes(include=[np.number])
imputer = SimpleImputer(strategy='mean')
# Preserving the original index so the imputed frame stays aligned for the
# index-based merge below (a plain DataFrame() call would reset it to 0..n-1).
numerical_data_imputed = pd.DataFrame(imputer.fit_transform(numerical_data),
                                      columns=numerical_data.columns,
                                      index=numerical_data.index)

# Handling missing values in the categorical columns: they account for only about
# 1.5% of the records, so little information is lost by dropping those rows.
data_cleaned = data_cleaned.dropna(subset=['Street Address', 'Fiscal Year Begin Date',
                                           'Fiscal Year End Date', 'Rural versus Urban'])
data_cleaned.info()
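
# Supplementary sanity check (not in the original): confirm the mean imputation
# left no numeric gaps, and note how many rows survived the categorical dropna.
print('Remaining numeric NaNs:', numerical_data_imputed.isnull().sum().sum())
print('Rows after dropping categorical nulls:', len(data_cleaned))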

# Defining categorical data
categorical_data = data_cleaned.select_dtypes(include=['object', 'category'])
categorical_data.info()

# Merging the categorical data with the imputed numerical data as data_combined
data_combined = pd.merge(categorical_data, numerical_data_imputed, left_index=True,
                         right_index=True, how='inner')
data_combined.info()

# There are some identifiers in the data; let's ignore those columns and keep only
# the columns relevant for the analysis, defining this subset as snf_data.
snf_data = data_combined[['Facility Name', 'Street Address', 'City', 'State Code',
                          'Zip Code', 'County', 'Rural versus Urban',
                          'SNF Average Length of Stay Title XVIII',
                          'SNF Average Length of Stay Title XIX',
                          'SNF Average Length of Stay Total',
                          'SNF Admissions Title XVIII', 'SNF Admissions Title XIX',
                          'SNF Admissions Other', 'SNF Admissions Total',
                          'SNF Days Title XVIII', 'SNF Days Title XIX',
                          'SNF Days Other', 'SNF Days Total',
                          'SNF Number of Beds', 'SNF Bed Days Available',
                          'SNF Discharges Title XVIII', 'SNF Discharges Title XIX',
                          'SNF Discharges Title Other', 'SNF Discharges Total']]
snf_data.info()
snf_data.describe()

snf_data.hist(figsize=(20, 15), bins=20)
plt.tight_layout()  # Adjusts subplots to fit into the figure area.
plt.show()

# Define categories and values to create a pie chart
categories = ['Rural', 'Urban']
values = [10952, 3882]

# Create a pie chart showing the share of SNFs in rural versus urban areas
explode = (0, 0.1)
colors = ['limegreen', 'lightblue']
plt.figure(figsize=(8, 8))
plt.pie(values, labels=categories, autopct='%1.1f%%', startangle=90, explode=explode,
        colors=colors)
plt.title('SNFs in Rural versus Urban Areas')
plt.show()
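
# Alternative sketch (not in the original): derive the pie-chart inputs from the
# data instead of hard-coding the counts; this assumes the 'Rural versus Urban'
# column holds exactly the labels 'Rural' and 'Urban'.
counts = snf_data['Rural versus Urban'].value_counts()
plt.figure(figsize=(8, 8))
plt.pie(counts.values, labels=counts.index, autopct='%1.1f%%', startangle=90)
plt.title('SNFs in Rural versus Urban Areas (derived from the data)')
plt.show()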

# Categorical data exploration: focusing on the 'State Code' and 'Rural versus Urban' variables
state_code_distribution = snf_data['State Code'].value_counts()
rural_urban_distribution = snf_data['Rural versus Urban'].value_counts()
state_code_distribution, rural_urban_distribution

import plotly.express as px

snf_counts_by_state = data_combined['State Code'].value_counts().reset_index()
snf_counts_by_state.columns = ['State Code', 'Count']

# 'snf_counts_by_state' has 'State Code' and 'Count'
fig = px.bar(snf_counts_by_state, x='State Code', y='Count', text='Count')
fig.update_traces(texttemplate='%{text}', textposition='outside')
fig.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig.show()

# Correlation analysis of the numerical data
correlations = snf_data.corr(numeric_only=True)

# Plotting the correlation matrix
plt.figure(figsize=(12, 12))
sns.heatmap(correlations, annot=True, fmt=".2f")
plt.title('Correlation Matrix')
plt.show()
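
# Supplementary sketch (not in the original): heatmaps are hard to scan when many
# variables are nearly collinear, so list the strongest pairwise correlations
# directly, using the correlations frame computed above.
pairs = correlations.where(np.triu(np.ones(correlations.shape, dtype=bool), k=1))
top_pairs = pairs.stack().sort_values(key=abs, ascending=False)
print(top_pairs.head(10))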

# Exploring distributions of selected key variables using box plots
columns_to_plot = ['SNF Average Length of Stay Total', 'SNF Admissions Total',
                   'SNF Discharges Total', 'SNF Number of Beds', 'SNF Days Total']
for column in columns_to_plot:
    plt.figure()  # Creates a new figure for each plot
    sns.boxplot(y=snf_data[column])
    plt.title(f'Box plot of {column}')
    plt.ylabel('Value')
    plt.show()

# Removing outliers using the 1.5 * IQR rule
def remove_outliers(snf_data, column):
    Q1 = snf_data[column].quantile(0.25)
    Q3 = snf_data[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    return snf_data[(snf_data[column] >= lower_bound) & (snf_data[column] <= upper_bound)]

# Removing outliers from each key column and re-plotting the box plots
columns_to_plot = ['SNF Average Length of Stay Total', 'SNF Admissions Total',
                   'SNF Discharges Total', 'SNF Number of Beds', 'SNF Days Total']
for column in columns_to_plot:
    snf_data = remove_outliers(snf_data, column)
    plt.figure()
    sns.boxplot(y=snf_data[column])
    plt.title(f'Box plot of {column} (outliers removed)')
    plt.show()
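
# Supplementary check (not in the original): report how many facilities remain
# after the IQR filtering above, since each pass through the loop can drop rows.
print(f'Rows remaining after outlier removal: {len(snf_data)}')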

# StandardScaler and PCA were already imported at the top of the script
# Identifying numerical columns (excluding identifiers like rpt_rec_num and Provider CCN)
numerical_cols = numerical_data_imputed.select_dtypes(include=['number']).columns.drop(
    ['rpt_rec_num', 'Provider CCN'])
numerical_data = numerical_data_imputed[numerical_cols]

# Standardizing the numerical data
scaler = StandardScaler()
standardized_data = scaler.fit_transform(numerical_data)

# Performing PCA
pca = PCA()
pca.fit(standardized_data)

# Getting the variance ratios of the principal components
explained_variance_ratio = pca.explained_variance_ratio_
explained_variance_ratio
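
# Supplementary sketch (not in the original): plot the cumulative explained
# variance from the fitted pca object so the number of components to retain
# (e.g., enough to capture 90% of the variance) can be chosen visually.
cumulative = np.cumsum(pca.explained_variance_ratio_)
plt.figure(figsize=(8, 5))
plt.plot(range(1, len(cumulative) + 1), cumulative, marker='o')
plt.axhline(0.9, linestyle='--', color='gray')  # reference line at 90%
plt.xlabel('Number of components')
plt.ylabel('Cumulative explained variance')
plt.title('Cumulative Explained Variance by Principal Component')
plt.show()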

# Extracting the component loadings (i.e., the correlation between the original
# variables and the components). pca.components_ has shape
# (n_components, n_features), so the number of PCs is components_.shape[0].
loadings = pd.DataFrame(pca.components_.T,
                        columns=[f'PC{i}' for i in range(1, pca.components_.shape[0] + 1)],
                        index=numerical_cols)

# Displaying the loadings for the first few principal components
loadings

# Visualizing the loadings of the first two principal components
plt.figure(figsize=(10, 6))
sns.heatmap(loadings[['PC1', 'PC2']])
plt.title('Loadings of PC1 and PC2')
plt.xlabel('Principal Components')
plt.ylabel('Original Variables')
plt.show()
