import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
#Import the Datasist Library
import datasist as ds
# Load the training and test sets from CSV files in the working directory.
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ('training.csv', 'test.csv')
)
Get a quick summary of the data set using the describe function in the structdata module.
# Summarise the training frame with datasist's structdata.describe helper.
# The call's return value is not captured; only whatever it prints/displays
# is used here.
ds.structdata.describe(train_data)
Remove features that contain only one unique value, as these features are redundant.
# Drop redundant features (columns that hold a single unique value).
# drop_redundant returns the pruned frame instead of modifying its argument,
# so the result has to be assigned back; without the assignment the columns
# are never actually removed from train_data / test_data.
# NOTE(review): the two frames are pruned independently and could end up with
# different column sets - confirm that is acceptable downstream.
train_data = ds.feature_engineering.drop_redundant(data=train_data)
test_data = ds.feature_engineering.drop_redundant(data=test_data)
Check for missing values in the dataset with the display function.
EXPLORATION OF CATEGORICAL FEATURES
# Ask datasist's structdata.get_cat_feats for the categorical features of the
# training set and keep the result in cat_feats.
cat_feats = ds.structdata.get_cat_feats(train_data)
# Bare expression: echoes the value when run as a notebook/REPL cell and has
# no effect when the file is executed as a plain script.
cat_feats
# Report unique-value counts for the training set; the result is not stored.
ds.structdata.get_unique_counts(train_data)
From the unique-counts output, we notice that TransactionId and BatchId contain too many classes, so we can drop them.
# Remove the high-cardinality identifier columns from both data sets,
# mutating each frame in place.
for frame in (train_data, test_data):
    frame.drop(columns=['TransactionId', 'BatchId'], inplace=True)
VISUALIZATION FOR CATEGORICAL FEATURES
# Draw count plots for the training set with datasist's
# visualizations.countplot (presumably one plot per categorical feature -
# confirm against the datasist documentation).
ds.visualizations.countplot(train_data)