import os

import matplotlib.pyplot as plt
import numpy as np  # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
from matplotlib.colors import ListedColormap
from sklearn.svm import SVR
# Load the two CSV splits; both files are read from the current working directory.
test = pd.read_csv("Testing.csv")
train = pd.read_csv("Training.csv")

# NOTE: the bare expressions below are notebook-style. Their values are only
# displayed in an interactive session; run as a plain script they are no-ops.
train.sample(n=1)  # one randomly chosen training row

# Stack both splits into a single frame. ignore_index=True renumbers the rows
# 0..n-1; without it the row labels of the two files repeat, and the
# index-based line plots further down draw the test rows on top of the first
# training rows.
data = pd.concat([train, test], ignore_index=True)

data.sample(10)  # ten random rows
data.head(5)     # first five rows
data.tail(5)     # last five rows
data.columns     # all column names
data.shape       # (rows, columns); the original author recorded 4961 x 133
data.info()      # dtypes and non-null counts per column

# Box plot of skin_rash grouped by itching (median, quartiles, outliers).
# TODO: state what this plot is expected to show.
data.boxplot(column="skin_rash", by="itching")

# Line plot of three selected columns against the row index, first on shared
# axes, then with one subplot per column.
data1 = data.loc[:, ["itching", "skin_rash", "coma"]]
data1.plot()
data1.plot(subplots=True)

# Scatter plot of depression against muscle_pain.
data.plot(kind="scatter", x="depression", y="muscle_pain")

# Histogram of acidity, 50 bins over the fixed range 0-250.
# NOTE(review): confirm this range matches the values actually stored in the
# column; a much narrower value range would leave most bins empty.
data.plot(kind="hist", y="acidity", bins=50, range=(0, 250))

# Number of missing values per column.
data.isnull().sum()
# Model-building imports (kept next to their use, as in the original script).
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Fixed seed so the split, the fitted forest, the printed accuracy and the
# importance ranking are the same on every run.
RANDOM_STATE = 42

# Features are every column except the last; the last column is the target.
# NOTE(review): this assumes the label column is the final column of the
# CSVs - confirm against the files.
X, y = data.iloc[:, :-1], data.iloc[:, -1]

# Hold out 30% of the rows for evaluation, train on the remaining 70%.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=RANDOM_STATE
)

# Random forest with 100 trees.
clf = RandomForestClassifier(n_estimators=100, random_state=RANDOM_STATE)

# Fit on the training rows, then predict the held-out rows.
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Fraction of held-out rows whose predicted label matches the true label.
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

# The 50 most important features, highest importance first.
feature_imp = (
    pd.Series(clf.feature_importances_, index=list(data.columns[:-1]))
    .sort_values(ascending=False)
    .head(50)
)
feature_imp[::-1]  # same 50 entries, lowest importance first
# Feature importances of a second model (clf2), indexed by the columns of
# X_reduced and sorted highest first.
# NOTE(review): neither clf2 nor X_reduced is defined in the visible part of
# this script; both must exist before these lines run, otherwise this raises
# NameError. The statement was previously split mid-expression, which made
# the whole file fail to parse.
feature_imp2 = pd.Series(
    clf2.feature_importances_, index=list(X_reduced.columns)
).sort_values(ascending=False)
feature_imp2[::-1]  # same entries, lowest importance first
# Share of rows per prognosis value. normalize=True returns fractions of the
# total (0-1) rather than raw counts or percentages.
prognosis_column = data["prognosis"]
prognosis_column.value_counts(normalize=True)