Decision Tree and Python Coding
Due today at 10:30 AM
Instructions
Read the article here and get the code running in Jupyter. Review how to write methods in Python.
Include your name in the screenshots of the code.
# Boolean mask that is True wherever y holds a value greater than zero.
y_not_zero = y.gt(0)
# Collapse every flagged entry to the single label 1.
y.loc[y_not_zero] = 1
# Sanity check: list the distinct labels that remain (expected to be 0 and 1).
y.unique()
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
# NOTE(review): y_int is not defined in the visible part of this script --
# confirm it is the binarized target.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y_int,
    test_size=0.30,
    random_state=42,
)
# Build a decision tree (seeded for reproducibility) and train it on the
# training portion only.
clf_df = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)
## plot the fitted tree
plt.figure(figsize=(15, 7.5))
# The keyword is `class_names` (plural); `class_name` is not a plot_tree
# parameter and makes the call fail with a TypeError.
# feature_names is passed as a plain list because some scikit-learn releases
# reject a pandas Index for this argument.
# NOTE(review): both class labels are empty strings, so the nodes carry no
# class text -- fill in real labels if they are wanted.
plot_tree(
    clf_df,
    filled=True,
    rounded=True,
    class_names=["", ""],
    feature_names=list(X_encoded.columns),
);
# Compute the cost-complexity pruning path of the fitted tree on the training data.
path = clf_df.cost_complexity_pruning_path(X_train, y_train)  # determine values for alpha
ccp_alphas = path.ccp_alphas  # extract the different values for alpha
ccp_alphas = ccp_alphas[:-1]  # exclude the maximum value for alpha
clf_dt = []  # list that collects one fitted decision tree per alpha
## now create one decision tree per value for alpha and store it in the list
for ccp_alpha in ccp_alphas:
    # Use a separate name so the tree already fitted in clf_df is not overwritten;
    # the seed matches the one used for clf_df.
    pruned_tree = DecisionTreeClassifier(random_state=42, ccp_alpha=ccp_alpha)
    pruned_tree.fit(X_train, y_train)
    clf_dt.append(pruned_tree)
## now graph the accuracy of the trees using the Training Dataset and the
## Testing Dataset as a function of alpha.
# Score every tree in clf_dt on both splits. The lists use the plural names
# that the plot calls below read; the singular names raised a NameError there.
train_scores = [tree.score(X_train, y_train) for tree in clf_dt]
test_scores = [tree.score(X_test, y_test) for tree in clf_dt]

# Plot accuracy against alpha for both splits on one set of axes.
fig, ax = plt.subplots()
ax.set_xlabel("alphas")
ax.set_ylabel("accuracy")
ax.set_title("Accuracy vs alpha for training and testing sets")
ax.plot(ccp_alphas, train_scores, marker='o', label="train", drawstyle="steps-post")
ax.plot(ccp_alphas, test_scores, marker='o', label="test", drawstyle="steps-post")
ax.legend()
plt.show()