DATA PREPROCESSING AND LOADING
1. LOGISTIC REGRESSION
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
# Generate some sample data
np.random.seed(0)
data = {
    'Exam1': np.random.rand(100) * 100,
    'Exam2': np.random.rand(100) * 100,
    'Admitted': np.random.randint(2, size=100)
}
df = pd.DataFrame(data)
print(df)
# Split the data into features (X) and target (y)
X = df[['Exam1', 'Exam2']]
y = df['Admitted']
print(X)
print(y)
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,
random_state=42)
# Create a logistic regression model
model = LogisticRegression()
# Fit the model to the training data
model.fit(X_train, y_train)
# Make predictions on the test data
y_pred = model.predict(X_test)
print("------------------")
print(y_pred)
# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')
# Display the classification report and confusion matrix
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))
# Plot the decision boundary
x_min, x_max = X['Exam1'].min() - 10, X['Exam1'].max() + 10
y_min, y_max = X['Exam2'].min() - 10, X['Exam2'].max() + 10
xx, yy = np.meshgrid(np.linspace(x_min, x_max, 100), np.linspace(y_min, y_max, 100))
Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(X['Exam1'], X['Exam2'], c=y, cmap=plt.cm.Paired)
plt.xlabel('Exam 1 Score')
plt.ylabel('Exam 2 Score')
plt.title('Logistic Regression Decision Boundary')
plt.show()
Output:
2. CONFUSION MATRIX
# scikit-learn
from sklearn.datasets import make_classification
value1, y = make_classification(
    n_features=6,
    n_classes=2,
    n_samples=800,
    n_informative=2,
    random_state=66,
    n_clusters_per_class=1,
)
##This code imports the make_classification function from the sklearn.datasets module.
##• The make_classification function generates a random dataset for classification tasks.
##• The function takes several arguments: n_features: the number of features (or independent variables) in the dataset. In this case, there are 6 features.
##• n_classes: the number of classes (or target variables) in the dataset. In this case, there are 2 classes.
##• n_samples: the number of samples (or observations) in the dataset. In this case, there are 800 samples.
##• n_informative: the number of informative features, i.e. the features that actually influence the target variable. In this case, there are 2 informative features.
##• random_state: a seed value for the random number generator. This ensures that the dataset is reproducible.
##• n_clusters_per_class: the number of clusters per class, which determines the degree of separation between the classes. In this case, there is only 1 cluster per class.
##• The function returns two arrays: value1, an array of shape (n_samples, n_features) containing the features of the dataset, and y, an array of shape (n_samples,) containing the target variable (see the quick shape check below).
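As a quick sanity check on those return values, printing the array shapes confirms the layout described above; a minimal sketch reusing the value1 and y variables:

print(value1.shape)  # (800, 6): n_samples rows, n_features columns
print(y.shape)       # (800,): one target value per sample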
import matplotlib.pyplot as plt
plt.scatter(value1[:, 0], value1[:, 1], c=y, marker="*")
plt.show()
##This code imports the matplotlib.pyplot module and creates a scatter plot using the scatter() function.
##• The value1 and y variables are the arrays generated by make_classification above.
##• The scatter() function takes three arguments here: value1[:, 0] and value1[:, 1] are the first and second columns of the value1 array, respectively, and c=y assigns a color to each point based on the corresponding value in the y array.
##• The marker argument specifies the shape of the marker used for each point, in this case an asterisk.
##• The resulting plot has the values in the first column of value1 on the x-axis, the values in the second column on the y-axis, and each point colored based on the corresponding value in y.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    value1, y, test_size=0.33, random_state=125
)
##This code imports the train_test_split function from the sklearn.model_selection module.
##• This function is used to split the dataset into training and testing sets.
##• The train_test_split function takes four arguments: X, y, test_size, and random_state.
##• X and y are the input features and target variable, respectively.
##• test_size is the proportion of the dataset that should be allocated to the testing set. In this case, it is set to 0.33, which means that 33% of the data will be used for testing.
##• random_state sets the seed for the random number generator, which ensures that the same random split is generated each time the code is run.
##• The function returns four variables: X_train, X_test, y_train, and y_test.
##• X_train and y_train are the training set, while X_test and y_test are the testing set.
##• These variables can be used to train and evaluate a machine learning model.
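When the classes are imbalanced, passing stratify=y keeps the class proportions similar in both splits; a minimal sketch reusing value1 and y from above (the _s suffixed names are illustrative, not part of the original exercise):

from sklearn.model_selection import train_test_split

# Stratified variant of the split above: stratify=y keeps the
# class proportions of y roughly equal in the train and test sets.
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(
    value1, y, test_size=0.33, random_state=125, stratify=y
)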
from sklearn.naive_bayes import GaussianNB
# Build a Gaussian Classifier
model = GaussianNB()
# Model training
model.fit(X_train, y_train)
# Predict Output
predicted = model.predict([X_test[6]])
print("Actual Value:", y_test[6])
print("Predicted Value:", predicted[0])
##This code uses the scikit-learn library to build a Gaussian Naive Bayes classifier.
##• First, the code imports the GaussianNB class from the sklearn.naive_bayes module.
##• Next, a new instance of the GaussianNB class is created and assigned to the variable 'model'.
##• The model is then trained using the fit() method, which takes in the training data X_train and the corresponding target values y_train.
##• After the model is trained, it is used to predict the output for a single test data point, the 7th element in the X_test array.
##• The predicted value is stored in the 'predicted' variable.
##• Finally, the actual value for the test data point is printed using y_test[6], and the predicted value is printed using predicted[0].
#---------------
from sklearn.metrics import (
    accuracy_score,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
)

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred, average="weighted")

print("Accuracy:", accuracy)
print("F1 Score:", f1)
##This code imports several functions from the sklearn.metrics module, including accuracy_score, confusion_matrix, ConfusionMatrixDisplay, and f1_score.
##• These functions are used to evaluate the performance of a machine learning model.
##• The code then uses the model.predict() method to generate predictions for the test data (X_test).
##• These predictions are compared to the actual labels (y_test) using the accuracy_score and f1_score functions.
##• The accuracy_score function calculates the accuracy of the model's predictions, while the f1_score function calculates the F1 score, the harmonic mean of precision and recall (here averaged across classes, weighted by class support).
##• Finally, the code prints out the accuracy and F1 score of the model's predictions.
#-----------------------------
#####Expected output
####
####Accuracy: 0.8484848484848485
####F1 Score: 0.8491119695890328
####
####This snippet is not code itself, but the output produced by the code above.
####• It shows the accuracy and F1 score of the trained model.
####• The accuracy is 0.8484848484848485, which means that the model correctly predicted the outcome of about 84.8% of the cases.
####• The F1 score is 0.8491119695890328, a measure of performance that takes into account both precision and recall.
####• A higher F1 score indicates better performance of the model.
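Because the F1 score is the harmonic mean of precision and recall, it can be reproduced by hand; a minimal sketch with hypothetical labels (these numbers are illustrative, not the output of the run above):

from sklearn.metrics import f1_score, precision_score, recall_score

# Hypothetical binary labels, only to illustrate the relationship
# F1 = 2 * (precision * recall) / (precision + recall).
y_true = [0, 1, 1, 0, 1, 1, 0, 1]
y_hat  = [0, 1, 0, 0, 1, 1, 1, 1]

p = precision_score(y_true, y_hat)
r = recall_score(y_true, y_hat)
print(p, r, 2 * p * r / (p + r))   # manual harmonic mean: 0.8
print(f1_score(y_true, y_hat))     # matches the manual value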
#------------------------------------------------
labels = [0, 1]
cm = confusion_matrix(y_test, y_pred, labels=labels)
print(cm)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
disp.plot()
plt.show()
########This code is using the scikit-learn library to create a confusion matrix and display it using ConfusionMatrixDisplay.
########• First, a list of labels is created with the values 0 and 1, matching the two classes in the dataset.
########• Then, the confusion_matrix function is called with the test labels (y_test) and predicted labels (y_pred) as inputs, along with the labels list.
########• This creates a confusion matrix with the specified labels.
########• Next, a ConfusionMatrixDisplay object is created with the confusion matrix as input, along with the labels list.
########• Finally, the plot method is called on the display object to show the confusion matrix graphically.
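When class counts differ, a row-normalized matrix can be easier to read; a minimal sketch reusing y_test and y_pred from above (normalize='true' scales each row to per-class rates):

from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
import matplotlib.pyplot as plt

# Each row sums to 1, so the diagonal shows per-class recall.
cm_norm = confusion_matrix(y_test, y_pred, labels=[0, 1], normalize='true')
disp = ConfusionMatrixDisplay(confusion_matrix=cm_norm, display_labels=[0, 1])
disp.plot()
plt.show()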
# Run this program on your local Python
# interpreter, provided you have installed
# the required libraries.

# Importing the required packages
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
# Function importing Dataset
def importdata():
    balance_data = pd.read_csv(
        'https://archive.ics.uci.edu/ml/machine-learning-' +
        'databases/balance-scale/balance-scale.data',
        sep=',', header=None)

    # Printing the dataset shape
    print("Dataset Length: ", len(balance_data))
    print("Dataset Shape: ", balance_data.shape)

    # Printing the dataset observations
    print("Dataset: ", balance_data.head())
    return balance_data
# Function to split the dataset
def splitdataset(balance_data):
    # Separating the target variable
    X = balance_data.values[:, 1:5]
    Y = balance_data.values[:, 0]

    # Splitting the dataset into train and test
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=0.3, random_state=100)

    return X, Y, X_train, X_test, y_train, y_test
# Function to perform training with gini index.
def train_using_gini(X_train, X_test, y_train):
    # Creating the classifier object
    clf_gini = DecisionTreeClassifier(criterion="gini",
        random_state=100, max_depth=3, min_samples_leaf=5)

    # Performing training
    clf_gini.fit(X_train, y_train)
    return clf_gini
# Function to perform training with entropy.
def train_using_entropy(X_train, X_test, y_train):
    # Decision tree with entropy
    clf_entropy = DecisionTreeClassifier(
        criterion="entropy", random_state=100,
        max_depth=3, min_samples_leaf=5)

    # Performing training
    clf_entropy.fit(X_train, y_train)
    return clf_entropy
# Function to make predictions
def prediction(X_test, clf_object):
    # Prediction on the test set
    y_pred = clf_object.predict(X_test)
    print("Predicted values:")
    print(y_pred)
    return y_pred
# Function to calculate accuracy
def cal_accuracy(y_test, y_pred):
    print("Confusion Matrix: ",
          confusion_matrix(y_test, y_pred))
    print("Accuracy : ",
          accuracy_score(y_test, y_pred) * 100)
    print("Report : ",
          classification_report(y_test, y_pred))
# Driver code
def main():
    # Building Phase
    data = importdata()
    X, Y, X_train, X_test, y_train, y_test = splitdataset(data)
    clf_gini = train_using_gini(X_train, X_test, y_train)
    clf_entropy = train_using_entropy(X_train, X_test, y_train)

    # Operational Phase
    print("Results Using Gini Index:")
    # Prediction using gini
    y_pred_gini = prediction(X_test, clf_gini)
    cal_accuracy(y_test, y_pred_gini)

    print("Results Using Entropy:")
    # Prediction using entropy
    y_pred_entropy = prediction(X_test, clf_entropy)
    cal_accuracy(y_test, y_pred_entropy)

# Calling main function
if __name__ == "__main__":
    main()
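To inspect what the fitted trees actually learned, scikit-learn's plot_tree can draw them directly; a minimal sketch, assuming a fitted tree such as the clf_gini returned by train_using_gini (the feature and class labels here are illustrative names for the balance-scale columns):

import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

# Visualize the fitted gini tree (max_depth=3 keeps the plot readable).
# Feature/class names are illustrative labels, not taken from the dataset file.
plt.figure(figsize=(12, 6))
plot_tree(clf_gini,
          feature_names=['LW', 'LD', 'RW', 'RD'],
          class_names=['B', 'L', 'R'],
          filled=True)
plt.show()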
3. CREDIT CARD FRAUD CSV IMPORT
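A minimal sketch of the import and preprocessing step, assuming a local file named creditcard.csv with a 'Class' label column (both names follow the widely used Kaggle credit card fraud dataset and are assumptions here):

import pandas as pd

# Load the dataset; 'creditcard.csv' is an assumed local file name.
df = pd.read_csv('creditcard.csv')

# Basic inspection before any modelling
print(df.shape)
print(df.head())
print(df['Class'].value_counts())   # fraud datasets are typically highly imbalanced

# Drop rows with missing values and split features from the target
df = df.dropna()
X = df.drop(columns=['Class'])
y = df['Class']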
Conclusion
In conclusion, preprocessing data before applying it to a machine learning algorithm is a crucial step in the ML workflow. It helps to improve accuracy, reduce the time and resources required to train the model, prevent overfitting, and improve the interpretability of the model.
The code examples above demonstrate how to load and prepare data using the popular Python library Pandas, but there are many other libraries available for preprocessing data, such as NumPy and Scikit-learn, that can be used depending on the specific needs of your project.