import pandas as pd
import numpy as np
import math
from sklearn.utils import shuffle
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
# Path template shared by every CSV this script reads or writes: "{}" is
# replaced by a dataset number on input and by an output name on output.
# NOTE(review): the literal contains no path separators -- backslashes may
# have been lost at some point; confirm the intended location.
filePath = "E:assignmentdata{}.csv"

# Gradient-descent settings used by logreg().
lr = 0.0005        # step size applied to each gradient update
iterations = 1000  # number of gradient updates per training run

# Number of train/test rounds logreg() runs for each dataset.
folds = 5

# Result rows accumulated across logreg() calls (one row appended per call).
coeff = []
def main():
    """Run the information-gain pass, fit every dataset, and save the results.

    Calls ``infogainexercise()`` first, then ``logreg()`` for dataset numbers
    1 through 56.  The rows collected in the module-level ``coeff`` list are
    printed and written to ``filePath.format("coeff")`` as a CSV without index
    or header.
    """
    infogainexercise()

    # Each logreg() call appends one row to the module-level ``coeff`` list.
    for dataset_no in range(1, 57):
        logreg(dataset_no)

    print(coeff)
    print("length of final ", len(coeff))

    results = pd.DataFrame(coeff)
    print(results)
    results.to_csv(filePath.format("coeff"), header=None, index=False)
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` element-wise.

    Args:
        z: A scalar or array-like of real values.

    Returns:
        Values in ``[0, 1]`` with the same shape as ``z``.

    The result is computed as ``exp(-log(1 + exp(-z)))`` via ``np.logaddexp``
    so that very negative inputs do not overflow ``exp`` (the direct formula
    emits a RuntimeWarning there); the value is mathematically identical.
    """
    return np.exp(-np.logaddexp(0, np.negative(z)))
def loss_fn(h, y, eps=1e-15):
    """Return the mean binary cross-entropy between probabilities and labels.

    Args:
        h: Predicted probabilities (array-like supporting NumPy arithmetic).
        y: True labels, 0 or 1, with the same length as ``h``.
        eps: Probabilities are clipped to ``[eps, 1 - eps]`` before taking
            logarithms.

    Returns:
        The mean of ``-y*log(h) - (1-y)*log(1-h)``.

    Without clipping, a saturated probability (exactly 0 or 1) produces
    ``log(0) = -inf`` and the mean becomes ``inf`` or ``nan``.
    """
    h = np.clip(h, eps, 1 - eps)
    return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
def predict(X, theta, threshold=0.5):
    """Return hard 0/1 class predictions for the rows of ``X``.

    Args:
        X: Design matrix; ``np.dot(X, theta)`` must be defined.
        theta: Coefficient vector.
        threshold: A row is labelled 1 when its sigmoid probability is
            greater than or equal to this value.

    Returns:
        A list of Python ints (0 or 1), one per row of ``X``.
    """
    prob = sigmoid(np.dot(X, theta))
    # Vectorised bool -> int conversion; tolist() yields plain Python ints.
    return np.asarray(prob >= threshold).astype(int).tolist()
def _features_and_labels(frame):
    """Split a dataset frame into a design matrix (with intercept) and 0/1 labels."""
    # Columns 0-19 are the features; a trailing column of ones is the intercept.
    # Building a fresh array avoids assigning into a slice of ``frame``.
    X = np.column_stack(
        [frame.iloc[:, 0:20].to_numpy(dtype=float), np.ones(len(frame))]
    )
    # Column 20 is the raw target; strictly positive values become class 1.
    y = frame[20].gt(0).astype(int).to_numpy()
    return X, y


def logreg(k):
    """Fit logistic regression on dataset ``k`` and record the best round.

    The CSV at ``filePath.format(k)`` is read without a header.  For each of
    ``folds`` rounds the rows are reshuffled, the first ``len(df) // folds``
    rows are held out for testing, and a model is trained on the rest with
    ``iterations`` steps of batch gradient descent at learning rate ``lr``.
    The round with the highest test F1 score is selected (highest test
    accuracy when every F1 is 0) and one row is appended to the module-level
    ``coeff`` list: the 21 coefficients in reversed order (intercept first,
    then feature 19 down to feature 0), followed by that round's F1 score and
    accuracy.

    Args:
        k: Value substituted into ``filePath`` to locate the dataset.

    Returns:
        The module-level ``coeff`` list, after appending this dataset's row.
    """
    df = pd.read_csv(filePath.format(k), header=None)
    sample = math.floor(len(df) / folds)
    weights = []
    loss = []
    accuracy = []
    fscore = []
    print("processing file {}".format(k), ">>>>>>>>>>>>>")

    # NOTE(review): every round reshuffles independently, so the held-out sets
    # can overlap; this is repeated random hold-out rather than disjoint
    # k-fold cross-validation.  Confirm this is intended.
    for _ in range(folds):
        shuffledata = shuffle(df)
        test_X, test_Y = _features_and_labels(shuffledata[:sample])
        train_X, train_Y = _features_and_labels(shuffledata[sample:])

        # Batch gradient descent on the mean cross-entropy loss.
        theta = np.zeros(train_X.shape[1])
        for _step in range(iterations):
            h = sigmoid(np.dot(train_X, theta))
            gradient = np.dot(train_X.T, h - train_Y) / train_Y.size
            theta -= lr * gradient

        # Training loss of the fitted model for this round.
        loss.append(loss_fn(sigmoid(np.dot(train_X, theta)), train_Y))
        # Stored reversed so the intercept comes first in the output row.
        weights.append(list(reversed(theta)))

        # Evaluate on the held-out rows.
        y_hat = np.asarray(predict(test_X, theta))
        accuracy.append((y_hat == test_Y).mean())
        fscore.append(f1_score(test_Y, y_hat))

    print("weights", weights)
    print("loss", loss)
    print("accuracy", accuracy)
    print("f1 score", fscore)

    # Pick the best round by F1; if every F1 is 0, fall back to accuracy.
    ranking = fscore if max(fscore) != 0 else accuracy
    maxpos = ranking.index(max(ranking))
    print("max value::", maxpos, fscore[maxpos], accuracy[maxpos])

    # Build a new list so the per-round weights are not mutated.
    x1 = weights[maxpos] + [fscore[maxpos], accuracy[maxpos]]
    coeff.append(x1)
    return coeff
def infogainexercise():
    """Compute the information gain of every feature in every dataset.

    For dataset numbers 1 through 56 the CSV at ``filePath.format(i)`` is read
    without a header.  Each feature column (labels 0 through 19) is binarised
    to 1 where the value is strictly greater than the column mean, the label
    (column 20) is binarised to 1 where it is strictly positive, and the pair
    is scored with ``infogain_fn``.  The result -- one row per dataset, one
    column per feature -- is printed and written to
    ``filePath.format("infogain")`` as a CSV without index or header.
    """
    infogainfinal = []
    for i in range(1, 57):
        print("file processed {}".format(i))
        df = pd.read_csv(filePath.format(i), header=None)
        labels = df[20].gt(0).astype(int)

        infogain = []
        for col in df.loc[:, :19].columns:
            # Binarise without writing a scratch column back into a slice of df.
            binarised = df[col].gt(df[col].mean()).astype(int)
            df1 = pd.DataFrame({"x": binarised, "y": labels})
            infogain.append(infogain_fn(df1))
        infogainfinal.append(infogain)

    dfout = pd.DataFrame(infogainfinal)
    print(dfout)
    dfout.to_csv(filePath.format("infogain"), index=False, header=None)
def _entropy_bits(counts, total):
    """Return ``-sum(p * log2(p))`` over ``p = count / total`` for non-zero counts."""
    if total == 0:
        return 0.0
    # Zero counts are skipped: they contribute nothing and log2(0) is undefined.
    return float(-sum((c / total) * math.log2(c / total) for c in counts if c))


def infogain_fn(df):
    """Return the information gain of binary feature ``x`` about binary label ``y``.

    Args:
        df: DataFrame with columns ``"x"`` and ``"y"`` holding 0/1 values.

    Returns:
        Entropy of ``y`` minus the size-weighted entropy of ``y`` within the
        ``x == 0`` and ``x == 1`` groups, in bits.  An empty frame yields 0.0.
    """
    tot = len(df)
    if tot == 0:
        return 0.0

    x = df["x"].to_numpy()
    y = df["y"].to_numpy()

    # Joint counts c<x><y>.
    c00 = int(((x == 0) & (y == 0)).sum())
    c01 = int(((x == 0) & (y == 1)).sum())
    c10 = int(((x == 1) & (y == 0)).sum())
    c11 = int(((x == 1) & (y == 1)).sum())
    t0 = c00 + c01
    t1 = c10 + c11

    # Entropy of the children, weighted by the share of rows in each group.
    entchild = (t0 / tot) * _entropy_bits((c00, c01), t0) + (
        t1 / tot
    ) * _entropy_bits((c10, c11), t1)

    # Entropy of the parent (label distribution over all rows).
    pc0 = int((y == 0).sum())
    pc1 = int((y == 1).sum())
    eparent = _entropy_bits((pc0, pc1), tot)

    return eparent - entchild
# Run the pipeline only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()