-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmpp.py
More file actions
146 lines (125 loc) · 4.85 KB
/
mpp.py
File metadata and controls
146 lines (125 loc) · 4.85 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
#
# Maximum Posterior Probability (MPP)
#
# Supervised parametric learning assuming Gaussian pdf
# with 3 cases of discriminant functions
#
# Sample code for the Machine Learning class at UTK
#
# Hairong Qi, hqi@utk.edu
#
import numpy as np
import sys
import time
import util
import load_data
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn.metrics import accuracy_score as acs
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix
import pandas as pd
def mpp(Tr, yTr, Te, cases, P):
    """Classify test samples by Maximum Posterior Probability (Gaussian pdf).

    A Gaussian model (mean and covariance) is estimated for every class
    found in ``yTr``; each row of ``Te`` is then assigned to the class whose
    discriminant function is largest.

    Args:
        Tr: 2-D array of training samples, one sample per row.
        yTr: 1-D array of training labels, aligned with the rows of ``Tr``.
        Te: 2-D array of test samples, one sample per row.
        cases: Which discriminant function to use:
            1 - single shared variance (minimum Euclidean distance),
            2 - single shared covariance matrix (minimum Mahalanobis distance),
            3 - one covariance matrix per class (quadratic classifier).
        P: Prior probabilities, indexed in the order of ``np.unique(yTr)``.

    Returns:
        1-D float array with one entry per row of ``Te``.  Each entry is the
        *index* of the winning class in ``np.unique(yTr)`` (not the label
        value itself).

    Raises:
        SystemExit: with code 1 if ``cases`` is not 1, 2 or 3.  This is
            checked before training so a bad argument fails fast.
    """
    if cases not in (1, 2, 3):
        print("Can only handle case numbers 1, 2, 3.")
        sys.exit(1)

    # Small offset that keeps log() finite when a prior is exactly zero.
    eps = 0.000001

    # training process - derive the model
    covs, means = {}, {}                # per-class statistics, keyed by class index
    classes = np.unique(yTr)            # sorted unique labels
    print(f"classes = {classes}")
    classn = len(classes)               # number of classes
    print(f"classn = {classn}")

    covsum = None
    for c in range(classn):
        # filter out samples for the c^th class
        arr = Tr[yTr == classes[c]].astype(float)
        print(f"arr.shape = {arr.shape}")

        # np.cov yields a 0-d array for single-feature data; force a matrix
        # so the diagonal / determinant code below works for any feature count.
        covs[c] = np.atleast_2d(np.cov(np.transpose(arr)))
        means[c] = np.mean(arr, axis=0)     # mean along the columns

        # Accumulate into a *new* array.  An in-place "+=" on an alias of
        # covs[0] would silently turn covs[0] into the running sum and
        # corrupt the per-class covariance used by case 3.
        covsum = covs[c].copy() if covsum is None else covsum + covs[c]

    # used by case 2: covariance matrix averaged over the classes
    covavg = covsum / classn
    # used by case 1: variance averaged over the features (trace / #features)
    varavg = np.sum(np.diagonal(covavg)) / covavg.shape[0]

    # Terms that do not depend on the test sample are computed once.
    logprior = [np.log(P[c] + eps) for c in range(classn)]
    if cases == 3:
        logdet = [np.log(np.linalg.det(covs[c])) for c in range(classn)]

    # testing process - apply the learned model on test set
    disc = np.zeros(classn)
    nr, _ = Te.shape
    y = np.zeros(nr)                    # class index assigned to each test sample
    for i in range(nr):
        for c in range(classn):
            # NOTE(review): util.euc2 / util.mah2 are presumably the squared
            # Euclidean / Mahalanobis distances - confirm against util.py.
            if cases == 1:
                edist2 = util.euc2(means[c], Te[i])
                disc[c] = -edist2 / (2 * varavg) + logprior[c]
            elif cases == 2:
                mdist2 = util.mah2(means[c], Te[i], covavg)
                disc[c] = -mdist2 / 2 + logprior[c]
            else:
                mdist2 = util.mah2(means[c], Te[i], covs[c])
                disc[c] = -mdist2 / 2 - logdet[c] / 2 + logprior[c]
        y[i] = disc.argmax()
    return y
def main():
    """Run the MPP classifier interactively and report its performance.

    Reads ``train.csv``, ``test.csv`` and ``submission.csv`` through the
    ``load_data`` module, asks the user for the discriminant case and the
    prior probabilities, classifies the test set with ``mpp`` and then
    prints precision / recall / F1 / accuracy, shows a confusion-matrix
    heat map and prints the elapsed classification time.

    Raises:
        SystemExit: with code 1 if the entered priors do not sum to 1.
        ValueError: if the case number or a prior cannot be parsed as a number.
    """
    # load data
    training_data = load_data.read_data("train.csv")
    testing_data = load_data.read_data("test.csv")
    testing_labels = load_data.read_data("submission.csv")
    X_train, X_test = load_data.vectorize_data(training_data, testing_data)
    X_train = X_train.toarray()
    X_test = X_test.toarray()
    # the label is taken from the last column of each table
    Y_train = np.array(training_data)[:, -1]
    Y_test = np.array(testing_labels)[:, -1]

    # the training and testing datasets should have the same dimension
    _, nftrain = X_train.shape
    _, nftest = X_test.shape
    assert nftrain == nftest

    # ask the user to input which discriminant function to use
    prompt = '''
Type of discriminant functions supported assuming Gaussian pdf:
1 - minimum Euclidean distance classifier
2 - minimum Mahalanobis distance classifier
3 - quadratic classifier
'''
    print(prompt)
    case_str = input('Please input 1, 2, or 3: ')
    cases = int(case_str)

    # ask the user to input prior probability that needs to sum to 1
    prop_str = input("Please input prior probabilities in float numbers, separated by space, and they must add to 1: \n")
    P = np.array([float(token) for token in prop_str.split()], dtype=float)
    # Compare with a tolerance: decimal fractions such as 0.1 + 0.2 + 0.7 do
    # not add up to exactly 1.0 in binary floating point.
    if not np.isclose(P.sum(), 1.0, rtol=0.0, atol=1e-9):
        print("Prior probabilities do not add up to 1. Please check!")
        sys.exit(1)

    # derive the decision rule from the training set and apply on the test set
    t0 = time.time()           # start time
    Y_pred = mpp(X_train, Y_train, X_test, cases, P)
    t1 = time.time()           # ending time
    print(Y_pred)
    # mpp returns float class indices; convert them to strings ("0", "1", ...)
    # before they are compared with Y_test below.
    Y_pred = Y_pred.astype("int")
    Y_pred = Y_pred.astype("str")

    # calculate accuracy
    precision, recall, fscore, _ = score(Y_test, Y_pred, pos_label='1', average='binary')
    print('Precision: {} / Recall: {} / F1-Score: {} / Accuracy: {}'.format(
        round(precision, 3), round(recall, 3), round(fscore, 3), round(acs(Y_test, Y_pred), 3)))

    # confusion matrix rendered as an annotated heat map
    cm = confusion_matrix(Y_test, Y_pred)
    class_label = ["0", "1"]
    df_cm = pd.DataFrame(cm, index=class_label, columns=class_label)
    sns.heatmap(df_cm, annot=True, fmt='d')
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted Label")
    plt.ylabel("True Label")
    plt.show()

    print(f'The learning process takes {t1 - t0} seconds.')
# Run the interactive classifier only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()