-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathperceptronAlgo.py
More file actions
208 lines (181 loc) · 9.09 KB
/
perceptronAlgo.py
File metadata and controls
208 lines (181 loc) · 9.09 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
import random
import time
import copy
import numpy
class Perceptron:
    """Multi-class perceptron for images stored as fixed-height ASCII art.

    The first entry of ``filenames`` is the training set; every later entry
    is an evaluation set.  ``training()`` loads the training data and then
    runs ``fractional_training()``, which trains on increasing fractions of
    the data and prints accuracy statistics for each evaluation set.
    """

    def __init__(self, img_size_y, filenames, enumerate_features, *,
                 sample_sizes=None, max_iter=3, trials=5, threshold=-1):
        """Store the configuration and create empty training state.

        Args:
            img_size_y: Number of text lines that make up one image.
            filenames: ``[[image_path, label_path], ...]``; the first pair is
                used for training, the remaining pairs for evaluation.
            enumerate_features: Mapping from an image character to its
                numeric feature value (indexed with ``[]``).
            sample_sizes: Percentages of the training set to train on.
                Defaults to 10, 20, ..., 100.
            max_iter: Maximum number of passes over the sampled training
                data per trial.
            trials: Number of independent trials per sample size.
            threshold: An index is updated only when the absolute gap
                between the predicted and true label weights is at least
                this value; the default of -1 means every index is updated.
        """
        # User-defined configuration.
        self.img_size_y = img_size_y
        self.filenames = filenames
        self.enumerate_features = enumerate_features

        # Internal state.
        # Number of images in the full training set.
        self.num_images_full = 0
        # Number of images in the current fractional training set.
        self.num_images = 0
        # label -> list of encoded images (full training set).
        self.label_images_full = dict()
        # label -> list of encoded images (current fractional set).
        self.label_images = dict()
        # Every character seen in the training images.
        self.valid_features = set()
        # Percentages of the training set to sample.
        if sample_sizes is None:
            sample_sizes = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
        self.sample_sizes = sample_sizes
        # Maximum number of training passes per trial.
        self.max_iter = max_iter
        # label -> weight vector (list with one entry per feature).
        self.weights = dict()
        # Number of trials for each percentage of samples.
        self.trials = trials
        # Threshold for changing weights on a misclassification.
        self.threshold = threshold

    def print_data(self, acc, t_time, percent_data_used):
        """Print the training time and per-file accuracy statistics.

        Args:
            acc: Dict mapping an evaluation file name to a list of
                accuracies (one per trial).
            t_time: Training time(s); a scalar or a sequence, summed with
                ``numpy.sum`` for display.
            percent_data_used: Percentage of the training file sampled.
        """
        print("Sampling " + str(percent_data_used) + "% of Training File: "
              + "(Training Time = " + str(numpy.sum(t_time)) + ")")
        for filename in acc:
            print("\tAccuracy for predicting file " + filename + ": ")
            print("\t\tMean: " + str(numpy.mean(acc[filename])))
            print("\t\tStd Dev: " + str(numpy.std(acc[filename])))
        print()

    def fractional_training(self):
        """Train and evaluate on each configured fraction of the data.

        For every percentage in ``self.sample_sizes`` this runs
        ``self.trials`` independent trials (fresh sample, fresh weights),
        then prints the summed training time and the accuracy statistics.
        """
        for perc in self.sample_sizes:
            acc = dict()
            # One entry per trial so the report covers every trial, not
            # just the last one.
            train_times = []
            for _ in range(self.trials):
                self._sample_training_subset(perc)
                self.weights = dict()

                # Only weight initialisation and updates are timed.
                start_time = time.perf_counter()
                self.init_weights()
                self.update_weights()
                train_times.append(time.perf_counter() - start_time)

                for filename, accuracy in self.make_predictions().items():
                    acc.setdefault(filename, []).append(accuracy)
            self.print_data(acc, train_times, perc)

    def _sample_training_subset(self, perc):
        """Fill ``self.label_images`` with a random ``perc``% of the full set."""
        # Integer arithmetic: n * (perc / 100) can land just below the exact
        # value (e.g. 90 * 0.7 == 62.99999999999999) and truncate wrongly.
        self.num_images = int(self.num_images_full * perc // 100)

        pool = [(label, img)
                for label, images in self.label_images_full.items()
                for img in images]
        keep = min(max(self.num_images, 0), len(pool))

        # Every label keeps its key (possibly with no images) so weights
        # and predictions still cover all known labels.
        self.label_images = {label: [] for label in self.label_images_full}
        # Sampling uniformly over images keeps the class proportions of the
        # full set.  Images are shared by reference, which is safe because
        # training only reads them.
        for label, img in random.sample(pool, keep):
            self.label_images[label].append(img)

    def make_predictions(self):
        """Classify every evaluation file with the current weights.

        Returns:
            Dict mapping each evaluation image file path to the fraction of
            its images that were labelled correctly (0.0 for an empty file).
        """
        acc = dict()
        for file_pair in self.filenames[1:]:
            total_tries = 0
            correct = 0
            for label, rows in self._read_labeled_images(file_pair[0],
                                                         file_pair[1]):
                total_tries += 1
                if self._predict(self._encode_image(rows)) == label:
                    correct += 1
            acc[file_pair[0]] = (float(correct) / total_tries
                                 if total_tries else 0.0)
        return acc

    def calc_score(self, label, img):
        """Return the dot product of ``img`` with the weights of ``label``."""
        return sum(pixel * weight
                   for pixel, weight in zip(img, self.weights[label]))

    def update_weights(self):
        """Run up to ``self.max_iter`` perceptron passes over the sample.

        Each pass visits the sampled images in a fresh random order (a fixed
        label-by-label order biases the weights).  On a misclassification
        the image is added to the true label's weights and subtracted from
        the predicted label's weights.  Training stops early after a pass
        with no misclassification.
        """
        for _ in range(self.max_iter):
            # Reset per pass; otherwise one early update would disable the
            # early stop for all later passes.
            did_update = False

            examples = [(label, img)
                        for label, images in self.label_images.items()
                        for img in images]
            random.shuffle(examples)

            for true_label, img in examples:
                prediction = self._predict(img)
                if prediction == true_label:
                    continue
                did_update = True
                true_weights = self.weights[true_label]
                pred_weights = self.weights[prediction]
                for ind, value in enumerate(img):
                    gap = abs(pred_weights[ind] - true_weights[ind])
                    if gap >= self.threshold:
                        true_weights[ind] += value
                        pred_weights[ind] -= value

            if not did_update:
                break

    def training(self):
        """Load the training files, then run ``fractional_training()``.

        Populates ``label_images_full``, ``num_images_full`` and
        ``valid_features`` from the first entry of ``self.filenames``.
        """
        image_path = self.filenames[0][0]
        label_path = self.filenames[0][1]
        for label, rows in self._read_labeled_images(image_path, label_path):
            for row in rows:
                self.valid_features.update(row)
            self.label_images_full.setdefault(label, []).append(
                self._encode_image(rows))
            self.num_images_full += 1
        self.fractional_training()

    def init_weights(self):
        """Give every label in ``self.label_images`` an all-zero weight vector.

        The vector length is the feature count of the first available image;
        when no label has any image the vectors are empty.
        """
        feature_count = 0
        for images in self.label_images.values():
            if images:
                feature_count = len(images[0])
                break
        for label in self.label_images:
            self.weights[label] = [0] * feature_count

    def _predict(self, img):
        """Return the label with the highest score (first label wins ties)."""
        scores = {label: self.calc_score(label, img)
                  for label in self.label_images}
        return max(scores, key=scores.get)

    def _encode_image(self, rows):
        """Flatten image text rows into a list of numeric feature values."""
        encode = self.enumerate_features
        return [encode[char] for row in rows for char in row]

    def _read_labeled_images(self, image_path, label_path):
        """Yield ``(label, rows)`` for each label line and its image rows.

        Raises:
            ValueError: If the image file ends before ``self.img_size_y``
                rows were read for a label.
        """
        with open(image_path) as image_reader, \
                open(label_path) as label_reader:
            for label_line in label_reader:
                # Strip only the newline: slicing off the last character
                # would truncate a final label that lacks one.
                label = label_line.rstrip('\n')
                rows = []
                for _ in range(self.img_size_y):
                    row = image_reader.readline()
                    if not row:
                        raise ValueError(
                            "image file " + str(image_path)
                            + " ended before the image for label "
                            + repr(label) + " was complete")
                    rows.append(row.strip('\n'))
                yield label, rows
# Numeric value assigned to each character that can appear in an image.
enumeration_func = {' ': 0, '+': 1, '#': 2}

# Digit data set: every image is 28 text lines tall.
digit_predictor = Perceptron(
    img_size_y=28,
    filenames=[
        ['digitdata/trainingimages', 'digitdata/traininglabels'],  # training files
        ['digitdata/validationimages', 'digitdata/validationlabels'],  # validation files
        ['digitdata/testimages', 'digitdata/testlabels'],  # test files
    ],
    enumerate_features=enumeration_func,
)
digit_predictor.training()

# Face data set: every image is 70 text lines tall.
face_predictor = Perceptron(
    img_size_y=70,
    filenames=[
        ['facedata/facedatatrain', 'facedata/facedatatrainlabels'],  # training files
        ['facedata/facedatavalidation', 'facedata/facedatavalidationlabels'],  # validation files
        ['facedata/facedatatest', 'facedata/facedatatestlabels'],  # test files
    ],
    enumerate_features=enumeration_func,
)
face_predictor.training()