Beginning model for training a spell recognizer
@@ -197,4 +197,7 @@ fabric.properties
 # Android studio 3.1+ serialized cache file
 .idea/caches/build_file_checksums.ser
 
 data
+train
+test
+val
@@ -30,7 +30,7 @@ class DataCollectWand(Wand):
         if pressed:
             self.data = []
             print("Button pressed")
-            await self.vibrate(PATTERN.BURST)
+            await self.vibrate(PATTERN.SHORT)
             await self.reset_position()
             await self.subscribe_position()
         else:
@@ -3,6 +3,7 @@
 "lumos"
 "nox"
 "accio"
+"stupify"
 "expelliarmus"
 "engorgio"
 "reducio"
@@ -0,0 +1,96 @@
import os
from random import random
from shutil import copyfile, rmtree
import multiprocessing

train_dir = "./train/"
test_dir = "./test/"
val_dir = "./val/"
# Split fractions; train + test + val should sum to 1.0.
train = .80
test = .15
val = .05


def add_train_data(file, filename, label):
    dest = train_dir + label + "/" + filename
    if not os.path.exists(os.path.dirname(dest)):
        try:
            os.makedirs(os.path.dirname(dest))
        except Exception as e:
            print(e)
    try:
        copyfile(file, dest)
    except Exception as e:
        # An unreadable recording is useless; report it and drop the source file.
        print(e)
        print("INVALID FILE")
        os.remove(file)


def add_val_data(file, filename, label):
    dest = val_dir + label + "/" + filename
    if not os.path.exists(os.path.dirname(dest)):
        try:
            os.makedirs(os.path.dirname(dest))
        except Exception as e:
            print(e)
    copyfile(file, dest)


def add_test_data(file, filename, label):
    dest = test_dir + label + "/" + filename
    if not os.path.exists(os.path.dirname(dest)):
        try:
            os.makedirs(os.path.dirname(dest))
        except Exception as e:
            print(e)
    copyfile(file, dest)


def remove_previous():
    # Clear out the results of any earlier split.
    if os.path.exists(os.path.dirname(test_dir)):
        rmtree(test_dir)
    if os.path.exists(os.path.dirname(train_dir)):
        rmtree(train_dir)
    if os.path.exists(os.path.dirname(val_dir)):
        rmtree(val_dir)


# Note: each pool worker gets its own copy of this counter, so the
# progress prints below report per-worker counts, not a global total.
files_processed = 0

def test_split_file(file_root):
    global files_processed
    root = file_root[0]
    file = file_root[1]

    if file == ".DS_Store":
        return
    c = random()

    # Route the file to train/val/test according to the split fractions;
    # the label is the file's parent directory name.
    if c < train:
        add_train_data(os.path.join(root, file), file, root.split("/")[-1])
    elif c < (train + val):
        add_val_data(os.path.join(root, file), file, root.split("/")[-1])
    else:
        add_test_data(os.path.join(root, file), file, root.split("/")[-1])
    files_processed += 1

    if files_processed % 1000 == 0:
        print(root.split("/")[-1])
        print(files_processed)
        print(file)


if __name__ == '__main__':
    remove_previous()

    file_root_list = []

    for root, dirs, files in os.walk("data/"):
        for file in files:
            file_root_list.append((root, file))

    pool = multiprocessing.Pool(multiprocessing.cpu_count() * 2)

    pool.map(test_split_file, file_root_list)
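One caveat on the splitter above: each file draws a fresh random(), so re-running the script produces a different partition, and the realized fractions only approximate 80/15/5. If a stable split were wanted, the draw could be replaced by a hash of the file path; a minimal sketch (split_bucket is a hypothetical helper, not part of this commit):

    import hashlib

    def split_bucket(path, train=0.80, val=0.05):
        # Map the path to a deterministic pseudo-random value in [0, 1).
        h = int(hashlib.md5(path.encode()).hexdigest(), 16) / 16**32
        if h < train:
            return "train"
        elif h < train + val:
            return "val"
        return "test"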
@@ -0,0 +1,231 @@
import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn import svm
from pprint import pprint
import os
import seaborn as sn


def normalized(a, axis=-1, order=2):
    # L1-normalize; the axis and order arguments are currently unused.
    return a / sum(a)


def get_activities(dir):
    # Walk data\<spell>\<recording> and split each spell's files
    # 80/20 into in-memory train and test sets.
    activity_data_train = {}
    activity_data_test = {}
    i = 0
    for root, dirs, files in os.walk(dir):
        # The first walk entry is the top-level directory itself; its name
        # splits to "" and is deleted later in quantize_data().
        activity_name = root.split('\\')[1]
        activity_data_train[activity_name] = []
        activity_data_test[activity_name] = []
        for i in range(int(.8 * len(files))):
            # Note: read_csv treats the first row of each recording as a
            # header; pass header=None if the files have no header row.
            data = pd.read_csv(os.path.join(root, files[i]), delimiter=' ').values.flatten()
            activity_data_train[activity_name].append(data)
        i += 1
        while i < len(files):
            data = pd.read_csv(os.path.join(root, files[i]), delimiter=' ').values.flatten()
            activity_data_test[activity_name].append(data)
            i += 1

    return activity_data_train, activity_data_test
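A note on that loading step: because pd.read_csv consumes the first line as a header, headerless recordings silently lose their first row of samples. An equivalent loader with no header assumption is a one-liner (load_recording is my sketch, not in the commit):

    import numpy as np

    def load_recording(path):
        # Whitespace-delimited file -> flat 1-D vector; no header row assumed.
        return np.loadtxt(path).flatten()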
def quantize_data(data, vector_len):
    # Drop the bogus "" key created by the top-level os.walk entry.
    del data[""]
    print("quantizing", data)
    print(vector_len)
    quantized_data = {}
    total = 0
    total2 = 0
    for key, value in data.items():

        print(key, len(value))
        quantized_data[key] = {}
        print(vector_len)
        for i in range(len(value)):
            total2 += len(value[i]) / vector_len
            # Trim trailing samples until the recording divides evenly
            # into windows of length vector_len.
            while len(value[i]) % vector_len != 0:
                value[i] = value[i][:-1]
            print(len(value[i]), vector_len)
            new_data = np.split(value[i], len(value[i]) // vector_len)
            quantized_data[key][i] = new_data
            total += len(new_data)

    print("Total is:", total, total2)

    return quantized_data
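To make the quantize step concrete: a 10-sample recording with vector_len = 4 is trimmed to 8 samples and split into two windows:

    >>> import numpy as np
    >>> x = np.arange(10)[:8]          # trimmed so 4 divides the length
    >>> np.split(x, len(x) // 4)
    [array([0, 1, 2, 3]), array([4, 5, 6, 7])]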
lengths = [4]
# for i in range(3, 4):
#     lengths.append((i + 1) * 21)
data_train, data_test = get_activities('data\\')
print("----------------")

X = np.array([[1, 2], [1, 4], [1, 0], [4, 2], [4, 4], [4, 0]])  # scratch data, unused below

clusters = [1000]  # [600, 600, 600, 1000, 1000, 1000, 1400, 1400, 1400]
q_count = 0
n_count = 0
accuracies = []
for quantize_length in lengths:
    accuracies.append([])
    for num_clusters in clusters:

        qdata_test = quantize_data(data_test, quantize_length)
        print("----------------")
        qdata_train = quantize_data(data_train, quantize_length)
        print("----------------")
        # Fill k_means_data with the windows of every spell, regardless of label.
        k_means_data = None

        for exercise_name, exercise_data in qdata_train.items():
            for example_number, example_data in exercise_data.items():
                if k_means_data is None:
                    k_means_data = np.array(example_data)
                else:
                    k_means_data = np.concatenate((k_means_data, np.array(example_data)), axis=0)

        print(len(k_means_data[0]))
        print(len(k_means_data[:, 0]))

        print("training the model")
        kmeans = KMeans(n_clusters=num_clusters, n_init=15, max_iter=500).fit(k_means_data)

        print("Done training model")
        centers = kmeans.cluster_centers_
        # Run the training data through again and build a histogram of
        # cluster assignments for each training example.
        classifier_training_X = []
        classifier_training_Y = []

        for exercise_name, exercise_data in qdata_train.items():
            for example_number, example_data in exercise_data.items():
                center_histo = np.zeros((num_clusters,))
                center_indices = kmeans.predict(example_data)
                for i in center_indices:
                    center_histo[i] += 1
                classifier_training_X.append(normalized(center_histo))
                classifier_training_Y.append(exercise_name)

        # Same histogram construction for the test examples.
        classifier_test_X = []
        classifier_test_Y = []
        for exercise_name, exercise_data in qdata_test.items():
            for example_number, example_data in exercise_data.items():
                center_histo = np.zeros((num_clusters,))
                center_indices = kmeans.predict(example_data)
                for i in center_indices:
                    center_histo[i] += 1
                classifier_test_X.append(normalized(center_histo))
                classifier_test_Y.append(exercise_name)

        print("done")
        forest_average = []
        for g in range(1):
            clf = RandomForestClassifier(n_estimators=100)
            clf.fit(classifier_training_X, classifier_training_Y)

            results = clf.predict(classifier_test_X)
            print(results)
            con_mat = confusion_matrix(classifier_test_Y, results)
            print(con_mat)

            np.savetxt("con_mat.csv", con_mat, '%5.0f', delimiter=",\t")

            # Accuracy from the confusion matrix: diagonal entries are
            # correct predictions, everything off-diagonal is wrong.
            wrong = 0
            right = 0
            for i in range(len(con_mat)):
                for j in range(len(con_mat[0])):
                    if i != j:
                        wrong += con_mat[i][j]
                    else:
                        right += con_mat[i][j]

            forest_average.append(right / len(results))
        print("Random Forest Average Accuracy:")
        print(sum(forest_average) / len(forest_average))

        print("SVM Average Accuracy:")
        clf = svm.LinearSVC()
        clf.fit(classifier_training_X, classifier_training_Y)

        results = clf.predict(classifier_test_X)
        con_mat = confusion_matrix(classifier_test_Y, results)

        wrong = 0
        right = 0
        for i in range(len(con_mat)):
            for j in range(len(con_mat[0])):
                if i != j:
                    wrong += con_mat[i][j]
                else:
                    right += con_mat[i][j]

        print(right / len(results))

        print("Gaussian Naive Bayes Average Accuracy:")
        clf = GaussianNB()
        clf.fit(classifier_training_X, classifier_training_Y)

        results = clf.predict(classifier_test_X)
        con_mat = confusion_matrix(classifier_test_Y, results)

        wrong = 0
        right = 0
        for i in range(len(con_mat)):
            for j in range(len(con_mat[0])):
                if i != j:
                    wrong += con_mat[i][j]
                else:
                    right += con_mat[i][j]

        print(right / len(results))

        accuracies[q_count].append(sum(forest_average) / len(forest_average))
        print(num_clusters)
        print(quantize_length)

        n_count += 1
    n_count = 0
    q_count += 1

# The heatmap code below was adapted from Stack Overflow.
# The index/columns string must have one letter per class in con_mat.
df_cm = pd.DataFrame(con_mat, index=[i for i in "ABCDEFGHIJKLMN"],
                     columns=[i for i in "ABCDEFGHIJKLMN"])
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True)
pprint(accuracies)
plt.show()
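The three right/wrong tallies above all reduce to the trace of the confusion matrix, which sklearn can also compute directly; a minimal equivalent:

    import numpy as np
    from sklearn.metrics import accuracy_score, confusion_matrix

    def accuracy_from(y_true, y_pred):
        con_mat = confusion_matrix(y_true, y_pred)
        # Diagonal = correct predictions; same value as accuracy_score(y_true, y_pred).
        return np.trace(con_mat) / con_mat.sum()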