From fb70a81e87db0293997a4da6b82b5375106a4ba8 Mon Sep 17 00:00:00 2001
From: Lucas Oskorep
Date: Sat, 14 Mar 2020 17:52:56 -0500
Subject: [PATCH] Add an initial model for training a spell recognizer

---
 .gitignore            |   5 +-
 collect_spell_data.py |   2 +-
 spells.csv            |   1 +
 train-test-split.py   |  96 ++++++++++++++++++
 train_model.py        | 231 ++++++++++++++++++++++++++++++++++++++++++
 5 files changed, 333 insertions(+), 2 deletions(-)
 create mode 100644 train-test-split.py
 create mode 100644 train_model.py

diff --git a/.gitignore b/.gitignore
index 3d6f92d..c5c3717 100644
--- a/.gitignore
+++ b/.gitignore
@@ -197,4 +197,7 @@ fabric.properties
 
 # Android studio 3.1+ serialized cache file
 .idea/caches/build_file_checksums.ser
-data
\ No newline at end of file
+data
+train
+test
+val

diff --git a/collect_spell_data.py b/collect_spell_data.py
index c3e8bad..4e1e3c2 100644
--- a/collect_spell_data.py
+++ b/collect_spell_data.py
@@ -30,7 +30,7 @@ class DataCollectWand(Wand):
         if pressed:
             self.data = []
             print("Button pressed")
-            await self.vibrate(PATTERN.BURST)
+            await self.vibrate(PATTERN.SHORT)
             await self.reset_position()
             await self.subscribe_position()
         else:

diff --git a/spells.csv b/spells.csv
index 5082737..af15536 100644
--- a/spells.csv
+++ b/spells.csv
@@ -3,6 +3,7 @@
 "lumos"
 "nox"
 "accio"
+"stupefy"
 "expelliarmus"
 "engorgio"
 "reducio"

diff --git a/train-test-split.py b/train-test-split.py
new file mode 100644
index 0000000..f0c96e0
--- /dev/null
+++ b/train-test-split.py
@@ -0,0 +1,96 @@
+import os
+from random import random
+from shutil import copyfile, rmtree
+import multiprocessing
+
+train_dir = "./train/"
+test_dir = "./test/"
+val_dir = "./val/"
+train = .80
+test = .15
+val = .05
+
+
+def add_train_data(file, filename, label):
+    dest = train_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    try:
+        copyfile(file, dest)
+    except Exception as e:
+        # An unreadable sample is useless for training, so drop the source too.
+        print(e)
+        print("INVALID FILE")
+        os.remove(file)
+
+
+def add_val_data(file, filename, label):
+    dest = val_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    copyfile(file, dest)
+
+
+def add_test_data(file, filename, label):
+    dest = test_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    copyfile(file, dest)
+
+
+def remove_previous():
+    # Start from a clean slate so re-runs do not mix old and new splits.
+    if os.path.exists(os.path.dirname(test_dir)):
+        rmtree(test_dir)
+    if os.path.exists(os.path.dirname(train_dir)):
+        rmtree(train_dir)
+    if os.path.exists(os.path.dirname(val_dir)):
+        rmtree(val_dir)
+
+
+files_processed = 0
+def test_split_file(file_root):
+    # NOTE: under multiprocessing.Pool this counter is per-worker, so the
+    # progress prints undercount; see the note after this file.
+    global files_processed
+    root, file = file_root
+
+    if file == ".DS_Store":
+        return
+    c = random()
+
+    # Route each file to train/val/test with probability .80/.05/.15.
+    if c < train:
+        add_train_data(os.path.join(root, file), file, root.split("/")[-1])
+    elif c < (train + val):
+        add_val_data(os.path.join(root, file), file, root.split("/")[-1])
+    else:
+        add_test_data(os.path.join(root, file), file, root.split("/")[-1])
+    files_processed += 1
+
+    if files_processed % 1000 == 0:
+        print(root.split("/")[-1])
+        print(files_processed)
+        print(file)
+
+
+if __name__ == '__main__':
+    remove_previous()
+
+    file_root_list = []
+
+    for root, dirs, files in os.walk("data/"):
+        for file in files:
+            file_root_list.append((root, file))
+
+    # Copying is I/O-bound, so oversubscribing the CPU count is deliberate.
+    pool = multiprocessing.Pool(multiprocessing.cpu_count() * 2)
+
+    pool.map(test_split_file, file_root_list)
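Note on train-test-split.py: because the work is farmed out through multiprocessing.Pool, each worker process holds its own copy of the module-level files_processed counter, so the progress prints only reflect one worker's share of the files. A minimal sketch of a process-safe counter, assuming the same map-over-files structure (the names init_worker and count_one are illustrative, not part of the patch):

import multiprocessing

def init_worker(shared_counter):
    # Runs once in each worker; exposes the shared counter as a global.
    global counter
    counter = shared_counter

def count_one(item):
    # Increment under the value's lock so workers do not race.
    with counter.get_lock():
        counter.value += 1
        if counter.value % 1000 == 0:
            print(counter.value, "files processed")

if __name__ == "__main__":
    shared = multiprocessing.Value("i", 0)  # one int shared by all workers
    with multiprocessing.Pool(initializer=init_worker, initargs=(shared,)) as pool:
        pool.map(count_one, range(5000))
    print("total:", shared.value)

The per-file random draw also means the realized split only approximates the 80/15/5 targets; over thousands of samples the deviation is small.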

diff --git a/train_model.py b/train_model.py
new file mode 100644
index 0000000..051feba
--- /dev/null
+++ b/train_model.py
@@ -0,0 +1,231 @@
+import os
+from pprint import pprint
+
+import matplotlib.pyplot as plt
+import numpy as np
+import pandas as pd
+import seaborn as sn
+from sklearn import svm
+from sklearn.cluster import KMeans
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.metrics import confusion_matrix
+from sklearn.naive_bayes import GaussianNB
+
+
+def normalized(a):
+    # Scale a histogram so its entries sum to 1 (L1 normalization).
+    return a / a.sum()
+
+
+def get_activities(data_dir):
+    # Read every recording under data_dir, one subdirectory per spell,
+    # and split each spell's files 80/20 into train and test.
+    activity_data_train = {}
+    activity_data_test = {}
+    for root, dirs, files in os.walk(data_dir):
+        if not files:
+            continue
+        activity_name = os.path.basename(os.path.normpath(root))
+        split = int(.8 * len(files))
+        activity_data_train[activity_name] = [
+            pd.read_csv(os.path.join(root, f), delimiter=' ').values.flatten()
+            for f in files[:split]]
+        activity_data_test[activity_name] = [
+            pd.read_csv(os.path.join(root, f), delimiter=' ').values.flatten()
+            for f in files[split:]]
+    return activity_data_train, activity_data_test
+
+
+def quantize_data(data, vector_len):
+    # Chop each flattened recording into windows of vector_len samples,
+    # dropping the trailing remainder so every window has the same length.
+    quantized_data = {}
+    total_windows = 0
+    for key, traces in data.items():
+        quantized_data[key] = {}
+        for i, trace in enumerate(traces):
+            trim = len(trace) % vector_len
+            if trim:
+                trace = trace[:-trim]
+            windows = np.split(trace, len(trace) // vector_len)
+            quantized_data[key][i] = windows
+            total_windows += len(windows)
+    print("Total windows:", total_windows)
+    return quantized_data
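Taken together, the helpers above feed a bag-of-words pipeline for sensor traces: quantize_data chops each flattened recording into fixed-length windows, k-means learns a codebook over all windows, and each recording is then summarized as a normalized histogram of codebook assignments, which is what the script below does. A self-contained sketch of that idea on synthetic data (all names and sizes here are illustrative):

import numpy as np
from sklearn.cluster import KMeans

rng = np.random.default_rng(0)
trace = rng.normal(size=240)                  # one flattened sensor trace
window = 4
usable = len(trace) // window * window        # drop the trailing remainder
windows = trace[:usable].reshape(-1, window)  # shape: (60, 4)

kmeans = KMeans(n_clusters=8, n_init=10).fit(windows)
counts = np.bincount(kmeans.predict(windows), minlength=8)
feature = counts / counts.sum()               # fixed-length histogram feature
print(feature)

The payoff is that recordings of different lengths all map to vectors of length n_clusters, so ordinary classifiers can consume them.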
+
+lengths = [4]  # window lengths to try, e.g. multiples of 21 for full frames
+clusters = [1000]  # codebook sizes to try, e.g. [600, 1000, 1400]
+
+data_train, data_test = get_activities('data')
+print("----------------")
+
+q_count = 0
+n_count = 0
+accuracies = []
+for quantize_length in lengths:
+    accuracies.append([])
+    for num_clusters in clusters:
+
+        qdata_test = quantize_data(data_test, quantize_length)
+        print("----------------")
+        qdata_train = quantize_data(data_train, quantize_length)
+        print("----------------")
+
+        # Pool every training window, regardless of which spell it came
+        # from, to learn one shared k-means codebook.
+        all_windows = []
+        for exercise_name, exercise_data in qdata_train.items():
+            for example_number, example_data in exercise_data.items():
+                all_windows.extend(example_data)
+        k_means_data = np.array(all_windows)
+        print("windows x window length:", k_means_data.shape)
+
+        print("training the codebook")
+        kmeans = KMeans(n_clusters=num_clusters, n_init=15, max_iter=500).fit(k_means_data)
+        print("done training the codebook")
+
+        def histogram_features(qdata):
+            # Summarize each example as a normalized histogram of its
+            # windows' nearest codebook centers.
+            X, Y = [], []
+            for exercise_name, exercise_data in qdata.items():
+                for example_number, example_data in exercise_data.items():
+                    center_histo = np.zeros((num_clusters,))
+                    for center in kmeans.predict(np.array(example_data)):
+                        center_histo[center] += 1
+                    X.append(normalized(center_histo))
+                    Y.append(exercise_name)
+            return X, Y
+
+        classifier_training_X, classifier_training_Y = histogram_features(qdata_train)
+        classifier_test_X, classifier_test_Y = histogram_features(qdata_test)
+
+        labels = sorted(qdata_train.keys())
+
+        def evaluate(clf, name):
+            # Fit, predict, and report accuracy as the confusion-matrix
+            # trace (correct predictions) over the number of test examples.
+            # confusion_matrix expects (y_true, y_pred) in that order.
+            clf.fit(classifier_training_X, classifier_training_Y)
+            results = clf.predict(classifier_test_X)
+            con_mat = confusion_matrix(classifier_test_Y, results, labels=labels)
+            accuracy = np.trace(con_mat) / len(results)
+            print(name, "accuracy:", accuracy)
+            return con_mat, accuracy
+
+        con_mat, forest_accuracy = evaluate(
+            RandomForestClassifier(n_estimators=100), "Random Forest")
+        np.savetxt("con_mat.csv", con_mat, '%5.0f', delimiter=",\t")
+        evaluate(svm.LinearSVC(), "Linear SVM")
+        evaluate(GaussianNB(), "Gaussian Naive Bayes")
+
+        accuracies[q_count].append(forest_accuracy)
+        print(num_clusters, quantize_length)
+
+        n_count += 1
+    n_count = 0
+    q_count += 1
+
+# Heatmap of the random-forest confusion matrix (adapted from a Stack
+# Overflow answer), labeled with the actual spell names.
+df_cm = pd.DataFrame(con_mat, index=labels, columns=labels)
+plt.figure(figsize=(10, 7))
+sn.heatmap(df_cm, annot=True)
+pprint(accuracies)
+plt.show()
\ No newline at end of file
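For reference, the accuracy computed above from the confusion-matrix trace is also available directly from sklearn, and classification_report adds per-class precision and recall. A short sketch with made-up labels, not data from this project:

from sklearn.metrics import accuracy_score, classification_report

y_true = ["lumos", "nox", "lumos", "accio"]
y_pred = ["lumos", "nox", "accio", "accio"]
print(accuracy_score(y_true, y_pred))         # 0.75
print(classification_report(y_true, y_pred))  # per-spell precision/recall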