import numpy as np
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
import matplotlib.pyplot as plt
from sklearn import svm
from pprint import pprint
import os
import seaborn as sn


def normalized(a):
    # L1-normalize so the histogram bins sum to 1.
    return a / np.sum(a)


def build_histograms(qdata, kmeans, num_clusters):
    # Assign every window to its nearest cluster center and build one
    # normalized cluster-occupancy histogram per example.
    X, Y = [], []
    for exercise_name, exercise_data in qdata.items():
        for example_number, example_data in exercise_data.items():
            center_histo = np.zeros((num_clusters,))
            for idx in kmeans.predict(example_data):
                center_histo[idx] += 1
            X.append(normalized(center_histo))
            Y.append(exercise_name)
    return X, Y


def evaluate(clf, train_X, train_Y, test_X, test_Y):
    # Fit a classifier and return its test accuracy plus the confusion matrix.
    clf.fit(train_X, train_Y)
    results = clf.predict(test_X)
    con_mat = confusion_matrix(test_Y, results)  # rows: true labels, columns: predictions
    accuracy = np.trace(con_mat) / len(results)  # diagonal = correctly classified examples
    return accuracy, con_mat


def get_activities(dir):
    # Walk the data directory (Windows-style paths assumed); each subfolder is one
    # activity. The first 80% of each activity's files become training examples,
    # the remainder test examples.
    activity_data_train = {}
    activity_data_test = {}
    for root, dirs, files in os.walk(dir):
        activity_name = root.split('\\')[1]
        activity_data_train[activity_name] = []
        activity_data_test[activity_name] = []
        split = int(.8 * len(files))
        for i in range(len(files)):
            data = pd.read_csv(os.path.join(root, files[i]), delimiter=' ').values.flatten()
            if i < split:
                activity_data_train[activity_name].append(data)
            else:
                activity_data_test[activity_name].append(data)
    return activity_data_train, activity_data_test


def quantize_data(data, vector_len):
    # The walk above yields an empty activity name for the top-level directory;
    # pop() instead of del so repeated calls do not raise KeyError.
    data.pop("", None)
    print("quantizing", list(data.keys()))
    print(vector_len)
    quantized_data = {}
    total = 0
    total2 = 0
    for key, value in data.items():
        print(key, len(value))
        quantized_data[key] = {}
        for i in range(len(value)):
            total2 += len(value[i]) / vector_len
            # Trim each recording (in place) so its length is a multiple of
            # vector_len, then split it into fixed-size windows.
            value[i] = value[i][:len(value[i]) - len(value[i]) % vector_len]
            new_data = np.split(value[i], len(value[i]) // vector_len)
            quantized_data[key][i] = new_data
            total += len(new_data)
    print("Total is:", total, total2)
    return quantized_data


lengths = [4]
# for i in range(3, 4):
#     lengths.append((i + 1) * 21)
data_train, data_test = get_activities('data\\')
print("----------------")
clusters = [1000]  # [600, 600, 600, 1000, 1000, 1000, 1400, 1400, 1400]
q_count = 0
n_count = 0
accuracies = []
for quantize_length in lengths:
    accuracies.append([])
    for num_clusters in clusters:
        qdata_test = quantize_data(data_test, quantize_length)
        print("----------------")
        qdata_train = quantize_data(data_train, quantize_length)
        print("----------------")
        # Fill k_means_data with the windows of all exercises, regardless of label.
        k_means_data = None
        for exercise_name, exercise_data in qdata_train.items():
            for example_number, example_data in exercise_data.items():
                if k_means_data is None:
                    k_means_data = np.array(example_data)
                else:
                    k_means_data = np.concatenate((k_means_data, np.array(example_data)), axis=0)
        print(len(k_means_data[0]))
        print(len(k_means_data[:, 0]))
        print("training the model")
        kmeans = KMeans(n_clusters=num_clusters, n_init=15, max_iter=500).fit(k_means_data)
        print("Done training model")
        centers = kmeans.cluster_centers_
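        # Optional sketch, not part of the original pipeline: with num_clusters=1000
        # the full KMeans fit above can be slow, and scikit-learn's MiniBatchKMeans
        # is a drop-in approximation that trades a little cluster quality for a much
        # faster fit. USE_MINIBATCH is a hypothetical flag added here purely for
        # illustration; it is off by default so behavior is unchanged.
        USE_MINIBATCH = False
        if USE_MINIBATCH:
            from sklearn.cluster import MiniBatchKMeans
            kmeans = MiniBatchKMeans(n_clusters=num_clusters, n_init=15,
                                     batch_size=1024, max_iter=500).fit(k_means_data)
            centers = kmeans.cluster_centers_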
        # Run the training data through again and build a histogram for each example,
        # then do the same for the test split.
        classifier_training_X, classifier_training_Y = build_histograms(qdata_train, kmeans, num_clusters)
        classifier_test_X, classifier_test_Y = build_histograms(qdata_test, kmeans, num_clusters)
        print("done")

        forest_average = []
        for g in range(1):  # widen the range to average over several forests
            acc, con_mat = evaluate(RandomForestClassifier(n_estimators=100),
                                    classifier_training_X, classifier_training_Y,
                                    classifier_test_X, classifier_test_Y)
            print(con_mat)
            np.savetxt("con_mat.csv", con_mat, '%5.0f', delimiter=",\t")
            forest_average.append(acc)
        print("Random Forest Average Accuracy:")
        print(sum(forest_average) / len(forest_average))

        print("SVM Average Accuracy:")
        acc, con_mat = evaluate(svm.LinearSVC(),
                                classifier_training_X, classifier_training_Y,
                                classifier_test_X, classifier_test_Y)
        print(acc)

        print("Gaussian Naive Bayes Average Accuracy:")
        acc, con_mat = evaluate(GaussianNB(),
                                classifier_training_X, classifier_training_Y,
                                classifier_test_X, classifier_test_Y)
        print(acc)

        accuracies[q_count].append(sum(forest_average) / len(forest_average))
        print(num_clusters)
        print(quantize_length)
        n_count += 1
    n_count = 0
    q_count += 1

# Plot the last confusion matrix (from the Gaussian Naive Bayes run) as a heatmap;
# adapted from Stack Overflow. Rows and columns are labeled with the actual class
# names rather than the hardcoded letters A-N, which broke whenever the number of
# activities differed from 14.
labels = sorted(set(classifier_test_Y))  # confusion_matrix orders classes alphabetically
df_cm = pd.DataFrame(con_mat, index=labels, columns=labels)
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True)
pprint(accuracies)
plt.show()
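# A minimal sketch of the on-disk layout this script assumes, inferred from
# get_activities above (the folder and file names are hypothetical examples):
#
#   data\
#       jumping_jacks\
#           rep01.txt   (space-delimited sensor readings, one recording per file)
#           rep02.txt
#       squats\
#           rep01.txt
#
# The first 80% of each activity's files are used for training; the remainder
# are held out for testing.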