"""Classify exercise recordings with a k-means codebook and a random forest.

Pipeline, as executed at module level:

1. Load every CSV found in the sub-directories of ``./train`` and ``./test``;
   the sub-directory name is used as the class label.
2. Cut each flattened recording into fixed-length windows.
3. Fit k-means on all training windows to obtain a codebook.
4. Describe each recording by the histogram of its windows' cluster ids.
5. Train a random forest on the training histograms and report accuracy and
   the confusion matrix on the test set.
"""
import os
from glob import glob

import numpy as np
import pandas as pd
import scipy as sp
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

# Multiplier applied to the window length to get the number of flattened
# values per window; presumably the column count of the source CSVs --
# TODO confirm against the data files.
og_data_columns = 4


def apply_k_means_clustering(data, kmeans):
    """Return the histogram of cluster assignments for the rows of *data*.

    Args:
        data: 2-D array; each row is passed to ``kmeans.predict``.
        kmeans: Fitted model exposing ``cluster_centers_`` and ``predict``.

    Returns:
        1-D float array with one bin per cluster centre, where bin ``i``
        holds the number of rows assigned to cluster ``i``. The histogram
        is also printed.
    """
    n_clusters = len(kmeans.cluster_centers_)
    # ``minlength`` keeps the histogram width fixed even when the highest
    # cluster ids are never predicted; the cast keeps the float dtype that
    # the original ``np.zeros`` accumulator produced.
    hist = np.bincount(kmeans.predict(data), minlength=n_clusters).astype(float)
    print(hist)
    return hist


def quantize_flattened_data(data, length):
    """Reshape a flat recording into windows of ``length * og_data_columns``.

    Args:
        data: 1-D sequence of values.
        length: Window length; multiplied by ``og_data_columns`` to get the
            number of values per output row.

    Returns:
        2-D array of shape ``(rows, length * og_data_columns)`` where
        ``rows`` is the number of complete windows in *data*. Trailing
        values that do not fill a whole window are dropped.
    """
    quantile_length = length * og_data_columns
    rows = len(data) // quantile_length
    # rows * quantile_length never exceeds len(data), so np.resize only
    # truncates here; it never has to repeat values to fill the shape.
    return np.resize(data, (rows, quantile_length))


def load_data(data_dir):
    """Load every CSV below the sub-directories of *data_dir*.

    Files located directly in *data_dir* are skipped; for all other files
    the name of the containing directory is used as the label.

    Args:
        data_dir: Root directory to walk.

    Returns:
        DataFrame with columns ``"exercise"`` (containing directory name)
        and ``"flat_data"`` (the CSV's values flattened to 1-D).
    """
    data = []
    for root, _dirs, files in os.walk(data_dir):
        # Compare by value: identity (`is`) on strings only works by
        # accident of how os.walk hands back its argument.
        if root == data_dir:
            continue
        label = os.path.basename(root)
        for file in files:
            flat = pd.read_csv(os.path.join(root, file)).values.flatten()
            data.append([label, flat])
    return pd.DataFrame(data, columns=["exercise", "flat_data"])


train_data = load_data("./train")
test_data = load_data("./test")
# val_data = load_data("./val")

if train_data.empty or test_data.empty:
    raise ValueError(
        "No recordings found: ./train and ./test must each contain "
        "sub-directories holding CSV files."
    )

quant_len = 3

train_data["quantized_data"] = train_data["flat_data"].apply(
    lambda x: quantize_flattened_data(x, quant_len)
)
test_data["quantized_data"] = test_data["flat_data"].apply(
    lambda x: quantize_flattened_data(x, quant_len)
)

# Stack the windows of every training recording exactly once. Seeding the
# array with the first recording and then appending all recordings would
# count the first one twice, and repeated np.append is quadratic.
linked_data = np.concatenate(train_data["quantized_data"].tolist(), axis=0)
print(linked_data)

k_means_model = KMeans(n_clusters=25, n_init=15, max_iter=500).fit(linked_data)
print(k_means_model.cluster_centers_)

train_data["histogram"] = train_data["quantized_data"].apply(
    lambda x: apply_k_means_clustering(x, k_means_model)
)
test_data["histogram"] = test_data["quantized_data"].apply(
    lambda x: apply_k_means_clustering(x, k_means_model)
)
print(train_data)

clf = RandomForestClassifier(n_estimators=50)
print(train_data["histogram"].values)
clf.fit(list(train_data["histogram"]), train_data["exercise"].values)

results = clf.predict(list(test_data["histogram"]))
print(results)
print(test_data["exercise"].values)
print(accuracy_score(test_data["exercise"].values, results))
print(confusion_matrix(test_data["exercise"].values, results))