From bc44d301807b10e84ad8aadeca3255cf990518b0 Mon Sep 17 00:00:00 2001 From: Lucas Oskorep Date: Sun, 14 Apr 2019 14:32:08 -0500 Subject: [PATCH] Adding in the data first preprocessing, as well as the first model training. --- .gitignore | 0 FixFileTypes.py | 108 +++++++++++++++++++++++++ ImageGatherer.py | 6 +- KerasToTensorflow.py | 12 +++ README.md | 0 TestTrainSplit.py | 80 +++++++++++++++++++ TrainingModelKeras.py | 167 +++++++++++++++++++++++++++++++++++++++ TransferLearningKeras.py | 167 +++++++++++++++++++++++++++++++++++++++ graphResults.py | 62 +++++++++++++++ pokemon.csv | 0 train.py | 2 - 11 files changed, 600 insertions(+), 4 deletions(-) mode change 100644 => 100755 .gitignore create mode 100755 FixFileTypes.py mode change 100644 => 100755 ImageGatherer.py create mode 100755 KerasToTensorflow.py mode change 100644 => 100755 README.md create mode 100755 TestTrainSplit.py create mode 100755 TrainingModelKeras.py create mode 100755 TransferLearningKeras.py create mode 100755 graphResults.py mode change 100644 => 100755 pokemon.csv delete mode 100644 train.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/FixFileTypes.py b/FixFileTypes.py new file mode 100755 index 0000000..845abad --- /dev/null +++ b/FixFileTypes.py @@ -0,0 +1,108 @@ +import glob +import subprocess +import os +import re +import logging +import traceback +from random import randint +import imghdr +import PIL +from PIL import Image +import sys + +directory = "downloads" + + +def random_with_N_digits(n): + range_start = 10 ** (n - 1) + range_end = (10 ** n) - 1 + return randint(range_start, range_end) + + +def change_file_extension(file_obj, extension): + old_path = os.path.splitext(file_obj) + if not os.path.isfile(old_path[0] + extension): + new_file = old_path[0] + extension + elif not os.path.isfile(file_obj + extension): + new_file = file_obj + extension + else: + print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj) + return + + 
print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file)
+
+    subprocess.run(['mv', file_obj, new_file])
+
+
+def get_frames_from_gif(infile):
+    try:
+        im = Image.open(infile)
+    except IOError:
+        print("Cant load", infile)
+        sys.exit(1)
+
+    i = 0
+
+    try:
+        while 1:
+            im2 = im.convert('RGBA')
+            im2.load()
+            filename = os.path.join(os.path.dirname(infile), 'foo' + str(i) + '.jpg')
+            background = Image.new("RGB", im2.size, (255, 255, 255))
+            background.paste(im2, mask=im2.split()[3])
+            background.save(filename, 'JPEG', quality=80)
+            print(f"FOUND GIF, SAVING FRAME AS {filename}")
+            i += 1
+            im.seek(im.tell() + 1)
+
+    except EOFError:
+        pass  # end of sequence
+
+
+for root, dirs, files in os.walk(directory):
+
+    for file in files:
+
+        try:
+            file_obj = os.path.join(root, file)
+            exten = os.path.splitext(file)[1].lower()
+            img_type = imghdr.what(file_obj)
+            # print(file_obj)
+            if img_type is None:
+                os.remove(file_obj)
+            elif "jpeg" in img_type:
+                if "jpeg" not in exten and "jpg" not in exten:
+                    change_file_extension(file_obj, ".jpeg")
+            elif "png" in img_type:
+                if "png" not in exten:
+                    change_file_extension(file_obj, ".png")
+            elif "gif" in img_type:
+                get_frames_from_gif(file_obj)
+                os.remove(file_obj)
+            else:
+                os.remove(file_obj)
+
+        except Exception as e:
+            logging.error(traceback.format_exc())
+
+i = 1
+for root, dirs, files in os.walk(directory):
+    for file in files:
+        try:
+            file_obj = os.path.join(root, file)
+            path, file_base_name = os.path.split(file_obj)
+            old_path = os.path.splitext(file_base_name)
+            old_ext = old_path[1]
+            old_name = old_path[0]
+            new_file = os.path.join(path, str(i) + "-" + str(random_with_N_digits(10)) + old_ext)
+            if file_obj != new_file and "foo" not in old_name:
+                print(f"Moving file\n"
+                      f"{new_file}\n"
+                      f"{file_obj}")
+                subprocess.run(['mv', file_obj, new_file])
+                i += 1
+        except Exception as e:
+            logging.error(traceback.format_exc())
+
+print("Cleaning JPEGs done")
diff --git a/ImageGatherer.py
b/ImageGatherer.py old mode 100644 new mode 100755 index 88bb4a8..ed247ad --- a/ImageGatherer.py +++ b/ImageGatherer.py @@ -6,12 +6,14 @@ df = pd.read_csv("pokemon.csv") response = google_images_download.googleimagesdownload() -for pokemon in df["identifier"][:251]: +for pokemon in ["abra", "xatu", "yanma", "zapdos", "zubat"]: # df["identifier"][:251]: absolute_image_paths = response.download( { "keywords": pokemon, "limit": 250, - "chromedriver": "/usr/lib/chromium-browser/chromedriver" + "chromedriver": "/usr/lib/chromium-browser/chromedriver", + # This needs to be changed based on the computer trying to download the images + "format": "jpg" } ) diff --git a/KerasToTensorflow.py b/KerasToTensorflow.py new file mode 100755 index 0000000..a0e1f54 --- /dev/null +++ b/KerasToTensorflow.py @@ -0,0 +1,12 @@ +from tensorflow import keras +from tensorflow.contrib import lite + +keras_file = "weights.mobilenet.non-transfer.best.hdf5" +keras.models.load_model(keras_file) + +h5_model = keras.models.load_model(keras_file) +converter = lite.TocoConverter.from_keras_model_file(keras_file) + +tflite_model = converter.convert() +with open('mobilenet.tflite', 'wb') as f: + f.write(tflite_model) diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/TestTrainSplit.py b/TestTrainSplit.py new file mode 100755 index 0000000..f83d08a --- /dev/null +++ b/TestTrainSplit.py @@ -0,0 +1,80 @@ +import os +from random import random +from shutil import copyfile, rmtree + +train_dir = "./data/train/" +test_dir = "./data/test/" +val_dir = "./data/val/" +train = .75 +test = .20 +val = .05 + + +def add_train_data(file, filename, label): + dest = train_dir + label + "/" + filename + print(dest, label, filename) + if not os.path.exists(os.path.dirname(dest)): + try: + os.makedirs(os.path.dirname(dest)) + except Exception as e: + print(e) + try: + copyfile(file, dest) + except Exception as e: + print(e) + print("INVALID FILE") + os.remove(file) + # TODO: Remove the files + + 
+def add_val_data(file, filename, label):
+    dest = val_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    copyfile(file, dest)
+
+
+def add_test_data(file, filename, label):
+    dest = test_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    copyfile(file, dest)
+
+
+def remove_previous():
+    if os.path.exists(os.path.dirname(test_dir)):
+        rmtree(test_dir)
+    if os.path.exists(os.path.dirname(train_dir)):
+        rmtree(train_dir)
+    if os.path.exists(os.path.dirname(val_dir)):
+        rmtree(val_dir)
+
+
+remove_previous()
+files_processed = 0
+
+for root, dirs, files in os.walk("downloads/"):
+
+    for file in files:
+        print(file)
+
+        if file == ".DS_Store":
+            continue
+        c = random()
+
+        if c < train:
+            add_train_data(os.path.join(root, file), file, root.split("/")[-1])
+        elif c < (train + val):
+            add_val_data(os.path.join(root, file), file, root.split("/")[-1])
+        else:
+            add_test_data(os.path.join(root, file), file, root.split("/")[-1])
+        files_processed += 1
+        print(root.split("/")[-1])
+print(files_processed)
+print(file)
diff --git a/TrainingModelKeras.py b/TrainingModelKeras.py
new file mode 100755
index 0000000..c1d1bca
--- /dev/null
+++ b/TrainingModelKeras.py
@@ -0,0 +1,167 @@
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import os
+import seaborn as sn
+import matplotlib.pyplot as plt
+from tensorflow import keras
+from time import time
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+input_shape = (224, 224, 3)
+batch_size = 32
+model_name = "MobileNetV2FullDatasetNoTransfer"
+
+from keras.preprocessing.image import ImageDataGenerator
+from keras.applications.inception_v3 import preprocess_input
+
+train_idg = ImageDataGenerator(
+    horizontal_flip=True,
+    preprocessing_function=preprocess_input
+)
+train_gen =
train_idg.flow_from_directory( + './data/train', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) + +val_idg = ImageDataGenerator( + horizontal_flip=True, + preprocessing_function=preprocess_input +) + +val_gen = val_idg.flow_from_directory( + './data/val', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) +from keras.applications import inception_v3, mobilenet_v2, vgg16 +from keras.models import Sequential +from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard +from keras import optimizers +from keras.layers import Dense, Dropout, GlobalAveragePooling2D + +nclass = len(train_gen.class_indices) + +# base_model = vgg16.VGG16( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) +# base_model = inception_v3.InceptionV3( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) + +base_model = mobilenet_v2.MobileNetV2( + weights='imagenet', + include_top=False, + input_shape=input_shape +) + +add_model = Sequential() +add_model.add(base_model) +add_model.add(GlobalAveragePooling2D()) +add_model.add(Dropout(0.5)) +add_model.add( + Dense(1024, activation='relu')) # Adding some dense layers in order to learn complex functions from the base model +# Potentially throw another dropout layer here if you seem to be overfitting your +add_model.add(Dropout(0.5)) +add_model.add(Dense(512, activation='relu')) +add_model.add(Dense(nclass, activation='softmax')) # Decision layer + +model = add_model +model.compile(loss='categorical_crossentropy', + # optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), + optimizer=optimizers.Adam(lr=1e-4), + metrics=['accuracy']) +model.summary() + +# Train the model +file_path = "weights.mobilenet.non-transfer.best.hdf5" + +checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max') + +early = EarlyStopping(monitor="val_acc", mode="max", patience=15) + +tensorboard = TensorBoard( + log_dir="logs/" + 
model_name + "{}".format(time()), histogram_freq=0, batch_size=32, + write_graph=True, + write_grads=True, + write_images=True, + update_freq=batch_size +) + +callbacks_list = [checkpoint, early, tensorboard] # early + +history = model.fit_generator( + train_gen, + validation_data=val_gen, + epochs=2, + shuffle=True, + verbose=True, + callbacks=callbacks_list +) + +# Create Test generator +test_idg = ImageDataGenerator( + preprocessing_function=preprocess_input, +) +test_gen = test_idg.flow_from_directory( + './data/test', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size, + shuffle=False + +) +len(test_gen.filenames) + +score = model.evaluate_generator(test_gen, workers=1) + +# predicts +predicts = model.predict_generator(test_gen, verbose=True, workers=1) + +keras_file = 'finished.h5' +keras.models.save_model(model, keras_file) + +print("Loss: ", score[0], "Accuracy: ", score[1]) +print(score) + +print(predicts) +print(type(predicts)) +print(predicts.shape) +# Process the predictions +predicts = np.argmax(predicts, + axis=1) +# test_gen.reset() +label_index = {v: k for k, v in train_gen.class_indices.items()} +predicts = [label_index[p] for p in predicts] +reals = [label_index[p] for p in test_gen.classes] + +# Save the results +print(label_index) +print(test_gen.classes) +print(test_gen.classes.shape) +print(type(test_gen.classes)) +df = pd.DataFrame(columns=['fname', 'prediction', 'true_val']) +df['fname'] = [x for x in test_gen.filenames] +df['prediction'] = predicts +df["true_val"] = reals +df.to_csv("sub1_non_transfer.csv", index=False) + +# Processed the saved results +from sklearn.metrics import accuracy_score, confusion_matrix + +acc = accuracy_score(reals, predicts) +conf_mat = confusion_matrix(reals, predicts) +print("Testing accuracy score is ", acc) +print("Confusion Matrix", conf_mat) + +df_cm = pd.DataFrame(conf_mat, index=[i for i in list(set(reals))], + columns=[i for i in list(set(reals))]) +plt.figure(figsize=(10, 7)) 
+sn.heatmap(df_cm, annot=True) +plt.show() diff --git a/TransferLearningKeras.py b/TransferLearningKeras.py new file mode 100755 index 0000000..b4f97fa --- /dev/null +++ b/TransferLearningKeras.py @@ -0,0 +1,167 @@ +import pandas as pd +import numpy as np +import seaborn as sn +import matplotlib.pyplot as plt + +from time import time +from PIL import ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + +input_shape = (224, 224, 3) +batch_size = 60 +model_name = "MobileNetV2FullDataset" + +from keras.preprocessing.image import ImageDataGenerator +from keras.applications.inception_v3 import preprocess_input + +train_idg = ImageDataGenerator( + # horizontal_flip=True, + preprocessing_function=preprocess_input +) +train_gen = train_idg.flow_from_directory( + './data/train', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) +val_idg = ImageDataGenerator( + # horizontal_flip=True, + preprocessing_function=preprocess_input +) + +val_gen = val_idg.flow_from_directory( + './data/val', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) + +from keras.applications import inception_v3, mobilenet_v2, vgg16 +from keras.models import Sequential +from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard +from keras import optimizers +from keras.layers import Dense, Dropout, GlobalAveragePooling2D + +nclass = len(train_gen.class_indices) + +# base_model = vgg16.VGG16( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) +# base_model = inception_v3.InceptionV3( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) + +base_model = mobilenet_v2.MobileNetV2( + weights='imagenet', + include_top=False, + input_shape=input_shape +) +base_model.trainable = False + +add_model = Sequential() +add_model.add(base_model) +add_model.add(GlobalAveragePooling2D()) +add_model.add(Dropout(0.5)) +add_model.add(Dense(1024, activation='relu')) +# Adding some dense layers in order to learn 
complex functions from the base model +add_model.add(Dropout(0.5)) +add_model.add(Dense(512, activation='relu')) +add_model.add(Dense(nclass, activation='softmax')) # Decision layer + +model = add_model +model.compile(loss='categorical_crossentropy', + # optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), + optimizer=optimizers.Adam(lr=1e-4), + metrics=['accuracy']) +model.summary() + +# Train the model +file_path = "weights.mobilenet.best.hdf5" + +checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max') + +early = EarlyStopping(monitor="val_acc", mode="max", patience=15) + +tensorboard = TensorBoard( + log_dir="logs/" + model_name + "{}".format(time()), histogram_freq=0, batch_size=batch_size, + write_graph=True, + write_grads=True, + write_images=True, + update_freq=batch_size + +) + +callbacks_list = [checkpoint, early, tensorboard] # early + +history = model.fit_generator( + train_gen, + steps_per_epoch=len(train_gen), + validation_data=val_gen, + validation_steps=len(val_gen), + epochs=5, + shuffle=True, + verbose=True, + callbacks=callbacks_list + +) + +# Create Test generator +test_idg = ImageDataGenerator( + preprocessing_function=preprocess_input, +) + +test_gen = test_idg.flow_from_directory( + './data/test', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size, + shuffle=False +) + +len(test_gen.filenames) + +score = model.evaluate_generator(test_gen, workers=1) + +# predicts +predicts = model.predict_generator(test_gen, verbose=True, workers=1) + +print("Loss: ", score[0], "Accuracy: ", score[1]) +print(score) + +print(predicts) +print(type(predicts)) +print(predicts.shape) +# Process the predictions +predicts = np.argmax(predicts, + axis=1) +# test_gen.reset() +label_index = {v: k for k, v in train_gen.class_indices.items()} +predicts = [label_index[p] for p in predicts] +reals = [label_index[p] for p in test_gen.classes] + +# Save the results +print(label_index) +print(test_gen.classes) 
+print(test_gen.classes.shape)
+print(type(test_gen.classes))
+df = pd.DataFrame(columns=['fname', 'prediction', 'true_val'])
+df['fname'] = [x for x in test_gen.filenames]
+df['prediction'] = predicts
+df["true_val"] = reals
+df.to_csv("sub1.csv", index=False)
+
+# Processed the saved results
+from sklearn.metrics import accuracy_score, confusion_matrix
+
+acc = accuracy_score(reals, predicts)
+conf_mat = confusion_matrix(reals, predicts)
+print("Testing accuracy score is ", acc)
+print("Confusion Matrix", conf_mat)
+
+df_cm = pd.DataFrame(conf_mat, index=[i for i in list(set(reals))],
+                     columns=[i for i in list(set(reals))])
+plt.figure(figsize=(10, 7))
+sn.heatmap(df_cm, annot=True)
+plt.show()
diff --git a/graphResults.py b/graphResults.py
new file mode 100755
index 0000000..220f040
--- /dev/null
+++ b/graphResults.py
@@ -0,0 +1,62 @@
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import os
+import seaborn as sn
+import matplotlib.pyplot as plt
+
+from sklearn.metrics import accuracy_score, confusion_matrix
+
+
+def print_preds(reals, preds):
+    acc = accuracy_score(reals, preds)
+    conf_mat = confusion_matrix(reals, preds)
+    print("Testing accuracy score is ", acc)
+    print("Confusion Matrix", conf_mat)
+
+    df_cm = pd.DataFrame(conf_mat, index=[i for i in ["Block", "Meter", "Sign"]],
+                         columns=[i for i in ["Block", "Meter", "Sign"]])
+    plt.figure(figsize=(10, 7))
+    sn.heatmap(df_cm, annot=True)
+    plt.show()
+
+
+data = pd.read_csv("sub1_non_transfer.csv")
+files_list = list(data["fname"])
+reals = list(data["true_val"])
+predicts = list(data["prediction"])
+
+reals2 = []
+wrong_files = []
+for root, dirs, files in os.walk(".\\photos"):
+    for file in files:
+        if file in files_list:
+            x = data.loc[data["fname"] == file].values[0]
+            if (x[1] != x[2]):
+                print(x)
+                wrong_files.append((os.path.join(root, file), x[1]))
+            reals2.append(root.split("\\")[-1])
+
+print_preds(reals, predicts)
+print_preds(reals2, predicts)
+
+import
matplotlib.image as mpimg +from shutil import copyfile, rmtree + +for file, pred in wrong_files: + print(file) + # img = mpimg.imread(file) + # # end + # # from now on you can use img as an image, but make sure you know what you are doing! + # imgplot = plt.imshow(img) + dest = file.split("\\") + dest[1] = "failed" + dest[-1] = pred + dest[-1] + dest = "\\".join(dest) + if not os.path.exists(os.path.dirname(dest)): + try: + os.makedirs(os.path.dirname(dest)) + except Exception as e: + print(e) + copyfile(file, dest) + plt.show() diff --git a/pokemon.csv b/pokemon.csv old mode 100644 new mode 100755 diff --git a/train.py b/train.py deleted file mode 100644 index 139597f..0000000 --- a/train.py +++ /dev/null @@ -1,2 +0,0 @@ - -