From bc44d301807b10e84ad8aadeca3255cf990518b0 Mon Sep 17 00:00:00 2001 From: Lucas Oskorep Date: Sun, 14 Apr 2019 14:32:08 -0500 Subject: [PATCH] Adding in the data first preprocessing, as well as the first model training. --- .gitignore | 0 FixFileTypes.py | 108 +++++++++++++++++++++++++ ImageGatherer.py | 6 +- KerasToTensorflow.py | 12 +++ README.md | 0 TestTrainSplit.py | 80 +++++++++++++++++++ TrainingModelKeras.py | 167 +++++++++++++++++++++++++++++++++++++++ TransferLearningKeras.py | 167 +++++++++++++++++++++++++++++++++++++++ graphResults.py | 62 +++++++++++++++ pokemon.csv | 0 train.py | 2 - 11 files changed, 600 insertions(+), 4 deletions(-) mode change 100644 => 100755 .gitignore create mode 100755 FixFileTypes.py mode change 100644 => 100755 ImageGatherer.py create mode 100755 KerasToTensorflow.py mode change 100644 => 100755 README.md create mode 100755 TestTrainSplit.py create mode 100755 TrainingModelKeras.py create mode 100755 TransferLearningKeras.py create mode 100755 graphResults.py mode change 100644 => 100755 pokemon.csv delete mode 100644 train.py diff --git a/.gitignore b/.gitignore old mode 100644 new mode 100755 diff --git a/FixFileTypes.py b/FixFileTypes.py new file mode 100755 index 0000000..845abad --- /dev/null +++ b/FixFileTypes.py @@ -0,0 +1,108 @@ +import glob +import subprocess +import os +import re +import logging +import traceback +from random import randint +import imghdr +import PIL +from PIL import Image +import sys + +directory = "downloads" + + +def random_with_N_digits(n): + range_start = 10 ** (n - 1) + range_end = (10 ** n) - 1 + return randint(range_start, range_end) + + +def change_file_extension(file_obj, extension): + old_path = os.path.splitext(file_obj) + if not os.path.isfile(old_path[0] + extension): + new_file = old_path[0] + extension + elif not os.path.isfile(file_obj + extension): + new_file = file_obj + extension + else: + print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj) + return + + 
print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file)
+
+    subprocess.run(['mv', file_obj, new_file])
+
+
+def get_frames_from_gif(infile):
+    try:
+        im = Image.open(infile)
+    except IOError:
+        print("Cant load", infile)
+        sys.exit(1)
+
+    i = 0
+
+    try:
+        while 1:
+            im2 = im.convert('RGBA')
+            im2.load()
+            filename = os.path.join(os.path.dirname(infile), 'foo' + str(i) + '.jpg')
+            background = Image.new("RGB", im2.size, (255, 255, 255))
+            background.paste(im2, mask=im2.split()[3])
+            background.save(filename, 'JPEG', quality=80)
+            print(f"FOUND GIF, SAVING FRAME AS {filename}")
+            i += 1
+            im.seek(im.tell() + 1)
+
+    except EOFError:
+        pass  # end of sequence
+
+
+for root, dirs, files in os.walk(directory):
+
+    for file in files:
+
+        try:
+            file_obj = os.path.join(root, file)
+            exten = os.path.splitext(file)[1].lower()
+            img_type = imghdr.what(file_obj)
+            # print(file_obj)
+            if img_type is None:
+                os.remove(file_obj)
+            elif "jpeg" in img_type:
+                if "jpeg" not in exten and "jpg" not in exten:
+                    change_file_extension(file_obj, ".jpeg")
+            elif "png" in img_type:
+                if "png" not in exten:
+                    change_file_extension(file_obj, ".png")
+            elif "gif" in img_type:
+                get_frames_from_gif(file_obj)
+                os.remove(file_obj)
+            else:
+                os.remove(file_obj)
+
+        except Exception as e:
+            logging.error(traceback.format_exc())
+
+i = 1
+for root, dirs, files in os.walk(directory):
+    for file in files:
+        try:
+            file_obj = os.path.join(root, file)
+            path, file_base_name = os.path.split(file_obj)
+            old_path = os.path.splitext(file_base_name)
+            old_ext = old_path[1]
+            old_name = old_path[0]
+            new_file = os.path.join(path, str(i) + "-" + str(random_with_N_digits(10)) + old_ext)
+            if file_obj != new_file and "foo" not in old_name:
+                print(f"Moving file\n"
+                      f"{new_file}\n"
+                      f"{file_obj}")
+                subprocess.run(['mv', file_obj, new_file])
+                i += 1
+        except Exception as e:
+            logging.error(traceback.format_exc())
+
+print("Cleaning JPEGs done")
diff --git a/ImageGatherer.py
b/ImageGatherer.py old mode 100644 new mode 100755 index 88bb4a8..ed247ad --- a/ImageGatherer.py +++ b/ImageGatherer.py @@ -6,12 +6,14 @@ df = pd.read_csv("pokemon.csv") response = google_images_download.googleimagesdownload() -for pokemon in df["identifier"][:251]: +for pokemon in ["abra", "xatu", "yanma", "zapdos", "zubat"]: # df["identifier"][:251]: absolute_image_paths = response.download( { "keywords": pokemon, "limit": 250, - "chromedriver": "/usr/lib/chromium-browser/chromedriver" + "chromedriver": "/usr/lib/chromium-browser/chromedriver", + # This needs to be changed based on the computer trying to download the images + "format": "jpg" } ) diff --git a/KerasToTensorflow.py b/KerasToTensorflow.py new file mode 100755 index 0000000..a0e1f54 --- /dev/null +++ b/KerasToTensorflow.py @@ -0,0 +1,12 @@ +from tensorflow import keras +from tensorflow.contrib import lite + +keras_file = "weights.mobilenet.non-transfer.best.hdf5" +keras.models.load_model(keras_file) + +h5_model = keras.models.load_model(keras_file) +converter = lite.TocoConverter.from_keras_model_file(keras_file) + +tflite_model = converter.convert() +with open('mobilenet.tflite', 'wb') as f: + f.write(tflite_model) diff --git a/README.md b/README.md old mode 100644 new mode 100755 diff --git a/TestTrainSplit.py b/TestTrainSplit.py new file mode 100755 index 0000000..f83d08a --- /dev/null +++ b/TestTrainSplit.py @@ -0,0 +1,80 @@ +import os +from random import random +from shutil import copyfile, rmtree + +train_dir = "./data/train/" +test_dir = "./data/test/" +val_dir = "./data/val/" +train = .75 +test = .20 +val = .05 + + +def add_train_data(file, filename, label): + dest = train_dir + label + "/" + filename + print(dest, label, filename) + if not os.path.exists(os.path.dirname(dest)): + try: + os.makedirs(os.path.dirname(dest)) + except Exception as e: + print(e) + try: + copyfile(file, dest) + except Exception as e: + print(e) + print("INVALID FILE") + os.remove(file) + # TODO: Remove the files + + 
+def add_val_data(file, filename, label):
+    dest = val_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    copyfile(file, dest)
+
+
+def add_test_data(file, filename, label):
+    dest = test_dir + label + "/" + filename
+    if not os.path.exists(os.path.dirname(dest)):
+        try:
+            os.makedirs(os.path.dirname(dest))
+        except Exception as e:
+            print(e)
+    copyfile(file, dest)
+
+
+def remove_previous():
+    if os.path.exists(os.path.dirname(test_dir)):
+        rmtree(test_dir)
+    if os.path.exists(os.path.dirname(train_dir)):
+        rmtree(train_dir)
+    if os.path.exists(os.path.dirname(val_dir)):
+        rmtree(val_dir)
+
+
+remove_previous()
+files_processed = 0
+
+for root, dirs, files in os.walk("downloads/"):
+
+    for file in files:
+        print(file)
+
+        if file == ".DS_Store":
+            continue
+        c = random()
+
+        if c < train:
+            add_train_data(os.path.join(root, file), file, root.split("/")[-1])
+        elif c < (train + val):
+            add_val_data(os.path.join(root, file), file, root.split("/")[-1])
+        else:
+            add_test_data(os.path.join(root, file), file, root.split("/")[-1])
+        files_processed += 1
+        print(root.split("/")[-1])
+print(files_processed)
+print(file)
diff --git a/TrainingModelKeras.py b/TrainingModelKeras.py
new file mode 100755
index 0000000..c1d1bca
--- /dev/null
+++ b/TrainingModelKeras.py
@@ -0,0 +1,167 @@
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import os
+import seaborn as sn
+import matplotlib.pyplot as plt
+from tensorflow import keras
+from time import time
+from PIL import ImageFile
+
+ImageFile.LOAD_TRUNCATED_IMAGES = True
+
+input_shape = (224, 224, 3)
+batch_size = 32
+model_name = "MobileNetV2FullDatasetNoTransfer"
+
+from keras.preprocessing.image import ImageDataGenerator
+from keras.applications.inception_v3 import preprocess_input
+
+train_idg = ImageDataGenerator(
+    horizontal_flip=True,
+    preprocessing_function=preprocess_input
+)
+train_gen =
train_idg.flow_from_directory( + './data/train', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) + +val_idg = ImageDataGenerator( + horizontal_flip=True, + preprocessing_function=preprocess_input +) + +val_gen = val_idg.flow_from_directory( + './data/val', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) +from keras.applications import inception_v3, mobilenet_v2, vgg16 +from keras.models import Sequential +from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard +from keras import optimizers +from keras.layers import Dense, Dropout, GlobalAveragePooling2D + +nclass = len(train_gen.class_indices) + +# base_model = vgg16.VGG16( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) +# base_model = inception_v3.InceptionV3( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) + +base_model = mobilenet_v2.MobileNetV2( + weights='imagenet', + include_top=False, + input_shape=input_shape +) + +add_model = Sequential() +add_model.add(base_model) +add_model.add(GlobalAveragePooling2D()) +add_model.add(Dropout(0.5)) +add_model.add( + Dense(1024, activation='relu')) # Adding some dense layers in order to learn complex functions from the base model +# Potentially throw another dropout layer here if you seem to be overfitting your +add_model.add(Dropout(0.5)) +add_model.add(Dense(512, activation='relu')) +add_model.add(Dense(nclass, activation='softmax')) # Decision layer + +model = add_model +model.compile(loss='categorical_crossentropy', + # optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), + optimizer=optimizers.Adam(lr=1e-4), + metrics=['accuracy']) +model.summary() + +# Train the model +file_path = "weights.mobilenet.non-transfer.best.hdf5" + +checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max') + +early = EarlyStopping(monitor="val_acc", mode="max", patience=15) + +tensorboard = TensorBoard( + log_dir="logs/" + 
model_name + "{}".format(time()), histogram_freq=0, batch_size=32, + write_graph=True, + write_grads=True, + write_images=True, + update_freq=batch_size +) + +callbacks_list = [checkpoint, early, tensorboard] # early + +history = model.fit_generator( + train_gen, + validation_data=val_gen, + epochs=2, + shuffle=True, + verbose=True, + callbacks=callbacks_list +) + +# Create Test generator +test_idg = ImageDataGenerator( + preprocessing_function=preprocess_input, +) +test_gen = test_idg.flow_from_directory( + './data/test', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size, + shuffle=False + +) +len(test_gen.filenames) + +score = model.evaluate_generator(test_gen, workers=1) + +# predicts +predicts = model.predict_generator(test_gen, verbose=True, workers=1) + +keras_file = 'finished.h5' +keras.models.save_model(model, keras_file) + +print("Loss: ", score[0], "Accuracy: ", score[1]) +print(score) + +print(predicts) +print(type(predicts)) +print(predicts.shape) +# Process the predictions +predicts = np.argmax(predicts, + axis=1) +# test_gen.reset() +label_index = {v: k for k, v in train_gen.class_indices.items()} +predicts = [label_index[p] for p in predicts] +reals = [label_index[p] for p in test_gen.classes] + +# Save the results +print(label_index) +print(test_gen.classes) +print(test_gen.classes.shape) +print(type(test_gen.classes)) +df = pd.DataFrame(columns=['fname', 'prediction', 'true_val']) +df['fname'] = [x for x in test_gen.filenames] +df['prediction'] = predicts +df["true_val"] = reals +df.to_csv("sub1_non_transfer.csv", index=False) + +# Processed the saved results +from sklearn.metrics import accuracy_score, confusion_matrix + +acc = accuracy_score(reals, predicts) +conf_mat = confusion_matrix(reals, predicts) +print("Testing accuracy score is ", acc) +print("Confusion Matrix", conf_mat) + +df_cm = pd.DataFrame(conf_mat, index=[i for i in list(set(reals))], + columns=[i for i in list(set(reals))]) +plt.figure(figsize=(10, 7)) 
+sn.heatmap(df_cm, annot=True) +plt.show() diff --git a/TransferLearningKeras.py b/TransferLearningKeras.py new file mode 100755 index 0000000..b4f97fa --- /dev/null +++ b/TransferLearningKeras.py @@ -0,0 +1,167 @@ +import pandas as pd +import numpy as np +import seaborn as sn +import matplotlib.pyplot as plt + +from time import time +from PIL import ImageFile + +ImageFile.LOAD_TRUNCATED_IMAGES = True + +input_shape = (224, 224, 3) +batch_size = 60 +model_name = "MobileNetV2FullDataset" + +from keras.preprocessing.image import ImageDataGenerator +from keras.applications.inception_v3 import preprocess_input + +train_idg = ImageDataGenerator( + # horizontal_flip=True, + preprocessing_function=preprocess_input +) +train_gen = train_idg.flow_from_directory( + './data/train', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) +val_idg = ImageDataGenerator( + # horizontal_flip=True, + preprocessing_function=preprocess_input +) + +val_gen = val_idg.flow_from_directory( + './data/val', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size +) + +from keras.applications import inception_v3, mobilenet_v2, vgg16 +from keras.models import Sequential +from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard +from keras import optimizers +from keras.layers import Dense, Dropout, GlobalAveragePooling2D + +nclass = len(train_gen.class_indices) + +# base_model = vgg16.VGG16( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) +# base_model = inception_v3.InceptionV3( +# weights='imagenet', +# include_top=False, +# input_shape=input_shape +# ) + +base_model = mobilenet_v2.MobileNetV2( + weights='imagenet', + include_top=False, + input_shape=input_shape +) +base_model.trainable = False + +add_model = Sequential() +add_model.add(base_model) +add_model.add(GlobalAveragePooling2D()) +add_model.add(Dropout(0.5)) +add_model.add(Dense(1024, activation='relu')) +# Adding some dense layers in order to learn 
complex functions from the base model +add_model.add(Dropout(0.5)) +add_model.add(Dense(512, activation='relu')) +add_model.add(Dense(nclass, activation='softmax')) # Decision layer + +model = add_model +model.compile(loss='categorical_crossentropy', + # optimizer=optimizers.SGD(lr=1e-4, momentum=0.9), + optimizer=optimizers.Adam(lr=1e-4), + metrics=['accuracy']) +model.summary() + +# Train the model +file_path = "weights.mobilenet.best.hdf5" + +checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max') + +early = EarlyStopping(monitor="val_acc", mode="max", patience=15) + +tensorboard = TensorBoard( + log_dir="logs/" + model_name + "{}".format(time()), histogram_freq=0, batch_size=batch_size, + write_graph=True, + write_grads=True, + write_images=True, + update_freq=batch_size + +) + +callbacks_list = [checkpoint, early, tensorboard] # early + +history = model.fit_generator( + train_gen, + steps_per_epoch=len(train_gen), + validation_data=val_gen, + validation_steps=len(val_gen), + epochs=5, + shuffle=True, + verbose=True, + callbacks=callbacks_list + +) + +# Create Test generator +test_idg = ImageDataGenerator( + preprocessing_function=preprocess_input, +) + +test_gen = test_idg.flow_from_directory( + './data/test', + target_size=(input_shape[0], input_shape[1]), + batch_size=batch_size, + shuffle=False +) + +len(test_gen.filenames) + +score = model.evaluate_generator(test_gen, workers=1) + +# predicts +predicts = model.predict_generator(test_gen, verbose=True, workers=1) + +print("Loss: ", score[0], "Accuracy: ", score[1]) +print(score) + +print(predicts) +print(type(predicts)) +print(predicts.shape) +# Process the predictions +predicts = np.argmax(predicts, + axis=1) +# test_gen.reset() +label_index = {v: k for k, v in train_gen.class_indices.items()} +predicts = [label_index[p] for p in predicts] +reals = [label_index[p] for p in test_gen.classes] + +# Save the results +print(label_index) +print(test_gen.classes) 
+print(test_gen.classes.shape)
+print(type(test_gen.classes))
+df = pd.DataFrame(columns=['fname', 'prediction', 'true_val'])
+df['fname'] = [x for x in test_gen.filenames]
+df['prediction'] = predicts
+df["true_val"] = reals
+df.to_csv("sub1.csv", index=False)
+
+# Processed the saved results
+from sklearn.metrics import accuracy_score, confusion_matrix
+
+acc = accuracy_score(reals, predicts)
+conf_mat = confusion_matrix(reals, predicts)
+print("Testing accuracy score is ", acc)
+print("Confusion Matrix", conf_mat)
+
+df_cm = pd.DataFrame(conf_mat, index=[i for i in list(set(reals))],
+                     columns=[i for i in list(set(reals))])
+plt.figure(figsize=(10, 7))
+sn.heatmap(df_cm, annot=True)
+plt.show()
diff --git a/graphResults.py b/graphResults.py
new file mode 100755
index 0000000..220f040
--- /dev/null
+++ b/graphResults.py
@@ -0,0 +1,62 @@
+import tensorflow as tf
+import pandas as pd
+import numpy as np
+import os
+import seaborn as sn
+import matplotlib.pyplot as plt
+
+from sklearn.metrics import accuracy_score, confusion_matrix
+
+
+def print_preds(reals, preds):
+    acc = accuracy_score(reals, preds)
+    conf_mat = confusion_matrix(reals, preds)
+    print("Testing accuracy score is ", acc)
+    print("Confusion Matrix", conf_mat)
+
+    df_cm = pd.DataFrame(conf_mat, index=[i for i in ["Block", "Meter", "Sign"]],
+                         columns=[i for i in ["Block", "Meter", "Sign"]])
+    plt.figure(figsize=(10, 7))
+    sn.heatmap(df_cm, annot=True)
+    plt.show()
+
+
+data = pd.read_csv("sub1_non_transfer.csv")
+files_list = list(data["fname"])
+reals = list(data["true_val"])
+predicts = list(data["prediction"])
+
+reals2 = []
+wrong_files = []
+for root, dirs, files in os.walk(".\\photos"):
+    for file in files:
+        if file in files_list:
+            x = data.loc[data["fname"] == file].values[0]
+            if (x[1] != x[2]):
+                print(x)
+                wrong_files.append((os.path.join(root, file), x[1]))
+            reals2.append(root.split("\\")[-1])
+
+print_preds(reals, predicts)
+print_preds(reals2, predicts)
+
+import
matplotlib.image as mpimg +from shutil import copyfile, rmtree + +for file, pred in wrong_files: + print(file) + # img = mpimg.imread(file) + # # end + # # from now on you can use img as an image, but make sure you know what you are doing! + # imgplot = plt.imshow(img) + dest = file.split("\\") + dest[1] = "failed" + dest[-1] = pred + dest[-1] + dest = "\\".join(dest) + if not os.path.exists(os.path.dirname(dest)): + try: + os.makedirs(os.path.dirname(dest)) + except Exception as e: + print(e) + copyfile(file, dest) + plt.show() diff --git a/pokemon.csv b/pokemon.csv old mode 100644 new mode 100755 diff --git a/train.py b/train.py deleted file mode 100644 index 139597f..0000000 --- a/train.py +++ /dev/null @@ -1,2 +0,0 @@ - -