Adding the first data-preprocessing step, as well as the first model training.

This commit is contained in:
Lucas Oskorep
2019-04-14 14:32:08 -05:00
parent 81a48a51ae
commit bc44d30180
11 changed files with 600 additions and 4 deletions
Regular → Executable
View File
+108
View File
@@ -0,0 +1,108 @@
import glob
import subprocess
import os
import re
import logging
import traceback
from random import randint
import imghdr
import PIL
from PIL import Image
import sys
# Root folder produced by the image-download script; every cleanup pass
# below walks this tree in place.
directory = "downloads"
def random_with_N_digits(n):
    """Return a uniformly random integer with exactly *n* decimal digits."""
    lower = 10 ** (n - 1)
    upper = 10 ** n - 1
    return randint(lower, upper)
def change_file_extension(file_obj, extension):
    """Rename *file_obj* so it carries *extension* (e.g. ".png").

    Tries ``base + extension`` first, then ``original_name + extension`` if
    that already exists; gives up (with a message) if both are taken.
    """
    old_path = os.path.splitext(file_obj)
    if not os.path.isfile(old_path[0] + extension):
        new_file = old_path[0] + extension
    elif not os.path.isfile(file_obj + extension):
        new_file = file_obj + extension
    else:
        print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj)
        return
    print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file)
    # os.rename is portable and avoids spawning a `mv` subprocess per file
    # (the original shelled out to POSIX-only `mv`).
    os.rename(file_obj, new_file)
def get_frames_from_gif(infile):
    """Explode an animated GIF into per-frame JPEGs next to the source file.

    Each frame is converted to RGBA, composited onto a white background
    (so transparency becomes white), and saved as fooN.jpg at quality 80.
    Exits the process if the GIF cannot be opened.
    """
    try:
        im = Image.open(infile)
    except IOError:
        # BUG FIX: the original had a Python-2-style `print` split across two
        # lines, which is a silent no-op in Python 3 — the failure message
        # was never shown before exiting.
        print("Cant load", infile)
        sys.exit(1)
    i = 0
    try:
        while True:
            im2 = im.convert('RGBA')
            im2.load()
            filename = os.path.join(os.path.dirname(infile), 'foo' + str(i) + '.jpg')
            background = Image.new("RGB", im2.size, (255, 255, 255))
            # Use the alpha band as a paste mask so transparent pixels
            # become white instead of black.
            background.paste(im2, mask=im2.split()[3])
            background.save(filename, 'JPEG', quality=80)
            # BUG FIX: the message previously printed a literal placeholder
            # instead of the actual frame path.
            print(f"FOUND GIF, SAVING FRAME AS {filename}")
            i += 1
            im.seek(im.tell() + 1)
    except EOFError:
        pass  # end of sequence — no explicit frame count in the GIF API
# Pass 1: normalise every downloaded file by its *actual* image type.
# Files that aren't images are deleted; JPEG/PNG files with wrong
# extensions are renamed; GIFs are exploded into frames and removed.
for root, dirs, files in os.walk(directory):
    for file in files:
        try:
            file_obj = os.path.join(root, file)
            exten = os.path.splitext(file)[1].lower()
            # imghdr sniffs the real format from file contents, not the name.
            img_type = imghdr.what(file_obj)
            if img_type is None:
                os.remove(file_obj)  # not an image at all
            elif "jpeg" in img_type:
                if "jpeg" not in exten and "jpg" not in exten:
                    change_file_extension(file_obj, ".jpeg")
            elif "png" in img_type:
                if "png" not in exten:
                    change_file_extension(file_obj, ".png")
            elif "gif" in img_type:
                get_frames_from_gif(file_obj)
                os.remove(file_obj)  # frames saved; the GIF itself goes
            else:
                os.remove(file_obj)  # unsupported image type
        except Exception as e:
            # Best-effort cleanup: log and keep walking.
            logging.error(traceback.format_exc())

# Pass 2: give every surviving file a unique "<counter>-<random10digits>"
# name, skipping the fooN.jpg GIF frames produced above.
i = 1
for root, dirs, files in os.walk(directory):
    for file in files:
        try:
            file_obj = os.path.join(root, file)
            path, file_base_name = os.path.split(file_obj)
            old_path = os.path.splitext(file_base_name)
            old_ext = old_path[1]
            old_name = old_path[0]
            new_file = os.path.join(path, str(i) + "-" + str(random_with_N_digits(10)) + old_ext)
            # NOTE(review): original indentation was lost; the rename and
            # counter increment are assumed to be inside this guard so that
            # GIF-frame files ("foo*") are left untouched — confirm.
            if file_obj != new_file and "foo" not in old_name:
                print(f"Moving file\n"
                      f"{new_file}\n"
                      f"{file_obj}")
                subprocess.run(['mv', file_obj, new_file])
                i += 1
        except Exception as e:
            logging.error(traceback.format_exc())
print("Cleaning JPEGs done")
Regular → Executable
+4 -2
View File
@@ -6,12 +6,14 @@ df = pd.read_csv("pokemon.csv")
response = google_images_download.googleimagesdownload() response = google_images_download.googleimagesdownload()
for pokemon in df["identifier"][:251]: for pokemon in ["abra", "xatu", "yanma", "zapdos", "zubat"]: # df["identifier"][:251]:
absolute_image_paths = response.download( absolute_image_paths = response.download(
{ {
"keywords": pokemon, "keywords": pokemon,
"limit": 250, "limit": 250,
"chromedriver": "/usr/lib/chromium-browser/chromedriver" "chromedriver": "/usr/lib/chromium-browser/chromedriver",
# This needs to be changed based on the computer trying to download the images
"format": "jpg"
} }
) )
+12
View File
@@ -0,0 +1,12 @@
from tensorflow import keras
from tensorflow.contrib import lite

# Best checkpoint produced by the MobileNet training run.
keras_file = "weights.mobilenet.non-transfer.best.hdf5"

# Sanity-check that the checkpoint loads before converting it.
# (BUG FIX: the original called load_model twice, discarding the first
# result — one load is sufficient.)
h5_model = keras.models.load_model(keras_file)

# TOCO converts the Keras HDF5 checkpoint into a TensorFlow Lite
# flatbuffer suitable for mobile deployment.
converter = lite.TocoConverter.from_keras_model_file(keras_file)
tflite_model = converter.convert()
with open('mobilenet.tflite', 'wb') as f:
    f.write(tflite_model)
Regular → Executable
View File
+80
View File
@@ -0,0 +1,80 @@
import os
from random import random
from shutil import copyfile, rmtree
# Destination directories for the train/test/validation split.
train_dir = "./data/train/"
test_dir = "./data/test/"
val_dir = "./data/val/"

# Split fractions; must sum to 1.0 (75% train / 20% test / 5% validation).
train = .75
test = .20
val = .05
def add_train_data(file, filename, label):
    """Copy *file* into the training set under its class *label*.

    If the copy fails (unreadable/corrupt source), the source file is
    deleted so it cannot poison later runs.
    """
    dest = train_dir + label + "/" + filename
    print(dest, label, filename)
    try:
        # exist_ok avoids the check-then-create race of the original code.
        os.makedirs(os.path.dirname(dest), exist_ok=True)
    except OSError as e:
        print(e)
    try:
        copyfile(file, dest)
    except Exception as e:
        print(e)
        print("INVALID FILE")
        os.remove(file)
def add_val_data(file, filename, label):
    """Copy *file* into the validation set under its class *label*."""
    dest = val_dir + label + "/" + filename
    parent = os.path.dirname(dest)
    if not os.path.exists(parent):
        try:
            os.makedirs(parent)
        except Exception as err:
            print(err)
    copyfile(file, dest)
def add_test_data(file, filename, label):
    """Copy *file* into the test set under its class *label*."""
    dest = test_dir + label + "/" + filename
    parent = os.path.dirname(dest)
    if not os.path.exists(parent):
        try:
            os.makedirs(parent)
        except Exception as err:
            print(err)
    copyfile(file, dest)
def remove_previous():
    """Delete any train/test/val output left over from a previous split run."""
    for target in (test_dir, train_dir, val_dir):
        if os.path.exists(os.path.dirname(target)):
            rmtree(target)
# Wipe any previous split, then randomly assign every downloaded image to
# train/val/test according to the fractions defined above.
remove_previous()
files_processed = 0
for root, dirs, files in os.walk("downloads/"):
    for file in files:
        print(file)
        # BUG FIX: the original used `file is ".DS_Store"`, an identity
        # comparison that is never reliably true for strings; use equality
        # so macOS metadata files are actually skipped.
        if file == ".DS_Store":
            continue
        c = random()
        # The class label is the name of the immediate parent directory.
        label = root.split("/")[-1]
        if c < train:
            add_train_data(os.path.join(root, file), file, label)
        elif c < (train + val):
            add_val_data(os.path.join(root, file), file, label)
        else:
            add_test_data(os.path.join(root, file), file, label)
        files_processed += 1
        print(label)
print(files_processed)
print(file)
+167
View File
@@ -0,0 +1,167 @@
# ---------------------------------------------------------------------------
# Train MobileNetV2 from ImageNet weights with ALL layers trainable (no
# transfer-learning freeze) on the directory-per-class dataset produced by
# the preprocessing scripts, then evaluate on the held-out test split and
# dump predictions to CSV.
# ---------------------------------------------------------------------------
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import seaborn as sn
import matplotlib.pyplot as plt
from tensorflow import keras
from time import time
from PIL import ImageFile

# Some scraped images are truncated; let PIL load them anyway instead of
# raising mid-epoch.
ImageFile.LOAD_TRUNCATED_IMAGES = True

input_shape = (224, 224, 3)  # MobileNetV2's default input resolution
batch_size = 32
model_name = "MobileNetV2FullDatasetNoTransfer"

from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_v3 import preprocess_input

# Training generator: horizontal flips for augmentation plus the
# Inception-style pixel scaling used as the preprocessing function.
train_idg = ImageDataGenerator(
    horizontal_flip=True,
    preprocessing_function=preprocess_input
)
train_gen = train_idg.flow_from_directory(
    './data/train',
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size
)
# Validation generator mirrors the training preprocessing.
val_idg = ImageDataGenerator(
    horizontal_flip=True,
    preprocessing_function=preprocess_input
)
val_gen = val_idg.flow_from_directory(
    './data/val',
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size
)

from keras.applications import inception_v3, mobilenet_v2, vgg16
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras import optimizers
from keras.layers import Dense, Dropout, GlobalAveragePooling2D

# Number of output classes is inferred from the training directory layout.
nclass = len(train_gen.class_indices)

# Alternative backbones that were tried; kept for reference.
# base_model = vgg16.VGG16(
#     weights='imagenet',
#     include_top=False,
#     input_shape=input_shape
# )
# base_model = inception_v3.InceptionV3(
#     weights='imagenet',
#     include_top=False,
#     input_shape=input_shape
# )
base_model = mobilenet_v2.MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=input_shape
)

# Classification head: GAP -> Dense(1024) -> Dense(512) -> softmax, with
# dropout between the large dense layers.
add_model = Sequential()
add_model.add(base_model)
add_model.add(GlobalAveragePooling2D())
add_model.add(Dropout(0.5))
add_model.add(
    Dense(1024, activation='relu'))  # dense layers learn task-specific features on top of the backbone
# Potentially add another dropout layer here if the model overfits.
add_model.add(Dropout(0.5))
add_model.add(Dense(512, activation='relu'))
add_model.add(Dense(nclass, activation='softmax'))  # decision layer
model = add_model
model.compile(loss='categorical_crossentropy',
              # optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
              optimizer=optimizers.Adam(lr=1e-4),
              metrics=['accuracy'])
model.summary()

# Train the model: checkpoint the best val_acc, stop early after 15 stale
# epochs, and log to TensorBoard.
file_path = "weights.mobilenet.non-transfer.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=15)
tensorboard = TensorBoard(
    log_dir="logs/" + model_name + "{}".format(time()), histogram_freq=0, batch_size=32,
    write_graph=True,
    write_grads=True,
    write_images=True,
    update_freq=batch_size
)
callbacks_list = [checkpoint, early, tensorboard]  # early
history = model.fit_generator(
    train_gen,
    validation_data=val_gen,
    epochs=2,
    shuffle=True,
    verbose=True,
    callbacks=callbacks_list
)

# Test generator: no augmentation, and shuffle=False so predictions line up
# with test_gen.filenames / test_gen.classes below.
test_idg = ImageDataGenerator(
    preprocessing_function=preprocess_input,
)
test_gen = test_idg.flow_from_directory(
    './data/test',
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size,
    shuffle=False
)
len(test_gen.filenames)
score = model.evaluate_generator(test_gen, workers=1)
# Per-sample class-probability predictions.
predicts = model.predict_generator(test_gen, verbose=True, workers=1)
keras_file = 'finished.h5'
keras.models.save_model(model, keras_file)
print("Loss: ", score[0], "Accuracy: ", score[1])
print(score)
print(predicts)
print(type(predicts))
print(predicts.shape)

# Process the predictions: argmax over probabilities -> class index.
predicts = np.argmax(predicts,
                     axis=1)
# test_gen.reset()
# Invert class_indices so indices map back to class names.
label_index = {v: k for k, v in train_gen.class_indices.items()}
predicts = [label_index[p] for p in predicts]
reals = [label_index[p] for p in test_gen.classes]

# Save the results: one row per test file with prediction and ground truth.
print(label_index)
print(test_gen.classes)
print(test_gen.classes.shape)
print(type(test_gen.classes))
df = pd.DataFrame(columns=['fname', 'prediction', 'true_val'])
df['fname'] = [x for x in test_gen.filenames]
df['prediction'] = predicts
df["true_val"] = reals
df.to_csv("sub1_non_transfer.csv", index=False)

# Post-process the saved results: accuracy + confusion-matrix heatmap.
from sklearn.metrics import accuracy_score, confusion_matrix

acc = accuracy_score(reals, predicts)
conf_mat = confusion_matrix(reals, predicts)
print("Testing accuracy score is ", acc)
print("Confusion Matrix", conf_mat)
df_cm = pd.DataFrame(conf_mat, index=[i for i in list(set(reals))],
                     columns=[i for i in list(set(reals))])
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True)
plt.show()
+167
View File
@@ -0,0 +1,167 @@
# ---------------------------------------------------------------------------
# Transfer-learning variant of the training script: MobileNetV2 backbone is
# FROZEN (trainable = False) and only the classification head is trained.
# Evaluates on the test split and dumps predictions to sub1.csv.
# ---------------------------------------------------------------------------
import pandas as pd
import numpy as np
import seaborn as sn
import matplotlib.pyplot as plt
from time import time
from PIL import ImageFile

# Some scraped images are truncated; let PIL load them anyway.
ImageFile.LOAD_TRUNCATED_IMAGES = True

input_shape = (224, 224, 3)  # MobileNetV2's default input resolution
batch_size = 60
model_name = "MobileNetV2FullDataset"

from keras.preprocessing.image import ImageDataGenerator
from keras.applications.inception_v3 import preprocess_input

# Training generator (flip augmentation disabled in this run).
train_idg = ImageDataGenerator(
    # horizontal_flip=True,
    preprocessing_function=preprocess_input
)
train_gen = train_idg.flow_from_directory(
    './data/train',
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size
)
val_idg = ImageDataGenerator(
    # horizontal_flip=True,
    preprocessing_function=preprocess_input
)
val_gen = val_idg.flow_from_directory(
    './data/val',
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size
)

from keras.applications import inception_v3, mobilenet_v2, vgg16
from keras.models import Sequential
from keras.callbacks import ModelCheckpoint, EarlyStopping, TensorBoard
from keras import optimizers
from keras.layers import Dense, Dropout, GlobalAveragePooling2D

# Number of output classes is inferred from the training directory layout.
nclass = len(train_gen.class_indices)

# Alternative backbones that were tried; kept for reference.
# base_model = vgg16.VGG16(
#     weights='imagenet',
#     include_top=False,
#     input_shape=input_shape
# )
# base_model = inception_v3.InceptionV3(
#     weights='imagenet',
#     include_top=False,
#     input_shape=input_shape
# )
base_model = mobilenet_v2.MobileNetV2(
    weights='imagenet',
    include_top=False,
    input_shape=input_shape
)
# Freeze the backbone: only the head below is updated during training.
base_model.trainable = False

add_model = Sequential()
add_model.add(base_model)
add_model.add(GlobalAveragePooling2D())
add_model.add(Dropout(0.5))
add_model.add(Dense(1024, activation='relu'))
# Dense layers learn task-specific features on top of the frozen backbone.
add_model.add(Dropout(0.5))
add_model.add(Dense(512, activation='relu'))
add_model.add(Dense(nclass, activation='softmax'))  # decision layer
model = add_model
model.compile(loss='categorical_crossentropy',
              # optimizer=optimizers.SGD(lr=1e-4, momentum=0.9),
              optimizer=optimizers.Adam(lr=1e-4),
              metrics=['accuracy'])
model.summary()

# Train the model: checkpoint the best val_acc, stop early after 15 stale
# epochs, and log to TensorBoard.
file_path = "weights.mobilenet.best.hdf5"
checkpoint = ModelCheckpoint(file_path, monitor='val_acc', verbose=1, save_best_only=True, mode='max')
early = EarlyStopping(monitor="val_acc", mode="max", patience=15)
tensorboard = TensorBoard(
    log_dir="logs/" + model_name + "{}".format(time()), histogram_freq=0, batch_size=batch_size,
    write_graph=True,
    write_grads=True,
    write_images=True,
    update_freq=batch_size
)
callbacks_list = [checkpoint, early, tensorboard]  # early
history = model.fit_generator(
    train_gen,
    steps_per_epoch=len(train_gen),
    validation_data=val_gen,
    validation_steps=len(val_gen),
    epochs=5,
    shuffle=True,
    verbose=True,
    callbacks=callbacks_list
)

# Test generator: no augmentation, shuffle=False so predictions line up
# with test_gen.filenames / test_gen.classes below.
test_idg = ImageDataGenerator(
    preprocessing_function=preprocess_input,
)
test_gen = test_idg.flow_from_directory(
    './data/test',
    target_size=(input_shape[0], input_shape[1]),
    batch_size=batch_size,
    shuffle=False
)
len(test_gen.filenames)
score = model.evaluate_generator(test_gen, workers=1)
# Per-sample class-probability predictions.
predicts = model.predict_generator(test_gen, verbose=True, workers=1)
print("Loss: ", score[0], "Accuracy: ", score[1])
print(score)
print(predicts)
print(type(predicts))
print(predicts.shape)

# Process the predictions: argmax over probabilities -> class index.
predicts = np.argmax(predicts,
                     axis=1)
# test_gen.reset()
# Invert class_indices so indices map back to class names.
label_index = {v: k for k, v in train_gen.class_indices.items()}
predicts = [label_index[p] for p in predicts]
reals = [label_index[p] for p in test_gen.classes]

# Save the results: one row per test file with prediction and ground truth.
print(label_index)
print(test_gen.classes)
print(test_gen.classes.shape)
print(type(test_gen.classes))
df = pd.DataFrame(columns=['fname', 'prediction', 'true_val'])
df['fname'] = [x for x in test_gen.filenames]
df['prediction'] = predicts
df["true_val"] = reals
df.to_csv("sub1.csv", index=False)

# Post-process the saved results: accuracy + confusion-matrix heatmap.
from sklearn.metrics import accuracy_score, confusion_matrix

acc = accuracy_score(reals, predicts)
conf_mat = confusion_matrix(reals, predicts)
print("Testing accuracy score is ", acc)
print("Confusion Matrix", conf_mat)
df_cm = pd.DataFrame(conf_mat, index=[i for i in list(set(reals))],
                     columns=[i for i in list(set(reals))])
plt.figure(figsize=(10, 7))
sn.heatmap(df_cm, annot=True)
plt.show()
+62
View File
@@ -0,0 +1,62 @@
import tensorflow as tf
import pandas as pd
import numpy as np
import os
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, confusion_matrix
def print_preds(reals, preds, labels=("Block", "Meter", "Sign")):
    """Print accuracy and a confusion matrix for *preds* vs *reals*, then
    show a heatmap of the matrix.

    Args:
        reals: ground-truth labels, one per sample.
        preds: predicted labels, one per sample. BUG FIX: the original body
            ignored this parameter and read the module-level ``predicts``
            instead, so every call scored the same predictions.
        labels: heatmap axis labels; defaults to the class names the
            original hard-coded, so existing callers are unchanged.
    """
    acc = accuracy_score(reals, preds)
    conf_mat = confusion_matrix(reals, preds)
    print("Testing accuracy score is ", acc)
    print("Confusion Matrix", conf_mat)
    df_cm = pd.DataFrame(conf_mat, index=list(labels),
                         columns=list(labels))
    plt.figure(figsize=(10, 7))
    sn.heatmap(df_cm, annot=True)
    plt.show()
# Re-analyse a saved prediction CSV: recompute accuracy/confusion matrix and
# copy every misclassified image into a "failed" folder for manual review.
# NOTE(review): path handling uses Windows separators (".\\photos", "\\") —
# this stage apparently ran on Windows; confirm before reusing elsewhere.
data = pd.read_csv("sub1_non_transfer.csv")
files_list = list(data["fname"])
reals = list(data["true_val"])
predicts = list(data["prediction"])
reals2 = []
wrong_files = []
for root, dirs, files in os.walk(".\\photos"):
    for file in files:
        if file in files_list:
            # Row layout is [fname, prediction, true_val]; a mismatch between
            # columns 1 and 2 means the model got this image wrong.
            x = data.loc[data["fname"] == file].values[0]
            if (x[1] != x[2]):
                print(x)
                wrong_files.append((os.path.join(root, file), x[1]))
            # Label inferred from the parent directory name on disk.
            reals2.append(root.split("\\")[-1])
print_preds(reals, predicts)
print_preds(reals2, predicts)

import matplotlib.image as mpimg
from shutil import copyfile, rmtree

for file, pred in wrong_files:
    print(file)
    # img = mpimg.imread(file)
    # imgplot = plt.imshow(img)
    # Build the destination path: photos\... -> failed\..., prepending the
    # wrong prediction to the file name so failures are self-describing.
    dest = file.split("\\")
    dest[1] = "failed"
    dest[-1] = pred + dest[-1]
    dest = "\\".join(dest)
    if not os.path.exists(os.path.dirname(dest)):
        try:
            os.makedirs(os.path.dirname(dest))
        except Exception as e:
            print(e)
    copyfile(file, dest)
plt.show()
Regular → Executable
View File
-2
View File
@@ -1,2 +0,0 @@