From 24e8d7130602fe8a1097e382bb00b5dd5b447888 Mon Sep 17 00:00:00 2001
From: Lucas Oskorep <lucas.oskorep@gmail.com>
Date: Fri, 26 Apr 2019 02:57:18 -0500
Subject: [PATCH] Made all processes threaded.  Also changed up the neural
 network to hopefully better support the higher class load

---
 1 - ImageGatherer.py      |  21 +++---
 2 - FixFileTypes.py       | 152 +++++++++++++++++++++++++-------------
 3 - TestTrainSplit.py     |  49 ++++++++----
 4 - TrainingModelKeras.py |  29 ++++----
 4 files changed, 160 insertions(+), 91 deletions(-)

diff --git a/1 - ImageGatherer.py b/1 - ImageGatherer.py
index 712a08b..3caaeed 100644
--- a/1 - ImageGatherer.py	
+++ b/1 - ImageGatherer.py	
@@ -3,23 +3,26 @@ import multiprocessing
 
 from google_images_download import google_images_download
 
-df = pd.read_csv("pokemon.csv")
-
-response = google_images_download.googleimagesdownload()
-
 
 def get_images_for_pokemon(pokemon):
+    response = google_images_download.googleimagesdownload()
     response.download(
         {
-            "keywords": pokemon,# + " pokemon",
+            "keywords": pokemon + " pokemon",
             "limit": 250,
             "chromedriver": "chromedriver",
-            "thumbnail":True
-        #     Add chromedriver to your path or just point this var directly to your chromedriver
+            "thumbnail": True
+            #     Add chromedriver to your path or just point this var directly to your chromedriver
         }
     )
 
-pool = multiprocessing.Pool(multiprocessing.cpu_count()*4)
 
-pool.map(get_images_for_pokemon, df["identifier"][:490])
+if __name__ == '__main__':
+    df = pd.read_csv("pokemon.csv")
 
+    # `fixes` lets you re-run only a few names; when empty, fetch them all.
+    fixes = []
+    keywords = fixes or list(df["identifier"])
+    # Use three worker processes per CPU core; the pool is closed on exit.
+    with multiprocessing.Pool(multiprocessing.cpu_count() * 3) as pool:
+        pool.map(get_images_for_pokemon, keywords)
\ No newline at end of file
diff --git a/2 - FixFileTypes.py b/2 - FixFileTypes.py
index 845abad..ec3c0d9 100644
--- a/2 - FixFileTypes.py	
+++ b/2 - FixFileTypes.py	
@@ -9,10 +9,12 @@ import imghdr
 import PIL
 from PIL import Image
 import sys
+import multiprocessing
+from threading import Thread, Lock
+
 
 directory = "downloads"
 
-
 def random_with_N_digits(n):
     range_start = 10 ** (n - 1)
     range_end = (10 ** n) - 1
@@ -26,12 +28,13 @@ def change_file_extension(file_obj, extension):
     elif not os.path.isfile(file_obj + extension):
         new_file = file_obj + extension
     else:
-        print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj)
+        # print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj)
         return
 
     print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file)
 
-    subprocess.run(['mv', file_obj, new_file])
+    # subprocess.run(['mv', file_obj, new_file])
+    os.rename(file_obj, new_file)
 
 
 def get_frames_from_gif(infile):
@@ -42,8 +45,7 @@ def get_frames_from_gif(infile):
         "Cant load", infile
         sys.exit(1)
 
-    i = 0
-
+    iterator = 0
     try:
         while 1:
             im2 = im.convert('RGBA')
@@ -52,57 +54,101 @@ def get_frames_from_gif(infile):
             background = Image.new("RGB", im2.size, (255, 255, 255))
             background.paste(im2, mask=im2.split()[3])
             background.save(filename, 'JPEG', quality=80)
-            print(f"FOUND GIF, SAVING FRAME AS {filename}")
-            i += 1
-            im.seek(im.tell() + 1)
-
+            # print(f"FOUND GIF, SAVING FRAME AS {filename}")
+            iterator += 1
+            while (iterator % 10 != 0):
+                im.seek(im.tell() + 1)
     except EOFError:
         pass  # end of sequence
 
 
-for root, dirs, files in os.walk(directory):
-
-    for file in files:
-
-        try:
-            file_obj = os.path.join(root, file)
-            exten = os.path.splitext(file)[1].lower()
-            img_type = imghdr.what(file_obj)
-            # print(file_obj)
-            if img_type is None:
-                os.remove(file_obj)
-            elif "jpeg" in img_type:
-                if "jpeg" not in exten and "jpg" not in exten:
-                    change_file_extension(file_obj, ".jpeg")
-            elif "png" in img_type:
-                if "png" not in exten:
-                    change_file_extension(file_obj, ".png")
-            elif "gif" in img_type:
-                get_frames_from_gif(file_obj)
-                os.remove(file_obj)
-            else:
-                os.remove(file_obj)
-
-        except Exception as e:
-            logging.error(traceback.format_exc())
-
 i = 1
-for root, dirs, files in os.walk(directory):
-    for file in files:
-        try:
-            file_obj = os.path.join(root, file)
-            path, file_base_name = os.path.split(file_obj)
-            old_path = os.path.splitext(file_base_name)
-            old_ext = old_path[1]
-            old_name = old_path[0]
-            new_file = os.path.join(path, str(i) + "-" + str(random_with_N_digits(10)) + old_ext)
-            if file_obj != new_file and "foo" not in old_name:
-                print(f"Moving file\n"
-                      f"{new_file}\n"
-                      f"{file_obj}")
-                subprocess.run(['mv', file_obj, new_file])
-                i += 1
-        except Exception as e:
-            logging.error(traceback.format_exc())
 
-print("Cleaning JPEGs done")
+
+def clean_image(file_root):
+    """Fix the extension of, convert, or delete one (root, file) image."""
+    global i
+    root = file_root[0]
+    file = file_root[1]
+    try:
+        file_obj = os.path.join(root, file)
+        exten = os.path.splitext(file)[1].lower()
+        img_type = imghdr.what(file_obj)
+        if img_type is None:
+            os.remove(file_obj)
+        elif "jpeg" in img_type:
+            if "jpeg" not in exten and "jpg" not in exten:
+                change_file_extension(file_obj, ".jpeg")
+        elif "png" in img_type:
+            if "png" not in exten:
+                change_file_extension(file_obj, ".png")
+        elif "gif" in img_type:
+            get_frames_from_gif(file_obj)
+            os.remove(file_obj)
+        else:
+            os.remove(file_obj)
+    except Exception as e:
+        logging.error(traceback.format_exc())
+    # NOTE: under multiprocessing.Pool each worker has its own `i` and lock.
+    with mutex:
+        i += 1
+        if i % 1 == 0:
+            print("changing type" + str(i))
+
+
+ii = 1
+
+
+def rename_images(file_root):
+    """Rename one (root, file) image to a counter-plus-random-number name."""
+    global ii
+    root = file_root[0]
+    file = file_root[1]
+    try:
+        file_obj = os.path.join(root, file)
+        path, file_base_name = os.path.split(file_obj)
+        old_path = os.path.splitext(file_base_name)
+        old_ext = old_path[1]
+        old_name = old_path[0]
+        # `with` releases the lock even if building the new name raises.
+        with mutex:
+            ii += 1
+            new_file = os.path.join(path, str(ii) + "-" + str(random_with_N_digits(10)) + old_ext)
+            if ii % 1000 == 0:
+                print(f"Moving file"
+                      f"{new_file}"
+                      f"{file_obj} - {ii}")
+
+        # Files whose name contains "foo" are deliberately left alone.
+        if file_obj != new_file and "foo" not in old_name:
+            # os.rename avoids spawning an external `mv` for every file.
+            os.rename(file_obj, new_file)
+    except Exception as e:
+        logging.error(traceback.format_exc())
+
+mutex = Lock()
+
+
+if __name__ == '__main__':
+    # The context manager shuts the worker processes down when both passes
+    # are finished, instead of leaving the pool open.
+    with multiprocessing.Pool(multiprocessing.cpu_count()) as pool:
+        # Pass 1: fix extensions, turn GIFs into JPEG frames, delete the rest.
+        file_root_list = [
+            (root, file)
+            for root, dirs, files in os.walk(directory)
+            for file in files
+        ]
+        pool.map(clean_image, file_root_list)
+
+        # Pass 2: walk the tree again, because pass 1 renamed, added and
+        # removed files, then give every remaining file a fresh name.
+        file_root_list = [
+            (root, file)
+            for root, dirs, files in os.walk(directory)
+            for file in files
+        ]
+        pool.map(rename_images, file_root_list)
+
+    print("Cleaning JPEGs done")
+
diff --git a/3 - TestTrainSplit.py b/3 - TestTrainSplit.py
index f83d08a..618ba02 100644
--- a/3 - TestTrainSplit.py	
+++ b/3 - TestTrainSplit.py	
@@ -1,6 +1,7 @@
 import os
 from random import random
 from shutil import copyfile, rmtree
+import multiprocessing
 
 train_dir = "./data/train/"
 test_dir = "./data/test/"
@@ -12,7 +13,6 @@ val = .05
 
 def add_train_data(file, filename, label):
     dest = train_dir + label + "/" + filename
-    print(dest, label, filename)
     if not os.path.exists(os.path.dirname(dest)):
         try:
             os.makedirs(os.path.dirname(dest))
@@ -56,25 +56,42 @@ def remove_previous():
         rmtree(val_dir)
 
 
-remove_previous()
 files_processed = 0
+def test_split_file(file_root):
+    """Randomly route one (root, file) pair to the train, val or test set."""
+    global files_processed
+    root = file_root[0]
+    file = file_root[1]
 
-for root, dirs, files in os.walk("downloads/"):
+    if file == ".DS_Store":
+        return
+    c = random()
 
-    for file in files:
-        print(file)
+    if c < train:
+        add_train_data(os.path.join(root, file), file, root.split("/")[-1])
+    elif c < (train + val):
+        add_val_data(os.path.join(root, file), file, root.split("/")[-1])
+    else:
+        add_test_data(os.path.join(root, file), file, root.split("/")[-1])
+    files_processed += 1
 
-        if file is ".DS_Store":
-            continue
-        c = random()
-
-        if c < train:
-            add_train_data(os.path.join(root, file), file, root.split("/")[-1])
-        elif c < (train + val):
-            add_val_data(os.path.join(root, file), file, root.split("/")[-1])
-        else:
-            add_test_data(os.path.join(root, file), file, root.split("/")[-1])
-        files_processed += 1
+    if files_processed % 1000 == 0:
         print(root.split("/")[-1])
         print(files_processed)
         print(file)
+
+
+if __name__ == '__main__':
+    remove_previous()
+
+    file_root_list = [
+        (root, file)
+        for root, dirs, files in os.walk("downloads/")
+        for file in files
+    ]
+
+    # Two worker processes per CPU core; the context manager shuts the
+    # pool down once every file has been handled.
+    with multiprocessing.Pool(multiprocessing.cpu_count() * 2) as pool:
+        pool.map(test_split_file, file_root_list)
+
diff --git a/4 - TrainingModelKeras.py b/4 - TrainingModelKeras.py
index 0e185f2..558117e 100644
--- a/4 - TrainingModelKeras.py	
+++ b/4 - TrainingModelKeras.py	
@@ -21,10 +21,10 @@ from PIL import ImageFile
 
 ImageFile.LOAD_TRUNCATED_IMAGES = True
 
-input_shape = (299, 299, 3)
-batch_size = 32
+input_shape = (224, 224, 3)
+batch_size = 60
 
-model_name = "InceptionV3Full"
+model_name = "mobilenet"
 
 # Next we set up the Image Data Generators to feed into the training cycles.
 # We need one for training, validation, and testing
@@ -41,6 +41,7 @@ train_gen = train_idg.flow_from_directory(
     target_size=(input_shape[0], input_shape[1]),
     batch_size=batch_size
 )
+
 print(len(train_gen.classes))
 
 val_idg = ImageDataGenerator(
@@ -75,26 +76,28 @@ test_gen = test_idg.flow_from_directory(
 #     include_top=False,
 #     input_shape=input_shape
 # )
-base_model = inception_v3.InceptionV3(
-    weights='imagenet',
-    include_top=False,
-    input_shape=input_shape
-)
-
-# base_model = mobilenet_v2.MobileNetV2(
+# base_model = inception_v3.InceptionV3(
 #     weights='imagenet',
 #     include_top=False,
 #     input_shape=input_shape
 # )
 
+base_model = mobilenet_v2.MobileNetV2(
+    weights='imagenet',
+    include_top=False,
+    input_shape=input_shape
+)
+
 
 # Create a new top for that model
 add_model = Sequential()
 add_model.add(base_model)
 add_model.add(GlobalAveragePooling2D())
+add_model.add(Dense(4048, activation='relu'))
 add_model.add(Dropout(0.5))
-add_model.add(
-    Dense(1024, activation='relu'))  # Adding some dense layers in order to learn complex functions from the base model
+
+add_model.add(Dense(2024, activation='relu'))
+# Adding some dense layers in order to learn complex functions from the base model
 # Potentially throw another dropout layer here if you seem to be overfitting your
 add_model.add(Dropout(0.5))
 add_model.add(Dense(512, activation='relu'))
@@ -133,7 +136,7 @@ history = model.fit_generator(
     validation_data=val_gen,
     steps_per_epoch=len(train_gen),
     validation_steps=len(val_gen),
-    epochs=60,
+    epochs=25,
     shuffle=True,
     verbose=True,
     callbacks=callbacks_list