From 24e8d7130602fe8a1097e382bb00b5dd5b447888 Mon Sep 17 00:00:00 2001 From: Lucas Oskorep Date: Fri, 26 Apr 2019 02:57:18 -0500 Subject: [PATCH] Made all processes threaded. Also changed up the neural network to hopefully better support the higher class load --- 1 - ImageGatherer.py | 21 +++--- 2 - FixFileTypes.py | 152 +++++++++++++++++++++++++------------- 3 - TestTrainSplit.py | 49 ++++++++---- 4 - TrainingModelKeras.py | 29 ++++---- 4 files changed, 160 insertions(+), 91 deletions(-) diff --git a/1 - ImageGatherer.py b/1 - ImageGatherer.py index 712a08b..3caaeed 100644 --- a/1 - ImageGatherer.py +++ b/1 - ImageGatherer.py @@ -3,23 +3,26 @@ import multiprocessing from google_images_download import google_images_download -df = pd.read_csv("pokemon.csv") - -response = google_images_download.googleimagesdownload() - def get_images_for_pokemon(pokemon): + response = google_images_download.googleimagesdownload() response.download( { - "keywords": pokemon,# + " pokemon", + "keywords": pokemon + " pokemon", "limit": 250, "chromedriver": "chromedriver", - "thumbnail":True - # Add chromedriver to your path or just point this var directly to your chromedriver + "thumbnail": True + # Add chromedriver to your path or just point this var directly to your chromedriverv } ) -pool = multiprocessing.Pool(multiprocessing.cpu_count()*4) -pool.map(get_images_for_pokemon, df["identifier"][:490]) + # freeze_support() + df = pd.read_csv("pokemon.csv") + pool = multiprocessing.Pool(multiprocessing.cpu_count()*3) + fixes = [] + pool.map(get_images_for_pokemon, [fixes])#df["identifier"] + + # for pokemon in df["identifier"][:490]: + # get_images_for_pokemon(pokemon) \ No newline at end of file diff --git a/2 - FixFileTypes.py b/2 - FixFileTypes.py index 845abad..ec3c0d9 100644 --- a/2 - FixFileTypes.py +++ b/2 - FixFileTypes.py @@ -9,10 +9,12 @@ import imghdr import PIL from PIL import Image import sys +import multiprocessing +from threading import Thread, Lock + directory = "downloads" - def random_with_N_digits(n): range_start = 10 ** (n - 1) range_end = (10 ** n) - 1 @@ -26,12 +28,13 @@ def change_file_extension(file_obj, extension): elif not os.path.isfile(file_obj + extension): new_file = file_obj + extension else: - print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj) + # print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj) return print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file) - subprocess.run(['mv', file_obj, new_file]) + # subprocess.run(['mv', file_obj, new_file]) + os.rename(file_obj, new_file) def get_frames_from_gif(infile): @@ -42,8 +45,7 @@ def get_frames_from_gif(infile): "Cant load", infile sys.exit(1) - i = 0 - + iterator = 0 try: while 1: im2 = im.convert('RGBA') @@ -52,57 +54,101 @@ def get_frames_from_gif(infile): background = Image.new("RGB", im2.size, (255, 255, 255)) background.paste(im2, mask=im2.split()[3]) background.save(filename, 'JPEG', quality=80) - print(f"FOUND GIF, SAVING FRAME AS {filename}") - i += 1 - im.seek(im.tell() + 1) - + # print(f"FOUND GIF, SAVING FRAME AS {filename}") + iterator += 1 + while (iterator % 10 != 0): + im.seek(im.tell() + 1) except EOFError: pass # end of sequence -for root, dirs, files in os.walk(directory): - - for file in files: - - try: - file_obj = os.path.join(root, file) - exten = os.path.splitext(file)[1].lower() - img_type = imghdr.what(file_obj) - # print(file_obj) - if img_type is None: - os.remove(file_obj) - elif "jpeg" in img_type: - if "jpeg" not in exten and "jpg" not in exten: - change_file_extension(file_obj, ".jpeg") - elif "png" in img_type: - if "png" not in exten: - change_file_extension(file_obj, ".png") - elif "gif" in img_type: - get_frames_from_gif(file_obj) - os.remove(file_obj) - else: - os.remove(file_obj) - - except Exception as e: - logging.error(traceback.format_exc()) - i = 1 -for root, dirs, files in os.walk(directory): - for file in files: - try: - file_obj = os.path.join(root, file) - path, file_base_name = os.path.split(file_obj) - old_path = os.path.splitext(file_base_name) - old_ext = old_path[1] - old_name = old_path[0] - new_file = os.path.join(path, str(i) + "-" + str(random_with_N_digits(10)) + old_ext) - if file_obj != new_file and "foo" not in old_name: - print(f"Moving file\n" - f"{new_file}\n" - f"{file_obj}") - subprocess.run(['mv', file_obj, new_file]) - i += 1 - except Exception as e: - logging.error(traceback.format_exc()) -print("Cleaning JPEGs done") + +def clean_image(file_root): + root = file_root[0] + file = file_root[1] + try: + file_obj = os.path.join(root, file) + exten = os.path.splitext(file)[1].lower() + img_type = imghdr.what(file_obj) + # print(file_obj) + if img_type is None: + os.remove(file_obj) + elif "jpeg" in img_type: + if "jpeg" not in exten and "jpg" not in exten: + change_file_extension(file_obj, ".jpeg") + elif "png" in img_type: + if "png" not in exten: + change_file_extension(file_obj, ".png") + elif "gif" in img_type: + get_frames_from_gif(file_obj) + os.remove(file_obj) + else: + os.remove(file_obj) + except Exception as e: + logging.error(traceback.format_exc()) + mutex.acquire() + global i + i += 1 + if i % 1 == 0: + print("changing type" + str(i)) + mutex.release() + + +ii = 1 + + +def rename_images(file_root): + root = file_root[0] + file = file_root[1] + try: + file_obj = os.path.join(root, file) + path, file_base_name = os.path.split(file_obj) + old_path = os.path.splitext(file_base_name) + old_ext = old_path[1] + old_name = old_path[0] + mutex.acquire() + global ii + ii += 1 + new_file = os.path.join(path, str(ii) + "-" + str(random_with_N_digits(10)) + old_ext) + if ii % 1000 == 0: + print(f"Moving file" + f"{new_file}" + f"{file_obj} - {ii}") + mutex.release() + + if file_obj != new_file and "foo" not in old_name: + # subprocess.run(['mv', file_obj, new_file]) + os.rename(file_obj, new_file) + + + except Exception as e: + logging.error(traceback.format_exc()) + +mutex = Lock() + + +if __name__ == '__main__': + + + pool = multiprocessing.Pool(multiprocessing.cpu_count()) + + file_root_list = [] + + for root, dirs, files in os.walk(directory): + for file in files: + file_root_list.append((root, file)) + + pool.map(clean_image, file_root_list) + + file_root_list = [] + + for root, dirs, files in os.walk(directory): + for file in files: + file_root_list.append((root, file)) + + pool.map(rename_images, file_root_list) + + print("Cleaning JPEGs done") + diff --git a/3 - TestTrainSplit.py b/3 - TestTrainSplit.py index f83d08a..618ba02 100644 --- a/3 - TestTrainSplit.py +++ b/3 - TestTrainSplit.py @@ -1,6 +1,7 @@ import os from random import random from shutil import copyfile, rmtree +import multiprocessing train_dir = "./data/train/" test_dir = "./data/test/" @@ -12,7 +13,6 @@ val = .05 def add_train_data(file, filename, label): dest = train_dir + label + "/" + filename - print(dest, label, filename) if not os.path.exists(os.path.dirname(dest)): try: os.makedirs(os.path.dirname(dest)) @@ -56,25 +56,42 @@ def remove_previous(): rmtree(val_dir) -remove_previous() files_processed = 0 +def test_split_file(file_root): + global files_processed + root = file_root[0] + file = file_root[1] + # print(file) -for root, dirs, files in os.walk("downloads/"): + if file is ".DS_Store": + return + c = random() - for file in files: - print(file) + if c < train: + add_train_data(os.path.join(root, file), file, root.split("/")[-1]) + elif c < (train + val): + add_val_data(os.path.join(root, file), file, root.split("/")[-1]) + else: + add_test_data(os.path.join(root, file), file, root.split("/")[-1]) + files_processed += 1 - if file is ".DS_Store": - continue - c = random() - - if c < train: - add_train_data(os.path.join(root, file), file, root.split("/")[-1]) - elif c < (train + val): - add_val_data(os.path.join(root, file), file, root.split("/")[-1]) - else: - add_test_data(os.path.join(root, file), file, root.split("/")[-1]) - files_processed += 1 + if files_processed % 1000==0: print(root.split("/")[-1]) print(files_processed) print(file) + + +if __name__ == '__main__': + remove_previous() + + file_root_list = [] + + for root, dirs, files in os.walk("downloads/"): + for file in files: + file_root_list.append((root, file)) + + + pool = multiprocessing.Pool(multiprocessing.cpu_count()*2) + + pool.map(test_split_file, file_root_list) + diff --git a/4 - TrainingModelKeras.py b/4 - TrainingModelKeras.py index 0e185f2..558117e 100644 --- a/4 - TrainingModelKeras.py +++ b/4 - TrainingModelKeras.py @@ -21,10 +21,10 @@ from PIL import ImageFile ImageFile.LOAD_TRUNCATED_IMAGES = True -input_shape = (299, 299, 3) -batch_size = 32 +input_shape = (224, 224, 3) +batch_size = 60 -model_name = "InceptionV3Full" +model_name = "mobilenet" # Next we set up the Image Data Generators to feed into the training cycles. # We need one for training, validation, and testing @@ -41,6 +41,7 @@ train_gen = train_idg.flow_from_directory( target_size=(input_shape[0], input_shape[1]), batch_size=batch_size ) + print(len(train_gen.classes)) val_idg = ImageDataGenerator( @@ -75,26 +76,28 @@ test_gen = test_idg.flow_from_directory( # include_top=False, # input_shape=input_shape # ) -base_model = inception_v3.InceptionV3( - weights='imagenet', - include_top=False, - input_shape=input_shape -) - -# base_model = mobilenet_v2.MobileNetV2( +# base_model = inception_v3.InceptionV3( # weights='imagenet', # include_top=False, # input_shape=input_shape # ) +base_model = mobilenet_v2.MobileNetV2( + weights='imagenet', + include_top=False, + input_shape=input_shape +) + # Create a new top for that model add_model = Sequential() add_model.add(base_model) add_model.add(GlobalAveragePooling2D()) +add_model.add(Dense(4048, activation='relu')) add_model.add(Dropout(0.5)) -add_model.add( - Dense(1024, activation='relu')) # Adding some dense layers in order to learn complex functions from the base model + +add_model.add(Dense(2024, activation='relu')) +# Adding some dense layers in order to learn complex functions from the base model # Potentially throw another dropout layer here if you seem to be overfitting your add_model.add(Dropout(0.5)) add_model.add(Dense(512, activation='relu')) @@ -133,7 +136,7 @@ history = model.fit_generator( validation_data=val_gen, steps_per_epoch=len(train_gen), validation_steps=len(val_gen), - epochs=60, + epochs=25, shuffle=True, verbose=True, callbacks=callbacks_list