Made all processes threaded. Also changed up the neural network to hopefully better support the higher class load

This commit is contained in:
Lucas Oskorep
2019-04-26 02:57:18 -05:00
parent 493f5cfb25
commit 24e8d71306
4 changed files with 160 additions and 91 deletions
+12 -9
View File
@@ -3,23 +3,26 @@ import multiprocessing
from google_images_download import google_images_download from google_images_download import google_images_download
df = pd.read_csv("pokemon.csv")
response = google_images_download.googleimagesdownload()
def get_images_for_pokemon(pokemon): def get_images_for_pokemon(pokemon):
response = google_images_download.googleimagesdownload()
response.download( response.download(
{ {
"keywords": pokemon,# + " pokemon", "keywords": pokemon + " pokemon",
"limit": 250, "limit": 250,
"chromedriver": "chromedriver", "chromedriver": "chromedriver",
"thumbnail":True "thumbnail": True
# Add chromedriver to your path or just point this var directly to your chromedriver # Add chromedriver to your path or just point this var directly to your chromedriverv
} }
) )
pool = multiprocessing.Pool(multiprocessing.cpu_count()*4)
pool.map(get_images_for_pokemon, df["identifier"][:490]) # freeze_support()
df = pd.read_csv("pokemon.csv")
pool = multiprocessing.Pool(multiprocessing.cpu_count()*3)
fixes = []
pool.map(get_images_for_pokemon, [fixes])#df["identifier"]
# for pokemon in df["identifier"][:490]:
# get_images_for_pokemon(pokemon)
+67 -21
View File
@@ -9,10 +9,12 @@ import imghdr
import PIL import PIL
from PIL import Image from PIL import Image
import sys import sys
import multiprocessing
from threading import Thread, Lock
directory = "downloads" directory = "downloads"
def random_with_N_digits(n): def random_with_N_digits(n):
range_start = 10 ** (n - 1) range_start = 10 ** (n - 1)
range_end = (10 ** n) - 1 range_end = (10 ** n) - 1
@@ -26,12 +28,13 @@ def change_file_extension(file_obj, extension):
elif not os.path.isfile(file_obj + extension): elif not os.path.isfile(file_obj + extension):
new_file = file_obj + extension new_file = file_obj + extension
else: else:
print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj) # print(f"Found {extension} hiding as JPEG but couldn't rename:", file_obj)
return return
print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file) print(f"Found {extension} hiding as JPEG, renaming:", file_obj, '->', new_file)
subprocess.run(['mv', file_obj, new_file]) # subprocess.run(['mv', file_obj, new_file])
os.rename(file_obj, new_file)
def get_frames_from_gif(infile): def get_frames_from_gif(infile):
@@ -42,8 +45,7 @@ def get_frames_from_gif(infile):
"Cant load", infile "Cant load", infile
sys.exit(1) sys.exit(1)
i = 0 iterator = 0
try: try:
while 1: while 1:
im2 = im.convert('RGBA') im2 = im.convert('RGBA')
@@ -52,18 +54,20 @@ def get_frames_from_gif(infile):
background = Image.new("RGB", im2.size, (255, 255, 255)) background = Image.new("RGB", im2.size, (255, 255, 255))
background.paste(im2, mask=im2.split()[3]) background.paste(im2, mask=im2.split()[3])
background.save(filename, 'JPEG', quality=80) background.save(filename, 'JPEG', quality=80)
print(f"FOUND GIF, SAVING FRAME AS {filename}") # print(f"FOUND GIF, SAVING FRAME AS {filename}")
i += 1 iterator += 1
while (iterator % 10 != 0):
im.seek(im.tell() + 1) im.seek(im.tell() + 1)
except EOFError: except EOFError:
pass # end of sequence pass # end of sequence
for root, dirs, files in os.walk(directory): i = 1
for file in files:
def clean_image(file_root):
root = file_root[0]
file = file_root[1]
try: try:
file_obj = os.path.join(root, file) file_obj = os.path.join(root, file)
exten = os.path.splitext(file)[1].lower() exten = os.path.splitext(file)[1].lower()
@@ -82,27 +86,69 @@ for root, dirs, files in os.walk(directory):
os.remove(file_obj) os.remove(file_obj)
else: else:
os.remove(file_obj) os.remove(file_obj)
except Exception as e: except Exception as e:
logging.error(traceback.format_exc()) logging.error(traceback.format_exc())
mutex.acquire()
global i
i += 1
if i % 1 == 0:
print("changing type" + str(i))
mutex.release()
i = 1
for root, dirs, files in os.walk(directory): ii = 1
for file in files:
def rename_images(file_root):
root = file_root[0]
file = file_root[1]
try: try:
file_obj = os.path.join(root, file) file_obj = os.path.join(root, file)
path, file_base_name = os.path.split(file_obj) path, file_base_name = os.path.split(file_obj)
old_path = os.path.splitext(file_base_name) old_path = os.path.splitext(file_base_name)
old_ext = old_path[1] old_ext = old_path[1]
old_name = old_path[0] old_name = old_path[0]
new_file = os.path.join(path, str(i) + "-" + str(random_with_N_digits(10)) + old_ext) mutex.acquire()
global ii
ii += 1
new_file = os.path.join(path, str(ii) + "-" + str(random_with_N_digits(10)) + old_ext)
if ii % 1000 == 0:
print(f"Moving file"
f"{new_file}"
f"{file_obj} - {ii}")
mutex.release()
if file_obj != new_file and "foo" not in old_name: if file_obj != new_file and "foo" not in old_name:
print(f"Moving file\n" # subprocess.run(['mv', file_obj, new_file])
f"{new_file}\n" os.rename(file_obj, new_file)
f"{file_obj}")
subprocess.run(['mv', file_obj, new_file])
i += 1
except Exception as e: except Exception as e:
logging.error(traceback.format_exc()) logging.error(traceback.format_exc())
print("Cleaning JPEGs done") mutex = Lock()
if __name__ == '__main__':
pool = multiprocessing.Pool(multiprocessing.cpu_count())
file_root_list = []
for root, dirs, files in os.walk(directory):
for file in files:
file_root_list.append((root, file))
pool.map(clean_image, file_root_list)
file_root_list = []
for root, dirs, files in os.walk(directory):
for file in files:
file_root_list.append((root, file))
pool.map(rename_images, file_root_list)
print("Cleaning JPEGs done")
+25 -8
View File
@@ -1,6 +1,7 @@
import os import os
from random import random from random import random
from shutil import copyfile, rmtree from shutil import copyfile, rmtree
import multiprocessing
train_dir = "./data/train/" train_dir = "./data/train/"
test_dir = "./data/test/" test_dir = "./data/test/"
@@ -12,7 +13,6 @@ val = .05
def add_train_data(file, filename, label): def add_train_data(file, filename, label):
dest = train_dir + label + "/" + filename dest = train_dir + label + "/" + filename
print(dest, label, filename)
if not os.path.exists(os.path.dirname(dest)): if not os.path.exists(os.path.dirname(dest)):
try: try:
os.makedirs(os.path.dirname(dest)) os.makedirs(os.path.dirname(dest))
@@ -56,16 +56,15 @@ def remove_previous():
rmtree(val_dir) rmtree(val_dir)
remove_previous()
files_processed = 0 files_processed = 0
def test_split_file(file_root):
for root, dirs, files in os.walk("downloads/"): global files_processed
root = file_root[0]
for file in files: file = file_root[1]
print(file) # print(file)
if file is ".DS_Store": if file is ".DS_Store":
continue return
c = random() c = random()
if c < train: if c < train:
@@ -75,6 +74,24 @@ for root, dirs, files in os.walk("downloads/"):
else: else:
add_test_data(os.path.join(root, file), file, root.split("/")[-1]) add_test_data(os.path.join(root, file), file, root.split("/")[-1])
files_processed += 1 files_processed += 1
if files_processed % 1000==0:
print(root.split("/")[-1]) print(root.split("/")[-1])
print(files_processed) print(files_processed)
print(file) print(file)
if __name__ == '__main__':
remove_previous()
file_root_list = []
for root, dirs, files in os.walk("downloads/"):
for file in files:
file_root_list.append((root, file))
pool = multiprocessing.Pool(multiprocessing.cpu_count()*2)
pool.map(test_split_file, file_root_list)
+16 -13
View File
@@ -21,10 +21,10 @@ from PIL import ImageFile
ImageFile.LOAD_TRUNCATED_IMAGES = True ImageFile.LOAD_TRUNCATED_IMAGES = True
input_shape = (299, 299, 3) input_shape = (224, 224, 3)
batch_size = 32 batch_size = 60
model_name = "InceptionV3Full" model_name = "mobilenet"
# Next we set up the Image Data Generators to feed into the training cycles. # Next we set up the Image Data Generators to feed into the training cycles.
# We need one for training, validation, and testing # We need one for training, validation, and testing
@@ -41,6 +41,7 @@ train_gen = train_idg.flow_from_directory(
target_size=(input_shape[0], input_shape[1]), target_size=(input_shape[0], input_shape[1]),
batch_size=batch_size batch_size=batch_size
) )
print(len(train_gen.classes)) print(len(train_gen.classes))
val_idg = ImageDataGenerator( val_idg = ImageDataGenerator(
@@ -75,26 +76,28 @@ test_gen = test_idg.flow_from_directory(
# include_top=False, # include_top=False,
# input_shape=input_shape # input_shape=input_shape
# ) # )
base_model = inception_v3.InceptionV3( # base_model = inception_v3.InceptionV3(
weights='imagenet',
include_top=False,
input_shape=input_shape
)
# base_model = mobilenet_v2.MobileNetV2(
# weights='imagenet', # weights='imagenet',
# include_top=False, # include_top=False,
# input_shape=input_shape # input_shape=input_shape
# ) # )
base_model = mobilenet_v2.MobileNetV2(
weights='imagenet',
include_top=False,
input_shape=input_shape
)
# Create a new top for that model # Create a new top for that model
add_model = Sequential() add_model = Sequential()
add_model.add(base_model) add_model.add(base_model)
add_model.add(GlobalAveragePooling2D()) add_model.add(GlobalAveragePooling2D())
add_model.add(Dense(4048, activation='relu'))
add_model.add(Dropout(0.5)) add_model.add(Dropout(0.5))
add_model.add(
Dense(1024, activation='relu')) # Adding some dense layers in order to learn complex functions from the base model add_model.add(Dense(2024, activation='relu'))
# Adding some dense layers in order to learn complex functions from the base model
# Potentially throw another dropout layer here if you seem to be overfitting your # Potentially throw another dropout layer here if you seem to be overfitting your
add_model.add(Dropout(0.5)) add_model.add(Dropout(0.5))
add_model.add(Dense(512, activation='relu')) add_model.add(Dense(512, activation='relu'))
@@ -133,7 +136,7 @@ history = model.fit_generator(
validation_data=val_gen, validation_data=val_gen,
steps_per_epoch=len(train_gen), steps_per_epoch=len(train_gen),
validation_steps=len(val_gen), validation_steps=len(val_gen),
epochs=60, epochs=25,
shuffle=True, shuffle=True,
verbose=True, verbose=True,
callbacks=callbacks_list callbacks=callbacks_list