From 285955fa5c8b076731694c158abc88b1fbad6f13 Mon Sep 17 00:00:00 2001
From: lucasoskorep
Date: Thu, 25 Apr 2019 16:13:05 -0500
Subject: [PATCH] added multithreading to the downloader

---
 ModelTraining/1 - ImageGatherer.py | 19 ++++++++++++-------
 1 file changed, 12 insertions(+), 7 deletions(-)

diff --git a/ModelTraining/1 - ImageGatherer.py b/ModelTraining/1 - ImageGatherer.py
index ff17d83..712a08b 100755
--- a/ModelTraining/1 - ImageGatherer.py
+++ b/ModelTraining/1 - ImageGatherer.py
@@ -1,4 +1,5 @@
 import pandas as pd
+import multiprocessing
 
 from google_images_download import google_images_download
 
@@ -6,15 +7,19 @@ df = pd.read_csv("pokemon.csv")
 
 response = google_images_download.googleimagesdownload()
 
-for pokemon in df["identifier"][:251]:
-    absolute_image_paths = response.download(
+
+def get_images_for_pokemon(pokemon):
+    response.download(
         {
-            "keywords": pokemon,
+            "keywords": pokemon,# + " pokemon",
             "limit": 250,
-            "chromedriver": "/usr/lib/chromium-browser/chromedriver",
-            # This needs to be changed based on the computer trying to download the images
-            "format": "jpg"
+            "chromedriver": "chromedriver",
+            "thumbnail":True
+            # Add chromedriver to your path or just point this var directly to your chromedriver
         }
     )
 
-# TODO: Need to clean data up here.... really should be added to another class as well you lazy asshole
+pool = multiprocessing.Pool(multiprocessing.cpu_count()*4)
+
+pool.map(get_images_for_pokemon, df["identifier"][:490])
+