fbpixel
Tags: , ,

, ,

To train a neural network in object detection and recognition, you need an image bank to work with. We’ll see how to download a large number of images from Google using Python. To train a neural network, you need a large amount of data. The more data, the better the training. In our case, we want to train a neural network to recognize a particular object. To do this, we create a Python script that downloads files from the Internet and places them in a folder.

Configuring Python3

Download Selenium and OpenCV libraries (optional)

python3 -m pip install selenium
python3 -m pip install opencv-python

download file geckodriver

N.B.: We use the OpenCV library only to check that OpenCV can open and use the images, so as not to clutter up the folder unnecessarily.

Image download Python script

This script launches a search on Google Image and saves the images found in the folder specified for the image bank.

N.B.: Don’t forget to specify the path to the geckodriver GECKOPATH file, the path to the destination folder and the keywords for Google search.

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
import sys
import os
import time
#Imports Packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException,WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
import cv2
########################################################################
GECKOPATH = "PATH-TO-GECKODRIVER"
parent_dir = "PATH-TO-FOLDER"
search='coffee mug'
########################################################################
# path
folderName=search.replace(" ","_")
directory = os.path.join(parent_dir, folderName,'img')
# Create the directory
try:
if not os.path.exists(directory):
os.makedirs(directory) #os.mkdir(directory)
except OSError as error:
print("ERROR : {}".format(error))
sys.path.append(GECKOPATH)
#Opens up web driver and goes to Google Images
browser = webdriver.Firefox()#Firefox(firefox_binary=binary)
#load google image
browser.get('https://www.google.ca/imghp?hl=en')
delay = 10 # seconds
try:
btnId="L2AGLb"
myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID , btnId))) #id info-address-place-wrapper
elm=browser.find_element_by_id(btnId)
elm.click()
print("Popup is passed!")
except TimeoutException as e:
print("Loading took too much time!")
# get and fill search bar
box = browser.find_element_by_xpath('//*[@id="sbtc"]/div/div[2]/input')
box.send_keys(search)
box.send_keys(Keys.ENTER)
#Will keep scrolling down the webpage until it cannot scroll no more
last_height = browser.execute_script('return document.body.scrollHeight')
while True:
browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
time.sleep(5)
new_height = browser.execute_script('return document.body.scrollHeight')
try:
browser.find_element_by_xpath('//*[@id="islmp"]/div/div/div/div/div[5]/input').click()
time.sleep(5)
except:
print("button not found")
pass
if new_height == last_height:
break
last_height = new_height
imgList=[]
for i in range(1, 1000):
try:
browser.find_element_by_xpath('//*[@id="islrg"]/div[1]/div['+str(i)+']/a[1]/div[1]/img').screenshot(directory+'\{}.png'.format(i))
imgList.add(directory+'\{}.png'.format(i))
except:
pass
browser.quit()
#Test images with OpenCV
for img in imgList:
try:
cv2.imread(img)
except Exception as e:
os.remove(img)
print("remove {}".format(img))
import sys import os import time #Imports Packages from selenium import webdriver from selenium.webdriver.common.keys import Keys from selenium.common.exceptions import TimeoutException,WebDriverException from selenium.webdriver.support.ui import WebDriverWait from selenium.webdriver.support import expected_conditions as EC from selenium.webdriver.common.by import By import cv2 ######################################################################## GECKOPATH = "PATH-TO-GECKODRIVER" parent_dir = "PATH-TO-FOLDER" search='coffee mug' ######################################################################## # path folderName=search.replace(" ","_") directory = os.path.join(parent_dir, folderName,'img') # Create the directory try: if not os.path.exists(directory): os.makedirs(directory) #os.mkdir(directory) except OSError as error: print("ERROR : {}".format(error)) sys.path.append(GECKOPATH) #Opens up web driver and goes to Google Images browser = webdriver.Firefox()#Firefox(firefox_binary=binary) #load google image browser.get('https://www.google.ca/imghp?hl=en') delay = 10 # seconds try: btnId="L2AGLb" myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID , btnId))) #id info-address-place-wrapper elm=browser.find_element_by_id(btnId) elm.click() print("Popup is passed!") except TimeoutException as e: print("Loading took too much time!") # get and fill search bar box = browser.find_element_by_xpath('//*[@id="sbtc"]/div/div[2]/input') box.send_keys(search) box.send_keys(Keys.ENTER) #Will keep scrolling down the webpage until it cannot scroll no more last_height = browser.execute_script('return document.body.scrollHeight') while True: browser.execute_script('window.scrollTo(0,document.body.scrollHeight)') time.sleep(5) new_height = browser.execute_script('return document.body.scrollHeight') try: browser.find_element_by_xpath('//*[@id="islmp"]/div/div/div/div/div[5]/input').click() time.sleep(5) except: print("button not found") pass if new_height == last_height: break last_height = new_height imgList=[] for i in range(1, 1000): try: browser.find_element_by_xpath('//*[@id="islrg"]/div[1]/div['+str(i)+']/a[1]/div[1]/img').screenshot(directory+'\{}.png'.format(i)) imgList.add(directory+'\{}.png'.format(i)) except: pass browser.quit() #Test images with OpenCV for img in imgList: try: cv2.imread(img) except Exception as e: os.remove(img) print("remove {}".format(img))
import sys
import os
import time 

#Imports Packages
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.common.exceptions import TimeoutException,WebDriverException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By

import cv2

########################################################################
GECKOPATH = "PATH-TO-GECKODRIVER"
parent_dir = "PATH-TO-FOLDER"
search='coffee mug'
########################################################################

# path 
folderName=search.replace(" ","_")
directory = os.path.join(parent_dir, folderName,'img') 
   
# Create the directory 
try: 
	if not os.path.exists(directory):
		os.makedirs(directory) #os.mkdir(directory)
except OSError as error: 
	print("ERROR : {}".format(error)) 


sys.path.append(GECKOPATH)  
#Opens up web driver and goes to Google Images
browser = webdriver.Firefox()#Firefox(firefox_binary=binary)

#load google image
browser.get('https://www.google.ca/imghp?hl=en')

delay = 10 # seconds
try:
	btnId="L2AGLb"
	myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID , btnId))) #id info-address-place-wrapper 
	elm=browser.find_element_by_id(btnId)
	elm.click()
	print("Popup is passed!")
except TimeoutException as e:
	print("Loading took too much time!")



# get and fill search bar
box = browser.find_element_by_xpath('//*[@id="sbtc"]/div/div[2]/input')
box.send_keys(search)
box.send_keys(Keys.ENTER)


#Will keep scrolling down the webpage until it cannot scroll no more
last_height = browser.execute_script('return document.body.scrollHeight')
while True:
	browser.execute_script('window.scrollTo(0,document.body.scrollHeight)')
	time.sleep(5)
	new_height = browser.execute_script('return document.body.scrollHeight')
	try:
		browser.find_element_by_xpath('//*[@id="islmp"]/div/div/div/div/div[5]/input').click()
		time.sleep(5)
	except:
		print("button not found")
		pass
		
	if new_height == last_height:
		break
	last_height = new_height
	
imgList=[]
for i in range(1, 1000):
	try:
		browser.find_element_by_xpath('//*[@id="islrg"]/div[1]/div['+str(i)+']/a[1]/div[1]/img').screenshot(directory+'\{}.png'.format(i))
		imgList.add(directory+'\{}.png'.format(i))
		
	except:
		pass
browser.quit()
 
#Test images with OpenCV
for img in imgList:
	try:   
		cv2.imread(img)
	except Exception as e:
		os.remove(img)
		print("remove {}".format(img))

BONUS: Managing a popup

In the code I’ve added a command to manage the popup that appears when the web page is opened. It will wait for the button with the correct identifier to be loaded before pressing it.

Plain text
Copy to clipboard
Open code in new window
EnlighterJS 3 Syntax Highlighter
delay = 10 # seconds
try:
btnId="L2AGLb"
myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID , btnId))) #id info-address-place-wrapper
elm=browser.find_element_by_id(btnId)
elm.click()
print("Popup is passed!")
except TimeoutException as e:
print("Loading took too much time!")
delay = 10 # seconds try: btnId="L2AGLb" myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID , btnId))) #id info-address-place-wrapper elm=browser.find_element_by_id(btnId) elm.click() print("Popup is passed!") except TimeoutException as e: print("Loading took too much time!")
delay = 10 # seconds
try:
	btnId="L2AGLb"
	myElem = WebDriverWait(browser, delay).until(EC.presence_of_element_located((By.ID , btnId))) #id info-address-place-wrapper 
	elm=browser.find_element_by_id(btnId)
	elm.click()
	print("Popup is passed!")
except TimeoutException as e:
	print("Loading took too much time!")

 

downloads-image-from-internet-python-result Create your image bank with Python

You now have an image bank that you can use for visual recognition, for example, or for image processing.

Applications

  • Develop image processing algorithms
  • Training neural networks for object detection and recognition

Sources