in

BeautifulSoup Web Scraper Stopping After 100 Images


I am trying to scrape images from Google Images and have a somewhat functioning program, but it always stops after 100 images, no matter what I have my loop set to. Since it gives no actual error, I am having trouble pinning down what I need to do. This is the full code:

import json
import os
import re
import ssl
import urllib.request

import requests
from bs4 import BeautifulSoup
from func_timeout import FunctionTimedOut, func_set_timeout

# Disable TLS certificate verification globally so urllib downloads don't fail
# on hosts with odd cert chains. NOTE(review): this weakens security for every
# HTTPS request the process makes — confirm this trade-off is intended.
ssl._create_default_https_context = ssl._create_unverified_context

# Search query and the target number of images to download.
searchFor: str = "sports car"
getImages: int = 1000

# Browser-like User-Agent so Google serves the full results page
# instead of a stripped-down or blocked response.
headers = {
    "User-Agent":
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582"
}

# Google Images search parameters: "tbm": "isch" selects image search;
# "ijn" is the result-page index — only page 0 is ever fetched here,
# which is why the scraper never sees more than ~100 images.
params = {
    "q": searchFor,
    "tbm": "isch",
    "ijn": "0",
}

# Fetch the first (and only) results page and parse it once at import time;
# get_images_data() below reads this module-level soup.
html = requests.get("https://www.google.com/search", params=params, headers=headers)
soup = BeautifulSoup(html.text, 'lxml')

@func_set_timeout(2)  # abort any single download that takes longer than 2 s
def downloadImage(a: str, b: str) -> None:
    """Download URL *a* to local path *b* via the globally installed opener.

    Raises func_timeout.FunctionTimedOut (a BaseException subclass) if the
    retrieval exceeds the 2-second budget; callers are expected to catch it.
    """
    urllib.request.urlretrieve(a, b)

def get_images_data():
    """Extract image metadata from the fetched Google Images page and download
    the full-resolution images into the ``Images/`` directory.

    Reads the module-level ``soup`` (one results page) and stops early once
    ``getImages`` files exist in ``Images/``.

    NOTE(review): a single results page only carries ~100 images, which is why
    the scraper stops there regardless of ``getImages``. To go further, the
    page request must be repeated with ``params["ijn"]`` set to "1", "2", ...
    and this function run against each page's soup.
    """

    print('\nGoogle Images Metadata:')
    for google_image in soup.select('.isv-r.PNCib.MSM1fd.BUooTd'):
        title = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['title']
        source = google_image.select_one('.fxgdke').text
        link = google_image.select_one('.VFACy.kGQAp.sMi44c.lNHeqe.WGvvNb')['href']
        print(f'{title}\n{source}\n{link}\n')

    all_script_tags = soup.select('script')

    # The image records live in an inline JS AF_initDataCallback(...) payload,
    # not in the rendered DOM.
    matched_images_data = "".join(
        re.findall(r"AF_initDataCallback\(([^<]+)\);", str(all_script_tags)))

    # Round-trip through json.dumps/json.loads to normalize quoting; a bare
    # json.loads() throws "Expecting property name enclosed in double quotes".
    matched_images_data_fix = json.dumps(matched_images_data)
    matched_images_data_json = json.loads(matched_images_data_fix)

    # Has a length of 1: the GRID_STATE0 blob holding every image record.
    matched_google_image_data = re.findall(
        r'\["GRID_STATE0",null,\[\[1,\[0,".*?",(.*),"All",',
        matched_images_data_json)

    # Thumbnail entries look like ["https://encrypted-tbn0...",WIDTH,HEIGHT].
    thumbnail_pattern = r'\["(https://encrypted-tbn0\.gstatic\.com/images\?.*?)",\d+,\d+\]'

    # Has a length of ~100 — one page of results.
    matched_google_images_thumbnails = ", ".join(
        re.findall(thumbnail_pattern, str(matched_google_image_data))).split(', ')

    print('Google Image Thumbnails:')  # in order
    for fixed_google_image_thumbnail in matched_google_images_thumbnails:
        # Two unicode-escape passes: the first still leaves \uXXXX sequences;
        # the second decodes them.
        google_image_thumbnail_not_fixed = bytes(fixed_google_image_thumbnail, 'ascii').decode('unicode-escape')
        google_image_thumbnail = bytes(google_image_thumbnail_not_fixed, 'ascii').decode('unicode-escape')
        print(google_image_thumbnail)

    # Strip the already-matched thumbnails so the remaining URL triples are
    # the full-resolution images only.
    removed_matched_google_images_thumbnails = re.sub(
        thumbnail_pattern, '', str(matched_google_image_data))

    matched_google_full_resolution_images = re.findall(
        r'(?:\'|,),\["(https:|http.*?)",\d+,\d+\]',
        removed_matched_google_images_thumbnails)

    # Install the User-Agent-carrying opener once — it is loop-invariant and
    # was previously rebuilt for every single image.
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.19582')]
    urllib.request.install_opener(opener)

    # Previously crashed with FileNotFoundError when Images/ didn't exist.
    os.makedirs("Images", exist_ok=True)

    print('\nFull Resolution Images:')  # in order
    for fixed_full_res_image in matched_google_full_resolution_images:
        original_size_img_not_fixed = bytes(fixed_full_res_image, 'ascii').decode('unicode-escape')
        original_size_img = bytes(original_size_img_not_fixed, 'ascii').decode('unicode-escape')
        print(original_size_img)

        # Next free sequential filename: Images/<count+1>.jpeg
        fileName = "Images/" + str(len(os.listdir("Images")) + 1) + '.jpeg'

        try:
            downloadImage(original_size_img, fileName)
        except (Exception, FunctionTimedOut):
            # Best-effort: skip images that fail or time out. FunctionTimedOut
            # derives from BaseException, so it must be caught explicitly —
            # a plain `except Exception` would let it escape.
            pass

        if len(os.listdir("Images")) == getImages:
            print('Quitting...')
            return  # was quit(); returning is equivalent here and importable-safe

# Entry point: runs the scrape immediately when the script is executed.
# NOTE(review): consider guarding with `if __name__ == "__main__":`.
get_images_data()

I know that is a lot, and I know the root of my issue is wanting to be able to have

matched_google_full_resolution_images

be whatever I want it to be in this loop:

for index, fixed_full_res_image in enumerate(matched_google_full_resolution_images):

From what I have been able to find, it seems like I need to somehow load more images initially, because the base page only contains so many images — but I have found no actual solutions. (Also, this is my first question, so if I asked it poorly, please let me know what I can do better.)



Source: https://stackoverflow.com/questions/70538254/beautifulsoup-web-scraper-stopping-after-100-images

Jio vs Vi vs Bharti Airtel Affordable 1GB/Day Prepaid Plans Compared

Will this Minecraft minelayer / prismarines bot work?