

Web Crawling, Day 3

❤ Pixabay

2. Collecting Multiple Images

The script below opens the Pixabay search results for '강아지' (puppy), scrolls down to trigger lazy loading, collects the thumbnail URLs, and downloads each image to disk.

import os
import time
from urllib import parse
from urllib.request import Request, urlopen

from selenium import webdriver
from selenium.webdriver.common.by import By

driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/강아지/'
driver.get(url)
time.sleep(3)  # give the search page time to load

# Scroll one viewport at a time so lazy-loaded thumbnails come into view
for _ in range(20):
    driver.execute_script("window.scrollBy({ top: window.innerHeight, behavior: 'smooth' })")
    time.sleep(0.3)

# Absolute XPath to the search-result grid (brittle: breaks if Pixabay changes its markup)
image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
image_area = driver.find_element(By.XPATH, image_area_xpath)
image_elements = image_area.find_elements(By.TAG_NAME, 'img')

image_urls = []

for image_element in image_elements:
    # Lazy-loaded images keep their real URL in data-lazy-src; loaded ones use src
    image_url = image_element.get_attribute('data-lazy-src')
    if image_url is None:
        image_url = image_element.get_attribute('src')
    print(image_url)
    image_urls.append(image_url)
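
One caveat: thumbnails that never scrolled into view can still report a tiny placeholder (often a data: URI) as their src. A small filter, a sketch based only on the cdn.pixabay.com host visible in the collected URLs rather than anything in the lesson, keeps just the real CDN links:

image_urls = [u for u in image_urls if u and u.startswith('https://cdn.pixabay.com')]  # drop placeholders (hypothetical filter)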
for i, image_url in enumerate(image_urls):
    # e.g. https://cdn.pixabay.com/photo/2023/12/05/16/57/dog-8432098_640.jpg
    parsed = parse.urlparse(image_url)
    name, ext = os.path.splitext(parsed.path)  # ext already includes the dot, e.g. '.jpg'
    # Send a browser-like User-Agent; some CDNs reject urllib's default one
    request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'})
    with open(f'dog{i}{ext}', 'wb') as f:
        f.write(urlopen(request).read())
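
Scrolling a fixed 20 times is a guess about how long the page is. A more adaptive variant, a minimal sketch of my own rather than part of the lesson (scroll_to_bottom is a hypothetical helper name), keeps scrolling until the document height stops growing:

def scroll_to_bottom(driver, pause=0.5, max_rounds=30):
    # Scroll to the bottom repeatedly until the page height stops changing
    last_height = driver.execute_script("return document.body.scrollHeight")
    for _ in range(max_rounds):
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight)")
        time.sleep(pause)
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height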

 

3. Refactoring into a Function

The same logic, wrapped in a function that takes a search keyword and a page count, and saves the results into a folder named after the keyword.

 

def crawl_and_save_image(keyword, pages):
    image_urls = []
    for i in range(1, pages + 1):
        # pagi is Pixabay's pagination query parameter
        url = f'https://pixabay.com/ko/images/search/{keyword}/?pagi={i}'
        driver.get(url)
        time.sleep(2)

        # Scroll so lazy-loaded thumbnails come into view
        for _ in range(20):
            driver.execute_script("window.scrollBy({ top: window.innerHeight, behavior: 'smooth' })")
            time.sleep(0.3)

        image_area_xpath = '/html/body/div[1]/div[1]/div/div[2]/div[3]'
        image_area = driver.find_element(By.XPATH, image_area_xpath)
        image_elements = image_area.find_elements(By.TAG_NAME, 'img')

        for image_element in image_elements:
            # Same fallback as before: prefer data-lazy-src, then src
            image_url = image_element.get_attribute('data-lazy-src')
            if image_url is None:
                image_url = image_element.get_attribute('src')
            print(image_url)
            image_urls.append(image_url)

    # Save everything into a folder named after the keyword
    if not os.path.exists(keyword):
        os.mkdir(keyword)

    for image_url in image_urls:
        filename = image_url.split('/')[-1]  # last path segment, e.g. dog-8432098_640.jpg
        request = Request(image_url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'})
        with open(f'{keyword}/{filename}', 'wb') as f:
            f.write(urlopen(request).read())

driver = webdriver.Chrome()
crawl_and_save_image('호랑이', 2)
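
A small refinement, my own sketch rather than part of the lesson: creating the driver in a try/finally guarantees the browser closes even if the crawl raises, for example when the hard-coded XPath stops matching.

driver = webdriver.Chrome()
try:
    crawl_and_save_image('호랑이', 2)  # crawl two pages of tiger images
finally:
    driver.quit()  # always release the browser session

Passing driver into crawl_and_save_image as a parameter would also remove the dependence on the module-level variable and make the function easier to reuse.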
