Web Crawling, Day 2

 

Selenium

- A library that lets you control a web browser programmatically

 

!pip install selenium
!pip install chromedriver_autoinstaller
import chromedriver_autoinstaller
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys

# Download a chromedriver matching the installed Chrome and put it on PATH
chromedriver_autoinstaller.install()

driver = webdriver.Chrome()
driver.get('https://www.google.com')
search = driver.find_element(By.NAME, 'q')  # Google's search box is named "q"
search.send_keys('날씨')  # "weather"
search.send_keys(Keys.RETURN)
# Recap
driver = webdriver.Chrome()
driver.get('https://www.google.com')
search = driver.find_element(By.NAME, 'q')
search.send_keys('미세먼지')  # "fine dust"
search.send_keys(Keys.RETURN)
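Typing into the box mimics a real user; the same search can also be reached directly by URL. A minimal stdlib sketch (the `q` parameter is the same one the search box above is named after):

```python
from urllib.parse import urlencode

query = '날씨'  # "weather"
# Percent-encode the Korean query into a search URL
url = 'https://www.google.com/search?' + urlencode({'q': query})
print(url)  # → https://www.google.com/search?q=%EB%82%A0%EC%94%A8
```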

 

❤ Naver Webtoon

driver = webdriver.Chrome()
driver.get('https://comic.naver.com/webtoon/detail?titleId=783053&no=134&week=tue')
!pip install bs4
from bs4 import BeautifulSoup
# driver.page_source holds the rendered HTML of the current page
soup = BeautifulSoup(driver.page_source, 'html.parser')
comment_area = soup.find_all('span', {'class': 'u_cbox_contents'})
print(comment_area)

print('*********** Best comments ************')
for i in range(len(comment_area)):
    comment = comment_area[i].text.strip()
    print(comment)
    print('-' * 30)

 

XPath

: A path language for addressing parts of an HTML/XML document, similar to the path expressions used in file systems

  • e.g.) /html/body/div[1]/div[5]/div/div/div[5]/div[1]/div[3]/div/div/div[6]/ul/li[1]/div[1]/div/div[2]/span[1]
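To see how such a path walks the tree, here is a tiny self-contained example using the standard library's ElementTree, which understands a subset of XPath (the HTML snippet is made up for illustration):

```python
import xml.etree.ElementTree as ET

html = """
<body>
  <div>
    <ul>
      <li><span>first comment</span></li>
      <li><span>second comment</span></li>
    </ul>
  </div>
</body>
"""
root = ET.fromstring(html)
# li[1] means the FIRST matching element — XPath indexing starts at 1
first = root.find('./div/ul/li[1]/span')
print(first.text)  # → first comment
```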
# Click the "all comments" button, then re-parse the page
driver.find_element('xpath','/html/body/div[1]/div[5]/div/div/div[5]/div[1]/div[3]/div/div/div[6]/ul/li[1]/div[1]/div/div[2]/span[1]').click()
soup = BeautifulSoup(driver.page_source, 'html.parser')
comment_area = soup.find_all('span', {'class': 'u_cbox_contents'})
print(comment_area)

print('*********** All comments ************')
for i in range(len(comment_area)):
    comment = comment_area[i].text.strip()
    print(comment)
    print('-' * 30)
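Printing is fine for a quick check; to keep the scraped comments around, a sketch that writes them to CSV with the standard library (the sample list stands in for the scraped `comment_area` texts):

```python
import csv

comments = ['재밌다', '다음 화 기대돼요']  # stand-ins for comment_area[i].text.strip()
# utf-8-sig adds a BOM so Excel displays the Korean text correctly
with open('comments.csv', 'w', newline='', encoding='utf-8-sig') as f:
    writer = csv.writer(f)
    writer.writerow(['comment'])
    writer.writerows([c] for c in comments)
```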

 

 Instagram

 

import chromedriver_autoinstaller
from selenium import webdriver
driver = webdriver.Chrome()
url = 'https://www.instagram.com/'
driver.get(url)
id = 'your_id'
pw = 'your_password'

input_id = driver.find_element('xpath','/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/article/div[2]/div[1]/div[2]/form/div/div[1]/div/label/input')
input_pw = driver.find_element('xpath','/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/article/div[2]/div[1]/div[2]/form/div/div[2]/div/label/input')
# Enter the ID
input_id.send_keys(id)
# Enter the password
input_pw.send_keys(pw)
driver.find_element('xpath', '/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/article/div[2]/div[1]/div[2]/form/div/div[3]').click()

 

✔  Hashtag search

 

hashtag = '맛점'  # a popular Korean hashtag ("tasty lunch")
url = f'https://www.instagram.com/explore/tags/{hashtag}/'
driver.get(url)
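The f-string works here because the browser percent-encodes the Korean hashtag for us, but when building URLs by hand it is safer to encode explicitly. A stdlib sketch:

```python
from urllib.parse import quote

hashtag = '맛점'
# quote() percent-encodes the UTF-8 bytes of the hashtag
url = f'https://www.instagram.com/explore/tags/{quote(hashtag)}/'
print(url)  # → https://www.instagram.com/explore/tags/%EB%A7%9B%EC%A0%90/
```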

 

✔  Scrolling down

import time
# One scroll to the bottom; increase the range to trigger more loading
for _ in range(1):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
    time.sleep(3)  # give newly loaded posts time to render
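A single scroll only loads one batch of posts. One way to keep scrolling until the feed stops growing — a sketch, written so it works with any object that exposes `execute_script`:

```python
import time

def scroll_to_bottom(driver, pause=3, max_scrolls=10):
    """Scroll down until the page height stops growing (or max_scrolls is hit)."""
    last_height = driver.execute_script('return document.body.scrollHeight')
    for _ in range(max_scrolls):
        driver.execute_script('window.scrollTo(0, document.body.scrollHeight)')
        time.sleep(pause)  # give newly loaded posts time to render
        new_height = driver.execute_script('return document.body.scrollHeight')
        if new_height == last_height:  # nothing new loaded: we are at the bottom
            break
        last_height = new_height
```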

 

✔  Clicking the photo you want

 

xpath = '/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/article/div/div[2]/div/div[2]/div[2]/a'
driver.find_element('xpath', xpath).click()

 

 

✔  Liking and commenting

 

like_xpath = '/html/body/div[8]/div[1]/div/div[3]/div/div/div/div/div[2]/div/article/div/div[3]/div/div/section[1]/span[1]/div/div'
driver.find_element('xpath', like_xpath).click()  # like
driver.find_element('xpath', like_xpath).click()  # clicking again toggles the like back off
reply_xpath = '/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[2]/div/div[4]/section/div/form/div/textarea'
driver.find_element('xpath', reply_xpath).click()
driver.find_element('xpath', reply_xpath).send_keys('👍')  # typed only; press Enter to actually post it

 

✔  Refactoring into functions

 

# Log in (note: id goes into the id field, pw into the password field)
def login(id, pw):
    input_id = driver.find_element("xpath", "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/article/div[2]/div[1]/div[2]/form/div/div[1]/div/label/input")
    input_pw = driver.find_element("xpath", "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/article/div[2]/div[1]/div[2]/form/div/div[2]/div/label/input")
    input_id.send_keys(id)
    input_pw.send_keys(pw)
    driver.find_element("xpath", "/html/body/div[2]/div/div/div[2]/div/div/div[1]/section/main/article/div[2]/div[1]/div[2]/form/div/div[3]/button").click()
# Search by hashtag
def search(hashtag):
    url = f"https://instagram.com/explore/tags/{hashtag}/"
    driver.get(url)
# Like and comment
def like_and_comment(comment):
    xpath = '/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/article/div/div[2]/div/div[1]/div[2]/a'
    driver.find_element("xpath", xpath).click()
    like_xpath = "/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/article/div[3]/div/div[1]/section[1]/div[1]/span[1]/div/div"
    driver.find_element("xpath", like_xpath).click()  # like
    driver.find_element("xpath", like_xpath).click()  # second click toggles the like off
    # Type the comment into the reply box (xpath taken from the earlier step)
    reply_xpath = '/html/body/div[2]/div/div/div[2]/div/div/div[1]/div[1]/div[2]/section/main/div/div[1]/div/div[2]/div/div[4]/section/div/form/div/textarea'
    reply = driver.find_element("xpath", reply_xpath)
    reply.click()
    reply.send_keys(comment)

 

# Run
driver = webdriver.Chrome()
url = 'https://www.instagram.com/'
driver.get(url)
driver.implicitly_wait(3)

id = 'your_id'
pw = 'your_password'

login(id, pw)
time.sleep(4)

hashtag = '사과'  # "apple"
search(hashtag)
time.sleep(4)

like_and_comment('좋아요!!')  # "Nice!!"

 

 

  Pixabay

 

  Collecting images

 

import chromedriver_autoinstaller
import time
from selenium import webdriver
from urllib.request import Request, urlopen

driver = webdriver.Chrome()
url = 'https://pixabay.com/ko/images/search/강아지/'
driver.get(url)
image_xpath = '/html/body/div[1]/div[1]/div/div/div/div[1]/div[2]/div/img'
image_url = driver.find_element('xpath', image_xpath).get_attribute('src')
print('image_url: ', image_url)
# Send a browser-like User-Agent; the default Python one is often rejected
request = Request(image_url, headers={'User-Agent':'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36'})
with open('dog.jpg', 'wb') as f:
    f.write(urlopen(request).read())
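The long User-Agent string above is what makes the download work: many image hosts answer 403 to urllib's default Python agent. The header can be checked offline before making any request (a sketch; urllib normalizes header names to capitalized form):

```python
from urllib.request import Request

req = Request('https://example.com/image.jpg',
              headers={'User-Agent': 'Mozilla/5.0'})
# urllib stores header names capitalized, hence 'User-agent'
print(req.get_header('User-agent'))  # → Mozilla/5.0
```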
