from selenium.webdriver.common.by import By
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter
from openpyxl import Workbook
from bs4 import BeautifulSoup
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.common.keys import Keys
import time
import datetime
import requests

# Chrome options setup
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.chrome.options import Options as ChromeOptions

options = ChromeOptions()
user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36"
options.add_argument('user-agent=' + user_agent)
options.add_argument("lang=ko_KR")
options.add_argument('headless')  # run headless (no browser window)
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
options.add_argument("--no-sandbox")

# Install the ChromeDriver version matching the local Chrome
service = ChromeService(executable_path=ChromeDriverManager().install())

# Launch the Chrome driver
driver = webdriver.Chrome(service=service, options=options)

# Target URL
url = 'https://m.place.naver.com/restaurant/1085956231/review/visitor?entry=ple&reviewSort=recent'

# BS4 setting for secondary access
session = requests.Session()
headers = {"User-Agent": "user value"}
retries = Retry(total=5, backoff_factor=0.1, status_forcelist=[500, 502, 503, 504])
session.mount('http://', HTTPAdapter(max_retries=retries))

# New xlsx file
now = datetime.datetime.now()
xlsx = Workbook()
list_sheet = xlsx.create_sheet('output')
list_sheet.append(['nickname', 'content', 'date', 'revisit'])

# Start crawling/scraping!
try:
    print("Starting webdriver and accessing URL...")
    driver.get(url)
    driver.implicitly_wait(30)
    print("Page loaded. Scrolling down...")
    driver.find_element(By.TAG_NAME, 'body').send_keys(Keys.PAGE_DOWN)

    count = 0
    try:
        while True:
            print("Clicking on '더보기' button...")
            driver.find_element(By.XPATH, '//*[@id="app-root"]/div/div/div/div[6]/div[2]/div[3]/div[2]/div/a').click()
            count += 1
            print(f"'더보기' button clicked {count} times.")
            time.sleep(0.4)
    except Exception:
        print(f'No more "더보기" button found, finished scrolling after {count} clicks.')

    time.sleep(25)
    html = driver.page_source
    bs = BeautifulSoup(html, 'lxml')
    reviews = bs.select('li.YlrAu')
    print(f"Found {len(reviews)} reviews.")

    for r in reviews:
        nickname = r.select_one('div.VYGLG')
        content = r.select_one('div.vg7Fp.CyA_N')
        date = r.select('div.D40bm>span.CKUdu>time')[0]
        revisit = r.select('div.D40bm>span.CKUdu')[1]

        # exception handling
        nickname = nickname.text if nickname else ''
        content = content.text if content else ''
        date = date.text if date else ''
        revisit = revisit.text if revisit else ''
        time.sleep(0.06)

        print(f"Review: {nickname} / {content} / {date} / {revisit}")
        list_sheet.append([nickname, content, date, revisit])
        time.sleep(0.06)

    # Save the file
    file_name = 'naver_review_' + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
    xlsx.save(file_name)
    print(f"File saved as {file_name}")

except Exception as e:
    print(f"Exception occurred: {e}")
    # Save the file (temp)
    file_name = 'naver_review_' + now.strftime('%Y-%m-%d_%H-%M-%S') + '.xlsx'
    xlsx.save(file_name)
    print(f"File saved as {file_name} after exception")

finally:
    driver.quit()
    print("Webdriver closed.")
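For reference, the '더보기' (More) click loop above could also be written with a short explicit wait rather than the 30-second implicit wait plus an absolute XPath. This is only a minimal sketch; the text-based locator is an assumption about the button markup, not something verified against the live page:

from selenium.common.exceptions import TimeoutException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
import time

def click_more_until_gone(driver, timeout=5):
    # Keep clicking '더보기' until it stops appearing within `timeout` seconds.
    clicks = 0
    while True:
        try:
            # Assumed locator: an <a> element containing the visible text '더보기'.
            more = WebDriverWait(driver, timeout).until(
                EC.element_to_be_clickable((By.XPATH, "//a[contains(., '더보기')]"))
            )
        except TimeoutException:
            return clicks  # button is gone, so every review should be expanded
        more.click()
        clicks += 1
        time.sleep(0.4)  # brief pause so newly loaded reviews can render

Here `count = click_more_until_gone(driver)` would replace the inner while/except block, and a missing button would end the loop after `timeout` seconds instead of the implicit 30 (mixing explicit waits with implicitly_wait is discouraged, so the implicit wait would need to be dropped or set to 0 first).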
This is a script that crawls Naver Map reviews. When run as written above, the Excel file is created, but no crawling happens at all. The output is below.
DevTools listening on ws://127.0.0.1:51574/devtools/browser/16977761-9899-4120-9ba5-2ef94f71fbc6
Starting webdriver and accessing URL...
Page loaded. Scrolling down...
Clicking on '더보기' button...
No more "더보기" button found, finished scrolling after 0 clicks.
Found 0 reviews.
File saved as naver_review_2024-06-07_10-45-31.xlsx
Webdriver closed.
PS C:\Users\ooooo\
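One way to narrow this down is to check whether the page failed to load at all or whether the CSS class names have simply changed (the obfuscated names on Naver place pages are auto-generated and do change over time). A minimal diagnostic sketch, reusing the driver and the selectors from the script above, run before driver.quit():

from bs4 import BeautifulSoup

# Dump the rendered HTML so it can be opened and searched by hand.
html = driver.page_source
with open('page_dump.html', 'w', encoding='utf-8') as f:
    f.write(html)

# Count matches for each selector the script relies on; zero matches for
# 'li.YlrAu' in a non-empty dump would mean the class names have changed.
bs = BeautifulSoup(html, 'lxml')
for sel in ['li.YlrAu', 'div.VYGLG', 'div.vg7Fp.CyA_N', 'div.D40bm>span.CKUdu']:
    print(sel, '->', len(bs.select(sel)), 'matches')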
As the output shows, the Excel file is created, but not a single review is read. Any help would be much appreciated.