import re
import regex
import time
import pandas as pd
import os
import bs4
import multiprocessing
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from joblib import Parallel, delayed
from utils.cache_clear import delete_cache


def get_url(keyword, start_date, end_date, page_num):
    # Build a Daum cafe search URL sorted by recency, with the date range
    # encoded as sd/ed timestamps (YYYYMMDD + HHMMSS)
    url = (f"https://search.daum.net/search?"
           f"w=fusion&nil_search=btn&DA=PGD"
           f"&q={keyword}"
           f"&col=cafe&sort=recency"
           f"&sd={start_date.replace('-','')}000000&ed={end_date.replace('-','')}235959"
           f"&period=u&p={page_num}")
    return url


def remove_tags(html):
    # Parse the HTML content
    soup = bs4.BeautifulSoup(html, "html.parser")
    # Remove <style> and <script> tags along with their contents
    for data in soup(['style', 'script']):
        data.decompose()
    # Return the remaining text content, whitespace-normalized
    return ' '.join(soup.stripped_strings)


def scrap_agent(url, keyword, start_date, end_date, page_num):
    # Get the Chrome WebDriver installation path
    chrome_path = ChromeDriverManager().install()

    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(2)
    # infinity_scroll_to_the_bottom(driver)
    try:
        single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
    except Exception:
        final_page = regex.search(r'