import re
import regex
import time
import pandas as pd
import os
import bs4
import urllib.parse
import multiprocessing
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from joblib import Parallel, delayed
from utils.cache_clear import delete_cache


def get_url(keyword, start_date, end_date, page_num):
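    """Build a Daum cafe search URL for a keyword, an inclusive date range, and a result page."""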
url = (f"https://search.daum.net/search?"
f"w=fusion&nil_search=btn&DA=PGD"
f"&q={urllib.parse.quote(keyword)}"
f"&col=cafe&sort=recency"
f"&sd={start_date.replace('-','')}000000&ed={end_date.replace('-','')}235959"
f"&period=u&p={page_num}")
return url
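
# Illustrative call (the dates and keyword below are placeholders, not from this project):
# get_url("키워드", "2023-01-01", "2023-01-31", 1)
# -> "https://search.daum.net/search?...&sd=20230101000000&ed=20230131235959&period=u&p=1"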


def remove_tags(html):
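    """Strip <style>/<script> elements from an HTML fragment and return its visible text."""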
    # Parse the HTML content.
    soup = bs4.BeautifulSoup(html, "html.parser")
    for data in soup(['style', 'script']):
        # Remove the tag and everything inside it from the tree.
        data.decompose()
    # Return the remaining visible text, whitespace-normalized.
    return ' '.join(soup.stripped_strings)
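
# Illustrative call (hypothetical input, not from this project):
# remove_tags("<div><script>track()</script><p>Hello, <b>world</b></p></div>")
# -> "Hello, world"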


def scrap_agent(url, keyword, start_date, end_date, page_num):
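    """Open `url` in a Chrome WebDriver and extract the raw search-result HTML for one page."""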
print(f"working on {start_date} to {end_date}")
# Chrome WebDriver 설치 경로 가져오기
chrome_path = ChromeDriverManager().install()
# WebDriver 설정
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# options.add_argument('headless')
options.add_argument('window-size=1920x1080')
options.add_argument("disable-gpu")
options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
# WebDriver 초기화
driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
driver.get(url)
time.sleep(2)
# infinity_scroll_to_the_bottom(driver)
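    # Daum renders results inside a <c-container> custom element; the selector below
    # reflects the page structure observed when this was written and may break if
    # Daum changes its markup.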
    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    formatter = bs4.formatter.HTMLFormatter(indent=1)
    search_result_plain_html_beautified = bs4.BeautifulSoup(search_result_plain_html, 'html.parser').prettify(formatter=formatter)
    search_result_contents = regex.findall(r'