Code - yjyoon/구미시-네이버-블로그-스크레퍼

import multiprocessing
import os
import re
import time
import urllib.parse
from datetime import datetime, timedelta

import bs4
import pandas as pd
import regex
from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

from utils.cache_clear import delete_cache


def get_url(keyword, start_date, end_date, page_num):
    """Build the Daum cafe search URL for one keyword, date window and page.

    :param keyword: search keyword; it is percent-encoded into the query string
    :param start_date: window start formatted as %Y-%m-%d
    :param end_date: window end formatted as %Y-%m-%d
    :param page_num: value placed in the ``p`` query parameter
    :return: the search URL as a string
    """
    # The sd/ed parameters take compact timestamps: the dashes are stripped and
    # the window is widened to cover both days entirely.
    url = (f"https://search.daum.net/search?"
           f"w=fusion&nil_search=btn&DA=PGD"
           f"&q={urllib.parse.quote(keyword)}"
           f"&col=cafe&sort=recency"
           f"&sd={start_date.replace('-','')}000000&ed={end_date.replace('-','')}235959"
           f"&period=u&p={page_num}")
    return url


def remove_tags(html):
    """Return the visible text of an HTML fragment, joined with single spaces.

    ``<style>`` and ``<script>`` elements are removed before text extraction.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        data.decompose()

    return ' '.join(soup.stripped_strings)


def _parse_card(content):
    """Extract the fields of one search result from a single-line card fragment.

    :param content: inner HTML of one ``<c-card>`` element with newlines removed
    :return: dict with keys url, title, cafe_name, post_date, text and topic,
        or None when any of the expected elements is missing from the card
    """
    menu_attrs = regex.search(r'<div class=\"fmenu-layer layer_item\" (.*?)>', content)
    text = regex.search(r'<p class="conts-desc clamp-g">(.*?)</p>', content)
    cafe_name = regex.search(r'<c-frag class=\"_cubic hydrated\" slot=\"_slt1\">(.*?)</c-frag>', content)
    post_date = regex.search(r'<c-footer-desc class=\"_cubic hydrated\" slot=\"info\">(.*?)</c-footer-desc>', content)
    header_attrs = regex.search(r'<c-header-item class=\"_cubic hydrated\"(.*?)>', content)
    if any(m is None for m in (menu_attrs, text, cafe_name, post_date, header_attrs)):
        return None

    # The link and title are stored as data-* attributes on the menu layer div.
    href = regex.search(r'data-link=\"(.*?)\"', menu_attrs.group(1))
    title = regex.search(r'data-title=\"(.*?)\"', menu_attrs.group(1))
    topic = regex.search(r'\"topic\":\{(.*?)\}', header_attrs.group(1))
    if any(m is None for m in (href, title, topic)):
        return None

    return {
        "url": href.group(1),
        "title": title.group(1),
        "cafe_name": remove_tags(cafe_name.group(1)),
        "post_date": remove_tags(post_date.group(1)),
        "text": remove_tags(text.group(1)),
        # Every quoted token inside the topic object is kept, joined with '&'.
        "topic": '&'.join(regex.findall(r'\"(.*?)\"', topic.group(1))),
    }


def scrap_agent(url, keyword, start_date, end_date, page_num):
    """Scrape one Daum cafe search result page and save its rows as a CSV file.

    A fresh Chrome session is started for the page and is always shut down
    again, even when loading or parsing fails.

    :param url: search result page to load
    :param keyword: search keyword; used as the output sub-directory name
    :param start_date: window start; used in the log line and the file name
    :param end_date: window end; used in the log line and the file name
    :param page_num: page number; used in the file name
    :return: dict of equally long lists keyed by url, title, cafe_name,
        post_date, text and topic, or None when the page yields no results
    """
    print(f"working on {start_date} to {end_date}")
    # Get the install path of the Chrome WebDriver
    chrome_path = ChromeDriverManager().install()
    # WebDriver settings
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    # options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    ret = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": [],
        "topic": []
    }

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    try:
        driver.get(url)
        time.sleep(2)

        # infinity_scroll_to_the_bottom(driver)

        single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
        # A page without the result container is treated like an empty page
        # instead of failing with IndexError.
        if single_page_search_result:
            search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
            formatter = bs4.formatter.HTMLFormatter(indent=1)
            search_result_plain_html_beautified = bs4.BeautifulSoup(
                search_result_plain_html, 'html.parser').prettify(formatter=formatter)

            search_result_contents = regex.findall(
                r'<c-card class="_cubic hydrated">(.*?)\n</c-card>',
                search_result_plain_html_beautified, re.DOTALL)

            for content in search_result_contents:
                row = _parse_card(content.replace("\n", ""))
                if row is None:
                    # Skip cards that lack an expected element rather than
                    # aborting the whole page on a None match.
                    print(f"skipped an unparsable result card on {url}")
                    continue
                # Rows are appended all-or-nothing so the columns stay aligned.
                for key, value in row.items():
                    ret[key].append(value)
    finally:
        # Always end the browser session, even if cache clearing itself fails;
        # quit() also stops the chromedriver process, which close() leaves alive.
        try:
            delete_cache(driver)
        finally:
            driver.quit()

    if not ret["url"]:
        return None

    out_df = pd.DataFrame.from_dict(ret)
    base_dir = "daum_cafe"

    # makedirs also creates the base directory when it does not exist yet.
    os.makedirs(f"{base_dir}/{keyword}", exist_ok=True)
    save_file_name = f"{start_date}-{end_date}-{page_num}.csv"
    out_df.to_csv(f"{base_dir}/{keyword}/{save_file_name}", index=False)
    print(f"saved {base_dir}/{keyword}/{save_file_name}")
    return ret


def merge_dicts(dict_list):
    """Concatenate several scrap_agent results into one dict of lists.

    :param dict_list: iterable of dicts shaped like the scrap_agent return
        value; None entries (pages without results) are skipped
    :return: dict with keys url, title, cafe_name, post_date, text and topic,
        each holding the concatenation of the corresponding input lists
    """
    # The keys mirror the columns produced by scrap_agent.
    result = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": [],
        "topic": []
    }

    for d in dict_list:
        if d is None:
            continue
        for key in result:
            result[key].extend(d.get(key, []))

    return result


def generate_date_range(start_date, end_date, interval):
    """Split an inclusive date span into consecutive windows of ``interval`` days.

    :param start_date: start date in datetimestr formatted in %Y-%m-%d
    :param end_date: end date in datetimestr formatted in %Y-%m-%d
    :param interval: interval of time in DAYS; must be at least 1
    :return: returns list of time intervals that will be fed into daum_cafe_scrapper.

    for example, [[2023-10-01, 2023-10-07],[2023-10-08, 2023-10-14]] for generate_date_range("2023-10-01", "2023-10-14", 7)
    Also, this function can handle when the time range of start and end date is not perfectly divisible by interval
    generate_date_range("2023-10-01", "2023-10-14", 10) will produce output [['2023-10-01', '2023-10-10'], ['2023-10-11', '2023-10-14']]
    A span of a single day yields one window: generate_date_range("2023-10-01", "2023-10-01", 7)
    produces [['2023-10-01', '2023-10-01']]. An empty list is returned when start_date is after end_date.
    :raises ValueError: if interval is smaller than 1
    """
    if interval < 1:
        # A non-positive interval would never advance current_date.
        raise ValueError("interval must be at least 1 day")

    # Convert the start and end date strings to datetime objects
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    # Generate a range of dates from start to end
    date_ranges = []
    current_date = start
    # "<=" so that a final window starting exactly on the end date is kept.
    while current_date <= end:
        current_end = min(current_date + timedelta(days=interval - 1), end)
        date_ranges.append([current_date.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")])
        current_date = current_end + timedelta(days=1)

    return date_ranges


def daum_cafe_scrapper(keyword, start_date, end_date, interval, browser_thread_count=1):
    """Scrape Daum cafe search results for a keyword over a date span.

    The span is split into windows of ``interval`` days; for each window the
    result pages are fetched one by one until a page yields no results or the
    last page number is reached.

    :param keyword: search keyword
    :param start_date: start date formatted as %Y-%m-%d
    :param end_date: end date formatted as %Y-%m-%d
    :param interval: window length in days
    :param browser_thread_count: accepted for interface compatibility; the
        pages are currently fetched sequentially and this value is not used
    """
    # NOTE(review): the original comment says Daum page numbering starts at 2 — confirm.
    first_page = 2
    last_page = 502
    date_ranges = generate_date_range(start_date, end_date, interval)
    print("!!")
    for date_range in date_ranges:
        current_page = first_page
        # Stop at the first empty page, and never go past last_page.
        while current_page <= last_page:
            url = get_url(keyword, date_range[0], date_range[1], current_page)
            ret = scrap_agent(url, keyword, date_range[0], date_range[1], current_page)
            print(ret)
            if ret is None:
                break
            current_page += 1


if __name__ == "__main__":
    # scrap_agent("https://search.daum.net/search?&p=3&q=%EA%B5%AC%EB%AF%B8+%EC%A7%80%EC%82%B0%EB%8F%99&col=cafe", "dummy", "dummy", "dummy1", "dummy")
    # scrap_agent("https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=%EA%B5%AC%EB%AF%B8&col=cafe&sort=recency&sd=20221111000000&ed=20221111235959&period=u&p=5",
    #             "구미",
    #             "2020",
    #             "2020")

    daum_cafe_scrapper("선산읍", start_date="2023-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("고아읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("산동읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("도개면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("장천면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("무을면", start_date="2023-10-23", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("해평면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("옥성면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("구미 송정동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
    # daum_cafe_scrapper("원평동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
    # daum_cafe_scrapper("구미 지산동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
    # daum_cafe_scrapper("도량동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
    # daum_cafe_scrapper("선주원남동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
    # daum_cafe_scrapper("신평1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("신평2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("형곡1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("형곡2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("구미 비산동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("공단동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    # daum_cafe_scrapper("광평동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("상모사곡동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("임오동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("인동동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("진미동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    # daum_cafe_scrapper("양표동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    #
    # daum_cafe_scrapper("구미", start_date="2022-01-01", end_date="2023-10-31", interval=1, browser_thread_count=1)
    # daum_cafe_scrapper("구미시장", start_date="2022-01-01", end_date="2023-10-31", interval=7, browser_thread_count=1)