
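"""Scrape Naver blog post URLs for a keyword over a date range.

For each day in the range the script loads Naver's blog search
(section.blog.naver.com) in headless Chrome, reads the result count to work
out how many pages there are, collects every blog.naver.com post URL on each
page, and writes the day's URLs to a CSV under blog/<keyword>/.
"""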
import regex
import time
import math
import pandas as pd
import os
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from tqdm import tqdm
# Note: this header is not used by the Selenium calls below.
HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
def generate_date_range(start_date, end_date):
# Convert the start and end date strings to datetime objects
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")
# Generate a range of dates from start to end
date_range = []
current_date = start
while current_date <= end:
date_range.append(current_date.strftime("%Y-%m-%d"))
current_date += timedelta(days=1)
return date_range
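# Example: generate_date_range("2023-01-01", "2023-01-03")
# returns ["2023-01-01", "2023-01-02", "2023-01-03"].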
def get_url(keyword, start_date, end_date, page_num):
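    # rangeType=PERIOD restricts results to [start_date, end_date];
    # orderBy=sim appears to request relevance ("similarity") ordering.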
url = f"https://section.blog.naver.com/Search/Post.naver?pageNo={page_num}&rangeType=PERIOD&orderBy=sim&startDate={start_date}&endDate={end_date}&keyword={keyword}"
return url
def get_article_count(plain_html):
    # The search page shows a result counter such as "12,345건 <";
    # extract the number from the rendered HTML.
    count_match = regex.search(r'\d[\d,]*\s*건\s*<', plain_html)
    if count_match is None:
        return 0
    count_text = regex.sub(r'\s*건\s*<', '', count_match.group())
    return int(count_text.replace(',', ''))
def create_driver():
    # Shared headless Chrome setup used by get_total_page and scrap_agent.
    # Get the Chrome WebDriver install path
    chrome_path = ChromeDriverManager().install()
    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
    # Initialize the WebDriver
    return webdriver.Chrome(service=ChromeService(chrome_path), options=options)

def get_total_page(url):
    driver = create_driver()
    driver.get(url)
    time.sleep(3)
    plain_html = driver.page_source
    driver.quit()
    article_count = get_article_count(plain_html)
    # Naver's blog search lists 7 posts per results page.
    total_page = math.ceil(article_count / 7)
    return total_page
def scrap_agent(url):
    driver = create_driver()
    driver.get(url)
    time.sleep(2)
    # The result list is rendered inside div.area_list_search.
    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "div.area_list_search")
    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    driver.quit()
    search_result_urls = regex.findall(r"https://blog\.naver\.com/[\uAC00-\uD7AFa-zA-Z0-9_]+/[\uAC00-\uD7AFa-zA-Z0-9_]+",
                                       search_result_plain_html)
    # list(set(...)) is a convenient one-liner that removes duplicate URLs.
    search_result_urls = list(set(search_result_urls))
    return search_result_urls
def blog_url_scraper(keyword, start_date, end_date):
dates_to_search = generate_date_range(start_date=start_date,end_date=end_date)
iter_over_dates = tqdm(dates_to_search, position=0, leave=True)
for dates in iter_over_dates:
        iter_over_dates.set_description(f"Collecting over a date range, current day - {dates}")
search_results = []
url = get_url(keyword, dates, dates, 1)
total_page = get_total_page(url)
iter_over_pages = tqdm(range(1, total_page+1), position=0, leave=False)
for i in iter_over_pages:
iter_over_pages.set_description(f"current page-{i}/{total_page}")
url = get_url(keyword, start_date=dates, end_date=dates, page_num=i)
search_result = scrap_agent(url)
search_results += search_result
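        # One row per collected URL; creator, title, and contents are left
        # blank and only the URL and post date are filled in at this stage.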
df_data = {
"url" : search_results,
"postdate" : [dates] * len(search_results),
"creator" : [""] * len(search_results),
"title" : [""] * len(search_results),
"contents" : [""] * len(search_results)
}
df_out = pd.DataFrame(df_data)
        # Create blog/<keyword>/ (including any missing parents) if needed.
        os.makedirs(f"blog/{keyword}", exist_ok=True)
        if len(df_out) > 0:
            df_out.to_csv(f"blog/{keyword}/{dates}.csv", index=False)
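
if __name__ == "__main__":
    # Minimal usage sketch: the keyword and date range below are illustrative
    # placeholders, not values taken from the original script. Each day's
    # URLs are written to blog/python/<date>.csv.
    blog_url_scraper("python", start_date="2023-01-01", end_date="2023-01-07")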