
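"""Scrape Naver blog post URLs for a keyword over a date range.

For each day in the range the script loads Naver's blog search
(section.blog.naver.com) in headless Chrome, reads the result count to work
out how many pages there are, collects every blog.naver.com post URL on each
page, and writes the day's URLs to a CSV under blog/<keyword>/.
"""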
import regex
import time
import math
import pandas as pd
import os
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from tqdm import tqdm
# Note: this header is not used by the Selenium calls below.
HEADER = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/98.0.4758.102 Safari/537.36"}
def generate_date_range(start_date, end_date):
# Convert the start and end date strings to datetime objects
start = datetime.strptime(start_date, "%Y-%m-%d")
end = datetime.strptime(end_date, "%Y-%m-%d")
# Generate a range of dates from start to end
date_range = []
current_date = start
while current_date <= end:
date_range.append(current_date.strftime("%Y-%m-%d"))
current_date += timedelta(days=1)
return date_range
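# Example: generate_date_range("2023-01-01", "2023-01-03")
# returns ["2023-01-01", "2023-01-02", "2023-01-03"].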
def get_url(keyword, start_date, end_date, page_num):
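    # rangeType=PERIOD restricts results to [start_date, end_date];
    # orderBy=sim appears to request relevance ("similarity") ordering.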
url = f"https://section.blog.naver.com/Search/Post.naver?pageNo={page_num}&rangeType=PERIOD&orderBy=sim&startDate={start_date}&endDate={end_date}&keyword={keyword}"
return url
def get_article_count(plain_html):
    # The search page shows a result counter such as "12,345건 <";
    # extract the number from the rendered HTML.
    count_match = regex.search(r'\d[\d,]*\s*건\s*<', plain_html)
    if count_match is None:
        return 0
    count_text = regex.sub(r'\s*건\s*<', '', count_match.group())
    return int(count_text.replace(',', ''))
def create_driver():
    # Shared headless Chrome setup used by get_total_page and scrap_agent.
    # Get the Chrome WebDriver install path
    chrome_path = ChromeDriverManager().install()
    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
    # Initialize the WebDriver
    return webdriver.Chrome(service=ChromeService(chrome_path), options=options)

def get_total_page(url):
    driver = create_driver()
    driver.get(url)
    time.sleep(3)
    plain_html = driver.page_source
    driver.quit()
    article_count = get_article_count(plain_html)
    # Naver's blog search lists 7 posts per results page.
    total_page = math.ceil(article_count / 7)
    return total_page
def scrap_agent(url):
    driver = create_driver()
    driver.get(url)
    time.sleep(2)
    # The result list is rendered inside div.area_list_search.
    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "div.area_list_search")
    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    driver.quit()
    search_result_urls = regex.findall(r"https://blog\.naver\.com/[\uAC00-\uD7AFa-zA-Z0-9_]+/[\uAC00-\uD7AFa-zA-Z0-9_]+",
                                       search_result_plain_html)
    # list(set(...)) is a convenient one-liner that removes duplicate URLs.
    search_result_urls = list(set(search_result_urls))
    return search_result_urls
def blog_url_scraper(keyword, start_date, end_date):
dates_to_search = generate_date_range(start_date=start_date,end_date=end_date)
iter_over_dates = tqdm(dates_to_search, position=0, leave=True)
for dates in iter_over_dates:
        iter_over_dates.set_description(f"Collecting over a date range, current day - {dates}")
search_results = []
url = get_url(keyword, dates, dates, 1)
total_page = get_total_page(url)
iter_over_pages = tqdm(range(1, total_page+1), position=0, leave=False)
for i in iter_over_pages:
iter_over_pages.set_description(f"current page-{i}/{total_page}")
url = get_url(keyword, start_date=dates, end_date=dates, page_num=i)
search_result = scrap_agent(url)
search_results += search_result
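        # One row per collected URL; creator, title, and contents are left
        # blank and only the URL and post date are filled in at this stage.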
df_data = {
"url" : search_results,
"postdate" : [dates] * len(search_results),
"creator" : [""] * len(search_results),
"title" : [""] * len(search_results),
"contents" : [""] * len(search_results)
}
df_out = pd.DataFrame(df_data)
        # Create blog/<keyword>/ (including any missing parents) if needed.
        os.makedirs(f"blog/{keyword}", exist_ok=True)
        if len(df_out) > 0:
            df_out.to_csv(f"blog/{keyword}/{dates}.csv", index=False)
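
if __name__ == "__main__":
    # Minimal usage sketch: the keyword and date range below are illustrative
    # placeholders, not values taken from the original script. Each day's
    # URLs are written to blog/python/<date>.csv.
    blog_url_scraper("python", start_date="2023-01-01", end_date="2023-01-07")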