Code - yjyoon/구미시-네이버-블로그-스크레퍼

import math
import os
import random
import re
import time
import warnings
from datetime import datetime, timedelta

import bs4
import pandas as pd
import regex
from bs4 import MarkupResemblesLocatorWarning
from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from tqdm import tqdm
from webdriver_manager.chrome import ChromeDriverManager

from utils.cache_clear import delete_cache
from utils.infinity_scroll import infinity_scroll_to_the_bottom


def remove_tags(html):
    """Return the visible text of an HTML fragment as one space-joined string.

    ``<style>`` and ``<script>`` elements are removed before the text is
    collected, so their contents never appear in the result.
    """
    soup = bs4.BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        # Drop the element together with everything inside it.
        data.decompose()

    # stripped_strings yields each text node with surrounding whitespace removed.
    return ' '.join(soup.stripped_strings)


def _parse_search_results(list_html):
    """Turn the inner HTML of the result list into a dict of column lists.

    Returns a dict with the keys ``url``, ``title``, ``cafe_name``,
    ``post_date`` and ``text``; all lists have the same length. Items whose
    markup does not contain every expected field are skipped (with a printed
    notice) so that a single odd entry cannot abort the whole page.
    """
    # Re-indent the markup with one space per level; the patterns below rely
    # on that exact layout once the newlines have been removed.
    formatter = bs4.formatter.HTMLFormatter(indent=1)
    beautified = bs4.BeautifulSoup(list_html, 'html.parser').prettify(formatter=formatter)
    items = regex.findall(r'<li class="bx">(.*?)\n<\/li>', beautified, re.DOTALL)

    ret = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": []
    }

    for content in items:
        content = content.replace("\n", "")

        # The pattern splits the first matching href at its last three slashes;
        # joining the groups with "/" restores the original attribute value.
        href_parts = regex.findall(r'href="([^"]+)/([^"]+)/([^"]+)/([^"]+)"', content, re.DOTALL)
        title = regex.search(r'<div class="title_area">(.*?) </div>', content)
        text = regex.search(r' <div class="dsc_area">(.*?) <\/div>', content)
        cafe_name = regex.search(r' <div class="user_info">(.*?) </div>', content)
        post_date = regex.search(r'<span class="sub">(.*?)<\/span>', content)

        if not href_parts or None in (title, text, cafe_name, post_date):
            # Skip the whole item so the column lists stay aligned.
            print("skipped a search result that does not match the expected markup")
            continue

        ret["url"].append("/".join(href_parts[0]))
        ret["title"].append(remove_tags(title.group(1)))
        # NOTE(review): the last 11 characters are cut off, presumably a
        # trailing date rendered inside the user_info block - confirm.
        ret["cafe_name"].append(remove_tags(cafe_name.group(1))[:-11])
        ret["post_date"].append(remove_tags(post_date.group(1)))
        ret["text"].append(remove_tags(text.group(1)))

    return ret


def scrap_agent(url, keyword, start_date, end_date):
    """Scrape one search-result page and save the parsed rows as a CSV file.

    The page at ``url`` is opened in headless Chrome and scrolled to the
    bottom, then the first ``.lst_view`` element is parsed into rows.

    Args:
        url: Address of the search-result page to load.
        keyword: Search keyword; used in messages and as the output folder
            name (``cafe/<keyword>``).
        start_date: Start of the searched range; only interpolated into
            messages and the output file name.
        end_date: End of the searched range; used like ``start_date``.

    Returns:
        A dict with the keys ``url``, ``title``, ``cafe_name``, ``post_date``
        and ``text`` (equally long lists), or ``None`` when the page contains
        no ``.lst_view`` element.

    Side effects:
        Writes ``cafe/<keyword>/<start_date>-<end_date>.csv`` and prints
        progress messages.
    """
    # Get the Chrome WebDriver install path.
    chrome_path = ChromeDriverManager().install()

    # WebDriver configuration.
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")

    # Initialize the WebDriver.
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    try:
        driver.get(url)
        time.sleep(2)
        infinity_scroll_to_the_bottom(driver)

        single_page_search_result = driver.find_elements(By.CSS_SELECTOR, ".lst_view")
        if not single_page_search_result:
            print("There is no result! \nreally, None, or you can check it by yourself at :\n"
                  f"{url}\n"
                  f"this is with keyword {keyword} with time range {start_date} ~ {end_date}")
            return None

        search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
        delete_cache(driver)
    finally:
        # Always end the browser session, including on the early return and on
        # errors; quit() also shuts down the chromedriver process.
        driver.quit()

    ret = _parse_search_results(search_result_plain_html)

    out_df = pd.DataFrame.from_dict(ret)
    # Create missing parent folders too, and tolerate an existing folder.
    os.makedirs(f"cafe/{keyword}", exist_ok=True)
    out_df.to_csv(f"cafe/{keyword}/{start_date}-{end_date}.csv", index=False)
    print(f"saved cafe/{keyword}/{start_date}-{end_date}.csv")
    return ret