"""Scrape the body text of Naver blog posts with Selenium and save it as CSV.

Each unique URL in ``naver_urls`` is opened in Chrome, the ``mainFrame``
iframe is entered, and the markup selected by ``div.se-main-container`` is
reduced to plain text. The texts are written to ``blog.csv`` together with
the ``titles`` and ``postdate`` columns.

NOTE(review): ``naver_urls``, ``titles`` and ``postdate`` are not defined in
this script; they are assumed to be provided by an earlier step and to be
index-aligned with each other -- confirm against the caller.
"""
import time

import pandas as pd
import regex
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

# Get the install path of the Chrome WebDriver.
chrome_path = ChromeDriverManager().install()

# WebDriver settings.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

# Initialise the WebDriver.
driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
driver.implicitly_wait(3)

# Meant to prevent ConnectionError; not referenced by the code below.
headers = {"User-Agent": "Mozilla/119.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}

contents = []
comments_texts = []

# Matches any HTML tag so that it can be stripped from the markup.
pattern1 = '<[^>]*>'
# Script boilerplate that is left in the text once the tags are stripped. The
# leading "[" comes from the list repr produced by str() on the select()
# result. Defined once here because it does not change between URLs.
pattern2 = """[\n\n\n\n\n// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}"""

# De-duplicate the URLs while keeping their original order. A plain set()
# would shuffle them, so the scraped contents would no longer line up with
# the rows of titles / postdate.
# NOTE(review): if duplicates are actually removed here, contents will be
# shorter than titles / postdate and the DataFrame construction below will
# fail -- verify that the inputs are already unique.
naver_urls = list(dict.fromkeys(naver_urls))

try:
    for naver_url in naver_urls:
        print(naver_url)
        try:
            driver.get(naver_url)
            time.sleep(5)  # wait for the page to load; adjust as needed

            # The element with id "mainFrame" is an iframe that holds the
            # HTML we are looking for, so switch into it before reading.
            iframe = driver.find_element(By.ID, "mainFrame")
            driver.switch_to.frame(iframe)
            source = driver.page_source
            html = BeautifulSoup(source, "html.parser")

            # Debugging aid: dump the parsed HTML to inspect the result.
            # with open("Output.txt", "w") as text_file:
            #     text_file.write(str(html))

            # Take only the post body and turn the matches into one string.
            content = str(html.select("div.se-main-container"))

            # Strip the HTML tags and tidy up the remaining text.
            content = regex.sub(pattern=pattern1, repl='', string=content)
            content = content.replace(pattern2, '')
            content = content.replace('\n', '')
            content = content.replace('\u200b', '')
        except Exception as exc:
            # Best effort: one bad page must not abort the whole run. Record
            # a placeholder so this row still lines up with its title/date.
            print(f"failed to scrape {naver_url}: {exc!r}")
            content = 'error'
        contents.append(content)
finally:
    # Always close the browser, even if the run is interrupted.
    driver.quit()

news_df = pd.DataFrame({'title': titles, 'content': contents, 'date': postdate})
news_df.to_csv('blog.csv', index=False, encoding='utf-8-sig')