
# File name
# Commit message
# Commit date
import pandas as pd
import regex
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
# Locate (downloading if necessary) a Chrome WebDriver binary that matches
# the installed Chrome, and remember its path for the Service below.
chrome_path = ChromeDriverManager().install()
# Chrome options: hide the "controlled by automated software" banner and
# disable the automation extension so the browser looks less like a bot.
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)
# Start the browser; implicitly wait up to 3 s whenever locating elements.
driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
driver.implicitly_wait(3)
# Browser-like User-Agent intended to avoid ConnectionError on plain HTTP
# requests. NOTE(review): `headers` is never used in this chunk — presumably
# consumed later in the file (e.g. by requests); verify before removing.
headers = {"User-Agent": "Mozilla/119.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}
contents = []  # cleaned post bodies, one entry per scraped URL
comments_texts = []  # NOTE(review): never appended to in this chunk — confirm it is used elsewhere
pattern1 = '<[^>]*>'  # regex matching any single HTML tag, used to strip markup
# Deduplicate the collected URLs, then scrape the post body of each one.
naver_urls = list(set(naver_urls))
try:
    for naver_url in naver_urls:
        print(naver_url)
        driver.get(naver_url)
        time.sleep(5)  # give the page (and its iframe) time to load; tune as needed

        # Naver blog posts are rendered inside an iframe with id "mainFrame";
        # switch into it so page_source exposes the actual post HTML.
        iframe = driver.find_element(By.ID, "mainFrame")
        driver.switch_to.frame(iframe)

        html = BeautifulSoup(driver.page_source, "html.parser")

        # The post body lives in div.se-main-container (SmartEditor layout).
        # Join the matched elements' HTML element-by-element: the original
        # ''.join(str(content)) stringified the whole LIST, so the list's
        # "[" and "]" survived tag-stripping and leaked into the saved text.
        content = ''.join(str(tag) for tag in html.select("div.se-main-container"))

        # Strip HTML tags, the Flash-workaround script Naver injects,
        # newlines, and zero-width spaces.
        content = regex.sub(pattern=pattern1, repl='', string=content)
        content = content.replace(
            '// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}', '')
        content = content.replace('\n', '').replace('\u200b', '')
        contents.append(content)
except Exception as exc:  # narrowed from bare `except:` so Ctrl-C still works
    # Record a placeholder so the column stays aligned with whatever was
    # scraped, then fall through to the single CSV write below.
    print(f'scrape failed: {exc!r}')
    contents.append('error')
finally:
    # Write the results exactly once. The original rebuilt and rewrote the
    # CSV on every iteration, where the partially-filled `contents` list no
    # longer matches the length of `titles`/`postdate` and the DataFrame
    # constructor raises ValueError (and the duplicate write in the except
    # branch failed the same way).
    news_df = pd.DataFrame({'title': titles, 'content': contents, 'date': postdate})
    news_df.to_csv('blog.csv', index=False, encoding='utf-8-sig')