import pandas as pd
import regex
import time
from bs4 import BeautifulSoup
from selenium.webdriver.common.by import By
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService

# Get the Chrome WebDriver installation path
chrome_path = ChromeDriverManager().install()

# Configure WebDriver options (hide the "controlled by automated software" flags)
options = webdriver.ChromeOptions()
options.add_experimental_option("excludeSwitches", ["enable-automation"])
options.add_experimental_option("useAutomationExtension", False)

# Initialize the WebDriver
driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
driver.implicitly_wait(3)  # wait up to 3 seconds when locating elements

# Request headers to avoid ConnectionError (only relevant when fetching pages with requests;
# Selenium does not use these headers)
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}

contents = []
comments_texts = []
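
# NOTE: naver_urls, titles and postdate are assumed to have been collected in a
# previous step (e.g. when gathering the blog post URLs); they are not defined in this snippet.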

pattern1 = '<[^>]*>'  # regex for stripping HTML tags

# Deduplicate the collected URLs
naver_urls = list(set(naver_urls))

try:
    for naver_url in naver_urls:
        print(naver_url)
        driver.get(naver_url)
        time.sleep(5)  # wait for the page to load (adjust as needed)

        iframe = driver.find_element(By.ID, "mainFrame")  # the post body is rendered inside the iframe with id "mainFrame"
        driver.switch_to.frame(iframe)  # switch into the iframe that contains the HTML we want

        source = driver.page_source
        html = BeautifulSoup(source, "html.parser")
        # For inspecting the fetched HTML:
        # with open("Output.txt", "w") as text_file:
        #     text_file.write(str(html))

        # Extract only the post body (Naver SmartEditor container)
        content = html.select("div.se-main-container")
        # join the matched elements into a single string
        content = ''.join(str(tag) for tag in content)

        # strip HTML tags and tidy up the text
        content = regex.sub(pattern=pattern1, repl='', string=content)
        # remove the flash-workaround script text that Naver injects into the page
        pattern2 = "\n\n\n\n\n// flash 오류를 우회하기 위한 함수 추가\nfunction _flash_removeCallback() {}"
        content = content.replace(pattern2, '')
        content = content.replace('\n', '')
        content = content.replace('\u200b', '')  # strip zero-width spaces
        contents.append(content)

    # build the final DataFrame and save it as CSV
    news_df = pd.DataFrame({'title': titles, 'content': contents, 'date': postdate})
    news_df.to_csv('blog.csv', index=False, encoding='utf-8-sig')
except Exception as e:
    # on failure, record the error and save whatever has been collected so far
    print(f"Error while crawling: {e}")
    contents.append('error')
    news_df = pd.DataFrame({'title': titles, 'content': contents, 'date': postdate})
    news_df.to_csv('blog.csv', index=False, encoding='utf-8-sig')
finally:
    driver.quit()