import regex
import time
import math
import pandas as pd
import os
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from tqdm import tqdm

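# Browser-style User-Agent header for HTTP requests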
HEADER = {"User-Agent": "Mozilla/119.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}


def generate_date_range(start_date, end_date):
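    """Return a list of 'YYYY-MM-DD' date strings from start_date to end_date, inclusive."""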
    # Convert the start and end date strings to datetime objects
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    # Generate a range of dates from start to end
    date_range = []
    current_date = start
    while current_date <= end:
        date_range.append(current_date.strftime("%Y-%m-%d"))
        current_date += timedelta(days=1)

    return date_range


def get_url(keyword, start_date, end_date, page_num):
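    """Build the Naver blog search URL for a keyword, date range, and page number."""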
    url = f"https://section.blog.naver.com/Search/Post.naver?pageNo={page_num}&rangeType=PERIOD&orderBy=sim&startDate={start_date}&endDate={end_date}&keyword={keyword}"
    return url


def get_article_count(plain_html):
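    """Extract the total result count (the number shown before "건") from the page HTML."""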
    # Match the total result count, e.g. "1,234건", in the rendered search page
    regex_match_count = regex.search(r'\d+(?:,\d{3})*\s*건\s*<', plain_html)
    if regex_match_count is None:
        return 0
    regex_match_count = regex.sub(r'\s*건\s*<', '', regex_match_count.group())
    match_count = regex_match_count.replace(',', '')
    return int(match_count)


def get_total_page(url):
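    """Return the number of result pages for a search URL (Naver lists 7 posts per page)."""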
    # Get the installed Chrome WebDriver path
    chrome_path = ChromeDriverManager().install()
    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('--headless')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--disable-gpu')
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # Initialize the WebDriver and render the search page
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(3)
    plain_html = driver.page_source
    driver.quit()
    article_count = get_article_count(plain_html)
    total_page = math.ceil(article_count / 7)  # 7 results per page
    return total_page


def scrap_agent(url):
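    """Collect the blog post URLs listed on a single search-result page."""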
    # Get the installed Chrome WebDriver path
    chrome_path = ChromeDriverManager().install()
    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('--headless')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--disable-gpu')
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # Initialize the WebDriver and render the search page
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(2)
    # The search results are rendered inside the "area_list_search" container
    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "html body ui-view div#wrap.wrap main#container.container div.layout_content div#content.content section.wrap_search_list div.area_list_search")
    if not single_page_search_result:
        driver.quit()
        return []

    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    driver.quit()
    # Pull the blog post URLs out of the result markup
    search_result_urls = regex.findall(r"https:\/\/blog\.naver\.com\/[\uAC00-\uD7AFa-zA-Z0-9_]+\/[\uAC00-\uD7AFa-zA-Z0-9_]+",
                  search_result_plain_html)
    # Convenient one-liner that removes duplicates (order is not preserved)
    search_result_urls = list(set(search_result_urls))

    return search_result_urls


def blog_url_scrapper(keyword, start_date, end_date):
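    """Scrape blog post URLs for a keyword day by day and write each day's results to a CSV file."""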
    dates_to_search = generate_date_range(start_date=start_date, end_date=end_date)
    iter_over_dates = tqdm(dates_to_search, position=0, leave=True)
    for dates in iter_over_dates:
        iter_over_dates.set_description(f"Collecting over a time interval, current day - {dates}")

        search_results = []
        url = get_url(keyword, dates, dates, 1)
        total_page = get_total_page(url)

        iter_over_pages = tqdm(range(1, total_page+1), position=0, leave=False)
        for i in iter_over_pages:
            iter_over_pages.set_description(f"current page-{i}/{total_page}")

            url = get_url(keyword, start_date=dates, end_date=dates, page_num=i)
            search_result = scrap_agent(url)
            search_results += search_result

        df_data = {
            "url" : search_results,
            "postdate" : [dates] * len(search_results),
            "creator" : [""] * len(search_results),
            "title" : [""] * len(search_results),
            "contents" : [""] * len(search_results)
        }
        df_out = pd.DataFrame(df_data)
        # Ensure the output directory (including the "blog" parent) exists
        os.makedirs(f"blog/{keyword}", exist_ok=True)

        if len(df_out) > 0:
            df_out.to_csv(f"blog/{keyword}/{dates}.csv", index=False)
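

if __name__ == "__main__":
    # Minimal usage sketch; the keyword and date range below are placeholder values
    blog_url_scrapper(keyword="python", start_date="2023-01-01", end_date="2023-01-03")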