import os
import random
import time
import warnings
from datetime import datetime, timedelta

import bs4
import pandas as pd
import regex
from bs4 import MarkupResemblesLocatorWarning
from joblib import Parallel, delayed
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromeService
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager

from utils.cache_clear import delete_cache
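
# Presumably why MarkupResemblesLocatorWarning is imported here: silence bs4's
# warning for short snippets (like those passed to remove_tags) that happen to
# look like URLs or filenames.
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning)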


def get_url(keyword, start_date, end_date):
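    """Build a Naver cafe-section search URL for `keyword`, restricted to posts
    dated between `start_date` and `end_date` ("%Y-%m-%d" strings) and sorted
    newest-first."""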
    # '오케이독본점' runs aggressive SEO and floods the results with spam, so
    # the query excludes it (-오케이독본점).
    url = (
        f"https://search.naver.com/search.naver?"
        f'nso=so%3Add%2Cp%3Afrom{start_date.replace("-", "")}to{end_date.replace("-", "")}'
        f'&nso_open=1&prdtype=0&query={keyword}+-오케이독본점'
        f'&sm=mtb_opt'
        f'&st=date'
        f'&stnm=date&where=articleg'
        f'&opt_tab=0'
        f'&date_from={start_date.replace("-", "")}&date_to={end_date.replace("-", "")}'
    )

        # f"https://search.naver.com/search.naver?where=articleg&query={keyword}+-오케이독본점&nso_open=1&prdtype=0"
        #    f"&oquery={keyword}"
        #    f"&ie=utf8&st=rel&date_option=8"
        #    f"&date_from={start_date.replace('-','.')}"
        #    f"&date_to={end_date.replace('-','.')}"
        #    f"&srchby=text&dup_remove=1"
        #    f"&cafe_url=&without_cafe_url="
        #    f"&sm=tab_opt&nso=so%3Add%2Cp%3Afrom{start_date.replace('-','')}to{end_date.replace('-','')}&nso_open=1&prdtype=0")
    return url

def remove_tags(html):
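    """Return the visible text of an HTML fragment, with <style>/<script>
    blocks removed and whitespace collapsed."""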
    # parse html content
    soup = bs4.BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        # Remove tags
        data.decompose()

    # Join the stripped text fragments into a single space-separated string
    return ' '.join(soup.stripped_strings)

def infinity_scroll_to_the_bottom(driver):
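    """Scroll down one screen at a time (with random jitter and pauses) until
    the page stops growing or MAXITER steps are reached, so the lazily-loaded
    result list is fully rendered. Returns the driver."""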
    MAXITER = 2000  # Hard cap on the number of scroll steps
    scroll_pause_time = 0.1 + random.random() * 2  # Randomized pause between scrolls
    screen_height = driver.execute_script("return window.screen.height;")  # Screen height, used as the per-step scroll distance
    i = 1
    while i < MAXITER:
        # Scroll down one screen, with random jitter to look less bot-like
        driver.execute_script(f"window.scrollTo(0, {screen_height * i + random.randint(1, 500)});")
        i += 1
        time.sleep(scroll_pause_time)

        # Stop once the next scroll target would pass the end of the page
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        if screen_height * i > scroll_height:
            break
    return driver

def scrap_agent(url, keyword, start_date, end_date):
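    """Render one search-results page in headless Chrome, scroll it to the
    bottom so every result loads, parse each post's URL, title, cafe name,
    date, and preview text, and write the rows to
    cafe/{keyword}/{start_date}-{end_date}.csv."""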
    # Get the install path of the Chrome WebDriver (downloads it on first run)
    chrome_path = ChromeDriverManager().install()
    # Configure headless Chrome; the two experimental options hide common automation fingerprints
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('--headless')
    options.add_argument('--window-size=1920,1080')
    options.add_argument('--disable-gpu')
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # Start the browser and load the search results page
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(2)

    infinity_scroll_to_the_bottom(driver)

    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, ".lst_view")
    if not single_page_search_result:
        # No results for this keyword/date window: clean up and return empty columns
        delete_cache(driver)
        driver.quit()
        return {"url": [], "title": [], "cafe_name": [], "post_date": [], "text": []}

    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    # Prettify with indent=1: the extraction regexes below rely on this exact indentation
    formatter = bs4.formatter.HTMLFormatter(indent=1)
    search_result_plain_html_beautified = bs4.BeautifulSoup(search_result_plain_html, 'html.parser').prettify(formatter=formatter)

    # Each result item is an <li class="bx">...</li> block
    search_result_contents = regex.findall(r'<li class="bx">(.*?)\n<\/li>', search_result_plain_html_beautified, regex.DOTALL)

    ret = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": []
    }
    for content in search_result_contents:
        content = content.replace("\n", "")
        # Rejoin the captured path segments to reconstruct the full post URL
        href = "/".join(regex.findall(r'href="([^"]+)/([^"]+)/([^"]+)/([^"]+)"', content, regex.DOTALL)[0])
        # The literal runs of spaces in these patterns come from prettify(indent=1) above
        title = regex.search(r'<div class="title_area">(.*?)   </div>', content)
        text = regex.search(r'   <div class="dsc_area">(.*?)   <\/div>', content)
        cafe_name = regex.search(r'    <div class="user_info">(.*?)    </div>', content)
        post_date = regex.search(r'<span class="sub">(.*?)<\/span>', content)
        title = remove_tags(title.group(1))
        text = remove_tags(text.group(1))
        cafe_name = remove_tags(cafe_name.group(1))
        cafe_name = cafe_name[:-11]  # Drop the trailing post date appended to the cafe name
        post_date = remove_tags(post_date.group(1))

        ret["url"].append(href)
        ret["title"].append(title)
        ret["cafe_name"].append(cafe_name)
        ret["post_date"].append(post_date)
        ret["text"].append(text)
    delete_cache(driver)
    driver.quit()  # quit() rather than close(), so the chromedriver process exits too

    out_df = pd.DataFrame.from_dict(ret)
    os.makedirs(f"cafe/{keyword}", exist_ok=True)  # Also creates the parent "cafe" dir if missing
    out_df.to_csv(f"cafe/{keyword}/{start_date}-{end_date}.csv", index=False)
    print(f"saved cafe/{keyword}/{start_date}-{end_date}.csv")
    return ret

def merge_dicts(dict_list):
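    """Merge a list of per-interval result dicts (as returned by scrap_agent)
    into a single dict of concatenated lists."""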
    # Initialize the result dictionary with empty lists
    result = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": []
    }

    # Iterate through each dictionary and merge the lists
    for d in dict_list:
        for key in result.keys():
            result[key].extend(d.get(key, []))

    return result


def generate_date_range(start_date, end_date, interval):
    """
    :param start_date: start date in datetimestr formatted in %Y-%m-%d
    :param end_date: end date in datetimestr formatted in %Y-%m-%d
    :param interval: interval of time in DAYS
    :return: returns list of time interval that will be feeded into naver_blog_scrapper.
    for example, [[2023-10-01, 2023-10-07],[2023-10-08, 2023-10-14]] for generate_date_range("2023-10-01", "2023-10-14", 7)
    Also, this function can handle when the time range of start and end date is not perfectly divisible by interval
    generate_date_range("2023-10-01", "2023-10-14", 10) will produce output [['2023-10-01', '2023-10-10'], ['2023-10-11', '2023-10-14']]
    """

    # Convert the start and end date strings to datetime objects
    start = datetime.strptime(start_date, "%Y-%m-%d")
    end = datetime.strptime(end_date, "%Y-%m-%d")

    # Generate a range of dates from start to end
    date_ranges = []
    current_date = start
    while current_date < end:
        current_end = min(current_date + timedelta(days=interval - 1), end)
        date_ranges.append([current_date.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")])
        current_date = current_end + timedelta(days=1)

    return date_ranges


def naver_cafe_scrapper(keyword, start_date, end_date, interval, browser_thread_count=1):
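    """Split [start_date, end_date] into `interval`-day windows, build a search
    URL per window, and scrape the windows with up to `browser_thread_count`
    parallel browser processes. Each window is written to its own CSV by
    scrap_agent; the per-window result dicts are returned as a list."""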
    date_ranges = generate_date_range(start_date, end_date, interval)
    urls = [get_url(keyword, date_range[0], date_range[1]) for date_range in date_ranges]

    results = Parallel(n_jobs=browser_thread_count)(
        delayed(scrap_agent)(url, keyword, date_ranges[i][0], date_ranges[i][1])
        for i, url in enumerate(urls)
    )
    return results


if __name__ == "__main__":
    naver_cafe_scrapper("선산읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("고아읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("산동읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("도개면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("장천면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("무을면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("해평면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("옥성면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("구미 송정동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    naver_cafe_scrapper("원평동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    naver_cafe_scrapper("구미 지산동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    naver_cafe_scrapper("도량동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    naver_cafe_scrapper("선주원남동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    naver_cafe_scrapper("신평1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("신평2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("형곡1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("형곡2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("구미 비산동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("공단동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
    naver_cafe_scrapper("광평동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("상모사곡동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("임오동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("인동동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("진미동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
    naver_cafe_scrapper("양표동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)

    # naver_cafe_scrapper("구미", start_date="2022-01-01", end_date="2023-10-31", interval=1, browser_thread_count=1)
    # naver_cafe_scrapper("구미시장", start_date="2022-01-01", end_date="2023-10-31", interval=7, browser_thread_count=1)

