import random
import re
import regex
import time
import math
import pandas as pd
import os
import bs4
from bs4 import MarkupResemblesLocatorWarning
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from tqdm import tqdm
from joblib import Parallel, delayed
from utils.cache_clear import delete_cache
import warnings


# Silence bs4's MarkupResemblesLocatorWarning, which is annoying since I KNOW EVERYTHING IS HTML
warnings.filterwarnings("ignore", category=MarkupResemblesLocatorWarning, module='bs4')

HEADER = {"User-Agent": "Mozilla/119.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}


# Strip HTML markup (and drop <style>/<script> blocks), returning the visible text
def remove_tags(html):
    # parse html content
    soup = bs4.BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        # Remove the tag together with its contents
        data.decompose()

    # Return the remaining visible text, joined with single spaces
    return ' '.join(soup.stripped_strings)
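
# A minimal usage sketch (hypothetical input):
#   remove_tags('<div><script>x()</script><p>hello <b>world</b></p></div>')
#   -> 'hello world'  (script/style blocks are dropped, the remaining strings are joined with spaces)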

def get_url(keyword, page_num):
    t_rand = 1702255000000 + random.randint(0,999999)
    url = f"https://section.cafe.naver.com/ca-fe/home/search/articles?q={keyword}&t={t_rand}&em=1&p={page_num}"
    return url
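
# Example of the URL this builds (keyword and page are hypothetical; t is randomized on every call):
#   get_url("강아지", 3)
#   -> "https://section.cafe.naver.com/ca-fe/home/search/articles?q=강아지&t=1702255xxxxxx&em=1&p=3"
# The t parameter looks like a millisecond timestamp and presumably acts as a cache buster.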

def get_total_page(url):
    # Get the Chrome WebDriver installation path
    chrome_path = ChromeDriverManager().install()
    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('--blink-settings=imagesEnabled=false')
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(3)
    total_count = driver.find_element(By.CSS_SELECTOR, ".sub_text")
    total_count = total_count.text
    # Shut the browser down once the result count has been read
    driver.quit()
    # 12 results are listed per search page, hence the division by 12
    total_page = math.ceil(int(total_count.replace(',', '')) / 12)
    return total_page
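
# Worked example of the page math above: if ".sub_text" reads "1,234", then
#   math.ceil(1234 / 12) == 103 pages (the /12 assumes 12 articles per search page).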

def scrap_agent(url):
    # Get the Chrome WebDriver installation path
    chrome_path = ChromeDriverManager().install()
    # Configure the WebDriver
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # Initialize the WebDriver
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(2)
    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, ".item_list")

    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    formatter = bs4.formatter.HTMLFormatter(indent=1)
    search_result_plain_html_beautified = bs4.BeautifulSoup(search_result_plain_html, 'html.parser').prettify(formatter=formatter)

    search_result_contents = regex.findall(r'  <div class="article_item_wrap">(.*?)\n  <\/div>', search_result_plain_html_beautified, re.DOTALL)

    ret = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": []
    }
    for content in search_result_contents:
        content = content.replace("\n", "")
        href = regex.search(r"https:\/\/cafe\.naver\.com\/[\uAC00-\uD7AFa-zA-Z0-9__-]+\?iframe_url\=[\uAC00-\uD7AFa-zA-Z0-9\.\%__-]+", content, re.DOTALL)
        title = regex.search(r'<strong class="title">(.*?)<\/strong>',content)
        text = regex.search(r'<p class="text">(.*?)<\/p>',content)
        cafe_name = regex.search(r'<span class="cafe_name">(.*?)<\/span>',content)
        post_date = regex.search(r'<span class="date">(.*?)<\/span>',content)
        # Skip malformed items where any of the fields failed to match
        if not all([href, title, text, cafe_name, post_date]):
            continue
        href = remove_tags(href.group(0))
        title = remove_tags(title.group(1))
        text = remove_tags(text.group(1))
        cafe_name = remove_tags(cafe_name.group(1))
        post_date = remove_tags(post_date.group(1))

        ret["url"].append(href)
        ret["title"].append(title)
        ret["cafe_name"].append(cafe_name)
        ret["post_date"].append(post_date)
        ret["text"].append(text)
    delete_cache(driver)
    # quit() (rather than close()) fully terminates the chromedriver process
    driver.quit()
    return ret
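
# Shape of the dict scrap_agent returns (values are illustrative, not real data):
#   {
#       "url":       ["https://cafe.naver.com/...?iframe_url=..."],
#       "title":     ["..."],
#       "cafe_name": ["..."],
#       "post_date": ["..."],
#       "text":      ["..."],
#   }
# Each list has one entry per ".article_item_wrap" block found on the page.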

def merge_dicts(dict_list):
    # Initialize the result dictionary with empty lists
    result = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": []
    }

    # Iterate through each dictionary and merge the lists
    for d in dict_list:
        for key in result.keys():
            result[key].extend(d.get(key, []))

    return result
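
# Example (hypothetical inputs):
#   merge_dicts([{"url": ["a"], "title": ["t1"], "cafe_name": ["c"], "post_date": ["d"], "text": ["x"]},
#                {"url": ["b"], "title": ["t2"], "cafe_name": ["c"], "post_date": ["d"], "text": ["y"]}])
#   -> {"url": ["a", "b"], "title": ["t1", "t2"], "cafe_name": ["c", "c"], "post_date": ["d", "d"], "text": ["x", "y"]}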


def range_of_ranges(start, end, step):
    ranges = []
    current_start = start

    while current_start <= end:
        # Calculate the end of the current range
        current_end = min(current_start + step - 1, end)

        # Append the current range as a list
        ranges.append([current_start, current_end])

        # Update the start for the next range
        current_start += step

    return ranges
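
# Example: range_of_ranges(1, 250, 100) -> [[1, 100], [101, 200], [201, 250]]
# Both bounds of each sub-range are inclusive.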


def naver_cafe_scrapper(keyword, step=100, start_page_num=1, browser_thread_count=12):
    url = get_url(keyword, 1)
    total_page_num = get_total_page(url)
    print(f"total_page_num : {total_page_num}")

    # Process pages in chunks of `step`, writing each chunk to its own CSV
    for scope in range_of_ranges(start_page_num, total_page_num, step):
        def parallel_scraping(keyword):
            # scope bounds are inclusive, so include scope[1] itself
            urls = Parallel(n_jobs=-1)(delayed(get_url)(keyword, i) for i in range(scope[0], scope[1] + 1))
            results = Parallel(n_jobs=browser_thread_count)(delayed(scrap_agent)(url) for url in urls)
            return results

        ret = parallel_scraping(keyword)
        merged_result = merge_dicts(ret)
        out_df = pd.DataFrame.from_dict(merged_result)
        # Create cafe/<keyword>/ (including the parent "cafe" directory) if it does not exist yet
        os.makedirs(f"cafe/{keyword}", exist_ok=True)
        out_df.to_csv(f"cafe/{keyword}/{scope[0]}-{scope[1]}.csv", index=False)


if __name__ == "__main__":
    pass
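
    # Example invocation (keyword is hypothetical; output CSVs land in cafe/<keyword>/<start>-<end>.csv):
    # naver_cafe_scrapper("강아지", step=100, start_page_num=1, browser_thread_count=12)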