import random
import re
import regex
import time
import math
import pandas as pd
import os
import bs4
from bs4 import MarkupResemblesLocatorWarning
from datetime import datetime, timedelta
from selenium import webdriver
from selenium.webdriver.common.by import By
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.chrome.service import Service as ChromeService
from tqdm import tqdm
from joblib import Parallel, delayed
from utils.cache_clear import delete_cache
from utils.infinity_scroll import infinity_scroll_to_the_bottom
import warnings

def remove_tags(html):
    # parse html content
    soup = bs4.BeautifulSoup(html, "html.parser")

    for data in soup(['style', 'script']):
        # Remove tags
        data.decompose()

    # return data by retrieving the tag content
    return ' '.join(soup.stripped_strings)


def scrap_agent(url, keyword, start_date, end_date):
    # Chrome WebDriver 설치 경로 가져오기
    chrome_path = ChromeDriverManager().install()
    # WebDriver 설정
    options = webdriver.ChromeOptions()
    options.add_experimental_option("excludeSwitches", ["enable-automation"])
    options.add_experimental_option("useAutomationExtension", False)
    options.add_argument('headless')
    options.add_argument('window-size=1920x1080')
    options.add_argument("disable-gpu")
    # options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")

    # WebDriver 초기화
    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
    driver.get(url)
    time.sleep(2)

    infinity_scroll_to_the_bottom(driver)


    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, ".lst_view")
    if single_page_search_result==[] :
        print("There is no result! really, None, or you can check it by yourself at :\n"
              f"{url}\n"
              f"this is with keyword {keyword} with time range {start_date} ~ {end_date}")
        return None

    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
    formatter = bs4.formatter.HTMLFormatter(indent=1)
    search_result_plain_html_beautified = bs4.BeautifulSoup(search_result_plain_html, 'html.parser').prettify(formatter=formatter)

    search_result_contents = regex.findall(r'<li class="bx">(.*?)\n<\/li>', search_result_plain_html_beautified, re.DOTALL)

    ret = {
        "url": [],
        "title": [],
        "cafe_name": [],
        "post_date": [],
        "text": []
    }
    for content in search_result_contents:
        content = content.replace("\n", "")
        href = "/".join(str(i) for i in regex.findall(r'href="([^"]+)/([^"]+)/([^"]+)/([^"]+)"', content, re.DOTALL)[0])
        title = regex.search(r'<div class="title_area">(.*?)   </div>',content)
        text = regex.search(r'   <div class="dsc_area">(.*?)   <\/div>',content)
        cafe_name = regex.search(r'    <div class="user_info">(.*?)    </div>',content)
        post_date = regex.search(r'<span class="sub">(.*?)<\/span>',content)
        title = remove_tags(title.group(1))
        # title = title[:-11] # to remove date
        text = remove_tags(text.group(1))
        cafe_name = remove_tags(cafe_name.group(1))
        cafe_name = cafe_name[:-11]
        post_date = remove_tags((post_date.group(1)))

        ret["url"].append(href)
        ret["title"].append(title)
        ret["cafe_name"].append(cafe_name)
        ret["post_date"].append(post_date)
        ret["text"].append(text)
    delete_cache(driver)
    driver.close()

    # merged_result = merge_dicts(ret)
    out_df = pd.DataFrame.from_dict(ret)
    if not os.path.exists(f"cafe/{keyword}"):
        os.mkdir(f"cafe/{keyword}")
    out_df.to_csv(f"cafe/{keyword}/{start_date}-{end_date}.csv", index=False)
    print(f"saved cafe/{keyword}/{start_date}-{end_date}.csv")
    return ret