윤영준 윤영준 2023-12-14
added daum scrapper
@2b523c87dcd6073b0fd59d4b9fbbbadfd68c1da4
 
daum_cafe_scrapper.py (added)
+++ daum_cafe_scrapper.py
@@ -0,0 +1,210 @@
+import re
+import regex
+import time
+import pandas as pd
+import os
+import bs4
+import multiprocessing
+from datetime import datetime, timedelta
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.service import Service as ChromeService
+from joblib import Parallel, delayed
+from utils.cache_clear import delete_cache
+
+
+def get_url(keyword, start_date, end_date, page_num):
+    url = (f"https://search.daum.net/search?"
+           f"w=fusion&nil_search=btn&DA=PGD"
+           f"&q={keyword}"
+           f"&col=cafe&sort=recency"
+           f"&sd={start_date.replace('-','')}000000&ed={end_date.replace('-','')}235959"
+           f"&period=u&p={page_num}")
+    return url
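+# Example (hypothetical values): get_url("구미", "2022-01-01", "2022-01-02", 2) is expected to produce
+# https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=구미&col=cafe&sort=recency&sd=20220101000000&ed=20220102235959&period=u&p=2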
+
+def remove_tags(html):
+    # parse html content
+    soup = bs4.BeautifulSoup(html, "html.parser")
+
+    for data in soup(['style', 'script']):
+        # Remove tags
+        data.decompose()
+
+    # return data by retrieving the tag content
+    return ' '.join(soup.stripped_strings)
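+# e.g. (hypothetical snippet) remove_tags('<b>선산</b> 맛집 <script>x</script>') returns '선산 맛집'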
+
+
+
+def scrap_agent(url, keyword, start_date, end_date, page_num):
+    # Get the installed Chrome WebDriver path
+    chrome_path = ChromeDriverManager().install()
+    # Configure the WebDriver options
+    options = webdriver.ChromeOptions()
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    options.add_experimental_option("useAutomationExtension", False)
+    # options.add_argument('headless')
+    options.add_argument('window-size=1920x1080')
+    options.add_argument("disable-gpu")
+    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
+
+    # Initialize the WebDriver
+    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
+    driver.get(url)
+    time.sleep(2)
+
+    # infinity_scroll_to_the_bottom(driver)
+
+    # find_elements returns an empty list (it does not raise) when nothing matches,
+    # so check the result list instead of relying on an exception
+    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
+    if not single_page_search_result:
+        final_page = regex.search(r'<div id="noResult"', driver.page_source)
+        driver.quit()
+        if final_page:
+            print("No more search results!")
+            return None
+        raise RuntimeError("Error! The page has not loaded properly; check the connection.")
+
+    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
+    formatter = bs4.formatter.HTMLFormatter(indent=1)
+    search_result_plain_html_beautified = bs4.BeautifulSoup(search_result_plain_html, 'html.parser').prettify(formatter=formatter)
+
+    search_result_contents = regex.findall(r'<c-card class="_cubic hydrated">(.*?)\n</c-card>', search_result_plain_html_beautified, re.DOTALL)
+
+    ret = {
+        "url": [],
+        "title": [],
+        "cafe_name": [],
+        "post_date": [],
+        "text": [],
+        "topic": []
+    }
+    for content in search_result_contents:
+        content = content.replace("\n", "")
+        title_and_url_extract = regex.search(r'<div class=\"fmenu-layer layer_item\" (.*?)>', content)
+        href = regex.search(r'data-link=\"(.*?)\"', title_and_url_extract.group(1)).group(1)
+        title = regex.search(r'data-title=\"(.*?)\"', title_and_url_extract.group(1)).group(1)
+        text = regex.search(r'<p class="conts-desc clamp-g">(.*?)</p>',content)
+        cafe_name = regex.search(r'<c-frag class=\"_cubic hydrated\" slot=\"_slt1\">(.*?)</c-frag>', content)
+        post_date = regex.search(r'<c-footer-desc class=\"_cubic hydrated\" slot=\"info\">(.*?)</c-footer-desc>',content)
+        text = remove_tags(text.group(1))
+        cafe_name = remove_tags(cafe_name.group(1))
+        post_date = remove_tags(post_date.group(1))
+
+        topic_extract = regex.search(r'<c-header-item class=\"_cubic hydrated\"(.*?)>', content).group(1)
+        topic = regex.search(r'\"topic\":\{(.*?)\}', topic_extract).group(1)
+        topic = regex.findall(r'\"(.*?)\"', topic)
+        topic = '&'.join(topic)
+
+        ret["url"].append(href)
+        ret["title"].append(title)
+        ret["cafe_name"].append(cafe_name)
+        ret["post_date"].append(post_date)
+        ret["text"].append(text)
+        ret["topic"].append(topic)
+    delete_cache(driver)
+    driver.close()
+
+
+    out_df = pd.DataFrame.from_dict(ret)
+    base_dir = "daum_cafe"
+    if not os.path.exists(f"{base_dir}/{keyword}"):
+        os.mkdir(f"{base_dir}/{keyword}")
+    save_file_name = f"{start_date}-{end_date}-{page_num}.csv"
+    out_df.to_csv(f"{base_dir}/{keyword}/{save_file_name}", index=False)
+    print(f"saved {base_dir}/{keyword}/{save_file_name}")
+    return ret
+
+def merge_dicts(dict_list):
+    # Initialize the result dictionary with empty lists
+    result = {
+        "url": [],
+        "title": [],
+        "blog_name": [],
+        "post_date": [],
+        "text": [],
+        "topic": []
+    }
+
+    # Iterate through each dictionary and merge the lists
+    for d in dict_list:
+        for key in result.keys():
+            result[key].extend(d.get(key, []))
+
+    return result
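+# Usage sketch (hypothetical names): combine the per-page dicts returned by scrap_agent, e.g.
+#   merged = merge_dicts([page_2_results, page_3_results])
+#   combined_df = pd.DataFrame.from_dict(merged)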
+
+
+def generate_date_range(start_date, end_date, interval):
+    """
+    :param start_date: start date in datetimestr formatted in %Y-%m-%d
+    :param end_date: end date in datetimestr formatted in %Y-%m-%d
+    :param interval: interval of time in DAYS
+    :return: returns list of time interval that will be feeded into naver_blog_scrapper.
+    for example, [[2023-10-01, 2023-10-07],[2023-10-08, 2023-10-14]] for generate_date_range("2023-10-01", "2023-10-14", 7)
+    Also, this function can handle when the time range of start and end date is not perfectly divisible by interval
+    generate_date_range("2023-10-01", "2023-10-14", 10) will produce output [['2023-10-01', '2023-10-10'], ['2023-10-11', '2023-10-14']]
+    """
+
+    # Convert the start and end date strings to datetime objects
+    start = datetime.strptime(start_date, "%Y-%m-%d")
+    end = datetime.strptime(end_date, "%Y-%m-%d")
+
+    # Generate a range of dates from start to end
+    date_ranges = []
+    current_date = start
+    while current_date < end:
+        current_end = min(current_date + timedelta(days=interval - 1), end)
+        date_ranges.append([current_date.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")])
+        current_date = current_end + timedelta(days=1)
+
+    return date_ranges
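+# Quick self-check matching the docstring example:
+#   assert generate_date_range("2023-10-01", "2023-10-14", 7) == [["2023-10-01", "2023-10-07"], ["2023-10-08", "2023-10-14"]]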
+
+
+def daum_cafe_scrapper(keyword, start_date, end_date, interval, browser_thread_count=1):
+    # browser_thread_count is accepted but not used yet (no parallel scraping in this version)
+    last_page = 502
+    date_ranges = generate_date_range(start_date, end_date, interval)
+    for date_range in date_ranges:
+        current_page = 2  # daum starts the page_num from 2, so reset it for every date range
+        ret = 0
+        while ret is not None and current_page <= last_page:
+            url = get_url(keyword, date_range[0], date_range[1], current_page)
+            ret = scrap_agent(url, keyword, date_range[0], date_range[1], current_page)
+            current_page += 1
+
+if __name__ == "__main__":
+    scrap_agent("https://search.daum.net/search?nil_suggest=btn&w=fusion&DA=SBC&q=%EA%B5%AC%EB%AF%B8&sd=20220101000000&ed=20220101235959&period=u&col=cafe&p=2", "dummy", "dummy", "dummy", "dummy")
+    # scrap_agent("https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=%EA%B5%AC%EB%AF%B8&col=cafe&sort=recency&sd=20221111000000&ed=20221111235959&period=u&p=5",
+    #             "구미",
+    #             "2020",
+    #             "2020")
+    # daum_cafe_scrapper("선산읍", start_date="2022-01-01", end_date="2022-01-02", interval=1, browser_thread_count=1)
+    # daum_cafe_scrapper("고아읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("산동읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("도개면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("장천면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("무을면", start_date="2023-10-23", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("해평면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("옥성면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("구미 송정동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("원평동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("구미 지산동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("도량동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("선주원남동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("신평1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("신평2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("형곡1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("형곡2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("구미 비산동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("공단동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
+    # daum_cafe_scrapper("광평동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("상모사곡동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("임오동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("인동동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("진미동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("양표동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    #
+    # daum_cafe_scrapper("구미", start_date="2022-01-01", end_date="2023-10-31", interval=1, browser_thread_count=1)
+    # daum_cafe_scrapper("구미시장", start_date="2022-01-01", end_date="2023-10-31", interval=7, browser_thread_count=1)
+
 
utils/infinity_scroll.py (added)
+++ utils/infinity_scroll.py
@@ -0,0 +1,20 @@
+import random
+from time import sleep
+
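+# Usage sketch (assumed wiring; this commit only references the helper via a
+# commented-out call inside daum_cafe_scrapper.scrap_agent):
+#   from utils.infinity_scroll import infinity_scroll_to_the_bottom
+#   driver = infinity_scroll_to_the_bottom(driver, scroll_multiplier=1, MAXITER=2000)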
+def infinity_scroll_to_the_bottom(driver, scroll_multiplier=1, MAXITER=2000):
+    # Automatically scroll the page
+    scroll_pause_time = 0.1 + random.random() * 2  # Pause between each scroll
+    screen_height = driver.execute_script("return window.screen.height;")  # Browser window height
+    i = 1
+    while i < MAXITER:
+        # Scroll down
+        driver.execute_script(f"window.scrollTo(0, {screen_height * i * scroll_multiplier + random.randint(1, 500)});")
+        i += 1
+        sleep(scroll_pause_time)
+
+        # Check if reaching the end of the page
+        scroll_height = driver.execute_script("return document.body.scrollHeight;")
+        # print(scroll_height)
+        # account for scroll_multiplier so the stop condition matches the distance actually scrolled
+        if screen_height * i * scroll_multiplier > scroll_height:
+            break
+    return driver
(No newline at end of file)