윤영준 2023-12-14
Found the workaround for Daum: percent-encode the Hangul keyword in the search URL
@f1427a6e80524cb6df4202d8173f6084f91b96ea
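The search URL now percent-encodes the Hangul keyword with urllib.parse.quote in get_url. The brittle try/except around the result container is replaced with an explicit empty-result check (scrap_agent returns None when no URLs are collected), current_page is reset to 2 for every date range, and the scrap_agent call now passes the keyword instead of the interval.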
daum_cafe_scrapper.py
--- daum_cafe_scrapper.py
+++ daum_cafe_scrapper.py
@@ -4,6 +4,7 @@
 import pandas as pd
 import os
 import bs4
+import urllib.parse
 import multiprocessing
 from datetime import datetime, timedelta
 from selenium import webdriver
@@ -17,7 +18,7 @@
 def get_url(keyword, start_date, end_date, page_num):
     url = (f"https://search.daum.net/search?"
            f"w=fusion&nil_search=btn&DA=PGD"
-           f"&q={keyword}"
+           f"&q={urllib.parse.quote(keyword)}"
            f"&col=cafe&sort=recency"
            f"&sd={start_date.replace('-','')}000000&ed={end_date.replace('-','')}235959"
            f"&period=u&p={page_num}")
@@ -37,6 +38,7 @@
 
 
 def scrap_agent(url, keyword, start_date, end_date, page_num):
+    print(f"working on {start_date} to {end_date}")
     # Get the Chrome WebDriver installation path
     chrome_path = ChromeDriverManager().install()
     # WebDriver 설정
@@ -55,15 +57,7 @@
 
     # infinity_scroll_to_the_bottom(driver)
 
-    try :
-        single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
-    except :
-        final_page = regex.search(r'<div id=\"noResult\"', driver.page_source)
-        try :
-            print("No more search_result!")
-            return None
-        except :
-            raise "Error! the page has not loaded properly, you must look for connection!"
+    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
 
     search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
     formatter = bs4.formatter.HTMLFormatter(indent=1)
@@ -105,6 +99,8 @@
     delete_cache(driver)
     driver.close()
 
+    if ret["url"] == []:
+        return None
 
     out_df = pd.DataFrame.from_dict(ret)
     base_dir = "daum_cafe"
@@ -161,25 +157,26 @@
 
 
 def daum_cafe_scrapper(keyword, start_date, end_date, interval, browser_thread_count=1):
-
+    # daum starts the page_num from 2
     last_page = 502
-    current_page = 2 # daum starts the page_num from 2
     date_ranges = generate_date_range(start_date, end_date, interval)
     print("!!")
     for date_range in date_ranges:
         ret = 0
-        while ret is not None:
+        current_page = 2
+        while ret is not None and current_page <= last_page:
             url = get_url(keyword, date_range[0], date_range[1], current_page)
-            ret = scrap_agent(url, date_range[0], date_range[1], interval, current_page)
+            ret = scrap_agent(url, keyword, date_range[0], date_range[1], current_page)
+            print(ret)
             current_page += 1
 
 if __name__ == "__main__":
-    scrap_agent("https://search.daum.net/search?nil_suggest=btn&w=fusion&DA=SBC&q=%EA%B5%AC%EB%AF%B8&sd=20220101000000&ed=20220101235959&period=u&col=cafe&p=2", "dummy", "dummy", "dummy", "dummy")
+    # scrap_agent("https://search.daum.net/search?&p=3&q=%EA%B5%AC%EB%AF%B8+%EC%A7%80%EC%82%B0%EB%8F%99&col=cafe", "dummy", "dummy", "dummy1", "dummy")
     # scrap_agent("https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=%EA%B5%AC%EB%AF%B8&col=cafe&sort=recency&sd=20221111000000&ed=20221111235959&period=u&p=5",
     #             "구미",
     #             "2020",
     #             "2020")
-    # daum_cafe_scrapper("선산읍", start_date="2022-01-01", end_date="2022-01-02", interval=1, browser_thread_count=1)
+    daum_cafe_scrapper("선산읍", start_date="2023-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
     # daum_cafe_scrapper("고아읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
     # daum_cafe_scrapper("산동읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
     # daum_cafe_scrapper("도개면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
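
A minimal, self-contained sketch of the core workaround (the keyword mirrors the run enabled above; the encoded value is simply what urllib.parse.quote produces for the UTF-8 bytes of the Hangul, and the dates are illustrative):

import urllib.parse

keyword = "선산읍"
encoded = urllib.parse.quote(keyword)  # percent-encodes the UTF-8 bytes
print(encoded)                         # %EC%84%A0%EC%82%B0%EC%9D%8D

# Dropping the encoded keyword into the same URL shape that get_url builds:
url = (f"https://search.daum.net/search?"
       f"w=fusion&nil_search=btn&DA=PGD"
       f"&q={encoded}"
       f"&col=cafe&sort=recency"
       f"&sd=20230101000000&ed=20230131235959"
       f"&period=u&p=2")
print(url)

With the loop condition written as "ret is not None and current_page <= last_page", each date range stops either when scrap_agent returns None (an empty result page) or when current_page passes the last_page cap of 502.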