
--- daum_cafe_scrapper.py
+++ daum_cafe_scrapper.py
... | ... | @@ -4,6 +4,7 @@ |
4 | 4 |
import pandas as pd |
5 | 5 |
import os |
6 | 6 |
import bs4 |
7 |
+import urllib.parse |
|
7 | 8 |
import multiprocessing |
8 | 9 |
from datetime import datetime, timedelta |
9 | 10 |
from selenium import webdriver |
... | ... | @@ -17,7 +18,7 @@ |
17 | 18 |
def get_url(keyword, start_date, end_date, page_num): |
18 | 19 |
url = (f"https://search.daum.net/search?" |
19 | 20 |
f"w=fusion&nil_search=btn&DA=PGD" |
20 |
- f"&q={keyword}" |
|
21 |
+ f"&q={urllib.parse.quote(keyword)}" |
|
21 | 22 |
f"&col=cafe&sort=recency" |
22 | 23 |
f"&sd={start_date.replace('-','')}000000&ed={end_date.replace('-','')}235959" |
23 | 24 |
f"&period=u&p={page_num}") |
... | ... | @@ -37,6 +38,7 @@ |
37 | 38 |
|
38 | 39 |
|
39 | 40 |
def scrap_agent(url, keyword, start_date, end_date, page_num): |
41 |
+ print(f"working on {start_date} to {end_date}") |
|
40 | 42 |
# Chrome WebDriver 설치 경로 가져오기 |
41 | 43 |
chrome_path = ChromeDriverManager().install() |
42 | 44 |
# WebDriver 설정 |
... | ... | @@ -55,15 +57,7 @@ |
55 | 57 |
|
56 | 58 |
# infinity_scroll_to_the_bottom(driver) |
57 | 59 |
|
58 |
- try : |
|
59 |
- single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated") |
|
60 |
- except : |
|
61 |
- final_page = regex.search(r'<div id=\"noResult\"', driver.page_source) |
|
62 |
- try : |
|
63 |
- print("No more search_result!") |
|
64 |
- return None |
|
65 |
- except : |
|
66 |
- raise "Error! the page has not loaded properly, you must look for connection!" |
|
60 |
+ single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated") |
|
67 | 61 |
|
68 | 62 |
search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML') |
69 | 63 |
formatter = bs4.formatter.HTMLFormatter(indent=1) |
... | ... | @@ -105,6 +99,8 @@ |
105 | 99 |
delete_cache(driver) |
106 | 100 |
driver.close() |
107 | 101 |
|
102 |
+ if not ret["url"]: |
|
103 |
+ return None |
|
108 | 104 |
|
109 | 105 |
out_df = pd.DataFrame.from_dict(ret) |
110 | 106 |
base_dir = "daum_cafe" |
... | ... | @@ -161,25 +157,26 @@ |
161 | 157 |
|
162 | 158 |
|
163 | 159 |
def daum_cafe_scrapper(keyword, start_date, end_date, interval, browser_thread_count=1): |
164 |
- |
|
160 |
+ # daum starts the page_num from 2 |
|
165 | 161 |
last_page = 502 |
166 |
- current_page = 2 # daum starts the page_num from 2 |
|
167 | 162 |
date_ranges = generate_date_range(start_date, end_date, interval) |
168 | 163 |
print("!!") |
169 | 164 |
for date_range in date_ranges: |
170 | 165 |
ret = 0 |
171 |
- while ret is not None: |
|
166 |
+ current_page = 2 |
|
167 |
+ while ret is not None and current_page <= last_page: |
|
172 | 168 |
url = get_url(keyword, date_range[0], date_range[1], current_page) |
173 |
- ret = scrap_agent(url, date_range[0], date_range[1], interval, current_page) |
|
169 |
+ ret = scrap_agent(url, keyword, date_range[0], date_range[1], current_page) |
|
170 |
+ print(ret) |
|
174 | 171 |
current_page += 1 |
175 | 172 |
|
176 | 173 |
if __name__ == "__main__": |
177 |
- scrap_agent("https://search.daum.net/search?nil_suggest=btn&w=fusion&DA=SBC&q=%EA%B5%AC%EB%AF%B8&sd=20220101000000&ed=20220101235959&period=u&col=cafe&p=2", "dummy", "dummy", "dummy", "dummy") |
|
174 |
+ # scrap_agent("https://search.daum.net/search?&p=3&q=%EA%B5%AC%EB%AF%B8+%EC%A7%80%EC%82%B0%EB%8F%99&col=cafe", "dummy", "dummy", "dummy1", "dummy") |
|
178 | 175 |
# scrap_agent("https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=%EA%B5%AC%EB%AF%B8&col=cafe&sort=recency&sd=20221111000000&ed=20221111235959&period=u&p=5", |
179 | 176 |
# "구미", |
180 | 177 |
# "2020", |
181 | 178 |
# "2020") |
182 |
- # daum_cafe_scrapper("선산읍", start_date="2022-01-01", end_date="2022-01-02", interval=1, browser_thread_count=1) |
|
179 |
+ daum_cafe_scrapper("선산읍", start_date="2023-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1) |
|
183 | 180 |
# daum_cafe_scrapper("고아읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1) |
184 | 181 |
# daum_cafe_scrapper("산동읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1) |
185 | 182 |
# daum_cafe_scrapper("도개면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1) |
Add a comment
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?