
+++ daum_cafe_scrapper.py
@@ -0,0 +1,210 @@
+import re
+import regex
+import time
+import pandas as pd
+import os
+import bs4
+import multiprocessing
+from datetime import datetime, timedelta
+from selenium import webdriver
+from selenium.webdriver.common.by import By
+from webdriver_manager.chrome import ChromeDriverManager
+from selenium.webdriver.chrome.service import Service as ChromeService
+from joblib import Parallel, delayed
+from utils.cache_clear import delete_cache
+
+
+def get_url(keyword, start_date, end_date, page_num):
+    # Build the Daum integrated-search URL for cafe posts, newest first,
+    # restricted to the given date range (YYYY-MM-DD) and result page number.
+    url = (f"https://search.daum.net/search?"
+           f"w=fusion&nil_search=btn&DA=PGD"
+           f"&q={keyword}"
+           f"&col=cafe&sort=recency"
+           f"&sd={start_date.replace('-', '')}000000&ed={end_date.replace('-', '')}235959"
+           f"&period=u&p={page_num}")
+    return url
+
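+# Illustrative example, not part of the original code: with the query string built above,
+# get_url("구미", "2022-01-01", "2022-01-01", 2) evaluates to
+# "https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=구미&col=cafe&sort=recency&sd=20220101000000&ed=20220101235959&period=u&p=2".
+# The keyword is interpolated without URL-encoding, which Chrome normally handles when the page is loaded.
+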
+def remove_tags(html):
+    # Parse the HTML fragment
+    soup = bs4.BeautifulSoup(html, "html.parser")
+
+    # Drop <style> and <script> blocks entirely
+    for data in soup(['style', 'script']):
+        data.decompose()
+
+    # Return the remaining visible text, joined with single spaces
+    return ' '.join(soup.stripped_strings)
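+
+# Illustrative example, not part of the original code: for a fragment such as
+# '<p>Hello <b>world</b><script>track()</script></p>', remove_tags returns "Hello world";
+# script/style bodies are decomposed and the remaining strings are stripped and space-joined.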
+
+
+def scrap_agent(url, keyword, start_date, end_date, page_num):
+    # Get the installed Chrome WebDriver path
+    chrome_path = ChromeDriverManager().install()
+    # Configure the WebDriver
+    options = webdriver.ChromeOptions()
+    options.add_experimental_option("excludeSwitches", ["enable-automation"])
+    options.add_experimental_option("useAutomationExtension", False)
+    # options.add_argument('headless')
+    options.add_argument('window-size=1920x1080')
+    options.add_argument("disable-gpu")
+    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 6.1; WOW64; Trident/7.0; rv:11.0) like Gecko")
+
+    # Initialise the WebDriver
+    driver = webdriver.Chrome(service=ChromeService(chrome_path), options=options)
+    driver.get(url)
+    time.sleep(2)
+
+    # infinity_scroll_to_the_bottom(driver)
+
+    # find_elements() returns an empty list (it does not raise) when no result container
+    # is present, so check the list instead of relying on an exception.
+    single_page_search_result = driver.find_elements(By.CSS_SELECTOR, "c-container.hydrated")
+    if not single_page_search_result:
+        final_page = regex.search(r'<div id="noResult"', driver.page_source)
+        if final_page:
+            print("No more search_result!")
+            driver.quit()  # close the browser before bailing out
+            return None
+        raise RuntimeError("Error! the page has not loaded properly, you must look for connection!")
+
+    search_result_plain_html = single_page_search_result[0].get_attribute('innerHTML')
+    formatter = bs4.formatter.HTMLFormatter(indent=1)
+    search_result_plain_html_beautified = bs4.BeautifulSoup(search_result_plain_html, 'html.parser').prettify(formatter=formatter)
+
+    search_result_contents = regex.findall(r'<c-card class="_cubic hydrated">(.*?)\n</c-card>', search_result_plain_html_beautified, re.DOTALL)
+
+    ret = {
+        "url": [],
+        "title": [],
+        "cafe_name": [],
+        "post_date": [],
+        "text": [],
+        "topic": []
+    }
+    for content in search_result_contents:
+        content = content.replace("\n", "")
+        # The post link and title live in the data-link / data-title attributes of the card's menu layer
+        title_and_url_extract = regex.search(r'<div class="fmenu-layer layer_item" (.*?)>', content)
+        href = regex.search(r'data-link="(.*?)"', title_and_url_extract.group(1)).group(1)
+        title = regex.search(r'data-title="(.*?)"', title_and_url_extract.group(1)).group(1)
+        text = regex.search(r'<p class="conts-desc clamp-g">(.*?)</p>', content)
+        cafe_name = regex.search(r'<c-frag class="_cubic hydrated" slot="_slt1">(.*?)</c-frag>', content)
+        post_date = regex.search(r'<c-footer-desc class="_cubic hydrated" slot="info">(.*?)</c-footer-desc>', content)
+        text = remove_tags(text.group(1))
+        cafe_name = remove_tags(cafe_name.group(1))
+        post_date = remove_tags(post_date.group(1))
+
+        topic_extract = regex.search(r'<c-header-item class="_cubic hydrated"(.*?)>', content).group(1)
+        topic = regex.search(r'"topic":\{(.*?)\}', topic_extract).group(1)
+        topic = regex.findall(r'"(.*?)"', topic)
+        topic = '&'.join(topic)
+
+        ret["url"].append(href)
+        ret["title"].append(title)
+        ret["cafe_name"].append(cafe_name)
+        ret["post_date"].append(post_date)
+        ret["text"].append(text)
+        ret["topic"].append(topic)
+    delete_cache(driver)
+    driver.close()
+
+    out_df = pd.DataFrame.from_dict(ret)
+    base_dir = "daum_cafe"
+    if not os.path.exists(f"{base_dir}/{keyword}"):
+        os.makedirs(f"{base_dir}/{keyword}")  # makedirs also creates base_dir itself if it is missing
+    save_file_name = f"{start_date}-{end_date}-{page_num}.csv"
+    out_df.to_csv(f"{base_dir}/{keyword}/{save_file_name}", index=False)
+    print(f"saved {base_dir}/{keyword}/{save_file_name}")
+    return ret
+
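+# Illustrative note, not part of the original code: a successful scrap_agent call returns a dict of
+# parallel lists, e.g. {"url": [...], "title": [...], "cafe_name": [...], "post_date": [...],
+# "text": [...], "topic": [...]}, and writes the same rows to
+# "daum_cafe/<keyword>/<start_date>-<end_date>-<page_num>.csv"; it returns None once the
+# "noResult" page is reached, which is what the paging loop in daum_cafe_scrapper relies on.
+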
+def merge_dicts(dict_list):
+    # Initialize the result dictionary with empty lists,
+    # using the same keys that scrap_agent produces
+    result = {
+        "url": [],
+        "title": [],
+        "cafe_name": [],
+        "post_date": [],
+        "text": [],
+        "topic": []
+    }
+
+    # Iterate through each dictionary and merge the lists
+    for d in dict_list:
+        for key in result.keys():
+            result[key].extend(d.get(key, []))
+
+    return result
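+
+# Illustrative example, not part of the original code: merge_dicts is presumably meant to combine
+# per-page result dicts, e.g. merge_dicts([{"url": ["a"], "title": ["t1"]}, {"url": ["b"], "title": ["t2"]}])
+# yields {"url": ["a", "b"], "title": ["t1", "t2"], ...} with the remaining keys left as empty lists.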
+
+
+def generate_date_range(start_date, end_date, interval):
+    """
+    :param start_date: start date string formatted as %Y-%m-%d
+    :param end_date: end date string formatted as %Y-%m-%d
+    :param interval: interval length in DAYS
+    :return: a list of [start, end] date-string pairs that will be fed into daum_cafe_scrapper.
+             For example, generate_date_range("2023-10-01", "2023-10-14", 7) returns
+             [['2023-10-01', '2023-10-07'], ['2023-10-08', '2023-10-14']].
+             The function also handles ranges that are not perfectly divisible by the interval:
+             generate_date_range("2023-10-01", "2023-10-14", 10) returns
+             [['2023-10-01', '2023-10-10'], ['2023-10-11', '2023-10-14']].
+    """
+
+    # Convert the start and end date strings to datetime objects
+    start = datetime.strptime(start_date, "%Y-%m-%d")
+    end = datetime.strptime(end_date, "%Y-%m-%d")
+
+    # Generate a range of dates from start to end
+    date_ranges = []
+    current_date = start
+    while current_date < end:
+        current_end = min(current_date + timedelta(days=interval - 1), end)
+        date_ranges.append([current_date.strftime("%Y-%m-%d"), current_end.strftime("%Y-%m-%d")])
+        current_date = current_end + timedelta(days=1)
+
+    return date_ranges
+
+
+def daum_cafe_scrapper(keyword, start_date, end_date, interval, browser_thread_count=1):
+
+    last_page = 502  # upper bound on result pages requested per date range
+    date_ranges = generate_date_range(start_date, end_date, interval)
+    for date_range in date_ranges:
+        current_page = 2  # daum starts the page_num from 2, and paging restarts for every date range
+        ret = 0
+        # scrap_agent returns None once the "noResult" page is reached, which ends this date range
+        while ret is not None and current_page <= last_page:
+            url = get_url(keyword, date_range[0], date_range[1], current_page)
+            ret = scrap_agent(url, keyword, date_range[0], date_range[1], current_page)
+            current_page += 1
+
+
+if __name__ == "__main__":
+    scrap_agent("https://search.daum.net/search?nil_suggest=btn&w=fusion&DA=SBC&q=%EA%B5%AC%EB%AF%B8&sd=20220101000000&ed=20220101235959&period=u&col=cafe&p=2", "dummy", "dummy", "dummy", "dummy")
+    # scrap_agent("https://search.daum.net/search?w=fusion&nil_search=btn&DA=PGD&q=%EA%B5%AC%EB%AF%B8&col=cafe&sort=recency&sd=20221111000000&ed=20221111235959&period=u&p=5",
+    #             "구미",
+    #             "2020",
+    #             "2020")
+    # daum_cafe_scrapper("선산읍", start_date="2022-01-01", end_date="2022-01-02", interval=1, browser_thread_count=1)
+    # daum_cafe_scrapper("고아읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("산동읍", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("도개면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("장천면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("무을면", start_date="2023-10-23", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("해평면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("옥성면", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("구미 송정동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("원평동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("구미 지산동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("도량동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("선주원남동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=2)
+    # daum_cafe_scrapper("신평1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("신평2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("형곡1동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("형곡2동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("구미 비산동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("공단동", start_date="2022-01-01", end_date="2023-10-31", interval=10, browser_thread_count=1)
+    # daum_cafe_scrapper("광평동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("상모사곡동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("임오동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("인동동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("진미동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    # daum_cafe_scrapper("양표동", start_date="2022-01-01", end_date="2023-10-31", interval=30, browser_thread_count=1)
+    #
+    # daum_cafe_scrapper("구미", start_date="2022-01-01", end_date="2023-10-31", interval=1, browser_thread_count=1)
+    # daum_cafe_scrapper("구미시장", start_date="2022-01-01", end_date="2023-10-31", interval=7, browser_thread_count=1)
+
+++ utils/infinity_scroll.py
@@ -0,0 +1,20 @@
+import random
+from time import sleep
+
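+# Usage note, not part of the original code: this helper expects an already-initialised Selenium
+# driver, e.g. infinity_scroll_to_the_bottom(driver) right after driver.get(url); the corresponding
+# call in daum_cafe_scrapper.py's scrap_agent is currently commented out.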
+def infinity_scroll_to_the_bottom(driver, scroll_multiplier=1, MAXITER=2000):
+    # Automatically scroll the page
+    scroll_pause_time = 0.1 + random.random() * 2  # Pause between each scroll
+    screen_height = driver.execute_script("return window.screen.height;")  # Browser window height
+    i = 1
+    while i < MAXITER:
+        # Scroll down
+        driver.execute_script(f"window.scrollTo(0, {screen_height * i * scroll_multiplier + random.randint(1, 500)});")
+        i += 1
+        sleep(scroll_pause_time)
+
+        # Check whether we have reached the end of the page
+        scroll_height = driver.execute_script("return document.body.scrollHeight;")
+        # print(scroll_height)
+        if screen_height * i > scroll_height:
+            break
+    return driver
\ No newline at end of file