HEADER = {"User-Agent": "Mozilla/119.0 (Windows NT 10.0; Win64; x64) Chrome/98.0.4758.102"}
def remove_html(strr):
    """Strip HTML tags from ``strr`` and normalize its whitespace.

    Tags are removed without inserting a separator, so markup placed inside
    a word (``a<b>b</b>`` -> ``ab``) does not split it. Every run of
    whitespace is then collapsed to a single space and the result is trimmed.

    Args:
        strr: HTML fragment to clean.

    Returns:
        The plain text, free of tags and of leading, trailing or repeated
        whitespace.
    """
    # Newlines go first: "." does not match "\n", so a tag broken across
    # lines would otherwise survive the substitution.
    without_tags = regex.sub(r'<.*?>', '', strr.replace("\n", ' '))
    # Trim only after the tags are gone; trimming beforehand leaves the
    # padding that sat between a tag and its text (e.g. "<b> hi </b>").
    return ' '.join(without_tags.split())

# Function to remove tags
def remove_tags(html):
# parse html content
href = regex.search(
r"https:\/\/blog\.naver\.com\/[\uAC00-\uD7AFa-zA-Z0-9_-]+\/[\uAC00-\uD7AFa-zA-Z0-9_-]+",
content, re.DOTALL)
- title = regex.search(r'<strong class="title_post">(.*?)<\/strong>', content)
+ # The title is matched slightly differently because the search engine highlights the search keywords
+ title = regex.search(r'<strong class="title_post">(.*?)<\/span>', content)
text = regex.search(r'<!-- ngIf: post.contents -->(.*?)<\/a>', content)
author_name = regex.search(r'<em class="name_author">(.*?)<\/em>', content)
post_date = regex.search(r'<span class="date">(.*?)<\/span>', content)
href = href.group(0)
- title = remove_tags(title.group(1))
+ title = remove_html(title.group(1))
text = remove_tags(text.group(1))
author_name = remove_tags(author_name.group(1))
post_date = remove_tags((post_date.group(1)))
if __name__ == "__main__":
- naver_blog_scrapper("구미 송정동", "2022-01-01", "2023-10-31", 7, 50, 1, 12)
+ # TODO: start_page_num does not seem to work as intended
+ naver_blog_scrapper("도개면", "2022-01-01", "2023-10-31", 100, 50, 1, 12)
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?
Delete comment
Once you delete this comment, you won't be able to recover it. Are you sure you want to delete this comment?