diff --git a/media_platform/weibo/client.py b/media_platform/weibo/client.py index fd2a783..e3c975c 100644 --- a/media_platform/weibo/client.py +++ b/media_platform/weibo/client.py @@ -7,7 +7,7 @@ import asyncio import copy import json import re -from typing import Any, Callable, Dict, List, Optional, Tuple, Union +from typing import Any, Callable, Dict, List, Optional from urllib.parse import urlencode import httpx @@ -73,6 +73,8 @@ class WeiboClient: resp_data: Dict = await self.request(method="GET", url=f"{self._host}{uri}", headers=self.headers) if resp_data.get("login"): ping_flag = True + else: + utils.logger.error(f"[WeiboClient.pong] cookie may be invalid and again login...") except Exception as e: utils.logger.error(f"[WeiboClient.pong] Pong weibo failed: {e}, and try to login again...") ping_flag = False diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index ff30e93..0ebd89a 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -37,7 +37,9 @@ class WeiboCrawler(AbstractCrawler): def __init__(self): self.index_url = "https://www.weibo.com" + self.mobile_index_url = "https://m.weibo.cn" self.user_agent = utils.get_user_agent() + self.mobile_user_agent = utils.get_mobile_user_agent() def init_config(self, platform: str, login_type: str, crawler_type: str): self.platform = platform @@ -57,13 +59,13 @@ class WeiboCrawler(AbstractCrawler): self.browser_context = await self.launch_browser( chromium, None, - self.user_agent, + self.mobile_user_agent, headless=config.HEADLESS ) # stealth.min.js is a js script to prevent the website from detecting the crawler. await self.browser_context.add_init_script(path="libs/stealth.min.js") self.context_page = await self.browser_context.new_page() - await self.context_page.goto(self.index_url) + await self.context_page.goto(self.mobile_index_url) # Create a client to interact with the xiaohongshu website. self.wb_client = await self.create_weibo_client(httpx_proxy_format) @@ -75,7 +77,14 @@ class WeiboCrawler(AbstractCrawler): context_page=self.context_page, cookie_str=config.COOKIES ) + await self.context_page.goto(self.index_url) + await asyncio.sleep(1) await login_obj.begin() + + # 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie + utils.logger.info("[WeiboCrawler.start] redirect weibo mobile homepage and update cookies on mobile platform") + await self.context_page.goto(self.mobile_index_url) + await asyncio.sleep(2) await self.wb_client.update_cookies(browser_context=self.browser_context) crawler_type_var.set(self.crawler_type) @@ -183,7 +192,7 @@ class WeiboCrawler(AbstractCrawler): # Download comments all_comments = await self.wb_client.get_note_all_comments( note_id=note_id, - crawl_interval=random.random(), + crawl_interval=random.randint(1,10), # 微博对API的限流比较严重,所以延时提高一些 ) # Filter comments by keyword diff --git a/media_platform/weibo/login.py b/media_platform/weibo/login.py index 82ccf8f..dd0d737 100644 --- a/media_platform/weibo/login.py +++ b/media_platform/weibo/login.py @@ -43,6 +43,7 @@ class WeiboLogin(AbstractLogin): raise ValueError( "[WeiboLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookie ...") + @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) async def check_login_state(self, no_logged_in_session: str) -> bool: """ @@ -61,8 +62,8 @@ class WeiboLogin(AbstractLogin): """If the login dialog box does not pop up automatically, we will manually click the login button""" dialog_selector = "xpath=//div[@class='woo-modal-main']" try: - # check dialog box is auto popup and wait for 10 seconds - await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10) + # check dialog box is auto popup and wait for 4 seconds + await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4) except Exception as e: utils.logger.error( f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}") @@ -71,12 +72,12 @@ class WeiboLogin(AbstractLogin): # 向下滚动1000像素 await self.context_page.mouse.wheel(0,500) - await asyncio.sleep(2) + await asyncio.sleep(0.5) try: # click login button login_button_ele = self.context_page.locator( - "xpath=//a[text()='登录']" + "xpath=//a[text()='登录']", ) await login_button_ele.click() await asyncio.sleep(0.5)