fix: 微博登录问题修复
feat: 微博二级评论
This commit is contained in:
parent
7229d29123
commit
1c2237a66f
@ -21,7 +21,7 @@
|
|||||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||||
|
|
||||||
|
|
||||||
## 使用方法
|
## 使用方法
|
||||||
@ -109,10 +109,12 @@
|
|||||||
|
|
||||||
|
|
||||||
## MediaCrawler爬虫项目交流群:
|
## MediaCrawler爬虫项目交流群:
|
||||||
> 如果二维码过期,可以直接添加我的微信号:yzglan(备注来自:github),拉你进项目交流群
|
> 扫描下方我的个人微信二维码添加好友,并备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)
|
||||||
|
>
|
||||||
|
> 如果图片展示不出来,可以直接添加我的微信号:yzglan
|
||||||
|
|
||||||
<div style="max-width: 200px">
|
<div style="max-width: 200px">
|
||||||
<p><img alt="图片展示不出来可以加作者微信: yzglan" src="static/images/12群二维码.JPG" style="width: 200px;height: 100%" ></p>
|
<p><img alt="relakkes_wechat" src="static/images/relakkes_weichat.JPG" style="width: 200px;height: 100%" ></p>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
|
|
||||||
|
@ -13,6 +13,7 @@ from urllib.parse import urlencode
|
|||||||
import httpx
|
import httpx
|
||||||
from playwright.async_api import BrowserContext, Page
|
from playwright.async_api import BrowserContext, Page
|
||||||
|
|
||||||
|
import config
|
||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
from .exception import DataFetchError
|
from .exception import DataFetchError
|
||||||
@ -129,13 +130,12 @@ class WeiboClient:
|
|||||||
|
|
||||||
return await self.get(uri, params, headers=headers)
|
return await self.get(uri, params, headers=headers)
|
||||||
|
|
||||||
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None, ):
|
callback: Optional[Callable] = None, ):
|
||||||
"""
|
"""
|
||||||
get note all comments include sub comments
|
get note all comments include sub comments
|
||||||
:param note_id:
|
:param note_id:
|
||||||
:param crawl_interval:
|
:param crawl_interval:
|
||||||
:param is_fetch_sub_comments:
|
|
||||||
:param callback:
|
:param callback:
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
@ -151,12 +151,37 @@ class WeiboClient:
|
|||||||
if callback: # 如果有回调函数,就执行回调函数
|
if callback: # 如果有回调函数,就执行回调函数
|
||||||
await callback(note_id, comment_list)
|
await callback(note_id, comment_list)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
if not is_fetch_sub_comments:
|
|
||||||
result.extend(comment_list)
|
result.extend(comment_list)
|
||||||
continue
|
sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
|
||||||
# todo handle get sub comments
|
result.extend(sub_comment_result)
|
||||||
return result
|
return result
|
||||||
|
|
||||||
|
@staticmethod
async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict],
                                        callback: Optional[Callable] = None) -> List[Dict]:
    """
    Collect the sub comments embedded in a batch of first-level comments.

    Args:
        note_id: ID of the note the comments belong to; passed through to ``callback``.
        comment_list: First-level comment dicts. Sub comments are read from each
            comment's ``"comments"`` key and are used only when that value is a
            non-empty list.
        callback: Optional coroutine function awaited as
            ``callback(note_id, sub_comments)`` once per comment that carries
            sub comments. Skipped when it is ``None``.

    Returns:
        A flat list of every sub comment found, in input order. An empty list
        when ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(
            "[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    res_sub_comments: List[Dict] = []
    for comment in comment_list:
        sub_comments = comment.get("comments")
        # Skip comments whose "comments" value is missing, empty, or not a list.
        if not (sub_comments and isinstance(sub_comments, list)):
            continue
        # callback defaults to None, so it must be checked before being awaited.
        if callback:
            await callback(note_id, sub_comments)
        res_sub_comments.extend(sub_comments)
    return res_sub_comments
|
||||||
|
|
||||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
||||||
"""
|
"""
|
||||||
根据帖子ID获取详情
|
根据帖子ID获取详情
|
||||||
|
@ -69,8 +69,6 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
context_page=self.context_page,
|
context_page=self.context_page,
|
||||||
cookie_str=config.COOKIES
|
cookie_str=config.COOKIES
|
||||||
)
|
)
|
||||||
await self.context_page.goto(self.index_url)
|
|
||||||
await asyncio.sleep(1)
|
|
||||||
await login_obj.begin()
|
await login_obj.begin()
|
||||||
|
|
||||||
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
||||||
@ -192,7 +190,7 @@ class WeiboCrawler(AbstractCrawler):
|
|||||||
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
|
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
|
||||||
await self.wb_client.get_note_all_comments(
|
await self.wb_client.get_note_all_comments(
|
||||||
note_id=note_id,
|
note_id=note_id,
|
||||||
crawl_interval=random.randint(1,10), # 微博对API的限流比较严重,所以延时提高一些
|
crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些
|
||||||
callback=weibo_store.batch_update_weibo_note_comments
|
callback=weibo_store.batch_update_weibo_note_comments
|
||||||
)
|
)
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
|
@ -30,6 +30,7 @@ class WeiboLogin(AbstractLogin):
|
|||||||
self.context_page = context_page
|
self.context_page = context_page
|
||||||
self.login_phone = login_phone
|
self.login_phone = login_phone
|
||||||
self.cookie_str = cookie_str
|
self.cookie_str = cookie_str
|
||||||
|
self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
|
||||||
|
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login weibo"""
|
"""Start login weibo"""
|
||||||
@ -54,45 +55,19 @@ class WeiboLogin(AbstractLogin):
|
|||||||
"""
|
"""
|
||||||
current_cookie = await self.browser_context.cookies()
|
current_cookie = await self.browser_context.cookies()
|
||||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||||
|
if cookie_dict.get("SSOLoginState"):
|
||||||
|
return True
|
||||||
current_web_session = cookie_dict.get("WBPSESS")
|
current_web_session = cookie_dict.get("WBPSESS")
|
||||||
if current_web_session != no_logged_in_session:
|
if current_web_session != no_logged_in_session:
|
||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
# NOTE(review): indentation is lost in this diff view, so it is unclear whether the scroll/click steps run inside the first `except` or after it — confirm against the real file.
async def popup_login_dialog(self):
|
|
||||||
"""If the login dialog box does not pop up automatically, we will manually click the login button"""
|
|
||||||
dialog_selector = "xpath=//div[@class='woo-modal-main']"
|
|
||||||
try:
|
|
||||||
# check dialog box is auto popup and wait for 4 seconds
|
|
||||||
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4)
|
|
||||||
except Exception as e:
|
|
||||||
utils.logger.error(
|
|
||||||
f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
|
|
||||||
utils.logger.info(
|
|
||||||
"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
|
|
||||||
|
|
||||||
# Scroll down with a wheel delta of 500 (the earlier comment said 1000 pixels, but the call below passes 500)
|
|
||||||
await self.context_page.mouse.wheel(0,500)
|
|
||||||
await asyncio.sleep(0.5)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# click login button
|
|
||||||
login_button_ele = self.context_page.locator(
|
|
||||||
"xpath=//a[text()='登录']",
|
|
||||||
)
|
|
||||||
await login_button_ele.click()
|
|
||||||
await asyncio.sleep(0.5)
|
|
||||||
except Exception as e:
|
|
||||||
utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear:{e}")
|
|
||||||
|
|
||||||
async def login_by_qrcode(self):
|
async def login_by_qrcode(self):
|
||||||
"""login weibo website and keep webdriver login state"""
|
"""login weibo website and keep webdriver login state"""
|
||||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
|
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
|
||||||
|
await self.context_page.goto(self.weibo_sso_login_url)
|
||||||
await self.popup_login_dialog()
|
|
||||||
|
|
||||||
# find login qrcode
|
# find login qrcode
|
||||||
qrcode_img_selector = "//div[@class='woo-modal-main']//img"
|
qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
|
||||||
base64_qrcode_img = await utils.find_login_qrcode(
|
base64_qrcode_img = await utils.find_login_qrcode(
|
||||||
self.context_page,
|
self.context_page,
|
||||||
selector=qrcode_img_selector
|
selector=qrcode_img_selector
|
||||||
|
@ -333,4 +333,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
|||||||
ALTER TABLE `bilibili_video_comment`
|
ALTER TABLE `bilibili_video_comment`
|
||||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
|
|
||||||
|
ALTER TABLE `weibo_note_comment`
|
||||||
|
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||||
|
|
||||||
SET FOREIGN_KEY_CHECKS = 1;
|
SET FOREIGN_KEY_CHECKS = 1;
|
||||||
|
@ -6,8 +6,6 @@
|
|||||||
import re
|
import re
|
||||||
from typing import List
|
from typing import List
|
||||||
|
|
||||||
import config
|
|
||||||
|
|
||||||
from .weibo_store_image import *
|
from .weibo_store_image import *
|
||||||
from .weibo_store_impl import *
|
from .weibo_store_impl import *
|
||||||
|
|
||||||
@ -81,6 +79,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
|
|||||||
"comment_like_count": str(comment_item.get("like_count", 0)),
|
"comment_like_count": str(comment_item.get("like_count", 0)),
|
||||||
"last_modify_ts": utils.get_current_timestamp(),
|
"last_modify_ts": utils.get_current_timestamp(),
|
||||||
"ip_location": comment_item.get("source", "").replace("来自", ""),
|
"ip_location": comment_item.get("source", "").replace("来自", ""),
|
||||||
|
"parent_comment_id": comment_item.get("rootid", ""),
|
||||||
|
|
||||||
# 用户信息
|
# 用户信息
|
||||||
"user_id": str(user_info.get("id")),
|
"user_id": str(user_info.get("id")),
|
||||||
|
@ -93,6 +93,7 @@ class WeiboCsvStoreImplement(AbstractStore):
|
|||||||
|
|
||||||
|
|
||||||
class WeiboDbStoreImplement(AbstractStore):
|
class WeiboDbStoreImplement(AbstractStore):
|
||||||
|
|
||||||
async def store_content(self, content_item: Dict):
|
async def store_content(self, content_item: Dict):
|
||||||
"""
|
"""
|
||||||
Weibo content DB storage implementation
|
Weibo content DB storage implementation
|
||||||
@ -134,6 +135,9 @@ class WeiboDbStoreImplement(AbstractStore):
|
|||||||
else:
|
else:
|
||||||
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
|
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
|
||||||
|
|
||||||
|
async def store_creator(self, creator: Dict):
|
||||||
|
pass  # No-op stub: the creator dict is accepted and ignored.
|
||||||
|
|
||||||
|
|
||||||
class WeiboJsonStoreImplement(AbstractStore):
|
class WeiboJsonStoreImplement(AbstractStore):
|
||||||
json_store_path: str = "data/weibo/json"
|
json_store_path: str = "data/weibo/json"
|
||||||
@ -142,7 +146,6 @@ class WeiboJsonStoreImplement(AbstractStore):
|
|||||||
file_count: int = calculate_number_of_files(json_store_path)
|
file_count: int = calculate_number_of_files(json_store_path)
|
||||||
WordCloud = words.AsyncWordCloudGenerator()
|
WordCloud = words.AsyncWordCloudGenerator()
|
||||||
|
|
||||||
|
|
||||||
def make_save_file_name(self, store_type: str) -> (str, str):
|
def make_save_file_name(self, store_type: str) -> (str, str):
|
||||||
"""
|
"""
|
||||||
make save file name by store type
|
make save file name by store type
|
||||||
@ -209,3 +212,6 @@ class WeiboJsonStoreImplement(AbstractStore):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
await self.save_data_to_json(comment_item, "comments")
|
await self.save_data_to_json(comment_item, "comments")
|
||||||
|
|
||||||
|
async def store_creator(self, creator: Dict):
|
||||||
|
pass  # No-op stub: the creator dict is accepted and ignored.
|
||||||
|
Loading…
Reference in New Issue
Block a user