fix: 微博登录问题修复
feat: 微博二级评论
This commit is contained in:
parent
7229d29123
commit
1c2237a66f
@ -21,7 +21,7 @@
|
||||
| 抖音 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 快手 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| B 站 | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
|
||||
| 微博 | ✅ | ✅ | ❌ | ❌ | ✅ | ✅ | ✅ |
|
||||
| 微博 | ✅ | ✅ | ✅ | ❌ | ✅ | ✅ | ✅ |
|
||||
|
||||
|
||||
## 使用方法
|
||||
@ -109,10 +109,12 @@
|
||||
|
||||
|
||||
## MediaCrawler爬虫项目交流群:
|
||||
> 如果二维码过期,可以直接添加我的微信号:yzglan(备注来自:github),拉你进项目交流群
|
||||
> 扫描下方我的个人微信,备注:github,拉你进MediaCrawler项目交流群(请一定备注:github,会有wx小助手自动拉群)
|
||||
>
|
||||
> 如果图片展示不出来,可以直接添加我的微信号:yzglan
|
||||
|
||||
<div style="max-width: 200px">
|
||||
<p><img alt="图片展示不出来可以加作者微信: yzglan" src="static/images/12群二维码.JPG" style="width: 200px;height: 100%" ></p>
|
||||
<p><img alt="relakkes_wechat" src="static/images/relakkes_weichat.JPG" style="width: 200px;height: 100%" ></p>
|
||||
</div>
|
||||
|
||||
|
||||
|
@ -13,6 +13,7 @@ from urllib.parse import urlencode
|
||||
import httpx
|
||||
from playwright.async_api import BrowserContext, Page
|
||||
|
||||
import config
|
||||
from tools import utils
|
||||
|
||||
from .exception import DataFetchError
|
||||
@ -129,13 +130,12 @@ class WeiboClient:
|
||||
|
||||
return await self.get(uri, params, headers=headers)
|
||||
|
||||
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, is_fetch_sub_comments=False,
|
||||
async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0,
|
||||
callback: Optional[Callable] = None, ):
|
||||
"""
|
||||
get note all comments include sub comments
|
||||
:param note_id:
|
||||
:param crawl_interval:
|
||||
:param is_fetch_sub_comments:
|
||||
:param callback:
|
||||
:return:
|
||||
"""
|
||||
@ -151,12 +151,37 @@ class WeiboClient:
|
||||
if callback: # 如果有回调函数,就执行回调函数
|
||||
await callback(note_id, comment_list)
|
||||
await asyncio.sleep(crawl_interval)
|
||||
if not is_fetch_sub_comments:
|
||||
result.extend(comment_list)
|
||||
continue
|
||||
# todo handle get sub comments
|
||||
sub_comment_result = await self.get_comments_all_sub_comments(note_id, comment_list, callback)
|
||||
result.extend(sub_comment_result)
|
||||
return result
|
||||
|
||||
@staticmethod
async def get_comments_all_sub_comments(note_id: str, comment_list: List[Dict],
                                        callback: Optional[Callable] = None) -> List[Dict]:
    """
    Collect the sub-comments embedded in each comment of ``comment_list``.

    Args:
        note_id: ID of the note the comments belong to; forwarded to ``callback``.
        comment_list: Comment dicts. Sub-comments are read from each item's
            ``"comments"`` key and used only when that value is a non-empty list.
        callback: Optional awaitable callable, invoked as
            ``callback(note_id, sub_comments)`` for every batch of sub-comments found.

    Returns:
        A flat list of every sub-comment found, or an empty list when
        ``config.ENABLE_GET_SUB_COMMENTS`` is falsy.
    """
    if not config.ENABLE_GET_SUB_COMMENTS:
        utils.logger.info(
            "[WeiboClient.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
        return []

    res_sub_comments = []
    for comment in comment_list:
        sub_comments = comment.get("comments")
        if sub_comments and isinstance(sub_comments, list):
            # callback defaults to None, so it must be checked before being awaited
            if callback:
                await callback(note_id, sub_comments)
            res_sub_comments.extend(sub_comments)
    return res_sub_comments
|
||||
|
||||
async def get_note_info_by_id(self, note_id: str) -> Dict:
|
||||
"""
|
||||
根据帖子ID获取详情
|
||||
@ -189,7 +214,7 @@ class WeiboClient:
|
||||
image_url = ""
|
||||
for i in range(len(sub_url)):
|
||||
if i == 1:
|
||||
image_url += "large/" #都获取高清大图
|
||||
image_url += "large/" # 都获取高清大图
|
||||
elif i == len(sub_url) - 1:
|
||||
image_url += sub_url[i]
|
||||
else:
|
||||
|
@ -69,8 +69,6 @@ class WeiboCrawler(AbstractCrawler):
|
||||
context_page=self.context_page,
|
||||
cookie_str=config.COOKIES
|
||||
)
|
||||
await self.context_page.goto(self.index_url)
|
||||
await asyncio.sleep(1)
|
||||
await login_obj.begin()
|
||||
|
||||
# 登录成功后重定向到手机端的网站,再更新手机端登录成功的cookie
|
||||
@ -192,7 +190,7 @@ class WeiboCrawler(AbstractCrawler):
|
||||
utils.logger.info(f"[WeiboCrawler.get_note_comments] begin get note_id: {note_id} comments ...")
|
||||
await self.wb_client.get_note_all_comments(
|
||||
note_id=note_id,
|
||||
crawl_interval=random.randint(1,10), # 微博对API的限流比较严重,所以延时提高一些
|
||||
crawl_interval=random.randint(1,3), # 微博对API的限流比较严重,所以延时提高一些
|
||||
callback=weibo_store.batch_update_weibo_note_comments
|
||||
)
|
||||
except DataFetchError as ex:
|
||||
|
@ -30,6 +30,7 @@ class WeiboLogin(AbstractLogin):
|
||||
self.context_page = context_page
|
||||
self.login_phone = login_phone
|
||||
self.cookie_str = cookie_str
|
||||
self.weibo_sso_login_url = "https://passport.weibo.com/sso/signin?entry=miniblog&source=miniblog"
|
||||
|
||||
async def begin(self):
|
||||
"""Start login weibo"""
|
||||
@ -54,45 +55,19 @@ class WeiboLogin(AbstractLogin):
|
||||
"""
|
||||
current_cookie = await self.browser_context.cookies()
|
||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||
if cookie_dict.get("SSOLoginState"):
|
||||
return True
|
||||
current_web_session = cookie_dict.get("WBPSESS")
|
||||
if current_web_session != no_logged_in_session:
|
||||
return True
|
||||
return False
|
||||
|
||||
async def popup_login_dialog(self):
|
||||
"""If the login dialog box does not pop up automatically, we will manually click the login button"""
|
||||
dialog_selector = "xpath=//div[@class='woo-modal-main']"
|
||||
try:
|
||||
# wait up to 4 seconds (timeout is given in milliseconds) for the dialog to appear on its own
|
||||
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 4)
|
||||
except Exception as e:
|
||||
utils.logger.error(
|
||||
f"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, error: {e}")
|
||||
utils.logger.info(
|
||||
"[WeiboLogin.popup_login_dialog] login dialog box does not pop up automatically, we will manually click the login button")
|
||||
|
||||
# scroll down by 500 pixels (the wheel delta passed below)
|
||||
await self.context_page.mouse.wheel(0,500)
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
try:
|
||||
# click the login link (the <a> element whose text is '登录')
|
||||
login_button_ele = self.context_page.locator(
|
||||
"xpath=//a[text()='登录']",
|
||||
)
|
||||
await login_button_ele.click()
|
||||
await asyncio.sleep(0.5)
|
||||
except Exception as e:
|
||||
utils.logger.info(f"[WeiboLogin.popup_login_dialog] manually click the login button faield maybe login dialog Appear:{e}")
|
||||
|
||||
async def login_by_qrcode(self):
|
||||
"""login weibo website and keep webdriver login state"""
|
||||
utils.logger.info("[WeiboLogin.login_by_qrcode] Begin login weibo by qrcode ...")
|
||||
|
||||
await self.popup_login_dialog()
|
||||
|
||||
await self.context_page.goto(self.weibo_sso_login_url)
|
||||
# find login qrcode
|
||||
qrcode_img_selector = "//div[@class='woo-modal-main']//img"
|
||||
qrcode_img_selector = "xpath=//img[@class='w-full h-full']"
|
||||
base64_qrcode_img = await utils.find_login_qrcode(
|
||||
self.context_page,
|
||||
selector=qrcode_img_selector
|
||||
|
@ -333,4 +333,7 @@ ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||
ALTER TABLE `bilibili_video_comment`
|
||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||
|
||||
ALTER TABLE `weibo_note_comment`
|
||||
ADD COLUMN `parent_comment_id` VARCHAR(64) DEFAULT NULL COMMENT '父评论ID';
|
||||
|
||||
SET FOREIGN_KEY_CHECKS = 1;
|
||||
|
@ -6,8 +6,6 @@
|
||||
import re
|
||||
from typing import List
|
||||
|
||||
import config
|
||||
|
||||
from .weibo_store_image import *
|
||||
from .weibo_store_impl import *
|
||||
|
||||
@ -81,6 +79,7 @@ async def update_weibo_note_comment(note_id: str, comment_item: Dict):
|
||||
"comment_like_count": str(comment_item.get("like_count", 0)),
|
||||
"last_modify_ts": utils.get_current_timestamp(),
|
||||
"ip_location": comment_item.get("source", "").replace("来自", ""),
|
||||
"parent_comment_id": comment_item.get("rootid", ""),
|
||||
|
||||
# 用户信息
|
||||
"user_id": str(user_info.get("id")),
|
||||
|
@ -27,7 +27,7 @@ def calculate_number_of_files(file_store_path: str) -> int:
|
||||
if not os.path.exists(file_store_path):
|
||||
return 1
|
||||
try:
|
||||
return max([int(file_name.split("_")[0])for file_name in os.listdir(file_store_path)])+1
|
||||
return max([int(file_name.split("_")[0]) for file_name in os.listdir(file_store_path)]) + 1
|
||||
except ValueError:
|
||||
return 1
|
||||
|
||||
@ -37,7 +37,7 @@ class WeiboCsvStoreImplement(AbstractStore):
|
||||
pass
|
||||
|
||||
csv_store_path: str = "data/weibo"
|
||||
file_count:int=calculate_number_of_files(csv_store_path)
|
||||
file_count: int = calculate_number_of_files(csv_store_path)
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> str:
|
||||
"""
|
||||
@ -93,6 +93,7 @@ class WeiboCsvStoreImplement(AbstractStore):
|
||||
|
||||
|
||||
class WeiboDbStoreImplement(AbstractStore):
|
||||
|
||||
async def store_content(self, content_item: Dict):
|
||||
"""
|
||||
Weibo content DB storage implementation
|
||||
@ -134,16 +135,18 @@ class WeiboDbStoreImplement(AbstractStore):
|
||||
else:
|
||||
await update_comment_by_comment_id(comment_id, comment_item=comment_item)
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
# No-op: this store does not persist creator records.
pass
|
||||
|
||||
|
||||
class WeiboJsonStoreImplement(AbstractStore):
|
||||
json_store_path: str = "data/weibo/json"
|
||||
words_store_path: str = "data/weibo/words"
|
||||
lock = asyncio.Lock()
|
||||
file_count:int=calculate_number_of_files(json_store_path)
|
||||
file_count: int = calculate_number_of_files(json_store_path)
|
||||
WordCloud = words.AsyncWordCloudGenerator()
|
||||
|
||||
|
||||
def make_save_file_name(self, store_type: str) -> (str,str):
|
||||
def make_save_file_name(self, store_type: str) -> (str, str):
|
||||
"""
|
||||
make save file name by store type
|
||||
Args:
|
||||
@ -170,7 +173,7 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
"""
|
||||
pathlib.Path(self.json_store_path).mkdir(parents=True, exist_ok=True)
|
||||
pathlib.Path(self.words_store_path).mkdir(parents=True, exist_ok=True)
|
||||
save_file_name,words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_file_name, words_file_name_prefix = self.make_save_file_name(store_type=store_type)
|
||||
save_data = []
|
||||
|
||||
async with self.lock:
|
||||
@ -209,3 +212,6 @@ class WeiboJsonStoreImplement(AbstractStore):
|
||||
|
||||
"""
|
||||
await self.save_data_to_json(comment_item, "comments")
|
||||
|
||||
async def store_creator(self, creator: Dict):
|
||||
# No-op: this store does not persist creator records.
pass
|
||||
|
Loading…
Reference in New Issue
Block a user