diff --git a/config/base_config.py b/config/base_config.py index cdac54e..076003a 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -40,7 +40,7 @@ START_PAGE = 1 CRAWLER_MAX_NOTES_COUNT = 20 # 并发爬虫数量控制 -MAX_CONCURRENCY_NUM = 4 +MAX_CONCURRENCY_NUM = 1 # 是否开启爬图片模式, 默认不开启爬图片 ENABLE_GET_IMAGES = False diff --git a/media_platform/xhs/client.py b/media_platform/xhs/client.py index 78aaec1..62afdcf 100644 --- a/media_platform/xhs/client.py +++ b/media_platform/xhs/client.py @@ -198,27 +198,34 @@ class XiaoHongShuClient(AbstractApiClient): } return await self.post(uri, data) - async def get_note_by_id(self, note_id: str) -> Dict: + async def get_note_by_id(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict: """ 获取笔记详情API Args: note_id:笔记ID + xsec_source: 渠道来源 + xsec_token: 搜索关键字之后返回的比较列表中返回的token Returns: """ + if xsec_source == "": + xsec_source = "pc_search" + data = { "source_note_id": note_id, "image_formats": ["jpg", "webp", "avif"], "extra": {"need_body_topic": 1}, - "xsec_source": "pc_feed", + "xsec_source": xsec_source, + "xsec_token": xsec_token } uri = "/api/sns/web/v1/feed" res = await self.post(uri, data) if res and res.get("items"): res_dict: Dict = res["items"][0]["note_card"] return res_dict - utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note empty and res:{res}") + # 爬取频繁了可能会出现有的笔记能有结果有的没有 + utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}") return dict() async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict: @@ -292,9 +299,9 @@ class XiaoHongShuClient(AbstractApiClient): sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback) result.extend(sub_comments) return result - + async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, - callback: Optional[Callable] = None) -> List[Dict]: + callback: Optional[Callable] = None) -> List[Dict]: """ 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息 Args: @@ -306,23 +313,24 @@ class XiaoHongShuClient(AbstractApiClient): """ if not config.ENABLE_GET_SUB_COMMENTS: - utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled") + utils.logger.info( + f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled") return [] - + result = [] for comment in comments: note_id = comment.get("note_id") sub_comments = comment.get("sub_comments") if sub_comments and callback: await callback(note_id, sub_comments) - + sub_comment_has_more = comment.get("sub_comment_has_more") if not sub_comment_has_more: continue root_comment_id = comment.get("id") sub_comment_cursor = comment.get("sub_comment_cursor") - + while sub_comment_has_more: comments_res = await self.get_note_sub_comments(note_id, root_comment_id, 10, sub_comment_cursor) sub_comment_has_more = comments_res.get("has_more", False) @@ -398,17 +406,20 @@ class XiaoHongShuClient(AbstractApiClient): while notes_has_more: notes_res = await self.get_notes_by_creator(user_id, notes_cursor) if not notes_res: - utils.logger.error(f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.") + utils.logger.error( + f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.") break notes_has_more = notes_res.get("has_more", False) notes_cursor = notes_res.get("cursor", "") if "notes" not in notes_res: - utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}") + utils.logger.info( + f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}") break notes = notes_res["notes"] - utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}") + utils.logger.info( + f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}") if callback: await callback(notes) await asyncio.sleep(crawl_interval) diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 5a9293f..a8d3172 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -27,7 +27,8 @@ class XiaoHongShuCrawler(AbstractCrawler): def __init__(self) -> None: self.index_url = "https://www.xiaohongshu.com" - self.user_agent = utils.get_user_agent() + # self.user_agent = utils.get_user_agent() + self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36" async def start(self) -> None: playwright_proxy_format, httpx_proxy_format = None, None @@ -110,18 +111,23 @@ class XiaoHongShuCrawler(AbstractCrawler): sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL, ) utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}") - if(not notes_res or not notes_res.get('has_more', False)): + if not notes_res or not notes_res.get('has_more', False): utils.logger.info("No more content!") break semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_note_detail(post_item.get("id"), semaphore) + self.get_note_detail( + note_id=post_item.get("id"), + xsec_source=post_item.get("xsec_source"), + xsec_token=post_item.get("xsec_token"), + semaphore=semaphore + ) for post_item in notes_res.get("items", {}) if post_item.get('model_type') not in ('rec_query', 'hot_query') ] note_details = await asyncio.gather(*task_list) for note_detail in note_details: - if note_detail is not None: + if note_detail: await xhs_store.update_xhs_note(note_detail) await self.get_notice_media(note_detail) note_id_list.append(note_detail.get("note_id")) @@ -157,32 +163,42 @@ class XiaoHongShuCrawler(AbstractCrawler): """ semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) task_list = [ - self.get_note_detail(post_item.get("note_id"), semaphore) for post_item in note_list + self.get_note_detail( + note_id=post_item.get("id"), + xsec_source=post_item.get("xsec_source"), + xsec_token=post_item.get("xsec_token"), + semaphore=semaphore + ) + for post_item in note_list ] note_details = await asyncio.gather(*task_list) for note_detail in note_details: - if note_detail is not None: + if note_detail: await xhs_store.update_xhs_note(note_detail) async def get_specified_notes(self): """Get the information and comments of the specified post""" - semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - task_list = [ - self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST - ] - note_details = await asyncio.gather(*task_list) - for note_detail in note_details: - if note_detail is not None: - await xhs_store.update_xhs_note(note_detail) - await self.get_notice_media(note_detail) - await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST) + # todo 指定帖子爬取暂时失效,xhs更新了帖子详情的请求参数,需要携带xsec_token,目前发现该参数只能在搜索场景下获取到 + raise Exception( + "指定帖子爬取暂时失效,xhs更新了帖子详情的请求参数,需要携带xsec_token,目前发现只能在搜索场景下获取到") + # semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + # task_list = [ + # self.get_note_detail(note_id=note_id, xsec_token="", semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST + # ] + # note_details = await asyncio.gather(*task_list) + # for note_detail in note_details: + # if note_detail is not None: + # await xhs_store.update_xhs_note(note_detail) + # await self.get_notice_media(note_detail) + # await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST) - async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]: + async def get_note_detail(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \ + Optional[Dict]: """Get note detail""" async with semaphore: try: - return await self.xhs_client.get_note_by_id(note_id) + return await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token) except DataFetchError as ex: utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}") return None diff --git a/media_platform/xhs/help.py b/media_platform/xhs/help.py index 72b269f..5d887a5 100644 --- a/media_platform/xhs/help.py +++ b/media_platform/xhs/help.py @@ -10,19 +10,19 @@ def sign(a1="", b1="", x_s="", x_t=""): takes in a URI (uniform resource identifier), an optional data dictionary, and an optional ctime parameter. It returns a dictionary containing two keys: "x-s" and "x-t". """ common = { - "s0": 5, # getPlatformCode + "s0": 3, # getPlatformCode "s1": "", "x0": "1", # localStorage.getItem("b1b1") - "x1": "3.3.0", # version - "x2": "Windows", + "x1": "3.7.8-2", # version + "x2": "Mac OS", "x3": "xhs-pc-web", - "x4": "1.4.4", + "x4": "4.27.2", "x5": a1, # cookie of a1 "x6": x_t, "x7": x_s, "x8": b1, # localStorage.getItem("b1") "x9": mrc(x_t + x_s + b1), - "x10": 1, # getSigCount + "x10": 154, # getSigCount } encode_str = encodeUtf8(json.dumps(common, separators=(',', ':'))) x_s_common = b64Encode(encode_str) diff --git a/store/bilibili/__init__.py b/store/bilibili/__init__.py index 9ba4a21..1bc52cd 100644 --- a/store/bilibili/__init__.py +++ b/store/bilibili/__init__.py @@ -10,6 +10,7 @@ import config from .bilibili_store_impl import * from .bilibilli_store_video import * + class BiliStoreFactory: STORES = { "csv": BiliCsvStoreImplement, diff --git a/store/weibo/weibo_store_impl.py b/store/weibo/weibo_store_impl.py index aa5a019..f3476ff 100644 --- a/store/weibo/weibo_store_impl.py +++ b/store/weibo/weibo_store_impl.py @@ -33,6 +33,9 @@ def calculate_number_of_files(file_store_path: str) -> int: class WeiboCsvStoreImplement(AbstractStore): + async def store_creator(self, creator: Dict): + pass + csv_store_path: str = "data/weibo" file_count:int=calculate_number_of_files(csv_store_path)