# 声明:本代码仅供学习和研究目的使用。使用者应遵守以下原则: # 1. 不得用于任何商业用途。 # 2. 使用时应遵守目标平台的使用条款和robots.txt规则。 # 3. 不得进行大规模爬取或对平台造成运营干扰。 # 4. 应合理控制请求频率,避免给目标平台带来不必要的负担。 # 5. 不得用于任何非法或不当的用途。 # # 详细许可条款请参阅项目根目录下的LICENSE文件。 # 使用本代码即表示您同意遵守上述原则和LICENSE中的所有条款。 import asyncio import json import re from typing import Any, Callable, Dict, List, Optional, Union from urllib.parse import urlencode import httpx from playwright.async_api import BrowserContext, Page from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_result import config from base.base_crawler import AbstractApiClient from tools import utils from .exception import DataFetchError, IPBlockError from .field import SearchNoteType, SearchSortType from .help import get_search_id, sign class XiaoHongShuClient(AbstractApiClient): def __init__( self, timeout=10, proxies=None, *, headers: Dict[str, str], playwright_page: Page, cookie_dict: Dict[str, str], ): self.proxies = proxies self.timeout = timeout self.headers = headers self._host = "https://edith.xiaohongshu.com" self._domain = "https://www.xiaohongshu.com" self.IP_ERROR_STR = "网络连接异常,请检查网络设置或重启试试" self.IP_ERROR_CODE = 300012 self.NOTE_ABNORMAL_STR = "笔记状态异常,请稍后查看" self.NOTE_ABNORMAL_CODE = -510001 self.playwright_page = playwright_page self.cookie_dict = cookie_dict async def _pre_headers(self, url: str, data=None) -> Dict: """ 请求头参数签名 Args: url: data: Returns: """ encrypt_params = await self.playwright_page.evaluate("([url, data]) => window._webmsxyw(url,data)", [url, data]) local_storage = await self.playwright_page.evaluate("() => window.localStorage") signs = sign( a1=self.cookie_dict.get("a1", ""), b1=local_storage.get("b1", ""), x_s=encrypt_params.get("X-s", ""), x_t=str(encrypt_params.get("X-t", "")) ) headers = { "X-S": signs["x-s"], "X-T": signs["x-t"], "x-S-Common": signs["x-s-common"], "X-B3-Traceid": signs["x-b3-traceid"] } self.headers.update(headers) return self.headers @retry(stop=stop_after_attempt(3), wait=wait_fixed(1)) async def request(self, method, url, **kwargs) -> Union[str, Any]: """ 封装httpx的公共请求方法,对请求响应做一些处理 Args: method: 请求方法 url: 请求的URL **kwargs: 其他请求参数,例如请求头、请求体等 Returns: """ # return response.text return_response = kwargs.pop('return_response', False) async with httpx.AsyncClient(proxies=self.proxies) as client: response = await client.request( method, url, timeout=self.timeout, **kwargs ) if response.status_code == 471 or response.status_code == 461: # someday someone maybe will bypass captcha verify_type = response.headers['Verifytype'] verify_uuid = response.headers['Verifyuuid'] raise Exception( f"出现验证码,请求失败,Verifytype: {verify_type},Verifyuuid: {verify_uuid}, Response: {response}") if return_response: return response.text data: Dict = response.json() if data["success"]: return data.get("data", data.get("success", {})) elif data["code"] == self.IP_ERROR_CODE: raise IPBlockError(self.IP_ERROR_STR) else: raise DataFetchError(data.get("msg", None)) async def get(self, uri: str, params=None) -> Dict: """ GET请求,对请求头签名 Args: uri: 请求路由 params: 请求参数 Returns: """ final_uri = uri if isinstance(params, dict): final_uri = (f"{uri}?" f"{urlencode(params)}") headers = await self._pre_headers(final_uri) return await self.request(method="GET", url=f"{self._host}{final_uri}", headers=headers) async def post(self, uri: str, data: dict, **kwargs) -> Dict: """ POST请求,对请求头签名 Args: uri: 请求路由 data: 请求体参数 Returns: """ headers = await self._pre_headers(uri, data) json_str = json.dumps(data, separators=(',', ':'), ensure_ascii=False) return await self.request(method="POST", url=f"{self._host}{uri}", data=json_str, headers=headers, **kwargs) async def get_note_media(self, url: str) -> Union[bytes, None]: async with httpx.AsyncClient(proxies=self.proxies) as client: response = await client.request("GET", url, timeout=self.timeout) if not response.reason_phrase == "OK": utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}") return None else: return response.content async def pong(self) -> bool: """ 用于检查登录态是否失效了 Returns: """ """get a note to check if login state is ok""" utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...") ping_flag = False try: note_card: Dict = await self.get_note_by_keyword(keyword="小红书") if note_card.get("items"): ping_flag = True except Exception as e: utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...") ping_flag = False return ping_flag async def update_cookies(self, browser_context: BrowserContext): """ API客户端提供的更新cookies方法,一般情况下登录成功后会调用此方法 Args: browser_context: 浏览器上下文对象 Returns: """ cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies()) self.headers["Cookie"] = cookie_str self.cookie_dict = cookie_dict async def get_note_by_keyword( self, keyword: str, search_id: str = get_search_id(), page: int = 1, page_size: int = 20, sort: SearchSortType = SearchSortType.GENERAL, note_type: SearchNoteType = SearchNoteType.ALL ) -> Dict: """ 根据关键词搜索笔记 Args: keyword: 关键词参数 page: 分页第几页 page_size: 分页数据长度 sort: 搜索结果排序指定 note_type: 搜索的笔记类型 Returns: """ uri = "/api/sns/web/v1/search/notes" data = { "keyword": keyword, "page": page, "page_size": page_size, "search_id": search_id, "sort": sort.value, "note_type": note_type.value } return await self.post(uri, data) async def get_note_by_id(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict: """ 获取笔记详情API Args: note_id:笔记ID xsec_source: 渠道来源 xsec_token: 搜索关键字之后返回的比较列表中返回的token Returns: """ if xsec_source == "": xsec_source = "pc_search" data = { "source_note_id": note_id, "image_formats": ["jpg", "webp", "avif"], "extra": {"need_body_topic": 1}, "xsec_source": xsec_source, "xsec_token": xsec_token } uri = "/api/sns/web/v1/feed" res = await self.post(uri, data) if res and res.get("items"): res_dict: Dict = res["items"][0]["note_card"] return res_dict # 爬取频繁了可能会出现有的笔记能有结果有的没有 utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}") return dict() async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict: """ 获取一级评论的API Args: note_id: 笔记ID cursor: 分页游标 Returns: """ uri = "/api/sns/web/v2/comment/page" params = { "note_id": note_id, "cursor": cursor, "top_comment_id": "", "image_formats": "jpg,webp,avif" } return await self.get(uri, params) async def get_note_sub_comments(self, note_id: str, root_comment_id: str, num: int = 10, cursor: str = ""): """ 获取指定父评论下的子评论的API Args: note_id: 子评论的帖子ID root_comment_id: 根评论ID num: 分页数量 cursor: 分页游标 Returns: """ uri = "/api/sns/web/v2/comment/sub/page" params = { "note_id": note_id, "root_comment_id": root_comment_id, "num": num, "cursor": cursor, } return await self.get(uri, params) async def get_note_all_comments(self, note_id: str, crawl_interval: float = 1.0, callback: Optional[Callable] = None, max_count: int = 10) -> List[Dict]: """ 获取指定笔记下的所有一级评论,该方法会一直查找一个帖子下的所有评论信息 Args: note_id: 笔记ID crawl_interval: 爬取一次笔记的延迟单位(秒) callback: 一次笔记爬取结束后 max_count: 一次笔记爬取的最大评论数量 Returns: """ result = [] comments_has_more = True comments_cursor = "" while comments_has_more and len(result) < max_count: comments_res = await self.get_note_comments(note_id, comments_cursor) comments_has_more = comments_res.get("has_more", False) comments_cursor = comments_res.get("cursor", "") if "comments" not in comments_res: utils.logger.info( f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") break comments = comments_res["comments"] if len(result) + len(comments) > max_count: comments = comments[:max_count - len(result)] if callback: await callback(note_id, comments) await asyncio.sleep(crawl_interval) result.extend(comments) sub_comments = await self.get_comments_all_sub_comments(comments, crawl_interval, callback) result.extend(sub_comments) return result async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0, callback: Optional[Callable] = None) -> List[Dict]: """ 获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息 Args: comments: 评论列表 crawl_interval: 爬取一次评论的延迟单位(秒) callback: 一次评论爬取结束后 Returns: """ if not config.ENABLE_GET_SUB_COMMENTS: utils.logger.info( f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled") return [] result = [] for comment in comments: note_id = comment.get("note_id") sub_comments = comment.get("sub_comments") if sub_comments and callback: await callback(note_id, sub_comments) sub_comment_has_more = comment.get("sub_comment_has_more") if not sub_comment_has_more: continue root_comment_id = comment.get("id") sub_comment_cursor = comment.get("sub_comment_cursor") while sub_comment_has_more: comments_res = await self.get_note_sub_comments(note_id, root_comment_id, 10, sub_comment_cursor) sub_comment_has_more = comments_res.get("has_more", False) sub_comment_cursor = comments_res.get("cursor", "") if "comments" not in comments_res: utils.logger.info( f"[XiaoHongShuClient.get_comments_all_sub_comments] No 'comments' key found in response: {comments_res}") break comments = comments_res["comments"] if callback: await callback(note_id, comments) await asyncio.sleep(crawl_interval) result.extend(comments) return result async def get_creator_info(self, user_id: str) -> Dict: """ 通过解析网页版的用户主页HTML,获取用户个人简要信息 PC端用户主页的网页存在window.__INITIAL_STATE__这个变量上的,解析它即可 eg: https://www.xiaohongshu.com/user/profile/59d8cb33de5fb4696bf17217 """ uri = f"/user/profile/{user_id}" html_content = await self.request("GET", self._domain + uri, return_response=True, headers=self.headers) match = re.search(r'", html)[ 0 ].replace("undefined", '""') if state != "{}": note_dict = transform_json_keys(state) return note_dict["note"]["note_detail_map"][note_id]["note"] return {} try: return get_note_dict(html) except: href = re.findall(r'href="(.*?)"', html)[0] utils.logger.info( f"[XiaoHongShuClient.get_note_by_id_from_html] 出现验证码: {href}, 请手动验证" ) await self.playwright_page.goto(href) # 等待用户完成操作页面重定向 if await self.check_redirect(): html = await self.playwright_page.content() return get_note_dict(html) else: raise DataFetchError(html) @retry( stop=stop_after_attempt(100), wait=wait_fixed(5), retry=retry_if_result(lambda value: value is False), ) async def check_redirect(self): url = await self.playwright_page.url() if url.startswith("https://www.xiaohongshu.com/explore"): return True return False