import sys
import asyncio
from typing import Optional, List, Dict

from playwright.async_api import Page
from playwright.async_api import Cookie
from playwright.async_api import BrowserContext
from playwright.async_api import async_playwright

import utils
from .client import XHSClient
from base_crawler import Crawler


class XiaoHongShuCrawler(Crawler):
    """Crawl xiaohongshu.com: log in by QR code, search notes, print their comments.

    Typical usage is ``init_config(keywords=...)`` followed by ``await start()``.
    """

    def __init__(self):
        # Search keywords; populated by init_config().
        self.keywords = None
        # Cookies of the browser context, refreshed by update_cookies().
        self.cookies: Optional[List[Cookie]] = None
        self.browser_context: Optional[BrowserContext] = None
        self.context_page: Optional[Page] = None
        # Proxy settings handed to both the browser context and XHSClient.
        self.proxy: Optional[Dict] = None
        self.user_agent = utils.get_user_agent()
        self.xhs_client: Optional[XHSClient] = None
        self.login_url = "https://www.xiaohongshu.com"
        # How long login() waits for the QR code to be scanned.
        self.scan_qrcode_time = 20  # second

    def init_config(self, **kwargs):
        """Store crawler configuration; only the ``keywords`` kwarg is read."""
        self.keywords = kwargs.get("keywords")

    async def update_cookies(self):
        """Refresh ``self.cookies`` from the current browser context."""
        self.cookies = await self.browser_context.cookies()

    async def start(self):
        """Run the whole crawl: launch browser, log in, search, fetch comments.

        After the crawl finishes this coroutine waits on a fresh
        ``asyncio.Event`` that is never set, so it does not return on its own.
        """
        async with async_playwright() as playwright:
            # launch browser and create single browser context
            chromium = playwright.chromium
            browser = await chromium.launch(headless=True)
            self.browser_context = await browser.new_context(
                viewport={"width": 1920, "height": 1080},
                user_agent=self.user_agent,
                proxy=self.proxy
            )

            # execute JS to bypass anti automation/crawler detection
            await self.browser_context.add_init_script(path="libs/stealth.min.js")
            self.context_page = await self.browser_context.new_page()
            await self.context_page.goto(self.login_url)

            # scan qrcode login
            await self.login()
            await self.update_cookies()

            # init request client
            cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
            self.xhs_client = XHSClient(
                proxies=self.proxy,
                headers={
                    "User-Agent": self.user_agent,
                    "Cookie": cookie_str,
                    "Origin": "https://www.xiaohongshu.com",
                    "Referer": "https://www.xiaohongshu.com",
                    "Content-Type": "application/json;charset=UTF-8"
                },
                playwright_page=self.context_page,
                cookie_dict=cookie_dict,
            )

            # Search for notes and retrieve their comment information.
            note_res = await self.search_posts()
            # "items" may be missing or None; treat that as "no notes".
            for post_item in note_res.get("items") or []:
                note_id = post_item.get("id")
                await self.get_comments(note_id=note_id)
                # Pause between notes to space out the requests.
                await asyncio.sleep(1)

            # block main crawler coroutine
            await asyncio.Event().wait()

    async def login(self):
        """Login xiaohongshu website and keep webdriver login state.

        Shows the login QR code and polls the ``web_session`` cookie once per
        second for up to ``self.scan_qrcode_time`` seconds.

        Raises:
            SystemExit: if no QR code is found and the session cookie has not
                changed, or if the QR code is not scanned in time.
        """
        print("Begin login xiaohongshu ...")

        # find login qrcode
        base64_qrcode_img = await utils.find_login_qrcode(
            self.context_page,
            selector="div.login-container > div.left > div.qrcode > img"
        )

        # Remember the pre-login session value; a different value later is
        # taken as proof that the login went through.
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        no_logged_in_session = cookie_dict.get("web_session")

        if not base64_qrcode_img:
            if await self.check_login_state(no_logged_in_session):
                return
            # todo ...if this website does not automatically popup login dialog box, we will manual click login button
            print("login failed , have not found qrcode please check ....")
            sys.exit()

        # show login qrcode
        utils.show_qrcode(base64_qrcode_img)

        # Count down on a local copy so the configured timeout on the instance
        # is not consumed and a later login() call gets the full window again.
        remaining_time = self.scan_qrcode_time
        while remaining_time > 0:
            await asyncio.sleep(1)
            remaining_time -= 1
            print(f"waiting for scan code login, remaining time is {remaining_time} seconds")
            # get login state from browser
            if await self.check_login_state(no_logged_in_session):
                # If the QR code login is successful, you need to wait for a moment.
                # Because there will be a second redirection after successful login
                # executing JS during this period may be performed in a Page that has already been destroyed.
                wait_for_seconds = 5
                print(f"Login successful then wait for {wait_for_seconds} seconds redirect ...")
                while wait_for_seconds > 0:
                    await asyncio.sleep(1)
                    print(f"remaining wait {wait_for_seconds} seconds ...")
                    wait_for_seconds -= 1
                break
        else:
            # Loop ended without a break: the QR code was never scanned in time.
            sys.exit()

    async def check_login_state(self, no_logged_in_session: Optional[str]) -> bool:
        """Check if the current login status is successful.

        Args:
            no_logged_in_session: ``web_session`` cookie value captured before
                login (``None`` when the cookie was absent).

        Returns:
            True if the current ``web_session`` cookie differs from
            ``no_logged_in_session``, otherwise False.
        """
        current_cookie = await self.browser_context.cookies()
        _, cookie_dict = utils.convert_cookies(current_cookie)
        return cookie_dict.get("web_session") != no_logged_in_session

    async def search_posts(self):
        """Search notes by ``self.keywords``, print each hit, and return the raw response.

        Returns:
            The response of ``XHSClient.get_note_by_keyword`` unchanged.
        """
        # This function only retrieves the first 10 note
        # And you can continue to make requests to obtain more by checking the boolean status of "has_more".
        print("Begin search xiaohongshu keywords: ", self.keywords)
        posts_res = await self.xhs_client.get_note_by_keyword(keyword=self.keywords)
        # "items" may be missing or None; treat that as "no notes".
        for post_item in posts_res.get("items") or []:
            note_id = post_item.get("id")
            # "note_card" may be present but None, so do not rely on the get() default.
            title = (post_item.get("note_card") or {}).get("display_title")
            print(f"Note ID:{note_id}; Title:{title}")
            # todo record note or save to db or csv
        return posts_res

    async def get_comments(self, note_id: str):
        """Fetch comments of one note, print them, and return the raw response.

        Args:
            note_id: id of the note whose comments are requested.

        Returns:
            The response of ``XHSClient.get_note_comments`` unchanged.
        """
        # This function only retrieves the first 10 comments
        # And you can continue to make requests to obtain more by checking the boolean status of "has_more".
        print("Begin get note id comments ", note_id)
        res = await self.xhs_client.get_note_comments(note_id=note_id)
        # res = await self.xhs_client.get_note_all_comments(note_id=note_id)
        # "comments" may be missing or None; treat that as "no comments".
        for comment in res.get("comments") or []:
            # A comment without "user_info" yields a None nickname instead of crashing.
            nick_name = (comment.get("user_info") or {}).get("nickname")
            comment_content = comment.get("content")
            print(f"Nickname:{nick_name}; Comment content:{comment_content}")
            # todo save to db or csv
        return res