refactor: rename xhs to xiaohongshu

This commit is contained in:
Relakkes 2024-03-30 21:17:33 +08:00
parent aa257aab51
commit 67ec49498a
3 changed files with 28 additions and 28 deletions

View File

@ -14,7 +14,7 @@ from .field import SearchNoteType, SearchSortType
from .help import get_search_id, sign from .help import get_search_id, sign
class XHSClient: class XiaoHongShuClient:
def __init__( def __init__(
self, self,
timeout=10, timeout=10,
@ -134,14 +134,14 @@ class XHSClient:
""" """
"""get a note to check if login state is ok""" """get a note to check if login state is ok"""
utils.logger.info("[XHSClient.pong] Begin to pong xhs...") utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
ping_flag = False ping_flag = False
try: try:
note_card: Dict = await self.get_note_by_keyword(keyword="小红书") note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
if note_card.get("items"): if note_card.get("items"):
ping_flag = True ping_flag = True
except Exception as e: except Exception as e:
utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...") utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
ping_flag = False ping_flag = False
return ping_flag return ping_flag
@ -202,7 +202,7 @@ class XHSClient:
if res and res.get("items"): if res and res.get("items"):
res_dict: Dict = res["items"][0]["note_card"] res_dict: Dict = res["items"][0]["note_card"]
return res_dict return res_dict
utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}") utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note empty and res:{res}")
return dict() return dict()
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict: async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
@ -266,7 +266,7 @@ class XHSClient:
comments_cursor = comments_res.get("cursor", "") comments_cursor = comments_res.get("cursor", "")
if "comments" not in comments_res: if "comments" not in comments_res:
utils.logger.info( utils.logger.info(
f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}") f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
break break
comments = comments_res["comments"] comments = comments_res["comments"]
if callback: if callback:
@ -337,11 +337,11 @@ class XHSClient:
notes_has_more = notes_res.get("has_more", False) notes_has_more = notes_res.get("has_more", False)
notes_cursor = notes_res.get("cursor", "") notes_cursor = notes_res.get("cursor", "")
if "notes" not in notes_res: if "notes" not in notes_res:
utils.logger.info(f"[XHSClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}") utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
break break
notes = notes_res["notes"] notes = notes_res["notes"]
utils.logger.info(f"[XHSClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}") utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
if callback: if callback:
await callback(notes) await callback(notes)
await asyncio.sleep(crawl_interval) await asyncio.sleep(crawl_interval)

View File

@ -14,10 +14,10 @@ from store import xhs as xhs_store
from tools import utils from tools import utils
from var import crawler_type_var from var import crawler_type_var
from .client import XHSClient from .client import XiaoHongShuClient
from .exception import DataFetchError from .exception import DataFetchError
from .field import SearchSortType from .field import SearchSortType
from .login import XHSLogin from .login import XiaoHongShuLogin
class XiaoHongShuCrawler(AbstractCrawler): class XiaoHongShuCrawler(AbstractCrawler):
@ -25,7 +25,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
login_type: str login_type: str
crawler_type: str crawler_type: str
context_page: Page context_page: Page
xhs_client: XHSClient xhs_client: XiaoHongShuClient
browser_context: BrowserContext browser_context: BrowserContext
def __init__(self) -> None: def __init__(self) -> None:
@ -68,7 +68,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
# Create a client to interact with the xiaohongshu website. # Create a client to interact with the xiaohongshu website.
self.xhs_client = await self.create_xhs_client(httpx_proxy_format) self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
if not await self.xhs_client.pong(): if not await self.xhs_client.pong():
login_obj = XHSLogin( login_obj = XiaoHongShuLogin(
login_type=self.login_type, login_type=self.login_type,
login_phone="", # input your phone number login_phone="", # input your phone number
browser_context=self.browser_context, browser_context=self.browser_context,
@ -219,11 +219,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
} }
return playwright_proxy, httpx_proxy return playwright_proxy, httpx_proxy
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient: async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
"""Create xhs client""" """Create xhs client"""
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...") utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies()) cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
xhs_client_obj = XHSClient( xhs_client_obj = XiaoHongShuClient(
proxies=httpx_proxy, proxies=httpx_proxy,
headers={ headers={
"User-Agent": self.user_agent, "User-Agent": self.user_agent,

View File

@ -13,7 +13,7 @@ from base.base_crawler import AbstractLogin
from tools import utils from tools import utils
class XHSLogin(AbstractLogin): class XiaoHongShuLogin(AbstractLogin):
def __init__(self, def __init__(self,
login_type: str, login_type: str,
@ -37,7 +37,7 @@ class XHSLogin(AbstractLogin):
""" """
if "请通过验证" in await self.context_page.content(): if "请通过验证" in await self.context_page.content():
utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证") utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证")
current_cookie = await self.browser_context.cookies() current_cookie = await self.browser_context.cookies()
_, cookie_dict = utils.convert_cookies(current_cookie) _, cookie_dict = utils.convert_cookies(current_cookie)
@ -48,7 +48,7 @@ class XHSLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login xiaohongshu""" """Start login xiaohongshu"""
utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...") utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
if self.login_type == "qrcode": if self.login_type == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif self.login_type == "phone":
@ -56,11 +56,11 @@ class XHSLogin(AbstractLogin):
elif self.login_type == "cookie": elif self.login_type == "cookie":
await self.login_by_cookies() await self.login_by_cookies()
else: else:
raise ValueError("[XHSLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookies ...") raise ValueError("[XiaoHongShuLogin.begin] Invalid Login Type Currently only supported qrcode or phone or cookies ...")
async def login_by_mobile(self): async def login_by_mobile(self):
"""Login xiaohongshu by mobile""" """Login xiaohongshu by mobile"""
utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...") utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
await asyncio.sleep(1) await asyncio.sleep(1)
try: try:
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮 # 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
@ -77,7 +77,7 @@ class XHSLogin(AbstractLogin):
) )
await element.click() await element.click()
except Exception as e: except Exception as e:
utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...") utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...")
await asyncio.sleep(1) await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container") login_container_ele = await self.context_page.wait_for_selector("div.login-container")
@ -93,7 +93,7 @@ class XHSLogin(AbstractLogin):
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
no_logged_in_session = "" no_logged_in_session = ""
while max_get_sms_code_time > 0: while max_get_sms_code_time > 0:
utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...") utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1) await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}" sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = redis_obj.get(sms_code_key) sms_code_value = redis_obj.get(sms_code_key)
@ -119,16 +119,16 @@ class XHSLogin(AbstractLogin):
try: try:
await self.check_login_state(no_logged_in_session) await self.check_login_state(no_logged_in_session)
except RetryError: except RetryError:
utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...") utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
sys.exit() sys.exit()
wait_redirect_seconds = 5 wait_redirect_seconds = 5
utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds) await asyncio.sleep(wait_redirect_seconds)
async def login_by_qrcode(self): async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state""" """login xiaohongshu website and keep webdriver login state"""
utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...") utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
# login_selector = "div.login-container > div.left > div.qrcode > img" # login_selector = "div.login-container > div.left > div.qrcode > img"
qrcode_img_selector = "xpath=//img[@class='qrcode-img']" qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode # find login qrcode
@ -137,7 +137,7 @@ class XHSLogin(AbstractLogin):
selector=qrcode_img_selector selector=qrcode_img_selector
) )
if not base64_qrcode_img: if not base64_qrcode_img:
utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....") utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
# if this website does not automatically popup login dialog box, we will manual click login button # if this website does not automatically popup login dialog box, we will manual click login button
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button") login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@ -161,20 +161,20 @@ class XHSLogin(AbstractLogin):
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s") utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
try: try:
await self.check_login_state(no_logged_in_session) await self.check_login_state(no_logged_in_session)
except RetryError: except RetryError:
utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...") utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
sys.exit() sys.exit()
wait_redirect_seconds = 5 wait_redirect_seconds = 5
utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...") utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds) await asyncio.sleep(wait_redirect_seconds)
async def login_by_cookies(self): async def login_by_cookies(self):
"""login xiaohongshu website by cookies""" """login xiaohongshu website by cookies"""
utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...") utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
if key != "web_session": # only set web_session cookie attr if key != "web_session": # only set web_session cookie attr
continue continue