refactor: rename xhs to xiaohongshu
This commit is contained in:
parent
aa257aab51
commit
67ec49498a
@ -14,7 +14,7 @@ from .field import SearchNoteType, SearchSortType
|
|||||||
from .help import get_search_id, sign
|
from .help import get_search_id, sign
|
||||||
|
|
||||||
|
|
||||||
class XHSClient:
|
class XiaoHongShuClient:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
timeout=10,
|
timeout=10,
|
||||||
@ -134,14 +134,14 @@ class XHSClient:
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
"""get a note to check if login state is ok"""
|
"""get a note to check if login state is ok"""
|
||||||
utils.logger.info("[XHSClient.pong] Begin to pong xhs...")
|
utils.logger.info("[XiaoHongShuClient.pong] Begin to pong xhs...")
|
||||||
ping_flag = False
|
ping_flag = False
|
||||||
try:
|
try:
|
||||||
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
|
note_card: Dict = await self.get_note_by_keyword(keyword="小红书")
|
||||||
if note_card.get("items"):
|
if note_card.get("items"):
|
||||||
ping_flag = True
|
ping_flag = True
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.error(f"[XHSClient.pong] Ping xhs failed: {e}, and try to login again...")
|
utils.logger.error(f"[XiaoHongShuClient.pong] Ping xhs failed: {e}, and try to login again...")
|
||||||
ping_flag = False
|
ping_flag = False
|
||||||
return ping_flag
|
return ping_flag
|
||||||
|
|
||||||
@ -202,7 +202,7 @@ class XHSClient:
|
|||||||
if res and res.get("items"):
|
if res and res.get("items"):
|
||||||
res_dict: Dict = res["items"][0]["note_card"]
|
res_dict: Dict = res["items"][0]["note_card"]
|
||||||
return res_dict
|
return res_dict
|
||||||
utils.logger.error(f"[XHSClient.get_note_by_id] get note empty and res:{res}")
|
utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note empty and res:{res}")
|
||||||
return dict()
|
return dict()
|
||||||
|
|
||||||
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
|
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
|
||||||
@ -266,7 +266,7 @@ class XHSClient:
|
|||||||
comments_cursor = comments_res.get("cursor", "")
|
comments_cursor = comments_res.get("cursor", "")
|
||||||
if "comments" not in comments_res:
|
if "comments" not in comments_res:
|
||||||
utils.logger.info(
|
utils.logger.info(
|
||||||
f"[XHSClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
|
f"[XiaoHongShuClient.get_note_all_comments] No 'comments' key found in response: {comments_res}")
|
||||||
break
|
break
|
||||||
comments = comments_res["comments"]
|
comments = comments_res["comments"]
|
||||||
if callback:
|
if callback:
|
||||||
@ -337,11 +337,11 @@ class XHSClient:
|
|||||||
notes_has_more = notes_res.get("has_more", False)
|
notes_has_more = notes_res.get("has_more", False)
|
||||||
notes_cursor = notes_res.get("cursor", "")
|
notes_cursor = notes_res.get("cursor", "")
|
||||||
if "notes" not in notes_res:
|
if "notes" not in notes_res:
|
||||||
utils.logger.info(f"[XHSClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
||||||
break
|
break
|
||||||
|
|
||||||
notes = notes_res["notes"]
|
notes = notes_res["notes"]
|
||||||
utils.logger.info(f"[XHSClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
|
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
|
||||||
if callback:
|
if callback:
|
||||||
await callback(notes)
|
await callback(notes)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
|
@ -14,10 +14,10 @@ from store import xhs as xhs_store
|
|||||||
from tools import utils
|
from tools import utils
|
||||||
from var import crawler_type_var
|
from var import crawler_type_var
|
||||||
|
|
||||||
from .client import XHSClient
|
from .client import XiaoHongShuClient
|
||||||
from .exception import DataFetchError
|
from .exception import DataFetchError
|
||||||
from .field import SearchSortType
|
from .field import SearchSortType
|
||||||
from .login import XHSLogin
|
from .login import XiaoHongShuLogin
|
||||||
|
|
||||||
|
|
||||||
class XiaoHongShuCrawler(AbstractCrawler):
|
class XiaoHongShuCrawler(AbstractCrawler):
|
||||||
@ -25,7 +25,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
login_type: str
|
login_type: str
|
||||||
crawler_type: str
|
crawler_type: str
|
||||||
context_page: Page
|
context_page: Page
|
||||||
xhs_client: XHSClient
|
xhs_client: XiaoHongShuClient
|
||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
@ -68,7 +68,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
# Create a client to interact with the xiaohongshu website.
|
# Create a client to interact with the xiaohongshu website.
|
||||||
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
|
self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
|
||||||
if not await self.xhs_client.pong():
|
if not await self.xhs_client.pong():
|
||||||
login_obj = XHSLogin(
|
login_obj = XiaoHongShuLogin(
|
||||||
login_type=self.login_type,
|
login_type=self.login_type,
|
||||||
login_phone="", # input your phone number
|
login_phone="", # input your phone number
|
||||||
browser_context=self.browser_context,
|
browser_context=self.browser_context,
|
||||||
@ -219,11 +219,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
}
|
}
|
||||||
return playwright_proxy, httpx_proxy
|
return playwright_proxy, httpx_proxy
|
||||||
|
|
||||||
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
|
async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XiaoHongShuClient:
|
||||||
"""Create xhs client"""
|
"""Create xhs client"""
|
||||||
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
|
utils.logger.info("[XiaoHongShuCrawler.create_xhs_client] Begin create xiaohongshu API client ...")
|
||||||
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
|
||||||
xhs_client_obj = XHSClient(
|
xhs_client_obj = XiaoHongShuClient(
|
||||||
proxies=httpx_proxy,
|
proxies=httpx_proxy,
|
||||||
headers={
|
headers={
|
||||||
"User-Agent": self.user_agent,
|
"User-Agent": self.user_agent,
|
||||||
|
@ -13,7 +13,7 @@ from base.base_crawler import AbstractLogin
|
|||||||
from tools import utils
|
from tools import utils
|
||||||
|
|
||||||
|
|
||||||
class XHSLogin(AbstractLogin):
|
class XiaoHongShuLogin(AbstractLogin):
|
||||||
|
|
||||||
def __init__(self,
|
def __init__(self,
|
||||||
login_type: str,
|
login_type: str,
|
||||||
@ -37,7 +37,7 @@ class XHSLogin(AbstractLogin):
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
if "请通过验证" in await self.context_page.content():
|
if "请通过验证" in await self.context_page.content():
|
||||||
utils.logger.info("[XHSLogin.check_login_state] 登录过程中出现验证码,请手动验证")
|
utils.logger.info("[XiaoHongShuLogin.check_login_state] 登录过程中出现验证码,请手动验证")
|
||||||
|
|
||||||
current_cookie = await self.browser_context.cookies()
|
current_cookie = await self.browser_context.cookies()
|
||||||
_, cookie_dict = utils.convert_cookies(current_cookie)
|
_, cookie_dict = utils.convert_cookies(current_cookie)
|
||||||
@ -48,7 +48,7 @@ class XHSLogin(AbstractLogin):
|
|||||||
|
|
||||||
async def begin(self):
|
async def begin(self):
|
||||||
"""Start login xiaohongshu"""
|
"""Start login xiaohongshu"""
|
||||||
utils.logger.info("[XHSLogin.begin] Begin login xiaohongshu ...")
|
utils.logger.info("[XiaoHongShuLogin.begin] Begin login xiaohongshu ...")
|
||||||
if self.login_type == "qrcode":
|
if self.login_type == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif self.login_type == "phone":
|
||||||
@ -56,11 +56,11 @@ class XHSLogin(AbstractLogin):
|
|||||||
elif self.login_type == "cookie":
|
elif self.login_type == "cookie":
|
||||||
await self.login_by_cookies()
|
await self.login_by_cookies()
|
||||||
else:
|
else:
|
||||||
raise ValueError("[XHSLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
raise ValueError("[XiaoHongShuLogin.begin]I nvalid Login Type Currently only supported qrcode or phone or cookies ...")
|
||||||
|
|
||||||
async def login_by_mobile(self):
|
async def login_by_mobile(self):
|
||||||
"""Login xiaohongshu by mobile"""
|
"""Login xiaohongshu by mobile"""
|
||||||
utils.logger.info("[XHSLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
|
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Begin login xiaohongshu by mobile ...")
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
try:
|
try:
|
||||||
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
|
||||||
@ -77,7 +77,7 @@ class XHSLogin(AbstractLogin):
|
|||||||
)
|
)
|
||||||
await element.click()
|
await element.click()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
utils.logger.info("[XHSLogin.login_by_mobile] have not found mobile button icon and keep going ...")
|
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] have not found mobile button icon and keep going ...")
|
||||||
|
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||||
@ -93,7 +93,7 @@ class XHSLogin(AbstractLogin):
|
|||||||
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
|
||||||
no_logged_in_session = ""
|
no_logged_in_session = ""
|
||||||
while max_get_sms_code_time > 0:
|
while max_get_sms_code_time > 0:
|
||||||
utils.logger.info(f"[XHSLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] get sms code from redis remaining time {max_get_sms_code_time}s ...")
|
||||||
await asyncio.sleep(1)
|
await asyncio.sleep(1)
|
||||||
sms_code_key = f"xhs_{self.login_phone}"
|
sms_code_key = f"xhs_{self.login_phone}"
|
||||||
sms_code_value = redis_obj.get(sms_code_key)
|
sms_code_value = redis_obj.get(sms_code_key)
|
||||||
@ -119,16 +119,16 @@ class XHSLogin(AbstractLogin):
|
|||||||
try:
|
try:
|
||||||
await self.check_login_state(no_logged_in_session)
|
await self.check_login_state(no_logged_in_session)
|
||||||
except RetryError:
|
except RetryError:
|
||||||
utils.logger.info("[XHSLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
|
utils.logger.info("[XiaoHongShuLogin.login_by_mobile] Login xiaohongshu failed by mobile login method ...")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
wait_redirect_seconds = 5
|
wait_redirect_seconds = 5
|
||||||
utils.logger.info(f"[XHSLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_mobile] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||||
await asyncio.sleep(wait_redirect_seconds)
|
await asyncio.sleep(wait_redirect_seconds)
|
||||||
|
|
||||||
async def login_by_qrcode(self):
|
async def login_by_qrcode(self):
|
||||||
"""login xiaohongshu website and keep webdriver login state"""
|
"""login xiaohongshu website and keep webdriver login state"""
|
||||||
utils.logger.info("[XHSLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
|
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Begin login xiaohongshu by qrcode ...")
|
||||||
# login_selector = "div.login-container > div.left > div.qrcode > img"
|
# login_selector = "div.login-container > div.left > div.qrcode > img"
|
||||||
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
|
||||||
# find login qrcode
|
# find login qrcode
|
||||||
@ -137,7 +137,7 @@ class XHSLogin(AbstractLogin):
|
|||||||
selector=qrcode_img_selector
|
selector=qrcode_img_selector
|
||||||
)
|
)
|
||||||
if not base64_qrcode_img:
|
if not base64_qrcode_img:
|
||||||
utils.logger.info("[XHSLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] login failed , have not found qrcode please check ....")
|
||||||
# if this website does not automatically popup login dialog box, we will manual click login button
|
# if this website does not automatically popup login dialog box, we will manual click login button
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
|
||||||
@ -161,20 +161,20 @@ class XHSLogin(AbstractLogin):
|
|||||||
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
|
||||||
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
|
||||||
|
|
||||||
utils.logger.info(f"[XHSLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] waiting for scan code login, remaining time is 120s")
|
||||||
try:
|
try:
|
||||||
await self.check_login_state(no_logged_in_session)
|
await self.check_login_state(no_logged_in_session)
|
||||||
except RetryError:
|
except RetryError:
|
||||||
utils.logger.info("[XHSLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
|
utils.logger.info("[XiaoHongShuLogin.login_by_qrcode] Login xiaohongshu failed by qrcode login method ...")
|
||||||
sys.exit()
|
sys.exit()
|
||||||
|
|
||||||
wait_redirect_seconds = 5
|
wait_redirect_seconds = 5
|
||||||
utils.logger.info(f"[XHSLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
utils.logger.info(f"[XiaoHongShuLogin.login_by_qrcode] Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
|
||||||
await asyncio.sleep(wait_redirect_seconds)
|
await asyncio.sleep(wait_redirect_seconds)
|
||||||
|
|
||||||
async def login_by_cookies(self):
|
async def login_by_cookies(self):
|
||||||
"""login xiaohongshu website by cookies"""
|
"""login xiaohongshu website by cookies"""
|
||||||
utils.logger.info("[XHSLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
|
utils.logger.info("[XiaoHongShuLogin.login_by_cookies] Begin login xiaohongshu by cookie ...")
|
||||||
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
|
||||||
if key != "web_session": # only set web_session cookie attr
|
if key != "web_session": # only set web_session cookie attr
|
||||||
continue
|
continue
|
||||||
|
Loading…
Reference in New Issue
Block a user