fix: 增加小红书登录两种形态下弹窗的兼容代码

This commit is contained in:
Relakkes 2023-06-22 22:43:26 +08:00
parent 88e8ee302e
commit 1085a2a769
4 changed files with 75 additions and 23 deletions

View File

@ -2,9 +2,12 @@
platform = "xhs" platform = "xhs"
keyword = "健身" keyword = "健身"
login_type = "handby" # qrcode or phone login_type = "cookie" # qrcode or phone or cookie
login_phone = "13812345678" # your login phone login_phone = "" # your login phone
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
# If it's on the Xiaohongshu platform, only the web_session cookie will be kept.
# web_session=040069b2acxxxxxxxxxxxxxxxxxxxx;
cookies = ""
# redis config # redis config
redis_db_host = "redis://127.0.0.1" redis_db_host = "redis://127.0.0.1"

View File

@ -23,9 +23,9 @@ async def main():
parser = argparse.ArgumentParser(description='Media crawler program.') parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform) parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword) parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | cookie)', default=config.login_type)
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone) parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
parser.add_argument('--cookies', type=str, help='cookies to keep log in', default=config.cookies)
args = parser.parse_args() args = parser.parse_args()
crawler = CrawlerFactory().create_crawler(platform=args.platform) crawler = CrawlerFactory().create_crawler(platform=args.platform)
@ -33,7 +33,7 @@ async def main():
keywords=args.keywords, keywords=args.keywords,
login_phone=args.phone, login_phone=args.phone,
login_type=args.lt, login_type=args.lt,
web_session=args.web_session cookie_str=args.cookies
) )
await crawler.start() await crawler.start()

View File

@ -30,7 +30,8 @@ class XiaoHongShuCrawler(Crawler):
self.login_type = None self.login_type = None
self.keywords = None self.keywords = None
self.web_session = None self.web_session = None
self.cookies: Optional[List[Cookie]] = None self.cookies: Optional[List[Cookie]] = None # cookies from browser context
self.cookie_str: Optional[str] = None # cookie string from config or command line
self.browser_context: Optional[BrowserContext] = None self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None self.proxy: Optional[Dict] = None
@ -88,28 +89,51 @@ class XiaoHongShuCrawler(Crawler):
async def login(self): async def login(self):
"""login xiaohongshu website and keep webdriver login state""" """login xiaohongshu website and keep webdriver login state"""
# There are two ways to log in: # There are three ways to log in:
# 1. Semi-automatic: Log in by scanning the QR code. # 1. Semi-automatic: Log in by scanning the QR code.
# 2. Fully automatic: Log in using forwarded text message notifications # 2. Fully automatic: Log in using forwarded text message notifications
# 3. handby automatic: Log in using preset cookie # 3. Semi-automatic: Log in using preset cookie
# which includes mobile phone number and verification code.
if self.login_type == "qrcode": if self.login_type == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif self.login_type == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "handby": elif self.login_type == "cookie":
# cookie str convert to cookie dict
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{ await self.browser_context.add_cookies([{
'name': 'web_session', 'name': key,
'value': self.web_session, 'value': value,
'domain': ".xiaohongshu.com", 'domain': ".xiaohongshu.com",
'path': "/" 'path': "/"
}]) }])
else: else:
pass pass
async def login_by_mobile(self): async def login_by_mobile(self):
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...") print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
await asyncio.sleep(1)
try:
# After entering the main page of Xiaohongshu,
# the login window may not pop up automatically and you need to manually click the login button.
login_button_ele = await self.context_page.wait_for_selector(
selector="xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button",
timeout=5000
)
await login_button_ele.click()
# There are also two types of login dialog boxes for pop-ups.
# One type directly shows the phone number and verification code.
# Another type requires clicking to switch to mobile login.
element = await self.context_page.wait_for_selector(
selector='xpath=//div[@class="login-container"]//div[@class="other-method"]/div[1]',
timeout=5000
)
await element.click()
except:
print("have not found mobile button icon and keep going ...")
await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container") login_container_ele = await self.context_page.wait_for_selector("div.login-container")
# Fill login phone # Fill login phone
input_ele = await login_container_ele.query_selector("label.phone > input") input_ele = await login_container_ele.query_selector("label.phone > input")
@ -158,15 +182,24 @@ class XiaoHongShuCrawler(Crawler):
async def login_by_qrcode(self): async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state""" """login xiaohongshu website and keep webdriver login state"""
print("Start scanning QR code to log in to Xiaohongshu. ...") print("Start scanning QR code to log in to Xiaohongshu. ...")
qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode # find login qrcode
base64_qrcode_img = await utils.find_login_qrcode( base64_qrcode_img = await utils.find_login_qrcode(
self.context_page, self.context_page,
selector="div.login-container > div.left > div.qrcode > img" selector=qrcode_img_selector
) )
if not base64_qrcode_img: if not base64_qrcode_img:
# todo ...if this website does not automatically popup login dialog box, we will manual click login button print("have not found qrcode and try again get it ....")
print("login failed , have not found qrcode please check ....") # if this website does not automatically popup login dialog box, we will manual click login button
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
await login_button_ele.click()
base64_qrcode_img = await utils.find_login_qrcode(
self.context_page,
selector=qrcode_img_selector
)
if not base64_qrcode_img:
print("login failed , program exit ...")
sys.exit() sys.exit()
# get not logged session # get not logged session

View File

@ -61,6 +61,22 @@ def convert_cookies(cookies: Optional[List[Cookie]]) -> Tuple[str, Dict]:
return cookies_str, cookie_dict return cookies_str, cookie_dict
def convert_str_cookie_to_dict(cookie_str: str) -> Dict:
    """Parse a ``Cookie``-header style string into a ``{name: value}`` dict.

    The input is expected to look like ``"k1=v1; k2=v2"``. Pairs are
    separated by ``;`` and each pair is split on its FIRST ``=`` only, so a
    value that itself contains ``=`` (for example base64 padding) is kept
    intact.

    Args:
        cookie_str: Raw cookie string; may be empty or ``None``.

    Returns:
        A dict mapping cookie names to cookie values. Empty when the input
        is empty. Segments that have no ``=`` or an empty name are skipped
        instead of raising. If a name repeats, the last occurrence wins.
    """
    cookie_dict: Dict[str, str] = {}
    if not cookie_str:
        return cookie_dict
    for pair in cookie_str.split(";"):
        # partition() splits on the first "=" only; ``sep`` is "" when the
        # segment contains no "=" at all (blank segment or bare flag).
        name, sep, value = pair.strip().partition("=")
        name = name.strip()
        if not sep or not name:
            continue
        cookie_dict[name] = value.strip()
    return cookie_dict
def get_current_timestamp(): def get_current_timestamp():
return int(time.time() * 1000) return int(time.time() * 1000)