Add "handby" (preset web_session cookie) login & catch DataFetchError exception

This commit is contained in:
ubuntu 2023-06-17 15:14:58 +08:00
parent 8206f83639
commit 79a0296312
3 changed files with 25 additions and 9 deletions

View File

@ -2,8 +2,9 @@
platform = "xhs" platform = "xhs"
keyword = "健身" keyword = "健身"
login_type = "qrcode" # qrcode or phone login_type = "handby" # qrcode or phone or handby
login_phone = "13812345678" # your login phone login_phone = "13812345678" # your login phone
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
# redis config # redis config
redis_db_host = "redis://127.0.0.1" redis_db_host = "redis://127.0.0.1"

View File

@ -23,7 +23,8 @@ async def main():
parser = argparse.ArgumentParser(description='Media crawler program.') parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform) parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword) parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone)', default=config.login_type) parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type)
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone) parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
args = parser.parse_args() args = parser.parse_args()
@ -31,7 +32,8 @@ async def main():
crawler.init_config( crawler.init_config(
keywords=args.keywords, keywords=args.keywords,
login_phone=args.phone, login_phone=args.phone,
login_type=args.lt login_type=args.lt,
web_session=args.web_session
) )
await crawler.start() await crawler.start()

View File

@ -21,6 +21,7 @@ import config
from .client import XHSClient from .client import XHSClient
from base_crawler import Crawler from base_crawler import Crawler
from models import xhs as xhs_model from models import xhs as xhs_model
from .exception import *
class XiaoHongShuCrawler(Crawler): class XiaoHongShuCrawler(Crawler):
@ -28,6 +29,7 @@ class XiaoHongShuCrawler(Crawler):
self.login_phone = None self.login_phone = None
self.login_type = None self.login_type = None
self.keywords = None self.keywords = None
self.web_session = None
self.cookies: Optional[List[Cookie]] = None self.cookies: Optional[List[Cookie]] = None
self.browser_context: Optional[BrowserContext] = None self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None self.context_page: Optional[Page] = None
@ -37,9 +39,8 @@ class XiaoHongShuCrawler(Crawler):
self.index_url = "https://www.xiaohongshu.com" self.index_url = "https://www.xiaohongshu.com"
def init_config(self, **kwargs): def init_config(self, **kwargs):
self.keywords = kwargs.get("keywords") for key in kwargs.keys():
self.login_type = kwargs.get("login_type") setattr(self, key, kwargs[key])
self.login_phone = kwargs.get("login_phone")
async def update_cookies(self): async def update_cookies(self):
self.cookies = await self.browser_context.cookies() self.cookies = await self.browser_context.cookies()
@ -48,7 +49,7 @@ class XiaoHongShuCrawler(Crawler):
async with async_playwright() as playwright: async with async_playwright() as playwright:
# launch browser and create single browser context # launch browser and create single browser context
chromium = playwright.chromium chromium = playwright.chromium
browser = await chromium.launch(headless=False) browser = await chromium.launch(headless=True)
self.browser_context = await browser.new_context( self.browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080}, viewport={"width": 1920, "height": 1080},
user_agent=self.user_agent, user_agent=self.user_agent,
@ -90,14 +91,23 @@ class XiaoHongShuCrawler(Crawler):
# There are two ways to log in: # There are three ways to log in:
# 1. Semi-automatic: Log in by scanning the QR code. # 1. Semi-automatic: Log in by scanning the QR code.
# 2. Fully automatic: Log in using forwarded text message notifications # 2. Fully automatic: Log in using forwarded text message notifications
# which includes mobile phone number and verification code. # which includes mobile phone number and verification code.
# 3. Manual ("handby"): Log in using a preset web_session cookie.
if self.login_type == "qrcode": if self.login_type == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif self.login_type == "phone":
await self.login_by_mobile() await self.login_by_mobile()
elif self.login_type == "handby":
await self.browser_context.add_cookies([{
'name': 'web_session',
'value': self.web_session,
'domain': ".xiaohongshu.com",
'path': "/"
}])
else: else:
pass pass
async def login_by_mobile(self): async def login_by_mobile(self):
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...") print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
login_container_ele = await self.context_page.wait_for_selector("div.login-container") login_container_ele = await self.context_page.wait_for_selector("div.login-container")
@ -203,7 +213,10 @@ class XiaoHongShuCrawler(Crawler):
for post_item in posts_res.get("items"): for post_item in posts_res.get("items"):
max_note_len -= 1 max_note_len -= 1
note_id = post_item.get("id") note_id = post_item.get("id")
try:
note_detail = await self.xhs_client.get_note_by_id(note_id) note_detail = await self.xhs_client.get_note_by_id(note_id)
except DataFetchError as ex:
continue
await xhs_model.update_xhs_note(note_detail) await xhs_model.update_xhs_note(note_detail)
await asyncio.sleep(0.05) await asyncio.sleep(0.05)
note_list.append(note_id) note_list.append(note_id)