commit
88e8ee302e
@ -2,8 +2,9 @@
|
|||||||
|
|
||||||
platform = "xhs"
|
platform = "xhs"
|
||||||
keyword = "健身"
|
keyword = "健身"
|
||||||
login_type = "qrcode" # qrcode or phone
|
login_type = "handby" # qrcode or phone
|
||||||
login_phone = "13812345678" # your login phone
|
login_phone = "13812345678" # your login phone
|
||||||
|
login_webSession="040069b5f35b1cfef9787551bd364b86f4d839"
|
||||||
|
|
||||||
# redis config
|
# redis config
|
||||||
redis_db_host = "redis://127.0.0.1"
|
redis_db_host = "redis://127.0.0.1"
|
||||||
|
6
main.py
6
main.py
@ -23,7 +23,8 @@ async def main():
|
|||||||
parser = argparse.ArgumentParser(description='Media crawler program.')
|
parser = argparse.ArgumentParser(description='Media crawler program.')
|
||||||
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
|
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.platform)
|
||||||
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
|
parser.add_argument('--keywords', type=str, help='Search note/page keywords...', default=config.keyword)
|
||||||
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone)', default=config.login_type)
|
parser.add_argument('--lt', type=str, help='Login type (qrcode | phone | handby)', default=config.login_type)
|
||||||
|
parser.add_argument('--web_session', type=str, help='cookies to keep log in', default=config.login_webSession)
|
||||||
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
|
parser.add_argument('--phone', type=str, help='Login phone', default=config.login_phone)
|
||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
@ -31,7 +32,8 @@ async def main():
|
|||||||
crawler.init_config(
|
crawler.init_config(
|
||||||
keywords=args.keywords,
|
keywords=args.keywords,
|
||||||
login_phone=args.phone,
|
login_phone=args.phone,
|
||||||
login_type=args.lt
|
login_type=args.lt,
|
||||||
|
web_session=args.web_session
|
||||||
)
|
)
|
||||||
await crawler.start()
|
await crawler.start()
|
||||||
|
|
||||||
|
@ -21,6 +21,7 @@ import config
|
|||||||
from .client import XHSClient
|
from .client import XHSClient
|
||||||
from base_crawler import Crawler
|
from base_crawler import Crawler
|
||||||
from models import xhs as xhs_model
|
from models import xhs as xhs_model
|
||||||
|
from .exception import *
|
||||||
|
|
||||||
|
|
||||||
class XiaoHongShuCrawler(Crawler):
|
class XiaoHongShuCrawler(Crawler):
|
||||||
@ -28,6 +29,7 @@ class XiaoHongShuCrawler(Crawler):
|
|||||||
self.login_phone = None
|
self.login_phone = None
|
||||||
self.login_type = None
|
self.login_type = None
|
||||||
self.keywords = None
|
self.keywords = None
|
||||||
|
self.web_session = None
|
||||||
self.cookies: Optional[List[Cookie]] = None
|
self.cookies: Optional[List[Cookie]] = None
|
||||||
self.browser_context: Optional[BrowserContext] = None
|
self.browser_context: Optional[BrowserContext] = None
|
||||||
self.context_page: Optional[Page] = None
|
self.context_page: Optional[Page] = None
|
||||||
@ -37,9 +39,8 @@ class XiaoHongShuCrawler(Crawler):
|
|||||||
self.index_url = "https://www.xiaohongshu.com"
|
self.index_url = "https://www.xiaohongshu.com"
|
||||||
|
|
||||||
def init_config(self, **kwargs):
|
def init_config(self, **kwargs):
|
||||||
self.keywords = kwargs.get("keywords")
|
for key in kwargs.keys():
|
||||||
self.login_type = kwargs.get("login_type")
|
setattr(self, key, kwargs[key])
|
||||||
self.login_phone = kwargs.get("login_phone")
|
|
||||||
|
|
||||||
async def update_cookies(self):
|
async def update_cookies(self):
|
||||||
self.cookies = await self.browser_context.cookies()
|
self.cookies = await self.browser_context.cookies()
|
||||||
@ -48,7 +49,7 @@ class XiaoHongShuCrawler(Crawler):
|
|||||||
async with async_playwright() as playwright:
|
async with async_playwright() as playwright:
|
||||||
# launch browser and create single browser context
|
# launch browser and create single browser context
|
||||||
chromium = playwright.chromium
|
chromium = playwright.chromium
|
||||||
browser = await chromium.launch(headless=False)
|
browser = await chromium.launch(headless=True)
|
||||||
self.browser_context = await browser.new_context(
|
self.browser_context = await browser.new_context(
|
||||||
viewport={"width": 1920, "height": 1080},
|
viewport={"width": 1920, "height": 1080},
|
||||||
user_agent=self.user_agent,
|
user_agent=self.user_agent,
|
||||||
@ -90,14 +91,23 @@ class XiaoHongShuCrawler(Crawler):
|
|||||||
# There are two ways to log in:
|
# There are two ways to log in:
|
||||||
# 1. Semi-automatic: Log in by scanning the QR code.
|
# 1. Semi-automatic: Log in by scanning the QR code.
|
||||||
# 2. Fully automatic: Log in using forwarded text message notifications
|
# 2. Fully automatic: Log in using forwarded text message notifications
|
||||||
|
# 3. handby automatic: Log in using preset cookie
|
||||||
# which includes mobile phone number and verification code.
|
# which includes mobile phone number and verification code.
|
||||||
if self.login_type == "qrcode":
|
if self.login_type == "qrcode":
|
||||||
await self.login_by_qrcode()
|
await self.login_by_qrcode()
|
||||||
elif self.login_type == "phone":
|
elif self.login_type == "phone":
|
||||||
await self.login_by_mobile()
|
await self.login_by_mobile()
|
||||||
|
elif self.login_type == "handby":
|
||||||
|
await self.browser_context.add_cookies([{
|
||||||
|
'name': 'web_session',
|
||||||
|
'value': self.web_session,
|
||||||
|
'domain': ".xiaohongshu.com",
|
||||||
|
'path': "/"
|
||||||
|
}])
|
||||||
else:
|
else:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
async def login_by_mobile(self):
|
async def login_by_mobile(self):
|
||||||
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
|
print("Start executing mobile phone number + verification code login on Xiaohongshu. ...")
|
||||||
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
login_container_ele = await self.context_page.wait_for_selector("div.login-container")
|
||||||
@ -203,7 +213,10 @@ class XiaoHongShuCrawler(Crawler):
|
|||||||
for post_item in posts_res.get("items"):
|
for post_item in posts_res.get("items"):
|
||||||
max_note_len -= 1
|
max_note_len -= 1
|
||||||
note_id = post_item.get("id")
|
note_id = post_item.get("id")
|
||||||
note_detail = await self.xhs_client.get_note_by_id(note_id)
|
try:
|
||||||
|
note_detail = await self.xhs_client.get_note_by_id(note_id)
|
||||||
|
except DataFetchError as ex:
|
||||||
|
continue
|
||||||
await xhs_model.update_xhs_note(note_detail)
|
await xhs_model.update_xhs_note(note_detail)
|
||||||
await asyncio.sleep(0.05)
|
await asyncio.sleep(0.05)
|
||||||
note_list.append(note_id)
|
note_list.append(note_id)
|
||||||
|
Loading…
Reference in New Issue
Block a user