refactor: 优化抖音Crawler部分代码

fix: 日志初始化错误修复
This commit is contained in:
Relakkes 2023-07-15 21:30:12 +08:00
parent dad8d56ab5
commit 2398a17e21
10 changed files with 186 additions and 152 deletions

3
.gitignore vendored
View File

@ -163,5 +163,4 @@ cython_debug/
*.iml *.iml
.idea .idea
/temp_image/ /temp_image/
/xhs_user_data_dir/ /browser_data/
/dy_user_data_dir/

View File

@ -1,3 +1,5 @@
from typing import Tuple, Optional
import config import config
@ -8,14 +10,14 @@ class PhonePool:
self.phones = [] self.phones = []
self.used_phones = set() self.used_phones = set()
def add_phone(self, phone): def add_phone(self, phone: str) -> bool:
"""add phone to the pool""" """add phone to the pool"""
if phone not in self.phones: if phone not in self.phones:
self.phones.append(phone) self.phones.append(phone)
return True return True
return False return False
def remove_phone(self, phone): def remove_phone(self, phone: str) -> bool:
"""remove phone from the pool""" """remove phone from the pool"""
if phone in self.used_phones: if phone in self.used_phones:
self.phones.remove(phone) self.phones.remove(phone)
@ -23,7 +25,7 @@ class PhonePool:
return True return True
return False return False
def get_phone(self): def get_phone(self) -> Optional[str]:
"""get phone and mark as used""" """get phone and mark as used"""
if self.phones: if self.phones:
left_phone = self.phones.pop(0) left_phone = self.phones.pop(0)
@ -49,7 +51,7 @@ class IPPool:
return True return True
return False return False
def remove_ip(self, ip): def remove_ip(self, ip: str) -> bool:
"""remove ip""" """remove ip"""
if ip in self.used_ips: if ip in self.used_ips:
self.ips.remove(ip) self.ips.remove(ip)
@ -57,7 +59,7 @@ class IPPool:
return True return True
return False return False
def get_ip(self): def get_ip(self) -> Optional[str]:
"""get ip and mark as used""" """get ip and mark as used"""
if self.ips: if self.ips:
left_ips = self.ips.pop(0) left_ips = self.ips.pop(0)
@ -78,19 +80,19 @@ class AccountPool:
self.phone_pool = PhonePool() self.phone_pool = PhonePool()
self.ip_pool = IPPool() self.ip_pool = IPPool()
def add_account(self, phone, ip): def add_account(self, phone: str, ip: str) -> bool:
"""add account to pool with phone and ip""" """add account to pool with phone and ip"""
if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip): if self.phone_pool.add_phone(phone) and self.ip_pool.add_ip(ip):
return True return True
return False return False
def remove_account(self, phone, ip): def remove_account(self, phone: str, ip: str) -> bool:
"""remove account from pool """ """remove account from pool """
if self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip): if self.phone_pool.remove_phone(phone) and self.ip_pool.remove_ip(ip):
return True return True
return False return False
def get_account(self): def get_account(self) -> Tuple[str, str]:
"""get account if no account, reload account pool""" """get account if no account, reload account pool"""
phone = self.phone_pool.get_phone() phone = self.phone_pool.get_phone()
ip = self.ip_pool.get_ip() ip = self.ip_pool.get_ip()

15
main.py
View File

@ -21,7 +21,6 @@ class CrawlerFactory:
async def main(): async def main():
utils.init_loging_config()
# define command line params ... # define command line params ...
parser = argparse.ArgumentParser(description='Media crawler program.') parser = argparse.ArgumentParser(description='Media crawler program.')
parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM) parser.add_argument('--platform', type=str, help='Media platform select (xhs|dy)...', default=config.PLATFORM)
@ -38,20 +37,6 @@ async def main():
) )
await crawler.start() await crawler.start()
"""
# retry when exception ...
while True:
try:
await crawler.start()
except Exception as e:
logging.info(f"crawler start error: {e} ...")
await crawler.close()
# If you encounter an exception
# sleep for a period of time before retrying
# to avoid frequent requests that may result in the account being blocked.
await asyncio.sleep(config.RETRY_INTERVAL)
"""
if __name__ == '__main__': if __name__ == '__main__':
try: try:

View File

@ -6,9 +6,11 @@ import httpx
import execjs import execjs
import urllib.parse import urllib.parse
from playwright.async_api import Page from playwright.async_api import Page
from playwright.async_api import BrowserContext
from .field import * from .field import *
from .exception import * from .exception import *
from tools import utils
class DOUYINClient: class DOUYINClient:
@ -33,7 +35,6 @@ class DOUYINClient:
headers = headers or self.headers headers = headers or self.headers
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")
douyin_js_obj = execjs.compile(open('libs/douyin.js').read()) douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
# douyin_js_obj = execjs.compile(open('libs/X-Bogus.js').read())
common_params = { common_params = {
"device_platform": "webapp", "device_platform": "webapp",
"aid": "6383", "aid": "6383",
@ -82,6 +83,17 @@ class DOUYINClient:
headers = headers or self.headers headers = headers or self.headers
return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers) return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)
@staticmethod
async def ping(browser_context: BrowserContext) -> bool:
_, cookie_dict = utils.convert_cookies(await browser_context.cookies())
# todo send some api to test login status
return cookie_dict.get("LOGIN_STATUS") == "1"
async def update_cookies(self, browser_context: BrowserContext):
cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
self.headers["Cookie"] = cookie_str
self.cookie_dict = cookie_dict
async def search_info_by_keyword( async def search_info_by_keyword(
self, self,
keyword: str, keyword: str,

View File

@ -1,13 +1,14 @@
import logging import os
import asyncio import asyncio
import logging
from asyncio import Task from asyncio import Task
from argparse import Namespace from argparse import Namespace
from typing import Optional, List, Dict, Tuple from typing import Optional, List, Dict, Tuple
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from playwright.async_api import Page from playwright.async_api import BrowserType
from playwright.async_api import Cookie
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
from playwright.async_api import Page
import config import config
from tools import utils from tools import utils
@ -21,12 +22,11 @@ from models import douyin
class DouYinCrawler(AbstractCrawler): class DouYinCrawler(AbstractCrawler):
def __init__(self): def __init__(self):
self.cookies: Optional[List[Cookie]] = None
self.browser_context: Optional[BrowserContext] = None self.browser_context: Optional[BrowserContext] = None
self.context_page: Optional[Page] = None self.context_page: Optional[Page] = None
self.proxy: Optional[Dict] = None
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
self.dy_client: Optional[DOUYINClient] = None self.dy_client: Optional[DOUYINClient] = None
self.index_url = "https://www.douyin.com"
self.command_args: Optional[Namespace] = None self.command_args: Optional[Namespace] = None
self.account_pool: Optional[AccountPool] = None self.account_pool: Optional[AccountPool] = None
@ -34,51 +34,24 @@ class DouYinCrawler(AbstractCrawler):
for key, value in kwargs.items(): for key, value in kwargs.items():
setattr(self, key, value) setattr(self, key, value)
def create_proxy_info(self) -> Tuple[str, Dict, str]:
"""Create proxy info for playwright and httpx"""
# phone: 13012345671
# ip_proxy: 111.122.xx.xx1:8888
# 手机号和IP代理都是从账号池中获取的并且它们是固定绑定的
phone, ip_proxy = self.account_pool.get_account()
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
"password": config.IP_PROXY_PASSWORD,
}
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
async def start(self): async def start(self):
# phone: 1340xxxx, ip_proxy: 47.xxx.xxx.xxx:8888 account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
account_phone, ip_proxy = self.account_pool.get_account()
# 抖音平台如果开启代理登录的话,会被风控,所以这里不开启代理
playwright_proxy = None
# playwright_proxy = {
# "server": f"{config.ip_proxy_protocol}{ip_proxy}",
# "username": config.ip_proxy_user,
# "password": config.ip_proxy_password,
# }
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
if not config.ENABLE_IP_PROXY:
playwright_proxy = None
httpx_proxy = None
async with async_playwright() as playwright: async with async_playwright() as playwright:
# Launch a browser context.
chromium = playwright.chromium chromium = playwright.chromium
browser = await chromium.launch(headless=config.HEADLESS, proxy=playwright_proxy) self.browser_context = await self.launch_browser(
self.browser_context = await browser.new_context( chromium,
viewport={"width": 1800, "height": 900}, playwright_proxy,
user_agent=self.user_agent, self.user_agent,
headless=config.HEADLESS
) )
# execute JS to bypass anti automation/crawler detection # stealth.min.js is a js script to prevent the website from detecting the crawler.
await self.browser_context.add_init_script(path="libs/stealth.min.js") await self.browser_context.add_init_script(path="libs/stealth.min.js")
self.context_page = await self.browser_context.new_page() self.context_page = await self.browser_context.new_page()
await self.context_page.goto("https://www.douyin.com", wait_until="domcontentloaded") await self.context_page.goto(self.index_url)
await asyncio.sleep(3)
# begin login self.dy_client = await self.create_douyin_client(httpx_proxy)
if not await self.dy_client.ping(browser_context=self.browser_context):
login_obj = DouYinLogin( login_obj = DouYinLogin(
login_type=self.command_args.lt, login_type=self.command_args.lt,
login_phone=account_phone, login_phone=account_phone,
@ -87,41 +60,19 @@ class DouYinCrawler(AbstractCrawler):
cookie_str=config.COOKIES cookie_str=config.COOKIES
) )
await login_obj.begin() await login_obj.begin()
await self.dy_client.update_cookies(browser_context=self.browser_context)
# update cookies
await self.update_cookies()
# init request client
cookie_str, cookie_dict = utils.convert_cookies(self.cookies)
self.dy_client = DOUYINClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8"
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
# search_posts # search_posts
await self.search_posts() await self.search_posts()
# block main crawler coroutine utils.logger.info("Douyin Crawler finished ...")
await asyncio.Event().wait()
async def update_cookies(self):
self.cookies = await self.browser_context.cookies()
async def search_posts(self): async def search_posts(self):
logging.info("Begin search douyin keywords") utils.logger.info("Begin search douyin keywords")
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
logging.info(f"Current keyword: {keyword}") utils.logger.info(f"Current keyword: {keyword}")
aweme_list: List[str] = [] aweme_list: List[str] = []
max_note_len = 20 max_note_len = config.MAX_PAGE_NUM
page = 0 page = 0
while max_note_len > 0: while max_note_len > 0:
try: try:
@ -139,8 +90,8 @@ class DouYinCrawler(AbstractCrawler):
continue continue
aweme_list.append(aweme_info.get("aweme_id")) aweme_list.append(aweme_info.get("aweme_id"))
await douyin.update_douyin_aweme(aweme_item=aweme_info) await douyin.update_douyin_aweme(aweme_item=aweme_info)
print(f"keyword:{keyword}, aweme_list:{aweme_list}") utils.logger.info(f"keyword:{keyword}, aweme_list:{aweme_list}")
await self.batch_get_note_comments(aweme_list) # await self.batch_get_note_comments(aweme_list)
async def batch_get_note_comments(self, aweme_list: List[str]): async def batch_get_note_comments(self, aweme_list: List[str]):
task_list: List[Task] = [] task_list: List[Task] = []
@ -155,6 +106,71 @@ class DouYinCrawler(AbstractCrawler):
aweme_id=aweme_id, aweme_id=aweme_id,
callback=douyin.batch_update_dy_aweme_comments callback=douyin.batch_update_dy_aweme_comments
) )
print(f"aweme_id: {aweme_id} comments have all been obtained completed ...") utils.logger.info(f"aweme_id: {aweme_id} comments have all been obtained completed ...")
except DataFetchError as e: except DataFetchError as e:
logging.error(f"aweme_id: {aweme_id} get comments failed, error: {e}") logging.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
"""Create proxy info for playwright and httpx"""
if not config.ENABLE_IP_PROXY:
return None, None, None
# phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
phone, ip_proxy = self.account_pool.get_account()
playwright_proxy = {
"server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
"username": config.IP_PROXY_USER,
"password": config.IP_PROXY_PASSWORD,
}
httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
return phone, playwright_proxy, httpx_proxy
async def create_douyin_client(self, httpx_proxy: str) -> DOUYINClient:
"""Create douyin client"""
cookie_str, cookie_dict = utils.convert_cookies(await self.browser_context.cookies())
douyin_client = DOUYINClient(
proxies=httpx_proxy,
headers={
"User-Agent": self.user_agent,
"Cookie": cookie_str,
"Host": "www.douyin.com",
"Origin": "https://www.douyin.com/",
"Referer": "https://www.douyin.com/",
"Content-Type": "application/json;charset=UTF-8"
},
playwright_page=self.context_page,
cookie_dict=cookie_dict,
)
return douyin_client
async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True
) -> BrowserContext:
"""Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE:
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform)
browser_context = await chromium.launch_persistent_context(
user_data_dir=user_data_dir,
accept_downloads=True,
headless=headless,
proxy=playwright_proxy,
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
else:
browser = await chromium.launch(headless=headless, proxy=playwright_proxy)
browser_context = await browser.new_context(
viewport={"width": 1920, "height": 1080},
user_agent=user_agent
)
return browser_context
async def close(self):
"""Close browser context"""
await self.browser_context.close()
utils.logger.info("Browser context closed ...")

View File

@ -8,13 +8,14 @@ from tenacity import (
retry, retry,
stop_after_attempt, stop_after_attempt,
wait_fixed, wait_fixed,
retry_if_result retry_if_result,
RetryError
) )
from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError from playwright.async_api import Page, TimeoutError as PlaywrightTimeoutError
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
import config import config
from tools import utils, easing from tools import utils
from base.base_crawler import AbstractLogin from base.base_crawler import AbstractLogin
@ -54,21 +55,22 @@ class DouYinLogin(AbstractLogin):
raise ValueError("Invalid Login Type Currently only supported qrcode or phone ...") raise ValueError("Invalid Login Type Currently only supported qrcode or phone ...")
# 如果页面重定向到滑动验证码页面,需要再次滑动滑块 # 如果页面重定向到滑动验证码页面,需要再次滑动滑块
await asyncio.sleep(3) await asyncio.sleep(6)
current_page_title = await self.context_page.title() current_page_title = await self.context_page.title()
if "验证码中间页" in current_page_title: if "验证码中间页" in current_page_title:
await self.check_page_display_slider(move_step=3, slider_level="hard") await self.check_page_display_slider(move_step=3, slider_level="hard")
# check login state # check login state
logging.info(f"login finished then check login state ...") utils.logger.info(f"login finished then check login state ...")
login_flag: bool = await self.check_login_state() try:
if not login_flag: await self.check_login_state()
logging.info("login failed please confirm ...") except RetryError:
utils.logger.info("login failed please confirm ...")
sys.exit() sys.exit()
# wait for redirect # wait for redirect
wait_redirect_seconds = 5 wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds) await asyncio.sleep(wait_redirect_seconds)
@retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False)) @retry(stop=stop_after_attempt(20), wait=wait_fixed(1), retry=retry_if_result(lambda value: value is False))
@ -88,20 +90,20 @@ class DouYinLogin(AbstractLogin):
await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10) await self.context_page.wait_for_selector(dialog_selector, timeout=1000 * 10)
except Exception as e: except Exception as e:
logging.error(f"login dialog box does not pop up automatically, error: {e}") logging.error(f"login dialog box does not pop up automatically, error: {e}")
logging.info("login dialog box does not pop up automatically, we will manually click the login button") utils.logger.info("login dialog box does not pop up automatically, we will manually click the login button")
login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']") login_button_ele = self.context_page.locator("xpath=//p[text() = '登录']")
await login_button_ele.click() await login_button_ele.click()
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
async def login_by_qrcode(self): async def login_by_qrcode(self):
logging.info("Begin login douyin by qrcode...") utils.logger.info("Begin login douyin by qrcode...")
qrcode_img_selector = "xpath=//article[@class='web-login']//img" qrcode_img_selector = "xpath=//article[@class='web-login']//img"
base64_qrcode_img = await utils.find_login_qrcode( base64_qrcode_img = await utils.find_login_qrcode(
self.context_page, self.context_page,
selector=qrcode_img_selector selector=qrcode_img_selector
) )
if not base64_qrcode_img: if not base64_qrcode_img:
logging.info("login qrcode not found please confirm ...") utils.logger.info("login qrcode not found please confirm ...")
sys.exit() sys.exit()
# show login qrcode # show login qrcode
@ -112,7 +114,7 @@ class DouYinLogin(AbstractLogin):
await asyncio.sleep(2) await asyncio.sleep(2)
async def login_by_mobile(self): async def login_by_mobile(self):
logging.info("Begin login douyin by mobile ...") utils.logger.info("Begin login douyin by mobile ...")
mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']") mobile_tap_ele = self.context_page.locator("xpath=//li[text() = '验证码登录']")
await mobile_tap_ele.click() await mobile_tap_ele.click()
await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']") await self.context_page.wait_for_selector("xpath=//article[@class='web-login-mobile-code']")
@ -128,7 +130,7 @@ class DouYinLogin(AbstractLogin):
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True) redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
while max_get_sms_code_time > 0: while max_get_sms_code_time > 0:
logging.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...") utils.logger.info(f"get douyin sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1) await asyncio.sleep(1)
sms_code_key = f"dy_{self.login_phone}" sms_code_key = f"dy_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key) sms_code_value = await redis_obj.get(sms_code_key)
@ -170,20 +172,20 @@ class DouYinLogin(AbstractLogin):
# 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮 # 如果滑块滑动慢了,或者验证失败了,会提示操作过慢,这里点一下刷新按钮
page_content = await self.context_page.content() page_content = await self.context_page.content()
if "操作过慢" in page_content or "提示重新操作" in page_content: if "操作过慢" in page_content or "提示重新操作" in page_content:
logging.info("slider verify failed, retry ...") utils.logger.info("slider verify failed, retry ...")
await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]") await self.context_page.click(selector="//a[contains(@class, 'secsdk_captcha_refresh')]")
continue continue
# 滑动成功后,等待滑块消失 # 滑动成功后,等待滑块消失
await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000) await self.context_page.wait_for_selector(selector=back_selector, state="hidden", timeout=1000)
# 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码 # 如果滑块消失了,说明验证成功了,跳出循环,如果没有消失,说明验证失败了,上面这一行代码会抛出异常被捕获后继续循环滑动验证码
logging.info("slider verify success ...") utils.logger.info("slider verify success ...")
slider_verify_success = True slider_verify_success = True
except Exception as e: except Exception as e:
logging.error(f"slider verify failed, error: {e}") logging.error(f"slider verify failed, error: {e}")
await asyncio.sleep(1) await asyncio.sleep(1)
max_slider_try_times -= 1 max_slider_try_times -= 1
logging.info(f"remaining slider try times: {max_slider_try_times}") utils.logger.info(f"remaining slider try times: {max_slider_try_times}")
continue continue
async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"): async def move_slider(self, back_selector: str, gap_selector: str, move_step: int = 10, slider_level="easy"):
@ -240,7 +242,7 @@ class DouYinLogin(AbstractLogin):
await self.context_page.mouse.up() await self.context_page.mouse.up()
async def login_by_cookies(self): async def login_by_cookies(self):
logging.info("Begin login douyin by cookie ...") utils.logger.info("Begin login douyin by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{ await self.browser_context.add_cookies([{
'name': key, 'name': key,

View File

@ -82,7 +82,7 @@ class XHSClient:
async def ping(self) -> bool: async def ping(self) -> bool:
"""get a note to check if login state is ok""" """get a note to check if login state is ok"""
logging.info("begin to ping xhs...") utils.logger.info("begin to ping xhs...")
note_id = "5e5cb38a000000000100185e" note_id = "5e5cb38a000000000100185e"
try: try:
note_card: Dict = await self.get_note_by_id(note_id) note_card: Dict = await self.get_note_by_id(note_id)

View File

@ -1,3 +1,4 @@
import os
import random import random
import asyncio import asyncio
import logging import logging
@ -8,6 +9,7 @@ from argparse import Namespace
from playwright.async_api import Page from playwright.async_api import Page
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
from playwright.async_api import BrowserType
import config import config
from tools import utils from tools import utils
@ -31,8 +33,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
self.account_pool: Optional[AccountPool] = None self.account_pool: Optional[AccountPool] = None
def init_config(self, **kwargs): def init_config(self, **kwargs):
for key in kwargs.keys(): for key, value in kwargs.items():
setattr(self, key, kwargs[key]) setattr(self, key, value)
async def start(self): async def start(self):
account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info() account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
@ -66,13 +68,13 @@ class XiaoHongShuCrawler(AbstractCrawler):
# Search for notes and retrieve their comment information. # Search for notes and retrieve their comment information.
await self.search_posts() await self.search_posts()
logging.info("Xhs Crawler finished ...") utils.logger.info("Xhs Crawler finished ...")
async def search_posts(self): async def search_posts(self):
"""Search for notes and retrieve their comment information.""" """Search for notes and retrieve their comment information."""
logging.info("Begin search xiaohongshu keywords") utils.logger.info("Begin search xiaohongshu keywords")
for keyword in config.KEYWORDS.split(","): for keyword in config.KEYWORDS.split(","):
logging.info(f"Current keyword: {keyword}") utils.logger.info(f"Current keyword: {keyword}")
note_list: List[str] = [] note_list: List[str] = []
max_note_len = config.MAX_PAGE_NUM max_note_len = config.MAX_PAGE_NUM
page = 1 page = 1
@ -88,13 +90,13 @@ class XiaoHongShuCrawler(AbstractCrawler):
try: try:
note_detail = await self.xhs_client.get_note_by_id(note_id) note_detail = await self.xhs_client.get_note_by_id(note_id)
except DataFetchError as ex: except DataFetchError as ex:
logging.error(f"Get note detail error: {ex}") utils.logger.error(f"Get note detail error: {ex}")
continue continue
await xhs_model.update_xhs_note(note_detail) await xhs_model.update_xhs_note(note_detail)
await asyncio.sleep(0.05) await asyncio.sleep(0.05)
note_list.append(note_id) note_list.append(note_id)
logging.info(f"keyword:{keyword}, note_list:{note_list}") utils.logger.info(f"keyword:{keyword}, note_list:{note_list}")
# await self.batch_get_note_comments(note_list) await self.batch_get_note_comments(note_list)
async def batch_get_note_comments(self, note_list: List[str]): async def batch_get_note_comments(self, note_list: List[str]):
"""Batch get note comments""" """Batch get note comments"""
@ -106,7 +108,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def get_comments(self, note_id: str): async def get_comments(self, note_id: str):
"""Get note comments""" """Get note comments"""
logging.info(f"Begin get note id comments {note_id}") utils.logger.info(f"Begin get note id comments {note_id}")
all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random()) all_comments = await self.xhs_client.get_note_all_comments(note_id=note_id, crawl_interval=random.random())
for comment in all_comments: for comment in all_comments:
await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment) await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
@ -143,12 +145,21 @@ class XiaoHongShuCrawler(AbstractCrawler):
) )
return xhs_client_obj return xhs_client_obj
async def launch_browser(self, chromium, playwright_proxy, user_agent, headless=True) -> BrowserContext: async def launch_browser(
self,
chromium: BrowserType,
playwright_proxy: Optional[Dict],
user_agent: Optional[str],
headless: bool = True
) -> BrowserContext:
utils.logger.info("Begin create browser context ...")
"""Launch browser and create browser context""" """Launch browser and create browser context"""
if config.SAVE_LOGIN_STATE: if config.SAVE_LOGIN_STATE:
# feat issue #14 # feat issue #14
# we will save login state to avoid login every time
user_data_dir = os.path.join(os.getcwd(), "browser_data", config.USER_DATA_DIR % self.command_args.platform)
browser_context = await chromium.launch_persistent_context( browser_context = await chromium.launch_persistent_context(
user_data_dir=config.USER_DATA_DIR % self.command_args.platform, user_data_dir=user_data_dir,
accept_downloads=True, accept_downloads=True,
headless=headless, headless=headless,
proxy=playwright_proxy, proxy=playwright_proxy,
@ -167,4 +178,4 @@ class XiaoHongShuCrawler(AbstractCrawler):
async def close(self): async def close(self):
"""Close browser context""" """Close browser context"""
await self.browser_context.close() await self.browser_context.close()
logging.info("Browser context closed ...") utils.logger.info("Browser context closed ...")

View File

@ -50,7 +50,7 @@ class XHSLogin(AbstractLogin):
async def begin(self): async def begin(self):
"""Start login xiaohongshu""" """Start login xiaohongshu"""
logging.info("Begin login xiaohongshu ...") utils.logger.info("Begin login xiaohongshu ...")
if self.login_type == "qrcode": if self.login_type == "qrcode":
await self.login_by_qrcode() await self.login_by_qrcode()
elif self.login_type == "phone": elif self.login_type == "phone":
@ -62,7 +62,7 @@ class XHSLogin(AbstractLogin):
async def login_by_mobile(self): async def login_by_mobile(self):
"""Login xiaohongshu by mobile""" """Login xiaohongshu by mobile"""
logging.info("Begin login xiaohongshu by mobile ...") utils.logger.info("Begin login xiaohongshu by mobile ...")
await asyncio.sleep(1) await asyncio.sleep(1)
try: try:
# 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮 # 小红书进入首页后,有可能不会自动弹出登录框,需要手动点击登录按钮
@ -79,21 +79,24 @@ class XHSLogin(AbstractLogin):
) )
await element.click() await element.click()
except Exception as e: except Exception as e:
logging.info("have not found mobile button icon and keep going ...") utils.logger.info("have not found mobile button icon and keep going ...")
await asyncio.sleep(1) await asyncio.sleep(1)
login_container_ele = await self.context_page.wait_for_selector("div.login-container") login_container_ele = await self.context_page.wait_for_selector("div.login-container")
input_ele = await login_container_ele.query_selector("label.phone > input") input_ele = await login_container_ele.query_selector("label.phone > input")
await input_ele.fill(self.login_phone) await input_ele.fill(self.login_phone)
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
send_btn_ele = await login_container_ele.query_selector("label.auth-code > span") send_btn_ele = await login_container_ele.query_selector("label.auth-code > span")
await send_btn_ele.click() # 点击发送验证码 await send_btn_ele.click() # 点击发送验证码
sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input") sms_code_input_ele = await login_container_ele.query_selector("label.auth-code > input")
submit_btn_ele = await login_container_ele.query_selector("div.input-container > button") submit_btn_ele = await login_container_ele.query_selector("div.input-container > button")
redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True) redis_obj = aioredis.from_url(url=config.REDIS_DB_HOST, password=config.REDIS_DB_PWD, decode_responses=True)
max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟 max_get_sms_code_time = 60 * 2 # 最长获取验证码的时间为2分钟
no_logged_in_session = "" no_logged_in_session = ""
while max_get_sms_code_time > 0: while max_get_sms_code_time > 0:
logging.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...") utils.logger.info(f"get sms code from redis remaining time {max_get_sms_code_time}s ...")
await asyncio.sleep(1) await asyncio.sleep(1)
sms_code_key = f"xhs_{self.login_phone}" sms_code_key = f"xhs_{self.login_phone}"
sms_code_value = await redis_obj.get(sms_code_key) sms_code_value = await redis_obj.get(sms_code_key)
@ -119,17 +122,16 @@ class XHSLogin(AbstractLogin):
try: try:
await self.check_login_state(no_logged_in_session) await self.check_login_state(no_logged_in_session)
except RetryError: except RetryError:
logging.info("Login xiaohongshu failed by mobile login method ...") utils.logger.info("Login xiaohongshu failed by mobile login method ...")
sys.exit() sys.exit()
wait_redirect_seconds = 5 wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds) await asyncio.sleep(wait_redirect_seconds)
async def login_by_qrcode(self): async def login_by_qrcode(self):
"""login xiaohongshu website and keep webdriver login state""" """login xiaohongshu website and keep webdriver login state"""
logging.info("Begin login xiaohongshu by qrcode ...") utils.logger.info("Begin login xiaohongshu by qrcode ...")
await asyncio.sleep(10)
# login_selector = "div.login-container > div.left > div.qrcode > img" # login_selector = "div.login-container > div.left > div.qrcode > img"
qrcode_img_selector = "xpath=//img[@class='qrcode-img']" qrcode_img_selector = "xpath=//img[@class='qrcode-img']"
# find login qrcode # find login qrcode
@ -138,7 +140,7 @@ class XHSLogin(AbstractLogin):
selector=qrcode_img_selector selector=qrcode_img_selector
) )
if not base64_qrcode_img: if not base64_qrcode_img:
logging.info("login failed , have not found qrcode please check ....") utils.logger.info("login failed , have not found qrcode please check ....")
# if this website does not automatically popup login dialog box, we will manual click login button # if this website does not automatically popup login dialog box, we will manual click login button
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button") login_button_ele = self.context_page.locator("xpath=//*[@id='app']/div[1]/div[2]/div[1]/ul/div[1]/button")
@ -162,20 +164,20 @@ class XHSLogin(AbstractLogin):
partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img) partial_show_qrcode = functools.partial(utils.show_qrcode, base64_qrcode_img)
asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode) asyncio.get_running_loop().run_in_executor(executor=None, func=partial_show_qrcode)
logging.info(f"waiting for scan code login, remaining time is 20s") utils.logger.info(f"waiting for scan code login, remaining time is 20s")
try: try:
await self.check_login_state(no_logged_in_session) await self.check_login_state(no_logged_in_session)
except RetryError: except RetryError:
logging.info("Login xiaohongshu failed by qrcode login method ...") utils.logger.info("Login xiaohongshu failed by qrcode login method ...")
sys.exit() sys.exit()
wait_redirect_seconds = 5 wait_redirect_seconds = 5
logging.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...") utils.logger.info(f"Login successful then wait for {wait_redirect_seconds} seconds redirect ...")
await asyncio.sleep(wait_redirect_seconds) await asyncio.sleep(wait_redirect_seconds)
async def login_by_cookies(self): async def login_by_cookies(self):
"""login xiaohongshu website by cookies""" """login xiaohongshu website by cookies"""
logging.info("Begin login xiaohongshu by cookie ...") utils.logger.info("Begin login xiaohongshu by cookie ...")
for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items(): for key, value in utils.convert_str_cookie_to_dict(self.cookie_str).items():
await self.browser_context.add_cookies([{ await self.browser_context.add_cookies([{
'name': key, 'name': key,

View File

@ -108,7 +108,12 @@ def init_loging_config():
format="%(asctime)s %(name)s %(levelname)s %(message)s ", format="%(asctime)s %(name)s %(levelname)s %(message)s ",
datefmt='%Y-%m-%d %H:%M:%S' datefmt='%Y-%m-%d %H:%M:%S'
) )
logging.Logger("Media Crawler") _logger = logging.getLogger("MediaCrawler")
_logger.setLevel(level)
return _logger
logger = init_loging_config()
class Slide: class Slide: