diff --git a/base/base_crawler.py b/base/base_crawler.py
index b56be0a..d4b28fe 100644
--- a/base/base_crawler.py
+++ b/base/base_crawler.py
@@ -5,7 +5,7 @@ from proxy.proxy_account_pool import AccountPool
 
 class AbstractCrawler(ABC):
     @abstractmethod
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str):
         pass
 
     @abstractmethod
diff --git a/config/base_config.py b/config/base_config.py
index 80d53eb..fe0d783 100644
--- a/config/base_config.py
+++ b/config/base_config.py
@@ -6,7 +6,10 @@ COOKIES = ""
 CRAWLER_TYPE = "search"
 
 # Whether to enable IP proxy
-ENABLE_IP_PROXY = False
+ENABLE_IP_PROXY = True
+
+# Size of the proxy IP pool
+IP_PROXY_POOL_COUNT = 2
 
 # Retry interval
 RETRY_INTERVAL = 60 * 30  # 30 minutes
@@ -49,4 +52,10 @@ DY_SPECIFIED_ID_LIST = [
     "7280854932641664319",
     "7202432992642387233"
     # ........................
-]
+]
+
+
+# Specified list of Kuaishou video IDs to crawl
+KS_SPECIFIED_ID_LIST = [
+
+]
\ No newline at end of file
diff --git a/main.py b/main.py
index edcae7b..d9793c6 100644
--- a/main.py
+++ b/main.py
@@ -38,9 +38,6 @@ async def main():
     parser.add_argument('--type', type=str, help='crawler type (search | detail)',
                         choices=["search", "detail"], default=config.CRAWLER_TYPE)
 
-    # init account pool
-    account_pool = proxy_account_pool.create_account_pool()
-
     # init db
     if config.IS_SAVED_DATABASED:
         await db.init_db()
@@ -50,7 +47,6 @@
     crawler.init_config(
         platform=args.platform,
         login_type=args.lt,
-        account_pool=account_pool,
         crawler_type=args.type
     )
     await crawler.start()
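Note: taken together, the two new config switches replace the account pool at startup — every crawler now builds a pool of `IP_PROXY_POOL_COUNT` validated proxies and draws one entry from it before launching the browser. A minimal standalone sketch of that bootstrap flow, using only the calls that appear in this diff (the script scaffolding itself is illustrative):

```python
# Sketch: the proxy bootstrap each crawler's start() now performs.
import asyncio

import config
from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel


async def bootstrap_proxy():
    if not config.ENABLE_IP_PROXY:
        return None
    # Build a pool of IP_PROXY_POOL_COUNT proxies, validating each one.
    ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
    ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
    return ip_proxy_info


if __name__ == "__main__":
    print(asyncio.run(bootstrap_proxy()))
```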
diff --git a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py
index feca000..149aa33 100644
--- a/media_platform/bilibili/core.py
+++ b/media_platform/bilibili/core.py
@@ -16,7 +16,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 import config
 from base.base_crawler import AbstractCrawler
 from models import bilibili
-from proxy.proxy_account_pool import AccountPool
+from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel
 from tools import utils
 from var import comment_tasks_var, crawler_type_var
@@ -31,27 +31,30 @@ class BilibiliCrawler(AbstractCrawler):
     crawler_type: str
     context_page: Page
     bili_client: BilibiliClient
-    account_pool: AccountPool
     browser_context: BrowserContext
 
     def __init__(self):
         self.index_url = "https://www.bilibili.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str):
         self.platform = platform
         self.login_type = login_type
-        self.account_pool = account_pool
         self.crawler_type = crawler_type
 
     async def start(self):
-        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
+
         async with async_playwright() as playwright:
             # Launch a browser context.
             chromium = playwright.chromium
             self.browser_context = await self.launch_browser(
                 chromium,
-                playwright_proxy,
+                None,
                 self.user_agent,
                 headless=config.HEADLESS
             )
@@ -61,11 +64,11 @@ class BilibiliCrawler(AbstractCrawler):
             await self.context_page.goto(self.index_url)
 
             # Create a client to interact with the bilibili website.
-            self.bili_client = await self.create_bilibili_client(httpx_proxy)
+            self.bili_client = await self.create_bilibili_client(httpx_proxy_format)
             if not await self.bili_client.pong():
                 login_obj = BilibiliLogin(
                     login_type=self.login_type,
-                    login_phone=account_phone,
+                    login_phone="",  # your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
                     cookie_str=config.COOKIES
@@ -134,20 +137,18 @@ class BilibiliCrawler(AbstractCrawler):
         )
         return bilibili_client_obj
 
-    def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
-        """Create proxy info for playwright and httpx"""
-        # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
-        phone, ip_proxy = self.account_pool.get_account()
-        if not config.ENABLE_IP_PROXY:
-            return phone, None, None
-        utils.logger.info("Begin proxy info for playwright and httpx ...")
+    @staticmethod
+    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
+        """Format proxy info for playwright and httpx"""
         playwright_proxy = {
-            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
-            "username": config.IP_PROXY_USER,
-            "password": config.IP_PROXY_PASSWORD,
+            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
+            "username": ip_proxy_info.user,
+            "password": ip_proxy_info.password,
         }
-        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
-        return phone, playwright_proxy, httpx_proxy
+        httpx_proxy = {
+            f"{ip_proxy_info.protocol}{ip_proxy_info.ip}": f"{ip_proxy_info.protocol}{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
+        }
+        return playwright_proxy, httpx_proxy
 
     async def launch_browser(
         self,
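For reference, given a hypothetical pool entry with protocol `"http://"`, ip `"111.122.4.5"`, port `8888`, user `"proxy_user"` and password `"proxy_pass"` (all made-up values), the two dicts returned by `format_proxy_info` above come out as:

```python
# Illustration only: the shapes format_proxy_info returns for the
# made-up proxy record described above.
playwright_proxy = {
    "server": "http://111.122.4.5:8888",
    "username": "proxy_user",
    "password": "proxy_pass",
}
httpx_proxy = {
    # httpx pattern key -> proxy URL with inline credentials
    "http://111.122.4.5": "http://proxy_user:proxy_pass@111.122.4.5:8888",
}
```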
diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py
index c327801..e047d39 100644
--- a/media_platform/douyin/core.py
+++ b/media_platform/douyin/core.py
@@ -9,7 +9,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 import config
 from base.base_crawler import AbstractCrawler
 from models import douyin
-from proxy.proxy_account_pool import AccountPool
+from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel
 from tools import utils
 from var import crawler_type_var
@@ -24,27 +24,30 @@ class DouYinCrawler(AbstractCrawler):
     crawler_type: str
     context_page: Page
     dy_client: DOUYINClient
-    account_pool: AccountPool
     browser_context: BrowserContext
 
     def __init__(self) -> None:
         self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36"  # fixed
         self.index_url = "https://www.douyin.com"
 
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
-        self.account_pool = account_pool
         self.crawler_type = crawler_type
 
     async def start(self) -> None:
-        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
+
         async with async_playwright() as playwright:
             # Launch a browser context.
             chromium = playwright.chromium
             self.browser_context = await self.launch_browser(
                 chromium,
-                playwright_proxy,
+                None,
                 self.user_agent,
                 headless=config.HEADLESS
             )
@@ -53,11 +56,11 @@ class DouYinCrawler(AbstractCrawler):
             self.context_page = await self.browser_context.new_page()
             await self.context_page.goto(self.index_url)
 
-            self.dy_client = await self.create_douyin_client(httpx_proxy)
+            self.dy_client = await self.create_douyin_client(httpx_proxy_format)
             if not await self.dy_client.pong(browser_context=self.browser_context):
                 login_obj = DouYinLogin(
                     login_type=self.login_type,
-                    login_phone=account_phone,
+                    login_phone="",  # your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
                     cookie_str=config.COOKIES
@@ -148,20 +151,18 @@ class DouYinCrawler(AbstractCrawler):
         except DataFetchError as e:
             utils.logger.error(f"aweme_id: {aweme_id} get comments failed, error: {e}")
 
-    def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
-        """Create proxy info for playwright and httpx"""
-        if not config.ENABLE_IP_PROXY:
-            return None, None, None
-
-        # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
-        phone, ip_proxy = self.account_pool.get_account()  # type: ignore
+    @staticmethod
+    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
+        """Format proxy info for playwright and httpx"""
         playwright_proxy = {
-            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
-            "username": config.IP_PROXY_USER,
-            "password": config.IP_PROXY_PASSWORD,
+            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
+            "username": ip_proxy_info.user,
+            "password": ip_proxy_info.password,
         }
-        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
-        return phone, playwright_proxy, httpx_proxy
+        httpx_proxy = {
+            f"{ip_proxy_info.protocol}{ip_proxy_info.ip}": f"{ip_proxy_info.protocol}{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
+        }
+        return playwright_proxy, httpx_proxy
 
     async def create_douyin_client(self, httpx_proxy: Optional[str]) -> DOUYINClient:
         """Create douyin client"""
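The `playwright_proxy` half of the tuple follows Playwright's launch-time proxy settings, even though the crawlers above currently pass `None` to `launch_browser` and only feed the proxy to httpx. A minimal sketch of where such a dict would plug in, using plain `chromium.launch` rather than the project's `launch_browser` wrapper and made-up proxy values:

```python
# Sketch: launching Chromium through an authenticated proxy (made-up values).
import asyncio

from playwright.async_api import async_playwright


async def launch_with_proxy():
    playwright_proxy = {
        "server": "http://111.122.4.5:8888",  # made-up proxy endpoint
        "username": "proxy_user",
        "password": "proxy_pass",
    }
    async with async_playwright() as p:
        browser = await p.chromium.launch(proxy=playwright_proxy, headless=True)
        page = await browser.new_page()
        await page.goto("https://example.com")
        await browser.close()


asyncio.run(launch_with_proxy())
```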
diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py
index 82f2b9c..78b9f6f 100644
--- a/media_platform/kuaishou/core.py
+++ b/media_platform/kuaishou/core.py
@@ -11,7 +11,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 import config
 from base.base_crawler import AbstractCrawler
 from models import kuaishou
-from proxy.proxy_account_pool import AccountPool
+from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel
 from tools import utils
 from var import comment_tasks_var, crawler_type_var
@@ -26,27 +26,30 @@ class KuaishouCrawler(AbstractCrawler):
     crawler_type: str
     context_page: Page
     ks_client: KuaiShouClient
-    account_pool: AccountPool
     browser_context: BrowserContext
 
     def __init__(self):
         self.index_url = "https://www.kuaishou.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str):
+    def init_config(self, platform: str, login_type: str, crawler_type: str):
         self.platform = platform
         self.login_type = login_type
-        self.account_pool = account_pool
         self.crawler_type = crawler_type
 
     async def start(self):
-        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
+
         async with async_playwright() as playwright:
             # Launch a browser context.
             chromium = playwright.chromium
             self.browser_context = await self.launch_browser(
                 chromium,
-                playwright_proxy,
+                None,
                 self.user_agent,
                 headless=config.HEADLESS
             )
@@ -56,11 +59,11 @@ class KuaishouCrawler(AbstractCrawler):
             await self.context_page.goto(f"{self.index_url}?isHome=1")
 
             # Create a client to interact with the kuaishou website.
-            self.ks_client = await self.create_ks_client(httpx_proxy)
+            self.ks_client = await self.create_ks_client(httpx_proxy_format)
             if not await self.ks_client.pong():
                 login_obj = KuaishouLogin(
                     login_type=self.login_type,
-                    login_phone=account_phone,
+                    login_phone="",  # your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
                     cookie_str=config.COOKIES
@@ -179,20 +182,18 @@ class KuaishouCrawler(AbstractCrawler):
         await self.context_page.goto(f"{self.index_url}?isHome=1")
         await self.ks_client.update_cookies(browser_context=self.browser_context)
 
-    def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
-        """Create proxy info for playwright and httpx"""
-        # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
-        phone, ip_proxy = self.account_pool.get_account()
-        if not config.ENABLE_IP_PROXY:
-            return phone, None, None
-        utils.logger.info("Begin proxy info for playwright and httpx ...")
+    @staticmethod
+    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
+        """Format proxy info for playwright and httpx"""
         playwright_proxy = {
-            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
-            "username": config.IP_PROXY_USER,
-            "password": config.IP_PROXY_PASSWORD,
+            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
+            "username": ip_proxy_info.user,
+            "password": ip_proxy_info.password,
         }
-        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
-        return phone, playwright_proxy, httpx_proxy
+        httpx_proxy = {
+            f"{ip_proxy_info.protocol}{ip_proxy_info.ip}": f"{ip_proxy_info.protocol}{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
+        }
+        return playwright_proxy, httpx_proxy
 
     async def create_ks_client(self, httpx_proxy: Optional[str]) -> KuaiShouClient:
         """Create kuaishou client"""
diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py
index 118f44c..c724ae1 100644
--- a/media_platform/xhs/core.py
+++ b/media_platform/xhs/core.py
@@ -10,7 +10,7 @@ from playwright.async_api import (BrowserContext, BrowserType, Page,
                                   async_playwright)
 import config
 from base.base_crawler import AbstractCrawler
 from models import xiaohongshu as xhs_model
-from proxy.proxy_account_pool import AccountPool
+from proxy.proxy_ip_pool import create_ip_pool, IpInfoModel
 from tools import utils
 from var import crawler_type_var
@@ -25,27 +25,30 @@ class XiaoHongShuCrawler(AbstractCrawler):
     crawler_type: str
     context_page: Page
     xhs_client: XHSClient
-    account_pool: AccountPool
     browser_context: BrowserContext
 
     def __init__(self) -> None:
         self.index_url = "https://www.xiaohongshu.com"
         self.user_agent = utils.get_user_agent()
 
-    def init_config(self, platform: str, login_type: str, account_pool: AccountPool, crawler_type: str) -> None:
+    def init_config(self, platform: str, login_type: str, crawler_type: str) -> None:
         self.platform = platform
         self.login_type = login_type
-        self.account_pool = account_pool
-        self.crawler_type =crawler_type
+        self.crawler_type = crawler_type
 
     async def start(self) -> None:
-        account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info()
+        playwright_proxy_format, httpx_proxy_format = None, None
+        if config.ENABLE_IP_PROXY:
+            ip_proxy_pool = await create_ip_pool(config.IP_PROXY_POOL_COUNT, enable_validate_ip=True)
+            ip_proxy_info: IpInfoModel = await ip_proxy_pool.get_proxy()
+            playwright_proxy_format, httpx_proxy_format = self.format_proxy_info(ip_proxy_info)
+
         async with async_playwright() as playwright:
             # Launch a browser context.
             chromium = playwright.chromium
             self.browser_context = await self.launch_browser(
                 chromium,
-                playwright_proxy,
+                None,
                 self.user_agent,
                 headless=config.HEADLESS
             )
@@ -62,11 +65,11 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await self.context_page.goto(self.index_url)
 
             # Create a client to interact with the xiaohongshu website.
-            self.xhs_client = await self.create_xhs_client(httpx_proxy)
+            self.xhs_client = await self.create_xhs_client(httpx_proxy_format)
             if not await self.xhs_client.pong():
                 login_obj = XHSLogin(
                     login_type=self.login_type,
-                    login_phone=account_phone,
+                    login_phone="",  # input your phone number
                     browser_context=self.browser_context,
                     context_page=self.context_page,
                     cookie_str=config.COOKIES
@@ -126,7 +129,6 @@ class XiaoHongShuCrawler(AbstractCrawler):
             await xhs_model.update_xhs_note(note_detail)
         await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
 
-
     async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
         """Get note detail"""
         async with semaphore:
@@ -157,20 +159,18 @@ class XiaoHongShuCrawler(AbstractCrawler):
         for comment in all_comments:
             await xhs_model.update_xhs_note_comment(note_id=note_id, comment_item=comment)
 
-    def create_proxy_info(self) -> Tuple[Optional[str], Optional[Dict], Optional[str]]:
-        """Create proxy info for playwright and httpx"""
-        # phone: 13012345671 ip_proxy: 111.122.xx.xx1:8888
-        phone, ip_proxy = self.account_pool.get_account()
-        if not config.ENABLE_IP_PROXY:
-            return phone, None, None
-        utils.logger.info("Begin proxy info for playwright and httpx ...")
+    @staticmethod
+    def format_proxy_info(ip_proxy_info: IpInfoModel) -> Tuple[Optional[Dict], Optional[Dict]]:
+        """Format proxy info for playwright and httpx"""
         playwright_proxy = {
-            "server": f"{config.IP_PROXY_PROTOCOL}{ip_proxy}",
-            "username": config.IP_PROXY_USER,
-            "password": config.IP_PROXY_PASSWORD,
+            "server": f"{ip_proxy_info.protocol}{ip_proxy_info.ip}:{ip_proxy_info.port}",
+            "username": ip_proxy_info.user,
+            "password": ip_proxy_info.password,
         }
-        httpx_proxy = f"{config.IP_PROXY_PROTOCOL}{config.IP_PROXY_USER}:{config.IP_PROXY_PASSWORD}@{ip_proxy}"
-        return phone, playwright_proxy, httpx_proxy
+        httpx_proxy = {
+            f"{ip_proxy_info.protocol}{ip_proxy_info.ip}": f"{ip_proxy_info.protocol}{ip_proxy_info.user}:{ip_proxy_info.password}@{ip_proxy_info.ip}:{ip_proxy_info.port}"
+        }
+        return playwright_proxy, httpx_proxy
 
     async def create_xhs_client(self, httpx_proxy: Optional[str]) -> XHSClient:
         """Create xhs client"""
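All four `format_proxy_info` copies and the pool change below touch the same handful of `IpInfoModel` fields. The real model lives in `proxy/proxy_ip_provider.py`; the sketch below is inferred from usage in this diff and assumes a pydantic-style model, so treat it as documentation rather than the actual definition:

```python
# Inferred sketch of the IpInfoModel fields this diff relies on
# (not the real definition in proxy/proxy_ip_provider.py).
from pydantic import BaseModel


class IpInfoModelSketch(BaseModel):
    ip: str
    port: int
    user: str        # proxy auth username
    password: str    # proxy auth password
    protocol: str    # e.g. "http://"; concatenated directly before ip
```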
diff --git a/proxy/proxy_ip_pool.py b/proxy/proxy_ip_pool.py
index 4763528..dd41e70 100644
--- a/proxy/proxy_ip_pool.py
+++ b/proxy/proxy_ip_pool.py
@@ -5,7 +5,7 @@
 import json
 import pathlib
 import random
-from typing import Dict, List
+from typing import List
 
 import httpx
 from tenacity import retry, stop_after_attempt, wait_fixed
@@ -29,18 +29,21 @@ class ProxyIpPool:
         """
         self.proxy_list = await IpProxy.get_proxies(self.ip_pool_count)
 
-    @retry(stop=stop_after_attempt(3), wait=wait_fixed(1))
     async def is_valid_proxy(self, proxy: IpInfoModel) -> bool:
         """
         Validate whether the proxy IP is usable
         :param proxy:
         :return:
         """
+        utils.logger.info(f"[ProxyIpPool.is_valid_proxy] testing whether {proxy.ip} is valid")
         try:
             httpx_proxy = f"{proxy.protocol}{proxy.ip}:{proxy.port}"
             proxy_auth = httpx.BasicAuth(proxy.user, proxy.password)
-            async with httpx.AsyncClient(proxies={proxy.protocol: httpx_proxy}, auth=proxy_auth) as client:
+            proxies = {
+                f"{proxy.protocol}{proxy.ip}": httpx_proxy
+            }
+            async with httpx.AsyncClient(proxies=proxies, auth=proxy_auth) as client:
                 response = await client.get(self.valid_ip_url)
                 if response.status_code == 200:
                     return True
diff --git a/test/test_proxy_ip_pool.py b/test/test_proxy_ip_pool.py
index 4c2a52c..5974c90 100644
--- a/test/test_proxy_ip_pool.py
+++ b/test/test_proxy_ip_pool.py
@@ -11,7 +11,7 @@ from proxy.proxy_ip_provider import IpInfoModel
 
 class TestIpPool(IsolatedAsyncioTestCase):
     async def test_ip_pool(self):
         pool = await create_ip_pool(ip_pool_count=3, enable_validate_ip=True)
-        for i in range(3):
+        for i in range(1):
             ip_proxy_info: IpInfoModel = await pool.get_proxy()
             print(ip_proxy_info)
             self.assertIsNotNone(ip_proxy_info.ip, msg="verify that the proxy IP was fetched successfully")
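Finally, the validation path can be exercised by hand. A minimal sketch using only `create_ip_pool`, `get_proxy`, and `is_valid_proxy` from this diff; whether `get_proxy` already re-validates entries is not visible here, so the explicit check is belt-and-braces:

```python
# Sketch: manually re-check one proxy drawn from the pool.
import asyncio

from proxy.proxy_ip_pool import create_ip_pool


async def check_one_proxy():
    pool = await create_ip_pool(1, enable_validate_ip=True)
    proxy = await pool.get_proxy()
    ok = await pool.is_valid_proxy(proxy)
    print(f"{proxy.ip}:{proxy.port} valid={ok}")


if __name__ == "__main__":
    asyncio.run(check_one_proxy())
```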