diff --git a/.gitignore b/.gitignore index 8402108..4b088ab 100644 --- a/.gitignore +++ b/.gitignore @@ -165,4 +165,6 @@ cython_debug/ /temp_image/ /browser_data/ /data/ -/cache \ No newline at end of file +/cache + +.DS_Store \ No newline at end of file diff --git a/base/base_crawler.py b/base/base_crawler.py index 2a5b69f..6817c69 100644 --- a/base/base_crawler.py +++ b/base/base_crawler.py @@ -6,7 +6,7 @@ from playwright.async_api import BrowserContext, BrowserType class AbstractCrawler(ABC): @abstractmethod - def init_config(self, platform: str, login_type: str, crawler_type: str): + def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str): pass @abstractmethod diff --git a/config/base_config.py b/config/base_config.py index 26761f8..309654d 100644 --- a/config/base_config.py +++ b/config/base_config.py @@ -27,6 +27,9 @@ SAVE_DATA_OPTION = "json" # csv or db or json # 用户浏览器缓存的浏览器文件配置 USER_DATA_DIR = "%s_user_data_dir" # %s will be replaced by platform name +# Page number to start crawling from; defaults to the first page +START_PAGE = 1 + # 爬取视频/帖子的数量控制 CRAWLER_MAX_NOTES_COUNT = 20 diff --git a/main.py b/main.py index 7c5902a..94e4d20 100644 --- a/main.py +++ b/main.py @@ -38,7 +38,11 @@ async def main(): choices=["qrcode", "phone", "cookie"], default=config.LOGIN_TYPE) parser.add_argument('--type', type=str, help='crawler type (search | detail | creator)', choices=["search", "detail", "creator"], default=config.CRAWLER_TYPE) - + parser.add_argument('--start', type=int, help='number of the page to start crawling from', + default=config.START_PAGE) + parser.add_argument('--keyword', type=str, help='search keywords (comma-separated)', + default=config.KEYWORDS) + # init db if config.SAVE_DATA_OPTION == "db": await db.init_db() @@ -48,7 +52,9 @@ async def main(): crawler.init_config( platform=args.platform, login_type=args.lt, - crawler_type=args.type + crawler_type=args.type, + start_page=args.start, + keyword=args.keyword ) await crawler.start() diff --git 
a/media_platform/bilibili/core.py b/media_platform/bilibili/core.py index 758ae0f..4a30e5f 100644 --- a/media_platform/bilibili/core.py +++ b/media_platform/bilibili/core.py @@ -37,10 +37,12 @@ class BilibiliCrawler(AbstractCrawler): self.index_url = "https://www.bilibili.com" self.user_agent = utils.get_user_agent() - def init_config(self, platform: str, login_type: str, crawler_type: str): + def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str): self.platform = platform self.login_type = login_type self.crawler_type = crawler_type + self.start_page = start_page + self.keyword = keyword async def start(self): playwright_proxy_format, httpx_proxy_format = None, None @@ -96,10 +98,16 @@ class BilibiliCrawler(AbstractCrawler): bili_limit_count =20 # bilibili limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < bili_limit_count: config.CRAWLER_MAX_NOTES_COUNT = bili_limit_count - for keyword in config.KEYWORDS.split(","): + start_page = self.start_page # start page number + for keyword in self.keyword.split(","): utils.logger.info(f"[BilibiliCrawler.search] Current search keyword: {keyword}") page = 1 - while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while (page - start_page + 1) * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[BilibiliCrawler.search] Skip page: {page}") + page += 1 + continue + video_id_list: List[str] = [] videos_res = await self.bili_client.search_video_by_keyword( keyword=keyword, diff --git a/media_platform/douyin/core.py b/media_platform/douyin/core.py index 50c2a18..fb7936d 100644 --- a/media_platform/douyin/core.py +++ b/media_platform/douyin/core.py @@ -32,10 +32,12 @@ class DouYinCrawler(AbstractCrawler): self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed self.index_url = "https://www.douyin.com" - def init_config(self, platform: 
str, login_type: str, crawler_type: str) -> None: + def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None: self.platform = platform self.login_type = login_type self.crawler_type = crawler_type + self.start_page = start_page + self.keyword = keyword async def start(self) -> None: playwright_proxy_format, httpx_proxy_format = None, None @@ -84,11 +86,16 @@ class DouYinCrawler(AbstractCrawler): dy_limit_count = 10 # douyin limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < dy_limit_count: config.CRAWLER_MAX_NOTES_COUNT = dy_limit_count - for keyword in config.KEYWORDS.split(","): + start_page = self.start_page - 1 # douyin pages are 0-based (offset = page * limit); convert the 1-based start page so page 0 is not skipped by default + for keyword in self.keyword.split(","): utils.logger.info(f"[DouYinCrawler.search] Current keyword: {keyword}") aweme_list: List[str] = [] page = 0 - while (page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while (page - start_page + 1) * dy_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[DouYinCrawler.search] Skip {page}") + page += 1 + continue try: posts_res = await self.dy_client.search_info_by_keyword(keyword=keyword, offset=page * dy_limit_count, diff --git a/media_platform/kuaishou/core.py b/media_platform/kuaishou/core.py index 4eb7796..df39374 100644 --- a/media_platform/kuaishou/core.py +++ b/media_platform/kuaishou/core.py @@ -32,10 +32,12 @@ class KuaishouCrawler(AbstractCrawler): self.index_url = "https://www.kuaishou.com" self.user_agent = utils.get_user_agent() - def init_config(self, platform: str, login_type: str, crawler_type: str): + def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str): self.platform = platform self.login_type = login_type self.crawler_type = crawler_type + self.start_page = start_page + self.keyword = keyword async def start(self): playwright_proxy_format, httpx_proxy_format = None, None @@ -88,10 +90,16 @@ class 
KuaishouCrawler(AbstractCrawler): ks_limit_count = 20 # kuaishou limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < ks_limit_count: config.CRAWLER_MAX_NOTES_COUNT = ks_limit_count - for keyword in config.KEYWORDS.split(","): + start_page = self.start_page + for keyword in self.keyword.split(","): utils.logger.info(f"[KuaishouCrawler.search] Current search keyword: {keyword}") page = 1 - while page * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while (page - start_page + 1) * ks_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[KuaishouCrawler.search] Skip page: {page}") + page += 1 + continue + video_id_list: List[str] = [] videos_res = await self.ks_client.search_info_by_keyword( keyword=keyword, diff --git a/media_platform/weibo/core.py b/media_platform/weibo/core.py index 4395c8b..1f12ec7 100644 --- a/media_platform/weibo/core.py +++ b/media_platform/weibo/core.py @@ -40,10 +40,12 @@ class WeiboCrawler(AbstractCrawler): self.user_agent = utils.get_user_agent() self.mobile_user_agent = utils.get_mobile_user_agent() - def init_config(self, platform: str, login_type: str, crawler_type: str): + def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str): self.platform = platform self.login_type = login_type self.crawler_type = crawler_type + self.start_page = start_page + self.keyword = keyword async def start(self): playwright_proxy_format, httpx_proxy_format = None, None @@ -106,10 +108,16 @@ class WeiboCrawler(AbstractCrawler): weibo_limit_count = 10 # weibo limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < weibo_limit_count: config.CRAWLER_MAX_NOTES_COUNT = weibo_limit_count - for keyword in config.KEYWORDS.split(","): + start_page = self.start_page + for keyword in self.keyword.split(","): utils.logger.info(f"[WeiboCrawler.search] Current search keyword: {keyword}") page = 1 - while page * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + while 
(page - start_page + 1) * weibo_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[WeiboCrawler.search] Skip page: {page}") + page += 1 + continue + search_res = await self.wb_client.get_note_by_keyword( keyword=keyword, page=page, diff --git a/media_platform/xhs/core.py b/media_platform/xhs/core.py index 5b67d6d..6947083 100644 --- a/media_platform/xhs/core.py +++ b/media_platform/xhs/core.py @@ -32,10 +32,12 @@ class XiaoHongShuCrawler(AbstractCrawler): self.index_url = "https://www.xiaohongshu.com" self.user_agent = utils.get_user_agent() - def init_config(self, platform: str, login_type: str, crawler_type: str) -> None: + def init_config(self, platform: str, login_type: str, crawler_type: str, start_page: int, keyword: str) -> None: self.platform = platform self.login_type = login_type self.crawler_type = crawler_type + self.start_page = start_page + self.keyword = keyword async def start(self) -> None: playwright_proxy_format, httpx_proxy_format = None, None @@ -99,31 +101,41 @@ class XiaoHongShuCrawler(AbstractCrawler): xhs_limit_count = 20 # xhs limit page fixed value if config.CRAWLER_MAX_NOTES_COUNT < xhs_limit_count: config.CRAWLER_MAX_NOTES_COUNT = xhs_limit_count - for keyword in config.KEYWORDS.split(","): + start_page = self.start_page + for keyword in self.keyword.split(","): utils.logger.info(f"[XiaoHongShuCrawler.search] Current search keyword: {keyword}") page = 1 - while page * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: - note_id_list: List[str] = [] - notes_res = await self.xhs_client.get_note_by_keyword( - keyword=keyword, - page=page, - sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL, - ) - utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}") - semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) - task_list = [ - self.get_note_detail(post_item.get("id"), semaphore) - for post_item in notes_res.get("items", {}) - if 
post_item.get('model_type') not in ('rec_query', 'hot_query') - ] - note_details = await asyncio.gather(*task_list) - for note_detail in note_details: - if note_detail is not None: - await xhs_store.update_xhs_note(note_detail) - note_id_list.append(note_detail.get("note_id")) - page += 1 - utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") - await self.batch_get_note_comments(note_id_list) + while (page - start_page + 1) * xhs_limit_count <= config.CRAWLER_MAX_NOTES_COUNT: + if page < start_page: + utils.logger.info(f"[XiaoHongShuCrawler.search] Skip page {page}") + page += 1 + continue + + try: + note_id_list: List[str] = [] + notes_res = await self.xhs_client.get_note_by_keyword( + keyword=keyword, + page=page, + sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL, + ) + utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}") + semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM) + task_list = [ + self.get_note_detail(post_item.get("id"), semaphore) + for post_item in notes_res.get("items", {}) + if post_item.get('model_type') not in ('rec_query', 'hot_query') + ] + note_details = await asyncio.gather(*task_list) + for note_detail in note_details: + if note_detail is not None: + await xhs_store.update_xhs_note(note_detail) + note_id_list.append(note_detail.get("note_id")) + page += 1 + utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}") + await self.batch_get_note_comments(note_id_list) + except DataFetchError: + utils.logger.error("[XiaoHongShuCrawler.search] Get note detail error") + break async def get_creators_and_notes(self) -> None: """Get creator's notes and retrieve their comment information."""