From a90b411e6880c77c1289ffa3fba169d3453d282c Mon Sep 17 00:00:00 2001 From: Relakkes Date: Sun, 3 Dec 2023 23:19:02 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20B=E7=AB=99=E7=88=AC=E8=99=AB=E6=90=9C?= =?UTF-8?q?=E7=B4=A2=E5=85=B3=E9=94=AE=E8=AF=8D=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- media_platform/bilibili/client.py | 35 +++++--- media_platform/bilibili/core.py | 135 +++++++++++++++++++++++++++++- media_platform/bilibili/field.py | 23 +++++ media_platform/bilibili/help.py | 4 +- media_platform/bilibili/login.py | 29 ++++++- models/bilibili.py | 94 +++++++++++++++++++++ 6 files changed, 303 insertions(+), 17 deletions(-) create mode 100644 media_platform/bilibili/field.py create mode 100644 models/bilibili.py diff --git a/media_platform/bilibili/client.py b/media_platform/bilibili/client.py index 483d6ef..07396ed 100644 --- a/media_platform/bilibili/client.py +++ b/media_platform/bilibili/client.py @@ -14,6 +14,7 @@ from tools import utils from .help import BilibiliSign from .exception import DataFetchError +from .field import OrderType class BilibiliClient: @@ -53,7 +54,7 @@ class BilibiliClient: :param req_data: :return: """ - img_key, sub_key = self.get_wbi_keys() + img_key, sub_key = await self.get_wbi_keys() return BilibiliSign(img_key, sub_key).sign(req_data) async def get_wbi_keys(self) -> tuple[str, str]: @@ -62,9 +63,10 @@ class BilibiliClient: :return: """ local_storage = await self.playwright_page.evaluate("() => window.localStorage") - wbi_img_urls = local_storage.get("wbi_img_urls", "") - img_url, sub_url = wbi_img_urls.split("-") - if not img_url or not sub_url: + wbi_img_urls = local_storage.get("wbi_img_urls", "") or local_storage.get("wbi_img_url") + "-" + local_storage.get("wbi_sub_url") + if wbi_img_urls and "-" in wbi_img_urls: + img_url, sub_url = wbi_img_urls.split("-") + else: resp = await self.request(method="GET", url=self._host + "/x/web-interface/nav") img_url: str 
async def search_video_by_keyword(self, keyword: str, page: int = 1, page_size: int = 20,
                                  order: OrderType = OrderType.DEFAULT):
    """
    Search Bilibili videos by keyword via the WBI search endpoint.

    :param keyword: search keyword
    :param page: number of the result page to fetch
    :param page_size: number of results requested per page
    :param order: ordering of the results; ``OrderType.DEFAULT`` is the
        comprehensive ranking
    :return: whatever ``self.get`` returns for the search endpoint
    """
    uri = "/x/web-interface/wbi/search/type"
    # These are query-string parameters, not a request body: self.get()
    # passes them through pre_request_data() for signing and then appends
    # them to the URI.
    params = {
        "search_type": "video",
        "keyword": keyword,
        "page": page,
        "page_size": page_size,
        "order": order.value,
    }
    return await self.get(uri, params)
BrowserType, Page, import config from base.base_crawler import AbstractCrawler -from models import kuaishou +from models import bilibili from proxy.proxy_account_pool import AccountPool from tools import utils from var import comment_tasks_var, crawler_type_var + from .client import BilibiliClient -from .exception import DataFetchError +from .field import OrderType from .login import BilibiliLogin @@ -45,7 +46,137 @@ class BilibiliCrawler(AbstractCrawler): self.crawler_type = crawler_type async def start(self): + account_phone, playwright_proxy, httpx_proxy = self.create_proxy_info() + async with async_playwright() as playwright: + # Launch a browser context. + chromium = playwright.chromium + self.browser_context = await self.launch_browser( + chromium, + playwright_proxy, + self.user_agent, + headless=config.HEADLESS + ) + # stealth.min.js is a js script to prevent the website from detecting the crawler. + await self.browser_context.add_init_script(path="libs/stealth.min.js") + self.context_page = await self.browser_context.new_page() + await self.context_page.goto(self.index_url) + + # Create a client to interact with the xiaohongshu website. + self.bili_client = await self.create_bilibili_client(httpx_proxy) + if not await self.bili_client.pong(): + login_obj = BilibiliLogin( + login_type=self.login_type, + login_phone=account_phone, + browser_context=self.browser_context, + context_page=self.context_page, + cookie_str=config.COOKIES + ) + await login_obj.begin() + await self.bili_client.update_cookies(browser_context=self.browser_context) + + crawler_type_var.set(self.crawler_type) + if self.crawler_type == "search": + # Search for video and retrieve their comment information. 
async def search(self):
    """
    Search Bilibili videos for every configured keyword and store each hit.

    Keywords are read from the comma-separated ``config.KEYWORDS``. For each
    keyword, result pages are requested while
    ``page * page_size <= config.CRAWLER_MAX_NOTES_COUNT``; every returned
    item is handed to ``bilibili.update_bilibili_video``. Paging for a
    keyword stops early as soon as a page contains no results.

    :return: None
    """
    utils.logger.info("Begin search bilibli keywords")
    bili_limit_count = 20  # bilibili limit page fixed value
    for keyword in config.KEYWORDS.split(","):
        utils.logger.info(f"Current search keyword: {keyword}")
        page = 1
        while page * bili_limit_count <= config.CRAWLER_MAX_NOTES_COUNT:
            videos_res = await self.bili_client.search_video_by_keyword(
                keyword=keyword,
                page=page,
                page_size=bili_limit_count,
                order=OrderType.DEFAULT,
            )
            # dict.get() yields None when "result" is absent (or null), and
            # iterating None raises TypeError, so normalise to a list first.
            video_list: List[Dict] = videos_res.get("result") or []
            if not video_list:
                # An empty page means there is nothing further to fetch for
                # this keyword; move on instead of requesting more pages.
                break
            for video_item in video_list:
                await bilibili.update_bilibili_video(video_item)
            page += 1
class OrderType(Enum):
    """Sort orders whose ``value`` is sent as the ``order`` search parameter."""

    DEFAULT = ""  # comprehensive ranking
    MOST_CLICK = "click"  # most clicks
    LAST_PUBLISH = "pubdate"  # most recently published
    MOST_DANMU = "dm"  # most danmaku (bullet comments)
    MOST_MARK = "stow"  # most favorited
class BilibiliLogin(AbstractLogin):
    """Login handler for Bilibili; stores the login settings and browser handles it is given."""

    def __init__(
            self,
            login_type: str,
            browser_context: BrowserContext,
            context_page: Page,
            login_phone: Optional[str] = "",
            cookie_str: str = "",
    ):
        """
        Remember everything needed to perform a login later.

        :param login_type: which login method to use
        :param browser_context: Playwright browser context
        :param context_page: Playwright page
        :param login_phone: phone number of the account, may be empty
        :param cookie_str: cookie string, may be empty
        """
        # Playwright handles.
        self.browser_context, self.context_page = browser_context, context_page
        # Login settings.
        self.login_type = login_type
        self.login_phone = login_phone
        self.cookie_str = cookie_str

    async def begin(self):
        """Start the login flow; not implemented yet, so this is a no-op."""
        return None
async def update_bilibili_video(video_item: Dict):
    """
    Persist one search-result item describing a Bilibili video.

    Items whose ``type`` is not ``"video"`` are ignored. When
    ``config.IS_SAVED_DATABASED`` is truthy the record is inserted into, or
    updated in, the ``BilibiliVideo`` table keyed by ``video_id``; otherwise
    it is appended to a per-day CSV file under ``data/bilibili``.

    :param video_item: one entry of the search response's result list
    :return: None
    """
    if video_item.get("type") != "video":
        return

    def _opt_str(value):
        """Stringify *value*, keeping None as None so nullable columns stay null."""
        return None if value is None else str(value)

    # video_id / user_id are CharField columns and every other field below is
    # already stringified; ids presumably arrive as ints from the API
    # (TODO confirm), so normalise them the same way.
    video_id = _opt_str(video_item.get("id"))
    local_db_item = {
        "video_id": video_id,
        "video_type": str(video_item.get("type")),
        # `or ""` also covers a key that is present with a None value, which
        # `.get(key, "")` would return as None and then fail to slice.
        "title": (video_item.get("title") or "")[:500],
        "desc": (video_item.get("description") or "")[:500],
        "create_time": video_item.get("pubdate"),
        "user_id": _opt_str(video_item.get("mid")),
        "nickname": video_item.get("author"),
        "avatar": video_item.get("upic", ""),
        "liked_count": str(video_item.get("like", "")),
        "video_play_count": str(video_item.get("play", "")),
        "video_danmaku": str(video_item.get("danmaku", "")),
        "video_comment": str(video_item.get("review", "")),
        "last_modify_ts": utils.get_current_timestamp(),
        "video_url": f"https://www.bilibili.com/video/av{video_id}",
        "video_cover_url": video_item.get("pic", ""),
    }
    # Use the project logger like the rest of the crawler instead of print().
    utils.logger.info(f"bilibili video id:{video_id}, title:{local_db_item.get('title')}")

    if config.IS_SAVED_DATABASED:
        if not await BilibiliVideo.filter(video_id=video_id).exists():
            # First time this video is seen: record the insertion time too.
            local_db_item["add_ts"] = utils.get_current_timestamp()
            bilibili_video_pydantic = pydantic_model_creator(BilibiliVideo, name='BilibiliVideoCreate', exclude=('id',))
            bilibili_data = bilibili_video_pydantic(**local_db_item)
            bilibili_video_pydantic.model_validate(bilibili_data)
            await BilibiliVideo.create(**bilibili_data.model_dump())
        else:
            # Existing row: keep its original add_ts and refresh the rest.
            bilibili_video_pydantic = pydantic_model_creator(BilibiliVideo, name='BilibiliVideoUpdate',
                                                             exclude=('id', 'add_ts'))
            bilibili_data = bilibili_video_pydantic(**local_db_item)
            bilibili_video_pydantic.model_validate(bilibili_data)
            await BilibiliVideo.filter(video_id=video_id).update(**bilibili_data.model_dump())
        return

    # Below is a simple way to save it in CSV format.
    pathlib.Path("data/bilibili").mkdir(parents=True, exist_ok=True)
    save_file_name = f"data/bilibili/{crawler_type_var.get()}_videos_{utils.get_current_date()}.csv"
    with open(save_file_name, mode='a+', encoding="utf-8-sig", newline="") as f:
        writer = csv.writer(f)
        # A position of 0 in append mode means the file is new/empty, so
        # emit the header row first.
        if f.tell() == 0:
            writer.writerow(local_db_item.keys())
        writer.writerow(local_db_item.values())