import asyncio
import copy
import urllib.parse
from typing import Callable, Dict, Optional, Union

import execjs
import httpx
from playwright.async_api import BrowserContext
from playwright.async_api import Page

# NOTE: the two wildcard imports and the `tools` import keep their original
# relative order so that any name shadowing between them is unchanged.
from .field import *
from .exception import *
from tools import utils


class DOUYINClient:
    """Async client for the Douyin web endpoints under ``https://www.douyin.com``.

    Requests are sent with ``httpx``. Before a request is sent, its parameters
    are extended in place with a fixed set of browser-description fields, a
    ``webid``/``msToken`` pair and an ``X-Bogus`` value computed by the
    JavaScript in ``libs/douyin.js``.
    """

    def __init__(
        self,
        timeout=30,
        proxies=None,
        headers: Optional[Dict] = None,
        playwright_page: Page = None,
        cookie_dict: Dict = None,
    ):
        """
        :param timeout: per-request timeout passed to ``httpx``
        :param proxies: proxy configuration passed to ``httpx.AsyncClient``
        :param headers: default request headers; must contain ``User-Agent``
            for signed requests
        :param playwright_page: page whose ``localStorage`` is read when signing
        :param cookie_dict: cookies as a name -> value mapping
        """
        self.proxies = proxies
        self.timeout = timeout
        # Default to an empty dict so update_cookies() and the per-request
        # header copies work when no headers were supplied. A dict passed by
        # the caller is kept as the same object.
        self.headers = headers if headers is not None else {}
        self._host = "https://www.douyin.com"
        self.playwright_page = playwright_page
        self.cookie_dict = cookie_dict

    async def __process_req_params(
        self, params: Optional[Dict] = None, headers: Optional[Dict] = None
    ) -> None:
        """
        Add the common web parameters and the ``X-Bogus`` value to ``params``.

        ``params`` is modified in place; nothing is returned. Does nothing when
        ``params`` is empty or ``None``.

        :param params: request parameters to extend
        :param headers: headers of the request; falls back to ``self.headers``.
            Its ``User-Agent`` entry is passed to the signing function.
        :return:
        """
        if not params:
            return
        headers = headers or self.headers
        local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage")
        # Read the script through a context manager so the file handle is closed.
        with open('libs/douyin.js') as js_file:
            douyin_js_obj = execjs.compile(js_file.read())
        common_params = {
            "device_platform": "webapp",
            "aid": "6383",
            "channel": "channel_pc_web",
            "cookie_enabled": "true",
            "browser_language": "zh-CN",
            "browser_platform": "Win32",
            "browser_name": "Firefox",
            "browser_version": "110.0",
            "browser_online": "true",
            "engine_name": "Gecko",
            "os_name": "Windows",
            "os_version": "10",
            "engine_version": "109.0",
            "platform": "PC",
            "screen_width": "1920",
            "screen_height": "1200",
            "webid": douyin_js_obj.call("get_web_id"),
            "msToken": local_storage.get("xmst"),
        }
        params.update(common_params)
        # The query is joined without URL-encoding on purpose: this exact string
        # is what gets passed to the JS `sign` function.
        # NOTE(review): presumably the signer expects the raw form - confirm
        # before changing the encoding.
        query = '&'.join([f'{k}={v}' for k, v in params.items()])
        x_bogus = douyin_js_obj.call('sign', query, headers["User-Agent"])
        params["X-Bogus"] = x_bogus

    async def request(self, method, url, **kwargs):
        """
        Send one HTTP request and return the decoded JSON body.

        :param method: HTTP method name
        :param url: absolute URL
        :param kwargs: forwarded to ``httpx.AsyncClient.request``
        :return: the parsed JSON response
        :raises DataFetchError: when the response body cannot be decoded as JSON
        """
        async with httpx.AsyncClient(proxies=self.proxies) as client:
            response = await client.request(
                method, url, timeout=self.timeout,
                **kwargs
            )
            try:
                return response.json()
            except Exception as e:
                # Chain the original error so the decode failure stays visible.
                raise DataFetchError(f"{e}, {response.text}") from e

    async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
        """
        Sign ``params`` and send a GET request to ``uri`` on the Douyin host.

        :param uri: path appended to the host
        :param params: query parameters (extended in place by the signing step)
        :param headers: request headers; falls back to ``self.headers``
        :return: the parsed JSON response
        """
        await self.__process_req_params(params, headers)
        headers = headers or self.headers
        return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)

    async def post(self, uri: str, data: dict, headers: Optional[Dict] = None):
        """
        Sign ``data`` and send a POST request to ``uri`` on the Douyin host.

        :param uri: path appended to the host
        :param data: form data (extended in place by the signing step)
        :param headers: request headers; falls back to ``self.headers``
        :return: the parsed JSON response
        """
        await self.__process_req_params(data, headers)
        headers = headers or self.headers
        return await self.request(method="POST", url=f"{self._host}{uri}", data=data, headers=headers)

    @staticmethod
    async def ping(browser_context: BrowserContext) -> bool:
        """
        Report whether the browser context carries a ``LOGIN_STATUS`` cookie equal to ``"1"``.

        :param browser_context: Playwright browser context to read cookies from
        :return: ``True`` when the cookie value is ``"1"``
        """
        _, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        # todo send some api to test login status
        return cookie_dict.get("LOGIN_STATUS") == "1"

    async def update_cookies(self, browser_context: BrowserContext) -> None:
        """
        Refresh the ``Cookie`` header and ``cookie_dict`` from the browser context.

        :param browser_context: Playwright browser context to read cookies from
        :return:
        """
        cookie_str, cookie_dict = utils.convert_cookies(await browser_context.cookies())
        self.headers["Cookie"] = cookie_str
        self.cookie_dict = cookie_dict

    async def search_info_by_keyword(
        self,
        keyword: str,
        offset: int = 0,
        search_channel: SearchChannelType = SearchChannelType.GENERAL,
        sort_type: SearchSortType = SearchSortType.GENERAL,
        publish_time: PublishTimeType = PublishTimeType.UNLIMITED
    ):
        """
        DouYin Web Search API

        :param keyword: search keyword; also used to build the ``Referer`` header
        :param offset: value sent as the ``offset`` query parameter
        :param search_channel: its ``.value`` is sent as ``search_channel``
        :param sort_type: its ``.value`` is sent as ``sort_type``
        :param publish_time: its ``.value`` is sent as ``publish_time``
        :return: the parsed JSON response
        """
        params = {
            "keyword": keyword,
            "search_channel": search_channel.value,
            "sort_type": sort_type.value,
            "publish_time": publish_time.value,
            "search_source": "normal_search",
            "query_correct_type": "1",
            "is_filter_search": "0",
            "offset": offset,
            "count": 10  # must be set to 10
        }
        referer_url = "https://www.douyin.com/search/" + keyword
        # Work on a copy so the per-request Referer does not leak into self.headers.
        headers = copy.copy(self.headers)
        headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
        return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)

    async def get_video_by_id(self, aweme_id: str):
        """
        DouYin Video Detail API

        :param aweme_id: id sent as the ``aweme_id`` query parameter
        :return: the parsed JSON response
        """
        params = {
            "aweme_id": aweme_id
        }
        headers = copy.copy(self.headers)
        # This request replaces the session cookie with a fixed s_v_web_id cookie.
        headers["Cookie"] = "s_v_web_id=verify_leytkxgn_kvO5kOmO_SdMs_4t1o_B5ml_BUqtWM1mP6BF;"
        # pop() instead of del: the Origin header may not be configured at all.
        headers.pop("Origin", None)
        return await self.get("/aweme/v1/web/aweme/detail/", params, headers)

    async def get_aweme_comments(self, aweme_id: str, cursor: Union[int, str] = ""):
        """
        Fetch one page of comments for a video.

        :param aweme_id: id sent as the ``aweme_id`` query parameter
        :param cursor: value sent as the ``cursor`` query parameter
        :return: the parsed JSON response
        """
        uri = "/aweme/v1/web/comment/list/"
        params = {
            "aweme_id": aweme_id,
            "cursor": cursor,
            "count": 20,
            "item_type": 0
        }
        return await self.get(uri, params)

    async def get_aweme_all_comments(
        self,
        aweme_id: str,
        crawl_interval: float = 1.0,
        is_fetch_sub_comments=False,
        callback: Optional[Callable] = None
    ) -> list:
        """
        Fetch every page of comments for a video.

        Pages are requested until the response's ``has_more`` field is falsy.

        :param aweme_id: id of the video whose comments are fetched
        :param crawl_interval: seconds to sleep after each page request
        :param is_fetch_sub_comments: reserved; fetching sub comments is not
            implemented yet, so only the comments returned by the list
            endpoint are collected
        :param callback: optional coroutine function awaited as
            ``callback(aweme_id, comments)`` for every non-empty page
        :return: all collected comments
        """
        result = []
        comments_has_more = 1
        comments_cursor = 0
        while comments_has_more:
            comments_res = await self.get_aweme_comments(aweme_id, comments_cursor)
            comments_has_more = comments_res.get("has_more", 0)
            # When the response carries no cursor, advance by the page size (20).
            comments_cursor = comments_res.get("cursor", comments_cursor + 20)
            comments = comments_res.get("comments")
            if comments:
                # Always keep the page, whether or not sub comments were requested.
                result.extend(comments)
                if callback:
                    # If a callback was supplied, invoke it with this page.
                    await callback(aweme_id, comments)
            # Sleep after every page, including empty ones, so a run of empty
            # pages cannot turn into a tight request loop.
            await asyncio.sleep(crawl_interval)
            # todo fetch sub comments
        return result