feat: 抖音abogus参数更新
This commit is contained in:
parent
0807862b63
commit
f8096e3d58
17
README.md
17
README.md
@ -81,6 +81,13 @@
|
|||||||
|
|
||||||
|
|
||||||
## 开发者服务
|
## 开发者服务
|
||||||
|
- MediaCrawler视频课程:
|
||||||
|
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
|
||||||
|
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低
|
||||||
|
>
|
||||||
|
> 同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
|
||||||
|
|
||||||
|
|
||||||
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题 (每天 1 块钱订阅我的知识服务)
|
- 知识星球:沉淀高质量常见问题、最佳实践文档、多年编程+爬虫经验分享,提供付费知识星球服务,主动提问,作者会定期回答问题 (每天 1 块钱订阅我的知识服务)
|
||||||
<p>
|
<p>
|
||||||
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
|
<img alt="xingqiu" src="https://nm.zizhi1.com/static/img/8e1312d1f52f2e0ff436ea7196b4e27b.15555424244122T1.webp" style="width: auto;height: 400px" >
|
||||||
@ -94,20 +101,14 @@
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
- MediaCrawler视频课程:
|
|
||||||
> 如果你想很快入门这个项目,或者想了解具体实现原理,我推荐你看看这个视频课程,从设计出发一步步带你如何使用,门槛大大降低,同时也是对我开源的支持,如果你能支持我的课程,我将会非常开心~<br>
|
|
||||||
> 课程售价非常非常的便宜,几杯咖啡的事儿.<br>
|
|
||||||
> 课程介绍飞书文档链接:https://relakkes.feishu.cn/wiki/JUgBwdhIeiSbAwkFCLkciHdAnhh
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
## 感谢下列Sponsors对本仓库赞助
|
## 感谢下列Sponsors对本仓库赞助
|
||||||
|
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这款免费的GPT助手,帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
|
||||||
|
<br>
|
||||||
- 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持!
|
- 感谢 [JetBrains](https://www.jetbrains.com/?from=gaowei-space/markdown-blog) 对本项目的支持!
|
||||||
<a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
|
<a href="https://www.jetbrains.com/?from=NanmiCoder/MediaCrawler" target="_blank">
|
||||||
<img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
|
<img src="https://resources.jetbrains.com/storage/products/company/brand/logos/jb_beam.png" width="100" height="100">
|
||||||
</a>
|
</a>
|
||||||
<br>
|
<br>
|
||||||
- <a href="https://sider.ai/ad-land-redirect?source=github&p1=mi&p2=kk">通过注册这款免费的GPT助手,帮我获取GPT4额度作为支持。也是我每天在用的一款chrome AI助手插件</a>
|
|
||||||
|
|
||||||
成为赞助者,展示你的产品在这里,联系作者:relakkes@gmail.com
|
成为赞助者,展示你的产品在这里,联系作者:relakkes@gmail.com
|
||||||
|
|
||||||
|
578
libs/douyin.js
578
libs/douyin.js
File diff suppressed because one or more lines are too long
@ -2,11 +2,10 @@ import asyncio
|
|||||||
import copy
|
import copy
|
||||||
import json
|
import json
|
||||||
import urllib.parse
|
import urllib.parse
|
||||||
from typing import Any, Callable, Dict, List, Optional
|
from typing import Any, Callable, Dict, Optional
|
||||||
|
|
||||||
import execjs
|
import requests
|
||||||
import httpx
|
from playwright.async_api import BrowserContext
|
||||||
from playwright.async_api import BrowserContext, Page
|
|
||||||
|
|
||||||
from base.base_crawler import AbstractApiClient
|
from base.base_crawler import AbstractApiClient
|
||||||
from tools import utils
|
from tools import utils
|
||||||
@ -14,6 +13,7 @@ from var import request_keyword_var
|
|||||||
|
|
||||||
from .exception import *
|
from .exception import *
|
||||||
from .field import *
|
from .field import *
|
||||||
|
from .help import *
|
||||||
|
|
||||||
|
|
||||||
class DOUYINClient(AbstractApiClient):
|
class DOUYINClient(AbstractApiClient):
|
||||||
@ -33,51 +33,71 @@ class DOUYINClient(AbstractApiClient):
|
|||||||
self.playwright_page = playwright_page
|
self.playwright_page = playwright_page
|
||||||
self.cookie_dict = cookie_dict
|
self.cookie_dict = cookie_dict
|
||||||
|
|
||||||
async def __process_req_params(self, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
async def __process_req_params(
|
||||||
|
self, params: Optional[Dict] = None, headers: Optional[Dict] = None,
|
||||||
|
request_method="GET"
|
||||||
|
):
|
||||||
|
|
||||||
if not params:
|
if not params:
|
||||||
return
|
return
|
||||||
headers = headers or self.headers
|
headers = headers or self.headers
|
||||||
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
|
local_storage: Dict = await self.playwright_page.evaluate("() => window.localStorage") # type: ignore
|
||||||
douyin_js_obj = execjs.compile(open('libs/douyin.js').read())
|
|
||||||
common_params = {
|
common_params = {
|
||||||
"device_platform": "webapp",
|
"device_platform": "webapp",
|
||||||
"aid": "6383",
|
"aid": "6383",
|
||||||
"channel": "channel_pc_web",
|
"channel": "channel_pc_web",
|
||||||
|
"version_code": "190600",
|
||||||
|
"version_name": "19.6.0",
|
||||||
|
"update_version_code": "170400",
|
||||||
|
"pc_client_type": "1",
|
||||||
"cookie_enabled": "true",
|
"cookie_enabled": "true",
|
||||||
"browser_language": "zh-CN",
|
"browser_language": "zh-CN",
|
||||||
"browser_platform": "Win32",
|
"browser_platform": "MacIntel",
|
||||||
"browser_name": "Firefox",
|
"browser_name": "Chrome",
|
||||||
"browser_version": "110.0",
|
"browser_version": "125.0.0.0",
|
||||||
"browser_online": "true",
|
"browser_online": "true",
|
||||||
"engine_name": "Gecko",
|
"engine_name": "Blink",
|
||||||
"os_name": "Windows",
|
"os_name": "Mac OS",
|
||||||
"os_version": "10",
|
"os_version": "10.15.7",
|
||||||
|
"cpu_core_num": "8",
|
||||||
|
"device_memory": "8",
|
||||||
"engine_version": "109.0",
|
"engine_version": "109.0",
|
||||||
"platform": "PC",
|
"platform": "PC",
|
||||||
"screen_width": "1920",
|
"screen_width": "2560",
|
||||||
"screen_height": "1200",
|
"screen_height": "1440",
|
||||||
# " webid": douyin_js_obj.call("get_web_id"),
|
'effective_type': '4g',
|
||||||
# "msToken": local_storage.get("xmst"),
|
"round_trip_time": "50",
|
||||||
# "msToken": "abL8SeUTPa9-EToD8qfC7toScSADxpg6yLh2dbNcpWHzE0bT04txM_4UwquIcRvkRb9IU8sifwgM1Kwf1Lsld81o9Irt2_yNyUbbQPSUO8EfVlZJ_78FckDFnwVBVUVK",
|
"webid": get_web_id(),
|
||||||
|
"msToken": local_storage.get("xmst"),
|
||||||
}
|
}
|
||||||
params.update(common_params)
|
params.update(common_params)
|
||||||
query = '&'.join([f'{k}={v}' for k, v in params.items()])
|
query_string = urllib.parse.urlencode(params)
|
||||||
x_bogus = douyin_js_obj.call('sign', query, headers["User-Agent"])
|
|
||||||
params["X-Bogus"] = x_bogus
|
# 20240610 a-bogus更新(Playwright版本)
|
||||||
# print(x_bogus, query)
|
post_data = {}
|
||||||
|
if request_method == "POST":
|
||||||
|
post_data = params
|
||||||
|
a_bogus = await get_a_bogus(query_string, post_data, headers["User-Agent"], self.playwright_page)
|
||||||
|
params["a_bogus"] = a_bogus
|
||||||
|
|
||||||
async def request(self, method, url, **kwargs):
|
async def request(self, method, url, **kwargs):
|
||||||
async with httpx.AsyncClient(proxies=self.proxies) as client:
|
response = None
|
||||||
response = await client.request(
|
if method == "GET":
|
||||||
method, url, timeout=self.timeout,
|
response = requests.request(method, url, **kwargs)
|
||||||
**kwargs
|
elif method == "POST":
|
||||||
)
|
response = requests.request(method, url, **kwargs)
|
||||||
try:
|
try:
|
||||||
|
if response.text == "" or response.text == "blocked":
|
||||||
|
utils.logger.error(f"request params incrr, response.text: {response.text}")
|
||||||
|
raise Exception("account blocked")
|
||||||
return response.json()
|
return response.json()
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise DataFetchError(f"{e}, {response.text}")
|
raise DataFetchError(f"{e}, {response.text}")
|
||||||
|
|
||||||
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
async def get(self, uri: str, params: Optional[Dict] = None, headers: Optional[Dict] = None):
|
||||||
|
"""
|
||||||
|
GET请求
|
||||||
|
"""
|
||||||
await self.__process_req_params(params, headers)
|
await self.__process_req_params(params, headers)
|
||||||
headers = headers or self.headers
|
headers = headers or self.headers
|
||||||
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
|
return await self.request(method="GET", url=f"{self._host}{uri}", params=params, headers=headers)
|
||||||
@ -117,27 +137,30 @@ class DOUYINClient(AbstractApiClient):
|
|||||||
:param publish_time: ·
|
:param publish_time: ·
|
||||||
:return:
|
:return:
|
||||||
"""
|
"""
|
||||||
params = {
|
query_params = {
|
||||||
"keyword": urllib.parse.quote(keyword),
|
'search_channel': search_channel.value,
|
||||||
"search_channel": search_channel.value,
|
'enable_history': '1',
|
||||||
"search_source": "normal_search",
|
'keyword': urllib.parse.quote(keyword),
|
||||||
"query_correct_type": 1,
|
'search_source': 'tab_search',
|
||||||
"is_filter_search": 0,
|
'query_correct_type': '1',
|
||||||
"offset": offset,
|
'is_filter_search': '0',
|
||||||
"count": 10 # must be set to 10
|
'from_group_id': '7378810571505847586',
|
||||||
|
'offset': offset,
|
||||||
|
'count': '15',
|
||||||
|
'need_filter_settings': '1',
|
||||||
|
'list_type': 'multi',
|
||||||
}
|
}
|
||||||
if sort_type != SearchSortType.GENERAL or publish_time != PublishTimeType.UNLIMITED:
|
if sort_type.value != SearchSortType.GENERAL.value or publish_time.value != PublishTimeType.UNLIMITED.value:
|
||||||
params["filter_selected"] = urllib.parse.quote(json.dumps({
|
query_params["filter_selected"] = urllib.parse.quote(json.dumps({
|
||||||
"sort_type": str(sort_type.value),
|
"sort_type": str(sort_type.value),
|
||||||
"publish_time": str(publish_time.value)
|
"publish_time": str(publish_time.value)
|
||||||
}))
|
}))
|
||||||
params["is_filter_search"] = 1
|
query_params["is_filter_search"] = 1
|
||||||
params["search_source"] = "tab_search"
|
query_params["search_source"] = "tab_search"
|
||||||
referer_url = "https://www.douyin.com/search/" + keyword
|
referer_url = f"https://www.douyin.com/search/{keyword}?aid=f594bbd9-a0e2-4651-9319-ebe3cb6298c1&type=general"
|
||||||
referer_url += f"?publish_time={publish_time.value}&sort_type={sort_type.value}&type=general"
|
|
||||||
headers = copy.copy(self.headers)
|
headers = copy.copy(self.headers)
|
||||||
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
headers["Referer"] = urllib.parse.quote(referer_url, safe=':/')
|
||||||
return await self.get("/aweme/v1/web/general/search/single/", params, headers=headers)
|
return await self.get("/aweme/v1/web/general/search/single/", query_params, headers=headers)
|
||||||
|
|
||||||
async def get_video_by_id(self, aweme_id: str) -> Any:
|
async def get_video_by_id(self, aweme_id: str) -> Any:
|
||||||
"""
|
"""
|
||||||
@ -149,7 +172,6 @@ class DOUYINClient(AbstractApiClient):
|
|||||||
"aweme_id": aweme_id
|
"aweme_id": aweme_id
|
||||||
}
|
}
|
||||||
headers = copy.copy(self.headers)
|
headers = copy.copy(self.headers)
|
||||||
# headers["Cookie"] = "s_v_web_id=verify_lol4a8dv_wpQ1QMyP_xemd_4wON_8Yzr_FJa8DN1vdY2m;"
|
|
||||||
del headers["Origin"]
|
del headers["Origin"]
|
||||||
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
res = await self.get("/aweme/v1/web/aweme/detail/", params, headers)
|
||||||
return res.get("aweme_detail", {})
|
return res.get("aweme_detail", {})
|
||||||
@ -259,7 +281,9 @@ class DOUYINClient(AbstractApiClient):
|
|||||||
"count": 18,
|
"count": 18,
|
||||||
"max_cursor": max_cursor,
|
"max_cursor": max_cursor,
|
||||||
"locate_query": "false",
|
"locate_query": "false",
|
||||||
"publish_video_strategy_type": 2
|
"publish_video_strategy_type": 2,
|
||||||
|
'verifyFp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130',
|
||||||
|
'fp': 'verify_lx901cuk_K7kaK4dK_bn2E_4dgk_BxAA_E0XS1VtUi130'
|
||||||
}
|
}
|
||||||
return await self.get(uri, params)
|
return await self.get(uri, params)
|
||||||
|
|
||||||
|
@ -26,7 +26,6 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
browser_context: BrowserContext
|
browser_context: BrowserContext
|
||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/109.0.0.0 Safari/537.36" # fixed
|
|
||||||
self.index_url = "https://www.douyin.com"
|
self.index_url = "https://www.douyin.com"
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
@ -42,7 +41,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
self.browser_context = await self.launch_browser(
|
self.browser_context = await self.launch_browser(
|
||||||
chromium,
|
chromium,
|
||||||
None,
|
None,
|
||||||
self.user_agent,
|
user_agent=None,
|
||||||
headless=config.HEADLESS
|
headless=config.HEADLESS
|
||||||
)
|
)
|
||||||
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
# stealth.min.js is a js script to prevent the website from detecting the crawler.
|
||||||
@ -225,7 +224,7 @@ class DouYinCrawler(AbstractCrawler):
|
|||||||
douyin_client = DOUYINClient(
|
douyin_client = DOUYINClient(
|
||||||
proxies=httpx_proxy,
|
proxies=httpx_proxy,
|
||||||
headers={
|
headers={
|
||||||
"User-Agent": self.user_agent,
|
"User-Agent": await self.context_page.evaluate("() => navigator.userAgent"),
|
||||||
"Cookie": cookie_str,
|
"Cookie": cookie_str,
|
||||||
"Host": "www.douyin.com",
|
"Host": "www.douyin.com",
|
||||||
"Origin": "https://www.douyin.com/",
|
"Origin": "https://www.douyin.com/",
|
||||||
|
53
media_platform/douyin/help.py
Normal file
53
media_platform/douyin/help.py
Normal file
@ -0,0 +1,53 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Author : relakkes@gmail.com
|
||||||
|
# @Name : 程序员阿江-Relakkes
|
||||||
|
# @Time : 2024/6/10 02:24
|
||||||
|
# @Desc : 获取 a_bogus 参数, 学习交流使用,请勿用作商业用途,侵权联系作者删除
|
||||||
|
|
||||||
|
import random
|
||||||
|
|
||||||
|
from playwright.async_api import Page
|
||||||
|
|
||||||
|
|
||||||
|
def get_web_id():
    """
    Generate a random webid.

    Walks the template ``10000000-1000-4000-8000-100000000000`` and swaps
    every ``0``, ``1`` and ``8`` for a randomly scrambled decimal number,
    then drops the dashes and keeps the first 19 characters.

    Returns:
        str: a 19-character string made up of decimal digits.
    """
    template = ''.join(
        [str(int(1e7)), '-', str(int(1e3)), '-', str(int(4e3)), '-', str(int(8e3)), '-', str(int(1e11))]
    )

    def scramble(digit):
        # XOR the digit with a random value in 0..15 that is first shifted
        # right by digit // 4 (no shift for 0 and 1, two bits for 8).
        noise = int(16 * random.random()) >> (digit // 4)
        return str(digit ^ noise)

    pieces = []
    for ch in template:
        if ch in '018':
            pieces.append(scramble(int(ch)))
        else:
            pieces.append(ch)

    joined = ''.join(pieces)
    return joined.replace('-', '')[:19]
|
||||||
|
|
||||||
|
|
||||||
|
async def get_a_bogus(params: str, post_data: dict, user_agent: str, page: Page = None):
    """
    Compute the ``a_bogus`` request parameter.

    Thin entry point that forwards to ``get_a_bogus_from_playright``, which
    evaluates the signing call inside the supplied Playwright page.

    Args:
        params: query string to sign.
        post_data: request body to sign; a falsy value is forwarded as-is and
            replaced by an empty string downstream.
        user_agent: User-Agent string handed to the in-page signing call.
        page: Playwright page in which the signing call is evaluated. The
            ``None`` default is kept for signature compatibility only; a
            page is required.

    Returns:
        The value returned by the in-page signing call.

    Raises:
        ValueError: if ``page`` is ``None``.
    """
    # Without a page the delegate would fail with an opaque AttributeError on
    # ``None.evaluate``; fail early with an explicit message instead.
    if page is None:
        raise ValueError("get_a_bogus requires a playwright page to compute a_bogus")
    return await get_a_bogus_from_playright(params, post_data, user_agent, page)
|
||||||
|
|
||||||
|
|
||||||
|
async def get_a_bogus_from_playright(params: str, post_data: dict, user_agent: str, page: Page):
    """
    Obtain the ``a_bogus`` parameter by evaluating JavaScript in a Playwright page.

    Args:
        params: query string passed to the in-page call.
        post_data: request body passed to the in-page call; any falsy value
            is sent as an empty string.
        user_agent: User-Agent string passed to the in-page call.
        page: Playwright page in which the expression is evaluated.

    Returns:
        Whatever the in-page call returns.
    """
    # Normalise "no body" (None, empty dict, ...) to an empty string.
    body = post_data if post_data else ""
    # NOTE(review): relies on the page exposing window.bdms.init._v[2].p[42];
    # presumably the site's own signing routine — confirm against the loaded page.
    script = "([params, post_data, ua]) => window.bdms.init._v[2].p[42].apply(null, [0, 1, 8, params, post_data, ua])"
    return await page.evaluate(script, [params, body, user_agent])
|
||||||
|
|
@ -2,7 +2,6 @@ httpx==0.24.0
|
|||||||
Pillow==9.5.0
|
Pillow==9.5.0
|
||||||
playwright==1.42.0
|
playwright==1.42.0
|
||||||
tenacity==8.2.2
|
tenacity==8.2.2
|
||||||
PyExecJS==1.5.1
|
|
||||||
opencv-python
|
opencv-python
|
||||||
aiomysql==0.2.0
|
aiomysql==0.2.0
|
||||||
redis~=4.6.0
|
redis~=4.6.0
|
||||||
@ -14,3 +13,4 @@ python-dotenv==1.0.1
|
|||||||
jieba==0.42.1
|
jieba==0.42.1
|
||||||
wordcloud==1.9.3
|
wordcloud==1.9.3
|
||||||
matplotlib==3.9.0
|
matplotlib==3.9.0
|
||||||
|
requests==2.32.3
|
@ -7,8 +7,8 @@ from typing import List
|
|||||||
import config
|
import config
|
||||||
|
|
||||||
from . import xhs_store_impl
|
from . import xhs_store_impl
|
||||||
from .xhs_store_impl import *
|
|
||||||
from .xhs_store_image import *
|
from .xhs_store_image import *
|
||||||
|
from .xhs_store_impl import *
|
||||||
|
|
||||||
|
|
||||||
class XhsStoreFactory:
|
class XhsStoreFactory:
|
||||||
|
Loading…
Reference in New Issue
Block a user