feat: xhs笔记详情更新
This commit is contained in:
parent
678b358e2e
commit
573ca9a659
@ -40,7 +40,7 @@ START_PAGE = 1
|
|||||||
CRAWLER_MAX_NOTES_COUNT = 20
|
CRAWLER_MAX_NOTES_COUNT = 20
|
||||||
|
|
||||||
# 并发爬虫数量控制
|
# 并发爬虫数量控制
|
||||||
MAX_CONCURRENCY_NUM = 4
|
MAX_CONCURRENCY_NUM = 1
|
||||||
|
|
||||||
# 是否开启爬图片模式, 默认不开启爬图片
|
# 是否开启爬图片模式, 默认不开启爬图片
|
||||||
ENABLE_GET_IMAGES = False
|
ENABLE_GET_IMAGES = False
|
||||||
|
@ -198,27 +198,34 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
}
|
}
|
||||||
return await self.post(uri, data)
|
return await self.post(uri, data)
|
||||||
|
|
||||||
async def get_note_by_id(self, note_id: str) -> Dict:
|
async def get_note_by_id(self, note_id: str, xsec_source: str, xsec_token: str) -> Dict:
|
||||||
"""
|
"""
|
||||||
获取笔记详情API
|
获取笔记详情API
|
||||||
Args:
|
Args:
|
||||||
note_id:笔记ID
|
note_id:笔记ID
|
||||||
|
xsec_source: 渠道来源
|
||||||
|
xsec_token: 搜索关键字之后返回的笔记列表中返回的token
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
|
|
||||||
"""
|
"""
|
||||||
|
if xsec_source == "":
|
||||||
|
xsec_source = "pc_search"
|
||||||
|
|
||||||
data = {
|
data = {
|
||||||
"source_note_id": note_id,
|
"source_note_id": note_id,
|
||||||
"image_formats": ["jpg", "webp", "avif"],
|
"image_formats": ["jpg", "webp", "avif"],
|
||||||
"extra": {"need_body_topic": 1},
|
"extra": {"need_body_topic": 1},
|
||||||
"xsec_source": "pc_feed",
|
"xsec_source": xsec_source,
|
||||||
|
"xsec_token": xsec_token
|
||||||
}
|
}
|
||||||
uri = "/api/sns/web/v1/feed"
|
uri = "/api/sns/web/v1/feed"
|
||||||
res = await self.post(uri, data)
|
res = await self.post(uri, data)
|
||||||
if res and res.get("items"):
|
if res and res.get("items"):
|
||||||
res_dict: Dict = res["items"][0]["note_card"]
|
res_dict: Dict = res["items"][0]["note_card"]
|
||||||
return res_dict
|
return res_dict
|
||||||
utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note empty and res:{res}")
|
# 爬取过于频繁时, 可能出现部分笔记能拿到结果、部分笔记拿不到结果的情况
|
||||||
|
utils.logger.error(f"[XiaoHongShuClient.get_note_by_id] get note id:{note_id} empty and res:{res}")
|
||||||
return dict()
|
return dict()
|
||||||
|
|
||||||
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
|
async def get_note_comments(self, note_id: str, cursor: str = "") -> Dict:
|
||||||
@ -294,7 +301,7 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
return result
|
return result
|
||||||
|
|
||||||
async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
|
async def get_comments_all_sub_comments(self, comments: List[Dict], crawl_interval: float = 1.0,
|
||||||
callback: Optional[Callable] = None) -> List[Dict]:
|
callback: Optional[Callable] = None) -> List[Dict]:
|
||||||
"""
|
"""
|
||||||
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
|
获取指定一级评论下的所有二级评论, 该方法会一直查找一级评论下的所有二级评论信息
|
||||||
Args:
|
Args:
|
||||||
@ -306,7 +313,8 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
if not config.ENABLE_GET_SUB_COMMENTS:
|
if not config.ENABLE_GET_SUB_COMMENTS:
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
|
utils.logger.info(
|
||||||
|
f"[XiaoHongShuCrawler.get_comments_all_sub_comments] Crawling sub_comment mode is not enabled")
|
||||||
return []
|
return []
|
||||||
|
|
||||||
result = []
|
result = []
|
||||||
@ -398,17 +406,20 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
while notes_has_more:
|
while notes_has_more:
|
||||||
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
|
notes_res = await self.get_notes_by_creator(user_id, notes_cursor)
|
||||||
if not notes_res:
|
if not notes_res:
|
||||||
utils.logger.error(f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
utils.logger.error(
|
||||||
|
f"[XiaoHongShuClient.get_notes_by_creator] The current creator may have been banned by xhs, so they cannot access the data.")
|
||||||
break
|
break
|
||||||
|
|
||||||
notes_has_more = notes_res.get("has_more", False)
|
notes_has_more = notes_res.get("has_more", False)
|
||||||
notes_cursor = notes_res.get("cursor", "")
|
notes_cursor = notes_res.get("cursor", "")
|
||||||
if "notes" not in notes_res:
|
if "notes" not in notes_res:
|
||||||
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
utils.logger.info(
|
||||||
|
f"[XiaoHongShuClient.get_all_notes_by_creator] No 'notes' key found in response: {notes_res}")
|
||||||
break
|
break
|
||||||
|
|
||||||
notes = notes_res["notes"]
|
notes = notes_res["notes"]
|
||||||
utils.logger.info(f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
|
utils.logger.info(
|
||||||
|
f"[XiaoHongShuClient.get_all_notes_by_creator] got user_id:{user_id} notes len : {len(notes)}")
|
||||||
if callback:
|
if callback:
|
||||||
await callback(notes)
|
await callback(notes)
|
||||||
await asyncio.sleep(crawl_interval)
|
await asyncio.sleep(crawl_interval)
|
||||||
|
@ -27,7 +27,8 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
|
|
||||||
def __init__(self) -> None:
|
def __init__(self) -> None:
|
||||||
self.index_url = "https://www.xiaohongshu.com"
|
self.index_url = "https://www.xiaohongshu.com"
|
||||||
self.user_agent = utils.get_user_agent()
|
# self.user_agent = utils.get_user_agent()
|
||||||
|
self.user_agent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
|
||||||
|
|
||||||
async def start(self) -> None:
|
async def start(self) -> None:
|
||||||
playwright_proxy_format, httpx_proxy_format = None, None
|
playwright_proxy_format, httpx_proxy_format = None, None
|
||||||
@ -110,18 +111,23 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
|
sort=SearchSortType(config.SORT_TYPE) if config.SORT_TYPE != '' else SearchSortType.GENERAL,
|
||||||
)
|
)
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
utils.logger.info(f"[XiaoHongShuCrawler.search] Search notes res:{notes_res}")
|
||||||
if(not notes_res or not notes_res.get('has_more', False)):
|
if not notes_res or not notes_res.get('has_more', False):
|
||||||
utils.logger.info("No more content!")
|
utils.logger.info("No more content!")
|
||||||
break
|
break
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = [
|
task_list = [
|
||||||
self.get_note_detail(post_item.get("id"), semaphore)
|
self.get_note_detail(
|
||||||
|
note_id=post_item.get("id"),
|
||||||
|
xsec_source=post_item.get("xsec_source"),
|
||||||
|
xsec_token=post_item.get("xsec_token"),
|
||||||
|
semaphore=semaphore
|
||||||
|
)
|
||||||
for post_item in notes_res.get("items", {})
|
for post_item in notes_res.get("items", {})
|
||||||
if post_item.get('model_type') not in ('rec_query', 'hot_query')
|
if post_item.get('model_type') not in ('rec_query', 'hot_query')
|
||||||
]
|
]
|
||||||
note_details = await asyncio.gather(*task_list)
|
note_details = await asyncio.gather(*task_list)
|
||||||
for note_detail in note_details:
|
for note_detail in note_details:
|
||||||
if note_detail is not None:
|
if note_detail:
|
||||||
await xhs_store.update_xhs_note(note_detail)
|
await xhs_store.update_xhs_note(note_detail)
|
||||||
await self.get_notice_media(note_detail)
|
await self.get_notice_media(note_detail)
|
||||||
note_id_list.append(note_detail.get("note_id"))
|
note_id_list.append(note_detail.get("note_id"))
|
||||||
@ -157,32 +163,42 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
"""
|
"""
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
task_list = [
|
task_list = [
|
||||||
self.get_note_detail(post_item.get("note_id"), semaphore) for post_item in note_list
|
self.get_note_detail(
|
||||||
|
note_id=post_item.get("id"),
|
||||||
|
xsec_source=post_item.get("xsec_source"),
|
||||||
|
xsec_token=post_item.get("xsec_token"),
|
||||||
|
semaphore=semaphore
|
||||||
|
)
|
||||||
|
for post_item in note_list
|
||||||
]
|
]
|
||||||
|
|
||||||
note_details = await asyncio.gather(*task_list)
|
note_details = await asyncio.gather(*task_list)
|
||||||
for note_detail in note_details:
|
for note_detail in note_details:
|
||||||
if note_detail is not None:
|
if note_detail:
|
||||||
await xhs_store.update_xhs_note(note_detail)
|
await xhs_store.update_xhs_note(note_detail)
|
||||||
|
|
||||||
async def get_specified_notes(self):
|
async def get_specified_notes(self):
|
||||||
"""Get the information and comments of the specified post"""
|
"""Get the information and comments of the specified post"""
|
||||||
semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
# todo 指定帖子爬取暂时失效,xhs更新了帖子详情的请求参数,需要携带xsec_token,目前发现该参数只能在搜索场景下获取到
|
||||||
task_list = [
|
raise Exception(
|
||||||
self.get_note_detail(note_id=note_id, semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST
|
"指定帖子爬取暂时失效,xhs更新了帖子详情的请求参数,需要携带xsec_token,目前发现只能在搜索场景下获取到")
|
||||||
]
|
# semaphore = asyncio.Semaphore(config.MAX_CONCURRENCY_NUM)
|
||||||
note_details = await asyncio.gather(*task_list)
|
# task_list = [
|
||||||
for note_detail in note_details:
|
# self.get_note_detail(note_id=note_id, xsec_token="", semaphore=semaphore) for note_id in config.XHS_SPECIFIED_ID_LIST
|
||||||
if note_detail is not None:
|
# ]
|
||||||
await xhs_store.update_xhs_note(note_detail)
|
# note_details = await asyncio.gather(*task_list)
|
||||||
await self.get_notice_media(note_detail)
|
# for note_detail in note_details:
|
||||||
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
|
# if note_detail is not None:
|
||||||
|
# await xhs_store.update_xhs_note(note_detail)
|
||||||
|
# await self.get_notice_media(note_detail)
|
||||||
|
# await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
|
||||||
|
|
||||||
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
async def get_note_detail(self, note_id: str, xsec_source: str, xsec_token: str, semaphore: asyncio.Semaphore) -> \
|
||||||
|
Optional[Dict]:
|
||||||
"""Get note detail"""
|
"""Get note detail"""
|
||||||
async with semaphore:
|
async with semaphore:
|
||||||
try:
|
try:
|
||||||
return await self.xhs_client.get_note_by_id(note_id)
|
return await self.xhs_client.get_note_by_id(note_id, xsec_source, xsec_token)
|
||||||
except DataFetchError as ex:
|
except DataFetchError as ex:
|
||||||
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
|
utils.logger.error(f"[XiaoHongShuCrawler.get_note_detail] Get note detail error: {ex}")
|
||||||
return None
|
return None
|
||||||
|
@ -10,19 +10,19 @@ def sign(a1="", b1="", x_s="", x_t=""):
|
|||||||
takes in a URI (uniform resource identifier), an optional data dictionary, and an optional ctime parameter. It returns a dictionary containing two keys: "x-s" and "x-t".
|
takes in a URI (uniform resource identifier), an optional data dictionary, and an optional ctime parameter. It returns a dictionary containing two keys: "x-s" and "x-t".
|
||||||
"""
|
"""
|
||||||
common = {
|
common = {
|
||||||
"s0": 5, # getPlatformCode
|
"s0": 3, # getPlatformCode
|
||||||
"s1": "",
|
"s1": "",
|
||||||
"x0": "1", # localStorage.getItem("b1b1")
|
"x0": "1", # localStorage.getItem("b1b1")
|
||||||
"x1": "3.3.0", # version
|
"x1": "3.7.8-2", # version
|
||||||
"x2": "Windows",
|
"x2": "Mac OS",
|
||||||
"x3": "xhs-pc-web",
|
"x3": "xhs-pc-web",
|
||||||
"x4": "1.4.4",
|
"x4": "4.27.2",
|
||||||
"x5": a1, # cookie of a1
|
"x5": a1, # cookie of a1
|
||||||
"x6": x_t,
|
"x6": x_t,
|
||||||
"x7": x_s,
|
"x7": x_s,
|
||||||
"x8": b1, # localStorage.getItem("b1")
|
"x8": b1, # localStorage.getItem("b1")
|
||||||
"x9": mrc(x_t + x_s + b1),
|
"x9": mrc(x_t + x_s + b1),
|
||||||
"x10": 1, # getSigCount
|
"x10": 154, # getSigCount
|
||||||
}
|
}
|
||||||
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
|
encode_str = encodeUtf8(json.dumps(common, separators=(',', ':')))
|
||||||
x_s_common = b64Encode(encode_str)
|
x_s_common = b64Encode(encode_str)
|
||||||
|
@ -10,6 +10,7 @@ import config
|
|||||||
from .bilibili_store_impl import *
|
from .bilibili_store_impl import *
|
||||||
from .bilibilli_store_video import *
|
from .bilibilli_store_video import *
|
||||||
|
|
||||||
|
|
||||||
class BiliStoreFactory:
|
class BiliStoreFactory:
|
||||||
STORES = {
|
STORES = {
|
||||||
"csv": BiliCsvStoreImplement,
|
"csv": BiliCsvStoreImplement,
|
||||||
|
@ -33,6 +33,9 @@ def calculate_number_of_files(file_store_path: str) -> int:
|
|||||||
|
|
||||||
|
|
||||||
class WeiboCsvStoreImplement(AbstractStore):
|
class WeiboCsvStoreImplement(AbstractStore):
|
||||||
|
async def store_creator(self, creator: Dict):
|
||||||
|
pass
|
||||||
|
|
||||||
csv_store_path: str = "data/weibo"
|
csv_store_path: str = "data/weibo"
|
||||||
file_count:int=calculate_number_of_files(csv_store_path)
|
file_count:int=calculate_number_of_files(csv_store_path)
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user