Merge pull request #328 from helloteemo/feature/xiaohognshu_get_image
feature: 支持小红书图片、视频下载
This commit is contained in:
commit
8fe93dca23
@ -57,6 +57,7 @@ XHS_SPECIFIED_ID_LIST = [
|
|||||||
"6422c2750000000027000d88",
|
"6422c2750000000027000d88",
|
||||||
"64ca1b73000000000b028dd2",
|
"64ca1b73000000000b028dd2",
|
||||||
"630d5b85000000001203ab41",
|
"630d5b85000000001203ab41",
|
||||||
|
"668fe13000000000030241fa", # 图文混合
|
||||||
# ........................
|
# ........................
|
||||||
]
|
]
|
||||||
|
|
||||||
|
@ -129,6 +129,15 @@ class XiaoHongShuClient(AbstractApiClient):
|
|||||||
return await self.request(method="POST", url=f"{self._host}{uri}",
|
return await self.request(method="POST", url=f"{self._host}{uri}",
|
||||||
data=json_str, headers=headers)
|
data=json_str, headers=headers)
|
||||||
|
|
||||||
|
async def get_note_media(self, url: str) -> bytes | None:
    """
    Download the raw bytes of a note media resource.

    Args:
        url: URL of the media file to fetch.

    Returns:
        The response body as bytes, or None when the request fails at the
        transport level or the server does not answer with HTTP 200.
    """
    async with httpx.AsyncClient(proxies=self.proxies) as client:
        try:
            response = await client.request("GET", url, timeout=self.timeout)
        except httpx.HTTPError as exc:
            # A single broken media URL must not abort the crawl; callers
            # already treat None as "skip this file".
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, exc:{exc!r}")
            return None

        # Decide on the numeric status code: the reason phrase is free text
        # supplied by the server and is not guaranteed to be "OK".
        if response.status_code != 200:
            utils.logger.error(f"[XiaoHongShuClient.get_note_media] request {url} err, res:{response.text}")
            return None
        return response.content
|
||||||
|
|
||||||
async def pong(self) -> bool:
|
async def pong(self) -> bool:
|
||||||
"""
|
"""
|
||||||
用于检查登录态是否失效了
|
用于检查登录态是否失效了
|
||||||
|
@ -120,6 +120,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
for note_detail in note_details:
|
for note_detail in note_details:
|
||||||
if note_detail is not None:
|
if note_detail is not None:
|
||||||
await xhs_store.update_xhs_note(note_detail)
|
await xhs_store.update_xhs_note(note_detail)
|
||||||
|
await self.get_notice_media(note_detail)
|
||||||
note_id_list.append(note_detail.get("note_id"))
|
note_id_list.append(note_detail.get("note_id"))
|
||||||
page += 1
|
page += 1
|
||||||
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
utils.logger.info(f"[XiaoHongShuCrawler.search] Note details: {note_details}")
|
||||||
@ -171,6 +172,7 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
for note_detail in note_details:
|
for note_detail in note_details:
|
||||||
if note_detail is not None:
|
if note_detail is not None:
|
||||||
await xhs_store.update_xhs_note(note_detail)
|
await xhs_store.update_xhs_note(note_detail)
|
||||||
|
await self.get_notice_media(note_detail)
|
||||||
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
|
await self.batch_get_note_comments(config.XHS_SPECIFIED_ID_LIST)
|
||||||
|
|
||||||
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
async def get_note_detail(self, note_id: str, semaphore: asyncio.Semaphore) -> Optional[Dict]:
|
||||||
@ -276,4 +278,63 @@ class XiaoHongShuCrawler(AbstractCrawler):
|
|||||||
async def close(self):
|
async def close(self):
|
||||||
"""Close browser context"""
|
"""Close browser context"""
|
||||||
await self.browser_context.close()
|
await self.browser_context.close()
|
||||||
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
utils.logger.info("[XiaoHongShuCrawler.close] Browser context closed ...")
|
||||||
|
|
||||||
|
async def get_notice_media(self, note_detail: Dict):
    """
    Download every media file of a note: images first, then videos.

    Does nothing except log a message when config.ENABLE_GET_IMAGES is off.

    Args:
        note_detail: note detail dict passed on to the image/video helpers.
    """
    if config.ENABLE_GET_IMAGES:
        await self.get_note_images(note_detail)
        await self.get_notice_video(note_detail)
        return
    utils.logger.info("[XiaoHongShuCrawler.get_notice_media] Crawling image mode is not enabled")
|
||||||
|
|
||||||
|
async def get_note_images(self, note_item: Dict):
    """
    Download and store the images of a note. Prefer calling get_notice_media.

    Files are named "<n>.jpg", where n counts the images that were actually
    downloaded (failed downloads do not consume an index).

    Args:
        note_item: note detail dict; reads "note_id" and "image_list".

    Returns:
        None
    """
    if not config.ENABLE_GET_IMAGES:
        return
    note_id = note_item.get("note_id")
    # "image_list" may be absent or explicitly None; treat both as no images.
    image_list: List[Dict] = note_item.get("image_list") or []

    saved_count = 0
    for image in image_list:
        url_default = image.get("url_default")
        if url_default:
            # Only override "url" with a real value; a missing or empty
            # "url_default" must not wipe out an existing "url".
            image.update({'url': url_default})
        url = image.get("url")
        if not url:
            continue
        content = await self.xhs_client.get_note_media(url)
        if content is None:
            continue
        extension_file_name = f"{saved_count}.jpg"
        saved_count += 1
        await xhs_store.update_xhs_note_image(note_id, content, extension_file_name)
|
||||||
|
|
||||||
|
async def get_notice_video(self, note_item: Dict):
    """
    Download and store the videos of a note. Prefer calling get_notice_media.

    Files are named "<n>.mp4", where n counts the videos that were actually
    downloaded.

    :param note_item: note detail dict; reads "note_id" and the video URLs
        resolved by xhs_store.get_video_url_arr
    :return: None
    """
    if not config.ENABLE_GET_IMAGES:
        return
    note_id = note_item.get("note_id")

    video_urls = xhs_store.get_video_url_arr(note_item)
    if not video_urls:
        return

    saved_count = 0
    for video_url in video_urls:
        content = await self.xhs_client.get_note_media(video_url)
        if content is not None:
            file_name = f"{saved_count}.mp4"
            saved_count += 1
            await xhs_store.update_xhs_note_image(note_id, content, file_name)
|
||||||
|
@ -8,6 +8,7 @@ import config
|
|||||||
|
|
||||||
from . import xhs_store_impl
|
from . import xhs_store_impl
|
||||||
from .xhs_store_impl import *
|
from .xhs_store_impl import *
|
||||||
|
from .xhs_store_image import *
|
||||||
|
|
||||||
|
|
||||||
class XhsStoreFactory:
|
class XhsStoreFactory:
|
||||||
@ -25,6 +26,25 @@ class XhsStoreFactory:
|
|||||||
return store_class()
|
return store_class()
|
||||||
|
|
||||||
|
|
||||||
|
def get_video_url_arr(note_item: Dict) -> List:
    """
    Resolve the downloadable video URLs of a note.

    The origin video key (either spelling: "origin_video_key" or
    "originVideoKey") is preferred. When no key is available, the function
    falls back to the "master_url" of every h264 stream.

    Args:
        note_item: note detail dict; reads "type" and the nested "video" dict.

    Returns:
        A list of URL strings; empty when the note is not a video or no URL
        can be resolved.
    """
    if note_item.get('type') != 'video':
        return []

    # Every level may be missing or None, so never chain .get() blindly.
    video = note_item.get('video') or {}
    consumer = video.get('consumer') or {}
    origin_video_key = consumer.get('origin_video_key') or consumer.get('originVideoKey')
    if origin_video_key:
        return [f"http://sns-video-bd.xhscdn.com/{origin_video_key}"]

    # Fallback: use the (watermarked) h264 streams.
    media = video.get('media') or {}
    stream = media.get('stream') or {}
    h264_streams = stream.get('h264')
    if not isinstance(h264_streams, list):
        return []
    # Drop entries without a master_url so callers can safely join the result.
    return [
        item.get('master_url')
        for item in h264_streams
        if isinstance(item, dict) and item.get('master_url')
    ]
|
||||||
|
|
||||||
|
|
||||||
async def update_xhs_note(note_item: Dict):
|
async def update_xhs_note(note_item: Dict):
|
||||||
note_id = note_item.get("note_id")
|
note_id = note_item.get("note_id")
|
||||||
user_info = note_item.get("user", {})
|
user_info = note_item.get("user", {})
|
||||||
@ -36,11 +56,7 @@ async def update_xhs_note(note_item: Dict):
|
|||||||
if img.get('url_default') != '':
|
if img.get('url_default') != '':
|
||||||
img.update({'url': img.get('url_default')})
|
img.update({'url': img.get('url_default')})
|
||||||
|
|
||||||
video_url = ''
|
video_url = ','.join(get_video_url_arr(note_item))
|
||||||
if note_item.get('type') == 'video':
|
|
||||||
videos = note_item.get('video').get('media').get('stream').get('h264')
|
|
||||||
if type(videos).__name__ == 'list':
|
|
||||||
video_url = ','.join([v.get('master_url') for v in videos])
|
|
||||||
|
|
||||||
local_db_item = {
|
local_db_item = {
|
||||||
"note_id": note_item.get("note_id"),
|
"note_id": note_item.get("note_id"),
|
||||||
@ -127,3 +143,8 @@ async def save_creator(user_id: str, creator: Dict):
|
|||||||
}
|
}
|
||||||
utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
|
utils.logger.info(f"[store.xhs.save_creator] creator:{local_db_item}")
|
||||||
await XhsStoreFactory.create_store().store_creator(local_db_item)
|
await XhsStoreFactory.create_store().store_creator(local_db_item)
|
||||||
|
|
||||||
|
|
||||||
|
async def update_xhs_note_image(note_id, pic_content, extension_file_name):
    """
    Persist one downloaded media file of a note through XiaoHongShuImage.

    Args:
        note_id: id of the note the file belongs to.
        pic_content: file content handed to the store.
        extension_file_name: file name to store the content under.
    """
    image_store = XiaoHongShuImage()
    image_item = {
        "notice_id": note_id,
        "pic_content": pic_content,
        "extension_file_name": extension_file_name,
    }
    await image_store.store_image(image_item)
|
||||||
|
55
store/xhs/xhs_store_image.py
Normal file
55
store/xhs/xhs_store_image.py
Normal file
@ -0,0 +1,55 @@
|
|||||||
|
# -*- coding: utf-8 -*-
|
||||||
|
# @Author : helloteemo
|
||||||
|
# @Time : 2024/7/11 22:35
|
||||||
|
# @Desc : 小红书图片保存
|
||||||
|
import pathlib
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
import aiofiles
|
||||||
|
|
||||||
|
from base.base_crawler import AbstractStoreImage
|
||||||
|
from tools import utils
|
||||||
|
|
||||||
|
|
||||||
|
class XiaoHongShuImage(AbstractStoreImage):
    """Write downloaded note media content to the local file system."""

    # Root directory; each note gets a sub-directory named after its id.
    image_store_path: str = "data/xhs/images"

    async def store_image(self, image_content_item: Dict):
        """
        Store one media file described by a dict.

        Args:
            image_content_item: dict with the keys "notice_id",
                "pic_content" and "extension_file_name".

        Returns:
            None
        """
        await self.save_image(
            image_content_item.get("notice_id"),
            image_content_item.get("pic_content"),
            image_content_item.get("extension_file_name"),
        )

    def make_save_file_name(self, notice_id: str, extension_file_name: str) -> str:
        """
        Build the path a media file is saved under.

        Args:
            notice_id: note id, used as the sub-directory name.
            extension_file_name: file name inside that directory; it is used
                as the last path component as-is.

        Returns:
            "<image_store_path>/<notice_id>/<extension_file_name>"
        """
        return f"{self.image_store_path}/{notice_id}/{extension_file_name}"

    async def save_image(self, notice_id: str, pic_content: bytes, extension_file_name="jpg"):
        """
        Save media content to the local file system.

        Args:
            notice_id: note id; the per-note directory is created on demand.
            pic_content: raw bytes to write (the file is opened in binary mode).
            extension_file_name: file name inside the note directory. The
                default "jpg" is kept only for backward compatibility.

        Returns:
            None
        """
        pathlib.Path(self.image_store_path + "/" + notice_id).mkdir(parents=True, exist_ok=True)
        save_file_name = self.make_save_file_name(notice_id, extension_file_name)
        async with aiofiles.open(save_file_name, 'wb') as f:
            await f.write(pic_content)
            utils.logger.info(f"[XiaoHongShuImageStoreImplement.save_image] save image {save_file_name} success ...")
|
Loading…
Reference in New Issue
Block a user