Python小红书数据采集终极指南:xhs工具完整使用教程
Python小红书数据采集终极指南:xhs工具完整使用教程
【免费下载链接】xhs:基于小红书 Web 端进行的请求封装。文档:https://reajason.github.io/xhs/ 项目地址:https://gitcode.com/gh_mirrors/xh/xhs
小红书作为中国领先的生活方式分享平台,蕴含着丰富的用户行为数据和内容趋势。xhs是一个专为开发者设计的Python爬虫工具,通过封装小红书Web端API接口,帮助用户快速获取公开内容数据。本文将深入解析xhs工具的核心功能、实战应用和最佳实践,为开发者提供完整的小红书数据采集解决方案。
核心特性解析:为什么选择xhs工具?
xhs工具的核心优势在于其完整的API封装和智能签名机制。通过分析xhs/core.py源码,我们可以看到工具提供了全面的接口覆盖:
1. 完整的API接口支持
xhs工具支持小红书主要的数据接口,包括:
- 笔记搜索与详情获取
- 用户信息与发布内容查询
- 多种内容类型的分类获取
- 智能签名验证机制
2. 智能签名服务架构
为了解决小红书的反爬机制,xhs工具设计了灵活的签名方案。通过example/basic_sign_server.py示例,开发者可以部署独立的签名服务,确保请求的稳定性和安全性。
3. 错误处理与重试机制
xhs/exception.py模块定义了完整的异常处理体系,包括DataFetchError、IPBlockError等,帮助开发者构建健壮的数据采集应用。
环境搭建与快速开始
安装xhs工具
pip install xhs

从源码安装最新版本:
git clone https://gitcode.com/gh_mirrors/xh/xhs
cd xhs && python setup.py install

获取必要凭证
使用xhs工具需要小红书的cookie信息,关键字段包括:
- a1:用户身份标识
- web_session:会话信息
- webId:设备标识
基础使用示例
参考example/basic_usage.py,快速开始你的第一个数据采集脚本:
from xhs import XhsClient

# 初始化客户端
client = XhsClient(cookie="your_cookie_here")

# 搜索热门笔记
results = client.search_note(
    keyword="美食探店",
    page=1,
    page_size=20
)
print(f"找到 {len(results['items'])} 条相关笔记")

实战应用场景与代码示例
场景一:市场趋势分析
import json from xhs import XhsClient, FeedType def analyze_market_trends(): client = XhsClient(cookie="your_cookie") # 获取不同类别的推荐内容 categories = [ FeedType.FOOD, # 美食 FeedType.FASION, # 穿搭 FeedType.COSMETICS, # 彩妆 FeedType.TRAVEL # 旅行 ] trends_data = {} for category in categories: try: feed = client.get_home_feed(category=category.value) trends_data[category.name] = { "count": len(feed.get("items", [])), "top_keywords": extract_keywords(feed) } except Exception as e: print(f"获取{category.name}数据失败: {e}") return trends_data def extract_keywords(feed_data): """从feed数据中提取关键词""" # 实现关键词提取逻辑 pass场景二:竞品内容监控
import schedule import time from datetime import datetime from xhs import XhsClient, help class CompetitorMonitor: def __init__(self, cookie): self.client = XhsClient(cookie) self.competitors = { "brand_a": "user_id_1", "brand_b": "user_id_2", "brand_c": "user_id_3" } def monitor_daily_posts(self): """监控竞争对手的每日发布内容""" daily_report = {} for brand, user_id in self.competitors.items(): try: user_notes = self.client.get_user_notes( user_id=user_id, page=1, page_size=10 ) daily_report[brand] = { "post_count": len(user_notes.get("notes", [])), "total_likes": sum(note.get("likes", 0) for note in user_notes.get("notes", [])), "total_collects": sum(note.get("collects", 0) for note in user_notes.get("notes", [])), "latest_post": user_notes.get("notes", [])[0] if user_notes.get("notes") else None } except Exception as e: print(f"监控{brand}失败: {e}") return daily_report def start_monitoring(self, interval_hours=6): """启动定时监控""" schedule.every(interval_hours).hours.do(self.monitor_daily_posts) while True: schedule.run_pending() time.sleep(60)场景三:内容质量评估
from xhs import XhsClient from typing import Dict, List class ContentQualityAnalyzer: def __init__(self, cookie): self.client = XhsClient(cookie) def analyze_note_quality(self, note_id: str) -> Dict: """分析笔记质量指标""" try: note_detail = self.client.get_note_by_id( note_id=note_id, xsec_token="your_token" ) # 计算互动率 likes = note_detail.get("likes", 0) collects = note_detail.get("collects", 0) comments = note_detail.get("comments", 0) shares = note_detail.get("shares", 0) # 提取内容特征 content = note_detail.get("desc", "") images = help.get_imgs_url_from_note(note_detail) quality_score = self.calculate_quality_score( likes, collects, comments, shares, len(content), len(images) ) return { "note_id": note_id, "quality_score": quality_score, "engagement_rate": (likes + collects + comments) / 1000, "content_length": len(content), "image_count": len(images), "has_video": "video" in note_detail, "publish_time": note_detail.get("time", "") } except Exception as e: print(f"分析笔记{note_id}失败: {e}") return None def calculate_quality_score(self, *args) -> float: """计算内容质量得分""" # 实现质量评分算法 pass高级功能配置与优化
签名服务部署方案
对于生产环境,建议部署独立的签名服务。参考xhs-api/app.py实现:
# Docker部署签名服务 docker run -it -d -p 5005:5005 reajason/xhs-api:latest # 客户端使用签名服务 from xhs import XhsClient import requests def remote_sign(uri, data=None, a1="", web_session=""): """远程签名函数""" response = requests.post( "http://localhost:5005/sign", json={ "uri": uri, "data": data, "a1": a1, "web_session": web_session } ) return response.json() # 初始化带远程签名的客户端 client = XhsClient( cookie="your_cookie", sign=remote_sign )性能优化策略
1. 连接池管理
import requests from requests.adapters import HTTPAdapter from xhs import XhsClient class OptimizedXhsClient: def __init__(self, cookie, max_retries=3, pool_connections=10, pool_maxsize=10): self.session = requests.Session() # 配置连接池 adapter = HTTPAdapter( pool_connections=pool_connections, pool_maxsize=pool_maxsize, max_retries=max_retries ) self.session.mount('https://', adapter) self.session.mount('http://', adapter) self.client = XhsClient(cookie, session=self.session) def get_with_retry(self, func, *args, **kwargs): """带重试机制的请求""" for attempt in range(3): try: return func(*args, **kwargs) except Exception as e: if attempt == 2: raise time.sleep(2 ** attempt) # 指数退避2. 缓存机制实现
import json import os from datetime import datetime, timedelta from functools import wraps def cached_result(cache_dir="cache", ttl_hours=24): """结果缓存装饰器""" def decorator(func): @wraps(func) def wrapper(*args, **kwargs): # 生成缓存键 cache_key = f"{func.__name__}_{hash(str(args) + str(kwargs))}" cache_file = os.path.join(cache_dir, f"{cache_key}.json") # 检查缓存是否有效 if os.path.exists(cache_file): with open(cache_file, 'r', encoding='utf-8') as f: cache_data = json.load(f) cache_time = datetime.fromisoformat(cache_data['timestamp']) if datetime.now() - cache_time < timedelta(hours=ttl_hours): return cache_data['data'] # 执行函数并缓存结果 result = func(*args, **kwargs) os.makedirs(cache_dir, exist_ok=True) cache_data = { 'timestamp': datetime.now().isoformat(), 'data': result } with open(cache_file, 'w', encoding='utf-8') as f: json.dump(cache_data, f, ensure_ascii=False, indent=2) return result return wrapper return decorator # 使用缓存 @cached_result(cache_dir="note_cache", ttl_hours=12) def get_note_with_cache(client, note_id, xsec_token): """带缓存的笔记获取""" return client.get_note_by_id(note_id, xsec_token)3. 并发处理优化
import concurrent.futures from typing import List, Dict class BatchProcessor: def __init__(self, client, max_workers=5): self.client = client self.max_workers = max_workers def batch_get_notes(self, note_ids: List[str], xsec_tokens: List[str]) -> List[Dict]: """批量获取笔记信息""" results = [] with concurrent.futures.ThreadPoolExecutor(max_workers=self.max_workers) as executor: # 创建任务映射 future_to_note = { executor.submit( self.client.get_note_by_id, note_id, token ): (note_id, token) for note_id, token in zip(note_ids, xsec_tokens) } # 收集结果 for future in concurrent.futures.as_completed(future_to_note): note_id, token = future_to_note[future] try: result = future.result() results.append(result) except Exception as e: print(f"获取笔记{note_id}失败: {e}") results.append({"note_id": note_id, "error": str(e)}) return results错误处理与监控
异常处理最佳实践
from xhs.exception import DataFetchError, IPBlockError, NeedVerifyError, SignError import time import random class RobustXhsClient: def __init__(self, cookie, sign_func=None): self.client = XhsClient(cookie, sign=sign_func) self.request_count = 0 self.last_request_time = time.time() def safe_request(self, api_func, max_retries=3, delay_range=(2, 5)): """安全的API请求封装""" for attempt in range(max_retries): try: # 控制请求频率 current_time = time.time() time_since_last = current_time - self.last_request_time if time_since_last < 1.0: # 至少1秒间隔 time.sleep(1.0 - time_since_last) result = api_func() self.request_count += 1 self.last_request_time = time.time() return result except DataFetchError as e: print(f"数据获取失败 (尝试 {attempt + 1}/{max_retries}): {e}") if attempt < max_retries - 1: wait_time = random.uniform(*delay_range) time.sleep(wait_time) except IPBlockError: print("检测到IP限制,建议更换IP或等待一段时间") break except SignError: print("签名失败,检查签名服务配置") break except NeedVerifyError: print("需要验证码验证") break except Exception as e: print(f"未知错误: {e}") if attempt < max_retries - 1: time.sleep(random.uniform(*delay_range)) return None def get_note_safe(self, note_id, xsec_token, max_retries=3): """安全的获取笔记信息""" return self.safe_request( lambda: self.client.get_note_by_id(note_id, xsec_token), max_retries=max_retries )监控与日志记录
import logging from logging.handlers import RotatingFileHandler from datetime import datetime def setup_logging(): """配置日志系统""" logger = logging.getLogger('xhs_client') logger.setLevel(logging.INFO) # 文件处理器 file_handler = RotatingFileHandler( 'xhs_client.log', maxBytes=10*1024*1024, # 10MB backupCount=5 ) file_handler.setLevel(logging.INFO) # 控制台处理器 console_handler = logging.StreamHandler() console_handler.setLevel(logging.WARNING) # 格式器 formatter = logging.Formatter( '%(asctime)s - %(name)s - %(levelname)s - %(message)s' ) file_handler.setFormatter(formatter) console_handler.setFormatter(formatter) logger.addHandler(file_handler) logger.addHandler(console_handler) return logger class MonitoredXhsClient: def __init__(self, cookie, logger=None): self.client = XhsClient(cookie) self.logger = logger or setup_logging() self.metrics = { 'total_requests': 0, 'successful_requests': 0, 'failed_requests': 0, 'start_time': datetime.now() } def get_with_monitoring(self, api_func, *args, **kwargs): """带监控的API调用""" self.metrics['total_requests'] += 1 try: result = api_func(*args, **kwargs) self.metrics['successful_requests'] += 1 self.logger.info(f"API调用成功: {api_func.__name__}") return result except Exception as e: self.metrics['failed_requests'] += 1 self.logger.error(f"API调用失败: {api_func.__name__}, 错误: {str(e)}") raise def get_metrics(self): """获取监控指标""" current_time = datetime.now() runtime = (current_time - self.metrics['start_time']).total_seconds() return { **self.metrics, 'runtime_seconds': runtime, 'requests_per_second': self.metrics['total_requests'] / runtime if runtime > 0 else 0, 'success_rate': self.metrics['successful_requests'] / self.metrics['total_requests'] if self.metrics['total_requests'] > 0 else 0 }数据存储与处理建议
数据库设计示例
import sqlite3 from datetime import datetime import json class XhsDataStorage: def __init__(self, db_path='xhs_data.db'): self.conn = sqlite3.connect(db_path) self.create_tables() def create_tables(self): """创建数据表""" cursor = self.conn.cursor() # 笔记表 cursor.execute(''' CREATE TABLE IF NOT EXISTS notes ( note_id TEXT PRIMARY KEY, user_id TEXT, title TEXT, content TEXT, likes INTEGER, collects INTEGER, comments INTEGER, shares INTEGER, image_count INTEGER, has_video BOOLEAN, publish_time TIMESTAMP, category TEXT, tags TEXT, -- JSON数组 raw_data TEXT, -- 原始JSON数据 created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) ''') # 用户表 cursor.execute(''' CREATE TABLE IF NOT EXISTS users ( user_id TEXT PRIMARY KEY, nickname TEXT, avatar_url TEXT, followers INTEGER, following INTEGER, notes_count INTEGER, likes_count INTEGER, raw_data TEXT, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP, updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) ''') # 搜索记录表 cursor.execute(''' CREATE TABLE IF NOT EXISTS search_records ( id INTEGER PRIMARY KEY AUTOINCREMENT, keyword TEXT, sort_type TEXT, page INTEGER, page_size INTEGER, total_results INTEGER, search_time TIMESTAMP, created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP ) ''') self.conn.commit() def save_note(self, note_data): """保存笔记数据""" cursor = self.conn.cursor() cursor.execute(''' INSERT OR REPLACE INTO notes (note_id, user_id, title, content, likes, collects, comments, shares, image_count, has_video, publish_time, category, tags, raw_data, updated_at) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?) 
''', ( note_data.get('note_id'), note_data.get('user_id'), note_data.get('title', ''), note_data.get('desc', ''), note_data.get('likes', 0), note_data.get('collects', 0), note_data.get('comments', 0), note_data.get('shares', 0), len(note_data.get('images', [])), 'video' in note_data, note_data.get('time'), note_data.get('category', ''), json.dumps(note_data.get('tags', []), ensure_ascii=False), json.dumps(note_data, ensure_ascii=False), datetime.now().isoformat() )) self.conn.commit() def get_notes_by_category(self, category, limit=100): """按分类获取笔记""" cursor = self.conn.cursor() cursor.execute(''' SELECT * FROM notes WHERE category = ? ORDER BY publish_time DESC LIMIT ? ''', (category, limit)) columns = [desc[0] for desc in cursor.description] return [dict(zip(columns, row)) for row in cursor.fetchall()] def close(self): """关闭数据库连接""" self.conn.close()最佳实践与注意事项
1. 合规使用原则
- 仅采集公开数据:不要尝试获取非公开的用户信息
- 控制请求频率:建议每次请求间隔2-5秒,避免对服务器造成压力
- 尊重版权:合理使用采集到的内容,遵守平台使用条款
- 数据存储安全:加密存储敏感信息,限制数据访问权限
2. 性能优化建议
- 使用连接池:减少TCP连接建立开销
- 实现缓存机制:减少重复请求
- 批量处理:使用并发提高效率
- 错误重试:实现指数退避重试策略
3. 监控与维护
- 日志记录:详细记录API调用和错误信息
- 性能监控:跟踪请求成功率、响应时间等指标
- 定期更新:关注xhs工具的更新,及时升级版本
- 数据备份:定期备份采集的数据
4. 常见问题解决
- 签名失败:检查cookie中的a1字段是否与签名服务一致
- IP限制:降低请求频率或更换IP地址
- 数据获取失败:检查网络连接和cookie有效性
- 内存泄漏:定期清理缓存和连接池
总结与展望
xhs工具为开发者提供了一个强大而灵活的小红书数据采集解决方案。通过本文的介绍,你应该已经掌握了:
- 核心功能:完整的API封装和智能签名机制
- 实战应用:市场分析、竞品监控、内容评估等场景
- 高级配置:签名服务部署和性能优化策略
- 最佳实践:错误处理、数据存储和合规使用
随着小红书平台的不断更新,xhs工具也在持续演进。建议开发者:
- 关注项目的GitHub仓库获取最新更新
- 参与社区讨论分享使用经验
- 根据实际需求定制化开发
- 遵守平台规则和法律法规
通过合理使用xhs工具,你可以高效地获取小红书公开数据,为业务决策和产品开发提供有力支持。记住,技术是工具,合理使用才能发挥最大价值。
【免费下载链接】xhs:基于小红书 Web 端进行的请求封装。文档:https://reajason.github.io/xhs/ 项目地址:https://gitcode.com/gh_mirrors/xh/xhs
创作声明:本文部分内容由AI辅助生成(AIGC),仅供参考
