Python网络爬虫实战:从Requests到Scrapy的完整指南
Python网络爬虫实战:从Requests到Scrapy的完整指南
引言
网络爬虫是数据采集和分析的重要工具。作为从Python转向Rust的后端开发者,我发现Python的爬虫生态非常成熟,从简单的Requests到强大的Scrapy框架,能够满足各种爬虫需求。本文将从实战角度出发,深入探讨Python网络爬虫的最佳实践,帮助你构建高效、稳定的爬虫系统。
一、网络爬虫概述
1.1 爬虫类型
| 类型 | 特点 | 适用场景 |
|---|---|---|
| 静态爬虫 | 爬取静态HTML页面 | 简单网站、数据采集 |
| 动态爬虫 | 处理JavaScript渲染 | 现代SPA应用 |
| 增量爬虫 | 定期更新数据 | 新闻、博客监控 |
| 分布式爬虫 | 多节点协作 | 大规模数据采集 |
1.2 爬虫架构
┌─────────────────────────────────────────────────────┐
│                       调度层                        │
│            URL队列 → 调度器 → 请求分发              │
├─────────────────────────────────────────────────────┤
│                       抓取层                        │
│            请求模块 → 页面解析 → 数据提取           │
├─────────────────────────────────────────────────────┤
│                       存储层                        │
│            数据清洗 → 数据存储 → 数据备份           │
└─────────────────────────────────────────────────────┘
二、Requests基础爬虫
2.1 基本请求
import requests

# Fetch a single page and print the main parts of the response.
url = 'https://example.com'
# Pass a timeout explicitly: without one the call can block indefinitely
# when the server stops responding.
response = requests.get(url, timeout=10)

print(f"状态码: {response.status_code}")
print(f"响应头: {response.headers}")
# Only the first 500 characters of the body are printed.
print(f"响应内容: {response.text[:500]}")
# 2.2 请求参数
# Query-string parameters, passed to the request via `params=`.
params = {'key1': 'value1', 'key2': 'value2'}
# Custom headers sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Referer': 'https://example.com'
}
# GET request combining the parameters and headers above, with a 10 second timeout.
response = requests.get(
    'https://api.example.com/data',
    params=params,
    headers=headers,
    timeout=10
)
# 2.3 会话管理
# A Session keeps cookies and default headers across requests. Using it as
# a context manager releases its pooled connections when the block ends.
with requests.Session() as session:
    session.headers.update({'User-Agent': 'MyBot/1.0'})

    # Log in once so the session holds the authentication cookies.
    login = session.post(
        'https://example.com/login',
        data={'username': 'user', 'password': 'pass'},
        timeout=10,
    )
    # Stop here on an HTTP error instead of continuing unauthenticated.
    login.raise_for_status()

    # Later requests on the same session send the stored cookies automatically.
    response = session.get('https://example.com/dashboard', timeout=10)
# 三、BeautifulSoup解析
3.1 HTML解析
from bs4 import BeautifulSoup

html = response.text
soup = BeautifulSoup(html, 'html.parser')

# Look up the <title> tag; soup.title is None when the page has no title,
# so guard before reading .string.
title = soup.title.string if soup.title else None
print(f"页面标题: {title}")

# Find anchors that carry an href attribute and show the first five.
links = soup.find_all('a', href=True)
for link in links[:5]:
    print(f"链接: {link['href']} - {link.get_text()}")

# CSS selectors. select_one() returns None when nothing matches, so
# articles missing either child element are skipped instead of crashing.
articles = soup.select('article.post')
for article in articles:
    heading = article.select_one('h2.title')
    blurb = article.select_one('p.summary')
    if heading is None or blurb is None:
        continue
    print(f"{heading.get_text()}: {blurb.get_text()}")
# 3.2 数据提取实战
def extract_news_items(html):
    """Extract news entries from a listing page.

    Args:
        html: HTML source of the page.

    Returns:
        A list with one dict per ``div.news-item`` element, with the keys
        ``title``, ``url``, ``date`` and ``category``. A field is ``None``
        when its element (or the link's ``href``) is missing, so one
        malformed entry does not abort the whole extraction.
    """
    def _text(parent, selector):
        # select_one() returns None when the selector matches nothing.
        node = parent.select_one(selector)
        return node.get_text(strip=True) if node is not None else None

    soup = BeautifulSoup(html, 'html.parser')
    news_items = []
    for item in soup.select('div.news-item'):
        anchor = item.select_one('a')
        news_items.append({
            'title': _text(item, 'h3'),
            # .get() avoids a KeyError for anchors without an href.
            'url': anchor.get('href') if anchor is not None else None,
            'date': _text(item, 'span.date'),
            'category': _text(item, 'span.category'),
        })
    return news_items
# 四、Scrapy框架
4.1 创建项目
# Create a new Scrapy project, enter it, and generate a spider skeleton
# named "example" for the domain example.com.
scrapy startproject my_spider
cd my_spider
scrapy genspider example example.com
# 4.2 编写爬虫
import scrapy


class ExampleSpider(scrapy.Spider):
    """Spider that scrapes article entries starting from the example.com news page."""

    name = 'example'
    allowed_domains = ['example.com']
    start_urls = ['https://example.com/news']

    def parse(self, response):
        """Yield one dict per ``article.post`` element, then follow the next-page link."""
        # Output field -> CSS selector evaluated relative to each article.
        field_selectors = {
            'title': 'h2.title::text',
            'url': 'a::attr(href)',
            'summary': 'p.summary::text',
            'date': 'time::attr(datetime)',
        }
        for post in response.css('article.post'):
            yield {
                field: post.css(selector).get()
                for field, selector in field_selectors.items()
            }

        # Pagination: keep crawling while a next-page link is present.
        next_href = response.css('a.next-page::attr(href)').get()
        if next_href:
            yield response.follow(next_href, callback=self.parse)
# 4.3 配置文件
# settings.py USER_AGENT = 'MySpider/1.0 (+http://www.example.com)' ROBOTSTXT_OBEY = True DOWNLOAD_DELAY = 2 CONCURRENT_REQUESTS = 8 ITEM_PIPELINES = { 'my_spider.pipelines.MySpiderPipeline': 300, }4.4 数据管道
class MySpiderPipeline:
    """Item pipeline that trims whitespace from text fields and stores the item."""

    # Text fields whose surrounding whitespace is removed before storage.
    _TEXT_FIELDS = ('title', 'summary')

    def process_item(self, item, spider):
        """Clean ``item`` in place, store it, and return it.

        Fields that are missing or not strings are left untouched. A
        selector that matched nothing produces ``None``, and calling
        ``.strip()`` on it would raise ``AttributeError``.

        Args:
            item: Mapping of scraped fields.
            spider: The spider that produced the item (unused here).

        Returns:
            The same ``item`` object, after cleaning.
        """
        for field in self._TEXT_FIELDS:
            value = item.get(field)
            if isinstance(value, str):
                item[field] = value.strip()

        self.store_item(item)
        return item

    def store_item(self, item):
        """Persist ``item`` to a database or file; placeholder that does nothing."""
        pass
# 五、动态页面爬取
5.1 使用Selenium
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
try:
    driver.get('https://example.com/dynamic-page')

    # Wait up to 10 seconds for the element to be present in the DOM.
    wait = WebDriverWait(driver, 10)
    element = wait.until(
        EC.presence_of_element_located((By.CSS_SELECTOR, 'div.content'))
    )

    # Extract the element's text.
    content = element.text
    print(content)
finally:
    # Always shut the browser down, even when the page load or wait fails;
    # otherwise the browser process is left running.
    driver.quit()
# 5.2 使用Playwright
from playwright.sync_api import sync_playwright

with sync_playwright() as p:
    browser = p.chromium.launch(headless=True)
    try:
        page = browser.new_page()
        page.goto('https://example.com/dynamic-page')

        # Wait until the page reaches the "networkidle" load state.
        page.wait_for_load_state('networkidle')

        # Extract the heading text of every item.
        items = page.query_selector_all('div.item')
        for item in items:
            heading = item.query_selector('h3')
            # query_selector() returns None when the item has no <h3>.
            if heading is None:
                continue
            title = heading.inner_text()
            print(title)
    finally:
        # Close the browser even if navigation or extraction raised.
        browser.close()
# 六、反爬策略
6.1 请求频率控制
import random
import time


class RateLimiter:
    """Sleep for a random interval between requests.

    Args:
        min_delay: Shortest pause in seconds (int or float, >= 0).
        max_delay: Longest pause in seconds (int or float, >= min_delay).

    Raises:
        ValueError: If the bounds are negative or reversed.
    """

    def __init__(self, min_delay=1, max_delay=3):
        if min_delay < 0 or max_delay < min_delay:
            raise ValueError(
                "require 0 <= min_delay <= max_delay, "
                f"got min_delay={min_delay}, max_delay={max_delay}"
            )
        self.min_delay = min_delay
        self.max_delay = max_delay

    def wait(self):
        """Block for a random duration between ``min_delay`` and ``max_delay``."""
        # uniform() accepts float bounds. randint() requires integers, so
        # fractional delays such as 0.5 seconds could not be used with it.
        time.sleep(random.uniform(self.min_delay, self.max_delay))


rate_limiter = RateLimiter()

# Pause between requests.
rate_limiter.wait()
response = requests.get(url, timeout=10)
# 6.2 User-Agent轮换
# `random` must be imported here: random.choice is used below.
import random

# Pool of User-Agent strings to choose from.
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36',
]

# Pick one User-Agent at random for this request.
headers = {'User-Agent': random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers, timeout=10)
# 6.3 使用代理
# Proxy URL per scheme of the target URL. "proxy-server" and "port" are
# placeholders and must be replaced with a real host and numeric port.
proxies = {
    'http': 'http://proxy-server:port',
    'https': 'https://proxy-server:port'
}
# A timeout matters even more through a proxy, which may accept the
# connection and then stall.
response = requests.get(url, proxies=proxies, timeout=10)
# 七、实战:完整爬虫系统
7.1 项目结构
my_crawler/
├── crawler/
│   ├── __init__.py
│   ├── spiders/
│   │   ├── news_spider.py
│   │   └── product_spider.py
│   ├── pipelines/
│   │   └── database_pipeline.py
│   └── settings.py
├── data/
├── logs/
└── main.py
7.2 主程序
from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings

from crawler.spiders.news_spider import NewsSpider


def main():
    """Run NewsSpider in-process using the settings in ``crawler/settings.py``."""
    # A Scrapy settings.py is a module of upper-case constants, not a class
    # named Settings, so load the module into a Scrapy Settings object.
    settings = Settings()
    settings.setmodule('crawler.settings', priority='project')

    process = CrawlerProcess(settings=settings)
    process.crawl(NewsSpider)
    # start() blocks until the crawl has finished.
    process.start()


if __name__ == '__main__':
    main()
# 7.3 数据库存储
import os
import sqlite3


class DatabasePipeline:
    """Item pipeline that stores scraped news items in a SQLite database.

    Args:
        db_path: Path of the SQLite file. Defaults to ``data/crawler.db``.
    """

    # Column order used by the INSERT statement.
    _COLUMNS = ('title', 'url', 'summary', 'date', 'category')

    def __init__(self, db_path='data/crawler.db'):
        # sqlite3.connect() fails if the parent directory does not exist,
        # so create it first. Paths without a directory part are skipped.
        parent = os.path.dirname(db_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
        self.conn = sqlite3.connect(db_path)
        self.cursor = self.conn.cursor()
        self.create_table()

    def create_table(self):
        """Create the ``news`` table if it does not exist yet."""
        self.cursor.execute('''
            CREATE TABLE IF NOT EXISTS news (
                id INTEGER PRIMARY KEY AUTOINCREMENT,
                title TEXT,
                url TEXT UNIQUE,
                summary TEXT,
                date TEXT,
                category TEXT
            )
        ''')
        self.conn.commit()

    def process_item(self, item, spider):
        """Insert ``item`` into the ``news`` table and return it unchanged.

        Missing fields are stored as NULL, so an item without ``category``
        is still saved rather than failing with ``KeyError``. Rows whose
        ``url`` already exists are ignored because of the UNIQUE constraint.
        Database errors are logged through ``spider.logger`` and the item
        is still returned.
        """
        row = tuple(item.get(column) for column in self._COLUMNS)
        try:
            self.cursor.execute('''
                INSERT OR IGNORE INTO news (title, url, summary, date, category)
                VALUES (?, ?, ?, ?, ?)
            ''', row)
            self.conn.commit()
        except sqlite3.Error as e:
            # Discard the failed statement so later inserts start clean.
            self.conn.rollback()
            spider.logger.error(f"存储失败: {e}")
        return item

    def close_spider(self, spider):
        """Close the database connection when the spider finishes."""
        self.conn.close()
# 八、爬虫最佳实践
8.1 遵守robots.txt
# Check robots.txt before fetching a page.
from urllib.robotparser import RobotFileParser

user_agent = 'MyBot'
target_url = 'https://example.com/news'

# Download and parse the site's robots.txt.
rp = RobotFileParser()
rp.set_url('https://example.com/robots.txt')
rp.read()

if rp.can_fetch(user_agent, target_url):
    # Allowed. Send the same User-Agent that was checked above; otherwise
    # the robots.txt rules consulted would not be the ones that apply to
    # the request actually sent.
    response = requests.get(
        target_url,
        headers={'User-Agent': user_agent},
        timeout=10,
    )
else:
    print("该页面禁止爬取")
# 8.2 设置合理的请求头
# Request headers that identify the crawler and state which content types,
# languages and encodings it accepts.
# NOTE(review): 'br' is advertised in Accept-Encoding; confirm the HTTP
# client in use can decode Brotli responses before sending this header.
headers = {
    'User-Agent': 'MyCrawler/1.0 (+https://example.com/crawler)',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Connection': 'keep-alive',
    'Cache-Control': 'max-age=0',
}
# 8.3 错误处理
# Fetch `url` and report any request failure instead of crashing.
try:
    response = requests.get(url, timeout=10)
    # Turn HTTP error statuses into an exception so they are handled below.
    response.raise_for_status()
except requests.exceptions.RequestException as e:
    print(f"请求失败: {e}")
    # Either retry or skip this URL here.
# 九、总结
Python的爬虫生态非常强大,从简单的Requests到专业的Scrapy框架,能够满足各种数据采集需求。作为后端开发者,掌握爬虫技能不仅能够帮助我们获取数据,还能为数据分析和机器学习提供数据支持。
关键要点:
- 选择合适的工具:根据需求选择Requests、BeautifulSoup、Scrapy或Playwright
- 遵守规则:尊重网站的robots.txt和使用条款
- 反爬应对:实现请求频率控制、User-Agent轮换、代理使用
- 数据存储:合理设计数据存储方案
- 错误处理:完善的异常处理机制
从Python转向Rust后,我发现Rust的reqwest库在性能方面有很大优势,适合构建高性能的爬虫系统。
延伸阅读
- Scrapy官方文档
- Requests官方文档
- BeautifulSoup教程
- Playwright官方指南
