当前位置: 首页 > news >正文

python实现信件详细信息爬取

以下是任务要求:
(1) 采集北京市政百姓信件内容;
(2) 编写 MapReduce 程序清洗信件内容数据;
(3) 利用 HiveSql 语句离线分析信件内容数据;
(4) 利用 Sqoop 导出 Hive 分析数据到 MySQL 库;
(5) 开发 JavaWeb+ECharts 完成信件数据图表展示过程。
现在我做到了python实现数据爬取,现在卡在了(4),我暂时不知道Sqoop是什么,后面的部分等我完成再说,以下是我当前的源码:

batch_letter_crawler.py

import requests
from bs4 import BeautifulSoup
import re
import time
import csv

def get_page_content(url, page_no):
    """Fetch one page of the letter list via POST.

    Args:
        url: The list endpoint URL.
        page_no: 1-based page index, sent as form data.

    Returns:
        The response body decoded as UTF-8, or None on any request error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Form fields expected by the site's server-side pager.
    data = {
        'page.pageNo': page_no,
        'page.pageSize': '6'
    }
    try:
        # Explicit timeout: without it a stalled connection hangs the crawler forever.
        response = requests.post(url, headers=headers, data=data, timeout=30)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取第{page_no}页内容失败: {e}")
        return None

def parse_letter_list(html_content):
    """Parse a list page and extract each letter's basic info and originalId.

    Args:
        html_content: Raw HTML of a letter-list page.

    Returns:
        A list of dicts with keys '名称', '来信时间', '回复时间' and
        'originalId' (empty string when the id cannot be extracted).
    """
    letters = []
    soup = BeautifulSoup(html_content, 'html.parser')

    # Each letter entry is one Bootstrap row carrying this class combination.
    letter_items = soup.select('div.row.clearfix.my-2.list-group.o-border-bottom2.p-3')
    for item in letter_items:
        try:
            # The title link's onclick attribute carries the originalId,
            # e.g. letterdetail('123', 'AH25122401333').
            letter_name_div = item.select_one('div.o-font3.col-md-7.pb-3 a')
            if letter_name_div:
                letter_name = letter_name_div.text.strip()
                onclick_attr = letter_name_div.get('onclick', '')
                match = re.search(r"letterdetail\('\d+'\s*,\s*'([^']+)'\)", onclick_attr)
                original_id = match.group(1) if match else ""
            else:
                letter_name = "未找到名称"
                original_id = ""

            # Sent and reply timestamps share a div class; distinguish them
            # by their label prefix (note the full-width colon used on the page).
            sent_time = ""
            reply_time = ""
            for time_div in item.select('div.col-md-5 div.o-font2'):
                text = time_div.text.strip()
                if '来信时间:' in text:
                    sent_time = text.replace('来信时间:', '').strip()
                elif '回复时间:' in text:
                    reply_time = text.replace('回复时间:', '').strip()

            letters.append({
                '名称': letter_name,
                '来信时间': sent_time,
                '回复时间': reply_time,
                'originalId': original_id
            })
        except Exception as e:
            # Skip a malformed entry but keep processing the rest of the page.
            print(f"解析信件列表失败: {e}")
            continue
    return letters

def get_letter_detail(original_id):
    """Fetch the detail page HTML for a single letter by its originalId.

    Args:
        original_id: Id extracted from the list page's onclick handler.

    Returns:
        The detail page HTML as UTF-8 text, or None when the id is empty
        or the request fails.
    """
    if not original_id:
        return None

    url = f'https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId={original_id}'
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Explicit timeout so one dead request cannot stall the whole crawl.
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取信件 {original_id} 详情失败: {e}")
        return None

def parse_letter_detail(html_content):
    """Parse a letter detail page into a dict of content and reply fields.

    Args:
        html_content: Raw HTML of a letter detail page.

    Returns:
        Dict that may contain '信件内容', '回复单位', '回复时间', '回复内容';
        fields that cannot be located get a "未找到..." placeholder. Reply
        fields are absent entirely when no reply section exists.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    letter_detail = {}

    try:
        # The letter body text.
        content_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
        letter_detail['信件内容'] = content_div.text.strip() if content_div else "未找到信件内容"

        # The reply section is the bordered row immediately following the letter.
        reply_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 + div.row.clearfix.my-5.o-border.p-2')
        if reply_div:
            # Replying department name.
            reply_unit_div = reply_div.select_one('div.col-xs-9.col-sm-7.col-md-5.o-font4.my-2 strong')
            letter_detail['回复单位'] = reply_unit_div.text.strip() if reply_unit_div else "未找到回复单位"

            # Reply timestamp; the page labels it with a full-width colon.
            reply_time_div = reply_div.select_one('div.col-xs-12.col-sm-12.col-md-12.nmx-2.my-2.text-muted')
            letter_detail['回复时间'] = reply_time_div.text.replace('答复时间:', '').strip() if reply_time_div else "未找到回复时间"

            # Reply body text.
            reply_content_div = reply_div.select_one('div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
            letter_detail['回复内容'] = reply_content_div.text.strip() if reply_content_div else "未找到回复内容"
    except Exception as e:
        print(f"解析信件详情失败: {e}")
    return letter_detail

def output_letters(all_letters, output_file='all_letters_detail.csv'):
    """Write all letter records to a UTF-8 CSV file.

    Args:
        all_letters: List of dicts keyed by the Chinese field names below;
            missing fields are written as empty strings.
        output_file: Destination CSV path.
    """
    # CSV column order; '序号' is a generated 1-based row number.
    fieldnames = ['序号', '名称', '来信时间', '回复时间', 'originalId', '信件内容', '回复单位', '回复内容']

    # Hoisted out of the row loop (the original re-defined it per iteration):
    # collapse runs of spaces/newlines/tabs into single spaces.
    def clean_text(text):
        if text:
            return ' '.join(text.split())
        return ''

    with open(output_file, 'w', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for i, letter in enumerate(all_letters, 1):
            writer.writerow({
                '序号': i,
                '名称': clean_text(letter.get('名称', '')),
                '来信时间': clean_text(letter.get('来信时间', '')),
                '回复时间': clean_text(letter.get('回复时间', '')),
                'originalId': clean_text(letter.get('originalId', '')),
                '信件内容': clean_text(letter.get('信件内容', '')),
                '回复单位': clean_text(letter.get('回复单位', '')),
                '回复内容': clean_text(letter.get('回复内容', ''))
            })
    print(f"所有信件详细信息已输出到 {output_file}")
    print(f"共输出 {len(all_letters)} 条记录")

def main():
    """Crawl every list page, then each letter's detail page, and export CSV."""
    list_url = 'https://www.beijing.gov.cn/hudong/hdjl/sindex/hdjl-xjxd.html'
    all_letters = []
    total_pages = 44  # total page count observed on the site's pager

    # Step 1: collect basic info plus originalId from every list page.
    print("开始爬取信件列表...")
    for page_no in range(1, total_pages + 1):
        print(f"正在爬取第 {page_no}/{total_pages} 页列表...")
        html_content = get_page_content(list_url, page_no)
        if html_content:
            letters = parse_letter_list(html_content)
            all_letters.extend(letters)
    print(f"爬取完成,共获取到 {len(all_letters)} 封信件列表信息")

    # Step 2: fetch the detail page of every letter that has an id.
    print("\n开始爬取信件详细信息...")
    for i, letter in enumerate(all_letters, 1):
        original_id = letter.get('originalId', '')
        if original_id:
            print(f"正在爬取第 {i}/{len(all_letters)} 封信件的详细信息 (ID: {original_id})...")
            detail_html = get_letter_detail(original_id)
            if detail_html:
                detail_info = parse_letter_detail(detail_html)
                # Merge the detail fields into the list record in place.
                letter.update(detail_info)
            # Throttle requests to avoid hammering the server.
            time.sleep(1)
        else:
            print(f"第 {i}/{len(all_letters)} 封信件没有originalId,跳过...")

    # Step 3: export everything to CSV.
    output_letters(all_letters)
    print("\n所有信件信息爬取完成!")

# Fixed: the scraped source had `if name == "main"`, which raises NameError.
if __name__ == "__main__":
    main()

letter_crawler.py

import requests
from bs4 import BeautifulSoup
import re

def get_page_content(url, page_no):
    """Fetch one page of the letter list via POST.

    Args:
        url: The list endpoint URL.
        page_no: 1-based page index, sent as form data.

    Returns:
        The response body decoded as UTF-8, or None on any request error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    # Form fields expected by the site's server-side pager.
    data = {
        'page.pageNo': page_no,
        'page.pageSize': '6'
    }
    try:
        # Explicit timeout: without it a stalled connection hangs the crawler forever.
        response = requests.post(url, headers=headers, data=data, timeout=30)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取第{page_no}页内容失败: {e}")
        return None

def parse_letters(html_content):
    """Parse a list page and extract each letter's basic info.

    Args:
        html_content: Raw HTML of a letter-list page.

    Returns:
        A list of dicts with keys '名称', '来信时间', '回复时间' and
        'originalId' (empty string when the id cannot be extracted).
    """
    letters = []
    soup = BeautifulSoup(html_content, 'html.parser')

    # Each letter entry is one Bootstrap row carrying this class combination.
    letter_items = soup.select('div.row.clearfix.my-2.list-group.o-border-bottom2.p-3')
    for item in letter_items:
        try:
            # The title link's onclick attribute carries the originalId,
            # e.g. letterdetail('123', 'AH25122401333').
            # (Dropped the redundant in-function `import re`; the module
            # already imports re at the top.)
            letter_name_div = item.select_one('div.o-font3.col-md-7.pb-3 a')
            if letter_name_div:
                letter_name = letter_name_div.text.strip()
                onclick_attr = letter_name_div.get('onclick', '')
                match = re.search(r"letterdetail\('\d+'\s*,\s*'([^']+)'\)", onclick_attr)
                original_id = match.group(1) if match else ""
            else:
                letter_name = "未找到名称"
                original_id = ""

            # Sent and reply timestamps share a div class; distinguish them
            # by their label prefix (full-width colon as rendered on the page).
            sent_time = ""
            reply_time = ""
            for time_div in item.select('div.col-md-5 div.o-font2'):
                text = time_div.text.strip()
                if '来信时间:' in text:
                    sent_time = text.replace('来信时间:', '').strip()
                elif '回复时间:' in text:
                    reply_time = text.replace('回复时间:', '').strip()

            letters.append({
                '名称': letter_name,
                '来信时间': sent_time,
                '回复时间': reply_time,
                'originalId': original_id
            })
        except Exception as e:
            # Skip a malformed entry but keep processing the rest of the page.
            print(f"解析信件失败: {e}")
            continue
    return letters

def output_letters(letters, output_file='letters_info.txt'):
    """Dump each letter's name and timestamps to a plain-text report file.

    Args:
        letters: List of dicts with keys '名称', '来信时间', '回复时间'.
        output_file: Destination text file path (UTF-8).
    """
    divider = "-" * 50
    with open(output_file, 'w', encoding='utf-8') as fh:
        for index, record in enumerate(letters, start=1):
            block = (
                f"信件 {index}:\n"
                f" 名称: {record['名称']}\n"
                f" 来信时间: {record['来信时间']}\n"
                f" 回复时间: {record['回复时间']}\n"
                f"{divider}\n"
            )
            fh.write(block)
    print(f"信件信息已输出到 {output_file}")

def main():
    """Crawl every list page and write the basic letter info to a text file."""
    url = 'https://www.beijing.gov.cn/hudong/hdjl/sindex/hdjl-xjxd.html'
    all_letters = []
    total_pages = 44  # total page count read from the site's pager

    print("开始爬取信件信息...")
    for page_no in range(1, total_pages + 1):
        print(f"正在爬取第 {page_no}/{total_pages} 页...")
        html_content = get_page_content(url, page_no)
        if html_content:
            letters = parse_letters(html_content)
            all_letters.extend(letters)
    print(f"爬取完成,共获取到 {len(all_letters)} 封信件信息")
    output_letters(all_letters)

# Fixed: the scraped source had `if name == "main"`, which raises NameError.
if __name__ == "__main__":
    main()

letter_detail_crawler.py

import requests
from bs4 import BeautifulSoup

def get_letter_detail(url):
    """Fetch the HTML of a letter detail page.

    Args:
        url: Full detail-page URL (including the originalId query param).

    Returns:
        The response body decoded as UTF-8, or None on any request error.
    """
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
    }
    try:
        # Explicit timeout so a stalled connection cannot hang the script.
        response = requests.get(url, headers=headers, timeout=30)
        response.encoding = 'utf-8'
        return response.text
    except Exception as e:
        print(f"获取信件详情失败: {e}")
        return None

def parse_letter_detail(html_content):
    """Parse a letter detail page into a dict of all its fields.

    Args:
        html_content: Raw HTML of a letter detail page.

    Returns:
        Dict that may contain '标题', '来信人', '来信时间', '信件内容',
        '回复单位', '回复时间', '回复内容'; fields that cannot be located
        get a "未找到..." placeholder. Sender fields are absent when the
        info block is missing, reply fields when no reply section exists.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    letter_detail = {}

    try:
        # Letter title (bold heading inside the first bordered row).
        title_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 div strong')
        letter_detail['标题'] = title_div.text.strip() if title_div else "未找到标题"

        # Sender name and submission time block.
        letter_info_div = soup.select_one('div.col-xs-12.col-md-12.column.my-3')
        if letter_info_div:
            sender_div = letter_info_div.select_one('div.col-xs-10.col-lg-3.col-sm-3.col-md-4.text-muted')
            letter_detail['来信人'] = sender_div.text.replace('来信人:', '').strip() if sender_div else "未找到来信人"

            time_div = letter_info_div.select_one('div.col-xs-5.col-lg-3.col-sm-3.col-md-3.text-muted')
            letter_detail['来信时间'] = time_div.text.replace('时间:', '').strip() if time_div else "未找到来信时间"

        # The letter body text.
        content_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
        letter_detail['信件内容'] = content_div.text.strip() if content_div else "未找到信件内容"

        # The reply section is the bordered row immediately following the letter.
        reply_div = soup.select_one('div.row.clearfix.my-5.o-border.p-2 + div.row.clearfix.my-5.o-border.p-2')
        if reply_div:
            reply_unit_div = reply_div.select_one('div.col-xs-9.col-sm-7.col-md-5.o-font4.my-2 strong')
            letter_detail['回复单位'] = reply_unit_div.text.strip() if reply_unit_div else "未找到回复单位"

            reply_time_div = reply_div.select_one('div.col-xs-12.col-sm-12.col-md-12.nmx-2.my-2.text-muted')
            letter_detail['回复时间'] = reply_time_div.text.replace('答复时间:', '').strip() if reply_time_div else "未找到回复时间"

            reply_content_div = reply_div.select_one('div.col-xs-12.col-md-12.column.p-2.text-muted.text-format.mx-2')
            letter_detail['回复内容'] = reply_content_div.text.strip() if reply_content_div else "未找到回复内容"
    except Exception as e:
        print(f"解析信件详情失败: {e}")
    return letter_detail

def output_letter_detail(letter_detail, output_file='letter_detail.txt'):
    """Write the detail dict as 'key: value' lines under a fixed header.

    Args:
        letter_detail: Dict of field name -> field text.
        output_file: Destination text file path (UTF-8).
    """
    lines = ["信件详情信息\n", "=" * 50 + "\n"]
    lines.extend(f"{key}: {value}\n" for key, value in letter_detail.items())
    with open(output_file, 'w', encoding='utf-8') as fh:
        fh.writelines(lines)
    print(f"信件详情已输出到 {output_file}")

def main():
    """Fetch, print, and save the detail of one hard-coded letter."""
    # Sample detail page used to verify the parser end to end.
    url = 'https://www.beijing.gov.cn/hudong/hdjl/com.web.consult.consultDetail.flow?originalId=AH25122401333'

    print("开始爬取信件详情...")
    html_content = get_letter_detail(url)
    if html_content:
        letter_detail = parse_letter_detail(html_content)
        if letter_detail:
            print("爬取成功,信件详情:")
            for key, value in letter_detail.items():
                print(f"{key}: {value}")
            output_letter_detail(letter_detail)
        else:
            print("未找到信件详情")
    else:
        print("获取页面失败")

# Fixed: the scraped source had `if name == "main"`, which raises NameError.
if __name__ == "__main__":
    main()

http://www.jsqmd.com/news/259660/

相关文章:

  • 拒绝“PPT 造芯”,边缘 AI 芯片 IP 厂商 Quadric 拿下 3000 万美元 C 轮
  • 基于Java+SpringBoot+Vue的大学生房屋租赁系统【附源码+文档+部署视频+讲解】Python,Django,php,Flask,node.js,SSM,JSP,微信小程序,大数据技术
  • 计算机Java毕设实战-基于Javaspringboot的博客系统基于springboot的博客系统【完整源码+LW+部署说明+演示视频,全bao一条龙等】
  • 基于Java+SpringBoot+Vue的城市花园维修小区管理【附源码+文档+部署视频+讲解】Python,Django,php,Flask,node.js,SSM,JSP,微信小程序,大数据技术
  • Java毕设选题推荐:基于vue的博客分享发布系统基于springboot的博客系统【附源码、mysql、文档、调试+代码讲解+全bao等】
  • 集体好奇心如何提升团队适应能力
  • 【计算机毕业设计案例】基于python-CNN卷神经网络训练识别手势方向
  • 详细介绍:Java 中 NIO 和IO 的区别
  • LVGL 双缓冲机制深入技术讲解
  • LeeCode_693. 交替位二进制数
  • java的AES加密算法和RSA非对称加密算法
  • 图的基本概念
  • 物联网数据中台建设方法论与实践
  • 探寻不锈钢管板好货源?2026年国内厂家推荐,高温合金法兰/压力容器法兰/非标法兰/双相钢法兰,不锈钢管板公司有哪些 - 品牌推荐师
  • java-ssm324医院预约挂号系统vue问诊 失信 投诉-springboot
  • 一篇文章看懂 spring-boot-starter-web 的 POM 配置与 compile 作用域
  • 深度学习毕设项目推荐-基于python-CNN卷积神经网络训练识别不同颜色的裤子识别
  • 2026年目前服务好的双相钢法兰供应商选哪家,不锈钢法兰/双相钢法兰/非标法兰/变压器法兰,双相钢法兰直销厂家排行 - 品牌推荐师
  • Maven 依赖作用域实战避坑指南
  • 2026年目前做得好的变压器法兰品牌有哪些,不锈钢管板/压力容器法兰/不锈钢法兰/法兰/船用法兰,变压器法兰厂家推荐 - 品牌推荐师
  • 深度学习毕设项目推荐-基于python-CNN-pytorch训练识别苹果树叶病害识别
  • 企业估值中的可穿戴设备市场评估
  • 10 分钟使用 OrchardCore 快速构建 .NET 内容管理系统(CMS)
  • 基于微信小程序的宠物寄领养系统(源码+论文+部署+安装)
  • 深度学习毕设项目推荐-基于python-CNN深度学习训练识别手势方向
  • C# 的小惊喜:ValueTuple,让多返回值更优雅,性能更强
  • 聚焦不锈钢管板:国内生产技术成熟的厂家一览,变压器法兰/压力容器法兰/双相钢法兰/不锈钢法兰,不锈钢管板公司哪个好 - 品牌推荐师
  • 2026年市场评价好的锻件源头厂家哪家权威,法兰/双相钢法兰/非标法兰/船用法兰/变压器法兰,锻件供应商找哪家 - 品牌推荐师
  • .NET + Vue 3 全栈开发:基于 YOLO 的AI图像识别平台实践
  • 2026年行业内可靠的不锈钢法兰厂商排行,非标法兰/不锈钢法兰/不锈钢管板/锻件/法兰,不锈钢法兰品牌怎么选择 - 品牌推荐师