当前位置: 首页 > news >正文

爬取七猫中文网小说

爬取七猫中文网小说

前言:纯小白,若有什么不对之处,大家海涵海涵,大家可以在评论区雅正,其次只做分享不做商业用途,很多东西都是站在很多前辈肩上去学习的,再次恳请各位佬手下留情~

目标地址:aHR0cHM6Ly93d3cucWltYW8uY29tLw==

目标:爬取七猫 中文 网小说


场景

通过地址可跳转到网页如下

搜一本自己喜欢的书或者随便搜一本

分析步骤

点击开始阅读然后打开开发者工具点击下一章查看网络状态

 

抓包发现在我们点击下一章这一事件之后,有两个比较重要的请求:get_book 和 chapter-list?book_id

 

因为我们发现它们的载荷是相同的;同时 chapter-list?book_id 的响应部分,就是当前我们观看书籍的章节列表

从上述不难看出每一本书都有一个唯一对应的id,而我们想要爬取整篇小说的话,那就需要拿到对应小说篇章的全部id即可,而全部id就在chapter-list里面。而每本书的id包含在URL中


下载实现

代码示例:

  1.  
    from urllib.parse import urlencode, quote
  2.  
    import json
  3.  
    from lxml import etree
  4.  
    import os
  5.  
    import re
  6.  
    import requests
  7.  
     
  8.  
    def search_book_by_title(title):
  9.  
    """
  10.  
    通过书名搜索书籍ID
  11.  
    :param title: 书籍标题
  12.  
    :return: 书籍ID
  13.  
    """
  14.  
    search_url = 'https://www.qimao.com/qimaoapi/api/search/result'
  15.  
    params = {
  16.  
    "keyword": title,
  17.  
    "count": 0,
  18.  
    "page": 1,
  19.  
    "page_size": 15
  20.  
    }
  21.  
     
  22.  
    headers = {
  23.  
    "accept": "application/json, text/plain, */*",
  24.  
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
  25.  
    }
  26.  
     
  27.  
    try:
  28.  
    # 对标题进行URL编码
  29.  
    encoded_title = quote(title)
  30.  
    search_url_with_params = f"https://www.qimao.com/qimaoapi/api/search/result?keyword={encoded_title}&count=0&page=1&page_size=15"
  31.  
     
  32.  
    response = requests.get(search_url_with_params, headers=headers)
  33.  
    if response.status_code == 200:
  34.  
    data = json.loads(response.text)
  35.  
    books = data.get('data', {}).get('search_list', [])
  36.  
     
  37.  
    if books:
  38.  
    # 显示搜索结果供用户选择
  39.  
    print(f"\n找到 {len(books)} 本与 \"{title}\" 相关的书籍:")
  40.  
    print("=" * 80)
  41.  
     
  42.  
    # 按匹配度排序(参考test.py中的实现)
  43.  
    sorted_books = []
  44.  
    for i, book in enumerate(books[:10]): # 只显示前10个结果
  45.  
    book_info = {
  46.  
    'index': i+1,
  47.  
    'book_id': book.get('book_id', ''),
  48.  
    'title': book.get('title', ''),
  49.  
    'author': book.get('author', ''),
  50.  
    'intro': book.get('intro', '')[:50] + '...' if len(book.get('intro', '')) > 50 else book.get('intro', ''),
  51.  
    }
  52.  
    sorted_books.append(book_info)
  53.  
     
  54.  
    status_icon = "✅" if book.get('is_over_txt', '') == '完结' else "⏳"
  55.  
    print(f"{i+1:2d}. {status_icon} {book_info['title']} - {book_info['author']}")
  56.  
    print(f" {book_info['intro']}")
  57.  
    print(f" ID: {book_info['book_id']}")
  58.  
    print()
  59.  
     
  60.  
    # 让用户选择正确的书籍
  61.  
    try:
  62.  
    choice = input("请选择正确的书籍编号 (1-{},或按回车选择第一个): ".format(len(sorted_books)))
  63.  
    if choice.strip() == "":
  64.  
    selected_index = 1
  65.  
    else:
  66.  
    selected_index = int(choice)
  67.  
     
  68.  
    if 1 <= selected_index <= len(sorted_books):
  69.  
    selected_book = sorted_books[selected_index-1]
  70.  
    print(f"已选择: 《{selected_book['title']}》")
  71.  
    return selected_book['book_id']
  72.  
    else:
  73.  
    print("选择无效,使用第一个结果")
  74.  
    return sorted_books[0]['book_id']
  75.  
    except (ValueError, IndexError):
  76.  
    print("输入无效,使用第一个结果")
  77.  
    return sorted_books[0]['book_id']
  78.  
    else:
  79.  
    print(f"No books found for title: {title}")
  80.  
    else:
  81.  
    print(f"Search failed with status code: {response.status_code}")
  82.  
    except Exception as e:
  83.  
    print(f"Error searching book: {e}")
  84.  
     
  85.  
    return None
  86.  
     
  87.  
    def get_id(book_id):
  88.  
    start_url = 'https://www.qimao.com/api/book/chapter-list?'
  89.  
    params = {
  90.  
    "book_id": book_id
  91.  
    }
  92.  
    url = start_url + urlencode(params)
  93.  
     
  94.  
    headers = {
  95.  
    "accept": "application/json, text/plain, */*",
  96.  
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
  97.  
    }
  98.  
     
  99.  
    res = requests.get(url, headers=headers)
  100.  
    if res.status_code == 200:
  101.  
    _ = json.loads(res.text)
  102.  
    result = _['data']['chapters']
  103.  
     
  104.  
    # Get book title from the first chapter page
  105.  
    book_title = None
  106.  
    if result:
  107.  
    first_chapter = result[0]
  108.  
    # Try to get book title from first chapter
  109.  
    book_title = get_book_title_from_chapter(book_id, first_chapter['id'])
  110.  
     
  111.  
    # If we couldn't get title from chapter, use a default name
  112.  
    if not book_title:
  113.  
    book_title = f"book_{book_id}"
  114.  
     
  115.  
    # Create directory for the book
  116.  
    book_dir = create_book_directory(book_title)
  117.  
     
  118.  
    # 保存章节信息,以便后续按顺序处理
  119.  
    chapters = []
  120.  
    for r in result:
  121.  
    title = r['title']
  122.  
    txt_id = r['id']
  123.  
     
  124.  
    # 保存章节信息
  125.  
    if title and txt_id:
  126.  
    chapters.append({
  127.  
    'title': title,
  128.  
    'txt_id': txt_id
  129.  
    })
  130.  
     
  131.  
    # 按顺序下载并保存所有章节
  132.  
    for chapter in chapters:
  133.  
    save_txt(chapter['title'], chapter['txt_id'], book_id, book_dir)
  134.  
     
  135.  
    else:
  136.  
    print(res.status_code)
  137.  
     
  138.  
     
  139.  
    def get_book_title_from_chapter(book_id, chapter_id):
  140.  
    '''
  141.  
    从章节页面提取书籍标题
  142.  
    :param book_id: 书籍ID
  143.  
    :param chapter_id: 章节ID
  144.  
    :return: 书籍标题
  145.  
    '''
  146.  
    url = f'https://www.qimao.com/shuku/{book_id}-{chapter_id}/'
  147.  
    headers = {
  148.  
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  149.  
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
  150.  
    }
  151.  
     
  152.  
    try:
  153.  
    resp = requests.get(url, headers=headers)
  154.  
    if resp.status_code == 200:
  155.  
    html = etree.HTML(resp.text)
  156.  
    # Try to extract book title from page
  157.  
    title_elements = html.xpath('//div[@class="breadcrumb"]//a[last()]/text()')
  158.  
    if title_elements:
  159.  
    return title_elements[0].strip()
  160.  
     
  161.  
    # Alternative method - from book info section
  162.  
    title_elements = html.xpath('//div[@class="book-info"]//h1/text()')
  163.  
    if title_elements:
  164.  
    return title_elements[0].strip()
  165.  
    except Exception as e:
  166.  
    print(f"获取书籍标题失败: {e}")
  167.  
     
  168.  
    return None
  169.  
     
  170.  
     
  171.  
    def create_book_directory(book_title):
  172.  
    '''
  173.  
    创建书籍目录
  174.  
    :param book_title: 书籍标题
  175.  
    :return: 目录路径
  176.  
    '''
  177.  
    # 清理书名中的非法字符
  178.  
    clean_title = re.sub(r'[<>:"/\\|?*]', '', book_title)
  179.  
    book_dir = os.path.join(os.path.dirname(__file__), clean_title)
  180.  
     
  181.  
    # 创建目录
  182.  
    if not os.path.exists(book_dir):
  183.  
    os.makedirs(book_dir)
  184.  
     
  185.  
    return book_dir
  186.  
     
  187.  
     
  188.  
    def save_txt(title, txt_id, book_id, book_dir):
  189.  
    '''
  190.  
    :param title: 标题,
  191.  
    :param txt_id: 小说篇章ID
  192.  
    :param book_id: 小说ID
  193.  
    :param book_dir: 书籍目录路径
  194.  
    :return:
  195.  
    '''
  196.  
    url = f'https://www.qimao.com/shuku/{book_id}-{txt_id}/'
  197.  
    headers = {
  198.  
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
  199.  
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/142.0.0.0 Safari/537.36"
  200.  
    }
  201.  
    resp = requests.get(url, headers=headers)
  202.  
    if resp.status_code == 200:
  203.  
    html = etree.HTML(resp.text)
  204.  
     
  205.  
    divs = html.xpath('//div[@class="article"]//p')
  206.  
    # 构建文件路径
  207.  
    file_path = os.path.join(book_dir, f'{title}.txt')
  208.  
     
  209.  
    for div in divs:
  210.  
    p = div.xpath('.//text()')
  211.  
    if p:
  212.  
    # 将小说内容写入txt
  213.  
    with open(file_path, 'a', encoding='utf-8') as fp:
  214.  
    fp.write(f'{p[0]}\n')
  215.  
     
  216.  
    fp.close()
  217.  
     
  218.  
    print(f'{title} 存储文本成功')
  219.  
     
  220.  
    else:
  221.  
    print(resp.status_code)
  222.  
     
  223.  
     
  224.  
    if __name__ == '__main__':
  225.  
    # 通过输入书名的方式下载小说
  226.  
    book_title = input("请输入要下载的小说书名: ")
  227.  
     
  228.  
    if book_title:
  229.  
    book_id = search_book_by_title(book_title)
  230.  
    if book_id:
  231.  
    print(f"开始下载书籍,ID为: {book_id}")
  232.  
    get_id(book_id)
  233.  
    else:
  234.  
    print("未找到匹配的书籍")
  235.  
    else:
  236.  
    print("书名不能为空")
python 运行

 

 


 

运行代码

其实只需要携带相应书籍的ID就可以下载。但是我觉得那样麻烦,就让AI魔改了一下代码:在知道书名和作者的情况下,通过搜索书名来选择并下载,这样更方便。

 


数据存储

对于下载内容存放位置的问题就简单得多了:默认保存在代码所在目录,或者建立一个文件夹并把路径写到代码中即可;想要批量下载的话,修改代码就行。

注意哈只能下载免费的,如果要下载VIP的得携带VIP账号的cookie

 

2026-04-15 20:28:58【出处】:https://blog.csdn.net/2303_80825459/article/details/156482255

=======================================================================================