当前位置：首页 > news >正文

Qianfan-OCR实操手册：批量处理脚本编写与OCR结果去重/合并/校验逻辑

news 2026/6/22 0:57:58

Qianfan-OCR实操手册：批量处理脚本编写与OCR结果去重/合并/校验逻辑

1. 项目概述

Qianfan-OCR是百度千帆推出的开源文档智能多模态模型，基于4B参数的端到端架构设计。相比传统OCR方案，它集成了文字识别、版面分析和文档理解三大核心功能，大幅简化了文档处理流程。

核心优势：

一体化处理：单模型完成传统OCR流水线多个步骤
智能分析：支持布局识别和结构化信息提取
开源商用：Apache 2.0协议，可自由商用和二次开发
多语言支持：覆盖主流语言的文字识别需求

2. 环境准备

2.1 基础环境配置

确保已安装以下组件：

Conda环境（推荐使用torch28）
Python 3.11
CUDA 11.7+（GPU加速）
至少16GB显存（推荐24GB以上）

2.2 服务启动

# Enter the project directory
cd /root/Qianfan-OCR

# Start the OCR service (managed by supervisor)
supervisorctl start qianfan-ocr

服务启动后，可通过http://localhost:7860访问Web界面。

3. 批量处理脚本编写

3.1 基础批量处理脚本

以下Python脚本实现了图片目录的批量OCR处理：

import json
import os

import requests
from tqdm import tqdm


def batch_ocr(image_dir, output_file, endpoint="http://localhost:7860/api/ocr"):
    """Run OCR over every image in a directory and save the results as JSON.

    Each ``.png`` / ``.jpg`` / ``.jpeg`` file (case-insensitive) directly inside
    ``image_dir`` is posted to ``endpoint`` as the multipart field ``image``.

    :param image_dir: path of the directory that holds the images
    :param output_file: path of the JSON file the results are written to
    :param endpoint: URL of the OCR API endpoint
    :return: dict mapping file name to the recognised text, or to an
        ``"Error: ..."`` / ``"Exception: ..."`` message when that file failed
    """
    results = {}
    # Sort so the processing order (and the key order in the JSON output) is
    # reproducible; os.listdir() returns entries in arbitrary order.
    image_files = sorted(
        name for name in os.listdir(image_dir)
        if name.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    for img_file in tqdm(image_files, desc="Processing images"):
        img_path = os.path.join(image_dir, img_file)
        # Best-effort batch: a failure on one image is recorded in the results
        # and must not abort the remaining images.
        try:
            with open(img_path, 'rb') as f:
                response = requests.post(endpoint, files={'image': f})
            if response.status_code == 200:
                results[img_file] = response.json()['text']
            else:
                results[img_file] = f"Error: {response.text}"
        except Exception as e:
            results[img_file] = f"Exception: {str(e)}"

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    return results

3.2 带布局分析的增强脚本

启用Layout-as-Thought模式获取结构化结果：

def batch_ocr_with_layout(image_dir, output_file):
    """Run OCR with layout analysis over a directory of images.

    Each ``.png`` / ``.jpg`` / ``.jpeg`` file (case-insensitive) directly inside
    ``image_dir`` is posted to the local OCR endpoint with ``layout=true``.

    :param image_dir: path of the directory that holds the images
    :param output_file: path of the JSON file the results are written to
    :return: dict mapping file name to the decoded JSON response, or to
        ``{'error': message}`` when the request failed
    """
    # Imported locally because this snippet carries no import header of its own.
    import json

    results = {}
    for img_file in os.listdir(image_dir):
        if not img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
            continue

        img_path = os.path.join(image_dir, img_file)
        # Best-effort batch: record the failure and carry on with the next file.
        try:
            with open(img_path, 'rb') as f:
                response = requests.post(
                    "http://localhost:7860/api/ocr",
                    files={'image': f},
                    data={'layout': 'true'},  # enable layout analysis
                )
            if response.status_code == 200:
                results[img_file] = response.json()
            else:
                # Mark HTTP failures explicitly so callers that test for the
                # 'error' key do not mistake them for OCR output.
                results[img_file] = {'error': response.text}
        except Exception as e:
            results[img_file] = {'error': str(e)}

    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(results, f, ensure_ascii=False, indent=2)

    # Callers consume the results directly, so hand them back as well.
    return results

4. OCR结果后处理

4.1 文本去重算法

针对重复出现的文本内容，实现智能去重：

import re
from collections import defaultdict


def deduplicate_texts(text_list, threshold=0.9):
    """Remove near-duplicate texts while preserving first occurrences.

    Texts are first bucketed by a coarse fingerprint (lower-cased, whitespace
    and non-word characters removed, first 20 characters). Inside a bucket a
    text is kept only if its edit-distance similarity to every text already
    kept from that bucket is below ``threshold``.

    :param text_list: list of texts to process; blank texts are dropped
    :param threshold: similarity threshold in the range 0-1
    :return: list of de-duplicated texts
    """
    text_groups = defaultdict(list)

    # Coarse pass: only texts sharing a fingerprint prefix are ever compared.
    for text in text_list:
        if not text.strip():
            continue
        fingerprint = re.sub(r'[\s\W]', '', text.lower())
        text_groups[fingerprint[:20]].append(text)

    # Fine pass: compare against every kept text, not just the first one in
    # the bucket. Otherwise two identical texts that both differ from the
    # first one would both survive.
    unique_texts = []
    for group in text_groups.values():
        kept = []
        for text in group:
            if all(_similarity(text, other) < threshold for other in kept):
                kept.append(text)
        unique_texts.extend(kept)

    return unique_texts


def _similarity(a, b):
    """Return 1 - normalised edit distance for two non-empty strings."""
    return 1 - (levenshtein_distance(a, b) / max(len(a), len(b)))


def levenshtein_distance(s1, s2):
    """Return the edit distance (insert / delete / substitute) of two strings."""
    # Keep the shorter string on the inner loop so each row is as short as possible.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    if not s2:
        return len(s1)

    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

4.2 多页文档合并

处理扫描版多页文档的合并逻辑：

def merge_document_pages(page_results, merge_strategy='sequential'):
    """Merge per-page OCR text into a single document string.

    :param page_results: mapping of page key -> page text (the code iterates
        it through ``keys()`` / ``values()`` / ``items()``)
    :param merge_strategy: ``'sequential'`` or ``'smart'``
    :return: the merged text with surrounding whitespace stripped; an empty
        string when ``merge_strategy`` is neither of the two known values
    """
    pieces = []

    if merge_strategy == 'sequential':
        # Plain concatenation in sorted page order, with a marker per page.
        pieces = [
            f"\n\n--- Page {page} ---\n{page_results[page]}"
            for page in sorted(page_results)
        ]
    elif merge_strategy == 'smart':
        # Detect a shared header/footer line from up to three pages, then drop
        # it from every page before joining.
        header_footer_threshold = 0.7
        header_candidates = []
        footer_candidates = []

        for sample in list(page_results.values())[:3]:
            sample_lines = sample.split('\n')
            if len(sample_lines) > 1:
                header_candidates.append(sample_lines[0])
                footer_candidates.append(sample_lines[-1])

        # NOTE(review): find_common_text is not defined in the visible source;
        # it must be provided elsewhere for the 'smart' strategy to run.
        common_header = find_common_text(header_candidates,
                                         threshold=header_footer_threshold)
        common_footer = find_common_text(footer_candidates,
                                         threshold=header_footer_threshold)

        for _page_num, page_text in sorted(page_results.items()):
            page_lines = page_text.split('\n')
            if len(page_lines) > 1:
                if page_lines[0] == common_header:
                    page_lines = page_lines[1:]
                if page_lines[-1] == common_footer:
                    page_lines = page_lines[:-1]
            pieces.append('\n'.join(page_lines) + '\n\n')

    return "".join(pieces).strip()

4.3 结果校验机制

实现OCR结果的自动校验：

def validate_ocr_result(text, validation_rules=None):
    """Validate an OCR result against a set of simple sanity rules.

    :param text: text to validate
    :param validation_rules: optional dict overriding any of the default rules
        ``min_length``, ``max_repeat``, ``invalid_chars`` and
        ``expected_patterns``; missing keys fall back to their defaults
    :return: tuple ``(is_valid, error_message)``; the message is ``""`` when valid
    """
    # Imported locally because this snippet carries no import header of its own.
    import re
    from itertools import groupby

    rules = {
        'min_length': 3,  # minimum length of the stripped text
        'max_repeat': 5,  # maximum run of one repeated character
        'invalid_chars': ['�', '��'],  # characters that mark a bad decode
        'expected_patterns': None,  # regex patterns, at least one must match
    }
    # Merge instead of replace so a partial rule dict does not raise KeyError.
    if validation_rules:
        rules.update(validation_rules)

    # Minimum length
    if len(text.strip()) < rules['min_length']:
        return False, "Text too short"

    # Runs of one repeated character
    if any(len(list(run)) > rules['max_repeat'] for _, run in groupby(text)):
        return False, "Excessive character repetition"

    # Invalid characters
    if any(invalid_char in text for invalid_char in rules['invalid_chars']):
        return False, "Contains invalid characters"

    # Expected regex patterns
    if rules['expected_patterns']:
        if not any(re.search(pattern, text) for pattern in rules['expected_patterns']):
            return False, "Does not match expected patterns"

    return True, ""

5. 实战案例：发票处理系统

5.1 发票信息提取流程

def process_invoice_images(image_dir):
    """Run the full invoice pipeline: batch OCR, field extraction, validation.

    :param image_dir: path of the directory that holds the invoice images
    :return: list of ``{'image': file_name, 'data': extracted_fields}`` items
        whose data passed ``validate_invoice_data``
    """
    # Imported locally because this snippet carries no import header of its own.
    import json

    raw_results_file = 'raw_results.json'

    # 1. Batch OCR
    raw_results = batch_ocr_with_layout(image_dir, raw_results_file)
    if raw_results is None:
        # Fall back to the file the helper wrote when it returns nothing.
        with open(raw_results_file, encoding='utf-8') as f:
            raw_results = json.load(f)

    # Prompt asking the model to extract the key invoice fields as JSON.
    prompt = """
    请从发票中提取以下信息：
    - 发票号码
    - 开票日期
    - 销售方名称
    - 购买方名称
    - 金额合计(大写)
    - 金额合计(小写)
    以JSON格式返回
    """

    # 2. Key-field extraction
    extracted_data = []
    for img_file, ocr_result in raw_results.items():
        if not isinstance(ocr_result, dict) or 'error' in ocr_result:
            continue
        ocr_text = ocr_result.get('text')
        if ocr_text is None:
            print(f"No OCR text for {img_file}, skipping")
            continue

        # One failed request must not abort the rest of the batch.
        try:
            response = requests.post(
                "http://localhost:7860/api/ocr",
                json={
                    'text': ocr_text,
                    'prompt': prompt,
                },
            )
            if response.status_code != 200:
                continue
            extracted = response.json()
        except (requests.RequestException, ValueError) as e:
            print(f"Extraction failed for {img_file}: {e}")
            continue

        extracted_data.append({'image': img_file, 'data': extracted})

    # 3. Validation
    valid_data = []
    for item in extracted_data:
        is_valid, error = validate_invoice_data(item['data'])
        if is_valid:
            valid_data.append(item)
        else:
            print(f"Invalid data in {item['image']}: {error}")

    return valid_data


def validate_invoice_data(data):
    """Check that extracted invoice data has the required fields.

    :param data: dict of extracted invoice fields
    :return: tuple ``(is_valid, error_message)``; the message is ``""`` when valid
    """
    # Imported locally because this snippet carries no import header of its own.
    import re

    if not isinstance(data, dict):
        return False, "Invalid data format"

    required_fields = [
        '发票号码',
        '开票日期',
        '销售方名称',
        '金额合计(小写)',
    ]
    for field in required_fields:
        value = data.get(field)
        # str() so numeric JSON values (e.g. an amount returned as a number)
        # are validated instead of raising AttributeError on .strip().
        if value is None or not str(value).strip():
            return False, f"Missing {field}"

    # Amount must be digits with at most two decimal places.
    if not re.match(r'^\d+(\.\d{1,2})?$', str(data['金额合计(小写)'])):
        return False, "Invalid amount format"

    return True, ""

5.2 处理结果报告导出（Excel）

import pandas as pd


def generate_invoice_report(data, output_file):
    """Write an Excel report with an invoice summary sheet and a stats sheet.

    :param data: list of ``{'image': file_name, 'data': fields_dict}`` items
    :param output_file: path of the Excel file to create
    """
    amount_column = '金额合计(小写)'

    df = pd.DataFrame([item['data'] for item in data])
    df['源文件'] = [item['image'] for item in data]

    # Convert the amounts once; with no invoices the column does not exist,
    # so fall back to an empty series instead of raising KeyError.
    if amount_column in df.columns:
        amounts = df[amount_column].astype(float)
    else:
        amounts = pd.Series(dtype=float)
    total_amount = amounts.sum()
    average_amount = amounts.mean() if len(amounts) else 0.0

    # Build the statistics before opening the writer so a conversion error
    # does not leave a half-written report behind.
    stats = pd.DataFrame({
        '统计项': ['发票总数', '总金额', '平均金额'],
        '值': [len(df), total_amount, average_amount],
    })

    with pd.ExcelWriter(output_file) as writer:
        df.to_excel(writer, sheet_name='发票汇总', index=False)
        stats.to_excel(writer, sheet_name='统计信息', index=False)

    print(f"报告已生成: {output_file}")