当前位置: 首页 > news >正文

龙虾Claw实战扫描件证件信息智能提取与自动归档管理场景应用

龙虾 Claw 实战:扫描件证件信息智能提取与自动归档管理场景应用

一、背景与需求分析

在企业办公和政务服务中,证件信息的采集和归档是一项高频工作。无论是员工入职、客户开户,还是业务审批,都需要处理大量的证件扫描件。传统处理方式存在以下问题:

  1. 人工录入效率低:逐张证件手动输入信息耗时费力
  2. 信息容易出错:证件号码、日期等关键信息录入错误率高
  3. 归档不规范:文件命名和存储缺乏统一标准
  4. 检索困难:纸质或扫描件难以快速查找
  5. 隐私安全风险:证件信息存储和管理不当

龙虾 Claw 结合 OCR 识别和智能分类技术,可以实现证件信息的自动提取与规范化归档,提升信息管理效率。

二、整体解决方案

2.1 技术架构

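证件扫描件 → 图像预处理 → 证件识别 → 信息提取 → 数据校验 → 归档存储

| 证件扫描件 | 图像预处理 | 证件识别 | 信息提取 | 数据校验 | 归档存储 |
| --- | --- | --- | --- | --- | --- |
| PDF/图片 | 增强矫正 | 类型判断 | 字段解析 | 格式验证 | 结构化存储 |
| 批量上传 | 去噪锐化 | 模板匹配 | OCR识别 | 逻辑校验 | 自动命名 |
| 多格式支持 | 倾斜校正 | 智能分类 | 结构化输出 | 异常标记 | 权限管理 |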

2.2 支持的证件类型

| 证件类型 | 提取字段 | 特殊处理 |
| --- | --- | --- |
| 身份证正面 | 姓名、性别、民族、出生日期、住址、身份证号 | 人脸区域提取 |
| 身份证反面 | 签发机关、有效期限 | 有效期校验 |
| 护照 | 姓名、护照号、出生日期、有效期、签发地 | 多语言支持 |
| 驾驶证 | 姓名、证号、准驾车型、有效期 | 分数查询对接 |
| 行驶证 | 车牌号、车辆类型、所有人、品牌型号 | 车辆信息关联 |
| 营业执照 | 企业名称、统一社会信用代码、法定代表人 | 企业信息核验 |

三、环境准备

3.1 依赖安装

# requirements.txt
opencv-python>=4.8.0
numpy>=1.24.0          # imported directly by recognizers/id_recognizer.py
pytesseract>=0.3.10    # local OCR fallback; also needs the Tesseract binary with chi_sim data
Pillow>=9.0.0
pdfplumber>=0.9.0
pandas>=2.0.0
openpyxl>=3.1.0
python-dotenv>=1.0.0
requests>=2.28.0
pycryptodome>=3.18.0   # AES encryption of archived files (ID-number protection)

3.2 项目结构

id_card_processor/
├── config/
│   ├── settings.py          # 全局配置
│   └── id_templates.yaml    # 证件模板配置
├── preprocessors/
│   ├── image_enhancer.py    # 图像增强
│   └── id_detector.py       # 证件检测
├── recognizers/
│   ├── id_recognizer.py     # 证件识别
│   └── field_extractor.py   # 字段提取
├── validators/
│   ├── id_validator.py      # 证件校验
│   └── data_validator.py    # 数据校验
├── archivers/
│   ├── file_archiver.py     # 文件归档
│   └── db_archiver.py       # 数据库归档
└── main.py                  # 主程序入口

四、核心代码实现

4.1 配置管理

# config/settings.py
"""Global configuration: per-document extraction templates and output paths."""
import os
from dataclasses import dataclass
from typing import Dict, List


@dataclass
class IDCardTemplate:
    """Extraction template for one document type.

    Attributes:
        type_name: Human-readable (Chinese) name of the document type.
        type_code: Short upper-case code for the document type.
        fields: Maps a field name to ``{"label": ..., "pattern": ...}``;
            ``pattern`` is a regular expression applied to the OCR text.
        validation_rules: Optional per-field rules (length, checksum, pattern).
    """
    type_name: str
    type_code: str
    fields: Dict[str, Dict]
    validation_rules: Dict


class Settings:
    """Global settings container.

    Every label pattern accepts an optional colon after the label.  The
    character class is ``[:\\uff1a]`` so that both the ASCII colon and the
    full-width colon (U+FF1A) are matched; the previous class listed the ASCII
    colon twice, so labels followed by a full-width colon never matched.
    """

    # Extraction templates keyed by internal card-type identifier.
    ID_TEMPLATES = {
        "id_card_front": IDCardTemplate(
            type_name="身份证正面",
            type_code="ID_FRONT",
            fields={
                "name": {"label": "姓名", "pattern": r"姓\s*名[:\uff1a]?\s*([\u4e00-\u9fa5]+)"},
                "gender": {"label": "性别", "pattern": r"性\s*别[:\uff1a]?\s*([男女])"},
                "nation": {"label": "民族", "pattern": r"民\s*族[:\uff1a]?\s*([\u4e00-\u9fa5]+)"},
                # Three capture groups: year, month, day.
                "birthday": {"label": "出生日期", "pattern": r"出生[:\uff1a]?\s*(\d{4})年(\d{1,2})月(\d{1,2})日"},
                "address": {"label": "住址", "pattern": r"住\s*址[:\uff1a]?\s*([\u4e00-\u9fa5\d\-#号路街道小区楼栋单元室]+)"},
                "id_number": {"label": "公民身份号码", "pattern": r"公民身份号码[:\uff1a]?\s*(\d{17}[\dXx])"},
            },
            validation_rules={
                "id_number": {
                    "length": 18,
                    "checksum": True,
                },
            },
        ),
        "id_card_back": IDCardTemplate(
            type_name="身份证反面",
            type_code="ID_BACK",
            fields={
                "authority": {"label": "签发机关", "pattern": r"签发机关[:\uff1a]?\s*([\u4e00-\u9fa5]+)"},
                # Two capture groups: start date and end date (or "长期").
                "valid_date": {"label": "有效期限", "pattern": r"有效期限[:\uff1a]?\s*(\d{4}\.\d{2}\.\d{2})-(\d{4}\.\d{2}\.\d{2}|长期)"},
            },
            validation_rules={},
        ),
        "passport": IDCardTemplate(
            type_name="护照",
            type_code="PASSPORT",
            fields={
                "name": {"label": "姓名", "pattern": r"姓\s*名[:\uff1a/]?\s*([\u4e00-\u9fa5A-Za-z\s]+)"},
                "passport_no": {"label": "护照号码", "pattern": r"护照号码[:\uff1a]?\s*([A-Z]\d{8})"},
                "gender": {"label": "性别", "pattern": r"性\s*别[:\uff1a]?\s*([男女M/F])"},
                "birthday": {"label": "出生日期", "pattern": r"出生日期[:\uff1a]?\s*(\d{4}年\d{2}月\d{2}日)"},
                "issue_date": {"label": "签发日期", "pattern": r"签发日期[:\uff1a]?\s*(\d{4}年\d{2}月\d{2}日)"},
                "expiry_date": {"label": "有效期至", "pattern": r"有效期至[:\uff1a]?\s*(\d{4}年\d{2}月\d{2}日)"},
            },
            validation_rules={
                "passport_no": {
                    "pattern": r"^[A-Z]\d{8}$",
                },
            },
        ),
        "driver_license": IDCardTemplate(
            type_name="驾驶证",
            type_code="DRIVER",
            fields={
                "name": {"label": "姓名", "pattern": r"姓\s*名[:\uff1a]?\s*([\u4e00-\u9fa5]+)"},
                "id_number": {"label": "证号", "pattern": r"证\s*号[:\uff1a]?\s*(\d{17}[\dXx])"},
                "vehicle_class": {"label": "准驾车型", "pattern": r"准驾车型[:\uff1a]?\s*([A-Z1-3,]+)"},
                "valid_date": {"label": "有效期", "pattern": r"有效期[至]*[:\uff1a]?\s*(\d{4}-\d{2}-\d{2})"},
            },
            validation_rules={},
        ),
        "business_license": IDCardTemplate(
            type_name="营业执照",
            type_code="BUSINESS",
            fields={
                "company_name": {"label": "名称", "pattern": r"名\s*称[:\uff1a]?\s*([^\n]+)"},
                "credit_code": {"label": "统一社会信用代码", "pattern": r"统一社会信用代码[:\uff1a]?\s*([A-Z0-9]{18})"},
                "legal_person": {"label": "法定代表人", "pattern": r"法定代表人[:\uff1a]?\s*([\u4e00-\u9fa5]+)"},
                "capital": {"label": "注册资本", "pattern": r"注册资本[:\uff1a]?\s*([\d.]+万[美元人民币]+)"},
                "establish_date": {"label": "成立日期", "pattern": r"成立日期[:\uff1a]?\s*(\d{4}年\d{2}月\d{2}日)"},
                "address": {"label": "住所", "pattern": r"住\s*所[:\uff1a]?\s*([^\n]+)"},
            },
            validation_rules={
                "credit_code": {
                    "length": 18,
                    "pattern": r"^[A-Z0-9]{18}$",
                },
            },
        ),
    }

    # Field names whose values are considered sensitive.
    SENSITIVE_FIELDS = ["id_number", "passport_no", "credit_code"]

    # Output directory, overridable through the OUTPUT_DIR environment variable.
    OUTPUT_DIR = os.getenv("OUTPUT_DIR", "./output")

    # Database URL, overridable through the DATABASE_URL environment variable.
    DATABASE_URL = os.getenv("DATABASE_URL", "sqlite:///id_cards.db")

4.2 证件检测与识别

# recognizers/id_recognizer.py
import base64
import re
from dataclasses import dataclass
from typing import Dict, List, Optional, Tuple

import cv2
import numpy as np
import requests


@dataclass
class IDCardInfo:
    """Recognition result for a single document image."""
    card_type: str
    card_type_name: str
    fields: Dict[str, str]
    confidence: float
    image_path: str
    face_image: Optional[bytes] = None


class IDCardRecognizer:
    """Detects the document type of an image and extracts its fields.

    Uses the remote OCR API when ``api_key`` is given, otherwise a local
    pytesseract pass followed by the regex templates in ``Settings``.
    """

    _TOKEN_URL = "https://aip.baidubce.com/oauth/2.0/token"

    # NOTE(review): label names as returned by the Baidu idcard endpoint in
    # ``words_result`` -- confirm against the API reference before relying on it.
    _API_FIELD_NAMES = {
        "姓名": "name",
        "性别": "gender",
        "民族": "nation",
        "出生": "birthday",
        "住址": "address",
        "公民身份号码": "id_number",
        "签发机关": "authority",
        "签发日期": "issue_date",
        "失效日期": "expiry_date",
    }

    def __init__(self, api_key: str = None, api_secret: str = None):
        self.api_key = api_key
        self.api_secret = api_secret
        self.use_api = api_key is not None
        # Cached OAuth token; fetched lazily on the first API call.
        self._access_token = None

    def recognize(self, image: np.ndarray, card_type: Optional[str] = None) -> IDCardInfo:
        """Recognise one document image.

        Args:
            image: BGR image as returned by ``cv2.imread``.
            card_type: Optional template key that overrides automatic
                detection (the colour heuristic only knows three types).

        Returns:
            An ``IDCardInfo``; ``image_path`` is left empty for the caller.

        Raises:
            ValueError: If ``image`` is ``None`` or empty (e.g. ``cv2.imread``
                could not read the file).
        """
        if image is None or getattr(image, "size", 0) == 0:
            raise ValueError("image is empty or could not be read")

        if card_type is None:
            card_type = self._detect_card_type(image)

        if self.use_api:
            fields, confidence = self._api_recognize(image, card_type)
        else:
            fields, confidence = self._local_recognize(image, card_type)

        # Only the front of an ID card carries a portrait.
        face_image = None
        if card_type == "id_card_front":
            face_image = self._extract_face(image)

        return IDCardInfo(
            card_type=card_type,
            card_type_name=self._get_card_type_name(card_type),
            fields=fields,
            confidence=confidence,
            image_path="",
            face_image=face_image,
        )

    def _detect_card_type(self, image: np.ndarray) -> str:
        """Guess the card type from the dominant hue of the image.

        This is a simple colour heuristic; a trained classifier would be more
        reliable.  It can only return ``id_card_front``, ``id_card_back`` or
        ``passport``.
        """
        hsv = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)
        # Histogram over the hue channel (OpenCV hue range is 0..179).
        hist = cv2.calcHist([hsv], [0], None, [180], [0, 180])
        dominant_hue = np.argmax(hist)

        # Greenish -> front of ID card.
        if 35 <= dominant_hue <= 85:
            return "id_card_front"
        # Yellowish -> back of ID card.
        if 20 <= dominant_hue <= 35:
            return "id_card_back"
        # Reddish -> passport.
        if 0 <= dominant_hue <= 10 or 170 <= dominant_hue <= 180:
            return "passport"
        # Fallback.
        return "id_card_front"

    def _get_card_type_name(self, card_type: str) -> str:
        """Return the display name of ``card_type`` from the templates."""
        from config.settings import Settings
        template = Settings.ID_TEMPLATES.get(card_type)
        return template.type_name if template else "未知证件"

    def _get_access_token(self) -> Optional[str]:
        """Exchange ``api_key``/``api_secret`` for an OAuth access token.

        Returns ``None`` when no secret was configured, in which case the OCR
        request is sent without a token exactly as before.  The token is cached
        for the lifetime of this object; expiry is not tracked.
        """
        cached = getattr(self, "_access_token", None)
        if cached:
            return cached
        if not (self.api_key and self.api_secret):
            return None
        response = requests.post(
            self._TOKEN_URL,
            params={
                "grant_type": "client_credentials",
                "client_id": self.api_key,
                "client_secret": self.api_secret,
            },
            timeout=30,
        )
        response.raise_for_status()
        self._access_token = response.json().get("access_token")
        return self._access_token

    def _api_recognize(self, image: np.ndarray, card_type: str) -> Tuple[Dict, float]:
        """Recognise via the Baidu OCR HTTP API.

        Returns ``({}, 0.0)`` on any failure (best effort, never raises for
        network or API errors).
        """
        _, buffer = cv2.imencode('.jpg', image)
        image_base64 = base64.b64encode(buffer).decode()

        if card_type in ("id_card_front", "id_card_back"):
            url = "https://aip.baidubce.com/rest/2.0/ocr/v1/idcard"
            payload = {
                "image": image_base64,
                "id_card_side": "front" if card_type == "id_card_front" else "back",
            }
        else:
            url = "https://aip.baidubce.com/rest/2.0/ocr/v1/general_basic"
            payload = {"image": image_base64}

        try:
            token = self._get_access_token()
            query = {"access_token": token} if token else None
            response = requests.post(url, params=query, data=payload, timeout=30)
            response.raise_for_status()
            result = response.json()
            if isinstance(result, dict) and "error_code" in result:
                # The API reports failures inside a JSON body.
                print(f"API识别失败: {result.get('error_msg', result['error_code'])}")
                return {}, 0.0
            return self._parse_api_result(result, card_type)
        except Exception as e:  # boundary: recognition is best effort
            print(f"API识别失败: {e}")
            return {}, 0.0

    def _parse_api_result(self, result: Dict, card_type: str) -> Tuple[Dict, float]:
        """Convert an API response into ``(fields, confidence)``.

        ID-card labels are mapped to the internal field names used by the
        templates and the validator; unknown labels keep the previous
        behaviour (lower-cased key).
        """
        fields = {}
        confidence = 0.0
        words_result = result.get("words_result")

        if card_type in ("id_card_front", "id_card_back"):
            if isinstance(words_result, dict):
                for key, value in words_result.items():
                    field_name = self._API_FIELD_NAMES.get(key, key.lower())
                    fields[field_name] = value.get("words", "")
                self._normalize_api_dates(fields)
                if "card_location" in result:
                    confidence = 0.9
        elif words_result:
            words = [item["words"] for item in words_result]
            raw_text = "\n".join(words)
            fields["raw_text"] = raw_text
            # Run the regex templates over the raw text so structured fields
            # are available for validation as well.
            fields.update(self._extract_fields(raw_text, card_type))
            confidence = 0.8

        return fields, confidence

    @staticmethod
    def _normalize_api_dates(fields: Dict) -> None:
        """Rewrite 8-digit API dates into the formats the validator parses.

        NOTE(review): assumes the API returns dates as ``YYYYMMDD``; any other
        value is left untouched.
        """
        def is_compact(raw) -> bool:
            return isinstance(raw, str) and re.fullmatch(r"[0-9]{8}", raw) is not None

        def dotted(raw):
            return f"{raw[:4]}.{raw[4:6]}.{raw[6:]}" if is_compact(raw) else raw

        birthday = fields.get("birthday")
        if is_compact(birthday):
            fields["birthday"] = f"{birthday[:4]}-{birthday[4:6]}-{birthday[6:]}"

        issue, expiry = fields.get("issue_date"), fields.get("expiry_date")
        if issue and expiry and "valid_date" not in fields:
            fields["valid_date"] = f"{dotted(issue)} 至 {dotted(expiry)}"

    def _local_recognize(self, image: np.ndarray, card_type: str) -> Tuple[Dict, float]:
        """Recognise locally with pytesseract; returns ``({}, 0.0)`` on failure."""
        try:
            import pytesseract
            from PIL import Image

            pil_image = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
            text = pytesseract.image_to_string(pil_image, lang='chi_sim+eng')
            fields = self._extract_fields(text, card_type)
            return fields, 0.7
        except Exception as e:  # boundary: also covers a missing pytesseract
            print(f"本地OCR失败: {e}")
            return {}, 0.0

    def _extract_fields(self, text: str, card_type: str) -> Dict:
        """Apply the template regexes of ``card_type`` to OCR ``text``.

        Patterns with several capture groups are combined instead of being
        truncated to the first group: ``birthday`` becomes ``YYYY-MM-DD`` and
        ``valid_date`` becomes ``<start> 至 <end>``, the formats the validator
        expects.
        """
        from config.settings import Settings
        template = Settings.ID_TEMPLATES.get(card_type)
        if not template:
            return {}

        fields = {}
        for field_name, field_config in template.fields.items():
            pattern = field_config.get("pattern", "")
            if not pattern:
                continue
            match = re.search(pattern, text)
            if not match:
                continue
            groups = match.groups()
            if field_name == "birthday" and len(groups) >= 3:
                fields[field_name] = f"{groups[0]}-{groups[1].zfill(2)}-{groups[2].zfill(2)}"
            elif field_name == "valid_date" and len(groups) >= 2:
                fields[field_name] = f"{groups[0]} 至 {groups[1]}"
            else:
                fields[field_name] = match.group(1) if match.lastindex else match.group()
        return fields

    def _extract_face(self, image: np.ndarray) -> Optional[bytes]:
        """Return the first detected face as JPEG bytes, or ``None``."""
        face_cascade = cv2.CascadeClassifier(
            cv2.data.haarcascades + 'haarcascade_frontalface_default.xml'
        )
        gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, 1.1, 4)

        if len(faces) > 0:
            x, y, w, h = faces[0]
            face = image[y:y+h, x:x+w]
            _, buffer = cv2.imencode('.jpg', face)
            return buffer.tobytes()
        return None


# recognizers/field_extractor.py
import re
from typing import Dict, Optional
from datetime import datetime


class FieldExtractor:
    """Regex-based field extraction and value normalisation."""

    def __init__(self):
        pass

    def extract_all(self, text: str, card_type: str) -> Dict:
        """Extract every non-empty template field of ``card_type`` from ``text``."""
        from config.settings import Settings
        template = Settings.ID_TEMPLATES.get(card_type)
        if not template:
            return {}

        fields = {}
        for field_name, field_config in template.fields.items():
            value = self._extract_field(text, field_config)
            if value:
                fields[field_name] = value
        return fields

    def _extract_field(self, text: str, field_config: Dict) -> str:
        """Extract one field; returns ``""`` when the pattern does not match.

        The multi-group formats are applied only when the pattern really has
        that many groups, so a single-group pattern that shares the label
        "出生日期" (the passport template) no longer raises ``IndexError``.
        """
        pattern = field_config.get("pattern", "")
        if not pattern:
            return ""

        match = re.search(pattern, text, re.IGNORECASE)
        if not match:
            return ""
        if not match.lastindex:
            # Pattern without a participating capture group: whole match.
            return match.group()

        groups = match.groups()
        label = field_config.get("label")
        if label == "出生日期" and len(groups) >= 3:
            return f"{groups[0]}-{groups[1].zfill(2)}-{groups[2].zfill(2)}"
        if label == "有效期限" and len(groups) >= 2:
            return f"{groups[0]} 至 {groups[1]}"
        return groups[0]

    def normalize_date(self, date_str: str) -> str:
        """Return ``date_str`` as ``YYYY-MM-DD`` if it matches a known format.

        Unrecognised input is returned unchanged; empty input yields ``""``.
        """
        if not date_str:
            return ""

        formats = [
            "%Y年%m月%d日",
            "%Y-%m-%d",
            "%Y.%m.%d",
            "%Y/%m/%d",
        ]
        for fmt in formats:
            try:
                dt = datetime.strptime(date_str.strip(), fmt)
                return dt.strftime("%Y-%m-%d")
            except ValueError:
                continue
        return date_str

    def normalize_id_number(self, id_number: str) -> str:
        """Strip spaces and lower-case the final character of an 18-char number."""
        id_number = id_number.replace(" ", "")
        if len(id_number) == 18:
            id_number = id_number[:-1] + id_number[-1].lower()
        return id_number

4.3 数据校验器

# validators/id_validator.py
import re
from typing import Dict, List, Tuple
from dataclasses import dataclass
from datetime import datetime


@dataclass
class ValidationResult:
    """Outcome of validating one recognised document."""
    valid: bool
    errors: List[str]
    warnings: List[str]


class IDCardValidator:
    """Validates extracted fields per document type.

    ``card_info`` objects only need ``card_type`` and ``fields`` attributes.
    Card types without a dedicated check are reported as valid with no
    messages.
    """

    def __init__(self):
        pass

    def validate(self, card_info) -> ValidationResult:
        """Run the checks for ``card_info.card_type``.

        Returns:
            ``ValidationResult`` whose ``valid`` flag is true when no errors
            were produced (warnings do not affect it).
        """
        checks = {
            "id_card_front": self._validate_id_card_front,
            "id_card_back": self._validate_id_card_back,
            "passport": self._validate_passport,
            "business_license": self._validate_business_license,
        }
        check = checks.get(card_info.card_type)
        errors, warnings = check(card_info.fields) if check else ([], [])
        return ValidationResult(valid=len(errors) == 0, errors=errors, warnings=warnings)

    def _validate_id_card_front(self, fields: Dict) -> Tuple[List, List]:
        """Check id number, name and birthday of the front side."""
        errors = []
        warnings = []

        id_number = fields.get("id_number", "")
        if not id_number:
            errors.append("缺少身份证号码")
        elif not self._validate_id_number_checksum(id_number):
            errors.append("身份证号码校验失败")

        name = fields.get("name", "")
        if not name:
            errors.append("缺少姓名")
        elif len(name) < 2:
            warnings.append("姓名长度异常")

        birthday = fields.get("birthday", "")
        if birthday:
            age = self._calculate_age(birthday)
            # -1 is what _calculate_age returns for an unparseable date.
            if age < 0:
                errors.append("出生日期无效")
            elif age < 18:
                warnings.append("未满18周岁")
            elif age > 120:
                warnings.append("年龄异常")

        return errors, warnings

    def _validate_id_card_back(self, fields: Dict) -> Tuple[List, List]:
        """Warn when the validity period on the back side has ended.

        The end date is the last ``YYYY.MM.DD`` found in the value, so both
        ``<start> 至 <end>`` and the raw ``<start>-<end>`` form are handled.
        An unparseable value is ignored (best effort).
        """
        errors = []
        warnings = []

        valid_date = fields.get("valid_date", "")
        if valid_date and "长期" not in valid_date:
            dates = re.findall(r"\d{4}\.\d{2}\.\d{2}", valid_date)
            end_date_str = dates[-1] if dates else valid_date.split("至")[-1].strip()
            try:
                end_date = datetime.strptime(end_date_str, "%Y.%m.%d")
            except ValueError:
                pass
            else:
                # Compare calendar dates: the card is still valid on its last day.
                if end_date.date() < datetime.now().date():
                    warnings.append("身份证已过期")

        return errors, warnings

    def _validate_passport(self, fields: Dict) -> Tuple[List, List]:
        """Check passport number format and warn on an expired passport."""
        errors = []
        warnings = []

        passport_no = fields.get("passport_no", "")
        if not passport_no:
            errors.append("缺少护照号码")
        elif not self._validate_passport_number(passport_no):
            errors.append("护照号码格式错误")

        expiry_date = fields.get("expiry_date", "")
        if expiry_date:
            try:
                expiry = datetime.strptime(expiry_date, "%Y年%m月%d日")
            except (ValueError, TypeError):
                pass  # unparseable expiry date is ignored (best effort)
            else:
                if expiry.date() < datetime.now().date():
                    warnings.append("护照已过期")

        return errors, warnings

    def _validate_business_license(self, fields: Dict) -> Tuple[List, List]:
        """Check presence/length of the credit code and presence of the name."""
        errors = []
        warnings = []

        credit_code = fields.get("credit_code", "")
        if not credit_code:
            errors.append("缺少统一社会信用代码")
        elif len(credit_code) != 18:
            errors.append("统一社会信用代码长度错误")

        company_name = fields.get("company_name", "")
        if not company_name:
            errors.append("缺少企业名称")

        return errors, warnings

    def _validate_id_number_checksum(self, id_number: str) -> bool:
        """Return True when the 18th character matches the weighted checksum.

        The first 17 characters must be ASCII digits; the check character is
        compared case-insensitively.
        """
        if not isinstance(id_number, str) or len(id_number) != 18:
            return False

        body = id_number[:17]
        # int() would also accept full-width / other Unicode digits.
        if not (body.isascii() and body.isdigit()):
            return False

        weights = [7, 9, 10, 5, 8, 4, 2, 1, 6, 3, 7, 9, 10, 5, 8, 4, 2]
        check_codes = ['1', '0', 'X', '9', '8', '7', '6', '5', '4', '3', '2']

        total = sum(int(digit) * weight for digit, weight in zip(body, weights))
        return id_number[-1].upper() == check_codes[total % 11]

    def _validate_passport_number(self, passport_no: str) -> bool:
        """Return True for one letter followed by eight digits (case-insensitive)."""
        # fullmatch: '$' in re.match would tolerate a trailing newline.
        return bool(re.fullmatch(r'[A-Z][0-9]{8}', passport_no.upper()))

    def _calculate_age(self, birthday: str) -> int:
        """Return the age in full years, or -1 when ``birthday`` cannot be parsed.

        Accepts ``YYYY-MM-DD`` and ``YYYY年MM月DD日``.
        """
        try:
            if "-" in birthday:
                birth_date = datetime.strptime(birthday, "%Y-%m-%d")
            else:
                birth_date = datetime.strptime(birthday, "%Y年%m月%d日")
        except (ValueError, TypeError):
            return -1

        today = datetime.now()
        age = today.year - birth_date.year
        # Birthday not reached yet this year.
        if (today.month, today.day) < (birth_date.month, birth_date.day):
            age -= 1
        return age

4.4 文件归档器

# archivers/file_archiver.py
import os
import shutil
from pathlib import Path
from typing import Dict, List, Optional
from datetime import datetime
import hashlib
import json
from Crypto.Cipher import AES
from Crypto.Util.Padding import pad, unpad
import base64


class FileArchiver:
    """Copies document images into a per-type archive with JSON metadata.

    NOTE(review): the metadata sidecar stores all extracted fields in plain
    text even when the image itself is encrypted, and file names contain the
    last six characters of the id number / credit code.
    """

    def __init__(self, output_dir: str = "./archive",
                 encryption_key: Optional[bytes] = None):
        """Create the archive root.

        Args:
            output_dir: Archive root directory (created if missing).
            encryption_key: AES key of 16, 24 or 32 bytes.  When omitted, the
                ``ARCHIVE_ENCRYPTION_KEY`` environment variable is used, then
                the built-in demo key (kept for backward compatibility only;
                do not rely on it in production).

        Raises:
            ValueError: If the resolved key has an invalid length.
        """
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)

        if encryption_key is None:
            env_key = os.getenv("ARCHIVE_ENCRYPTION_KEY")
            encryption_key = env_key.encode("utf-8") if env_key else b'16bytesecretkey!'
        if len(encryption_key) not in (16, 24, 32):
            raise ValueError("encryption key must be 16, 24 or 32 bytes long")
        self.encryption_key = encryption_key

    def archive(self, source_path: str, card_info, encrypt: bool = False) -> str:
        """Archive ``source_path`` and write its metadata sidecar.

        Args:
            source_path: Path of the original image.
            card_info: Object with ``card_type``, ``card_type_name``,
                ``fields`` and ``confidence`` attributes.
            encrypt: Store the file AES-CBC encrypted (IV followed by
                ciphertext) instead of a plain copy.

        Returns:
            The path of the archived file as a string.
        """
        target_dir = self.output_dir / card_info.card_type
        target_dir.mkdir(parents=True, exist_ok=True)
        # The generated name only has second resolution; never overwrite an
        # earlier archive of the same document.
        target_path = self._unique_path(target_dir / self._generate_filename(card_info))

        if encrypt:
            self._copy_with_encryption(source_path, target_path)
        else:
            shutil.copy2(source_path, target_path)

        self._save_metadata(target_path, card_info)
        return str(target_path)

    @staticmethod
    def _unique_path(path: Path) -> Path:
        """Return ``path`` or, if it exists, the first free ``<stem>_<n><suffix>``."""
        candidate = path
        counter = 1
        while candidate.exists():
            candidate = path.with_name(f"{path.stem}_{counter}{path.suffix}")
            counter += 1
        return candidate

    def _generate_filename(self, card_info) -> str:
        """Build ``<card_type>_<key>_<timestamp>.jpg``.

        The key is the last six characters of the id number / credit code, or
        the name for other types; ``unknown`` when the field is missing.
        """
        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if card_info.card_type == "id_card_front":
            number = card_info.fields.get("id_number")
            key_field = str(number)[-6:] if number else "unknown"
        elif card_info.card_type == "business_license":
            number = card_info.fields.get("credit_code")
            key_field = str(number)[-6:] if number else "unknown"
        else:
            key_field = card_info.fields.get("name", "unknown")

        # Keep only characters that are safe in a file name.
        key_field = "".join(c for c in str(key_field) if c.isalnum()) or "unknown"
        return f"{card_info.card_type}_{key_field}_{timestamp}.jpg"

    def _copy_with_encryption(self, source: str, target: Path):
        """Write ``source`` to ``target`` as ``IV || AES-CBC(padded data)``."""
        with open(source, 'rb') as f:
            data = f.read()

        cipher = AES.new(self.encryption_key, AES.MODE_CBC)
        ct_bytes = cipher.encrypt(pad(data, AES.block_size))

        with open(target, 'wb') as f:
            f.write(cipher.iv)
            f.write(ct_bytes)

    def _save_metadata(self, file_path: Path, card_info):
        """Write a ``.json`` sidecar next to ``file_path``."""
        metadata = {
            "card_type": card_info.card_type,
            "card_type_name": card_info.card_type_name,
            "fields": card_info.fields,
            "confidence": card_info.confidence,
            "archive_time": datetime.now().isoformat(),
            # Hash of the stored file (ciphertext when encrypted).
            "file_hash": self._calculate_hash(file_path),
        }
        metadata_path = file_path.with_suffix('.json')
        with open(metadata_path, 'w', encoding='utf-8') as f:
            json.dump(metadata, f, ensure_ascii=False, indent=2)

    def _calculate_hash(self, file_path: Path) -> str:
        """Return the SHA-256 hex digest of ``file_path``."""
        sha256 = hashlib.sha256()
        with open(file_path, 'rb') as f:
            for chunk in iter(lambda: f.read(4096), b''):
                sha256.update(chunk)
        return sha256.hexdigest()

    def search(self, query: Dict) -> List[str]:
        """Return archived image paths whose metadata matches ``query``.

        A key present in the extracted fields matches by case-insensitive
        substring; any other key must equal the top-level metadata value.
        """
        results = []
        for metadata_file in self.output_dir.rglob("*.json"):
            with open(metadata_file, 'r', encoding='utf-8') as f:
                metadata = json.load(f)

            fields = metadata.get("fields", {})
            matched = True
            for key, value in query.items():
                if key in fields:
                    # str(): query values are not guaranteed to be strings.
                    if str(value).lower() not in str(fields[key]).lower():
                        matched = False
                        break
                elif metadata.get(key) != value:
                    matched = False
                    break

            if matched:
                image_file = metadata_file.with_suffix('.jpg')
                if image_file.exists():
                    results.append(str(image_file))
        return results


# archivers/db_archiver.py
import sqlite3
from contextlib import closing
from typing import Dict, List, Optional
from datetime import datetime
from pathlib import Path
import json


class DatabaseArchiver:
    """Stores recognised documents in a SQLite table ``id_cards``.

    Every operation opens its own connection and always closes it, also when
    a statement fails.
    """

    _COLUMNS = ("id, card_type, card_type_name, fields_json, confidence, "
                "image_path, face_image_path, created_at")

    def __init__(self, db_path: str = "./id_cards.db"):
        self.db_path = db_path
        self._init_database()

    def _init_database(self):
        """Create the table and its indexes if they do not exist."""
        # `closing` closes the connection; the inner `conn` context commits
        # on success and rolls back on error.
        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            conn.execute('''
                CREATE TABLE IF NOT EXISTS id_cards (
                    id INTEGER PRIMARY KEY AUTOINCREMENT,
                    card_type TEXT NOT NULL,
                    card_type_name TEXT,
                    fields_json TEXT,
                    confidence REAL,
                    image_path TEXT,
                    face_image_path TEXT,
                    created_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
                    updated_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP
                )
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_card_type ON id_cards(card_type)
            ''')
            conn.execute('''
                CREATE INDEX IF NOT EXISTS idx_created_at ON id_cards(created_at)
            ''')

    def save(self, card_info, image_path: str = None, face_image_path: str = None) -> int:
        """Insert one record and return its row id."""
        with closing(sqlite3.connect(self.db_path)) as conn, conn:
            cursor = conn.execute('''
                INSERT INTO id_cards
                (card_type, card_type_name, fields_json, confidence, image_path, face_image_path)
                VALUES (?, ?, ?, ?, ?, ?)
            ''', (
                card_info.card_type,
                card_info.card_type_name,
                json.dumps(card_info.fields, ensure_ascii=False),
                card_info.confidence,
                image_path,
                face_image_path,
            ))
            return cursor.lastrowid

    def search(self, card_type: str = None, field_query: Dict = None) -> List[Dict]:
        """Return records filtered by type and by field substrings.

        Args:
            card_type: Exact card type to select, or ``None`` for all.
            field_query: Field name -> text; a record matches when every
                queried field exists and contains the text
                (case-insensitive).  Records lacking a queried field no
                longer match, consistent with ``FileArchiver.search``.
        """
        query = f"SELECT {self._COLUMNS} FROM id_cards WHERE 1=1"
        params = []
        if card_type:
            query += " AND card_type = ?"
            params.append(card_type)

        with closing(sqlite3.connect(self.db_path)) as conn:
            rows = conn.execute(query, params).fetchall()

        results = []
        for row in rows:
            record = {
                "id": row[0],
                "card_type": row[1],
                "card_type_name": row[2],
                "fields": json.loads(row[3]) if row[3] else {},
                "confidence": row[4],
                "image_path": row[5],
                "face_image_path": row[6],
                "created_at": row[7],
            }
            if field_query and not self._matches(record["fields"], field_query):
                continue
            results.append(record)
        return results

    @staticmethod
    def _matches(fields: Dict, field_query: Dict) -> bool:
        """True when every queried field exists and contains the query text."""
        return all(
            key in fields and str(value).lower() in str(fields[key]).lower()
            for key, value in field_query.items()
        )

    def get_statistics(self) -> Dict:
        """Return ``{"total": n, "by_type": {card_type_name: count}}``."""
        with closing(sqlite3.connect(self.db_path)) as conn:
            total = conn.execute("SELECT COUNT(*) FROM id_cards").fetchone()[0]
            # Group by the column that is used as the dictionary key, so two
            # types sharing a display name are summed instead of overwritten.
            by_type = dict(conn.execute('''
                SELECT card_type_name, COUNT(*) FROM id_cards GROUP BY card_type_name
            ''').fetchall())

        return {
            "total": total,
            "by_type": by_type,
        }

4.5 Claw 工作流配置

# claw_workflows/id_card_processor.yaml
# NOTE(review): the nesting below was reconstructed from a flattened line;
# confirm key placement (e.g. `output` at step level) against the Claw
# workflow schema.
name: 证件信息智能提取与归档
version: 1.0
description: 自动识别证件信息并规范化归档管理

# The workflow starts when files appear in the watched folder, or manually.
triggers:
  - type: file_watcher
    path: ./id_cards/input
    extensions: [.jpg, .jpeg, .png, .pdf]
  - type: manual

variables:
  input_dir: "./id_cards/input"
  output_dir: "./id_cards/archive"

steps:
  # Collect candidate files from the input directory.
  - name: 扫描证件文件
    action: scan_files
    config:
      path: "${input_dir}"
      extensions: [.jpg, .jpeg, .png, .pdf]
    output: id_files

  # Enhance the images and correct skew before recognition.
  - name: 图像预处理
    action: preprocess_image
    config:
      files: "${id_files}"
      enhance: true
      correct_skew: true
    output: processed_images

  # Recognise the documents with the configured OCR provider.
  - name: 证件识别
    action: recognize_id_card
    config:
      images: "${processed_images}"
      provider: baidu
    output: card_infos

  # Validate the extracted fields.
  - name: 数据校验
    action: validate_id_card
    config:
      card_infos: "${card_infos}"
      strict_mode: true
    output: validated_infos

  # Archive the original files, encrypted.
  - name: 文件归档
    action: archive_files
    config:
      source_files: "${id_files}"
      card_infos: "${validated_infos}"
      output_dir: "${output_dir}"
      encrypt: true
    output: archive_paths

  # Persist the validated records together with their archive paths.
  - name: 数据库归档
    action: save_to_database
    config:
      card_infos: "${validated_infos}"
      archive_paths: "${archive_paths}"

  # Produce an Excel report in the output directory.
  - name: 生成报告
    action: generate_report
    config:
      card_infos: "${validated_infos}"
      output_dir: "${output_dir}"
      format: excel
    output: report_path

  # Notify the administrator by e-mail.
  - name: 发送通知
    action: notify
    config:
      type: email
      recipients: ["admin@company.com"]
      subject: "证件处理完成"
      body: "已处理${card_infos.count}张证件"

五、实战运行示例

5.1 主程序入口

# main.py
import argparse
from pathlib import Path
from datetime import datetime

import cv2

from recognizers.id_recognizer import IDCardRecognizer
from validators.id_validator import IDCardValidator
from archivers.file_archiver import FileArchiver
from archivers.db_archiver import DatabaseArchiver

# Image types the command-line tool can read (compared case-insensitively).
SUPPORTED_SUFFIXES = {".jpg", ".jpeg", ".png"}


def _collect_files(input_path: Path) -> list:
    """Return the image files to process for a file or directory path.

    A single file is returned as is; a directory is scanned non-recursively
    for supported suffixes in any letter case, in sorted order.  A path that
    does not exist yields an empty list.
    """
    if input_path.is_file():
        return [input_path]
    if input_path.is_dir():
        return sorted(
            p for p in input_path.iterdir()
            if p.is_file() and p.suffix.lower() in SUPPORTED_SUFFIXES
        )
    return []


def main():
    """Command-line entry point: recognise, validate, archive and store each image.

    A failure on one file is reported and does not stop the batch.  Files with
    validation errors are still archived; the errors are only printed.
    """
    parser = argparse.ArgumentParser(description="龙虾Claw证件处理系统")
    parser.add_argument("--input", "-i", required=True, help="输入文件或目录")
    parser.add_argument("--output", "-o", default="./archive", help="归档目录")
    parser.add_argument("--encrypt", "-e", action="store_true", help="加密存储")
    parser.add_argument("--api-key", help="OCR API密钥")
    # The recognizer already accepts a secret; expose it on the command line.
    parser.add_argument("--api-secret", default=None, help="OCR API Secret")
    args = parser.parse_args()

    print("=== 龙虾Claw证件处理系统 ===")
    print(f"输入路径: {args.input}")
    print("-" * 40)

    recognizer = IDCardRecognizer(api_key=args.api_key, api_secret=args.api_secret)
    validator = IDCardValidator()
    file_archiver = FileArchiver(args.output)
    db_archiver = DatabaseArchiver()

    input_path = Path(args.input)
    if not input_path.exists():
        print(f"输入路径不存在: {args.input}")
    files = _collect_files(input_path)
    print(f"\n发现 {len(files)} 个证件文件")

    for idx, file_path in enumerate(files, 1):
        print(f"\n[{idx}/{len(files)}] 处理: {file_path.name}")
        try:
            image = cv2.imread(str(file_path))
            if image is None:
                # cv2.imread returns None instead of raising on unreadable files.
                raise ValueError(f"无法读取图像: {file_path}")

            card_info = recognizer.recognize(image)
            # The recognizer leaves image_path empty; record the source file.
            card_info.image_path = str(file_path)
            print(f" - 证件类型: {card_info.card_type_name}")
            print(f" - 识别置信度: {card_info.confidence:.2%}")

            result = validator.validate(card_info)
            if result.errors:
                print(f" - 校验错误: {result.errors}")
            if result.warnings:
                print(f" - 警告: {result.warnings}")

            archive_path = file_archiver.archive(
                str(file_path), card_info, encrypt=args.encrypt
            )
            print(f" - 归档路径: {archive_path}")

            record_id = db_archiver.save(card_info, archive_path)
            print(f" - 数据库ID: {record_id}")
        except Exception as e:  # per-file boundary: keep processing the batch
            print(f" - 处理失败: {e}")

    stats = db_archiver.get_statistics()
    print("\n=== 统计信息 ===")
    print(f"总记录数: {stats['total']}")
    for card_type, count in stats['by_type'].items():
        print(f" - {card_type}: {count}")

    print("\n处理完成!")


if __name__ == "__main__":
    main()

5.2 运行命令

# 处理单个证件 python main.py -i id_card.jpg # 处理目录并加密存储 python main.py -i ./scans -o ./archive --encrypt # 使用API识别 python main.py -i ./scans --api-key YOUR_API_KEY

六、总结

本文详细介绍了使用龙虾 Claw 实现扫描件证件信息智能提取与自动归档管理的完整方案。通过证件识别、字段提取、数据校验和规范化归档,实现了证件信息管理的高度自动化。

核心优势:

  • 多证件支持:身份证、护照、驾驶证、营业执照等
  • 智能识别:自动判断证件类型并提取字段
  • 数据校验:身份证校验码、有效期等自动验证
  • 安全归档:支持加密存储,保护隐私安全
  • 便捷检索:支持多条件搜索和统计

http://www.jsqmd.com/news/529232/

相关文章:

  • 构建企业级网络准入控制体系:PacketFence解决方案深度解析
  • 如何通过3步注册解锁Jasmine全部潜力?
  • 如何通过开源IT资产管理平台实现企业基础设施的智能化管控
  • OmenSuperHub:惠普游戏本的开源硬件控制解决方案
  • 5个高效工具助你构建企业级Tesseract.js OCR应用
  • 如何突破Java串口通信的跨平台瓶颈?jSerialComm全方位技术解析
  • GHelper:华硕笔记本用户的轻量级控制神器
  • 【困惑度 计算和可视化】
  • Tao-8k模型在不同硬件平台的部署对比:从GPU到边缘设备
  • 3大突破:res-downloader网络资源获取全场景解决方案
  • 喀什新风系统优质公司推荐榜 - 资讯焦点
  • 内容无法被AI收录?90%的根源是GEO服务商没选对! - 资讯焦点
  • IEEE33节点交直流混合配电网潮流计算:交替迭代法下的系统架构解析与优化
  • 丹青幻境惊艳效果展示:AI生成敦煌壁画风格飞天形象高清细节图
  • 终极指南:如何用MobaXterm中文版高效解决远程服务器管理5大痛点
  • 2026软床源头工厂优质推荐榜 靠谱之选 - 资讯焦点
  • UndertaleModTool全流程指南:GameMaker游戏深度定制与扩展解决方案
  • halcon算子
  • 纹理压缩效率革命:Intel Texture Works插件如何重塑数字创作流程
  • 瓜果育苗栽培基质优质厂家高性价比推荐 - 资讯焦点
  • PyEMD深度解析:Python中的经验模态分解实战指南
  • # 发散创新:用 Rust实现高性能光线追踪渲染器——从零构建你的第一个 GPU 加速光追引擎在现代图形学领域,**光线追踪(Ray
  • 喀什新风系统优质公司排名推荐 - 资讯焦点
  • 协程调度器重写,IOCP深度适配,UVLoop无缝集成——Python 3.15异步模型三大硬核升级,你还在用3.12的旧范式?
  • 11倍性能突破:Lightpanda无头浏览器如何重塑Web自动化新标准
  • AIGlasses_for_navigation开源大模型:YOLO-SEG等5个定制化模型全部开放
  • Nuxt3 SSR 接口请求封装实战:从基础封装到多接口并发处理
  • 浪潮341万中标麻湖北黄冈数字公共基础设施二期项目
  • 开源安全软件工程实践分析——OWASP ZAP
  • DanKoe-视频笔记-基于证据的生活优化指南-如何系统性地改善你的生活