当前位置：首页 > news >正文

# 【Python实战】自动化处理Word文档：批量替换+模板填充+格式转换+水印添加

news 2026/6/18 3:59:09

一、项目背景

1.1 痛点分析

Word文档处理是办公中最高频的场景，但手动操作效率极低：

| 场景 | 手工方式 | 时间 |
| --- | --- | --- |
| 50份合同改名字 | 逐份打开修改 | 3小时 |
| 100份通知填数据 | 模板复制粘贴 | 5小时 |
| 30份文档转PDF | 逐个另存为 | 1小时 |
| 批量加水印 | 逐页插入 | 2小时 |
| 总计 | - | 11小时 |

HR每月发薪资条、法务每季度改合同模板、行政群发通知…全是重复劳动。

1.2 技术需求

核心需求：

批量查找替换（支持正则）
模板变量填充（从Excel读取数据）
批量格式转换（Word↔PDF）
批量添加水印
保留原文档格式和样式

二、技术架构

数据源(Excel) → 模板加载 → 变量填充 → 格式处理 → 批量输出
↑ ↑ ↑ ↑ ↑
pandas python-docx python-docx docx2pdf pathlib
comtypes

技术栈：

- **python-docx**：Word文档读写
- **pandas**：读取Excel数据源
- **docx2pdf**：Word转PDF
- **comtypes**：调用Word COM接口（高级转换）
- **Pillow**：水印图片生成
- **pathlib**：文件路径管理

---

## 三、环境准备

### 3.1 安装依赖

```bash
pip install python-docx pandas docx2pdf pillow openpyxl
```

3.2 项目结构

```
word-automation/
├── main.py              # 主程序
├── replacer.py          # 批量替换模块
├── filler.py            # 模板填充模块
├── converter.py         # 格式转换模块
├── watermark.py         # 水印模块
├── config.py            # 配置
├── templates/           # 模板目录
│   └── contract.docx    # 合同模板
├── data/                # 数据目录
│   └── employees.xlsx   # 员工数据
└── output/              # 输出目录
```

四、核心模块实现

4.1 批量查找替换模块

支持段落、表格、页眉页脚的全文替换：

from docx import Document
from pathlib import Path
import re
import copy


class WordReplacer:
    """Find and replace text throughout Word (.docx) documents.

    Replacement covers body paragraphs, tables (including nested tables)
    and the primary header/footer of every section. A paragraph whose text
    changes is rewritten into its first run, so the first run's character
    formatting is what the whole paragraph ends up with.

    Attributes:
        use_regex: When true, the keys of ``replacements`` are regular
            expressions and the values are ``re.sub`` replacement
            templates; otherwise both are plain strings.
        replace_count: Number of replacements made by the most recent
            ``replace_in_file`` call.
    """

    def __init__(self, use_regex=False):
        self.use_regex = use_regex
        self.replace_count = 0

    def replace_in_file(self, file_path, replacements, output_path=None):
        """Apply ``replacements`` to one document and save it.

        Args:
            file_path: Path of the .docx file to read.
            replacements: Mapping of search text (or pattern) to new text.
            output_path: Where to save; the input file is overwritten
                when this is omitted.

        Returns:
            The number of replacements made in this document.
        """
        doc = Document(file_path)
        self.replace_count = 0

        containers = [doc]
        for section in doc.sections:
            for part in (section.header, section.footer):
                # A linked header/footer shows the previous section's
                # content, which is processed with that section; touching
                # it again would replace the same text twice.
                if not part.is_linked_to_previous:
                    containers.append(part)

        for container in containers:
            for paragraph in self._iter_paragraphs(container):
                self._replace_in_paragraph(paragraph, replacements)

        doc.save(output_path or file_path)
        return self.replace_count

    def _iter_paragraphs(self, container, _seen=None):
        """Yield every paragraph in ``container``, descending into tables.

        ``container`` is anything exposing ``paragraphs`` and ``tables``
        (document body, header, footer, table cell). A merged cell is
        reported once per grid position by ``row.cells``; it is visited
        only once here.
        """
        # Keyed by id() but holding the element itself so the id cannot be
        # recycled for another object while iteration is in progress.
        seen = {} if _seen is None else _seen
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if id(tc) in seen:
                        continue
                    seen[id(tc)] = tc
                    yield from self._iter_paragraphs(cell, seen)

    def _replace_in_paragraph(self, paragraph, replacements):
        """Replace text in one paragraph, keeping the first run's format.

        Matching is done on the concatenated run text so a search string
        split across several runs is still found. Paragraphs without runs
        are left alone and contribute nothing to ``replace_count``.
        """
        runs = paragraph.runs
        if not runs:
            return

        # Work from the runs rather than paragraph.text: only run text can
        # be rewritten below, so text living outside the runs (e.g. inside
        # hyperlinks) must not be copied into the first run.
        original = "".join(run.text for run in runs)
        updated = original
        hits = 0
        for old_text, new_text in replacements.items():
            if not old_text:
                # An empty key matches between every character.
                continue
            if self.use_regex:
                updated, found = re.subn(old_text, new_text, updated)
            else:
                found = updated.count(old_text)
                if found:
                    updated = updated.replace(old_text, new_text)
            hits += found

        self.replace_count += hits
        if updated == original:
            return

        first_run_format = self._get_run_format(runs[0])
        for run in runs:
            run.text = ""
        runs[0].text = updated
        self._apply_run_format(runs[0], first_run_format)

    def _get_run_format(self, run):
        """Return a dict snapshot of a run's basic character formatting."""
        color = run.font.color
        return {
            'bold': run.bold,
            'italic': run.italic,
            'font_name': run.font.name,
            'font_size': run.font.size,
            'font_color': color.rgb if color and color.rgb else None
        }

    def _apply_run_format(self, run, fmt):
        """Apply a snapshot produced by ``_get_run_format`` to ``run``."""
        run.bold = fmt['bold']
        run.italic = fmt['italic']
        if fmt['font_name']:
            run.font.name = fmt['font_name']
        if fmt['font_size']:
            run.font.size = fmt['font_size']
        # The colour is captured by _get_run_format, so restore it too.
        if fmt.get('font_color') is not None:
            run.font.color.rgb = fmt['font_color']

    def batch_replace(self, folder_path, replacements, output_folder=None):
        """Run ``replace_in_file`` on every .docx directly inside a folder.

        Args:
            folder_path: Folder to scan (not recursive).
            replacements: Mapping passed to ``replace_in_file``.
            output_folder: Destination folder; defaults to a ``replaced``
                sub-folder of ``folder_path``. Created if missing.

        Returns:
            Total number of replacements across all processed files.
        """
        folder = Path(folder_path)
        output = Path(output_folder) if output_folder else folder / 'replaced'
        output.mkdir(parents=True, exist_ok=True)

        # Drop Word's "~$" lock files up front so the progress counter and
        # the reported total only cover real documents.
        files = sorted(
            f for f in folder.glob('*.docx') if not f.name.startswith('~$')
        )
        total_count = 0
        print(f"发现 {len(files)} 个Word文档")
        for i, file in enumerate(files, start=1):
            output_path = output / file.name
            count = self.replace_in_file(str(file), replacements, str(output_path))
            total_count += count
            print(f" [{i}/{len(files)}] {file.name} → 替换{count}处")
        print(f"✅ 批量替换完成，共替换{total_count}处")
        return total_count

4.2 模板变量填充模块

从Excel读取数据，批量填充Word模板：

import pandas as pd
from docx import Document
from pathlib import Path
import re


class TemplateFiller:
    """Fill ``{{name}}`` placeholders in a Word template.

    Placeholders are looked up in body paragraphs, tables (including
    nested tables) and the primary header/footer of every section.
    """

    def __init__(self, template_path):
        self.template_path = template_path

    def fill_from_excel(self, excel_path, output_folder, filename_column=None):
        """Generate one document per row of an Excel sheet.

        Each column name becomes a ``{{column}}`` placeholder; missing
        cells are rendered as an empty string.

        Args:
            excel_path: Workbook read with ``pandas.read_excel``.
            output_folder: Destination folder, created if missing.
            filename_column: Optional column whose value names each output
                file. Characters that are invalid in file names are
                replaced with ``_``; empty or missing values fall back to
                ``output_<row number>``; repeated names get a numeric
                suffix instead of overwriting each other.

        Returns:
            List of the generated file paths, in row order.
        """
        df = pd.read_excel(excel_path)
        output = Path(output_folder)
        output.mkdir(parents=True, exist_ok=True)

        total = len(df)
        print(f"模板：{self.template_path}")
        print(f"数据：{total}行 × {len(df.columns)}列")

        name_from_column = bool(filename_column) and filename_column in df.columns
        used_names = set()
        written = []
        for position, (_, row) in enumerate(df.iterrows(), start=1):
            replacements = {
                '{{' + str(col) + '}}': str(row[col]) if pd.notna(row[col]) else ''
                for col in df.columns
            }

            stem = self._safe_stem(row[filename_column]) if name_from_column else ''
            if not stem:
                stem = f"output_{position}"
            filename = self._unique_name(stem, used_names)

            doc = Document(self.template_path)
            self._fill_document(doc, replacements)
            output_path = output / filename
            doc.save(str(output_path))
            written.append(str(output_path))
            print(f" [{position}/{total}] → {filename}")

        print(f"✅ 批量填充完成，共生成{total}个文档")
        return written

    @staticmethod
    def _safe_stem(value):
        """Turn a cell value into a file-name stem ('' if unusable)."""
        if pd.isna(value):
            return ''
        # Path separators and the characters Windows forbids in names.
        cleaned = re.sub(r'[\\/:*?"<>|\x00-\x1f]', '_', str(value))
        return cleaned.strip(' .')

    @staticmethod
    def _unique_name(stem, used_names):
        """Return ``<stem>.docx``, suffixed if the stem was already used."""
        candidate = stem
        suffix = 1
        # Compared case-insensitively because common file systems are.
        while candidate.lower() in used_names:
            suffix += 1
            candidate = f"{stem}_{suffix}"
        used_names.add(candidate.lower())
        return f"{candidate}.docx"

    def _iter_paragraphs(self, container, _seen=None):
        """Yield every paragraph in ``container``, descending into tables.

        A merged cell is reported once per grid position by ``row.cells``;
        it is visited only once here.
        """
        # Keyed by id() but holding the element itself so the id cannot be
        # recycled for another object while iteration is in progress.
        seen = {} if _seen is None else _seen
        yield from container.paragraphs
        for table in container.tables:
            for row in table.rows:
                for cell in row.cells:
                    tc = cell._tc
                    if id(tc) in seen:
                        continue
                    seen[id(tc)] = tc
                    yield from self._iter_paragraphs(cell, seen)

    def _fill_document(self, doc, replacements):
        """Fill placeholders in the body and in headers/footers of ``doc``."""
        containers = [doc]
        for section in doc.sections:
            for part in (section.header, section.footer):
                # A linked header/footer is the previous section's content
                # and is handled together with that section.
                if not part.is_linked_to_previous:
                    containers.append(part)

        for container in containers:
            for paragraph in self._iter_paragraphs(container):
                self._fill_paragraph(paragraph, replacements)

    def _fill_paragraph(self, paragraph, replacements):
        """Fill one paragraph, including placeholders split across runs.

        The resulting text is written into the first run, so the whole
        paragraph takes on that run's character formatting.
        """
        runs = paragraph.runs
        if not runs:
            return

        # Work from the runs rather than paragraph.text: only run text can
        # be rewritten below, so text living outside the runs (e.g. inside
        # hyperlinks) must not be copied into the first run.
        original = "".join(run.text for run in runs)
        updated = original
        for placeholder, value in replacements.items():
            if placeholder in updated:
                updated = updated.replace(placeholder, value)
        if updated == original:
            return

        first = runs[0]
        first_format = {
            'bold': first.bold,
            'italic': first.italic,
            'font_name': first.font.name,
            'font_size': first.font.size
        }
        for run in runs:
            run.text = ""
        first.text = updated
        first.bold = first_format['bold']
        first.italic = first_format['italic']
        if first_format['font_name']:
            first.font.name = first_format['font_name']
        if first_format['font_size']:
            first.font.size = first_format['font_size']

    def fill_single(self, data_dict, output_path):
        """Fill the template once from a dict and save it.

        Keys are converted with ``str`` and ``None`` values become an
        empty string, matching how ``fill_from_excel`` treats column names
        and missing cells. The parent folder of ``output_path`` is created
        if it does not exist.
        """
        doc = Document(self.template_path)
        replacements = {
            '{{' + str(k) + '}}': '' if v is None else str(v)
            for k, v in data_dict.items()
        }
        self._fill_document(doc, replacements)
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)
        doc.save(output_path)
        print(f"✅ 文档已生成：{output_path}")

4.3 格式转换模块

Word→PDF批量转换（本模块只实现 Word 转 PDF 这一方向）：

from pathlib import Path
import subprocess
import platform
import shutil
import tempfile


class FormatConverter:
    """Convert Word documents to PDF.

    On Windows the Word COM interface is used (falling back to docx2pdf
    when comtypes is not installed); on every other platform LibreOffice
    is run in headless mode.
    """

    def word_to_pdf(self, input_path, output_path=None):
        """Convert one document to PDF.

        Args:
            input_path: The .docx file to convert.
            output_path: Target PDF path; defaults to the input path with
                a ``.pdf`` suffix. Its parent folder is created if needed.

        Returns:
            The output path as a string.

        Raises:
            FileNotFoundError: If ``input_path`` is not an existing file.
        """
        input_path = Path(input_path)
        if not input_path.is_file():
            raise FileNotFoundError(f"找不到Word文档：{input_path}")
        if not output_path:
            output_path = input_path.with_suffix('.pdf')
        Path(output_path).parent.mkdir(parents=True, exist_ok=True)

        if platform.system() == 'Windows':
            self._convert_windows(str(input_path), str(output_path))
        else:
            self._convert_libreoffice(str(input_path), str(output_path))
        return str(output_path)

    def _convert_windows(self, input_path, output_path):
        """Convert through the Word COM interface (docx2pdf as fallback)."""
        # Word resolves relative paths against its own working directory,
        # not this process's, so hand it absolute paths.
        source = str(Path(input_path).resolve())
        target = str(Path(output_path).resolve())

        try:
            import comtypes.client
        except ImportError:
            # comtypes not installed: fall back to docx2pdf.
            from docx2pdf import convert
            convert(source, target)
            return

        word = comtypes.client.CreateObject('Word.Application')
        try:
            word.Visible = False
            doc = word.Documents.Open(source)
            try:
                doc.SaveAs(target, FileFormat=17)  # 17 = PDF
            finally:
                doc.Close()
        finally:
            # Always shut Word down, otherwise a failed conversion leaves
            # a hidden Word process running.
            word.Quit()

    def _convert_libreoffice(self, input_path, output_path):
        """Convert with headless LibreOffice (Linux/macOS).

        Raises:
            FileNotFoundError: If no LibreOffice executable is on PATH.
            subprocess.CalledProcessError: If LibreOffice exits non-zero.
            RuntimeError: If LibreOffice exits cleanly without a PDF.
        """
        executable = shutil.which('libreoffice') or shutil.which('soffice')
        if executable is None:
            raise FileNotFoundError(
                "未找到 LibreOffice（libreoffice/soffice），请先安装并加入 PATH"
            )

        target = Path(output_path)
        # LibreOffice only accepts an output directory and always names
        # the file "<input stem>.pdf". Convert into a scratch directory
        # and move the result so the requested file name is honoured and
        # nothing else in the target folder can be clobbered.
        with tempfile.TemporaryDirectory() as scratch:
            subprocess.run([
                executable, '--headless', '--convert-to', 'pdf',
                '--outdir', scratch, input_path
            ], check=True)
            produced = Path(scratch) / (Path(input_path).stem + '.pdf')
            if not produced.is_file():
                raise RuntimeError(f"LibreOffice 未生成PDF：{input_path}")
            target.parent.mkdir(parents=True, exist_ok=True)
            shutil.move(str(produced), str(target))

    def batch_to_pdf(self, folder_path, output_folder=None):
        """Convert every .docx directly inside a folder to PDF.

        A file that fails to convert is reported and skipped; the rest of
        the batch still runs.

        Args:
            folder_path: Folder to scan (not recursive).
            output_folder: Destination folder; defaults to a ``pdf``
                sub-folder of ``folder_path``. Created if missing.

        Returns:
            Number of documents converted successfully.
        """
        folder = Path(folder_path)
        output = Path(output_folder) if output_folder else folder / 'pdf'
        output.mkdir(parents=True, exist_ok=True)

        # Drop Word's "~$" lock files up front so the progress counter and
        # the final ratio only cover real documents.
        files = sorted(
            f for f in folder.glob('*.docx') if not f.name.startswith('~$')
        )
        print(f"发现 {len(files)} 个Word文档")

        success = 0
        for i, file in enumerate(files, start=1):
            output_path = output / file.with_suffix('.pdf').name
            try:
                self.word_to_pdf(str(file), str(output_path))
            except Exception as e:
                # Best effort by design: one bad file must not stop the batch.
                print(f" [{i}/{len(files)}] {file.name} → 失败: {e}")
            else:
                success += 1
                print(f" [{i}/{len(files)}] {file.name} → PDF ✅")
        print(f"✅ 批量转换完成：{success}/{len(files)}")
        return success

4.4 水印添加模块

支持文字水印和图片水印：

from docx import Document from docx.shared import Pt, Inches, RGBColor, Emu from docx.oxml.ns import qn, nsdecls from docx.oxml import parse_xml from pathlib import Path class WatermarkAdder: def add_text_watermark(self, file_path, text, output_path=None): """添加文字水印""" doc = Document(file_path) for section in doc.sections: header = section.header header.is_linked_to_previous = False # 创建水印XML watermark_xml = f''' <w:r {nsdecls('w', 'v', 'o', 'wp', 'r')}> <w:rPr> <w:noProof/> </w:rPr> <v:shapetype id="_x0000_t136" coordsize="21600,21600" o:spt="136" path="m@7,l@8,m@5,21600l@6,21600e"> </v:shapetype> <v:shape id="PowerP ...(truncated)...

查看全文

http://www.jsqmd.com/news/654475/