当前位置：首页 > news >正文

Java 解析 Word 中嵌入的 Excel

news 2026/3/27 3:19:52

一、思路

1. 入口→分发：extractFromWord 是总入口，核心是 “格式分发”，将.doc 和.docx 分流到不同处理逻辑；

2. .doc 核心：绕开路径解析，用 “逐层遍历 + 兜底读取” 确保文件能读到，再交给extractFromOLE解析；

3. .docx 核心：直接遍历 PackagePart，利用 POI 对 OOXML 的原生支持，快速识别 Excel 附件；

4. 解析核心：extractFromOLE 是格式兼容关键，区分.xls/.xlsx 用不同 POI 模块，避免解析失败；

5. 稳定性保障：多层过滤（过小文件 / 特殊字符）+ 异常捕获（单个文件失败不中断）+ 格式适配，确保程序稳定运行。

二、核心依赖：

<!-- Apache POI 处理 Word 和 Excel -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-ooxml</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache POI 处理 OLE 对象（嵌入式附件） -->
<dependency>
<groupId>org.apache.poi</groupId>
<artifactId>poi-scratchpad</artifactId>
<version>5.2.4</version>
</dependency>
<!-- Apache Tika 识别文件类型（辅助提取附件） -->
<dependency>
<groupId>org.apache.tika</groupId>
<artifactId>tika-core</artifactId>
<version>2.9.1</version>
</dependency>

三、源码

import org.apache.poi.openxml4j.opc.OPCPackage;
import org.apache.poi.openxml4j.opc.PackagePart;
import org.apache.poi.poifs.filesystem.*;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.apache.poi.xwpf.usermodel.XWPFDocument;
import org.apache.tika.Tika;import java.io.*;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Pattern;public class WordExcelExtractor {private static final Tika tika = new Tika();private static final Pattern NON_PRINTABLE_CHAR_PATTERN = Pattern.compile("[^\\x20-\\x7E]");private static final int MIN_EXCEL_SIZE = 100;public static List<byte[]> extractFromWord(File wordFile) throws IOException {validateFile(wordFile);List<byte[]> excelDataList = new ArrayList<>();String fileName = wordFile.getName().toLowerCase();try {if (fileName.endsWith(".docx")) {extractFromDocx(wordFile, excelDataList);} else if (fileName.endsWith(".doc")) {extractFromDocDirect(wordFile, excelDataList);} else {throw new IllegalArgumentException("不支持的格式！仅支持 .doc/.docx");}} catch (IllegalArgumentException e) {throw e;} catch (Exception e) {throw new IOException("提取 Excel 附件失败：" + e.getMessage(), e);}return excelDataList;}private static void extractFromDocDirect(File docFile, List<byte[]> excelDataList) throws IOException {try (FileInputStream fis = new FileInputStream(docFile);POIFSFileSystem poifs = new POIFSFileSystem(fis)) {DirectoryEntry rootEntry = poifs.getRoot();DirectoryEntry objectPool = getObjectPoolDirectory(rootEntry);if (objectPool == null) {System.out.println("无 ObjectPool 目录，无嵌入式附件");return;}for (Entry entry1 : objectPool) {String dir1Name = filterSpecialChars(entry1.getName());if (dir1Name.isEmpty() || !(entry1 instanceof DirectoryEntry)) {continue;}DirectoryEntry subDir = (DirectoryEntry) entry1;System.out.println("找到 OBJECTPOOL 子目录：" + dir1Name);for (Entry entry2 : subDir) {String fileName = filterSpecialChars(entry2.getName());if (fileName.isEmpty() || !(entry2 instanceof DocumentEntry)) {continue;}DocumentEntry docEntry = (DocumentEntry) entry2;long fileSize = docEntry.getSize();System.out.println("找到文件：" + dir1Name + "/" + fileName + "（大小：" + fileSize + " 字节）");if (fileSize < MIN_EXCEL_SIZE) {System.out.println("⚠️ 跳过过小文件（非 Excel）：" + dir1Name + "/" + fileName);continue;}try {// 主方案：路径读取try (InputStream is = 
poifs.createDocumentInputStream(getEntryFullPath(docEntry))) {byte[] oleData = is.readAllBytes();extractAndAddExcel(oleData, dir1Name + "/" + fileName, excelDataList);} catch (Exception e) {System.out.println("⚠️  路径读取失败，尝试直接读取文件字节");byte[] oleData = readDocumentEntryDirect(docEntry);if (oleData != null && oleData.length >= MIN_EXCEL_SIZE) {extractAndAddExcel(oleData, dir1Name + "/" + fileName, excelDataList);} else {System.out.println("❌ 兜底读取失败（数据无效）：" + dir1Name + "/" + fileName);}}} catch (Exception e) {System.out.println("❌ 处理文件失败，跳过：" + dir1Name + "/" + fileName + " → " + e.getMessage());}}}}}private static void extractAndAddExcel(byte[] oleData, String filePath, List<byte[]> excelDataList) {try {byte[] excelData = extractFromOLE(oleData);if (excelData != null) {excelDataList.add(excelData);System.out.println("✅ 成功提取 Excel：" + filePath);} else {String fileHeader = getFileHeader(oleData);System.out.println("❌ 非 Excel 文件（文件头：" + fileHeader + "）：" + filePath);}} catch (Exception e) {System.out.println("❌ 提取 Excel 失败：" + filePath + " → " + e.getMessage());}}private static byte[] readDocumentEntryDirect(DocumentEntry docEntry) {try (InputStream is = new DocumentInputStream(docEntry)) {byte[] data = new byte[(int) docEntry.getSize()];int readLen = is.read(data);return readLen > 0 ? data : null;} catch (IOException e) {System.out.println("⚠️  直接读取字节失败：" + e.getMessage());return null;}}/*** 核心修复：区分 .xls 和 .xlsx 格式，适配对应的解析模块*/private static byte[] extractFromOLE(byte[] oleData) {// 1. 快速过滤非 Excel 文件if (!isExcelFile(oleData)) {return null;}// 2. 判断是 .xls（OLE2）还是 .xlsx（OOXML）boolean isXls = isXlsFile(oleData);boolean isXlsx = isXlsxFile(oleData);// 3. 
处理 .xlsx 格式（OOXML）if (isXlsx) {try (ByteArrayInputStream bais = new ByteArrayInputStream(oleData)) {// 验证是否为有效 .xlsx（用 OOXML 专用的 OPCPackage）try (OPCPackage opcPackage = OPCPackage.open(bais)) {// 可选：进一步验证是否为 Excel 工作表（避免其他 OOXML 文件）try (XSSFWorkbook workbook = new XSSFWorkbook(opcPackage)) {// 能打开工作簿，说明是有效 .xlsxreturn oleData;}}} catch (Exception e) {System.out.println("⚠️  无效的 .xlsx 文件：" + e.getMessage());return null;}}// 4. 处理 .xls 格式（OLE2）if (isXls) {ByteArrayInputStream bais = null;POIFSFileSystem poifs = null;try {bais = new ByteArrayInputStream(oleData);poifs = new POIFSFileSystem(bais);DirectoryEntry root = poifs.getRoot();if (root.hasEntry("Package")) {try (InputStream is = poifs.createDocumentInputStream("Package")) {byte[] data = is.readAllBytes();return isExcelFile(data) ? data : null;}} else if (root.hasEntry("Contents")) {try (InputStream is = poifs.createDocumentInputStream("Contents")) {byte[] data = is.readAllBytes();return isExcelFile(data) ? data : null;}}// 直接是 .xls 文件，无需额外解析return oleData;} catch (NotOLE2FileException e) {System.out.println("⚠️  非 OLE2 格式文件：" + e.getMessage());} catch (IOException e) {System.out.println("⚠️  解析 .xls 文件失败：" + e.getMessage());} finally {if (poifs != null) {try {poifs.close();} catch (IOException e) {}}if (bais != null) {try {bais.close();} catch (IOException e) {}}}}// 5. 
Tika 辅助验证String fileType = tika.detect(oleData, "");if (fileType.contains("excel") || fileType.contains("spreadsheet")) {return oleData;}return null;}/*** 单独判断是否为 .xls 文件（OLE2 格式）*/private static boolean isXlsFile(byte[] data) {if (data.length < 4) return false;byte b1 = data[0], b2 = data[1], b3 = data[2], b4 = data[3];return (b1 == (byte) 0xD0 && b2 == (byte) 0xCF && b3 == (byte) 0x11 && b4 == (byte) 0xE0);}/*** 单独判断是否为 .xlsx 文件（OOXML 格式）*/private static boolean isXlsxFile(byte[] data) {if (data.length < 4) return false;byte b1 = data[0], b2 = data[1], b3 = data[2], b4 = data[3];return (b1 == (byte) 0x50 && b2 == (byte) 0x4B && b3 == (byte) 0x03 && b4 == (byte) 0x04);}// ---------------------- 工具方法 ----------------------private static DirectoryEntry getObjectPoolDirectory(DirectoryEntry root) throws IOException {if (root.hasEntry("ObjectPool")) {return (DirectoryEntry) root.getEntry("ObjectPool");} else if (root.hasEntry("OBJECTPOOL")) {return (DirectoryEntry) root.getEntry("OBJECTPOOL");}return null;}private static String filterSpecialChars(String name) {return name == null ? 
"" : NON_PRINTABLE_CHAR_PATTERN.matcher(name).replaceAll("");}private static boolean isExcelContentType(String contentType) {return contentType.equals("application/vnd.ms-excel")|| contentType.equals("application/vnd.openxmlformats-officedocument.spreadsheetml.sheet")|| contentType.equals("application/vnd.ms-excel.sheet.macroEnabled.12");}private static boolean isExcelFile(byte[] data) {return isXlsFile(data) || isXlsxFile(data);}private static String getFileHeader(byte[] data) {if (data.length < 4) return "不足4字节";StringBuilder sb = new StringBuilder();for (int i = 0; i < 4; i++) {sb.append(String.format("%02X ", data[i]));}return sb.toString().trim();}private static void validateFile(File file) throws IOException {if (!file.exists()) throw new FileNotFoundException("文件不存在：" + file.getAbsolutePath());if (!file.isFile()) throw new IOException("路径不是文件：" + file.getAbsolutePath());if (!file.canRead()) throw new IOException("文件不可读：" + file.getAbsolutePath());}private static void extractFromDocx(File docxFile, List<byte[]> excelDataList) throws IOException {try (XWPFDocument doc = new XWPFDocument(OPCPackage.open(docxFile))) {for (PackagePart part : doc.getAllEmbeddedParts()) {String contentType = part.getContentType();try (InputStream is = part.getInputStream()) {byte[] data = is.readAllBytes();if (data.length < MIN_EXCEL_SIZE) continue;if (isExcelContentType(contentType) || isExcelFile(data)) {excelDataList.add(data);System.out.println("✅ 提取 .docx 中的 Excel 附件");} else if (contentType.contains("oleObject")) {byte[] excelData = extractFromOLE(data);if (excelData != null) {excelDataList.add(excelData);}}} catch (Exception e) {System.out.println("❌ 处理 .docx 附件失败：" + e.getMessage());}}} catch (Exception e) {throw new IOException("解析 .docx 文件失败：" + e.getMessage(), e);}}/*** 获取 Entry 的绝对路径（用于主方案路径读取，即使兜底方案常用，也需保留避免报红）*/private static String getEntryFullPath(Entry entry) {List<String> pathParts = new ArrayList<>();Entry current = entry;while (current != null) {String name = 
current.getName();// 过滤根目录和无效名称if (name != null && !name.isEmpty() && !"Root Entry".equals(name)) {pathParts.add(name);}current = current.getParent();}// 反转路径部分，得到正确的绝对路径StringBuilder path = new StringBuilder();for (int i = pathParts.size() - 1; i >= 0; i--) {if (path.length() > 0) {path.append("/");}path.append(pathParts.get(i));}return path.toString();}
}