当前位置：首页 > news >正文

【pfg】

news 2026/3/27 0:28:05

def _append_csv_rows(csv_path, rows):
    """Append ``rows`` (an iterable of row sequences) to the CSV file at ``csv_path``."""
    with open(csv_path, 'a', newline='', encoding='utf-8') as csvfile:
        csv.writer(csvfile).writerows(rows)


def get_pdf_table(pdf_path, col_name):
    """Extract the tables of a PDF file into CSV files next to the PDF.

    Every table found by pdfplumber is assigned to a group according to its
    header. Each distinct header gets its own output file named
    ``<pdf path without extension><group number>.csv``; the header row is
    written once (the first time the header is seen) and the data rows of
    every table sharing that header are appended below it.

    Files are opened in append mode, so re-running the extraction on the same
    PDF appends to CSV files left over from a previous run.

    :param pdf_path: path of the PDF file to extract (the original notes ask
        for an absolute path).
    :param col_name: name of the header column that needs special handling;
        its position (or -1 when the header does not contain it) is passed on
        to ``set_span_cell_special``.
    :return: None. Progress and failures are reported with ``print``; any
        exception raised during extraction is caught and printed, not
        propagated.
    """
    try:
        output_csv = None  # last CSV written; stays None if nothing qualifies
        header_all = []  # distinct headers, in order of first appearance
        csv_base = os.path.splitext(pdf_path)[0]

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, 1):
                tables = page.find_tables(custom_settings)
                if not tables:
                    # No table on this page: move on to the next one.
                    continue

                for table in tables:
                    table_text = table.extract()
                    # Skip tables with no rows at all, and (as in the original)
                    # tables that have two columns or fewer.
                    if not table_text or len(table_text[0]) <= 2:
                        continue

                    # Per the original notes: horizontal / vertical merged-cell
                    # details of this table.
                    cell_span_result = find_cell_span(table.cells)
                    # Per the original notes: [first header row, last header row].
                    header_index_array = get_header_index(table_text, cell_span_result)
                    header_start = header_index_array[0]
                    header_end = header_index_array[1]

                    # Per the original notes: merged header cells are filled
                    # with the shared value, then the differing values are
                    # joined with "_" to build the final header.
                    header_result = set_span_cell_all(
                        table_text[header_start:header_end + 1],
                        cell_span_result,
                        header_start,
                    )
                    header = get_header(header_result)

                    # Resolve the special column for THIS header. Falling back
                    # to -1 (rather than keeping the previous table's index)
                    # keeps a stale index from being applied to a table that
                    # does not contain the column.
                    try:
                        index_col_name = header.index(col_name)
                    except (ValueError, TypeError):
                        index_col_name = -1

                    if header not in header_all:
                        # First time this header is seen: start a new CSV
                        # group and write the header row.
                        header_all.append(header)
                        content_index = header_end + 1
                        output_csv = csv_base + str(header_all.index(header)) + '.csv'
                        _append_csv_rows(output_csv, [header])
                        print(f"已提取表头:页面：{page_num}, {header},{col_name}列索引：{index_col_name}", file=sys.stderr)
                    else:
                        # Header already known: the data start offset is
                        # computed differently here, exactly as in the
                        # original code.
                        content_index = header_end - header_start + 1
                        output_csv = csv_base + str(header_all.index(header)) + '.csv'

                    data_rows = table_text[content_index:]
                    result = set_span_cell_special(
                        data_rows, cell_span_result, index_col_name, content_index
                    )
                    _append_csv_rows(output_csv, result)

        if output_csv is None:
            # Nothing was written; previously this path crashed on the unbound
            # ``output_csv`` and was reported as an extraction failure.
            print(f"提取文件:{pdf_path}未发现可提取的表格")
        else:
            print(f"✅ 成功将PDF表格提取为CSV: {os.path.abspath(output_csv)}")
    except Exception as e:
        print(f"提取文件:{pdf_path}失败，失败原因：{e}")

查看全文

http://www.jsqmd.com/news/461442/