# 第七章:表格识别与提取
## 表格识别概述
表格识别(Table Recognition)是文档解析中的难点,需要识别表格的结构(行、列、边框)和内容。本章介绍多种表格识别方案。
### 方案对比
| 方案 | 优点 | 缺点 | 适用场景 |
|---|---|---|---|
| pdfplumber | 简单易用 | 对复杂表格效果差 | 简单表格 |
| Camelot | 开源免费 | 依赖 Ghostscript/OpenCV 等外部组件 | PDF 表格 |
| PaddleOCR | 端到端 | 需要训练 | 复杂表格 |
| DeepDocParser | 阿里开源,效果好 | — | 生产环境 |
## pdfplumber 表格提取
### 基本用法
import pdfplumber
def extract_tables_pdfplumber(pdf_path):
    """Extract every non-empty table from a PDF with pdfplumber.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.

    Returns:
        A list with one dict per non-empty table. Each dict has the keys
        ``"page"`` (1-based page number), ``"table_num"`` (1-based position
        among everything ``extract_tables()`` returned for that page, so a
        skipped empty table still consumes a number) and ``"data"`` (the
        table exactly as pdfplumber returned it).
    """
    found = []
    with pdfplumber.open(pdf_path) as document:
        for page_no, page in enumerate(document.pages, start=1):
            for table_no, rows in enumerate(page.extract_tables(), start=1):
                if not rows:
                    # Empty results carry no data worth reporting.
                    continue
                found.append({
                    "page": page_no,
                    "table_num": table_no,
                    "data": rows,
                })
    return found
# Usage: print a five-row preview of every table found in document.pdf.
tables = extract_tables_pdfplumber("document.pdf")
for entry in tables:
    page_no, table_no = entry['page'], entry['table_num']
    print(f"第 {page_no} 页,第 {table_no} 个表格:")
    preview = entry['data'][:5]
    for row in preview:
        print(row)
### 高级配置
import pdfplumber
def extract_tables_advanced(pdf_path):
    """Extract tables with explicit pdfplumber settings and tidy the cell text.

    Args:
        pdf_path: Path of the PDF file, handed to ``pdfplumber.open``.

    Returns:
        A list of tables in page order. Each table is a list of rows and each
        row a list of cells. Non-empty cells have every run of whitespace
        (including line breaks) collapsed to a single space; empty or ``None``
        cells are passed through unchanged. No page information is attached.
    """
    # NOTE: the former "min_cell_area" entry was removed. It is not a
    # pdfplumber table setting, and pdfplumber rejects unknown keys instead
    # of ignoring them, so the call failed before extracting anything.
    table_settings = {
        # How vertical cell borders are detected: lines, lines_strict or text.
        "vertical_strategy": "lines_strict",
        # How horizontal cell borders are detected (same choices).
        "horizontal_strategy": "lines_strict",
        # Tolerance used when matching line intersections.
        "intersection_tolerance": 5,
        # Tolerances used when extracting the text inside each cell.
        "text_x_tolerance": 3,
        "text_y_tolerance": 3,
    }

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables(table_settings):
                tables.append([
                    [' '.join(cell.split()) if cell else cell for cell in row]
                    for row in table
                ])
    return tables
## Camelot 表格提取
### 安装
### 基本用法
import camelot
def extract_tables_camelot(pdf_path):
    """Extract tables from every page of a PDF with Camelot (stream flavor).

    Args:
        pdf_path: Path of the PDF file, handed to ``camelot.read_pdf``.

    Returns:
        A list of dicts with the keys ``"table_num"`` (1-based index),
        ``"data"`` (``table.data``) and ``"shape"`` (``table.shape``).
        The shape of each table is also printed as it is collected.
    """
    # The stream flavor targets borderless tables; lattice targets tables
    # with ruled borders.
    detected = camelot.read_pdf(pdf_path, pages='1-end', flavor='stream')

    results = []
    for number, table in enumerate(detected, start=1):
        shape = table.shape
        results.append({
            "table_num": number,
            "data": table.data,
            "shape": shape,
        })
        print(f"表格 {number}: {shape}")
    return results
# 导出为 Excel
# tables.export('tables.csv', f='csv')
# tables.export('tables.xlsx', f='excel')
### 两种模式对比
# Lattice flavor: intended for tables drawn with borders.
# process_background=True additionally handles background colors.
tables_lattice = camelot.read_pdf(
    pdf_path, pages='1', flavor='lattice', process_background=True
)

# Stream flavor: intended for borderless tables.
# edge_tol is the edge tolerance.
tables_stream = camelot.read_pdf(
    pdf_path, pages='1', flavor='stream', edge_tol=500
)
## PaddleOCR 表格识别
### 安装
### 基本用法
from paddleocr import PaddleOCR
def extract_tables_paddleocr(image_path):
    """Recognize the tables in an image with PaddleOCR's PP-Structure pipeline.

    ``PaddleOCR.ocr()`` accepts no ``table`` argument and returns text lines,
    not table structure. Table recognition is exposed through ``PPStructure``,
    so this function uses that engine instead.

    NOTE(review): written against the PaddleOCR 2.x ``PPStructure`` API;
    confirm against the installed version.

    Args:
        image_path: Path of the image to analyse.

    Returns:
        A list with the regions PP-Structure classified as tables. Each region
        is a dict; ``region['bbox']`` is its bounding box and
        ``region['res']['html']`` the recognized table as HTML. The list is
        empty when no table is detected.
    """
    # Imported locally: the snippet's top-level import only brings in PaddleOCR.
    from paddleocr import PPStructure

    engine = PPStructure(lang='ch', show_log=False)
    regions = engine(image_path) or []
    tables = [
        region for region in regions
        if str(region.get('type', '')).lower() == 'table'
    ]
    for table_info in tables:
        # Print only the bounding box; the region also carries the cropped
        # image array, which is not useful on the console.
        print(f"表格: {table_info['bbox']}")
    return tables


# Recognize and export HTML.
# NOTE(review): `image_path` must be defined by the caller before this runs.
for table_info in extract_tables_paddleocr(image_path):
    html = table_info['res']['html']
    print(html)
## 阿里 DeepDocParser
### 安装
### 使用
from deepdocparser import Parser
# NOTE(review): the `deepdocparser` package and the attributes used below
# (pages, tables, rows, cols, cells, text) could not be confirmed against
# public documentation; verify before relying on this snippet.

# Create the parser and parse the PDF.
parser = Parser()
doc = parser.parse("document.pdf")

# Walk every table of every page, reporting its size and then its rows.
for table in (tbl for page in doc.pages for tbl in page.tables):
    print(f"表格行数: {len(table.rows)}")
    print(f"表格列数: {len(table.cols)}")
    for row in table.rows:
        # Collect the text of each cell in the row.
        row_data = [cell.text for cell in row.cells]
        print(row_data)
## 表格后处理
### 单元格合并处理
def process_merged_cells(table_data):
    """Collapse runs of blank cells in every row of a table.

    A cell is blank when it is falsy (``None``, ``''``, ``0``) or a string
    containing only whitespace. Within each row:

    * non-blank cells are kept unchanged;
    * the first blank cell after a non-blank cell is kept as ``''``;
    * further consecutive blank cells, and blank cells at the start of the
      row, are dropped (they are assumed to come from merged cells);
    * rows that end up empty are omitted from the result.

    Because cells are dropped, the output rows may be shorter than the input
    rows and are not guaranteed to stay column-aligned.

    Args:
        table_data: Iterable of rows, each an iterable of cells. Cells may be
            strings, ``None`` or other values; non-string values no longer
            raise ``AttributeError``.

    Returns:
        A new list of processed rows.
    """
    processed = []
    for row in table_data:
        merged = []
        for cell in row:
            is_blank = not cell or (isinstance(cell, str) and not cell.strip())
            if not is_blank:
                merged.append(cell)
            elif merged and merged[-1]:
                # First blank after real content: keep one placeholder.
                merged.append('')
            # Otherwise: leading blank or repeated blank, so drop it.
        if merged:
            processed.append(merged)
    return processed
### 转换为 DataFrame
import pandas as pd
def table_to_dataframe(table_data, headers=None):
    """Convert extracted table rows into a pandas DataFrame.

    Args:
        table_data: List of rows (each a list of cells). Empty or ``None``
            yields an empty DataFrame.
        headers: Controls the column labels.

            * ``None`` (default) or any other falsy value: every row is data
              and pandas assigns default integer column labels.
            * ``True``: the first row of ``table_data`` supplies the column
              labels and is removed from the data.
            * a list of names: those names become the column labels and the
              first row of ``table_data`` (assumed to be the original header
              row) is removed from the data.

    Returns:
        The resulting ``pandas.DataFrame``.
    """
    if not table_data:
        return pd.DataFrame()
    if headers is True:
        # Use the table's own first row as the header.
        return pd.DataFrame(table_data[1:], columns=table_data[0])
    if headers:
        # Caller-supplied names replace the table's first (header) row.
        return pd.DataFrame(table_data[1:], columns=headers)
    return pd.DataFrame(table_data)
# Usage: `table_data` must already hold the rows of one extracted table.
df = table_to_dataframe(table_data)
print(df)
### 导出为 Excel
import pandas as pd
def export_tables_to_excel(tables, output_path):
    """Write each table to its own sheet of an Excel workbook.

    Args:
        tables: Sequence of dicts with the keys ``"page"``, ``"table_num"``
            and ``"data"`` (list of rows).
        output_path: Path of the ``.xlsx`` file to create.

    Returns:
        None. When ``tables`` is empty nothing is written: a workbook needs
        at least one sheet, so saving an empty one would fail.
    """
    if not tables:
        return

    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for table in tables:
            df = pd.DataFrame(table['data'])
            sheet_name = f"Page{table['page']}_Table{table['table_num']}"
            # Excel limits sheet names to 31 characters.
            df.to_excel(writer, sheet_name=sheet_name[:31], index=False, header=False)
# Usage: `tables` must hold dicts with the 'page', 'table_num' and 'data' keys.
export_tables_to_excel(tables, "tables.xlsx")
## 完整示例
#!/usr/bin/env python3
"""表格识别与提取完整示例"""
import json
import logging
from html.parser import HTMLParser
from typing import List, Dict, Optional

import pandas as pd
import pdfplumber
_logger = logging.getLogger(__name__)


class _TableHTMLParser(HTMLParser):
    """Collect the cell texts of ``<tr>``/``<td>``/``<th>`` markup into rows.

    ``colspan``/``rowspan`` attributes are ignored: every cell element yields
    exactly one entry in its row.
    """

    def __init__(self):
        super().__init__()
        self.rows = []
        self._row = None   # cells of the <tr> being read, or None outside one
        self._cell = None  # text chunks of the cell being read, or None

    def handle_starttag(self, tag, attrs):
        if tag == 'tr':
            self._row = []
        elif tag in ('td', 'th') and self._row is not None:
            self._cell = []

    def handle_endtag(self, tag):
        if tag in ('td', 'th') and self._cell is not None:
            # Collapse whitespace the same way _clean_table does.
            self._row.append(' '.join(''.join(self._cell).split()))
            self._cell = None
        elif tag == 'tr' and self._row is not None:
            self.rows.append(self._row)
            self._row = None

    def handle_data(self, data):
        if self._cell is not None:
            self._cell.append(data)


class TableExtractor:
    """Extract tables from PDFs or images and export them in several formats.

    Every extracted table is a dict with at least the keys ``page``,
    ``table_num``, ``data`` (list of rows), ``rows`` and ``cols``.
    """

    # pdfplumber detection strategies, tried in this order for each page.
    _PDF_STRATEGIES = ('lines_strict', 'lines', 'text')

    def __init__(self, method='pdfplumber'):
        """Store the extraction method name (kept for callers; not consulted here)."""
        self.method = method

    def extract(self, source: str, source_type: str = 'pdf') -> List[Dict]:
        """Extract tables from a file.

        Args:
            source: Path of the file.
            source_type: Kind of file, ``'pdf'`` or ``'image'``.

        Returns:
            The list of extracted table dicts.

        Raises:
            ValueError: If ``source_type`` is not supported.
        """
        if source_type == 'pdf':
            return self._extract_from_pdf(source)
        if source_type == 'image':
            return self._extract_from_image(source)
        raise ValueError(f"不支持的文件类型: {source_type}")

    def _extract_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract tables from a PDF, trying several strategies per page.

        For each page the strategies in ``_PDF_STRATEGIES`` are tried in
        order. The first one that yields at least one valid table wins and
        the remaining strategies are skipped for that page. A strategy that
        raises is logged and skipped so one bad page cannot abort the run.
        """
        tables = []
        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                for strategy in self._PDF_STRATEGIES:
                    try:
                        page_tables = page.extract_tables({
                            "vertical_strategy": strategy,
                            "horizontal_strategy": strategy,
                        })
                    except Exception:
                        # Best effort: report the failure, then fall through
                        # to the next strategy.
                        _logger.warning(
                            "pdfplumber strategy %r failed on page %d",
                            strategy, page_num, exc_info=True,
                        )
                        continue

                    found = [
                        {
                            "page": page_num,
                            "table_num": table_num,
                            "strategy": strategy,
                            "data": self._clean_table(table),
                            "rows": len(table),
                            "cols": len(table[0]) if table else 0,
                        }
                        for table_num, table in enumerate(page_tables, start=1)
                        if table and self._is_valid_table(table)
                    ]
                    if found:
                        tables.extend(found)
                        break  # this strategy worked; skip the others
        return tables

    def _extract_from_image(self, image_path: str) -> List[Dict]:
        """Extract tables from an image with PaddleOCR's PP-Structure engine.

        NOTE(review): written against the PaddleOCR 2.x ``PPStructure`` API
        (``PaddleOCR.ocr()`` has no table mode); confirm against the
        installed version.

        Each table dict additionally carries the recognized ``html``.
        """
        # Imported lazily so PDF-only use does not require PaddleOCR.
        from paddleocr import PPStructure

        engine = PPStructure(lang='ch', show_log=False)
        tables = []
        for region in engine(image_path) or []:
            if str(region.get('type', '')).lower() != 'table':
                continue
            html = (region.get('res') or {}).get('html', '')
            data = self._html_to_rows(html)
            tables.append({
                "page": 1,
                "table_num": len(tables) + 1,
                "data": data,
                "rows": len(data),
                "cols": max((len(row) for row in data), default=0),
                "html": html,
            })
        return tables

    @staticmethod
    def _html_to_rows(html: str) -> List[List[str]]:
        """Convert table HTML into a list of rows of whitespace-normalized text."""
        parser = _TableHTMLParser()
        parser.feed(html or '')
        parser.close()
        return parser.rows

    def _is_valid_table(self, table: List[List]) -> bool:
        """Return True when a table has at least two rows and >30% filled cells.

        Cells are counted row by row, so ragged tables are measured
        correctly. ``None`` and whitespace-only cells count as empty.
        """
        if not table or len(table) < 2:
            return False
        total = sum(len(row) for row in table)
        if total == 0:
            return False
        non_empty = sum(
            1
            for row in table
            for cell in row
            if cell is not None and str(cell).strip()
        )
        return non_empty / total > 0.3

    def _clean_table(self, table: List[List]) -> List[List]:
        """Collapse whitespace inside every non-empty cell.

        Empty or ``None`` cells are passed through unchanged.
        """
        return [
            [' '.join(str(cell).split()) if cell else cell for cell in row]
            for row in table
        ]

    def to_dataframe(self, table_data: List[List]) -> pd.DataFrame:
        """Convert rows to a DataFrame, using the first row as the header.

        Returns an empty DataFrame when ``table_data`` is empty.
        """
        if not table_data:
            return pd.DataFrame()
        return pd.DataFrame(table_data[1:], columns=table_data[0])

    @staticmethod
    def _markdown_cell(cell) -> str:
        """Render one cell so it cannot break the Markdown table layout."""
        if cell is None:
            return ''
        text = str(cell).replace('|', '\\|')
        # A raw line break would end the table row; keep it as <br> instead.
        return text.replace('\r\n', '\n').replace('\n', '<br>')

    def to_markdown(self, table_data: List[List]) -> str:
        """Render rows as a Markdown table; the first row becomes the header.

        Returns an empty string when ``table_data`` is empty. ``None`` cells
        render as empty text; other values (including ``0``) are rendered
        with ``str()``.
        """
        if not table_data:
            return ""
        lines = []
        for index, row in enumerate(table_data):
            cells = [self._markdown_cell(cell) for cell in row]
            lines.append("| " + " | ".join(cells) + " |")
            if index == 0:
                # Separator line between header and body.
                lines.append("| " + " | ".join(["---"] * len(cells)) + " |")
        return "\n".join(lines) + "\n"

    def export(self, tables: List[Dict], output_path: str, format: str = 'excel'):
        """Export tables to a file.

        Args:
            tables: Table dicts as produced by ``extract``.
            output_path: Output path. For ``'csv'`` it is a prefix: one file
                named ``{output_path}_{n}.csv`` is written per table.
            format: Output format: ``'excel'``, ``'csv'``, ``'json'`` or
                ``'markdown'``.

        Raises:
            ValueError: If ``format`` is not one of the supported values.
        """
        if format == 'excel':
            if not tables:
                # A workbook needs at least one sheet; nothing to write.
                return
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                for table in tables:
                    df = pd.DataFrame(table['data'])
                    sheet_name = f"Page{table['page']}_T{table['table_num']}"
                    # Excel limits sheet names to 31 characters.
                    df.to_excel(writer, sheet_name=sheet_name[:31], index=False, header=False)
        elif format == 'csv':
            for i, table in enumerate(tables):
                df = pd.DataFrame(table['data'])
                df.to_csv(f"{output_path}_{i+1}.csv", index=False, header=False)
        elif format == 'json':
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(tables, f, ensure_ascii=False, indent=2)
        elif format == 'markdown':
            with open(output_path, 'w', encoding='utf-8') as f:
                for table in tables:
                    f.write(f"## Page {table['page']}, Table {table['table_num']}\n\n")
                    f.write(self.to_markdown(table['data']))
                    f.write("\n\n")
        else:
            raise ValueError(f"不支持的输出格式: {format}")
def main():
    """Create a TableExtractor and report that it is ready.

    The extraction and export calls are left commented out as a usage
    template.
    """
    extractor = TableExtractor()

    # Example workflow (uncomment to run against a real file):
    #   tables = extractor.extract("document.pdf", "pdf")
    #   print(f"提取到 {len(tables)} 个表格")
    #   if tables:
    #       extractor.export(tables, "tables.xlsx", "excel")

    print("表格提取器就绪")


if __name__ == "__main__":
    main()
## 常见问题
### 1. 表格检测不到
- 尝试不同的提取策略
- 调整图像预处理参数
- 使用专门的表格检测模型
### 2. 单元格内容不完整
- 调整文本提取参数
- 使用 OCR 作为后备方案
### 3. 合并单元格处理
- 使用后处理逻辑合并
- 使用支持合并单元格的解析器
## 下一步
下一章我们将学习实战项目:构建智能文档处理系统。