跳转至

第七章:表格识别与提取

表格识别概述

表格识别(Table Recognition)是文档解析中的难点,需要识别表格的结构(行、列、边框)和内容。本章介绍多种表格识别方案。

方案对比

| 方案 | 优点 | 缺点 | 适用场景 |
| --- | --- | --- | --- |
| pdfplumber | 简单易用 | 对复杂表格效果差 | 简单表格 |
| Camelot | 开源免费 | 依赖 Ghostscript | PDF 表格 |
| PaddleOCR | 端到端 | 需要训练 | 复杂表格 |
| DeepDocParser | 阿里开源,效果好 | — | 生产环境 |

pdfplumber 表格提取

基本用法

import pdfplumber

def extract_tables_pdfplumber(pdf_path):
    """Collect every non-empty table that pdfplumber finds in a PDF.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one dict per table: the 1-based ``page`` number, the
        1-based ``table_num`` of the table on that page, and ``data`` holding
        the rows exactly as returned by ``page.extract_tables()``.
    """
    with pdfplumber.open(pdf_path) as pdf:
        return [
            {"page": page_no, "table_num": table_no, "data": rows}
            for page_no, page in enumerate(pdf.pages, start=1)
            for table_no, rows in enumerate(page.extract_tables(), start=1)
            if rows  # empty tables are skipped but still consume a table number
        ]

# Usage: extract all tables from document.pdf and print the first five rows of each
tables = extract_tables_pdfplumber("document.pdf")
for t in tables:
    print(f"第 {t['page']} 页,第 {t['table_num']} 个表格:")
    for row in t['data'][:5]:
        print(row)

高级配置

import pdfplumber

def extract_tables_advanced(pdf_path):
    """Extract tables with explicit pdfplumber settings and tidy the cell text.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A flat list of tables (across all pages). Each table is a list of
        rows, each row a list of cells. Non-empty cells have every run of
        whitespace collapsed to a single space; empty or ``None`` cells are
        kept unchanged.
    """
    # Only keys that pdfplumber documents as table settings are passed here:
    # an unrecognised key (the earlier "min_cell_area") makes pdfplumber
    # reject the whole settings dict instead of ignoring it.
    table_settings = {
        # Line-detection strategy per axis: "lines", "lines_strict" or "text"
        "vertical_strategy": "lines_strict",
        "horizontal_strategy": "lines_strict",
        # Tolerance used when matching line intersections
        "intersection_tolerance": 5,
        # Tolerances used when grouping characters into cell text
        "text_x_tolerance": 3,
        "text_y_tolerance": 3,
    }

    tables = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            for table in page.extract_tables(table_settings):
                tables.append([
                    [' '.join(cell.split()) if cell else cell for cell in row]
                    for row in table
                ])

    return tables

Camelot 表格提取

安装

# Ubuntu/Debian: install the Ghostscript system package first
sudo apt-get install ghostscript
# Then install Camelot from PyPI with its "cv" extra
pip install camelot-py[cv]

基本用法

import camelot

def extract_tables_camelot(pdf_path):
    """Read every page of a PDF with Camelot and summarise the tables found.

    The ``stream`` flavor is used here; it targets tables without borders,
    whereas ``lattice`` targets tables with borders.

    Args:
        pdf_path: Path of the PDF file handed to ``camelot.read_pdf``.

    Returns:
        A list of dicts, one per table, holding the 1-based ``table_num``,
        the table's ``data`` and its ``shape``. The shape of each table is
        also printed as it is collected.
    """
    found = camelot.read_pdf(pdf_path, pages='1-end', flavor='stream')

    summaries = []
    for index, table in enumerate(found, start=1):
        summaries.append({
            "table_num": index,
            "data": table.data,
            "shape": table.shape,
        })
        print(f"表格 {index}: {table.shape}")

    return summaries

# 导出为 CSV 或 Excel(tables 为 camelot.read_pdf 返回的表格列表)
# tables.export('tables.csv', f='csv')
# tables.export('tables.xlsx', f='excel')

两种模式对比

# lattice flavor (suited to tables with borders)
tables_lattice = camelot.read_pdf(
    pdf_path, 
    pages='1',
    flavor='lattice',
    process_background=True  # also process background colours
)

# stream flavor (suited to tables without borders)
tables_stream = camelot.read_pdf(
    pdf_path,
    pages='1',
    flavor='stream',
    edge_tol=500  # edge tolerance
)

PaddleOCR 表格识别

安装

# Install the paddleocr and paddlepaddle packages from PyPI
pip install paddleocr paddlepaddle

基本用法

from paddleocr import PaddleOCR

def extract_tables_paddleocr(image_path):
    """Run PaddleOCR on an image and print whatever it reports as tables.

    NOTE(review): the ``table=`` keyword and the assumption that index 1 of
    the result holds the table structures are reproduced from the original
    snippet; confirm both against the installed PaddleOCR version.

    Args:
        image_path: Path of the image handed to ``ocr.ocr``.

    Returns:
        The raw object returned by ``ocr.ocr``.
    """
    engine_options = dict(
        use_angle_cls=True,
        lang='ch',
        table=True,  # enable table recognition
        table_max_num=10,
    )
    ocr = PaddleOCR(**engine_options)
    result = ocr.ocr(image_path, cls=True, table=True)

    # Index 1 is treated as the collection of table structures.
    table_entries = result[1]
    if table_entries:
        for table_info in table_entries:
            print(f"表格: {table_info}")

    return result

# Recognise the table and print it as HTML
# NOTE(review): `ocr` and `image_path` are not defined at this level of the snippet
# (they are locals of extract_tables_paddleocr) — define them before running this.
result = ocr.ocr(image_path, table=True, det=False, rec=False, cls=False)
html = result[1]['html']
print(html)

阿里 DeepDocParser

安装

# NOTE(review): confirm that "deepdocparser" is the published package name before running
pip install deepdocparser

使用

from deepdocparser import Parser

# Create a parser instance
# NOTE(review): the Parser/pages/tables/rows/cols/cells names below come from this
# example only — confirm them against the library's own documentation.
parser = Parser()

# Parse the PDF
doc = parser.parse("document.pdf")

# Walk every table on every page
for page in doc.pages:
    for table in page.tables:
        print(f"表格行数: {len(table.rows)}")
        print(f"表格列数: {len(table.cols)}")

        # Print the cell texts, one list per row
        for row in table.rows:
            row_data = [cell.text for cell in row.cells]
            print(row_data)

表格后处理

单元格合并处理

def process_merged_cells(table_data):
    """Collapse runs of empty cells that are presumed to come from merged cells.

    Each cell that is ``None``, empty or whitespace-only is normalised to
    ``''``. Within a row, an empty cell is kept only when it directly follows
    a non-empty cell, so leading empties are dropped and consecutive empties
    shrink to one. Rows that end up with no cells are omitted.

    Args:
        table_data: Iterable of rows; each row is an iterable of strings or
            ``None``.

    Returns:
        A new list of processed rows.
    """
    result = []

    for raw_row in table_data:
        # Normalise the whole row first, then collapse it.
        normalised = [
            value if (value and value.strip()) else ''
            for value in raw_row
        ]

        collapsed = []
        for text in normalised:
            follows_empty = not collapsed or not collapsed[-1]
            if not text and follows_empty:
                # Empty cell at the row start or right after another empty one.
                continue
            collapsed.append(text)

        if collapsed:
            result.append(collapsed)

    return result

转换为 DataFrame

import pandas as pd

def table_to_dataframe(table_data, headers=None):
    """Build a pandas DataFrame from a list of table rows.

    Args:
        table_data: List of rows (each a list of cell values).
        headers: Optional column names. When given, the first row of
            ``table_data`` is dropped and ``headers`` label the columns.

    Returns:
        An empty DataFrame when ``table_data`` is empty; otherwise a
        DataFrame of all rows (no ``headers``) or of every row after the
        first, with ``headers`` as columns.
    """
    if not table_data:
        return pd.DataFrame()

    if not headers:
        return pd.DataFrame(table_data)

    # The first row is treated as the header row and replaced by `headers`.
    return pd.DataFrame(table_data[1:], columns=headers)

# Usage
# NOTE(review): `table_data` is not defined in this snippet — supply a list of rows first.
df = table_to_dataframe(table_data)
print(df)

导出为 Excel

import pandas as pd

def export_tables_to_excel(tables, output_path):
    """Write each extracted table to its own worksheet of one Excel workbook.

    Args:
        tables: Iterable of dicts with ``data`` (rows), ``page`` and
            ``table_num`` keys.
        output_path: Destination path handed to ``pd.ExcelWriter``.
    """
    with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
        for entry in tables:
            frame = pd.DataFrame(entry['data'])
            title = f"Page{entry['page']}_Table{entry['table_num']}"
            # Sheet names are cut to 31 characters before writing.
            frame.to_excel(
                writer,
                sheet_name=title[:31],
                index=False,
                header=False,
            )

# Write the previously extracted `tables` list to tables.xlsx
export_tables_to_excel(tables, "tables.xlsx")

完整示例

#!/usr/bin/env python3
"""表格识别与提取完整示例"""

import pdfplumber
import pandas as pd
from typing import List, Dict, Optional
import json

class TableExtractor:
    """Extract tables from PDFs or images, then convert or export them."""

    def __init__(self, method='pdfplumber'):
        # Name of the extraction backend; stored, but the methods of this
        # class do not read it.
        self.method = method

    def extract(self, source: str, source_type: str = 'pdf') -> List[Dict]:
        """Extract tables from a file.

        Args:
            source: Path of the file to read.
            source_type: Kind of file, either ``'pdf'`` or ``'image'``.

        Returns:
            A list of table dicts (see the ``_extract_from_*`` methods).

        Raises:
            ValueError: If ``source_type`` is neither ``'pdf'`` nor ``'image'``.
        """
        if source_type == 'pdf':
            return self._extract_from_pdf(source)
        elif source_type == 'image':
            return self._extract_from_image(source)
        else:
            raise ValueError(f"不支持的文件类型: {source_type}")

    def _extract_from_pdf(self, pdf_path: str) -> List[Dict]:
        """Extract tables from every page of a PDF with pdfplumber.

        For each page the ``lines`` strategy is tried first. The ``text``
        strategy is used only as a fallback when ``lines`` raised or produced
        no table that passes ``_is_valid_table``.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            A list of dicts with the keys ``page`` and ``table_num`` (both
            1-based), ``strategy``, ``data`` (cleaned rows), ``rows`` and
            ``cols``.
        """
        import logging

        logger = logging.getLogger(__name__)
        tables = []

        with pdfplumber.open(pdf_path) as pdf:
            for page_num, page in enumerate(pdf.pages, start=1):
                # Only strategy names that pdfplumber accepts are listed; an
                # unknown name raises and would be skipped on every page.
                for strategy in ('lines', 'text'):
                    try:
                        page_tables = page.extract_tables({
                            "vertical_strategy": strategy,
                            "horizontal_strategy": strategy,
                        })
                    except Exception:
                        # Best effort: record the failure and try the next strategy.
                        logger.warning(
                            "第 %d 页使用 %s 策略提取表格失败",
                            page_num, strategy, exc_info=True,
                        )
                        continue

                    found = [
                        {
                            "page": page_num,
                            "table_num": table_num,
                            "strategy": strategy,
                            "data": self._clean_table(table),
                            "rows": len(table),
                            "cols": len(table[0]) if table else 0,
                        }
                        for table_num, table in enumerate(page_tables, start=1)
                        if table and self._is_valid_table(table)
                    ]
                    if found:
                        tables.extend(found)
                        # Stop only once this strategy actually produced
                        # tables, so the fallback still runs for empty results.
                        break

        return tables

    def _extract_from_image(self, image_path: str) -> List[Dict]:
        """Extract tables from an image via OCR (simplified example).

        NOTE(review): the ``table=`` keyword and the use of ``result[1]`` as
        the table rows are kept exactly as in the original example; confirm
        both against the installed PaddleOCR version.

        Args:
            image_path: Path of the image file.

        Returns:
            A list with at most one table dict (``page`` and ``table_num``
            are fixed to 1).
        """
        # PaddleOCR or another OCR tool can be used here
        # Simplified example
        from paddleocr import PaddleOCR

        ocr = PaddleOCR(use_angle_cls=True, lang='ch', table=True)
        result = ocr.ocr(image_path, cls=True, table=True)

        tables = []
        if result and result[1]:
            tables.append({
                "page": 1,
                "table_num": 1,
                "data": result[1],
                "rows": len(result[1]),
                "cols": len(result[1][0]) if result[1] else 0
            })

        return tables

    def _is_valid_table(self, table: List[List]) -> bool:
        """Decide whether extracted rows look like a real table.

        A table is accepted when it has at least two rows and more than 30%
        of its cells contain non-whitespace content.

        Args:
            table: Rows of cells; cells may be ``None`` or any value whose
                ``str()`` form is meaningful.

        Returns:
            ``True`` if the table passes both checks, otherwise ``False``.
        """
        if not table or len(table) < 2:
            return False

        # Count the real cells so rows of differing lengths are handled.
        total = sum(len(row) for row in table)
        if total == 0:
            return False

        # str() keeps this consistent with _clean_table, which also accepts
        # non-string cells.
        non_empty = sum(
            1
            for row in table
            for cell in row
            if cell is not None and str(cell).strip()
        )
        return non_empty / total > 0.3

    def _clean_table(self, table: List[List]) -> List[List]:
        """Return a copy of the table with whitespace in each cell normalised.

        Truthy cells are converted to ``str`` and every run of whitespace is
        collapsed to one space; falsy cells (``None``, ``''``) are kept as is.
        """
        return [
            [' '.join(str(cell).split()) if cell else cell for cell in row]
            for row in table
        ]

    def to_dataframe(self, table_data: List[List]) -> pd.DataFrame:
        """Convert rows to a DataFrame, using the first row as column names.

        Returns an empty DataFrame when ``table_data`` is empty.
        """
        if not table_data:
            return pd.DataFrame()

        return pd.DataFrame(table_data[1:], columns=table_data[0])

    @staticmethod
    def _markdown_cell(cell) -> str:
        """Render one cell as text that is safe inside a Markdown table row."""
        if cell is None:
            return ''
        # A Markdown table row must stay on one line, so line breaks inside a
        # cell become <br>; pipes are escaped so they do not split the cell.
        text = '<br>'.join(str(cell).splitlines())
        return text.replace('|', '\\|')

    def to_markdown(self, table_data: List[List]) -> str:
        """Convert rows to a Markdown table, treating the first row as header.

        Args:
            table_data: Rows of cells; ``None`` cells render as empty text.

        Returns:
            The Markdown text (each line newline-terminated), or ``""`` when
            ``table_data`` is empty.
        """
        if not table_data:
            return ""

        lines = []
        for index, row in enumerate(table_data):
            cells = [self._markdown_cell(cell) for cell in row]
            lines.append("| " + " | ".join(cells) + " |")
            if index == 0:
                # Separator line right after the header row
                lines.append("| " + " | ".join(["---"] * len(cells)) + " |")

        return "\n".join(lines) + "\n"

    def export(self, tables: List[Dict], output_path: str, format: str = 'excel'):
        """Export extracted tables to a file.

        Args:
            tables: Table dicts with ``data``, ``page`` and ``table_num`` keys.
            output_path: Output path. For ``csv`` it is used as a prefix and
                one ``{output_path}_{n}.csv`` file is written per table.
            format: Output format: ``excel``, ``csv``, ``json`` or ``markdown``.

        Raises:
            ValueError: If ``format`` is not one of the supported formats.
        """
        if format == 'excel':
            with pd.ExcelWriter(output_path, engine='openpyxl') as writer:
                for table in tables:
                    df = pd.DataFrame(table['data'])
                    sheet_name = f"Page{table['page']}_T{table['table_num']}"
                    # Sheet names are cut to 31 characters before writing.
                    df.to_excel(writer, sheet_name=sheet_name[:31], index=False, header=False)

        elif format == 'csv':
            for i, table in enumerate(tables, start=1):
                df = pd.DataFrame(table['data'])
                df.to_csv(f"{output_path}_{i}.csv", index=False, header=False)

        elif format == 'json':
            with open(output_path, 'w', encoding='utf-8') as f:
                json.dump(tables, f, ensure_ascii=False, indent=2)

        elif format == 'markdown':
            with open(output_path, 'w', encoding='utf-8') as f:
                for table in tables:
                    f.write(f"## Page {table['page']}, Table {table['table_num']}\n\n")
                    f.write(self.to_markdown(table['data']))
                    f.write("\n\n")

        else:
            # Fail loudly instead of silently writing nothing, mirroring extract().
            raise ValueError(f"不支持的输出格式: {format}")

def main():
    """Create a TableExtractor and print a readiness message.

    The extraction/export workflow is shown as comments only; uncomment it to
    run against a real PDF.
    """
    table_extractor = TableExtractor()

    # Extract tables:
    # tables = table_extractor.extract("document.pdf", "pdf")
    # print(f"提取到 {len(tables)} 个表格")

    # Export them:
    # if tables:
    #     table_extractor.export(tables, "tables.xlsx", "excel")

    print("表格提取器就绪")

# Run the demo only when this file is executed as a script, not when imported
if __name__ == '__main__':
    main()

常见问题

1. 表格检测不到

  • 尝试不同的提取策略
  • 调整图像预处理参数
  • 使用专门的表格检测模型

2. 单元格内容不完整

  • 调整文本提取参数
  • 使用 OCR 作为后备方案

3. 合并单元格处理

  • 使用后处理逻辑合并
  • 使用支持合并单元格的解析器

下一步

下一章我们将学习实战项目:构建智能文档处理系统。