跳转至

第六章:文档结构提取

文档结构概述

文档结构提取(Document Layout Analysis)是指识别文档中不同元素的位置和类型,如标题、段落、表格、图片、页眉页脚等。这是构建智能文档处理系统的关键步骤。

版面分析方法

传统方法

  • 基于规则的方法(连通域分析、投影法)
  • 机器学习方法(SVM、随机森林)

深度学习方法

  • 目标检测方法(Faster R-CNN、YOLO)
  • 分割方法(Mask R-CNN、U-Net)
  • 端到端布局分析模型

主流布局分析模型

模型 特点 适用场景
LayoutLM BERT + 视觉 表单、收据
LayoutLMv3 端到端 复杂文档
PubLayNet 基于PubLayNet数据集的预训练检测模型 通用文档
DiT 文档图像 Transformer 高精度需求

LayoutParser 使用

安装

pip install layoutparser torchvision
# Detectron2LayoutModel also requires Detectron2, which the line above does not install
pip install "git+https://github.com/facebookresearch/detectron2.git"

基本用法

import layoutparser as lp
import cv2

# Load a layout detector pre-trained on PubLayNet; detections scoring below
# 0.5 are dropped and numeric class ids are mapped to readable names.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

# Read the page image. cv2.imread returns None (it does not raise) when the
# file is missing or cannot be decoded, so fail early with a clear message
# instead of an opaque TypeError on the slicing below.
image = cv2.imread("document.jpg")
if image is None:
    raise FileNotFoundError("Cannot read image: document.jpg")
image = image[..., ::-1]  # BGR (OpenCV channel order) to RGB

# Run layout detection
layout = model.detect(image)

# Print the type and coordinates of every detected block
for block in layout:
    print(f"类型: {block.type}, 坐标: {block.coordinates}")

提取标题

import layoutparser as lp
import cv2

# Load the PubLayNet detector. The label map is passed explicitly so that
# block.type is a readable name such as "Title" (which extract_titles compares
# against) regardless of any default the installed layoutparser may apply.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

def extract_titles(image_path):
    """Detect the title regions of a document image and crop each one.

    Args:
        image_path: Path of the page image to analyse.

    Returns:
        A list with one dict per detected block whose type is "Title":
            "text":  the block's ``text`` attribute (NOTE(review): layout
                     detection alone presumably leaves this as None; an OCR
                     step would be needed to fill it in -- confirm).
            "image": the cropped RGB region of the title.
            "bbox":  the block coordinates as (x1, y1, x2, y2).

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    image = image[..., ::-1]  # BGR (OpenCV channel order) to RGB
    height, width = image.shape[:2]

    titles = []
    for block in model.detect(image):
        if block.type != "Title":
            continue

        # coordinates holds two corner points (x1, y1, x2, y2), not
        # (x, y, width, height), so the corners are used directly as bounds.
        x1, y1, x2, y2 = block.coordinates

        # Clamp to the image so a slightly negative corner does not wrap
        # around through negative indexing.
        left, top = max(int(x1), 0), max(int(y1), 0)
        right, bottom = min(int(x2), width), min(int(y2), height)

        titles.append({
            "text": block.text,
            "image": image[top:bottom, left:right],
            "bbox": block.coordinates,
        })

    return titles

# Example run: collect the titles of a sample page and print each one's text.
titles = extract_titles("document.jpg")
for entry in titles:
    print(entry["text"])

文档元素分类

元素类型定义

from enum import Enum

class DocumentElementType(Enum):
    """Kinds of layout element that can appear on a document page.

    The value of each member is the short string tag for that element kind.
    """

    # Headings
    TITLE = "title"  # document title
    HEADING = "heading"  # section heading

    # Body content
    PARAGRAPH = "paragraph"
    TABLE = "table"
    FIGURE = "figure"
    CAPTION = "caption"  # figure caption

    # Page furniture
    FOOTER = "footer"
    HEADER = "header"
    PAGE_NUMBER = "page_num"
    MARGINALIA = "marginalia"  # margin notes

结构化输出

import layoutparser as lp

class DocumentStructure:
    """Ordered collection of the elements found in a document.

    Attributes:
        elements: List of element dicts in insertion order, each with the
            keys "type", "content" and "bbox".
    """

    def __init__(self):
        self.elements = []

    def add_element(self, element_type, content, bbox):
        """Append one element to the structure.

        Args:
            element_type: Tag of the element, e.g. "title" or "paragraph".
            content: Content of the element.
            bbox: Bounding box of the element, stored as given
                (intended layout: [x, y, w, h]).
        """
        record = dict(type=element_type, content=content, bbox=bbox)
        self.elements.append(record)

    def to_dict(self):
        """Return the elements together with their count as a dict."""
        summary = {}
        summary["elements"] = self.elements
        summary["element_count"] = len(self.elements)
        return summary

    def to_markdown(self):
        """Render the title, heading and paragraph elements as Markdown.

        Titles become level-1 headings, headings become level-2 headings and
        paragraphs are emitted as plain text; every rendered element is
        followed by a blank line. Other element types are not rendered yet.
        """
        chunks = []
        for entry in self.elements:
            if entry["type"] == "title":
                prefix = "# "
            elif entry["type"] == "heading":
                prefix = "## "
            elif entry["type"] == "paragraph":
                prefix = ""
            else:
                # Remaining element types have no Markdown form yet.
                continue
            chunks.append(f"{prefix}{entry['content']}\n\n")
        return "".join(chunks)

实战:PDF 布局分析

完整示例

#!/usr/bin/env python3
"""文档结构分析完整示例"""

import cv2
import layoutparser as lp
import fitz  # PyMuPDF
from typing import List, Dict
import json

class DocumentLayoutAnalyzer:
    """Detect layout elements in page images and PDF files.

    Attributes:
        model: The loaded layoutparser detection model.
        label_map: Maps numeric class ids to the lower-case type names that
            appear in the result dicts.
    """

    # Class id -> label name handed to the PubLayNet detector.
    _PUBLAYNET_LABELS = {0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"}

    def __init__(self, model_type="publaynet"):
        """Load the detection model.

        Args:
            model_type: Which pre-trained model to load. Only "publaynet"
                is supported.

        Raises:
            ValueError: If ``model_type`` is not supported. Failing here is
                clearer than an AttributeError on the first ``detect`` call.
        """
        if model_type != "publaynet":
            raise ValueError(f"Unsupported model_type: {model_type!r}")

        self.model = lp.Detectron2LayoutModel(
            "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
            label_map=dict(self._PUBLAYNET_LABELS),
        )
        self.label_map = {
            class_id: name.lower()
            for class_id, name in self._PUBLAYNET_LABELS.items()
        }

    @staticmethod
    def _load_rgb(image_path):
        """Read an image file and return it as a contiguous RGB array."""
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"Cannot read image: {image_path}")
        # cvtColor yields a new contiguous array; a reversed-channel view
        # (image[..., ::-1]) is not contiguous and cannot be drawn on with
        # OpenCV drawing functions.
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def _normalize_type(self, raw_type):
        """Map a detected block type to the lower-case name used in results.

        The model is created with a label map, so ``block.type`` normally
        arrives as a name such as "Title"; numeric class ids are still
        translated through ``self.label_map``.
        """
        if raw_type in self.label_map:
            return self.label_map[raw_type]
        if isinstance(raw_type, str):
            return raw_type.lower()
        return raw_type

    def _block_to_dict(self, block) -> Dict:
        """Convert one detected layout block into a plain result dict."""
        return {
            "type": self._normalize_type(block.type),
            "bbox": block.coordinates,
            "score": getattr(block, "score", 1.0),
        }

    def analyze_image(self, image_path: str) -> List[Dict]:
        """Run layout detection on a single image file.

        Args:
            image_path: Path of the image to analyse.

        Returns:
            One dict per detected block with the keys "type" (lower-case
            name), "bbox" (block coordinates, (x1, y1, x2, y2)) and "score".

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        layout = self.model.detect(self._load_rgb(image_path))
        return [self._block_to_dict(block) for block in layout]

    def analyze_pdf(self, pdf_path: str) -> List[Dict]:
        """Run layout detection on every page of a PDF.

        Each page is rendered at 200 dpi to a PNG inside a private temporary
        directory, analysed, and the directory is removed afterwards even if
        analysis fails.

        Args:
            pdf_path: Path of the PDF file.

        Returns:
            One dict per page with the keys "page" (1-based page number) and
            "elements" (the ``analyze_image`` result for that page).
        """
        import os
        import tempfile

        all_results = []
        doc = fitz.open(pdf_path)
        try:
            with tempfile.TemporaryDirectory(prefix="layout_") as tmp_dir:
                for page_index, page in enumerate(doc):
                    pix = page.get_pixmap(dpi=200)
                    page_path = os.path.join(tmp_dir, f"page_{page_index}.png")
                    with open(page_path, "wb") as f:
                        f.write(pix.tobytes("png"))

                    all_results.append({
                        "page": page_index + 1,
                        "elements": self.analyze_image(page_path),
                    })
        finally:
            doc.close()
        return all_results

    def extract_by_type(self, results: List[Dict], element_type: str) -> List[Dict]:
        """Collect the elements of one type from per-page results.

        Args:
            results: Per-page results in the shape returned by ``analyze_pdf``.
            element_type: Type name to select; compared case-insensitively.

        Returns:
            One dict per matching element with the keys "page" and "bbox".
        """
        wanted = str(element_type).lower()
        extracted = []
        for page_result in results:
            for elem in page_result.get("elements", []):
                if str(elem["type"]).lower() == wanted:
                    extracted.append({
                        "page": page_result["page"],
                        "bbox": elem["bbox"],
                    })
        return extracted

    def visualize(self, image_path: str, output_path: str):
        """Draw the detected blocks on the image and save the figure.

        Args:
            image_path: Path of the image to analyse.
            output_path: Where the annotated figure is written.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        import matplotlib.pyplot as plt

        image = self._load_rgb(image_path)
        layout = self.model.detect(image)

        # Box colour per block type; the image is RGB at this point.
        colors = {
            "Text": (255, 0, 0),
            "Title": (0, 255, 0),
            "List": (0, 0, 255),
            "Table": (255, 255, 0),
            "Figure": (255, 0, 255),
        }

        for block in layout:
            x1, y1, x2, y2 = (int(value) for value in block.coordinates)
            color = colors.get(block.type, (128, 128, 128))
            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
            cv2.putText(image, str(block.type), (x1, y1 - 5),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        plt.figure(figsize=(12, 16))
        plt.imshow(image)
        plt.axis('off')
        plt.savefig(output_path, bbox_inches='tight', dpi=150)
        plt.close()

def main(argv=None):
    """Run layout analysis from the command line.

    With no input path the analyzer is created and only the completion
    message is printed. With an input path the file is analysed (as a PDF
    when its name ends in ".pdf", otherwise as an image) and the results are
    printed as JSON. ``--visualize OUTPUT`` additionally saves an annotated
    copy of an image input.

    Args:
        argv: Argument list to parse; defaults to the process arguments.
    """
    import argparse

    parser = argparse.ArgumentParser(description="Document layout analysis")
    parser.add_argument("input", nargs="?", help="image or PDF file to analyse")
    parser.add_argument("--visualize", metavar="OUTPUT",
                        help="save an annotated copy of an image input")
    args = parser.parse_args(argv)

    is_pdf = bool(args.input) and args.input.lower().endswith(".pdf")
    # Reject unusable option combinations before the (slow) model load.
    if args.visualize and (not args.input or is_pdf):
        parser.error("--visualize requires an image input")

    analyzer = DocumentLayoutAnalyzer()

    if args.input:
        if is_pdf:
            results = analyzer.analyze_pdf(args.input)
        else:
            results = analyzer.analyze_image(args.input)
        print(json.dumps(results, indent=2))

        if args.visualize:
            analyzer.visualize(args.input, args.visualize)

    print("布局分析完成")

if __name__ == '__main__':
    main()

高级:自定义布局模型

使用 Detectron2 训练自定义模型

# 1. 准备数据集(COCO 格式)
# {
#     "images": [{"id": 1, "file_name": "img1.jpg", "height": 800, "width": 600}],
#     "annotations": [
#         {"id": 1, "image_id": 1, "category_id": 0, "bbox": [x, y, w, h], "area": w*h}
#     ],
#     "categories": [
#         {"id": 0, "name": "title"},
#         {"id": 1, "name": "paragraph"},
#         {"id": 2, "name": "table"}
#     ]
# }

# 2. Configure training
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer

cfg = get_cfg()
cfg.merge_from_file("detectron2/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
# The config section is DATASETS (plural); Detectron2 has no DATASET node,
# so assigning to cfg.DATASET.* fails with an AttributeError.
# NOTE(review): both dataset names must be registered before training
# (e.g. with detectron2.data.datasets.register_coco_instances) -- confirm.
cfg.DATASETS.TRAIN = ("my_layout_train",)
cfg.DATASETS.TEST = ("my_layout_val",)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3  # number of custom categories

# 3. Train
# trainer = DefaultTrainer(cfg)
# trainer.train()

常见问题

1. 检测不到小文本

# Lower the confidence threshold so that lower-scoring detections are kept.
# A full config path is given: "lp://PubLayNet/..." is only a placeholder
# and cannot be loaded.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.3],  # lowered from 0.5
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

2. 表格检测不准确

# Use a detector trained specifically on tables (TableBank).
# The Layout Parser model zoo provides faster_rcnn_R_50_FPN_3x and
# faster_rcnn_R_101_FPN_3x for TableBank; it has no X_101_32x8d variant.
model = lp.Detectron2LayoutModel(
    "lp://TableBank/faster_rcnn_R_101_FPN_3x/config",
    label_map={0: "Table"},
)

3. 性能优化

# Switch to the lighter-weight PubLayNet model variant.
model = lp.Detectron2LayoutModel(
    config_path="lp://PubLayNet/mask_rcnn_R_50_FPN_3x/config",
)

下一步

下一章我们将学习表格识别与提取。