第六章:文档结构提取¶
文档结构概述¶
文档结构提取(Document Layout Analysis)是指识别文档中不同元素的位置和类型,如标题、段落、表格、图片、页眉页脚等。这是构建智能文档处理系统的关键步骤。
版面分析方法¶
传统方法¶
- 基于规则的方法(连通域分析、投影法)
- 机器学习方法(SVM、随机森林)
深度学习方法¶
- 目标检测方法(Faster R-CNN、YOLO)
- 分割方法(Mask R-CNN、U-Net)
- 端到端布局分析模型
主流布局分析模型¶
| 模型 | 特点 | 适用场景 |
|---|---|---|
| LayoutLM | BERT + 视觉 | 表单、收据 |
| LayoutLMv3 | 端到端 | 复杂文档 |
| PubLayNet | 大规模版面数据集及其上的预训练检测模型 | 通用文档 |
| DiT | 文档图像 Transformer | 高精度需求 |
LayoutParser 使用¶
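安装¶
pip install layoutparser
pip install "layoutparser[layoutmodels]"  # 深度学习布局模型支持
# Detectron2 需要单独安装,请参考其官方安装说明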
基本用法¶
import layoutparser as lp
import cv2
# Load the pre-trained PubLayNet detector. The label_map turns the numeric
# class ids predicted by the model into readable names.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)

# Detect the layout. cv2.imread does not raise on a missing or unreadable
# file, it returns None, so fail early with a clear message instead of a
# TypeError on the channel flip below.
image = cv2.imread("document.jpg")
if image is None:
    raise FileNotFoundError("Cannot read image: document.jpg")
image = image[..., ::-1]  # BGR to RGB
layout = model.detect(image)

# Print the type and bounding box of every detected block
for block in layout:
    print(f"类型: {block.type}, 坐标: {block.coordinates}")
提取标题¶
import layoutparser as lp
import cv2
# Load the model. Pass label_map explicitly: layoutparser versions that do
# not infer it from the lp:// path leave block.type as the numeric class id,
# which would never equal the string "Title" tested below.
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
    label_map={0: "Text", 1: "Title", 2: "List", 3: "Table", 4: "Figure"},
)


def extract_titles(image_path):
    """Extract the title regions of a document image.

    Args:
        image_path: Path of the image file to analyse.

    Returns:
        A list with one dict per detected title containing
        ``"text"`` (the block's text attribute; the detector does not run
        OCR, so expect ``None`` unless text is filled in elsewhere),
        ``"image"`` (the cropped RGB title region) and
        ``"bbox"`` (the block coordinates as ``(x1, y1, x2, y2)``).

    Raises:
        FileNotFoundError: If the image cannot be read.
    """
    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread signals failure by returning None rather than raising.
        raise FileNotFoundError(f"Cannot read image: {image_path}")
    image = image[..., ::-1]  # BGR to RGB
    layout = model.detect(image)

    height, width = image.shape[:2]
    titles = []
    for block in layout:
        if block.type != "Title":
            continue
        # coordinates are the two corners (x1, y1, x2, y2), not (x, y, w, h).
        x1, y1, x2, y2 = block.coordinates
        # Clamp to the image so a slightly out-of-range box cannot produce
        # negative indices (which would wrap around when slicing).
        left, top = max(int(x1), 0), max(int(y1), 0)
        right, bottom = min(int(x2), width), min(int(y2), height)
        titles.append({
            "text": block.text,
            "image": image[top:bottom, left:right],
            "bbox": block.coordinates,
        })
    return titles


titles = extract_titles("document.jpg")
for title in titles:
    print(title["text"])
文档元素分类¶
元素类型定义¶
from enum import Enum
class DocumentElementType(Enum):
    """Kinds of elements that can appear in a document layout."""

    # Headings
    TITLE = "title"            # document title
    HEADING = "heading"        # chapter / section heading

    # Body content
    PARAGRAPH = "paragraph"    # running text
    TABLE = "table"            # table
    FIGURE = "figure"          # picture or figure
    CAPTION = "caption"        # figure caption

    # Page furniture
    FOOTER = "footer"          # page footer
    HEADER = "header"          # page header
    PAGE_NUMBER = "page_num"   # page number
    MARGINALIA = "marginalia"  # margin note
结构化输出¶
import layoutparser as lp
class DocumentStructure:
    """Ordered collection of document elements with export helpers.

    Elements are kept in insertion order in ``self.elements`` as dicts with
    the keys ``"type"``, ``"content"`` and ``"bbox"``.
    """

    # Markdown prefix for every element type that to_markdown renders.
    # Types that are not listed here are skipped when rendering.
    _MARKDOWN_PREFIX = {
        "title": "# ",
        "heading": "## ",
        "paragraph": "",
    }

    def __init__(self):
        self.elements = []

    def add_element(self, element_type, content, bbox):
        """Append one element.

        Args:
            element_type: Element kind, either a plain string such as
                ``"title"`` or an enum member whose ``value`` is that string.
            content: Text content of the element.
            bbox: Bounding box. It is stored exactly as given (no conversion
                is performed), so use one convention consistently.
        """
        self.elements.append({
            "type": element_type,
            "content": content,
            "bbox": bbox,
        })

    def to_dict(self):
        """Return the elements and their count as a dictionary."""
        return {
            "elements": self.elements,
            "element_count": len(self.elements),
        }

    def to_markdown(self):
        """Render titles, headings and paragraphs as Markdown.

        Returns:
            The Markdown text; an empty string when nothing is renderable.
        """
        parts = []
        for elem in self.elements:
            # Accept enum members as well as raw strings: compare on the
            # member's value so enum-typed elements are not silently dropped.
            kind = getattr(elem["type"], "value", elem["type"])
            prefix = self._MARKDOWN_PREFIX.get(kind)
            if prefix is None:
                continue  # other element types are not rendered
            parts.append(f"{prefix}{elem['content']}\n\n")
        return "".join(parts)
实战:PDF 布局分析¶
完整示例¶
#!/usr/bin/env python3
"""文档结构分析完整示例"""
import cv2
import layoutparser as lp
import fitz # PyMuPDF
from typing import List, Dict
import json
class DocumentLayoutAnalyzer:
    """Detect layout elements in images and PDFs with a LayoutParser model.

    Attributes:
        model: The LayoutParser detection model.
        label_map: Maps class ids to the lower-case type names used in the
            results returned by this class.
    """

    def __init__(self, model_type="publaynet"):
        """Create the analyzer.

        Args:
            model_type: Name of the model to load. Only ``"publaynet"`` is
                supported.

        Raises:
            ValueError: If ``model_type`` is not supported. Without this
                check the instance would be left without a ``model`` and
                fail later with an AttributeError.
        """
        if model_type != "publaynet":
            raise ValueError(f"Unsupported model_type: {model_type!r}")
        self.model = lp.Detectron2LayoutModel(
            "lp://PubLayNet/mask_rcnn_X_101_32x8d_FPN_3x/config",
            extra_config=["MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.5],
            label_map={
                0: "Text",
                1: "Title",
                2: "List",
                3: "Table",
                4: "Figure",
            },
        )
        self.label_map = {
            0: "text",
            1: "title",
            2: "list",
            3: "table",
            4: "figure",
        }

    def _normalize_type(self, raw_type):
        """Map a detector label (class id or name) to its lower-case name."""
        if raw_type in self.label_map:
            return self.label_map[raw_type]
        # The model is configured with string labels such as "Title", which
        # are not keys of the id-based label_map; lower-case them instead.
        if isinstance(raw_type, str):
            return raw_type.lower()
        return raw_type

    @staticmethod
    def _read_rgb(image_path):
        """Read an image file and return it as a contiguous RGB array."""
        image = cv2.imread(image_path)
        if image is None:
            # cv2.imread signals failure by returning None rather than raising.
            raise FileNotFoundError(f"Cannot read image: {image_path}")
        # cvtColor returns a fresh contiguous array; a [..., ::-1] view has a
        # negative stride, which OpenCV drawing functions refuse to write to.
        return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

    def analyze_image(self, image_path: str) -> List[Dict]:
        """Detect the layout elements of one image.

        Args:
            image_path: Path of the image file.

        Returns:
            One dict per detected block with the keys ``"type"`` (lower-case
            type name), ``"bbox"`` (the block coordinates) and ``"score"``
            (detection confidence, 1.0 when the block carries none).

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        layout = self.model.detect(self._read_rgb(image_path))
        results = []
        for block in layout:
            score = getattr(block, "score", None)
            results.append({
                "type": self._normalize_type(block.type),
                "bbox": block.coordinates,
                "score": 1.0 if score is None else score,
            })
        return results

    def analyze_pdf(self, pdf_path: str, dpi: int = 200) -> List[Dict]:
        """Detect the layout elements of every page of a PDF.

        Each page is rendered to a PNG in a private temporary directory and
        analysed with ``analyze_image``, so the returned boxes refer to the
        rendered page image.

        Args:
            pdf_path: Path of the PDF file.
            dpi: Resolution used to render each page.

        Returns:
            One dict per page with the keys ``"page"`` (1-based page number)
            and ``"elements"`` (the ``analyze_image`` result for that page).
        """
        import os
        import tempfile

        all_results = []
        doc = fitz.open(pdf_path)
        try:
            # A unique temporary directory avoids name clashes between
            # concurrent runs and is removed even when analysis fails.
            with tempfile.TemporaryDirectory(prefix="layout_") as tmp_dir:
                for page_num, page in enumerate(doc, start=1):
                    pix = page.get_pixmap(dpi=dpi)
                    temp_path = os.path.join(tmp_dir, f"page_{page_num}.png")
                    with open(temp_path, "wb") as f:
                        f.write(pix.tobytes("png"))
                    all_results.append({
                        "page": page_num,
                        "elements": self.analyze_image(temp_path),
                    })
        finally:
            doc.close()
        return all_results

    def extract_by_type(self, results: List[Dict], element_type: str) -> List[Dict]:
        """Collect the elements of one type from per-page results.

        Args:
            results: Per-page results as returned by ``analyze_pdf``.
            element_type: Type name to select; string names are matched
                case-insensitively.

        Returns:
            One dict per matching element with the keys ``"page"`` and
            ``"bbox"``.
        """
        def fold(value):
            return value.lower() if isinstance(value, str) else value

        wanted = fold(element_type)
        extracted = []
        for page_result in results:
            for elem in page_result.get("elements", []):
                if fold(elem["type"]) == wanted:
                    extracted.append({
                        "page": page_result["page"],
                        "bbox": elem["bbox"],
                    })
        return extracted

    def visualize(self, image_path: str, output_path: str):
        """Draw the detected blocks on the image and save the figure.

        Args:
            image_path: Path of the image to analyse.
            output_path: Path the annotated figure is written to.

        Raises:
            FileNotFoundError: If the image cannot be read.
        """
        import matplotlib.pyplot as plt

        image = self._read_rgb(image_path)
        layout = self.model.detect(image)

        # Box colour per block type (RGB, the image is already RGB here)
        colors = {
            "Text": (255, 0, 0),
            "Title": (0, 255, 0),
            "List": (0, 0, 255),
            "Table": (255, 255, 0),
            "Figure": (255, 0, 255),
        }
        for block in layout:
            x1, y1, x2, y2 = (int(v) for v in block.coordinates)
            label = str(block.type)
            color = colors.get(label, (128, 128, 128))
            cv2.rectangle(image, (x1, y1), (x2, y2), color, 2)
            # Keep the label inside the image for boxes touching the top edge.
            cv2.putText(image, label, (x1, max(y1 - 5, 10)),
                        cv2.FONT_HERSHEY_SIMPLEX, 0.5, color, 2)

        plt.figure(figsize=(12, 16))
        plt.imshow(image)
        plt.axis('off')
        plt.savefig(output_path, bbox_inches='tight', dpi=150)
        plt.close()
def main():
    """Build the analyzer and report completion.

    The commented-out calls show the three available workflows; uncomment
    the one you need.
    """
    analyzer = DocumentLayoutAnalyzer()

    # Workflow 1: analyse a single image
    # results = analyzer.analyze_image("document.jpg")
    # print(json.dumps(results, indent=2))

    # Workflow 2: analyse every page of a PDF
    # results = analyzer.analyze_pdf("document.pdf")
    # print(json.dumps(results, indent=2))

    # Workflow 3: save an annotated visualisation
    # analyzer.visualize("document.jpg", "layout_visualization.jpg")

    print("布局分析完成")


if __name__ == "__main__":
    main()
高级:自定义布局模型¶
使用 Detectron2 训练自定义模型¶
# 1. 准备数据集(COCO 格式)
# {
# "images": [{"id": 1, "file_name": "img1.jpg", "height": 800, "width": 600}],
# "annotations": [
# {"id": 1, "image_id": 1, "category_id": 0, "bbox": [x, y, w, h], "area": w*h}
# ],
# "categories": [
# {"id": 0, "name": "title"},
# {"id": 1, "name": "paragraph"},
# {"id": 2, "name": "table"}
# ]
# }
# 2. 配置训练
from detectron2.config import get_cfg
from detectron2.engine import DefaultTrainer
cfg = get_cfg()
cfg.merge_from_file("detectron2/configs/COCO-Detection/faster_rcnn_X_101_32x8d_FPN_3x.yaml")
# The dataset keys live under DATASETS (plural). Detectron2's default config
# has no DATASET node, so assigning through cfg.DATASET fails.
# Both dataset names must be registered before training, e.g. with
# detectron2.data.datasets.register_coco_instances.
cfg.DATASETS.TRAIN = ("my_layout_train",)
cfg.DATASETS.TEST = ("my_layout_val",)
cfg.MODEL.ROI_HEADS.NUM_CLASSES = 3  # number of custom categories
# 3. Train
# trainer = DefaultTrainer(cfg)
# trainer.train()
常见问题¶
1. 检测不到小文本¶
# Tune the confidence threshold: a lower value keeps weaker detections
model = lp.Detectron2LayoutModel(
    "lp://PubLayNet/...",
    extra_config=[
        "MODEL.ROI_HEADS.SCORE_THRESH_TEST", 0.3,  # lowered threshold
    ],
)
2. 表格检测不准确¶
# Use a detector trained specifically for tables. The TableBank entries in
# the LayoutParser model zoo are Faster R-CNN R_50 / R_101 models; there is
# no X_101_32x8d variant, so that path cannot be resolved.
model = lp.Detectron2LayoutModel(
    "lp://TableBank/faster_rcnn_R_101_FPN_3x/config",
    label_map={0: "Table"},
)
3. 性能优化¶
下一步¶
下一章我们将学习表格识别与提取。