跳转至

第二章:文档处理(OCR 与多模态解析)

本章介绍如何使用 PaddleOCR 和多模态模型处理图片、扫描件等非文本文档。

PaddleOCR 简介

PaddleOCR 是百度开源的 OCR 工具,支持文本检测、方向识别和文本识别。

安装

pip install paddlepaddle paddleocr

注意:PaddlePaddle 支持 CPU 和 GPU。GPU 版本安装:

pip install paddlepaddle-gpu

基本 OCR 识别

简单图片文字识别

from paddleocr import PaddleOCR

# Initialize OCR (models are downloaded on first run).
# NOTE: the PaddleOCR language code for Chinese is 'ch' — 'chinese' is not
# a valid value and fails the lang check at construction time.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Recognize a single image (cls=True enables text-direction classification).
result = ocr.ocr('image.png', cls=True)

# ocr() returns one entry per page; the entry is None when no text was
# detected, so guard before iterating.
if result and result[0]:
    for line in result[0]:
        print(f"文字: {line[1][0]}, 置信度: {line[1][1]:.2f}")

批量识别多张图片

from paddleocr import PaddleOCR
from pathlib import Path

ocr = PaddleOCR(use_angle_cls=True, lang='chinese')

image_dir = Path('./images')
for image_path in image_dir.glob('*.png'):
    result = ocr.ocr(str(image_path), cls=True)
    print(f"文件: {image_path.name}")
    for line in result[0]:
        print(f"  - {line[1][0]}")

识别 PDF 文档

from paddleocr import PaddleOCR
from pdf2image import convert_from_path
import fitz  # PyMuPDF
import cv2
import numpy as np

# 'ch' is the correct PaddleOCR language code for Chinese.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Method 1: render each page with pdf2image, then OCR the rendered image.
pages = convert_from_path('document.pdf')
for i, page_image in enumerate(pages):
    page_path = f'page_{i+1}.png'
    page_image.save(page_path, 'PNG')
    result = ocr.ocr(page_path, cls=True)
    print(f"第 {i+1} 页: {result}")

# Method 2: extract images embedded in the PDF with PyMuPDF, then OCR each.
doc = fitz.open('document.pdf')
for page_num in range(len(doc)):
    page = doc[page_num]
    for img_index, img_info in enumerate(page.get_images()):
        xref = img_info[0]  # cross-reference id of the embedded image
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]

        # Decode the raw bytes into an OpenCV BGR array that ocr() accepts
        # (renamed from `img` to avoid shadowing the image-info tuple).
        nparr = np.frombuffer(image_bytes, np.uint8)
        decoded = cv2.imdecode(nparr, cv2.IMREAD_COLOR)

        result = ocr.ocr(decoded, cls=True)
        print(f"Page {page_num + 1}, Image {img_index + 1}: {result}")
# Release the document handle (the original leaked it).
doc.close()

表格识别

使用 PP-Structure

from paddleocr import PPStructure
import cv2  # was used but never imported in this snippet

# Layout + table analysis engine.
table_engine = PPStructure(show_log=True)

img = cv2.imread('table.png')
result = table_engine(img)

for item in result:
    if item['type'] == 'table':
        # For table regions item['res'] is a dict whose 'html' key holds
        # the reconstructed table markup (there is no 'table' key).
        print("表格内容:", item['res']['html'])
    elif item['type'] == 'text':
        # For text regions item['res'] is a list of line dicts, each with
        # a 'text' field — it cannot be indexed with a string key.
        print("文本:", "\n".join(line['text'] for line in item['res']))

表格转 Excel

from paddleocr import PPStructure
import cv2  # was used but never imported in this snippet
import pandas as pd
from io import StringIO

table_engine = PPStructure(show_log=True)

img = cv2.imread('table.png')
result = table_engine(img)

for item in result:
    if item['type'] == 'table':
        # The recognized markup lives under the 'html' key of the result
        # dict ('table_html' does not exist in PP-Structure output).
        table_html = item['res']['html']
        # pandas >= 2.1 deprecates passing literal HTML as a string —
        # wrap it in a file-like StringIO object.
        dfs = pd.read_html(StringIO(table_html))
        df = dfs[0]
        df.to_excel('output.xlsx', index=False)
        print("表格已保存到 output.xlsx")

多模态文档解析

使用 Qwen-VL 处理图片

import base64

from openai import OpenAI

client = OpenAI(
    api_key="your-api-key",
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)

def analyze_document_image(image_path: str, prompt: str = "请描述这张图片的内容") -> str:
    """Analyze a document image with Qwen-VL.

    Args:
        image_path: Path to a local image file (PNG assumed for the data URL).
        prompt: Instruction sent alongside the image.

    Returns:
        The model's text answer.
    """
    # BUG FIX: the remote API cannot read local file:// paths — the image
    # must be embedded in the request as a base64 data URL.
    with open(image_path, "rb") as f:
        b64 = base64.b64encode(f.read()).decode("utf-8")
    response = client.chat.completions.create(
        model="qwen-vl-plus",
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url",
                 "image_url": {"url": f"data:image/png;base64,{b64}"}},
                {"type": "text", "text": prompt}
            ]
        }]
    )
    return response.choices[0].message.content

# Example: transcribe all text in a document image, keeping layout.
result = analyze_document_image(
    "document.png",
    "请提取图片中的所有文字内容,并保持原有格式"
)
print(result)

使用 GPT-4V 处理文档

import base64

from openai import OpenAI

client = OpenAI(api_key="your-api-key")

def extract_document_content(image_path: str) -> str:
    """Extract all text from a document image with GPT-4o vision.

    Args:
        image_path: Path to a local image file (PNG assumed for the data URL).

    Returns:
        The model's transcription of the image.
    """
    # BUG FIX: image_url.url takes a URL string, not raw bytes — the
    # original passed img_file.read() directly, which the API rejects.
    # Encode the file as a base64 data URL instead.
    with open(image_path, "rb") as img_file:
        b64 = base64.b64encode(img_file.read()).decode("utf-8")
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "请提取图片中的所有文字内容"},
                {"type": "image_url",
                 "image_url": {"url": f"data:image/png;base64,{b64}"}}
            ]
        }]
    )
    return response.choices[0].message.content

通用文档解析流程

import os
from pathlib import Path
from paddleocr import PaddleOCR
from openai import OpenAI

class DocumentParser:
    """Generic document parser backed by PaddleOCR or a vision LLM.

    With use_vlm=False (default) text is extracted locally via PaddleOCR;
    with use_vlm=True images are transcribed by Qwen-VL over the
    DashScope OpenAI-compatible endpoint.
    """

    def __init__(self, use_vlm: bool = False):
        # 'ch' is the correct PaddleOCR language code for Chinese
        # ('chinese' is rejected at construction time).
        self.ocr = PaddleOCR(use_angle_cls=True, lang='ch')
        self.use_vlm = use_vlm
        if use_vlm:
            self.client = OpenAI(
                api_key=os.getenv("DASHSCOPE_API_KEY"),
                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
            )

    def parse_image(self, image_path: str) -> str:
        """Parse one image, dispatching to the configured backend."""
        if self.use_vlm:
            return self._parse_with_vlm(image_path)
        return self._parse_with_ocr(image_path)

    def _parse_with_ocr(self, image_path: str) -> str:
        """OCR the image with PaddleOCR; lines joined with newlines."""
        result = self.ocr.ocr(image_path, cls=True)
        # The page entry is None when no text is detected — return an
        # empty string instead of crashing on iteration.
        if not result or not result[0]:
            return ""
        return "\n".join(line[1][0] for line in result[0])

    def _parse_with_vlm(self, image_path: str) -> str:
        """Transcribe the image with Qwen-VL."""
        import base64

        # BUG FIX: the remote API cannot read local file:// paths — send
        # the image as a base64 data URL instead.
        with open(image_path, "rb") as f:
            b64 = base64.b64encode(f.read()).decode("utf-8")
        response = self.client.chat.completions.create(
            model="qwen-vl-plus",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image_url",
                     "image_url": {"url": f"data:image/png;base64,{b64}"}},
                    {"type": "text", "text": "请提取图片中的所有文字内容"}
                ]
            }]
        )
        return response.choices[0].message.content

    def parse_pdf(self, pdf_path: str) -> list[str]:
        """Render each PDF page to an image and parse it; one string per page."""
        import tempfile
        from pdf2image import convert_from_path

        images = convert_from_path(pdf_path)
        results = []

        # A managed temp dir is portable (hard-coded /tmp breaks on
        # Windows) and cleans up the rendered pages automatically.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for i, image in enumerate(images):
                temp_path = os.path.join(tmp_dir, f"page_{i+1}.png")
                image.save(temp_path, 'PNG')
                results.append(self.parse_image(temp_path))

        return results

# Example: OCR-parse every page of a PDF and preview the first 200 chars.
doc_parser = DocumentParser(use_vlm=False)
page_texts = doc_parser.parse_pdf("document.pdf")
for page_no, page_text in enumerate(page_texts):
    print(f"Page {page_no+1}: {page_text[:200]}...")

文档类型自动识别

import fitz  # PyMuPDF
from pathlib import Path

def detect_document_type(file_path: str) -> str:
    """Classify a document by its file extension.

    Args:
        file_path: Path to the document.

    Returns:
        One of 'searchable_pdf', 'scanned_pdf', 'image', 'word', 'text'
        or 'unknown'.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == '.pdf':
        # PDFs need a content check to tell scans from searchable files.
        return detect_pdf_type(file_path)
    # Generalized: also accept the single-f '.tif' spelling and '.bmp'.
    if suffix in {'.png', '.jpg', '.jpeg', '.tiff', '.tif', '.bmp'}:
        return 'image'
    if suffix in {'.docx', '.doc'}:
        return 'word'
    if suffix == '.txt':
        return 'text'
    return 'unknown'

def detect_pdf_type(pdf_path: str) -> str:
    """Tell whether a PDF is searchable or a scan that needs OCR.

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        'searchable_pdf' if any page has extractable text, otherwise
        'scanned_pdf'.
    """
    doc = fitz.open(pdf_path)
    try:
        for page in doc:
            # Any extractable text means the PDF is already searchable.
            if page.get_text().strip():
                return 'searchable_pdf'
        return 'scanned_pdf'  # no text layer anywhere — needs OCR
    finally:
        # The original leaked the document handle; always close it.
        doc.close()

# Example: only route the file through OCR when it is a scanned PDF.
doc_type = detect_document_type("document.pdf")
print(f"文档类型: {doc_type}")

if doc_type == 'scanned_pdf':
    scan_parser = DocumentParser()
    scan_texts = scan_parser.parse_pdf("document.pdf")

集成 LangChain

自定义文档加载器

from langchain_core.documents import Document
from langchain_community.document_loaders import BaseLoader

class PaddleOCRLoader(BaseLoader):
    """LangChain document loader that extracts text via PaddleOCR."""

    def __init__(self, file_path: str, use_vlm: bool = False):
        # use_vlm is accepted for interface compatibility; this loader
        # currently always uses PaddleOCR.
        self.file_path = file_path
        self.use_vlm = use_vlm

    def lazy_load(self):
        """Yield one Document per PDF page, or a single Document for an image."""
        import tempfile
        from paddleocr import PaddleOCR
        from pdf2image import convert_from_path
        from pathlib import Path

        # 'ch' is the correct PaddleOCR language code for Chinese.
        ocr = PaddleOCR(use_angle_cls=True, lang='ch')
        path = Path(self.file_path)

        def _join_lines(ocr_result):
            # ocr() returns [page]; the page entry is None when nothing
            # is recognized — yield an empty page rather than crash.
            if not ocr_result or not ocr_result[0]:
                return ""
            return "\n".join(line[1][0] for line in ocr_result[0])

        if path.suffix.lower() == '.pdf':
            images = convert_from_path(self.file_path)
            # Managed temp dir: portable (hard-coded /tmp breaks on
            # Windows) and cleaned up automatically.
            with tempfile.TemporaryDirectory() as tmp_dir:
                for i, image in enumerate(images):
                    temp_path = str(Path(tmp_dir) / f"ocr_page_{i+1}.png")
                    image.save(temp_path, 'PNG')
                    yield Document(
                        page_content=_join_lines(ocr.ocr(temp_path, cls=True)),
                        metadata={"page": i + 1, "source": self.file_path}
                    )
        else:
            yield Document(
                page_content=_join_lines(ocr.ocr(self.file_path, cls=True)),
                metadata={"source": self.file_path}
            )

# 使用 LangChain 加载
# Load the PDF through the custom OCR loader (load() drains lazy_load()).
ocr_loader = PaddleOCRLoader("document.pdf")
documents = list(ocr_loader.load())

完整 RAG 流程

from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Step 1 — OCR the scanned PDF into LangChain documents.
loader = PaddleOCRLoader("scanned_document.pdf")
documents = list(loader.load())

# Step 2 — split the pages into overlapping chunks for retrieval.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
chunks = text_splitter.split_documents(documents)

# Step 3 — embed the chunks and index them in a Chroma vector store.
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(chunks, embeddings)

# Step 4 — run a similarity search against the index.
query = "什么是机器学习"
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

性能优化

GPU 加速

# Run OCR on the GPU.
# NOTE(review): use_gpu/gpu_mem are the PaddleOCR 2.x flags; newer 3.x
# releases replaced them with a `device` argument — confirm against the
# installed version.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='ch',      # 'ch' (not 'chinese') is the Chinese language code
    use_gpu=True,
    gpu_mem=5000    # GPU memory budget in MB (~5 GB)
)

批处理优化

from concurrent.futures import ThreadPoolExecutor
from paddleocr import PaddleOCR

# 'ch' is the correct PaddleOCR language code for Chinese.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

def process_image(image_path):
    """OCR one image and return its recognized text lines."""
    # NOTE(review): a single PaddleOCR instance is shared by all worker
    # threads here and PaddleOCR predictors are not documented as
    # thread-safe — use one instance per worker if output looks corrupted.
    result = ocr.ocr(image_path, cls=True)
    # Page entry is None when no text is detected.
    if not result or not result[0]:
        return []
    return [line[1][0] for line in result[0]]

# Fan the images out over a small thread pool (I/O + native inference
# release the GIL, so threads overlap usefully).
image_paths = ['img1.png', 'img2.png', 'img3.png']
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_image, image_paths))

模型缓存

import os

# Redirect PaddleOCR's model download/cache location so large model files
# are stored under a shared path instead of the per-user default.
# NOTE(review): these environment variable names are not shown to be part
# of a documented PaddleOCR API anywhere in this file — PaddleOCR's default
# cache is ~/.paddleocr; verify the installed version actually honors them.
os.environ['PADDLEOCR_DIR'] = '/path/to/models'
os.environ['XDG_CACHE_HOME'] = '/path/to/cache'

小结

本章学习了:

  • ✅ PaddleOCR 安装与基本使用
  • ✅ 图片文字识别
  • ✅ PDF 文档 OCR 识别
  • ✅ 表格识别 (PP-Structure)
  • ✅ 多模态模型文档解析 (Qwen-VL, GPT-4V)
  • ✅ 文档类型自动识别
  • ✅ 集成 LangChain
  • ✅ 性能优化

下一章

第三章:向量化与索引 - 学习向量化存储。