第二章:文档处理(OCR 与多模态解析)¶
本章介绍如何使用 PaddleOCR 和多模态模型处理图片、扫描件等非文本文档。
PaddleOCR 简介¶
PaddleOCR 是百度开源的 OCR 工具,支持文本检测、方向识别和文本识别。
安装¶
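先安装 PaddlePaddle 与 PaddleOCR:`pip install paddlepaddle paddleocr`。
注意:PaddlePaddle 支持 CPU 和 GPU。GPU 版本安装:`pip install paddlepaddle-gpu`(需选择与本机 CUDA 版本匹配的安装包)。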
基本 OCR 识别¶
简单图片文字识别¶
from paddleocr import PaddleOCR

# Initialise the OCR engine (the models are downloaded on the first run).
# PaddleOCR's language code for Simplified Chinese is 'ch'; 'chinese' is not
# an accepted value.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Run detection, angle classification and recognition on one image.
result = ocr.ocr('image.png', cls=True)

# result[0] holds the lines found in the image; it can be None when nothing
# was detected, so fall back to an empty list before iterating.
# Each line is indexed as line[1][0] (text) and line[1][1] (confidence).
for line in result[0] or []:
    print(f"文字: {line[1][0]}, 置信度: {line[1][1]:.2f}")
批量识别多张图片¶
from pathlib import Path

from paddleocr import PaddleOCR

# 'ch' is PaddleOCR's language code for Simplified Chinese.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# OCR every PNG file found directly inside ./images.
image_dir = Path('./images')
for image_path in image_dir.glob('*.png'):
    result = ocr.ocr(str(image_path), cls=True)
    print(f"文件: {image_path.name}")
    # result[0] can be None for an image without any detected text.
    for line in result[0] or []:
        print(f" - {line[1][0]}")
识别 PDF 文档¶
import cv2
import fitz  # PyMuPDF
import numpy as np
from paddleocr import PaddleOCR
from pdf2image import convert_from_path

# 'ch' is PaddleOCR's language code for Simplified Chinese.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')

# Method 1: rasterise each page with pdf2image, save it as PNG, then OCR it.
pages = convert_from_path('document.pdf')
for page_no, page_image in enumerate(pages, start=1):
    page_file = f'page_{page_no}.png'
    page_image.save(page_file, 'PNG')
    result = ocr.ocr(page_file, cls=True)
    print(f"第 {page_no} 页: {result}")

# Method 2: extract the images embedded in the PDF with PyMuPDF and OCR them.
# The context manager closes the document even if OCR raises.
with fitz.open('document.pdf') as doc:
    for page_no, page in enumerate(doc, start=1):
        for img_index, img_info in enumerate(page.get_images(), start=1):
            xref = img_info[0]
            image_bytes = doc.extract_image(xref)["image"]

            # Decode the raw image bytes into an array that PaddleOCR accepts.
            buffer = np.frombuffer(image_bytes, np.uint8)
            decoded = cv2.imdecode(buffer, cv2.IMREAD_COLOR)
            if decoded is None:
                # OpenCV could not decode this embedded image; skip it.
                continue

            result = ocr.ocr(decoded, cls=True)
            print(f"Page {page_no}, Image {img_index}: {result}")
表格识别¶
使用 PP-Structure¶
import cv2
from paddleocr import PPStructure

# Layout analysis + table recognition engine.
table_engine = PPStructure(show_log=True)

img = cv2.imread('table.png')
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError('table.png')
result = table_engine(img)

for item in result:
    if item['type'] == 'table':
        # For table regions, PP-Structure stores the table as an HTML string
        # under res['html'].
        print("表格内容:", item['res']['html'])
    elif item['type'] == 'text':
        # For text regions, res is a list of recognised lines, each a dict
        # with a 'text' field.
        print("文本:", "\n".join(region['text'] for region in item['res']))
表格转 Excel¶
from io import StringIO

import cv2
import pandas as pd
from paddleocr import PPStructure

table_engine = PPStructure(show_log=True)

img = cv2.imread('table.png')
if img is None:
    # cv2.imread returns None instead of raising when the file cannot be read.
    raise FileNotFoundError('table.png')
result = table_engine(img)

# Collect every recognised table first so that several tables in one image
# do not overwrite each other in the output file.
tables = []
for item in result:
    if item['type'] == 'table':
        # PP-Structure stores the table as an HTML string under res['html'].
        table_html = item['res']['html']
        # Wrap the markup in StringIO: passing a literal HTML string to
        # read_html is deprecated in recent pandas releases.
        tables.append(pd.read_html(StringIO(table_html))[0])

if tables:
    # One sheet per table; a single table still lands on "Sheet1".
    with pd.ExcelWriter('output.xlsx') as writer:
        for sheet_no, df in enumerate(tables, start=1):
            df.to_excel(writer, sheet_name=f'Sheet{sheet_no}', index=False)
    print("表格已保存到 output.xlsx")
多模态文档解析¶
使用 Qwen-VL 处理图片¶
import base64
import mimetypes
import os

from openai import OpenAI

# DashScope's OpenAI-compatible endpoint is used through the stock OpenAI client.
# The key is taken from DASHSCOPE_API_KEY when set; the placeholder is kept as
# a fallback so the snippet still reads the same without the variable.
client = OpenAI(
    api_key=os.getenv("DASHSCOPE_API_KEY", "your-api-key"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
)


def analyze_document_image(image_path: str, prompt: str = "请描述这张图片的内容") -> str:
    """Send a local image together with a text prompt to Qwen-VL.

    Args:
        image_path: Path of the image file on the local machine.
        prompt: Instruction sent alongside the image.

    Returns:
        The text content of the model's first reply choice.

    Raises:
        OSError: If ``image_path`` cannot be opened.
    """
    # The endpoint is remote and cannot open a file:// path on this machine,
    # so the image is inlined as a base64 data URL instead.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
    with open(image_path, "rb") as img_file:
        encoded = base64.b64encode(img_file.read()).decode("ascii")

    response = client.chat.completions.create(
        model="qwen-vl-plus",
        messages=[{
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded}"}},
                {"type": "text", "text": prompt}
            ]
        }]
    )
    return response.choices[0].message.content


# Example: ask the model to transcribe a document image.
result = analyze_document_image(
    "document.png",
    "请提取图片中的所有文字内容,并保持原有格式"
)
print(result)
使用 GPT-4o(GPT-4V 的后继视觉模型)处理文档¶
import base64
import mimetypes
import os

from openai import OpenAI

# The key is taken from OPENAI_API_KEY when set; the placeholder is kept as a
# fallback so the snippet still reads the same without the variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY", "your-api-key"))


def extract_document_content(image_path: str) -> str:
    """Extract the text of a document image with the ``gpt-4o`` vision model.

    Args:
        image_path: Path of the image file on the local machine.

    Returns:
        The text content of the model's first reply choice.

    Raises:
        OSError: If ``image_path`` cannot be opened.
    """
    # The image_url field must be a string URL. Raw bytes are neither a URL
    # nor JSON-serialisable, so the file is inlined as a base64 data URL.
    mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
    with open(image_path, "rb") as img_file:
        encoded = base64.b64encode(img_file.read()).decode("ascii")

    # The file is already closed here; the request does not need the handle.
    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[{
            "role": "user",
            "content": [
                {"type": "text", "text": "请提取图片中的所有文字内容"},
                {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded}"}}
            ]
        }]
    )
    return response.choices[0].message.content
通用文档解析流程¶
import os
from pathlib import Path
from paddleocr import PaddleOCR
from openai import OpenAI
class DocumentParser:
    """Turn images and PDF files into plain text.

    Text is extracted with PaddleOCR by default. When ``use_vlm`` is true the
    ``qwen-vl-plus`` model is called through DashScope's OpenAI-compatible
    endpoint instead.
    """

    def __init__(self, use_vlm: bool = False):
        """Create the OCR engine and, if requested, the model client.

        Args:
            use_vlm: Route ``parse_image`` to the multimodal model instead of
                PaddleOCR. The API key is read from ``DASHSCOPE_API_KEY``.
        """
        # 'ch' is PaddleOCR's language code for Simplified Chinese.
        self.ocr = PaddleOCR(use_angle_cls=True, lang='ch')
        self.use_vlm = use_vlm
        if use_vlm:
            self.client = OpenAI(
                api_key=os.getenv("DASHSCOPE_API_KEY"),
                base_url="https://dashscope.aliyuncs.com/compatible-mode/v1"
            )

    def parse_image(self, image_path: str) -> str:
        """Return the text of one image using the configured backend."""
        if self.use_vlm:
            return self._parse_with_vlm(image_path)
        return self._parse_with_ocr(image_path)

    def _parse_with_ocr(self, image_path: str) -> str:
        """Return the recognised lines of ``image_path`` joined by newlines.

        Returns an empty string when no text is detected.
        """
        result = self.ocr.ocr(image_path, cls=True)
        # The per-image entry can be None when no text was detected.
        lines = (result[0] if result else None) or []
        return "\n".join(line[1][0] for line in lines)

    def _parse_with_vlm(self, image_path: str) -> str:
        """Return the text the multimodal model extracts from ``image_path``."""
        import base64
        import mimetypes

        # The endpoint is remote and cannot open a file:// path on this
        # machine, so the image is inlined as a base64 data URL.
        mime_type = mimetypes.guess_type(image_path)[0] or "image/png"
        with open(image_path, "rb") as img_file:
            encoded = base64.b64encode(img_file.read()).decode("ascii")

        response = self.client.chat.completions.create(
            model="qwen-vl-plus",
            messages=[{
                "role": "user",
                "content": [
                    {"type": "image_url", "image_url": {"url": f"data:{mime_type};base64,{encoded}"}},
                    {"type": "text", "text": "请提取图片中的所有文字内容"}
                ]
            }]
        )
        return response.choices[0].message.content

    def parse_pdf(self, pdf_path: str) -> list[str]:
        """Return the text of every page of ``pdf_path``, one entry per page.

        Each page is rasterised with pdf2image, written to a temporary PNG
        and passed to ``parse_image``.
        """
        import tempfile

        from pdf2image import convert_from_path

        results = []
        # A private temporary directory avoids name clashes between
        # concurrent runs and is removed automatically afterwards.
        with tempfile.TemporaryDirectory() as tmp_dir:
            for page_no, image in enumerate(convert_from_path(pdf_path), start=1):
                temp_path = os.path.join(tmp_dir, f"page_{page_no}.png")
                image.save(temp_path, 'PNG')
                results.append(self.parse_image(temp_path))
        return results
# Usage example: OCR-only parsing of a PDF, then print the first 200
# characters of each page.
parser = DocumentParser(use_vlm=False)
texts = parser.parse_pdf("document.pdf")

for i, text in enumerate(texts):
    print(f"Page {i+1}: {text[:200]}...")
文档类型自动识别¶
import fitz # PyMuPDF
from pathlib import Path
def detect_document_type(file_path: str) -> str:
    """Classify a document by its file extension (case-insensitive).

    Args:
        file_path: Path or file name of the document.

    Returns:
        The result of ``detect_pdf_type`` for ``.pdf`` files, otherwise one of
        ``'image'``, ``'word'``, ``'text'`` or ``'unknown'``.
    """
    suffix = Path(file_path).suffix.lower()

    if suffix == '.pdf':
        # PDFs need a content check to tell scans from searchable files.
        return detect_pdf_type(file_path)
    # '.tif' is the short spelling of '.tiff'; '.bmp' is a further raster
    # format, so both are treated as images too.
    if suffix in ('.png', '.jpg', '.jpeg', '.tif', '.tiff', '.bmp'):
        return 'image'
    if suffix in ('.docx', '.doc'):
        return 'word'
    if suffix == '.txt':
        return 'text'
    return 'unknown'
def detect_pdf_type(pdf_path: str) -> str:
    """Tell whether a PDF has extractable text or is a scan.

    Args:
        pdf_path: Path of the PDF file.

    Returns:
        ``'searchable_pdf'`` if any page yields non-whitespace text,
        otherwise ``'scanned_pdf'`` (the file needs OCR).
    """
    # The context manager closes the document on every exit path.
    with fitz.open(pdf_path) as doc:
        # any() stops at the first page that contains text.
        has_text = any(page.get_text().strip() for page in doc)
    return 'searchable_pdf' if has_text else 'scanned_pdf'
# Usage example: classify the file, then run OCR only when it is a scan.
doc_type = detect_document_type("document.pdf")
print(f"文档类型: {doc_type}")

if doc_type == 'scanned_pdf':
    # Scanned PDFs have no text layer, so each page goes through OCR.
    parser = DocumentParser()
    texts = parser.parse_pdf("document.pdf")
集成 LangChain¶
自定义文档加载器¶
from langchain_core.documents import Document
from langchain_community.document_loaders import BaseLoader
class PaddleOCRLoader(BaseLoader):
    """LangChain document loader that reads images and PDFs with PaddleOCR."""

    def __init__(self, file_path: str, use_vlm: bool = False):
        """Remember the file to load.

        Args:
            file_path: Path of an image or PDF file.
            use_vlm: Stored on the instance; ``lazy_load`` does not read it.
        """
        self.file_path = file_path
        self.use_vlm = use_vlm

    def lazy_load(self):
        """Yield one ``Document`` per PDF page, or one for a single image.

        PDF pages carry ``page`` (1-based) and ``source`` metadata; a single
        image carries ``source`` only.
        """
        import os
        import tempfile
        from pathlib import Path

        from paddleocr import PaddleOCR
        from pdf2image import convert_from_path

        # 'ch' is PaddleOCR's language code for Simplified Chinese.
        ocr = PaddleOCR(use_angle_cls=True, lang='ch')

        def ocr_text(image_path: str) -> str:
            # The per-image entry can be None when no text was detected.
            result = ocr.ocr(image_path, cls=True)
            lines = (result[0] if result else None) or []
            return "\n".join(line[1][0] for line in lines)

        if Path(self.file_path).suffix.lower() == '.pdf':
            # A private temporary directory avoids name clashes between
            # concurrent loaders and is removed automatically afterwards.
            with tempfile.TemporaryDirectory() as tmp_dir:
                pages = convert_from_path(self.file_path)
                for page_no, image in enumerate(pages, start=1):
                    temp_path = os.path.join(tmp_dir, f"ocr_page_{page_no}.png")
                    image.save(temp_path, 'PNG')
                    yield Document(
                        page_content=ocr_text(temp_path),
                        metadata={"page": page_no, "source": self.file_path}
                    )
        else:
            yield Document(
                page_content=ocr_text(self.file_path),
                metadata={"source": self.file_path}
            )
# Load the PDF through the custom loader and keep the documents in a list.
loader = PaddleOCRLoader("document.pdf")
loaded = loader.load()
documents = list(loaded)
完整 RAG 流程¶
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# 1. Load the scanned PDF through the OCR loader.
loader = PaddleOCRLoader("scanned_document.pdf")
documents = list(loader.load())

# 2. Split the pages into overlapping chunks.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=50
)
chunks = splitter.split_documents(documents)

if not chunks:
    # Nothing was recognised, so there is nothing to embed or search.
    print("未识别到任何文本,无法建立索引")
else:
    # 3. Embed the chunks and build the vector store.
    embeddings = OpenAIEmbeddings()
    vectorstore = Chroma.from_documents(chunks, embeddings)

    # 4. Retrieve the chunks most similar to the query.
    query = "什么是机器学习"
    docs = vectorstore.similarity_search(query)
    # Guard the index: an empty result list would raise IndexError.
    if docs:
        print(docs[0].page_content)
    else:
        print("未检索到相关内容")
性能优化¶
GPU 加速¶
from paddleocr import PaddleOCR

# Run OCR on the GPU.
ocr = PaddleOCR(
    use_angle_cls=True,
    lang='ch',  # PaddleOCR's language code for Simplified Chinese
    use_gpu=True,
    gpu_mem=5000  # the original tutorial describes this as allocating 5 GB of GPU memory
)
批处理优化¶
from concurrent.futures import ThreadPoolExecutor

from paddleocr import PaddleOCR

# 'ch' is PaddleOCR's language code for Simplified Chinese.
# NOTE(review): this single engine is shared by all worker threads below;
# confirm that the installed PaddleOCR/Paddle build supports concurrent calls
# on one instance, otherwise create one engine per thread.
ocr = PaddleOCR(use_angle_cls=True, lang='ch')


def process_image(image_path):
    """Return the list of text lines recognised in ``image_path``.

    Returns an empty list when no text is detected.
    """
    result = ocr.ocr(image_path, cls=True)
    # The per-image entry can be None when no text was detected.
    lines = (result[0] if result else None) or []
    return [line[1][0] for line in lines]


# Fan the images out over a small thread pool; executor.map keeps the
# results in the same order as image_paths.
image_paths = ['img1.png', 'img2.png', 'img3.png']
with ThreadPoolExecutor(max_workers=4) as executor:
    results = list(executor.map(process_image, image_paths))
模型缓存¶
import os

# Point the model and cache locations at custom directories.
# NOTE(review): confirm that the installed PaddleOCR release actually reads
# these variable names before relying on them.
os.environ.update({
    'PADDLEOCR_DIR': '/path/to/models',
    'XDG_CACHE_HOME': '/path/to/cache',
})
小结¶
本章学习了:
- ✅ PaddleOCR 安装与基本使用
- ✅ 图片文字识别
- ✅ PDF 文档 OCR 识别
- ✅ 表格识别 (PP-Structure)
- ✅ 多模态模型文档解析 (Qwen-VL, GPT-4V)
- ✅ 文档类型自动识别
- ✅ 集成 LangChain
- ✅ 性能优化
下一章¶
第三章:向量化与索引 - 学习向量化存储。