第二章:查询扩展¶
什么是查询扩展¶
查询扩展通过添加相关词汇来丰富原始查询,扩大检索范围,提高召回率。
扩展方法¶
1. 同义词扩展¶
from typing import List
# Synonym dictionary: maps a canonical term to its known alternative phrasings.
# Insertion order matters — expansion output follows it.
SYNONYMS = {
    "手机": ["移动电话", "电话", "iPhone", "安卓手机", "智能手机"],
    "电脑": ["计算机", "PC", "笔记本", "台式机"],
    "机器学习": ["ML", "深度学习", "神经网络", "AI", "人工智能"],
}
def expand_synonyms(query: str, synonyms=None) -> List[str]:
    """Expand a query by substituting known synonyms.

    Args:
        query: The original query string.
        synonyms: Optional mapping of term -> list of alternatives. Falls
            back to the module-level ``SYNONYMS`` dictionary when omitted,
            so existing callers keep their behavior.

    Returns:
        A list whose first element is the original query, followed by one
        variant per synonym of every dictionary term found in the query.
    """
    if synonyms is None:
        synonyms = SYNONYMS
    expanded = [query]
    for term, alternatives in synonyms.items():
        # Substring match, not token match: Chinese queries are not
        # whitespace-delimited.
        if term in query:
            expanded.extend(query.replace(term, alt) for alt in alternatives)
    return expanded
# Example: the query contains the dictionary term "机器学习", so one variant
# is produced per synonym of that term.
query = "机器学习教程"
print(expand_synonyms(query))
# ['机器学习教程', 'ML教程', '深度学习教程', ...]
2. 语义扩展¶
使用词向量找到语义相关的词:
from sentence_transformers import SentenceTransformer
import numpy as np
model = SentenceTransformer('all-MiniLM-L6-v2')
def semantic_expansion(query: str, corpus: List[str], top_k: int = 5, encoder=None) -> List[str]:
    """Return the ``top_k`` corpus terms most semantically similar to ``query``.

    Args:
        query: Query string to expand.
        corpus: Candidate expansion terms.
        top_k: Number of terms to return (capped at ``len(corpus)``).
        encoder: Object exposing an ``encode`` method mapping a string (or a
            list of strings) to embedding vector(s). Defaults to the
            module-level sentence-transformer ``model``; injectable for tests.

    Returns:
        Corpus terms sorted by descending cosine similarity to the query.
    """
    if encoder is None:
        encoder = model
    query_emb = np.asarray(encoder.encode(query), dtype=float)
    corpus_emb = np.asarray(encoder.encode(corpus), dtype=float)
    # Cosine similarity rather than a raw dot product: sentence-transformers
    # does not L2-normalize embeddings by default, and un-normalized dot
    # products are biased toward longer vectors.
    denom = np.linalg.norm(corpus_emb, axis=1) * np.linalg.norm(query_emb)
    scores = corpus_emb @ query_emb / np.where(denom == 0.0, 1.0, denom)
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [corpus[i] for i in top_indices]
# Example: rank candidate terms by similarity to the query.
corpus = ["深度学习", "神经网络", "自然语言处理", "计算机视觉", "数据科学"]
expanded = semantic_expansion("机器学习", corpus)
print(expanded) # ['深度学习', '神经网络', '数据科学', ...]
3. 上下文扩展¶
利用 LLM 生成相关查询:
from openai import OpenAI
client = OpenAI()
def llm_expansion(query: str, n: int = 3) -> List[str]:
    """Generate up to ``n`` semantically related queries with an LLM.

    Args:
        query: The original query string.
        n: Maximum number of expanded queries to return.

    Returns:
        Up to ``n`` non-empty expansion queries. Leading list numbering or
        bullets that the model tends to prepend ("1.", "-", "*") are
        stripped, and the result is capped at ``n`` even if the model
        over-generates.
    """
    import re  # local import keeps this tutorial snippet self-contained

    prompt = f"""请为以下查询生成 {n} 个语义相关的扩展查询:
原始查询:{query}
要求:
1. 保持原始意图
2. 使用不同的表达方式
3. 添加相关概念
扩展查询:"""
    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}],
    )
    # message.content is Optional — guard against a None payload.
    content = response.choices[0].message.content or ""
    queries = []
    for line in content.splitlines():
        # Drop "1."-style numbering or "-"/"*" bullets before the text.
        cleaned = re.sub(r"^\s*(?:\d+[.、)]|[-*])\s*", "", line).strip()
        if cleaned:
            queries.append(cleaned)
    return queries[:n]
# Example (requires a configured OpenAI API key)
expanded = llm_expansion("Python 数据分析")
print(expanded)
# ['Python pandas 数据处理', 'Python 数据可视化教程', ...]
4. 伪相关反馈(PRF)¶
假设检索结果中的高频词与查询相关:
from collections import Counter
import jieba
def prf_expansion(query: str, documents: List[str], top_k: int = 3, num_docs: int = 10, tokenizer=None) -> str:
    """Pseudo-relevance feedback: expand the query with frequent terms from
    the top-ranked documents, which are assumed to be relevant.

    Args:
        query: Original query string.
        documents: Retrieved documents, best-ranked first.
        top_k: Maximum number of expansion terms to append.
        num_docs: How many leading documents to mine for terms.
        tokenizer: Callable mapping a document to a list of tokens; defaults
            to ``jieba.lcut`` for Chinese text. Injectable for other
            languages and for testing.

    Returns:
        The query with up to ``top_k`` space-separated expansion terms
        appended, or the query unchanged when no term qualifies.
    """
    tokenize = tokenizer if tokenizer is not None else jieba.lcut
    stop_words = {'的', '是', '在', '了', '和', '与'}

    # Term frequencies over the presumed-relevant document prefix.
    word_freq = Counter()
    for doc in documents[:num_docs]:
        word_freq.update(tokenize(doc))

    expansion_terms = []
    for word, _freq in word_freq.most_common(20):
        if word in stop_words or len(word) <= 1:
            continue
        if word in query:
            continue  # the query already contains this term; adding it again is useless
        expansion_terms.append(word)
        if len(expansion_terms) >= top_k:
            break

    if not expansion_terms:
        # Original code returned `query + ' '` here; return it clean instead.
        return query
    return query + ' ' + ' '.join(expansion_terms)
# Example (the trailing literal `...` is an Ellipsis placeholder standing in
# for additional documents)
docs = ["Python 是一种编程语言...", "数据分析使用 Python...", ...]
expanded = prf_expansion("Python 教程", docs)
print(expanded) # "Python 教程 编程 数据 分析"
扩展策略选择¶
┌─────────────────────────────────────────┐
│ 查询扩展策略选择 │
├─────────────────────────────────────────┤
│ │
│ 短查询 ──→ 同义词扩展 + 语义扩展 │
│ │
│ 长查询 ──→ 关键词提取 + PRF │
│ │
│ 专业领域 ──→ 领域词典 + LLM 扩展 │
│ │
│ 通用领域 ──→ 同义词 + 语义扩展 │
│ │
└─────────────────────────────────────────┘
实现示例¶
class QueryExpander:
    """Query expander combining synonym and semantic strategies."""

    def __init__(self):
        # Synonym table and embedding model shared across expansions.
        self.synonyms = SYNONYMS
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def expand(self, query: str, method: str = "hybrid") -> str:
        """Expand ``query`` with the chosen strategy.

        Args:
            query: Original query string.
            method: "synonym", "semantic", or "hybrid" (synonym expansion
                followed by semantic expansion). Any other value returns the
                query unchanged.

        Returns:
            The expanded query string.
        """
        if method == "synonym":
            return self._synonym_expand(query)
        if method == "semantic":
            return self._semantic_expand(query)
        if method == "hybrid":
            return self._semantic_expand(self._synonym_expand(query))
        return query

    def _synonym_expand(self, query: str) -> str:
        """Append up to two synonyms for every dictionary term found in the
        query.

        Matching is by substring, consistent with ``expand_synonyms``: the
        previous ``query.split()`` approach only matched whitespace-separated
        tokens, so Chinese queries such as "机器学习入门" (no spaces) never
        matched any dictionary key and were returned unexpanded.
        """
        parts = [query]
        for term, syns in self.synonyms.items():
            if term in query:
                parts.extend(syns[:2])
        return ' '.join(parts)

    def _semantic_expand(self, query: str) -> str:
        """Semantic-expansion placeholder — returns the query unchanged.

        TODO: embed ``query`` with ``self.model`` and append nearest terms.
        """
        return query
# Usage (instantiation downloads/loads the sentence-transformer model)
expander = QueryExpander()
expanded = expander.expand("机器学习入门", method="hybrid")
print(expanded)
小结¶
查询扩展能有效提高检索召回率。下一章学习查询分解技术。