跳转至

第二章:查询扩展

什么是查询扩展

查询扩展通过添加相关词汇来丰富原始查询,扩大检索范围,提高召回率。

扩展方法

1. 同义词扩展

from typing import List

# Synonym dictionary: each key maps to the alternative terms substituted for
# it during expansion. Entries include related terms, not only strict synonyms.
SYNONYMS = {
    "手机": ["移动电话", "电话", "iPhone", "安卓手机", "智能手机"],
    "电脑": ["计算机", "PC", "笔记本", "台式机"],
    "机器学习": ["ML", "深度学习", "神经网络", "AI", "人工智能"]
}

def expand_synonyms(query: str) -> List[str]:
    """Return the query followed by one variant per applicable synonym.

    Every key of ``SYNONYMS`` that occurs in ``query`` as a substring yields
    one variant per synonym, with all occurrences of the key replaced by that
    synonym. The unmodified query is always the first element.
    """
    variants = [
        query.replace(term, alternative)
        for term, alternatives in SYNONYMS.items()
        if term in query
        for alternative in alternatives
    ]
    return [query, *variants]

# Example
query = "机器学习教程"
print(expand_synonyms(query))
# Prints: ['机器学习教程', 'ML教程', '深度学习教程', ...]

2. 语义扩展

使用句向量(sentence embeddings)计算查询与候选词的相似度,找出语义最相关的词:

from sentence_transformers import SentenceTransformer
import numpy as np

# Load the embedding model once at module level; semantic_expansion reuses it.
model = SentenceTransformer('all-MiniLM-L6-v2')

def semantic_expansion(query: str, corpus: List[str], top_k: int = 5) -> List[str]:
    """Return the ``top_k`` corpus entries that score highest against ``query``.

    Args:
        query: Text to expand.
        corpus: Candidate terms or phrases to rank against the query.
        top_k: Maximum number of candidates to return.

    Returns:
        Up to ``top_k`` entries of ``corpus``, ordered from the highest to the
        lowest score. Empty when ``corpus`` is empty or ``top_k <= 0``.
    """
    # Guard first: a ``[-0:]`` slice would select the whole array (returning
    # every candidate for top_k == 0), and an empty corpus has nothing to rank.
    if not corpus or top_k <= 0:
        return []

    # Encode the query and the candidates with the module-level model.
    query_emb = model.encode(query)
    corpus_emb = model.encode(corpus)

    # One dot-product score per corpus entry.
    # NOTE(review): this equals cosine similarity only if the model emits
    # unit-length embeddings -- confirm for the configured model.
    scores = np.dot(corpus_emb, query_emb)

    # argsort is ascending, so take the last top_k indices and reverse them.
    top_indices = np.argsort(scores)[-top_k:][::-1]
    return [corpus[i] for i in top_indices]

# Example
corpus = ["深度学习", "神经网络", "自然语言处理", "计算机视觉", "数据科学"]
expanded = semantic_expansion("机器学习", corpus)
# Illustrative output; the actual order depends on the model's scores.
print(expanded)  # ['深度学习', '神经网络', '数据科学', ...]

3. 上下文扩展

利用 LLM 生成相关查询:

from openai import OpenAI

# Shared OpenAI client, created once and reused by llm_expansion.
client = OpenAI()

def _parse_expansions(content: str, n: int) -> List[str]:
    """Extract at most ``n`` clean query strings from the raw LLM reply."""
    import re  # local import keeps this snippet self-contained

    # Leading list marker: "1." / "1)" / "1、" (not followed by a digit, so
    # "3.5 版本" is left intact) or a bullet followed by whitespace.
    marker = re.compile(r"^(?:\d+[.)、.)](?!\d)|[-*•]\s)\s*")

    marked: List[str] = []
    plain: List[str] = []
    for raw_line in content.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        stripped = marker.sub("", line, count=1).strip()
        if stripped != line:
            if stripped:
                marked.append(stripped)
        else:
            plain.append(line)

    # When the reply is a numbered/bulleted list, unmarked lines are preamble
    # or commentary rather than queries, so prefer the list items.
    return (marked or plain)[:n]


def llm_expansion(query: str, n: int = 3) -> List[str]:
    """Ask the LLM for ``n`` expansions of ``query``.

    Args:
        query: The original search query.
        n: Number of expanded queries to request and the maximum returned.

    Returns:
        Up to ``n`` expanded queries with list markers (``1.``, ``-``) and
        surrounding whitespace removed. Empty when ``n <= 0`` or when the
        reply has no text content.
    """
    # Nothing to generate: skip the API call entirely.
    if n <= 0:
        return []

    prompt = f"""请为以下查询生成 {n} 个语义相关的扩展查询:

原始查询:{query}

要求:
1. 保持原始意图
2. 使用不同的表达方式
3. 添加相关概念

扩展查询:"""

    response = client.chat.completions.create(
        model="gpt-3.5-turbo",
        messages=[{"role": "user", "content": prompt}]
    )

    # The message content can be None; treat that as an empty reply instead
    # of failing on ``None.split``.
    content = response.choices[0].message.content or ""
    return _parse_expansions(content, n)

# Example
expanded = llm_expansion("Python 数据分析")
print(expanded)
# Illustrative output (the actual text depends on the model's reply):
# ['Python pandas 数据处理', 'Python 数据可视化教程', ...]

4. 伪相关反馈(PRF)

假设初次检索返回的前若干篇文档与查询相关,从这些文档中提取高频词来扩展查询:

from collections import Counter
import jieba

def prf_expansion(query: str, documents: List[str], top_k: int = 3,
                  feedback_docs: int = 10) -> str:
    """Expand ``query`` with frequent terms from the top-ranked documents.

    Args:
        query: The original search query.
        documents: Retrieved documents, best match first.
        top_k: Maximum number of expansion terms to append.
        feedback_docs: How many leading documents are assumed relevant.

    Returns:
        The query followed by up to ``top_k`` space-separated expansion terms,
        or the unchanged query when no term qualifies.
    """
    # Assume the first ``feedback_docs`` documents are relevant.
    top_docs = documents[:max(feedback_docs, 0)]
    if not top_docs or top_k <= 0:
        return query

    stop_words = {'的', '是', '在', '了', '和', '与'}
    # Terms already present in the query add nothing when appended again.
    query_terms = {term.casefold() for term in jieba.lcut(query)}

    # Count every token across the feedback documents.
    word_freq = Counter(
        word for doc in top_docs for word in jieba.lcut(doc)
    )

    expansion_terms = []
    # Walk all candidates by frequency; a fixed cap on candidates could leave
    # fewer than top_k terms when the most frequent tokens are all filtered.
    for word, _ in word_freq.most_common():
        if len(expansion_terms) >= top_k:
            break
        if len(word) <= 1 or word in stop_words:
            continue
        if word.casefold() in query_terms:
            continue
        # Drop pure punctuation/whitespace tokens such as "...".
        if not any(ch.isalnum() for ch in word):
            continue
        expansion_terms.append(word)

    # Avoid a dangling trailing space when nothing was selected.
    if not expansion_terms:
        return query
    return query + ' ' + ' '.join(expansion_terms)

# Example
# Every element must be a string: a bare `...` inside the list would be the
# Ellipsis object and would be handed to the tokenizer as a document.
docs = ["Python 是一种编程语言...", "数据分析使用 Python..."]
expanded = prf_expansion("Python 教程", docs)
# Illustrative output; the actual terms depend on jieba's segmentation.
print(expanded)  # "Python 教程 编程 数据 分析"

扩展策略选择

┌─────────────────────────────────────────┐
│            查询扩展策略选择              │
├─────────────────────────────────────────┤
│                                         │
│   短查询 ──→ 同义词扩展 + 语义扩展      │
│                                         │
│   长查询 ──→ 关键词提取 + PRF           │
│                                         │
│   专业领域 ──→ 领域词典 + LLM 扩展      │
│                                         │
│   通用领域 ──→ 同义词 + 语义扩展        │
│                                         │
└─────────────────────────────────────────┘

实现示例

class QueryExpander:
    """Expand a query with synonyms and a (placeholder) semantic step.

    ``synonyms`` is the module-level ``SYNONYMS`` dictionary and ``model`` is
    the sentence-embedding model; both are set in ``__init__``.
    """

    def __init__(self):
        self.synonyms = SYNONYMS
        # NOTE(review): the model is loaded eagerly although _semantic_expand
        # does not use it yet.
        self.model = SentenceTransformer('all-MiniLM-L6-v2')

    def expand(self, query: str, method: str = "hybrid") -> str:
        """Expand ``query`` using the selected method.

        Args:
            query: The original search query.
            method: ``"synonym"``, ``"semantic"`` or ``"hybrid"`` (synonym
                expansion followed by semantic expansion).

        Returns:
            The expanded query string. An unrecognized ``method`` returns
            ``query`` unchanged.
        """
        if method == "synonym":
            return self._synonym_expand(query)
        elif method == "semantic":
            return self._semantic_expand(query)
        elif method == "hybrid":
            # Chain both steps: synonyms first, then semantic expansion.
            expanded = self._synonym_expand(query)
            return self._semantic_expand(expanded)
        return query

    def _synonym_expand(self, query: str) -> str:
        """Append up to two synonyms for each dictionary term found in a token.

        The query is split on whitespace; each token is kept and followed by
        the first two synonyms of every dictionary term it contains.
        """
        expanded_words = []
        for token in query.split():
            expanded_words.append(token)
            for term, alternatives in self.synonyms.items():
                # Substring match rather than equality: Chinese text has no
                # spaces between words, so a token such as "机器学习入门" must
                # still trigger the "机器学习" entry.
                if term and term in token:
                    expanded_words.extend(alternatives[:2])
        return ' '.join(expanded_words)

    def _semantic_expand(self, query: str) -> str:
        """Placeholder for semantic expansion; currently returns ``query`` as is."""
        # Semantic expansion logic is not implemented yet.
        return query

# Usage
expander = QueryExpander()
expanded = expander.expand("机器学习入门", method="hybrid")
print(expanded)

小结

查询扩展能有效提高检索召回率。下一章学习查询分解技术。