Chapter 7: Custom Evaluation¶
Why Custom Evaluation?¶
RAGAS's built-in metrics may not cover every scenario. Custom metrics are useful for:
- Domain-specific evaluation requirements
- Business-specific quality standards
- Combining multiple evaluation dimensions
Custom Metric Basics¶
Creating a Simple Metric¶
```python
from ragas.metrics import Metric


class SimpleMetric(Metric):
    name = "simple_metric"

    def score(self, question, answer, contexts, **kwargs):
        # Custom scoring logic
        score = len(answer) / 100  # Example: score by answer length
        return min(score, 1.0)


# Usage
metric = SimpleMetric()
score = metric.score(
    question="What is Python?",
    answer="Python is a programming language.",
    contexts=["Python is a high-level programming language."],
)
```
LLM-Based Metrics¶
```python
from ragas.metrics import Metric
from langchain_openai import ChatOpenAI


class LLMBasedMetric(Metric):
    name = "llm_metric"

    def __init__(self, llm=None):
        self.llm = llm or ChatOpenAI(model="gpt-3.5-turbo")

    def score(self, question, answer, contexts, **kwargs):
        prompt = f"""
Rate the quality of the following answer on a 0-1 scale:
Question: {question}
Answer: {answer}
Contexts: {contexts}
Return only the score, with no explanation.
"""
        response = self.llm.invoke(prompt)
        return float(response.content.strip())
```
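As a rough usage sketch of the metric above (the model name and inputs are placeholders), it can help to guard against the LLM occasionally returning text instead of a bare number:

```python
# Hypothetical usage of the LLM-based metric defined above.
metric = LLMBasedMetric()

try:
    score = metric.score(
        question="What is Python?",
        answer="Python is a high-level, general-purpose programming language.",
        contexts=["Python is a high-level programming language."],
    )
except ValueError:
    # The LLM may return prose instead of a number; fall back to a neutral
    # score (an assumption for this sketch, not part of the chapter's code).
    score = 0.0

print(f"llm_metric score: {score}")
```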
Common Custom Metrics¶
1. Answer Completeness¶
```python
class CompletenessMetric(Metric):
    name = "completeness"

    def __init__(self, llm=None):
        self.llm = llm or ChatOpenAI()

    def score(self, question, answer, contexts, **kwargs):
        prompt = f"""
Assess whether the answer fully addresses the question:
Question: {question}
Answer: {answer}
Scoring criteria:
1.0 - fully answered
0.5 - partially answered
0.0 - not answered
Return only the score.
"""
        return float(self.llm.invoke(prompt).content.strip())
```
2. Answer Conciseness¶
```python
class ConcisenessMetric(Metric):
    name = "conciseness"

    def score(self, question, answer, contexts, **kwargs):
        # Length-based heuristic
        words = len(answer.split())
        if words <= 50:
            return 1.0
        elif words <= 100:
            return 0.8
        elif words <= 200:
            return 0.6
        else:
            return 0.4
```
3. Terminology Accuracy¶
```python
class TerminologyMetric(Metric):
    name = "terminology_accuracy"

    def __init__(self, domain_terms, llm=None):
        self.domain_terms = domain_terms  # List of domain-specific terms
        self.llm = llm or ChatOpenAI()

    def score(self, question, answer, contexts, **kwargs):
        # Check whether domain terms are used correctly
        prompt = f"""
Check whether the technical terms in the answer are used correctly:
Answer: {answer}
Domain terms: {self.domain_terms}
Return a score from 0 to 1 indicating how accurately the terms are used.
"""
        return float(self.llm.invoke(prompt).content.strip())
```
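A quick usage sketch; the term list, question, and answer below are made-up examples for a medical domain:

```python
# Hypothetical domain term list for a medical RAG system.
medical_terms = ["hypertension", "beta blocker", "systolic pressure"]

metric = TerminologyMetric(domain_terms=medical_terms)
score = metric.score(
    question="How do beta blockers affect blood pressure?",
    answer="Beta blockers lower systolic pressure and are used to treat hypertension.",
    contexts=[],
)
```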
4. Composite Multi-Dimensional Metric¶
```python
class CompositeMetric(Metric):
    name = "composite"

    def __init__(self, metrics, weights=None):
        self.metrics = metrics
        self.weights = weights or [1 / len(metrics)] * len(metrics)

    def score(self, question, answer, contexts, **kwargs):
        scores = []
        for metric in self.metrics:
            score = metric.score(question, answer, contexts, **kwargs)
            scores.append(score)
        # Weighted average
        return sum(s * w for s, w in zip(scores, self.weights))


# Usage
from ragas.metrics import faithfulness, answer_relevancy

# Note: RAGAS built-in metrics score dataset samples rather than raw strings,
# so depending on your RAGAS version they may need a thin adapter to expose
# this score(question, answer, contexts) interface.
composite = CompositeMetric(
    metrics=[faithfulness, answer_relevancy],
    weights=[0.6, 0.4],
)
score = composite.score(question, answer, contexts)
```
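If you prefer to stay entirely within the simplified interface used throughout this chapter, the composite can combine the custom metrics defined above directly; the weights below are arbitrary choices for illustration:

```python
# Combine two of the chapter's own metrics (weights chosen arbitrarily).
composite = CompositeMetric(
    metrics=[CompletenessMetric(), ConcisenessMetric()],
    weights=[0.7, 0.3],
)

score = composite.score(
    question="What is Python?",
    answer="Python is a high-level programming language created by Guido van Rossum.",
    contexts=["Python is a high-level programming language."],
)
print(f"composite score: {score:.2f}")
```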
Business-Specific Metrics¶
Customer Service Scenario¶
```python
class CustomerServiceMetric(Metric):
    name = "customer_service_quality"

    def __init__(self, llm=None):
        self.llm = llm or ChatOpenAI()

    def score(self, question, answer, contexts, **kwargs):
        prompt = f"""
Rate the quality of this customer service reply:
User question: {question}
Agent reply: {answer}
Evaluation dimensions:
1. Does it resolve the user's issue?
2. Is the tone friendly?
3. Does it offer follow-up suggestions?
Return a score from 0 to 1.
"""
        return float(self.llm.invoke(prompt).content.strip())
```
Legal Advice Scenario¶
```python
class LegalAdviceMetric(Metric):
    name = "legal_advice_quality"

    def __init__(self, llm=None):
        self.llm = llm or ChatOpenAI()

    def score(self, question, answer, contexts, **kwargs):
        prompt = f"""
Rate the quality of this legal advice answer:
Question: {question}
Answer: {answer}
Reference material: {contexts}
Key points to evaluate:
1. Is the legal basis accurate?
2. Does it include a disclaimer?
3. Are the recommendations reasonable?
Return a score from 0 to 1.
"""
        return float(self.llm.invoke(prompt).content.strip())
```
Registering Custom Metrics¶
```python
from ragas import evaluate
from ragas.metrics import faithfulness

# Create a custom metric
my_metric = CompletenessMetric()

# Use it alongside built-in metrics in an evaluation
results = evaluate(
    dataset,
    metrics=[faithfulness, my_metric],
)
```
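Assuming the custom metric integrates with evaluate() as shown above, per-sample scores can then be inspected as a DataFrame; recent RAGAS versions expose a to_pandas() method on the evaluation result:

```python
# Custom metric columns are keyed by the metric's `name` attribute
# ("completeness" here is assumed from CompletenessMetric.name).
df = results.to_pandas()
print(df.head())
print(df["completeness"].mean())
```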
Batch Evaluation Optimization¶
Asynchronous Evaluation¶
```python
import asyncio


async def async_score(metric, question, answer, contexts):
    # Run the synchronous metric in a worker thread
    return await asyncio.to_thread(
        metric.score, question, answer, contexts
    )


async def batch_evaluate(metric, data_list):
    tasks = [
        async_score(metric, d["question"], d["answer"], d["contexts"])
        for d in data_list
    ]
    return await asyncio.gather(*tasks)
```
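A minimal way to drive the helper above from synchronous code; the sample data below is made up for illustration:

```python
# Hypothetical sample data for the batch helper above.
data_list = [
    {
        "question": "What is Python?",
        "answer": "Python is a high-level programming language.",
        "contexts": ["Python is a high-level programming language."],
    },
    {
        "question": "Who created Python?",
        "answer": "Guido van Rossum.",
        "contexts": ["Python was created by Guido van Rossum."],
    },
]

metric = ConcisenessMetric()
scores = asyncio.run(batch_evaluate(metric, data_list))
print(scores)
```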
Caching Results¶
```python
from functools import lru_cache


class CachedMetric(Metric):
    name = "cached_metric"

    @lru_cache(maxsize=1000)
    def _cached_score(self, question, answer, contexts_str):
        # Actual scoring logic goes here
        return self._compute_score(question, answer, contexts_str)

    def score(self, question, answer, contexts, **kwargs):
        # Join contexts into a string so all cache-key arguments are hashable
        contexts_str = "|".join(contexts)
        return self._cached_score(question, answer, contexts_str)
```
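A quick sketch of the cache in action; `_compute_score` is left abstract above, so this example fills it in with a trivial length-based stand-in purely for demonstration:

```python
class CachedLengthMetric(CachedMetric):
    name = "cached_length_metric"

    def _compute_score(self, question, answer, contexts_str):
        # Stand-in scoring logic, illustrative only
        return min(len(answer) / 100, 1.0)


metric = CachedLengthMetric()
args = ("What is Python?", "Python is a programming language.", ["Python docs"])
first = metric.score(*args)   # computed
second = metric.score(*args)  # served from the lru_cache
```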
Debugging and Testing¶
Unit Tests¶
```python
import pytest


def test_completeness_metric():
    metric = CompletenessMetric()

    # A complete answer should score high
    score = metric.score(
        question="What is Python?",
        answer="Python is a high-level programming language created by Guido van Rossum.",
        contexts=[],
    )
    assert score >= 0.8

    # An incomplete answer should score low
    score = metric.score(
        question="What is Python?",
        answer="A programming language.",
        contexts=[],
    )
    assert score < 0.5
```
Visualization Analysis¶
```python
import matplotlib.pyplot as plt


def visualize_metric_distribution(results, metric_name):
    # Plot a histogram of the given metric's scores and save it to disk
    scores = [r[metric_name] for r in results]
    plt.hist(scores, bins=20, edgecolor="black")
    plt.xlabel("Score")
    plt.ylabel("Count")
    plt.title(f"{metric_name} Distribution")
    plt.savefig(f"{metric_name}_distribution.png")
    plt.close()  # avoid accumulating figures across repeated calls
```
Best Practices¶
- Define clear evaluation goals: establish explicit scoring criteria
- Validate consistency: repeated evaluations should produce stable results (see the sketch below)
- Combine with human evaluation: periodically calibrate automated scores against human judgment
- Document everything: record each metric's definition and scoring logic
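Since LLM-based judges are stochastic, a simple consistency check like the following can help with the second point; the repetition count and threshold are arbitrary choices for this sketch:

```python
import statistics


def check_consistency(metric, question, answer, contexts, runs=5, max_std=0.1):
    """Run a metric several times on the same input and check score stability."""
    scores = [metric.score(question, answer, contexts) for _ in range(runs)]
    std = statistics.stdev(scores)
    return std <= max_std, scores, std
```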
Summary¶
This chapter covered how to create and use custom evaluation metrics. The next chapter demonstrates a complete application through hands-on case studies.