第七章:聚合分析¶
聚合概述¶
Elasticsearch 聚合提供了强大的数据分析能力,类似于 SQL 的 GROUP BY 和聚合函数。
聚合类型¶
| 类型 | 说明 | 示例 |
|---|---|---|
| Bucket | 分桶聚合 | terms、range、date_histogram |
| Metric | 指标聚合 | avg、sum、max、min、stats |
| Pipeline | 管道聚合 | derivative、cumulative_sum |
| Matrix | 矩阵聚合 | matrix_stats |
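聚合结构¶
聚合定义放在搜索请求的 aggs(或 aggregations)字段下:每个聚合都有一个自定义名称,名称下指定聚合类型及其参数,并可通过嵌套的 aggs 定义子聚合;将 size 设为 0 可以只返回聚合结果而不返回命中的文档。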
指标聚合¶
基本统计¶
GET /products/_search
{
"size": 0,
"aggs": {
"avg_price": { "avg": { "field": "price" } },
"max_price": { "max": { "field": "price" } },
"min_price": { "min": { "field": "price" } },
"sum_price": { "sum": { "field": "price" } },
"count_products": { "value_count": { "field": "price" } }
}
}
stats 聚合¶
# 一次性获取多个统计值
GET /products/_search
{
"size": 0,
"aggs": {
"price_stats": {
"stats": { "field": "price" }
}
}
}
# 响应
{
"aggregations": {
"price_stats": {
"count": 100,
"min": 99.0,
"max": 9999.0,
"avg": 2549.5,
"sum": 254950.0
}
}
}
extended_stats 聚合¶
# 扩展统计(包含方差、标准差等)
GET /products/_search
{
"size": 0,
"aggs": {
"price_stats": {
"extended_stats": { "field": "price" }
}
}
}
cardinality 聚合¶
# 去重计数(近似值)
GET /products/_search
{
"size": 0,
"aggs": {
"unique_brands": {
"cardinality": {
"field": "brand",
"precision_threshold": 100
}
}
}
}
percentiles 聚合¶
# 百分位数
GET /products/_search
{
"size": 0,
"aggs": {
"price_percentiles": {
"percentiles": {
"field": "price",
"percents": [25, 50, 75, 90, 95, 99]
}
}
}
}
top_hits 聚合¶
# 获取每个桶内的文档
GET /products/_search
{
"size": 0,
"aggs": {
"brands": {
"terms": { "field": "brand" },
"aggs": {
"top_products": {
"top_hits": {
"size": 3,
"sort": [{ "price": "desc" }],
"_source": ["name", "price"]
}
}
}
}
}
}
分桶聚合¶
terms 聚合¶
# 按字段值分组
GET /products/_search
{
"size": 0,
"aggs": {
"brands": {
"terms": {
"field": "brand",
"size": 10,
"order": { "_count": "desc" }
}
}
}
}
# 按指标排序
GET /products/_search
{
"size": 0,
"aggs": {
"brands": {
"terms": {
"field": "brand",
"order": { "avg_price": "desc" }
},
"aggs": {
"avg_price": { "avg": { "field": "price" } }
}
}
}
}
range 聚合¶
# 数值范围分组
GET /products/_search
{
"size": 0,
"aggs": {
"price_ranges": {
"range": {
"field": "price",
"ranges": [
{ "key": "低价", "to": 1000 },
{ "key": "中价", "from": 1000, "to": 5000 },
{ "key": "高价", "from": 5000 }
]
}
}
}
}
date_range 聚合¶
# 日期范围分组
GET /orders/_search
{
"size": 0,
"aggs": {
"date_ranges": {
"date_range": {
"field": "created_at",
"ranges": [
{ "from": "now-1M", "to": "now", "key": "最近一月" },
{ "from": "now-3M", "to": "now-1M", "key": "一月前到三月前" },
{ "from": "now-6M", "to": "now-3M", "key": "三月前到六月前" }
]
}
}
}
}
histogram 聚合¶
# 数值直方图
GET /products/_search
{
"size": 0,
"aggs": {
"price_histogram": {
"histogram": {
"field": "price",
"interval": 1000,
"min_doc_count": 1
}
}
}
}
date_histogram 聚合¶
# 日期直方图
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month",
"format": "yyyy-MM"
}
}
}
}
# 固定间隔
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "created_at",
"fixed_interval": "1d"
}
}
}
}
filter 聚合¶
# 单过滤器分桶
GET /products/_search
{
"size": 0,
"aggs": {
"apple_products": {
"filter": {
"term": { "brand": "Apple" }
},
"aggs": {
"avg_price": { "avg": { "field": "price" } }
}
}
}
}
filters 聚合¶
# 多过滤器分桶
GET /products/_search
{
"size": 0,
"aggs": {
"brand_filter": {
"filters": {
"filters": {
"apple": { "term": { "brand": "Apple" } },
"samsung": { "term": { "brand": "Samsung" } },
"huawei": { "term": { "brand": "Huawei" } }
}
}
}
}
}
missing 聚合¶
# 缺失值分桶
GET /products/_search
{
"size": 0,
"aggs": {
"missing_description": {
"missing": { "field": "description" }
}
}
}
nested 聚合¶
# 嵌套文档聚合
GET /orders/_search
{
"size": 0,
"aggs": {
"items": {
"nested": { "path": "items" },
"aggs": {
"avg_price": { "avg": { "field": "items.price" } }
}
}
}
}
管道聚合¶
derivative 聚合(导数)¶
# 计算相邻桶的差值
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month"
},
"aggs": {
"total_sales": { "sum": { "field": "amount" } },
"sales_derivative": {
"derivative": { "buckets_path": "total_sales" }
}
}
}
}
}
cumulative_sum 聚合(累计求和)¶
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month"
},
"aggs": {
"total_sales": { "sum": { "field": "amount" } },
"cumulative_sales": {
"cumulative_sum": { "buckets_path": "total_sales" }
}
}
}
}
}
moving_avg 聚合(移动平均)¶
# The moving_avg pipeline aggregation was deprecated in 6.4 and removed in 8.0.
# moving_fn with MovingFunctions.unweightedAvg is the supported replacement for
# a simple moving average. The aggregation keeps the name "moving_avg" so the
# response key is unchanged.
GET /orders/_search
{
"size": 0,
"aggs": {
"sales_over_time": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month"
},
"aggs": {
"total_sales": { "sum": { "field": "amount" } },
"moving_avg": {
"moving_fn": {
"buckets_path": "total_sales",
"window": 3,
"script": "MovingFunctions.unweightedAvg(values)"
}
}
}
}
}
}
bucket_selector 聚合¶
# 基于条件过滤桶
GET /products/_search
{
"size": 0,
"aggs": {
"brands": {
"terms": { "field": "brand" },
"aggs": {
"avg_price": { "avg": { "field": "price" } },
"price_filter": {
"bucket_selector": {
"buckets_path": {
"avgPrice": "avg_price"
},
"script": "params.avgPrice > 1000"
}
}
}
}
}
}
bucket_sort 聚合¶
# 桶排序
GET /products/_search
{
"size": 0,
"aggs": {
"brands": {
"terms": {
"field": "brand",
"size": 100
},
"aggs": {
"avg_price": { "avg": { "field": "price" } },
"sort": {
"bucket_sort": {
"sort": [{ "avg_price": "desc" }],
"size": 10
}
}
}
}
}
}
综合示例¶
电商销售分析¶
# Monthly sales report: total amount, order count, average order amount,
# top 5 categories by document count, and a running total across months.
# order_count counts values of the regular "amount" field: the _id metadata
# field is restricted from use in aggregations. Each bucket's doc_count also
# reports the number of orders in that month.
GET /orders/_search
{
"size": 0,
"aggs": {
"by_month": {
"date_histogram": {
"field": "created_at",
"calendar_interval": "month",
"format": "yyyy-MM"
},
"aggs": {
"total_amount": { "sum": { "field": "amount" } },
"order_count": { "value_count": { "field": "amount" } },
"avg_order_amount": { "avg": { "field": "amount" } },
"by_category": {
"terms": { "field": "category", "size": 5 },
"aggs": {
"category_amount": { "sum": { "field": "amount" } }
}
},
"cumulative_amount": {
"cumulative_sum": { "buckets_path": "total_amount" }
}
}
}
}
}
用户行为分析¶
GET /user_actions/_search
{
"size": 0,
"aggs": {
"by_user": {
"terms": {
"field": "user_id",
"size": 100
},
"aggs": {
"action_count": { "value_count": { "field": "action" } },
"last_action": {
"max": { "field": "timestamp" }
},
"unique_actions": {
"cardinality": { "field": "action" }
},
"active_users": {
"bucket_selector": {
"buckets_path": {
"count": "action_count"
},
"script": "params.count > 10"
}
}
}
}
}
}
日志分析¶
GET /logs/_search
{
"size": 0,
"aggs": {
"by_level": {
"terms": { "field": "level" }
},
"by_service": {
"terms": { "field": "service", "size": 10 },
"aggs": {
"error_count": {
"filter": { "term": { "level": "ERROR" } }
},
"error_rate": {
"bucket_script": {
"buckets_path": {
"errors": "error_count._count",
"total": "_count"
},
"script": "params.errors / params.total * 100"
}
}
}
},
"errors_over_time": {
"filter": { "term": { "level": "ERROR" } },
"aggs": {
"by_hour": {
"date_histogram": {
"field": "@timestamp",
"calendar_interval": "hour"
}
}
}
}
}
}
聚合优化¶
限制桶数量¶
GET /products/_search
{
"size": 0,
"aggs": {
"brands": {
"terms": {
"field": "brand",
"size": 10, # 返回前 10 个桶
"shard_size": 50 # 每个分片返回 50 个桶
}
}
}
}
使用过滤器缩小范围¶
GET /products/_search
{
"size": 0,
"query": {
"range": { "created_at": { "gte": "now-30d" } }
},
"aggs": {
"brands": {
"terms": { "field": "brand" }
}
}
}
避免高基数聚合¶
# 不推荐:高基数字段
GET /products/_search
{
"aggs": {
"all_ids": {
"terms": { "field": "product_id" } # 可能有数百万个唯一值
}
}
}
# 推荐:使用 cardinality 近似计数
GET /products/_search
{
"aggs": {
"unique_count": {
"cardinality": { "field": "product_id" }
}
}
}
使用 execution_hint¶
# 仅当查询只匹配极少量文档时才考虑使用 map 方式;其他情况下默认的 global_ordinals 通常更快
GET /products/_search
{
"aggs": {
"brands": {
"terms": {
"field": "brand",
"execution_hint": "map"
}
}
}
}
小结¶
本章介绍了 Elasticsearch 聚合分析:
- 指标聚合:avg、sum、max、min、stats、cardinality
- 分桶聚合:terms、range、histogram、date_histogram
- 管道聚合:derivative、cumulative_sum、moving_avg(6.4 起已弃用,8.0 起已移除,新版本请使用 moving_fn)
- 综合示例:电商分析、用户行为、日志分析
- 优化技巧:限制桶数、过滤范围、避免高基数、使用 execution_hint
下一章我们将学习集群运维。