跳转至

第七章:聚合分析

聚合概述

Elasticsearch 聚合提供了强大的数据分析能力,类似于 SQL 的 GROUP BY 和聚合函数。

聚合类型

| 类型 | 说明 | 示例 |
| --- | --- | --- |
| Bucket | 分桶聚合 | terms、range、date_histogram |
| Metric | 指标聚合 | avg、sum、max、min、stats |
| Pipeline | 管道聚合 | derivative、cumulative_sum |
| Matrix | 矩阵聚合 | matrix_stats |

聚合结构

GET /products/_search
{
  "size": 0,
  "aggs": {
    "聚合名称": {
      "聚合类型": {
        "参数": "值"
      }
    }
  }
}

指标聚合

基本统计

GET /products/_search
{
  "size": 0,
  "aggs": {
    "avg_price": { "avg": { "field": "price" } },
    "max_price": { "max": { "field": "price" } },
    "min_price": { "min": { "field": "price" } },
    "sum_price": { "sum": { "field": "price" } },
    "count_products": { "value_count": { "field": "price" } }
  }
}

stats 聚合

# 一次性获取多个统计值
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_stats": {
      "stats": { "field": "price" }
    }
  }
}

# 响应
{
  "aggregations": {
    "price_stats": {
      "count": 100,
      "min": 99.0,
      "max": 9999.0,
      "avg": 2549.5,
      "sum": 254950.0
    }
  }
}

extended_stats 聚合

# 扩展统计(包含方差、标准差等)
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_stats": {
      "extended_stats": { "field": "price" }
    }
  }
}

cardinality 聚合

# 去重计数(近似值)
GET /products/_search
{
  "size": 0,
  "aggs": {
    "unique_brands": {
      "cardinality": { 
        "field": "brand",
        "precision_threshold": 100
      }
    }
  }
}

percentiles 聚合

# 百分位数
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_percentiles": {
      "percentiles": {
        "field": "price",
        "percents": [25, 50, 75, 90, 95, 99]
      }
    }
  }
}

top_hits 聚合

# 获取每个桶内的文档
GET /products/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "terms": { "field": "brand" },
      "aggs": {
        "top_products": {
          "top_hits": {
            "size": 3,
            "sort": [{ "price": "desc" }],
            "_source": ["name", "price"]
          }
        }
      }
    }
  }
}

分桶聚合

terms 聚合

# 按字段值分组
GET /products/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "terms": {
        "field": "brand",
        "size": 10,
        "order": { "_count": "desc" }
      }
    }
  }
}

# 按指标排序
GET /products/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "terms": {
        "field": "brand",
        "order": { "avg_price": "desc" }
      },
      "aggs": {
        "avg_price": { "avg": { "field": "price" } }
      }
    }
  }
}

range 聚合

# 数值范围分组
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_ranges": {
      "range": {
        "field": "price",
        "ranges": [
          { "key": "低价", "to": 1000 },
          { "key": "中价", "from": 1000, "to": 5000 },
          { "key": "高价", "from": 5000 }
        ]
      }
    }
  }
}

date_range 聚合

# 日期范围分组
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "date_ranges": {
      "date_range": {
        "field": "created_at",
        "ranges": [
          { "from": "now-1M", "to": "now", "key": "最近一月" },
          { "from": "now-3M", "to": "now-1M", "key": "一月前到三月前" },
          { "from": "now-6M", "to": "now-3M", "key": "三月前到六月前" }
        ]
      }
    }
  }
}

histogram 聚合

# 数值直方图
GET /products/_search
{
  "size": 0,
  "aggs": {
    "price_histogram": {
      "histogram": {
        "field": "price",
        "interval": 1000,
        "min_doc_count": 1
      }
    }
  }
}

date_histogram 聚合

# 日期直方图
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "created_at",
        "calendar_interval": "month",
        "format": "yyyy-MM"
      }
    }
  }
}

# 固定间隔
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "created_at",
        "fixed_interval": "1d"
      }
    }
  }
}

filter 聚合

# 单过滤器分桶
GET /products/_search
{
  "size": 0,
  "aggs": {
    "apple_products": {
      "filter": {
        "term": { "brand": "Apple" }
      },
      "aggs": {
        "avg_price": { "avg": { "field": "price" } }
      }
    }
  }
}

filters 聚合

# 多过滤器分桶
GET /products/_search
{
  "size": 0,
  "aggs": {
    "brand_filter": {
      "filters": {
        "filters": {
          "apple": { "term": { "brand": "Apple" } },
          "samsung": { "term": { "brand": "Samsung" } },
          "huawei": { "term": { "brand": "Huawei" } }
        }
      }
    }
  }
}

missing 聚合

# 缺失值分桶
GET /products/_search
{
  "size": 0,
  "aggs": {
    "missing_description": {
      "missing": { "field": "description" }
    }
  }
}

nested 聚合

# 嵌套文档聚合
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "items": {
      "nested": { "path": "items" },
      "aggs": {
        "avg_price": { "avg": { "field": "items.price" } }
      }
    }
  }
}

管道聚合

derivative 聚合(导数)

# 计算相邻桶的差值
GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "created_at",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_sales": { "sum": { "field": "amount" } },
        "sales_derivative": {
          "derivative": { "buckets_path": "total_sales" }
        }
      }
    }
  }
}

cumulative_sum 聚合(累计求和)

GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "created_at",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_sales": { "sum": { "field": "amount" } },
        "cumulative_sales": {
          "cumulative_sum": { "buckets_path": "total_sales" }
        }
      }
    }
  }
}

moving_avg 聚合(移动平均;注意:该聚合自 6.4 起已弃用,8.0 起已移除,新版本请改用 moving_fn 聚合)

GET /orders/_search
{
  "size": 0,
  "aggs": {
    "sales_over_time": {
      "date_histogram": {
        "field": "created_at",
        "calendar_interval": "month"
      },
      "aggs": {
        "total_sales": { "sum": { "field": "amount" } },
        "moving_avg": {
          "moving_avg": { 
            "buckets_path": "total_sales",
            "window": 3
          }
        }
      }
    }
  }
}

bucket_selector 聚合

# 基于条件过滤桶
GET /products/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "terms": { "field": "brand" },
      "aggs": {
        "avg_price": { "avg": { "field": "price" } },
        "price_filter": {
          "bucket_selector": {
            "buckets_path": {
              "avgPrice": "avg_price"
            },
            "script": "params.avgPrice > 1000"
          }
        }
      }
    }
  }
}

bucket_sort 聚合

# 桶排序
GET /products/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "terms": { 
        "field": "brand",
        "size": 100
      },
      "aggs": {
        "avg_price": { "avg": { "field": "price" } },
        "sort": {
          "bucket_sort": {
            "sort": [{ "avg_price": "desc" }],
            "size": 10
          }
        }
      }
    }
  }
}

综合示例

电商销售分析

GET /orders/_search
{
  "size": 0,
  "aggs": {
    "by_month": {
      "date_histogram": {
        "field": "created_at",
        "calendar_interval": "month",
        "format": "yyyy-MM"
      },
      "aggs": {
        "total_amount": { "sum": { "field": "amount" } },
        "order_count": { "value_count": { "field": "_id" } },
        "avg_order_amount": { "avg": { "field": "amount" } },
        "by_category": {
          "terms": { "field": "category", "size": 5 },
          "aggs": {
            "category_amount": { "sum": { "field": "amount" } }
          }
        },
        "cumulative_amount": {
          "cumulative_sum": { "buckets_path": "total_amount" }
        }
      }
    }
  }
}

用户行为分析

GET /user_actions/_search
{
  "size": 0,
  "aggs": {
    "by_user": {
      "terms": { 
        "field": "user_id",
        "size": 100
      },
      "aggs": {
        "action_count": { "value_count": { "field": "action" } },
        "last_action": {
          "max": { "field": "timestamp" }
        },
        "unique_actions": {
          "cardinality": { "field": "action" }
        },
        "active_users": {
          "bucket_selector": {
            "buckets_path": {
              "count": "action_count"
            },
            "script": "params.count > 10"
          }
        }
      }
    }
  }
}

日志分析

GET /logs/_search
{
  "size": 0,
  "aggs": {
    "by_level": {
      "terms": { "field": "level" }
    },
    "by_service": {
      "terms": { "field": "service", "size": 10 },
      "aggs": {
        "error_count": {
          "filter": { "term": { "level": "ERROR" } }
        },
        "error_rate": {
          "bucket_script": {
            "buckets_path": {
              "errors": "error_count._count",
              "total": "_count"
            },
            "script": "params.errors / params.total * 100"
          }
        }
      }
    },
    "errors_over_time": {
      "filter": { "term": { "level": "ERROR" } },
      "aggs": {
        "by_hour": {
          "date_histogram": {
            "field": "@timestamp",
            "calendar_interval": "hour"
          }
        }
      }
    }
  }
}

聚合优化

限制桶数量

GET /products/_search
{
  "size": 0,
  "aggs": {
    "brands": {
      "terms": {
        "field": "brand",
        "size": 10,           # 返回前 10 个桶
        "shard_size": 50      # 每个分片返回 50 个桶
      }
    }
  }
}

使用过滤器缩小范围

GET /products/_search
{
  "size": 0,
  "query": {
    "range": { "created_at": { "gte": "now-30d" } }
  },
  "aggs": {
    "brands": {
      "terms": { "field": "brand" }
    }
  }
}

避免高基数聚合

# 不推荐:高基数字段
GET /products/_search
{
  "aggs": {
    "all_ids": {
      "terms": { "field": "product_id" }  # 可能有数百万个唯一值
    }
  }
}

# 推荐:使用 cardinality 近似计数
GET /products/_search
{
  "aggs": {
    "unique_count": {
      "cardinality": { "field": "product_id" }
    }
  }
}

使用 execution_hint

# 仅当查询匹配的文档很少时才考虑使用 map 方式(否则默认的 global_ordinals 更快)
GET /products/_search
{
  "aggs": {
    "brands": {
      "terms": {
        "field": "brand",
        "execution_hint": "map"
      }
    }
  }
}

小结

本章介绍了 Elasticsearch 聚合分析:

  • 指标聚合:avg、sum、max、min、stats、cardinality
  • 分桶聚合:terms、range、histogram、date_histogram
  • 管道聚合:derivative、cumulative_sum、moving_avg(已弃用,8.0 起请改用 moving_fn)
  • 综合示例:电商分析、用户行为、日志分析
  • 优化技巧:限制桶数、过滤范围、避免高基数

下一章我们将学习集群运维。