跳转至

第二章：Prometheus 安装与配置

安装方式

方式一：二进制安装

# Download the release tarball
wget https://github.com/prometheus/prometheus/releases/download/v2.48.0/prometheus-2.48.0.linux-amd64.tar.gz

# Unpack it
tar xvf prometheus-2.48.0.linux-amd64.tar.gz
cd prometheus-2.48.0.linux-amd64

# Quick test: run in the foreground with the bundled config (Ctrl+C to stop)
./prometheus --config.file=prometheus.yml

# Install the files where the systemd unit below expects them.
# Run these and all following steps as root; without them the unit fails
# because the user, binary, config file and data directory do not exist.
useradd --system --no-create-home --shell /bin/false prometheus
mkdir -p /etc/prometheus /var/lib/prometheus
cp prometheus promtool /usr/local/bin/
cp prometheus.yml /etc/prometheus/prometheus.yml
chown -R prometheus:prometheus /etc/prometheus /var/lib/prometheus

# Create the systemd service
cat > /etc/systemd/system/prometheus.service << 'EOF'
[Unit]
Description=Prometheus
After=network.target

[Service]
Type=simple
User=prometheus
ExecStart=/usr/local/bin/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/var/lib/prometheus \
  --web.listen-address=0.0.0.0:9090
Restart=on-failure

[Install]
WantedBy=multi-user.target
EOF

# Pick up the new unit, start it at boot, and start it now
systemctl daemon-reload
systemctl enable prometheus
systemctl start prometheus

方式二：Docker 安装

# NOTE: the three commands below are alternatives. They all use the container
# name "prometheus", so run only one of them (or `docker rm -f prometheus`
# before trying the next).

# Basic start
docker run -d \
  --name prometheus \
  -p 9090:9090 \
  prom/prometheus

# Mount a config file
# (the -v argument is quoted so a working directory containing spaces works)
docker run -d \
  --name prometheus \
  -p 9090:9090 \
  -v "$(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml" \
  prom/prometheus

# Persistent storage: keep TSDB data in the named volume "prometheus-data"
docker run -d \
  --name prometheus \
  -p 9090:9090 \
  -v "$(pwd)/prometheus.yml:/etc/prometheus/prometheus.yml" \
  -v prometheus-data:/prometheus \
  prom/prometheus \
  --config.file=/etc/prometheus/prometheus.yml \
  --storage.tsdb.path=/prometheus

方式三：Docker Compose

# docker-compose.yml
# Three-service monitoring stack (Prometheus, node-exporter, Alertmanager)
# attached to one shared network.
version: "3.8"

services:
  prometheus:
    image: prom/prometheus:latest
    container_name: prometheus
    ports:
      - '9090:9090'
    volumes:
      # Main config, alert rule directory, and a named volume for TSDB data
      - ./prometheus.yml:/etc/prometheus/prometheus.yml
      - ./alerts:/etc/prometheus/alerts
      - prometheus-data:/prometheus
    command:
      - "--config.file=/etc/prometheus/prometheus.yml"
      - "--storage.tsdb.path=/prometheus"
      - "--storage.tsdb.retention.time=30d"
      # Needed for the POST /-/reload endpoint
      - "--web.enable-lifecycle"
      # Needed for the /api/v1/admin/tsdb/* endpoints
      - "--web.enable-admin-api"
    restart: unless-stopped
    networks:
      - monitoring

  node-exporter:
    image: prom/node-exporter:latest
    container_name: node-exporter
    ports:
      - '9100:9100'
    volumes:
      # Host filesystems mounted read-only; the flags below point at them
      - /proc:/host/proc:ro
      - /sys:/host/sys:ro
      - /:/rootfs:ro
    command:
      - "--path.procfs=/host/proc"
      - "--path.sysfs=/host/sys"
      - "--path.rootfs=/rootfs"
    restart: unless-stopped
    networks:
      - monitoring

  alertmanager:
    image: prom/alertmanager:latest
    container_name: alertmanager
    ports:
      - '9093:9093'
    volumes:
      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
    restart: unless-stopped
    networks:
      - monitoring

volumes:
  prometheus-data:

networks:
  monitoring:

配置文件详解

prometheus.yml 基本结构

# Global settings
global:
  # How often targets are scraped
  scrape_interval: 15s
  # How often rules are evaluated
  evaluation_interval: 15s
  # Per-scrape timeout
  scrape_timeout: 10s

# Alerting: where to send fired alerts
alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

# Rule files (glob)
rule_files:
  - 'alerts/*.yml'

# Scrape jobs
scrape_configs:
  - job_name: prometheus
    static_configs:
      - targets:
          - 'localhost:9090'

全局配置

global:
  # Scrape interval and per-scrape timeout
  scrape_interval: 15s
  scrape_timeout: 10s

  # Rule evaluation interval
  evaluation_interval: 15s

  # External labels (used for federation and remote storage)
  external_labels:
    monitor: 'production'
    region: 'us-east-1'

# Remote write.
# This is a top-level section of prometheus.yml, NOT a child of `global`;
# nesting it under `global` makes Prometheus reject the config file.
remote_write:
  - url: "http://remote-storage:8080/write"
    queue_config:
      max_samples_per_send: 1000

# Remote read (also a top-level section)
remote_read:
  - url: "http://remote-storage:8080/read"

采集配置

scrape_configs:
  # Basic job with every common per-job setting spelled out
  - job_name: 'prometheus'
    scrape_interval: 15s
    scrape_timeout: 10s
    metrics_path: /metrics
    scheme: http

    static_configs:
      - targets: ['localhost:9090']
        labels:
          env: 'production'

  # Job with authentication
  - job_name: 'secure-app'
    # tls_config only takes effect for HTTPS scrapes; without this line the
    # job would use plain http and send the basic-auth credentials in clear.
    scheme: https
    basic_auth:
      username: admin
      # Example value only; prefer `password_file` over an inline password
      password: secret
    tls_config:
      ca_file: /etc/prometheus/ca.crt
      cert_file: /etc/prometheus/client.crt
      key_file: /etc/prometheus/client.key
      insecure_skip_verify: false
    static_configs:
      - targets: ['app:8080']

  # File-based service discovery
  - job_name: 'file-sd'
    file_sd_configs:
      - files:
          - /etc/prometheus/targets/*.json
        refresh_interval: 5m

  # DNS service discovery
  - job_name: 'dns-sd'
    dns_sd_configs:
      - names:
          - 'api.production.svc.cluster.local'
        type: 'A'
        port: 8080
        refresh_interval: 30s

  # Consul service discovery
  - job_name: 'consul-sd'
    consul_sd_configs:
      - server: 'consul:8500'
        services: ['api', 'web']
        tags: ['production']
        refresh_interval: 30s

  # Kubernetes service discovery
  - job_name: 'kubernetes-pods'
    kubernetes_sd_configs:
      - role: pod
        namespaces:
          names:
            - production
    relabel_configs:
      # Scrape only pods annotated prometheus.io/scrape: "true".
      # The regex is quoted so YAML treats it as a string, not a boolean.
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_scrape]
        action: keep
        regex: 'true'
      # Use the prometheus.io/path annotation, when set, as the metrics path
      - source_labels: [__meta_kubernetes_pod_annotation_prometheus_io_path]
        action: replace
        target_label: __metrics_path__
        regex: (.+)

标签重写(Relabel)

# Fragments that belong inside a scrape job.
relabel_configs:
  # Keep only targets whose namespace is "production"
  - action: keep
    source_labels:
      - __meta_kubernetes_namespace
    regex: 'production'

  # Copy the pod name into the "pod" label
  - action: replace
    source_labels:
      - __meta_kubernetes_pod_name
    target_label: pod

  # Drop every label whose name matches the regex
  - action: labeldrop
    regex: "__meta_kubernetes_pod_label_(.+)"

  # Map the pod's "app" label onto "application"
  - action: replace
    source_labels:
      - __meta_kubernetes_pod_label_app
    target_label: application

# Metric relabeling (applied to scraped samples)
metric_relabel_configs:
  # Drop series whose metric name matches go_.*
  - action: drop
    source_labels:
      - __name__
    regex: "go_.*"

  # Keep only series whose metric name matches http_.*
  - action: keep
    source_labels:
      - __name__
    regex: "http_.*"

告警规则配置

规则文件

# alerts/rules.yml
groups:
  - name: node_alerts
    # Evaluate this group every 30s
    interval: 30s
    rules:
      # Target down
      - alert: NodeDown
        expr: up == 0
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Node {{ $labels.instance }} is down"
          description: "{{ $labels.instance }} has been down for more than 5 minutes."

      # High CPU usage.
      # rate() is used instead of irate(): irate() looks only at the last two
      # samples in the window, so brief spikes/dips keep resetting the `for`
      # timer. Prometheus docs recommend rate() for alerting expressions.
      - alert: HighCPUUsage
        expr: 100 - (avg by(instance) (rate(node_cpu_seconds_total{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High CPU usage on {{ $labels.instance }}"
          description: "CPU usage is {{ $value }}%"

      # High memory usage
      - alert: HighMemoryUsage
        expr: (1 - (node_memory_MemAvailable_bytes / node_memory_MemTotal_bytes)) * 100 > 85
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High memory usage on {{ $labels.instance }}"
          description: "Memory usage is {{ $value }}%"

      # Low disk space (tmpfs and overlay filesystems excluded)
      - alert: DiskSpaceLow
        expr: (node_filesystem_avail_bytes{fstype!~"tmpfs|overlay"} / node_filesystem_size_bytes{fstype!~"tmpfs|overlay"}) * 100 < 10
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "Low disk space on {{ $labels.instance }}"
          description: "Disk {{ $labels.mountpoint }} has only {{ $value }}% free space"

  - name: application_alerts
    rules:
      # HTTP 5xx error rate above 5%
      - alert: HighErrorRate
        expr: sum(rate(http_requests_total{status=~"5.."}[5m])) / sum(rate(http_requests_total[5m])) * 100 > 5
        for: 5m
        labels:
          severity: critical
        annotations:
          summary: "High HTTP error rate"
          description: "Error rate is {{ $value }}%"

      # 99th-percentile request latency above 1s
      - alert: HighLatency
        expr: histogram_quantile(0.99, sum(rate(http_request_duration_seconds_bucket[5m])) by (le)) > 1
        for: 5m
        labels:
          severity: warning
        annotations:
          summary: "High request latency"
          description: "99th percentile latency is {{ $value }}s"

Alertmanager 配置

# alertmanager.yml
global:
  # SMTP settings (example credentials; replace before use)
  smtp_smarthost: 'smtp.example.com:587'
  smtp_from: 'alertmanager@example.com'
  smtp_auth_username: 'alertmanager@example.com'
  smtp_auth_password: 'password'

  # Slack settings
  slack_api_url: 'https://hooks.slack.com/services/xxx'

# Routing
route:
  receiver: 'default'
  group_by: ['alertname', 'severity']
  group_wait: 30s
  group_interval: 5m
  repeat_interval: 4h

  routes:
    # Critical alerts -> notify quickly.
    # `matchers` replaces the deprecated `match` key (Alertmanager >= 0.22).
    - matchers:
        - 'severity="critical"'
      receiver: 'critical'
      group_wait: 10s
      repeat_interval: 1h

    # Warnings -> email
    - matchers:
        - 'severity="warning"'
      receiver: 'warning'
      group_wait: 5m
      repeat_interval: 4h

# Receivers
receivers:
  - name: 'default'
    email_configs:
      - to: 'team@example.com'

  - name: 'critical'
    email_configs:
      - to: 'oncall@example.com'
    slack_configs:
      - channel: '#alerts-critical'
        send_resolved: true

  - name: 'warning'
    email_configs:
      - to: 'team@example.com'

# Inhibition rules (not silences): while a critical alert is firing, mute
# warning alerts that share the same alertname and instance.
# `source_matchers` / `target_matchers` replace the deprecated
# `source_match` / `target_match` keys (Alertmanager >= 0.22).
inhibit_rules:
  - source_matchers:
      - 'severity="critical"'
    target_matchers:
      - 'severity="warning"'
    equal: ['alertname', 'instance']

存储配置

本地存储

# Startup flags for local storage (pass them on the prometheus command line)
# TSDB data directory
--storage.tsdb.path=/var/lib/prometheus
# Time-based retention limit
--storage.tsdb.retention.time=30d
# Size-based retention limit
--storage.tsdb.retention.size=10GB

远程存储

# prometheus.yml
# remote_write / remote_read are top-level sections, not children of
# `global`; nesting them under `global` makes Prometheus reject the file.
remote_write:
  # VictoriaMetrics accepts Prometheus remote-write data on /api/v1/write
  - url: "http://victoriametrics:8428/api/v1/write"
    queue_config:
      max_samples_per_send: 1000
      max_shards: 200
      capacity: 2500

# NOTE(review): VictoriaMetrics appears not to implement the Prometheus
# remote_read API (it is normally queried directly instead) - confirm, and
# point this at a backend that supports remote read if one is needed.
remote_read:
  - url: "http://victoriametrics:8428/read"
    read_recent: true

安全配置

Web 配置

# web.yml
basic_auth_users:
  # user name -> bcrypt hash of the password (quoted so it is always a string)
  admin: '$2b$12$hNf2lSsxfm0.i4a.1kVpSOVyBCfIB51VRjgBUyv6kdnyTlgWj81Ay'

# Shell command (not part of web.yml): generate a bcrypt hash with cost 12
# for user "admin"; paste the hash part of the output into the file above.
htpasswd -nBC 12 admin

TLS 配置

# web.yml
# TLS settings for the Prometheus web endpoint.
tls_server_config:
  # Server certificate and its private key
  cert_file: "/etc/prometheus/server.crt"
  key_file: "/etc/prometheus/server.key"
  # CA bundle for client certificates, and the client-auth policy
  client_ca_file: "/etc/prometheus/ca.crt"
  client_auth_type: "VerifyClientCertIfGiven"

运维命令

热重载配置

# Enable the lifecycle API (Prometheus startup flag)
--web.enable-lifecycle

# Reload the configuration by POSTing to the reload endpoint
curl -X POST http://localhost:9090/-/reload

数据管理

# These endpoints only work when Prometheus runs with --web.enable-admin-api.

# Create a snapshot
curl -X POST http://localhost:9090/api/v1/admin/tsdb/snapshot

# Delete series matching the selector `up`.
# The URL is quoted so the shell does not glob `?` / `[]`, and -g (--globoff)
# stops curl from interpreting `[]` as a URL range.
curl -X POST -g 'http://localhost:9090/api/v1/admin/tsdb/delete_series?match[]=up'

# Clean tombstones (remove deleted data from disk)
curl -X POST http://localhost:9090/api/v1/admin/tsdb/clean_tombstones

小结

本章学习了:

  • ✅ 多种安装方式
  • ✅ 配置文件详解
  • ✅ 服务发现配置
  • ✅ 告警规则配置
  • ✅ Alertmanager 配置
  • ✅ 存储和安全配置

下一章

第三章：PromQL 查询语言 - 学习 Prometheus 查询语言。