# Docker Swarm stack for the monitoring services: Prometheus, Alertmanager,
# dingtalk, Grafana and Pushgateway. Every service joins the external
# `traefik-net` overlay network and is pinned to the node whose hostname is ELK.
version: '3.3'

services:
  prometheus:
    image: prom/prometheus
    # Prometheus takes its settings from command-line flags; it does not read
    # them from environment variables, so they are passed via `command`.
    # Setting `command` replaces the image's default arguments, so append any
    # other flags that are needed (e.g. the --web.console.* flags) here.
    # The former `alertmanager.url` entry is intentionally omitted: that flag
    # no longer exists in Prometheus 2.x, where Alertmanager targets are
    # declared in the `alerting` section of prometheus.yml.
    command:
      - '--config.file=/etc/prometheus/prometheus.yml'
      - '--storage.tsdb.path=/prometheus'
    configs:
      - source: prometheus.yml
        target: /etc/prometheus/prometheus.yml
      # The rule file is mounted under the name that prometheus.yml must list
      # in `rule_files`.
      - source: alert_rules.yml
        target: /etc/prometheus/first.rules
    networks:
      - traefik-net
    ports:
      - "9090:9090"
    volumes:
      - prometheus_data:/prometheus
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ELK
      restart_policy:
        condition: on-failure

  alertmanager:
    image: prom/alertmanager
    configs:
      - source: alertmanager.yml
        target: /etc/alertmanager/alertmanager.yml
      - source: alert.tmpl
        target: /usr/local/alert.tmpl
    # NOTE(review): this volume is mounted over /usr/local, the same directory
    # the alert.tmpl config targets - confirm the overlay is intended.
    volumes:
      - alertmanager_config:/usr/local
    networks:
      - traefik-net
    ports:
      - "9093:9093"
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ELK
      restart_policy:
        condition: on-failure

  # No ports are published for this service; it is reachable only from other
  # services on traefik-net.
  dingtalk:
    image: registry.cn-hangzhou.aliyuncs.com/namibox/dingtalk:latest
    networks:
      - traefik-net
    deploy:
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ELK
      restart_policy:
        condition: on-failure

  grafana:
    image: registry.cn-hangzhou.aliyuncs.com/namibox/grafana:latest
    configs:
      - source: grafana.ini
        target: /etc/grafana/grafana.ini
    volumes:
      - grafana_data:/var/lib/grafana
    networks:
      - traefik-net
    deploy:
      # Grafana publishes no port itself; these labels expose port 3000
      # through Traefik under the monitor.namibox.com host rule.
      labels:
        - "traefik.frontend.rule=Host:monitor.namibox.com"
        - "traefik.enable=true"
        - "traefik.port=3000"
      mode: replicated
      replicas: 1
      placement:
        constraints:
          - node.hostname == ELK
      restart_policy:
        condition: on-failure

  pushgateway:
    image: prom/pushgateway
    networks:
      - traefik-net
    ports:
      - "9091:9091"
    deploy:
      mode: replicated
      # Stated explicitly for consistency with the other services.
      replicas: 1
      placement:
        constraints:
          - node.hostname == ELK
      restart_policy:
        condition: on-failure

networks:
  traefik-net:
    external: true

# All configs are external: they must already exist in the swarm before the
# stack is deployed.
configs:
  prometheus.yml:
    external: true
  alert_rules.yml:
    external: true
  alertmanager.yml:
    external: true
  grafana.ini:
    external: true
  alert.tmpl:
    external: true

# All volumes are external as well. alertmanager_data and alertmanager_bin are
# declared here but not referenced by any service in this file.
volumes:
  prometheus_data:
    external: true
  grafana_data:
    external: true
  alertmanager_config:
    external: true
  alertmanager_data:
    external: true
  alertmanager_bin:
    external: true
# Prometheus server configuration: global timings, the Alertmanager endpoint,
# the alert rule file and the static scrape targets.
global:
  scrape_interval: 15s
  evaluation_interval: 15s
  # Labels attached to data sent to external systems such as Alertmanager.
  external_labels:
    monitor: 'namibox-monitor'

alerting:
  alertmanagers:
    - static_configs:
        - targets:
            - 'alertmanager:9093'

rule_files:
  - '/etc/prometheus/first.rules'

scrape_configs:
  # Metrics pushed to the Pushgateway service.
  - job_name: 'custom_script'
    static_configs:
      - targets:
          - 'pushgateway:9091'

  # Disabled scrape jobs on port 9092 of two hosts:
  # - job_name: 'W4-docker'
  #   static_configs:
  #     - targets: ['10.81.174.250:9092']
  # - job_name: 'ELK-docker'
  #   static_configs:
  #     - targets: ['10.81.81.184:9092']

  # Prometheus scraping itself.
  - job_name: 'prometheus-server'
    static_configs:
      - targets:
          - 'prometheus:9090'
        labels:
          instance: prometheus-server

  # Server host monitoring (port 9100 on each host); the `instance` label is
  # overridden with a readable host name.
  - job_name: 'W-4'
    static_configs:
      - targets:
          - '10.81.174.250:9100'
        labels:
          instance: W-4

  - job_name: 'ELK'
    static_configs:
      - targets:
          - '10.81.81.184:9100'
        labels:
          instance: ELK

  - job_name: 'W-2'
    static_configs:
      - targets:
          - '10.81.182.151:9100'
        labels:
          instance: W-2
# Prometheus alerting rules. Alert names and annotation texts are runtime
# strings delivered in notifications and are kept exactly as written.
groups:
  # Fires when any scrape target stops answering.
  - name: node-exporter-monitor
    rules:
      - alert: "节点宕机"
        expr: up == 0
        for: 10s
        labels:
          level: warning
        annotations:
          AlertType: "prometheus agent 节点丢失"
          Description: "{{$labels.instance}}: 有以下错误原因:node_exporter 进程未启动,服务器关机或网络通信异常"

  # Host resource alerts.
  - name: host_monitor
    rules:
      # Less than 20% free space on any ext4/xfs filesystem.
      - alert: "磁盘报警"
        expr: >-
          node_filesystem_free{fstype=~"ext4|xfs"}
          / node_filesystem_size{fstype=~"ext4|xfs"} < 0.2
        for: 10s
        labels:
          level: warning
        annotations:
          AlertType: "磁盘报警"
          Description: "{{$labels.instance}} 磁盘空闲率不足20%"

      # CPU usage above 80%, evaluated per instance.
      # The former instance=~"$node" matcher was a Grafana dashboard variable;
      # in a rule file it is a literal regex that can never match, so the
      # alert never fired. `by (instance)` keeps the label that the
      # Description template reads.
      - alert: "CPU 报警"
        expr: >-
          100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100)
          > 80
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "CPU报警"
          Description: "{{$labels.instance}} CPU使用率超过80%"

      # Memory usage above 80%. The previous expression evaluated to the
      # *free* percentage, so it fired on idle hosts instead of busy ones.
      # Buffers and page cache are reclaimable and are not counted as used.
      - alert: "内存报警"
        expr: >-
          (node_memory_MemTotal - node_memory_MemFree
          - node_memory_Buffers - node_memory_Cached)
          / node_memory_MemTotal * 100 > 80
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "内存报警"
          Description: "{{$labels.instance}} 内存使用率超过80%"

      # Allocated file descriptors above 70% of a fixed limit of 65535.
      - alert: "Ulimit 报警"
        expr: node_filefd_allocated / 65535 > 0.7
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "Ulimit 报警"
          Description: "{{$labels.instance}} 打开文件描述符超过70%"

      # Fewer than 30% of inodes free. These series carry a `host` label,
      # which the Description template uses.
      - alert: "Inode 报警"
        expr: inode_free / inode_total < 0.3
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "Inode 报警"
          Description: "{{$labels.host}} Inode 使用率超过70%"
下载 node_exporter 二进制发布包并解压到:/data/prometheus/exporter/node_exporter-0.14.0.linux-amd64
一、CentOS 6:/etc/init.d/nodeexporter(已添加到系统自启动项)
#!/bin/bash
# Start script for Prometheus node_exporter
# chkconfig: 345 80 20
# description: Prometheus node_exporter host metrics agent
# by jiangweihua @ 20190626
#
# Usage: nodeexporter {start|stop|restart}
# Exit status: 0 on success, 1 when the action failed, 2 on a bad argument.

HOME_PATH=/data/prometheus/exporter/node_exporter-0.14.0.linux-amd64
BIN_FILE=node_exporter

# Print the PID of every running node_exporter process, one per line.
# Lines containing "grep" or "service" are filtered out of the ps listing.
function find_pids() {
    ps -ef | grep node_exporter | grep -v "grep" | grep -v "service" | awk '{print $2}'
}

# Return 0 when at least one node_exporter process is running, 1 otherwise.
function checkstart() {
    [ -n "$(find_pids)" ]
}

# Launch node_exporter in the background unless it is already running.
# Returns 0 when the process is up one second later, 1 otherwise.
function start() {
    echo "node_exporter is running now"
    if ! checkstart; then
        nohup "${HOME_PATH}/${BIN_FILE}" >/dev/null 2>&1 &
    fi
    # Give the process a moment to come up (or die) before verifying.
    sleep 1
    if checkstart; then
        echo "node_expoter is running......"
        return 0
    fi
    echo "node_expoter start faild!"
    return 1
}

# Stop every running node_exporter process.
# Returns 0 when none is left (including when none was running), 1 otherwise.
function stop() {
    local pid
    echo "node_exporter is stop now"
    # Ask politely first so the process can shut down cleanly.
    for pid in $(find_pids); do
        kill "$pid" >/dev/null 2>&1
    done
    sleep 1
    # Anything that ignored SIGTERM is force-killed.
    if checkstart; then
        for pid in $(find_pids); do
            kill -9 "$pid" >/dev/null 2>&1
        done
        sleep 1
    fi
    if checkstart; then
        echo "node_exporter stop fiald!"
        return 1
    fi
    echo "node_exporter is stop......"
    return 0
}

case "$1" in
    start)
        start
        ;;
    stop)
        stop
        ;;
    restart)
        # A failed stop does not prevent the start attempt; the exit status
        # reported is that of start.
        stop
        start
        ;;
    *)
        echo "Usage: $0 {start|stop|restart}" >&2
        exit 2
        ;;
esac
# Propagate the result of the selected action to the caller.
exit $?
二、CentOS 7:/etc/systemd/system/node_exporter.service
# systemd unit that runs the node_exporter binary as a long-lived service.

[Unit]
Description=node_exporter
# Start only after the network target has been reached.
After=network.target
Documentation=https://github.com/prometheus/node_exporter/releases/download/v0.14.0/node_exporter-0.14.0.linux-amd64.tar.gz

[Service]
User=root
# The binary stays in the foreground, so it is the main service process.
Type=simple
ExecStart=/data/prometheus/exporter/node_exporter-0.14.0.linux-amd64/node_exporter
# Restart automatically if the process exits with a failure.
Restart=on-failure

[Install]
# Enabling the unit attaches it to normal multi-user boot.
WantedBy=multi-user.target