池偏一 | 博客 DEDICATED OPERATION AND MAINTENANCE DEVELOPMENT.
登录
X
X
  • prometheus compose
  • version: '3.3'
    services:
      prometheus:
        image: prom/prometheus
        environment:
          - config.file=/etc/prometheus/prometheus.yml
          - storage.tsdb.path=/prometheus
          - alertmanager.url=http://alertmanager:9093
        configs:
          - source: prometheus.yml
            target: /etc/prometheus/prometheus.yml
          - source: alert_rules.yml
            target: /etc/prometheus/first.rules
        networks:
          - traefik-net
        ports:
          - "9090:9090"
        volumes:
          - prometheus_data:/prometheus
        deploy:
          mode: replicated
          replicas: 1
          placement:
            constraints:
              - node.hostname == ELK
          restart_policy:
            condition: on-failure
      alertmanager:
        image: prom/alertmanager
        configs:
          - source: alertmanager.yml
            target: /etc/alertmanager/alertmanager.yml
          - source: alert.tmpl
            target: /usr/local/alert.tmpl
        volumes:
          - alertmanager_config:/usr/local
        networks:
          - traefik-net
        ports:
          - "9093:9093"
        deploy:
          mode: replicated
          replicas: 1
          placement:
            constraints:
              - node.hostname == ELK
          restart_policy:
            condition: on-failure
      dingtalk:
        image: registry.cn-hangzhou.aliyuncs.com/namibox/dingtalk:latest
        networks:
          - traefik-net
        deploy:
          mode: replicated
          replicas: 1
          placement:
            constraints:
              - node.hostname == ELK
          restart_policy:
            condition: on-failure
     
      grafana:
        image: registry.cn-hangzhou.aliyuncs.com/namibox/grafana:latest
        configs:
          - source: grafana.ini
            target: /etc/grafana/grafana.ini
        volumes:
          - grafana_data:/var/lib/grafana
        networks:
          - traefik-net
        deploy:
          labels:
            - "traefik.frontend.rule=Host:monitor.namibox.com"
            - "traefik.enable=true"
            - "traefik.port=3000"
          mode: replicated
          replicas: 1
          placement:
            constraints:
              - node.hostname == ELK
          restart_policy:
            condition: on-failure
      pushgateway:
        image: prom/pushgateway
        networks:
          - traefik-net
        ports:
          - "9091:9091"
        deploy:
          mode: replicated
          placement:
            constraints:
              - node.hostname == ELK
          restart_policy:
            condition: on-failure
    # Everything below is declared `external: true`: the stack references
    # these objects and does not define them itself.
    networks:
      traefik-net:
        external: true
    # Swarm configs mounted into the services above as configuration files.
    configs:
      prometheus.yml:
        external: true
      alert_rules.yml:
        external: true
      alertmanager.yml:
        external: true
      grafana.ini:
        external: true
      alert.tmpl:
        external: true
    volumes:
      prometheus_data:
        external: true
      grafana_data:
        external: true
      alertmanager_config:
        external: true
      # NOTE(review): alertmanager_data and alertmanager_bin are not mounted
      # by any service shown in this file - confirm whether they are still
      # needed.
      alertmanager_data:
        external: true
      alertmanager_bin:
        external: true


  • posted @ 2019-07-11 by 池偏一 阅读(100) 评论(0)
  • prometheus.yml 监控配置文件
  • global:
      # Default scrape and rule-evaluation intervals for this server.
      scrape_interval: 15s
      evaluation_interval: 15s
      # Static label set under external_labels.
      external_labels:
        monitor: 'namibox-monitor'
    # Alerts are delivered to the alertmanager service on port 9093.
    alerting:
      alertmanagers:
        - static_configs:
            - targets:
                - "alertmanager:9093"
    # Alert rule files to load.
    rule_files:
      - "/etc/prometheus/first.rules"
    scrape_configs:
      # Scrapes the pushgateway service.
      - job_name: "custom_script"
        static_configs:
          - targets:
              - "pushgateway:9091"

      # Disabled jobs, kept for reference:
      # - job_name: 'W4-docker'
      #   static_configs:
      #     - targets: ['10.81.174.250:9092']
      # - job_name: 'ELK-docker'
      #   static_configs:
      #     - targets: ['10.81.81.184:9092']

      # The Prometheus server scraping itself.
      - job_name: "prometheus-server"
        static_configs:
          - targets:
              - "prometheus:9090"
            labels:
              instance: "prometheus-server"

      # Server host monitoring (port 9100 on each host); the instance label
      # is overridden with a readable host name.
      - job_name: "W-4"
        static_configs:
          - targets:
              - "10.81.174.250:9100"
            labels:
              instance: "W-4"

      - job_name: "ELK"
        static_configs:
          - targets:
              - "10.81.81.184:9100"
            labels:
              instance: "ELK"

      - job_name: "W-2"
        static_configs:
          - targets:
              - "10.81.182.151:9100"
            labels:
              instance: "W-2"


  • posted @ 2019-07-11 by 池偏一 阅读(159) 评论(0)
  • alert_rules.yml 报警规则配置文件
  • groups:
    # Availability group: one rule on the `up` series.
    - name: node-exporter-monitor
      rules:
      # Fires once `up == 0` has held for 10 seconds; the description prints
      # the instance label.
      - alert: "节点宕机"
        expr: up == 0
        for: 10s
        labels:
          level: warning
        annotations:
          AlertType: "prometheus agent 节点丢失"
          Description: "{{$labels.instance}}: 有以下错误原因:node_exporter 进程未启动,服务器关机或网络通信异常"
    # Host resource alerts. Expressions must be plain PromQL: Grafana
    # dashboard variables such as $node are not expanded in rule files, and
    # because label regexes are fully anchored a matcher like
    # instance=~"$node" can never match any series.
    - name: host_monitor
      rules:
      # Disk: less than 20% free space on ext4/xfs filesystems.
      - alert: "磁盘报警"
        expr: node_filesystem_free{fstype=~"ext4|xfs"} / node_filesystem_size{fstype=~"ext4|xfs"} < 0.2
        for: 10s
        labels:
          level: warning
        annotations:
          AlertType: "磁盘报警"
          Description: "{{$labels.instance}} 磁盘空闲率不足20%"
      # CPU: average non-idle share above 80%. Aggregated `by (instance)` so
      # each host is evaluated separately and the instance label is still
      # available to the description template.
      - alert: "CPU 报警"
        expr: 100 - (avg by (instance) (irate(node_cpu{mode="idle"}[5m])) * 100) > 80
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "CPU报警"
          Description: "{{$labels.instance}} CPU使用率超过80%"
      # Memory: used share (total minus free) above 80%.
      # NOTE(review): MemFree does not include buffers/page cache, so this
      # may overstate real usage - consider subtracting them; confirm.
      - alert: "内存报警"
        expr: (node_memory_MemTotal - node_memory_MemFree) / node_memory_MemTotal * 100 > 80
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "内存报警"
          Description: "{{$labels.instance}} 内存使用率超过80%"
      # File descriptors: allocated count above 70% of a fixed 65535 limit.
      - alert: "Ulimit 报警"
        expr: node_filefd_allocated / 65535 > 0.7
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "Ulimit 报警"
          Description: "{{$labels.instance}} 打开文件描述符超过70%"
      # Inodes: less than 30% free.
      # NOTE(review): inode_free / inode_total and the `host` label are not
      # node_exporter names; presumably they are pushed through the
      # pushgateway job. If not, node_filesystem_files_free /
      # node_filesystem_files with the instance label is the node_exporter
      # equivalent - confirm.
      - alert: "Inode 报警"
        expr: inode_free / inode_total < 0.3
        for: 5m
        labels:
          level: warning
        annotations:
          AlertType: "Inode 报警"
          Description: "{{$labels.host}} Inode 使用率超过70%"


  • posted @ 2019-07-11 by 池偏一 阅读(107) 评论(0)
  • 系统基础监控agent node-exporter
  • 下载 node_exporter 二进制发行包并解压到:/data/prometheus/exporter/node_exporter-0.14.0.linux-amd64

    一、 centos6 : /etc/init.d/nodeexporter #已添加到系统自启动项

    #!/bin/bash
    # Init script for the Prometheus node_exporter (start / stop / restart).
    # chkconfig: 345 80 20
    # by jiangweihua @ 20190626
    
    # Directory containing the unpacked node_exporter release.
    HOME_PATH=/data/prometheus/exporter/node_exporter-0.14.0.linux-amd64
    # Name of the executable inside HOME_PATH.
    BIN_FILE=node_exporter
    
    # Return 0 when the process list contains a "node_exporter" entry
    # (ignoring the grep itself and any line containing "service"),
    # otherwise return 1.
    function checkstart() {
        if ps -ef | grep node_exporter | grep -v "grep" | grep -v "service" >>/dev/null; then
            return 0
        fi
        return 1
    }
    
    # Start node_exporter in the background unless it is already running,
    # then verify after one second that the process exists.
    # Returns 0 when the process is running, 1 when it could not be started.
    function start(){
        echo "Starting node_exporter ..."
        if ! checkstart; then
            # Quoted so a path containing spaces cannot be word-split.
            nohup "${HOME_PATH}/${BIN_FILE}" >/dev/null 2>&1 &
        fi
        # Give the process a moment to come up (or die) before checking.
        sleep 1
        if checkstart; then
            echo "node_exporter is running......"
        else
            echo "node_exporter start failed!"
            return 1
        fi
    }
    
    # Stop every running node_exporter process.
    # SIGTERM is sent first so the process can shut down cleanly; SIGKILL is
    # used only if something is still running one second later.
    # Returns 0 when no process remains, 1 otherwise.
    function stop(){
        local sig pid
        echo "Stopping node_exporter ..."
        for sig in TERM KILL; do
            # Nothing (left) to stop: skip the remaining signals.
            checkstart || break
            for pid in $(ps -ef | grep node_exporter | grep -v "grep" | grep -v "service" | awk '{print $2}'); do
                kill -s "$sig" "$pid" >/dev/null 2>&1
            done
            sleep 1
        done
        if checkstart; then
            echo "node_exporter stop failed!"
            return 1
        fi
        echo "node_exporter is stopped......"
    }
    
    # Dispatch on the requested action. An unknown or missing action prints
    # a usage line and exits with status 2 instead of silently succeeding.
    case "$1" in
        start)
            start
            ;;
        stop)
            stop
            ;;
        restart)
            stop
            start
            ;;
        *)
            echo "Usage: $0 {start|stop|restart}"
            exit 2
            ;;
    esac

    # Propagate the status of the last action instead of always reporting 0.
    exit $?

    二、centos7: /etc/systemd/system/node_exporter.service

    # systemd unit that runs node_exporter as a simple foreground service
    # and restarts it when it exits with a failure.
    [Unit]
    Description=node_exporter
    # Points at the project page; the previous value was the download URL of
    # the v0.14.0 release tarball, which is not documentation.
    Documentation=https://github.com/prometheus/node_exporter
    After=network.target
    
    [Service]
    Type=simple
    # NOTE(review): runs as root - confirm whether a dedicated unprivileged
    # account could be used instead.
    User=root
    ExecStart=/data/prometheus/exporter/node_exporter-0.14.0.linux-amd64/node_exporter
    Restart=on-failure
    
    [Install]
    WantedBy=multi-user.target


  • posted @ 2019-07-11 by 池偏一 阅读(173) 评论(0)
© 2017 池偏一 | 赣ICP备 17014207号