diff --git a/docker/alertmanager.yml b/docker/alertmanager.yml
new file mode 100644
index 0000000000000000000000000000000000000000..e1f079dce03d61a394758c4acf6410432cd9e291
--- /dev/null
+++ b/docker/alertmanager.yml
@@ -0,0 +1,25 @@
+# Alertmanager configuration: groups alerts by name and forwards them to a
+# single webhook receiver.
+global:
+  resolve_timeout: 5m
+
+route:
+  group_by: ['alertname']
+  group_wait: 10s
+  group_interval: 10s
+  repeat_interval: 1h
+  receiver: 'web.hook'
+receivers:
+- name: 'web.hook'
+  webhook_configs:
+  # NOTE(review): 127.0.0.1 is the alertmanager container's own loopback and
+  # no service in docker-compose-monitor.yml listens on 5001 -- confirm.
+  - url: 'http://127.0.0.1:5001/'
+inhibit_rules:
+  # Mute 'warning' alerts while a 'critical' alert with the same alertname,
+  # dev and instance labels is firing.
+  - source_match:
+      severity: 'critical'
+    target_match:
+      severity: 'warning'
+    equal: ['alertname', 'dev', 'instance']
diff --git a/docker/docker-compose-monitor.yml b/docker/docker-compose-monitor.yml
new file mode 100644
index 0000000000000000000000000000000000000000..9cb1fb1902fee595c990bd8195c966b97bebc011
--- /dev/null
+++ b/docker/docker-compose-monitor.yml
@@ -0,0 +1,62 @@
+# Monitoring stack for Milvus: Prometheus, Alertmanager, Grafana and the
+# Milvus server share the 'monitor' bridge network, so containers reach each
+# other by service name.
+version: '2.3'
+
+networks:
+  monitor:
+    driver: bridge
+
+services:
+  prometheus:
+    image: prom/prometheus:v2.11.1
+    container_name: prometheus
+    hostname: prometheus
+    restart: always
+    volumes:
+      - ./prometheus.yml:/etc/prometheus/prometheus.yml
+      # Mounted under the name that rule_files in prometheus.yml refers to.
+      - ./server_down.yml:/etc/prometheus/server_down.yml
+    ports:
+      - "9090:9090"
+    networks:
+      - monitor
+
+  alertmanager:
+    image: prom/alertmanager
+    container_name: alertmanager
+    hostname: alertmanager
+    restart: always
+    volumes:
+      - ./alertmanager.yml:/etc/alertmanager/alertmanager.yml
+    ports:
+      - "9093:9093"
+    networks:
+      - monitor
+
+  grafana:
+    image: grafana/grafana
+    container_name: grafana
+    hostname: grafana
+    restart: always
+    ports:
+      - "3000:3000"
+    networks:
+      - monitor
+
+  milvus:
+    # Requires a Docker runtime named 'nvidia' to be registered on the host.
+    runtime: nvidia
+    image: registry.zilliz.com/milvus/engine:branch-0.4.0-release
+    container_name: milvus
+    hostname: milvus
+    restart: always
+    volumes:
+      - ../cpp/conf/server_config.yaml:/opt/milvus/cpp/conf/server_config.yaml
+      - ../cpp/conf/log_config.conf:/opt/milvus/cpp/conf/log_config.conf
+    ports:
+      # 8080 is the port the milvus_server jobs in prometheus.yml scrape.
+      - "8080:8080"
+      - "19530:19530"
+    networks:
+      - monitor
diff --git a/docker/prometheus.yml b/docker/prometheus.yml
new file mode 100644
index 0000000000000000000000000000000000000000..7a2a07b390b03e07026f409552fe61558ff88d7c
--- /dev/null
+++ b/docker/prometheus.yml
@@ -0,0 +1,51 @@
+# my global config
+global:
+  scrape_interval: 15s # Set the scrape interval to every 15 seconds. Default is every 1 minute.
+  evaluation_interval: 15s # Evaluate rules every 15 seconds. The default is every 1 minute.
+  # scrape_timeout is set to the global default (10s).
+
+# Alertmanager configuration
+# Targets use compose service names: inside the prometheus container,
+# 'localhost' is the container itself, not the other services.
+alerting:
+  alertmanagers:
+  - static_configs:
+    - targets: ['alertmanager:9093']
+
+# Load rules once and periodically evaluate them according to the global 'evaluation_interval'.
+# Relative paths resolve against this file's directory (/etc/prometheus in the container).
+rule_files:
+  - "server_down.yml" # add alerting rules
+
+# Scrape configurations: Prometheus itself, the Milvus server, and a
+# pushgateway placeholder.
+scrape_configs:
+  # The job name is added as a label `job=<job_name>` to any timeseries scraped from this config.
+  - job_name: 'prometheus'
+
+    # metrics_path defaults to '/metrics'
+    # scheme defaults to 'http'.
+
+    static_configs:
+    - targets: ['localhost:9090']
+
+  # scrape metrics of server
+  - job_name: 'milvus_server'
+    scrape_interval: 1s
+    static_configs:
+      - targets: ['milvus:8080']
+
+  # scrape metrics of server
+  # NOTE(review): same target as 'milvus_server'; presumably a placeholder for a second server -- confirm.
+  - job_name: 'milvus_server_1'
+    scrape_interval: 1s
+    static_configs:
+      - targets: ['milvus:8080']
+
+
+
+  # under development
+  # NOTE(review): docker-compose-monitor.yml defines no pushgateway service, so this target will report down.
+  - job_name: 'pushgateway'
+    static_configs:
+    - targets: ['localhost:9091']
diff --git a/docker/server_down.yml b/docker/server_down.yml
new file mode 100644
index 0000000000000000000000000000000000000000..8a7077c7f8f6385adb6d2078816aa9b1fffc253c
--- /dev/null
+++ b/docker/server_down.yml
@@ -0,0 +1,11 @@
+# Alerting rules for the Milvus server.
+groups:
+- name: milvus
+  rules:
+  - alert: MilvusServerDown
+    # 'up' is 0 when the last scrape of the target failed; without the
+    # comparison the expression matches healthy targets too.
+    expr: up{job="milvus_server"} == 0
+    for: 1s
+    labels:
+      severity: page