ob_rules.yaml 5.2 KB
Newer Older
W
wangzelin.wzl 已提交
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40
# Prometheus rule file: one rule group, `ob-alert`, containing the alerting
# rules listed under `rules`.
groups:
  - name: ob-alert
    # Each entry is an alerting rule: a PromQL `expr`, a `for` hold time,
    # static `labels`, and templated `annotations`.
    rules:
    # Fires when ob_active_session_num stays above 80% of 262144 for one minute.
    # NOTE(review): 262144 is a hard-coded denominator — confirm it matches the
    # session/connection limit actually configured on the servers.
    - alert: ob_host_connection_percent_over_threshold
      expr: 100 * ob_active_session_num / 262144 > 80
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} connection used percent over threshold "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} connection_used_percent = {{ $value }}"

    # Fires when, per cluster, the highest `frozen_version` is more than 1 ahead
    # of the lowest `last_merged_version` for one minute.
    - alert: ob_cluster_frozen_version_delta_over_threshold
      # Folded scalar: the lines below join with single spaces into one PromQL
      # expression.
      expr: >-
        max(ob_zone_stat{name="frozen_version"}) by (ob_cluster_name, ob_cluster_id)
        - min(ob_zone_stat{name="last_merged_version"}) by (ob_cluster_name, ob_cluster_id)
        > 1
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} too much frozen memstore not merged "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} no_merge_memstore_count = {{ $value }}"

    # Fires when, per cluster, the maximum of ob_server_num{status="inactive"}
    # stays above 0 for one minute.
    - alert: ob_cluster_exists_inactive_server
      expr: max(ob_server_num{status="inactive"}) by (ob_cluster_name, ob_cluster_id) > 0
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} exists inactive observer "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} inactive observer count is {{ $value }}"

    # Fires when the per-cluster sum of ob_index_error_num stays above 0 for one
    # minute.
    - alert: ob_cluster_exists_index_fail_table
      expr: sum(ob_index_error_num) by (ob_cluster_name, ob_cluster_id) > 0
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} exists error index table"
        description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} error index table count is {{ $value }}"

    # Fires when, per cluster, the maximum of the `is_merge_timeout` zone stat
    # equals 1 for one minute.
    - alert: ob_cluster_merge_timeout
      expr: max(ob_zone_stat{name="is_merge_timeout"}) by (ob_cluster_name, ob_cluster_id) == 1
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge time out"
        description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge time out"

    # Fires when, per cluster, the maximum of the `is_merge_error` zone stat
    # equals 1 for one minute.
    - alert: ob_cluster_merge_error
      expr: max(ob_zone_stat{name="is_merge_error"}) by (ob_cluster_name, ob_cluster_id) == 1
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge error"
        description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge error"

    # Fires when ob_partition_num stays above 30000 for one minute.
    - alert: ob_host_partition_count_over_threshold
      expr: ob_partition_num > 30000
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} partition count over threshold "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} partition_count = {{ $value }}"

    # Fires when used disk space, computed as (total - free) / total, stays
    # above 85% for one minute.
    - alert: ob_server_sstable_percent_over_threshold
      expr: 100 * (ob_disk_total_bytes - ob_disk_free_bytes) / ob_disk_total_bytes > 85
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} sstable used percent over threshold "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} sstable used percent = {{ $value }}"

    # Fires when, per server and tenant, the ratio of ob_sysstat stat 140006 to
    # stat 140005 stays above 90% for one minute.
    # NOTE(review): presumably 140006 is CPU usage and 140005 is the CPU quota —
    # confirm against the ob_sysstat stat_id catalogue.
    - alert: tenant_cpu_percent_over_threshold
      # `sum ... by (...)` drops every label not listed, so ob_cluster_name and
      # obzone must be in the grouping for the annotations below to render them.
      # Both sides use the same label list so the division matches one-to-one.
      # Folded scalar: the lines join with single spaces into one expression.
      expr: >-
        100 * sum(ob_sysstat{stat_id="140006"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
        / sum(ob_sysstat{stat_id="140005"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
        > 90
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} cpu used percent over threshold "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} cpu used percent = {{ $value }}"

    # Fires when, per server and tenant, the ratio of ob_sysstat stat 130000 to
    # stat 130002 stays above 110% for one minute.
    # NOTE(review): presumably 130000 is active memstore used and 130002 is the
    # freeze trigger, which would explain a threshold above 100% — confirm
    # against the ob_sysstat stat_id catalogue.
    - alert: tenant_active_memstore_percent_over_threshold
      # `sum ... by (...)` drops every label not listed, so ob_cluster_name and
      # obzone must be in the grouping for the annotations below to render them.
      # Both sides use the same label list so the division matches one-to-one.
      # Folded scalar: the lines join with single spaces into one expression.
      expr: >-
        100 * sum(ob_sysstat{stat_id="130000"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
        / sum(ob_sysstat{stat_id="130002"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
        > 110
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant active memstore used percent over threshold "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant active memstore used percent = {{ $value }}"

    # Fires when, per server and tenant, the ratio of ob_sysstat stat 130001 to
    # stat 130004 stays above 85% for one minute.
    # NOTE(review): presumably 130001 is total memstore used and 130004 is the
    # memstore limit — confirm against the ob_sysstat stat_id catalogue.
    - alert: tenant_memstore_percent_over_threshold
      # `sum ... by (...)` drops every label not listed, so ob_cluster_name and
      # obzone must be in the grouping for the annotations below to render them.
      # Both sides use the same label list so the division matches one-to-one.
      # Folded scalar: the lines join with single spaces into one expression.
      expr: >-
        100 * sum(ob_sysstat{stat_id="130001"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
        / sum(ob_sysstat{stat_id="130004"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
        > 85
      for: 1m
      labels:
        severity: page
        # Legacy misspelling of `severity`, kept in case existing Alertmanager
        # routing matches on it; remove once routing uses `severity`.
        serverity: page
      annotations:
        summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant memstore used percent over threshold "
        description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant memstore used percent = {{ $value }}"