# ob_rules.yaml
# Alerting rule group "ob-alert": every rule below evaluates an ob_* metric
# expression and attaches labels/annotations to the resulting alert.
groups:
  - name: ob-alert
    rules:
      # Fires when ob_active_session_num stays above 80% of 262144 for one minute.
      # NOTE(review): 262144 is presumably the per-server connection limit — confirm.
      - alert: ob_host_connection_percent_over_threshold
        expr: 100 * ob_active_session_num / 262144 > 80
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} connection used percent over threshold "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} connection_used_percent = {{ $value }}"
      # Fires when, per cluster, the highest frozen_version exceeds the lowest
      # last_merged_version by more than 1 for one minute.
      - alert: ob_cluster_frozen_version_delta_over_threshold
        # Folded scalar: the lines below are joined with single spaces into one
        # PromQL expression.
        expr: >-
          max(ob_zone_stat{name="frozen_version"}) by (ob_cluster_name, ob_cluster_id)
          - min(ob_zone_stat{name="last_merged_version"}) by (ob_cluster_name, ob_cluster_id)
          > 1
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} too much frozen memstore not merged "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} no_merge_memstore_count = {{ $value }}"
      # Fires when, per cluster, ob_server_num with status="inactive" is above
      # zero for one minute.
      - alert: ob_cluster_exists_inactive_server
        expr: >-
          max(ob_server_num{status="inactive"}) by (ob_cluster_name, ob_cluster_id)
          > 0
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} exists inactive observer "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} inactive observer count is {{ $value }}"
      # Fires when the per-cluster sum of ob_index_error_num is above zero for
      # one minute.
      - alert: ob_cluster_exists_index_fail_table
        expr: >-
          sum(ob_index_error_num) by (ob_cluster_name, ob_cluster_id)
          > 0
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} exists error index table"
          description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} error index table count is {{ $value }}"
      # Fires when, per cluster, the maximum of ob_zone_stat{name="is_merge_timeout"}
      # equals 1 for one minute.
      - alert: ob_cluster_merge_timeout
        expr: >-
          max(ob_zone_stat{name="is_merge_timeout"}) by (ob_cluster_name, ob_cluster_id)
          == 1
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge time out"
          description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge time out"
      # Fires when, per cluster, the maximum of ob_zone_stat{name="is_merge_error"}
      # equals 1 for one minute.
      - alert: ob_cluster_merge_error
        expr: >-
          max(ob_zone_stat{name="is_merge_error"}) by (ob_cluster_name, ob_cluster_id)
          == 1
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge error"
          description: "{{ $labels.ob_cluster_name }} {{ $labels.ob_cluster_id }} merge error"
      # Fires when ob_partition_num on a series stays above 30000 for one minute.
      - alert: ob_host_partition_count_over_threshold
        expr: ob_partition_num > 30000
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} partition count over threshold "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} partition_count = {{ $value }}"
      # Fires when the used share of ob_disk_total_bytes (total minus
      # ob_disk_free_bytes) stays above 85% for one minute.
      - alert: ob_server_sstable_percent_over_threshold
        expr: >-
          100 * (ob_disk_total_bytes - ob_disk_free_bytes) / ob_disk_total_bytes
          > 85
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} sstable used percent over threshold "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} sstable used percent = {{ $value }}"
      # Fires when, per server and tenant, ob_sysstat stat_id 140006 stays above
      # 90% of stat_id 140005 for one minute.
      # NOTE(review): presumably 140006 is CPU used and 140005 is the CPU quota —
      # confirm against the OceanBase sysstat id list.
      - alert: tenant_cpu_percent_over_threshold
        # ob_cluster_name and obzone are part of the grouping because the
        # annotations render them; an aggregation drops every label that is not
        # listed in `by`, which left those template fields empty.
        expr: >-
          100
          * sum(ob_sysstat{stat_id="140006"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
          / sum(ob_sysstat{stat_id="140005"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
          > 90
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} cpu used percent over threshold "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} cpu used percent = {{ $value }}"
      # Fires when, per server and tenant, ob_sysstat stat_id 130000 stays above
      # 110% of stat_id 130002 for one minute.
      # NOTE(review): presumably 130000 is active memstore and 130002 is the
      # freeze trigger size, which would explain a threshold above 100 — confirm.
      - alert: tenant_active_memstore_percent_over_threshold
        # ob_cluster_name and obzone are part of the grouping because the
        # annotations render them; an aggregation drops every label that is not
        # listed in `by`, which left those template fields empty.
        expr: >-
          100
          * sum(ob_sysstat{stat_id="130000"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
          / sum(ob_sysstat{stat_id="130002"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
          > 110
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant active memstore used percent over threshold "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant active memstore used percent = {{ $value }}"
      # Fires when, per server and tenant, ob_sysstat stat_id 130001 stays above
      # 85% of stat_id 130004 for one minute.
      # NOTE(review): presumably 130001 is total memstore and 130004 is the
      # memstore limit — confirm against the OceanBase sysstat id list.
      - alert: tenant_memstore_percent_over_threshold
        # ob_cluster_name and obzone are part of the grouping because the
        # annotations render them; an aggregation drops every label that is not
        # listed in `by`, which left those template fields empty.
        expr: >-
          100
          * sum(ob_sysstat{stat_id="130001"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
          / sum(ob_sysstat{stat_id="130004"}) by (ob_cluster_name, obzone, svr_ip, tenant_name)
          > 85
        for: 1m
        labels:
          # Correctly spelled key; this is the conventional label for alert routing.
          severity: page
          # Legacy misspelled key, kept so existing matchers on it keep working.
          # TODO: remove once no Alertmanager route depends on it.
          serverity: page
        annotations:
          summary: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant memstore used percent over threshold "
          description: "{{ $labels.ob_cluster_name }} {{ $labels.obzone }} {{ $labels.svr_ip }} {{ $labels.tenant_name }} tenant memstore used percent = {{ $value }}"