Merge branch 'jprovazn-puma-metrics' into 'master'

Add Puma sampler Closes #52769 See merge request gitlab-org/gitlab-ce!28324

Merge branch 'jprovazn-puma-metrics' into 'master'
Add Puma sampler Closes #52769 See merge request gitlab-org/gitlab-ce!28324
65d65fed · Kamil Trzciński · d017d2d9 · a5adc6a0 · 65d65fed · 65d65fed
7 changed files
--- a/config/gitlab.yml.example
+++ b/config/gitlab.yml.example
@@ -752,6 +752,8 @@ production: &base
  monitoring:
    # Time between sampling of unicorn socket metrics, in seconds
    # unicorn_sampler_interval: 10
+    # Time between sampling of Puma metrics, in seconds
+    # puma_sampler_interval: 5
    # IP whitelist to access monitoring endpoints
    ip_whitelist:
      - 127.0.0.0/8

--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -491,6 +491,7 @@ Settings.webpack.dev_server['port']    ||= 3808
 Settings['monitoring'] ||= Settingslogic.new({})
 Settings.monitoring['ip_whitelist'] ||= ['127.0.0.1/8']
 Settings.monitoring['unicorn_sampler_interval'] ||= 10
+Settings.monitoring['puma_sampler_interval'] ||= 5
 Settings.monitoring['ruby_sampler_interval'] ||= 60
 Settings.monitoring['sidekiq_exporter'] ||= Settingslogic.new({})
 Settings.monitoring.sidekiq_exporter['enabled'] ||= false

--- a/config/initializers/7_prometheus_metrics.rb
+++ b/config/initializers/7_prometheus_metrics.rb
@@ -29,12 +29,18 @@ if !Rails.env.test? && Gitlab::Metrics.prometheus_metrics_enabled?
  Gitlab::Cluster::LifecycleEvents.on_worker_start do
    defined?(::Prometheus::Client.reinitialize_on_pid_change) && Prometheus::Client.reinitialize_on_pid_change

-    unless Sidekiq.server?
+    if defined?(::Unicorn)
      Gitlab::Metrics::Samplers::UnicornSampler.initialize_instance(Settings.monitoring.unicorn_sampler_interval).start
    end

    Gitlab::Metrics::Samplers::RubySampler.initialize_instance(Settings.monitoring.ruby_sampler_interval).start
  end
+
+  if defined?(::Puma)
+    Gitlab::Cluster::LifecycleEvents.on_master_start do
+      Gitlab::Metrics::Samplers::PumaSampler.initialize_instance(Settings.monitoring.puma_sampler_interval).start
+    end
+  end
 end

 Gitlab::Cluster::LifecycleEvents.on_master_restart do

--- a/doc/administration/monitoring/prometheus/gitlab_metrics.md
+++ b/doc/administration/monitoring/prometheus/gitlab_metrics.md
@@ -103,6 +103,24 @@ Some basic Ruby runtime metrics are available:

 [GC.stat]: https://ruby-doc.org/core-2.3.0/GC.html#method-c-stat

+## Puma Metrics **[EXPERIMENTAL]**
+
+When Puma is used instead of Unicorn, the following metrics are available:
+
+| Metric                                       | Type    | Since | Description |
+|:-------------------------------------------- |:------- |:----- |:----------- |
+| puma_workers                                 | Gauge   | 12.0 | Total number of workers |
+| puma_running_workers                         | Gauge   | 12.0 | Number of booted workers |
+| puma_stale_workers                           | Gauge   | 12.0 | Number of old workers |
+| puma_phase                                   | Gauge   | 12.0 | Phase number (increased during phased restarts) |
+| puma_running                                 | Gauge   | 12.0 | Number of running threads |
+| puma_queued_connections                      | Gauge   | 12.0 | Number of connections in that worker's "todo" set waiting for a worker thread |
+| puma_active_connections                      | Gauge   | 12.0 | Number of threads processing a request |
+| puma_pool_capacity                           | Gauge   | 12.0 | Number of requests the worker is capable of taking right now |
+| puma_max_threads                             | Gauge   | 12.0 | Maximum number of worker threads |
+| puma_idle_threads                            | Gauge   | 12.0 | Number of spawned threads which are not processing a request |
+
+
 ## Metrics shared directory

 GitLab's Prometheus client requires a directory to store metrics data shared between multi-process services.

--- a/lib/gitlab/cluster/lifecycle_events.rb
+++ b/lib/gitlab/cluster/lifecycle_events.rb
@@ -44,6 +44,14 @@ module Gitlab
          (@master_restart_hooks ||= []) << block
        end

+        # Registers the block to run before forking in a clustered environment,
+        # and at worker start otherwise.
+        def on_master_start(&block)
+          return on_before_fork(&block) if in_clustered_environment?
+
+          on_worker_start(&block)
+        end
+
        #
        # Lifecycle integration methods (called from unicorn.rb, puma.rb, etc.)
        #

--- a/lib/gitlab/metrics/samplers/puma_sampler.rb
+++ b/lib/gitlab/metrics/samplers/puma_sampler.rb
+# frozen_string_literal: true
+
+require 'puma/state_file'
+
+module Gitlab
+  module Metrics
+    module Samplers
+      # Publishes Puma's runtime statistics as gauges.
+      #
+      # The JSON returned by `Puma.stats` is parsed on every sample. A payload
+      # that carries a `worker_status` list is handled as cluster mode
+      # (master-level gauges plus one set of gauges per worker); any other
+      # payload is handled as a single-process server.
+      class PumaSampler < BaseSampler
+        # Returns the memoized map of metric name => gauge.
+        def metrics
+          @metrics ||= init_metrics
+        end
+
+        # Builds the map of every gauge this sampler reports, keyed by metric name.
+        def init_metrics
+          {
+            puma_workers:            ::Gitlab::Metrics.gauge(:puma_workers, 'Total number of workers'),
+            puma_running_workers:    ::Gitlab::Metrics.gauge(:puma_running_workers, 'Number of active workers'),
+            puma_stale_workers:      ::Gitlab::Metrics.gauge(:puma_stale_workers, 'Number of stale workers'),
+            puma_phase:              ::Gitlab::Metrics.gauge(:puma_phase, 'Phase number (increased during phased restarts)'),
+            puma_running:            ::Gitlab::Metrics.gauge(:puma_running, 'Number of running threads'),
+            puma_queued_connections: ::Gitlab::Metrics.gauge(:puma_queued_connections, 'Number of connections in that worker\'s "todo" set waiting for a worker thread'),
+            puma_active_connections: ::Gitlab::Metrics.gauge(:puma_active_connections, 'Number of threads processing a request'),
+            puma_pool_capacity:      ::Gitlab::Metrics.gauge(:puma_pool_capacity, 'Number of requests the worker is capable of taking right now'),
+            puma_max_threads:        ::Gitlab::Metrics.gauge(:puma_max_threads, 'Maximum number of worker threads'),
+            puma_idle_threads:       ::Gitlab::Metrics.gauge(:puma_idle_threads, 'Number of spawned threads which are not processing a request')
+          }
+        end
+
+        # Takes one sample: reads the stats JSON and updates the gauges.
+        # Does nothing when no stats are available.
+        def sample
+          json_stats = puma_stats
+          return unless json_stats
+
+          stats = JSON.parse(json_stats)
+
+          if cluster?(stats)
+            sample_cluster(stats)
+          else
+            sample_single_worker(stats)
+          end
+        end
+
+        private
+
+        # Returns the raw stats JSON, or nil when `Puma.stats` raises
+        # NoMethodError, which is taken to mean Puma has not finished booting.
+        def puma_stats
+          Puma.stats
+        rescue NoMethodError
+          Rails.logger.info "PumaSampler: stats are not available yet, waiting for Puma to boot"
+          nil
+        end
+
+        # Cluster mode: master gauges first, then one labelled set per worker.
+        def sample_cluster(stats)
+          set_master_metrics(stats)
+
+          stats['worker_status'].each do |worker|
+            labels = { worker: "worker_#{worker['index']}" }
+            last_status = worker['last_status']
+
+            metrics[:puma_phase].set(labels, worker['phase'])
+
+            # A worker that has not reported yet has an empty status; passing
+            # it on would do arithmetic on nil and abort the whole sample.
+            set_worker_metrics(last_status, labels) if last_status.present?
+          end
+        end
+
+        # Single mode: the lone process counts as one running worker and its
+        # thread-pool gauges are reported without labels.
+        def sample_single_worker(stats)
+          metrics[:puma_workers].set({}, 1)
+          metrics[:puma_running_workers].set({}, 1)
+
+          set_worker_metrics(stats)
+        end
+
+        # A payload is a cluster payload when it has a `worker_status` entry.
+        # The key is checked rather than its content so that a cluster whose
+        # worker list is still empty is not mistaken for a single-mode server.
+        def cluster?(stats)
+          stats.key?('worker_status')
+        end
+
+        # Reports the cluster-wide counters under the "master" label.
+        def set_master_metrics(stats)
+          labels = { worker: "master" }
+
+          metrics[:puma_workers].set(labels, stats['workers'])
+          metrics[:puma_running_workers].set(labels, stats['booted_workers'])
+          metrics[:puma_stale_workers].set(labels, stats['old_workers'])
+          metrics[:puma_phase].set(labels, stats['phase'])
+        end
+
+        # Reports the thread-pool gauges for one worker (or the single process).
+        #
+        # stats  - Hash with 'running', 'backlog', 'pool_capacity' and 'max_threads'.
+        # labels - Hash of labels attached to every gauge (default: none).
+        def set_worker_metrics(stats, labels = {})
+          metrics[:puma_running].set(labels, stats['running'])
+          metrics[:puma_queued_connections].set(labels, stats['backlog'])
+          # Active = the part of the maximum that is not free capacity.
+          metrics[:puma_active_connections].set(labels, stats['max_threads'] - stats['pool_capacity'])
+          metrics[:puma_pool_capacity].set(labels, stats['pool_capacity'])
+          metrics[:puma_max_threads].set(labels, stats['max_threads'])
+          # Idle = spawned threads minus active ones (max_threads - pool_capacity).
+          metrics[:puma_idle_threads].set(labels, stats['running'] + stats['pool_capacity'] - stats['max_threads'])
+        end
+      end
+    end
+  end
+end
--- a/spec/lib/gitlab/metrics/samplers/puma_sampler_spec.rb
+++ b/spec/lib/gitlab/metrics/samplers/puma_sampler_spec.rb
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+# Exercises PumaSampler#sample against canned Puma stats payloads, once in
+# cluster mode and once in single mode.
+describe Gitlab::Metrics::Samplers::PumaSampler do
+  subject { described_class.new(5) }
+  let(:null_metric) { double('null_metric', set: nil, observe: nil) }
+
+  # Swap the null metric for a double that accepts #set and #observe;
+  # presumably the sampler's gauges resolve to it so #set can be asserted on.
+  before do
+    allow(Gitlab::Metrics::NullMetric).to receive(:instance).and_return(null_metric)
+  end
+
+  describe '#sample' do
+    # The raw stats JSON is stubbed, so Puma itself is never queried.
+    before do
+      expect(subject).to receive(:puma_stats).and_return(puma_stats)
+    end
+
+    context 'in cluster mode' do
+      # Cluster payload: master counters plus a single worker entry.
+      let(:puma_stats) do
+        <<~EOS
+        {
+          "workers": 2,
+          "phase": 2,
+          "booted_workers": 2,
+          "old_workers": 0,
+          "worker_status": [{
+            "pid": 32534,
+            "index": 0,
+            "phase": 1,
+            "booted": true,
+            "last_checkin": "2019-05-15T07:57:55Z",
+            "last_status": {
+              "backlog":0,
+              "running":1,
+              "pool_capacity":4,
+              "max_threads": 4
+            }
+          }]
+        }
+        EOS
+      end
+
+      it 'samples master statistics' do
+        labels = { worker: 'master' }
+
+        expect(subject.metrics[:puma_workers]).to receive(:set).with(labels, 2)
+        expect(subject.metrics[:puma_running_workers]).to receive(:set).with(labels, 2)
+        expect(subject.metrics[:puma_stale_workers]).to receive(:set).with(labels, 0)
+        # puma_phase is set twice: once for the master and once for worker 0.
+        expect(subject.metrics[:puma_phase]).to receive(:set).once.with(labels, 2)
+        expect(subject.metrics[:puma_phase]).to receive(:set).once.with({ worker: 'worker_0' }, 1)
+
+        subject.sample
+      end
+
+      it 'samples worker statistics' do
+        labels = { worker: 'worker_0' }
+
+        expect_worker_stats(labels)
+
+        subject.sample
+      end
+    end
+
+    context 'in single mode' do
+      # Single-mode payload: thread-pool counters only, no worker list.
+      let(:puma_stats) do
+        <<~EOS
+        {
+          "backlog":0,
+          "running":1,
+          "pool_capacity":4,
+          "max_threads": 4
+        }
+        EOS
+      end
+
+      it 'samples worker statistics' do
+        # Single mode reports its gauges without labels.
+        labels = {}
+
+        expect(subject.metrics[:puma_workers]).to receive(:set).with(labels, 1)
+        expect(subject.metrics[:puma_running_workers]).to receive(:set).with(labels, 1)
+        expect_worker_stats(labels)
+
+        subject.sample
+      end
+    end
+  end
+
+  # Expects the thread-pool gauges derived from the fixtures above
+  # (backlog 0, running 1, pool_capacity 4, max_threads 4):
+  #   active connections = max_threads - pool_capacity           = 0
+  #   idle threads       = running + pool_capacity - max_threads = 1
+  def expect_worker_stats(labels)
+    expect(subject.metrics[:puma_queued_connections]).to receive(:set).with(labels, 0)
+    expect(subject.metrics[:puma_active_connections]).to receive(:set).with(labels, 0)
+    expect(subject.metrics[:puma_running]).to receive(:set).with(labels, 1)
+    expect(subject.metrics[:puma_pool_capacity]).to receive(:set).with(labels, 4)
+    expect(subject.metrics[:puma_max_threads]).to receive(:set).with(labels, 4)
+    expect(subject.metrics[:puma_idle_threads]).to receive(:set).with(labels, 1)
+  end
+end