Commit 927475d0 authored by GitLab Bot

Add latest changes from gitlab-org/gitlab@master

Parent c0d7d2e3
......@@ -123,6 +123,10 @@ module QA
"#{api_get_path}/runners"
end
def api_commits_path
"#{api_get_path}/repository/commits"
end
def api_repository_branches_path
"#{api_get_path}/repository/branches"
end
......@@ -176,6 +180,10 @@ module QA
raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
end
def commits
parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
end
def import_status
response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
......
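For reference, the `commits` helper added above simply issues a GET against GitLab's repository commits endpoint and parses the JSON body. A minimal standalone sketch of the same request, outside the QA framework and using a hypothetical host, project ID, and token (none of these values come from this commit):

require 'net/http'
require 'json'

# Hypothetical values, for illustration only
gitlab_host = 'https://gitlab.example.com'
project_id  = 42
token       = ENV['GITLAB_ACCESS_TOKEN']

uri = URI("#{gitlab_host}/api/v4/projects/#{project_id}/repository/commits")
request = Net::HTTP::Get.new(uri)
request['PRIVATE-TOKEN'] = token

response = Net::HTTP.start(uri.host, uri.port, use_ssl: uri.scheme == 'https') do |http|
  http.request(request)
end

# Each entry is a hash with keys such as "id" and "message"; the new spec below
# relies on the commit messages when verifying replication after failover.
commits = JSON.parse(response.body)
puts commits.map { |commit| commit['message'] }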
......@@ -17,21 +17,31 @@ module QA
@virtual_storage = 'default'
end
def enable_writes
shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
# Executes the praefect `dataloss` command.
#
# @return [Boolean] whether dataloss has occurred
def dataloss?
wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
end
def replicated?(project_id)
replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
QA::Runtime::Logger.debug(line.chomp)
# The output of the rake task looks something like this:
#
# Project name | gitaly1 (primary) | gitaly2 | gitaly3
# ----------------------------------------------------------------------------------------------------------------------------------------------------------------
# gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
#
break line if line.start_with?("gitaly_cluster")
end
# We want to know if the checksums are identical
replicas.split('|').map(&:strip)[1..3].uniq.one?
end
def start_primary_node
start_node(@primary_node)
end
def start_praefect
......@@ -51,40 +61,54 @@ module QA
end
def trigger_failover_by_stopping_primary_node
QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
stop_node(@primary_node)
end
def clear_replication_queue
QA::Runtime::Logger.debug("Clearing the replication queue")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
CMD
QA::Runtime::Logger.info("Clearing the replication queue")
shell sql_to_docker_exec_cmd(
<<~SQL
delete from replication_queue_job_lock;
delete from replication_queue_lock;
delete from replication_queue;
SQL
)
end
def create_stalled_replication_queue
QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell <<~CMD
docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
bash -c "psql -U postgres -d praefect_production -h postgres.test \
-c \\"update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';\\""
CMD
QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
shell sql_to_docker_exec_cmd(
<<~SQL
update replication_queue set state = 'in_progress';
insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
select id, rq.lock_id, created_at from replication_queue rq
left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
where state = 'in_progress' and rqjl.job_id is null;
update replication_queue_lock set acquired = 't';
SQL
)
end
# Reconciles the previous primary node with the current one
# I.e., it brings the previous primary node up-to-date
def reconcile_nodes
reconcile_node_with_node(@primary_node, current_primary_node)
end
def reconcile_node_with_node(target, reference)
QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
/FINISHED: \d+ repos were checked for consistency/,
sleep_interval: 5,
retry_on_exception: true
)
end
def replication_queue_lock_count
result = []
shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
result << line
end
# The result looks like:
......@@ -94,12 +118,36 @@ module QA
result[2].to_i
end
# Makes the original primary (gitaly1) the primary again by
# stopping the other nodes, waiting for gitaly1 to be made the
# primary again, and then starting the other nodes and enabling
# writes
def reset_primary_to_original
QA::Runtime::Logger.info("Checking primary node...")
return if @primary_node == current_primary_node
QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
start_node(@primary_node)
stop_node(@secondary_node)
stop_node(@tertiary_node)
wait_for_new_primary_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
wait_for_health_check_all_nodes
wait_for_reliable_connection
end
def reset_cluster
QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
start_node(@praefect)
start_node(@primary_node)
start_node(@secondary_node)
start_node(@tertiary_node)
enable_writes
wait_for_health_check_all_nodes
end
def verify_storage_move(source_storage, destination_storage)
......@@ -111,10 +159,33 @@ module QA
end
def wait_for_praefect
QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
wait_until_shell_command_matches(
"docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
/listening at tcp address/
)
# Praefect can fail to start if unable to dial one of the gitaly nodes
# See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
# We tail the logs so that, if tests fail, we can confirm whether that was the problem
shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
end
end
def wait_for_new_primary_node(node)
QA::Runtime::Logger.info("Wait until #{node} is the primary node")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
end
end
def wait_for_new_primary
QA::Runtime::Logger.info("Wait until a new primary node is selected")
with_praefect_log do |log|
break true if log['msg'] == 'primary node changed'
end
end
def wait_for_sql_ping
......@@ -124,35 +195,86 @@ module QA
)
end
def wait_for_no_praefect_storage_error
# If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem,
# because no message is shown in the Praefect logs when a healthcheck succeeds.
# To work around that, we run the gitaly check rake task, wait a few seconds, and then confirm that no healthcheck errors appear.
QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
Support::Waiter.wait_until(max_duration: 120) do
wait_for_gitaly_check
sleep 5
shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
break true if log['msg'] != 'error when pinging healthcheck'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
end
def wait_for_storage_nodes
wait_for_no_praefect_storage_error
Support::Waiter.repeat_until(max_attempts: 3) do
nodes_confirmed = {
@primary_node => false,
@secondary_node => false,
@tertiary_node => false
}
wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
QA::Runtime::Logger.debug(line.chomp)
nodes_confirmed.each_key do |node|
nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
end
nodes_confirmed.values.all?
end
end
end
def wait_for_health_check_current_primary_node
wait_for_health_check(current_primary_node)
end
def wait_for_health_check_all_nodes
wait_for_health_check(@primary_node)
wait_for_health_check(@secondary_node)
wait_for_health_check(@tertiary_node)
end
def wait_for_health_check(node)
QA::Runtime::Logger.info("Waiting for health check on #{node}")
wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
log = JSON.parse(line)
log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
rescue JSON::ParserError
# Ignore lines that can't be parsed as JSON
end
end
def wait_for_gitaly_check
Support::Waiter.repeat_until(max_attempts: 3) do
storage_ok = false
check_finished = false
wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
QA::Runtime::Logger.debug(line.chomp)
storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
check_finished = true if line =~ /Checking Gitaly ... Finished/
storage_ok && check_finished
end
end
end
......@@ -164,15 +286,36 @@ module QA
end
def wait_for_reliable_connection
QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
wait_for_praefect
wait_for_sql_ping
wait_for_storage_nodes
wait_for_gitaly_check
wait_for_gitlab_shell_check
end
def wait_for_replication(project_id)
Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
end
private
def current_primary_node
shell dataloss_command do |line|
QA::Runtime::Logger.debug(line.chomp)
match = line.match(/Primary: (.*)/)
break match[1] if match
end
end
def dataloss_command
"docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
end
def sql_to_docker_exec_cmd(sql)
Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
end
def verify_storage_move_from_gitaly(storage)
wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
log = JSON.parse(line)
......@@ -203,21 +346,10 @@ module QA
end
end
def with_praefect_log
wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
QA::Runtime::Logger.debug(line.chomp)
yield JSON.parse(line)
end
end
end
......
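A note on the `replicated?` change above: the row returned for the project by the `gitlab:praefect:replicas` rake task is split on '|' and the three checksum columns are compared. A standalone sketch of that parsing, using the sample row from the comment (the checksum values are purely illustrative):

# A row in the shape produced by the gitlab:praefect:replicas rake task
row = 'gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619'

# Columns 1..3 hold the checksums for gitaly1 (primary), gitaly2 and gitaly3;
# the repository counts as replicated only when all three are identical.
checksums = row.split('|').map(&:strip)[1..3]
puts checksums.uniq.one? # => true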
......@@ -33,6 +33,31 @@ module QA
end
end
end
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
<<~CMD
docker exec --env PGPASSWORD=#{password} #{container} \
bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
CMD
end
def wait_until_shell_command(cmd, **kwargs)
sleep_interval = kwargs.delete(:sleep_interval) || 1
Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
shell cmd do |line|
break true if yield line
end
end
end
def wait_until_shell_command_matches(cmd, regex, **kwargs)
wait_until_shell_command(cmd, kwargs) do |line|
QA::Runtime::Logger.debug(line.chomp)
line =~ regex
end
end
end
end
end
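To make the new `sql_to_docker_exec_cmd` helper concrete, here is a standalone sketch that mirrors the heredoc above and prints the command it would hand to `shell`. The container name, password, and host used in the example call are placeholders, not values taken from this commit:

# Standalone copy of the helper, for illustration only
def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
  <<~CMD
    docker exec --env PGPASSWORD=#{password} #{container} \
    bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
  CMD
end

puts sql_to_docker_exec_cmd(
  "select count(*) from replication_queue_lock where acquired = 't';",
  'postgres', 'secret', 'praefect_production', 'postgres.test', 'postgres-container'
)
# => a single `docker exec ... psql ...` command string, the same shape
#    PraefectManager passes to `shell` for its replication-queue queries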
# frozen_string_literal: true
module QA
RSpec.describe 'Create' do
context 'Gitaly' do
describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
let(:praefect_manager) { Service::PraefectManager.new }
let(:project) do
Resource::Project.fabricate! do |project|
project.name = "gitaly_cluster"
project.initialize_with_readme = true
end
end
before do
# Reset the cluster in case previous tests left it in a bad state
praefect_manager.reset_primary_to_original
end
after do
# Leave the cluster in a suitable state for subsequent tests
praefect_manager.reset_primary_to_original
end
it 'recovers from dataloss' do
# Create a new project with a commit and wait for it to replicate
praefect_manager.wait_for_replication(project.id)
# Stop the primary node to trigger failover, and then wait
# for Gitaly to be ready for writes again
praefect_manager.trigger_failover_by_stopping_primary_node
praefect_manager.wait_for_new_primary
praefect_manager.wait_for_health_check_current_primary_node
praefect_manager.wait_for_gitaly_check
# Confirm that we have access to the repo after failover
Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
Resource::Repository::Commit.fabricate_via_api! do |commits|
commits.project = project
commits.sha = 'master'
end
end
# Push a commit to the new primary
Resource::Repository::ProjectPush.fabricate! do |push|
push.project = project
push.new_branch = false
push.commit_message = 'pushed after failover'
push.file_name = 'new_file'
push.file_content = 'new file'
end
# Start the old primary node again
praefect_manager.start_primary_node
praefect_manager.wait_for_health_check_current_primary_node
# Confirm dataloss (i.e., inconsistent nodes)
expect(praefect_manager.dataloss?).to be true
expect(praefect_manager.replicated?(project.id)).to be false
# Reconcile nodes to recover from dataloss
praefect_manager.reconcile_nodes
praefect_manager.wait_for_replication(project.id)
# Confirm that both commits are available after reconciliation
expect(project.commits.map { |commit| commit[:message].chomp })
.to include("Initial commit").and include("pushed after failover")
end
end
end
end
end
......@@ -41,8 +41,6 @@ module QA
expect(show).to have_file(initial_file)
end
praefect_manager.enable_writes
Resource::Repository::Commit.fabricate_via_api! do |commit|
commit.project = project
commit.add_files([
......