diff --git a/qa/qa/resource/project.rb b/qa/qa/resource/project.rb
index 358e87b0eb9e347ef3b9aaa4df05004334f9b1f5..f4fe43aa3ab5edb85015c488254b0d4f0543bdf3 100644
--- a/qa/qa/resource/project.rb
+++ b/qa/qa/resource/project.rb
@@ -123,6 +123,10 @@ module QA
         "#{api_get_path}/runners"
       end
 
+      def api_commits_path
+        "#{api_get_path}/repository/commits"
+      end
+
       def api_repository_branches_path
         "#{api_get_path}/repository/branches"
       end
@@ -176,6 +180,10 @@ module QA
         raise Runtime::API::RepositoryStorageMoves::RepositoryStorageMovesError, 'Timed out while waiting for the repository storage move to finish'
       end
 
+      def commits
+        parse_body(get(Runtime::API::Request.new(api_client, api_commits_path).url))
+      end
+
       def import_status
         response = get Runtime::API::Request.new(api_client, "/projects/#{id}/import").url
 
diff --git a/qa/qa/service/praefect_manager.rb b/qa/qa/service/praefect_manager.rb
index 00796ab150295371a60dd2cd8cc0dbd5fbd2f9e7..ce57cd9e07446acff30b8a45cdbd1b7d3e7f28da 100644
--- a/qa/qa/service/praefect_manager.rb
+++ b/qa/qa/service/praefect_manager.rb
@@ -17,21 +17,31 @@ module QA
         @virtual_storage = 'default'
       end
 
-      def enable_writes
-        shell "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml enable-writes -virtual-storage #{@virtual_storage}'"
+      # Executes the praefect `dataloss` command.
+      #
+      # @return [Boolean] whether dataloss has occurred
+      def dataloss?
+        wait_until_shell_command_matches(dataloss_command, /Outdated repositories/)
       end
 
       def replicated?(project_id)
-        shell %(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"') do |line|
+        replicas = wait_until_shell_command(%(docker exec gitlab-gitaly-ha bash -c 'gitlab-rake "gitlab:praefect:replicas[#{project_id}]"')) do |line|
+          QA::Runtime::Logger.debug(line.chomp)
           # The output of the rake task looks something like this:
           #
           # Project name                    | gitaly1 (primary)                        | gitaly2                                  | gitaly3
           # ----------------------------------------------------------------------------------------------------------------------------------------------------------------
           # gitaly_cluster-3aff1f2bd14e6c98 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619 | 23c4422629234d62b62adacafd0a33a8364e8619
           #
-          # We want to confirm that the checksums are identical
-          break line.split('|').map(&:strip)[1..3].uniq.one? if line.start_with?("gitaly_cluster")
+          break line if line.start_with?("gitaly_cluster")
         end
+
+        # We want to know if the checksums are identical
+        replicas.split('|').map(&:strip)[1..3].uniq.one?
+      end
+
+      def start_primary_node
+        start_node(@primary_node)
       end
 
       def start_praefect
@@ -51,40 +61,54 @@ module QA
       end
 
       def trigger_failover_by_stopping_primary_node
+        QA::Runtime::Logger.info("Stopping node #{@primary_node} to trigger failover")
         stop_node(@primary_node)
       end
 
       def clear_replication_queue
-        QA::Runtime::Logger.debug("Clearing the replication queue")
-        shell <<~CMD
-          docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
-          bash -c "psql -U postgres -d praefect_production -h postgres.test \
-          -c \\"delete from replication_queue_job_lock; delete from replication_queue_lock; delete from replication_queue;\\""
-        CMD
+        QA::Runtime::Logger.info("Clearing the replication queue")
+        shell sql_to_docker_exec_cmd(
+          <<~SQL
+            delete from replication_queue_job_lock;
+            delete from replication_queue_lock;
+            delete from replication_queue;
+          SQL
+        )
       end
 
       def create_stalled_replication_queue
-        QA::Runtime::Logger.debug("Setting jobs in replication queue to `in_progress` and acquiring locks")
-        shell <<~CMD
-          docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
-          bash -c "psql -U postgres -d praefect_production -h postgres.test \
-          -c \\"update replication_queue set state = 'in_progress';
-          insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
-          select id, rq.lock_id, created_at from replication_queue rq
-          left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
-          where state = 'in_progress' and rqjl.job_id is null;
-          update replication_queue_lock set acquired = 't';\\""
-        CMD
+        QA::Runtime::Logger.info("Setting jobs in replication queue to `in_progress` and acquiring locks")
+        shell sql_to_docker_exec_cmd(
+          <<~SQL
+            update replication_queue set state = 'in_progress';
+            insert into replication_queue_job_lock (job_id, lock_id, triggered_at)
+            select id, rq.lock_id, created_at from replication_queue rq
+            left join replication_queue_job_lock rqjl on rq.id = rqjl.job_id
+            where state = 'in_progress' and rqjl.job_id is null;
+            update replication_queue_lock set acquired = 't';
+          SQL
+        )
+      end
+
+      # Reconciles the previous primary node with the current one
+      # I.e., it brings the previous primary node up-to-date
+      def reconcile_nodes
+        reconcile_node_with_node(@primary_node, current_primary_node)
+      end
+
+      def reconcile_node_with_node(target, reference)
+        QA::Runtime::Logger.info("Reconcile #{target} with #{reference} on #{@virtual_storage}")
+        wait_until_shell_command_matches(
+          "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml reconcile -virtual #{@virtual_storage} -target #{target} -reference #{reference} -f'",
+          /FINISHED: \d+ repos were checked for consistency/,
+          sleep_interval: 5,
+          retry_on_exception: true
+        )
       end
 
       def replication_queue_lock_count
         result = []
-        cmd = <<~CMD
-          docker exec --env PGPASSWORD=SQL_PASSWORD #{@postgres} \
-          bash -c "psql -U postgres -d praefect_production -h postgres.test \
-          -c \\"select count(*) from replication_queue_lock where acquired = 't';\\""
-        CMD
-        shell cmd do |line|
+        shell sql_to_docker_exec_cmd("select count(*) from replication_queue_lock where acquired = 't';") do |line|
           result << line
         end
         # The result looks like:
@@ -94,12 +118,36 @@ module QA
         result[2].to_i
       end
 
+      # Makes the original primary (gitaly1) the primary again by
+      # stopping the other nodes, waiting for gitaly1 to be made the
+      # primary again, and then it starts the other nodes and enables
+      # writes
+      def reset_primary_to_original
+        QA::Runtime::Logger.info("Checking primary node...")
+
+        return if @primary_node == current_primary_node
+
+        QA::Runtime::Logger.info("Reset primary node to #{@primary_node}")
+        start_node(@primary_node)
+        stop_node(@secondary_node)
+        stop_node(@tertiary_node)
+
+        wait_for_new_primary_node(@primary_node)
+
+        start_node(@secondary_node)
+        start_node(@tertiary_node)
+
+        wait_for_health_check_all_nodes
+        wait_for_reliable_connection
+      end
+
       def reset_cluster
+        QA::Runtime::Logger.info('Reset Gitaly Cluster by starting all nodes and enabling writes')
         start_node(@praefect)
         start_node(@primary_node)
         start_node(@secondary_node)
         start_node(@tertiary_node)
-        enable_writes
+        wait_for_health_check_all_nodes
       end
 
       def verify_storage_move(source_storage, destination_storage)
@@ -111,10 +159,33 @@ module QA
       end
 
       def wait_for_praefect
+        QA::Runtime::Logger.info('Wait until Praefect starts and is listening')
         wait_until_shell_command_matches(
           "docker exec #{@praefect} bash -c 'cat /var/log/gitlab/praefect/current'",
           /listening at tcp address/
         )
+
+        # Praefect can fail to start if unable to dial one of the gitaly nodes
+        # See https://gitlab.com/gitlab-org/gitaly/-/issues/2847
+        # We tail the logs to allow us to confirm if that is the problem if tests fail
+
+        shell "docker exec #{@praefect} bash -c 'tail /var/log/gitlab/praefect/current'" do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+        end
+      end
+
+      def wait_for_new_primary_node(node)
+        QA::Runtime::Logger.info("Wait until #{node} is the primary node")
+        with_praefect_log do |log|
+          break true if log['msg'] == 'primary node changed' && log['newPrimary'] == node
+        end
+      end
+
+      def wait_for_new_primary
+        QA::Runtime::Logger.info("Wait until a new primary node is selected")
+        with_praefect_log do |log|
+          break true if log['msg'] == 'primary node changed'
+        end
       end
 
       def wait_for_sql_ping
@@ -124,35 +195,86 @@ module QA
         )
       end
 
+      def wait_for_no_praefect_storage_error
+        # If a healthcheck error was the last message to be logged, we'll keep seeing that message even if it's no longer a problem
+        # That is, there's no message shown in the Praefect logs when the healthcheck succeeds
+        # To work around that we perform the gitaly check rake task, wait a few seconds, and then we confirm that no healthcheck errors appear
+
+        QA::Runtime::Logger.info("Checking that Praefect does not report healthcheck errors with its gitaly nodes")
+
+        Support::Waiter.wait_until(max_duration: 120) do
+          wait_for_gitaly_check
+
+          sleep 5
+
+          shell "docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'" do |line|
+            QA::Runtime::Logger.debug(line.chomp)
+            log = JSON.parse(line)
+
+            break true if log['msg'] != 'error when pinging healthcheck'
+          rescue JSON::ParserError
+            # Ignore lines that can't be parsed as JSON
+          end
+        end
+      end
+
       def wait_for_storage_nodes
-        nodes_confirmed = {
-          @primary_node => false,
-          @secondary_node => false,
-          @tertiary_node => false
-        }
+        wait_for_no_praefect_storage_error
 
-        wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
-          QA::Runtime::Logger.info(line.chomp)
+        Support::Waiter.repeat_until(max_attempts: 3) do
+          nodes_confirmed = {
+            @primary_node => false,
+            @secondary_node => false,
+            @tertiary_node => false
+          }
 
-          nodes_confirmed.each_key do |node|
-            nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
+          wait_until_shell_command("docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dial-nodes'") do |line|
+            QA::Runtime::Logger.debug(line.chomp)
+
+            nodes_confirmed.each_key do |node|
+              nodes_confirmed[node] = true if line =~ /SUCCESS: confirmed Gitaly storage "#{node}" in virtual storages \[#{@virtual_storage}\] is served/
+            end
+
+            nodes_confirmed.values.all?
           end
+        end
+      end
 
-          nodes_confirmed.values.all?
+      def wait_for_health_check_current_primary_node
+        wait_for_health_check(current_primary_node)
+      end
+
+      def wait_for_health_check_all_nodes
+        wait_for_health_check(@primary_node)
+        wait_for_health_check(@secondary_node)
+        wait_for_health_check(@tertiary_node)
+      end
+
+      def wait_for_health_check(node)
+        QA::Runtime::Logger.info("Waiting for health check on #{node}")
+        wait_until_shell_command("docker exec #{node} bash -c 'cat /var/log/gitlab/gitaly/current'") do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+          log = JSON.parse(line)
+
+          log['grpc.request.fullMethod'] == '/grpc.health.v1.Health/Check' && log['grpc.code'] == 'OK'
+        rescue JSON::ParserError
+          # Ignore lines that can't be parsed as JSON
         end
       end
 
       def wait_for_gitaly_check
-        storage_ok = false
-        check_finished = false
+        Support::Waiter.repeat_until(max_attempts: 3) do
+          storage_ok = false
+          check_finished = false
 
-        wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
-          QA::Runtime::Logger.info(line.chomp)
+          wait_until_shell_command("docker exec #{@gitlab} bash -c 'gitlab-rake gitlab:gitaly:check'") do |line|
+            QA::Runtime::Logger.debug(line.chomp)
 
-          storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
-          check_finished = true if line =~ /Checking Gitaly ... Finished/
+            storage_ok = true if line =~ /Gitaly: ... #{@virtual_storage} ... OK/
+            check_finished = true if line =~ /Checking Gitaly ... Finished/
 
-          storage_ok && check_finished
+            storage_ok && check_finished
+          end
         end
       end
 
@@ -164,15 +286,36 @@ module QA
       end
 
       def wait_for_reliable_connection
+        QA::Runtime::Logger.info('Wait until GitLab and Praefect can communicate reliably')
         wait_for_praefect
         wait_for_sql_ping
         wait_for_storage_nodes
-        wait_for_gitaly_check
         wait_for_gitlab_shell_check
       end
 
+      def wait_for_replication(project_id)
+        Support::Waiter.wait_until(sleep_interval: 1) { replicated?(project_id) }
+      end
+
       private
 
+      def current_primary_node
+        shell dataloss_command do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+
+          match = line.match(/Primary: (.*)/)
+          break match[1] if match
+        end
+      end
+
+      def dataloss_command
+        "docker exec #{@praefect} bash -c '/opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml dataloss'"
+      end
+
+      def sql_to_docker_exec_cmd(sql)
+        Service::Shellout.sql_to_docker_exec_cmd(sql, 'postgres', 'SQL_PASSWORD', 'praefect_production', 'postgres.test', @postgres)
+      end
+
       def verify_storage_move_from_gitaly(storage)
         wait_until_shell_command("docker exec #{@gitlab} bash -c 'tail -n 50 /var/log/gitlab/gitaly/current'") do |line|
           log = JSON.parse(line)
@@ -203,21 +346,10 @@ module QA
         end
       end
 
-      def wait_until_shell_command(cmd, **kwargs)
-        sleep_interval = kwargs.delete(:sleep_interval) || 1
-
-        Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
-          shell cmd do |line|
-            break true if yield line
-          end
-        end
-      end
-
-      def wait_until_shell_command_matches(cmd, regex, **kwargs)
-        wait_until_shell_command(cmd, kwargs) do |line|
-          QA::Runtime::Logger.info(line.chomp)
-
-          line =~ regex
+      def with_praefect_log
+        wait_until_shell_command("docker exec #{@praefect} bash -c 'tail -n 1 /var/log/gitlab/praefect/current'") do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+          yield JSON.parse(line)
         end
       end
     end
diff --git a/qa/qa/service/shellout.rb b/qa/qa/service/shellout.rb
index 6efe50c4ae2baf9a69fa2f63566de1133379862b..81cfaa125a9e9dec4451ab848d81f21dab20155a 100644
--- a/qa/qa/service/shellout.rb
+++ b/qa/qa/service/shellout.rb
@@ -33,6 +33,31 @@ module QA
           end
         end
       end
+
+      def sql_to_docker_exec_cmd(sql, username, password, database, host, container)
+        <<~CMD
+          docker exec --env PGPASSWORD=#{password} #{container} \
+          bash -c "psql -U #{username} -d #{database} -h #{host} -c \\"#{sql}\\""
+        CMD
+      end
+
+      def wait_until_shell_command(cmd, **kwargs)
+        sleep_interval = kwargs.delete(:sleep_interval) || 1
+
+        Support::Waiter.wait_until(sleep_interval: sleep_interval, **kwargs) do
+          shell cmd do |line|
+            break true if yield line
+          end
+        end
+      end
+
+      def wait_until_shell_command_matches(cmd, regex, **kwargs)
+        wait_until_shell_command(cmd, kwargs) do |line|
+          QA::Runtime::Logger.debug(line.chomp)
+
+          line =~ regex
+        end
+      end
     end
   end
 end
diff --git a/qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb b/qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb
new file mode 100644
index 0000000000000000000000000000000000000000..77d72bc1bb3d0dacaaba2e533fa48e9be1f5882f
--- /dev/null
+++ b/qa/qa/specs/features/api/3_create/repository/backend_node_recovery_spec.rb
@@ -0,0 +1,72 @@
+# frozen_string_literal: true
+
+module QA
+  RSpec.describe 'Create' do
+    context 'Gitaly' do
+      describe 'Backend node recovery', :orchestrated, :gitaly_ha, :skip_live_env do
+        let(:praefect_manager) { Service::PraefectManager.new }
+        let(:project) do
+          Resource::Project.fabricate! do |project|
+            project.name = "gitaly_cluster"
+            project.initialize_with_readme = true
+          end
+        end
+
+        before do
+          # Reset the cluster in case previous tests left it in a bad state
+          praefect_manager.reset_primary_to_original
+        end
+
+        after do
+          # Leave the cluster in a suitable state for subsequent tests
+          praefect_manager.reset_primary_to_original
+        end
+
+        it 'recovers from dataloss' do
+          # Create a new project with a commit and wait for it to replicate
+          praefect_manager.wait_for_replication(project.id)
+
+          # Stop the primary node to trigger failover, and then wait
+          # for Gitaly to be ready for writes again
+          praefect_manager.trigger_failover_by_stopping_primary_node
+          praefect_manager.wait_for_new_primary
+          praefect_manager.wait_for_health_check_current_primary_node
+          praefect_manager.wait_for_gitaly_check
+
+          # Confirm that we have access to the repo after failover
+          Support::Waiter.wait_until(retry_on_exception: true, sleep_interval: 5) do
+            Resource::Repository::Commit.fabricate_via_api! do |commits|
+              commits.project = project
+              commits.sha = 'master'
+            end
+          end
+
+          # Push a commit to the new primary
+          Resource::Repository::ProjectPush.fabricate! do |push|
+            push.project = project
+            push.new_branch = false
+            push.commit_message = 'pushed after failover'
+            push.file_name = 'new_file'
+            push.file_content = 'new file'
+          end
+
+          # Start the old primary node again
+          praefect_manager.start_primary_node
+          praefect_manager.wait_for_health_check_current_primary_node
+
+          # Confirm dataloss (i.e., inconsistent nodes)
+          expect(praefect_manager.dataloss?).to be true
+          expect(praefect_manager.replicated?(project.id)).to be false
+
+          # Reconcile nodes to recover from dataloss
+          praefect_manager.reconcile_nodes
+          praefect_manager.wait_for_replication(project.id)
+
+          # Confirm that both commits are available after reconciliation
+          expect(project.commits.map { |commit| commit[:message].chomp })
+            .to include("Initial commit").and include("pushed after failover")
+        end
+      end
+    end
+  end
+end
diff --git a/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb b/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb
index 3b2144270b05555ad3d472ba21a63c404a5fb918..16757a602dcd0fc13335b02287d1d2ecb4c77323 100644
--- a/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb
+++ b/qa/qa/specs/features/browser_ui/3_create/gitaly/high_availability_spec.rb
@@ -41,8 +41,6 @@ module QA
           expect(show).to have_file(initial_file)
         end
 
-        praefect_manager.enable_writes
-
         Resource::Repository::Commit.fabricate_via_api! do |commit|
           commit.project = project
           commit.add_files([