diff --git a/changelogs/unreleased/kamil-improve-import-export.yml b/changelogs/unreleased/kamil-improve-import-export.yml new file mode 100644 index 0000000000000000000000000000000000000000..9d485e0df2df808aef97b8cfe2f7d54fd2563241 --- /dev/null +++ b/changelogs/unreleased/kamil-improve-import-export.yml @@ -0,0 +1,5 @@ +--- +title: Reduce N+1 when doing project export +merge_request: 32423 +author: +type: performance diff --git a/lib/gitlab/import_export/attributes_finder.rb b/lib/gitlab/import_export/attributes_finder.rb index 13883ca7f3dbc8c44c87d228b66b044c1bf109fb..28d48ce6dfe4d528253992f244fa6e72be0ecf30 100644 --- a/lib/gitlab/import_export/attributes_finder.rb +++ b/lib/gitlab/import_export/attributes_finder.rb @@ -8,6 +8,7 @@ module Gitlab @included_attributes = config[:included_attributes] || {} @excluded_attributes = config[:excluded_attributes] || {} @methods = config[:methods] || {} + @preloads = config[:preloads] || {} end def find_root(model_key) @@ -29,10 +30,26 @@ module Gitlab only: @included_attributes[model_key], except: @excluded_attributes[model_key], methods: @methods[model_key], - include: resolve_model_tree(model_tree) + include: resolve_model_tree(model_tree), + preload: resolve_preloads(model_key, model_tree) }.compact end + def resolve_preloads(model_key, model_tree) + model_tree + .map { |submodel_key, submodel_tree| resolve_preload(model_key, submodel_key, submodel_tree) } + .compact + .to_h + .deep_merge(@preloads[model_key].to_h) + .presence + end + + def resolve_preload(parent_model_key, model_key, model_tree) + return if @methods[parent_model_key]&.include?(model_key) + + [model_key, resolve_preloads(model_key, model_tree)] + end + def resolve_model_tree(model_tree) return unless model_tree diff --git a/lib/gitlab/import_export/fast_hash_serializer.rb b/lib/gitlab/import_export/fast_hash_serializer.rb new file mode 100644 index 0000000000000000000000000000000000000000..a6ab4f3a3d90f04e2322e35ea5214e5dc68e385f --- /dev/null +++ b/lib/gitlab/import_export/fast_hash_serializer.rb @@ -0,0 +1,108 @@ +# frozen_string_literal: true + +# ActiveModel::Serialization (https://github.com/rails/rails/blob/v5.0.7/activemodel/lib/active_model/serialization.rb#L184) +# is simple in that it recursively calls `as_json` on each object to +# serialize everything. However, for a model like a Project, this can +# generate a query for every single association, which can add up to tens +# of thousands of queries and lead to memory bloat. +# +# To improve this, we can do several things: + +# 1. Use the option tree in http://api.rubyonrails.org/classes/ActiveModel/Serializers/JSON.html +# to generate the necessary preload clauses. +# +# 2. We observe that a single project has many issues, merge requests, +# etc. Instead of serializing everything at once, which could lead to +# database timeouts and high memory usage, we take each top-level +# association and serialize the data in batches. +# +# For example, we serialize the first 100 issues and preload all of +# their associated events, notes, etc. before moving onto the next +# batch. When we're done, we serialize merge requests in the same way. +# We repeat this pattern for the remaining associations specified in +# import_export.yml. +module Gitlab + module ImportExport + class FastHashSerializer + attr_reader :subject, :tree + + BATCH_SIZE = 100 + + def initialize(subject, tree, batch_size: BATCH_SIZE) + @subject = subject + @batch_size = batch_size + @tree = tree + end + + # Serializes the subject into a Hash for the given option tree + # (e.g. Project#as_json) + def execute + simple_serialize.merge(serialize_includes) + end + + private + + def simple_serialize + subject.as_json( + tree.merge(include: nil, preloads: nil)) + end + + def serialize_includes + return {} unless includes + + includes + .map(&method(:serialize_include_definition)) + .compact + .to_h + end + + # definition: + # { labels: { includes: ... } } + def serialize_include_definition(definition) + raise ArgumentError, 'definition needs to be Hash' unless definition.is_a?(Hash) + raise ArgumentError, 'definition needs to have exactly one Hash element' unless definition.one? + + key = definition.first.first + options = definition.first.second + + record = subject.public_send(key) # rubocop: disable GitlabSecurity/PublicSend + return unless record + + serialized_record = serialize_record(key, record, options) + return unless serialized_record + + # `#as_json` always returns keys as `strings` + [key.to_s, serialized_record] + end + + def serialize_record(key, record, options) + unless record.respond_to?(:as_json) + raise "Invalid type of #{key} is #{record.class}" + end + + # no has-many relation + unless record.is_a?(ActiveRecord::Relation) + return record.as_json(options) + end + + # has-many relation + data = [] + + record.in_batches(of: @batch_size) do |batch| # rubocop:disable Cop/InBatches + batch = batch.preload(preloads[key]) if preloads&.key?(key) + data += batch.as_json(options) + end + + data + end + + def includes + tree[:include] + end + + def preloads + tree[:preload] + end + end + end +end diff --git a/lib/gitlab/import_export/import_export.yml b/lib/gitlab/import_export/import_export.yml index 06c94beead8f258965aa9af1de1aa37d995d61a9..511b702553e6cf24097c41485375c4e808ca35c3 100644 --- a/lib/gitlab/import_export/import_export.yml +++ b/lib/gitlab/import_export/import_export.yml @@ -231,6 +231,16 @@ methods: ci_pipelines: - :notes +preloads: + statuses: + # TODO: We cannot preload tags, as they are not part of `GenericCommitStatus` + # tags: # needed by tag_list + project: # deprecated: needed by coverage_regex of Ci::Build + merge_requests: + source_project: # needed by source_branch_sha and diff_head_sha + target_project: # needed by target_branch_sha + assignees: # needed by assigne_id that is implemented by DeprecatedAssignee + # EE specific relationships and settings to include. All of this will be merged # into the previous structures if EE is used. ee: diff --git a/lib/gitlab/import_export/project_tree_saver.rb b/lib/gitlab/import_export/project_tree_saver.rb index f1b3db6b2083dada11833181b445597ebbdbbe78..f75f69b2c756f9bc73bab89ec9f36b1091060fe0 100644 --- a/lib/gitlab/import_export/project_tree_saver.rb +++ b/lib/gitlab/import_export/project_tree_saver.rb @@ -41,7 +41,13 @@ module Gitlab end def serialize_project_tree - @project.as_json(reader.project_tree) + if Feature.enabled?(:export_fast_serialize, default_enabled: true) + Gitlab::ImportExport::FastHashSerializer + .new(@project, reader.project_tree) + .execute + else + @project.as_json(reader.project_tree) + end end def reader diff --git a/spec/lib/gitlab/import_export/attributes_finder_spec.rb b/spec/lib/gitlab/import_export/attributes_finder_spec.rb index 208b60844e3c1bd2e08570ff35fee5644755f17f..3cbc1375d6e450f6f0a850c1bc3fc81984531f78 100644 --- a/spec/lib/gitlab/import_export/attributes_finder_spec.rb +++ b/spec/lib/gitlab/import_export/attributes_finder_spec.rb @@ -20,20 +20,41 @@ describe Gitlab::ImportExport::AttributesFinder do except: [:iid], include: [ { merge_request_diff: { - include: [] + include: [], + preload: { source_project: nil } } }, { merge_request_test: { include: [] } } ], - only: [:id] + only: [:id], + preload: { + merge_request_diff: { source_project: nil }, + merge_request_test: nil + } } }, { commit_statuses: { - include: [{ commit: { include: [] } }] + include: [{ commit: { include: [] } }], + preload: { commit: nil } } }, { project_members: { include: [{ user: { include: [], - only: [:email] } }] + only: [:email] } }], + preload: { user: nil } } } - ] + ], + preload: { + commit_statuses: { + commit: nil + }, + issues: nil, + labels: nil, + merge_requests: { + merge_request_diff: { source_project: nil }, + merge_request_test: nil + }, + project_members: { + user: nil + } + } } end @@ -50,7 +71,8 @@ describe Gitlab::ImportExport::AttributesFinder do setup_yaml(tree: { project: [:issues] }) is_expected.to match( - include: [{ issues: { include: [] } }] + include: [{ issues: { include: [] } }], + preload: { issues: nil } ) end @@ -58,7 +80,8 @@ describe Gitlab::ImportExport::AttributesFinder do setup_yaml(tree: { project: [:project_feature] }) is_expected.to match( - include: [{ project_feature: { include: [] } }] + include: [{ project_feature: { include: [] } }], + preload: { project_feature: nil } ) end @@ -67,7 +90,8 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ issues: { include: [] } }, - { snippets: { include: [] } }] + { snippets: { include: [] } }], + preload: { issues: nil, snippets: nil } ) end @@ -75,7 +99,9 @@ describe Gitlab::ImportExport::AttributesFinder do setup_yaml(tree: { project: [issues: [:notes]] }) is_expected.to match( - include: [{ issues: { include: [{ notes: { include: [] } }] } }] + include: [{ issues: { include: [{ notes: { include: [] } }], + preload: { notes: nil } } }], + preload: { issues: { notes: nil } } ) end @@ -85,7 +111,9 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ merge_requests: { include: [{ notes: { include: [] } }, - { merge_request_diff: { include: [] } }] } }] + { merge_request_diff: { include: [] } }], + preload: { merge_request_diff: nil, notes: nil } } }], + preload: { merge_requests: { merge_request_diff: nil, notes: nil } } ) end @@ -94,8 +122,11 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ merge_requests: { - include: [{ notes: { include: [{ author: { include: [] } }] } }] - } }] + include: [{ notes: { include: [{ author: { include: [] } }], + preload: { author: nil } } }], + preload: { notes: { author: nil } } + } }], + preload: { merge_requests: { notes: { author: nil } } } ) end @@ -105,7 +136,8 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ issues: { include: [], - only: [:name, :description] } }] + only: [:name, :description] } }], + preload: { issues: nil } ) end @@ -115,7 +147,8 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ issues: { except: [:name], - include: [] } }] + include: [] } }], + preload: { issues: nil } ) end @@ -127,7 +160,8 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ issues: { except: [:name], include: [], - only: [:description] } }] + only: [:description] } }], + preload: { issues: nil } ) end @@ -137,7 +171,8 @@ describe Gitlab::ImportExport::AttributesFinder do is_expected.to match( include: [{ issues: { include: [], - methods: [:name] } }] + methods: [:name] } }], + preload: { issues: nil } ) end diff --git a/spec/lib/gitlab/import_export/config_spec.rb b/spec/lib/gitlab/import_export/config_spec.rb index e53db37def45132cff65d7c2ffbc7fc933819afa..f09a29b84db6cbd6421e68aad439f23a09257d1e 100644 --- a/spec/lib/gitlab/import_export/config_spec.rb +++ b/spec/lib/gitlab/import_export/config_spec.rb @@ -25,7 +25,7 @@ describe Gitlab::ImportExport::Config do expect { subject }.not_to raise_error expect(subject).to be_a(Hash) expect(subject.keys).to contain_exactly( - :tree, :excluded_attributes, :included_attributes, :methods) + :tree, :excluded_attributes, :included_attributes, :methods, :preloads) end end end @@ -55,6 +55,10 @@ describe Gitlab::ImportExport::Config do events: - :action + preloads: + statuses: + project: + ee: tree: project: @@ -71,6 +75,9 @@ describe Gitlab::ImportExport::Config do - :type_ee events_ee: - :action_ee + preloads: + statuses: + bridge_ee: EOF end @@ -111,6 +118,11 @@ describe Gitlab::ImportExport::Config do methods: { labels: [:type], events: [:action] + }, + preloads: { + statuses: { + project: nil + } } } ) @@ -150,6 +162,12 @@ describe Gitlab::ImportExport::Config do labels: [:type, :type_ee], events: [:action], events_ee: [:action_ee] + }, + preloads: { + statuses: { + project: nil, + bridge_ee: nil + } } } ) diff --git a/spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb b/spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb new file mode 100644 index 0000000000000000000000000000000000000000..d23b27c9d8e262fd4b44d140f50ded31597f7708 --- /dev/null +++ b/spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb @@ -0,0 +1,272 @@ +require 'spec_helper' + +describe Gitlab::ImportExport::FastHashSerializer do + subject { described_class.new(project, tree).execute } + + let!(:project) { setup_project } + let(:user) { create(:user) } + let(:shared) { project.import_export_shared } + let(:reader) { Gitlab::ImportExport::Reader.new(shared: shared) } + let(:tree) { reader.project_tree } + + before do + project.add_maintainer(user) + allow_any_instance_of(MergeRequest).to receive(:source_branch_sha).and_return('ABCD') + allow_any_instance_of(MergeRequest).to receive(:target_branch_sha).and_return('DCBA') + end + + it 'saves the correct hash' do + is_expected.to include({ 'description' => 'description', 'visibility_level' => 20 }) + end + + it 'has approvals_before_merge set' do + expect(subject['approvals_before_merge']).to eq(1) + end + + it 'has milestones' do + expect(subject['milestones']).not_to be_empty + end + + it 'has merge requests' do + expect(subject['merge_requests']).not_to be_empty + end + + it 'has merge request\'s milestones' do + expect(subject['merge_requests'].first['milestone']).not_to be_empty + end + + it 'has merge request\'s source branch SHA' do + expect(subject['merge_requests'].first['source_branch_sha']).to eq('ABCD') + end + + it 'has merge request\'s target branch SHA' do + expect(subject['merge_requests'].first['target_branch_sha']).to eq('DCBA') + end + + it 'has events' do + expect(subject['merge_requests'].first['milestone']['events']).not_to be_empty + end + + it 'has snippets' do + expect(subject['snippets']).not_to be_empty + end + + it 'has snippet notes' do + expect(subject['snippets'].first['notes']).not_to be_empty + end + + it 'has releases' do + expect(subject['releases']).not_to be_empty + end + + it 'has no author on releases' do + expect(subject['releases'].first['author']).to be_nil + end + + it 'has the author ID on releases' do + expect(subject['releases'].first['author_id']).not_to be_nil + end + + it 'has issues' do + expect(subject['issues']).not_to be_empty + end + + it 'has issue comments' do + notes = subject['issues'].first['notes'] + + expect(notes).not_to be_empty + expect(notes.first['type']).to eq('DiscussionNote') + end + + it 'has issue assignees' do + expect(subject['issues'].first['issue_assignees']).not_to be_empty + end + + it 'has author on issue comments' do + expect(subject['issues'].first['notes'].first['author']).not_to be_empty + end + + it 'has project members' do + expect(subject['project_members']).not_to be_empty + end + + it 'has merge requests diffs' do + expect(subject['merge_requests'].first['merge_request_diff']).not_to be_empty + end + + it 'has merge request diff files' do + expect(subject['merge_requests'].first['merge_request_diff']['merge_request_diff_files']).not_to be_empty + end + + it 'has merge request diff commits' do + expect(subject['merge_requests'].first['merge_request_diff']['merge_request_diff_commits']).not_to be_empty + end + + it 'has merge requests comments' do + expect(subject['merge_requests'].first['notes']).not_to be_empty + end + + it 'has author on merge requests comments' do + expect(subject['merge_requests'].first['notes'].first['author']).not_to be_empty + end + + it 'has pipeline stages' do + expect(subject.dig('ci_pipelines', 0, 'stages')).not_to be_empty + end + + it 'has pipeline statuses' do + expect(subject.dig('ci_pipelines', 0, 'stages', 0, 'statuses')).not_to be_empty + end + + it 'has pipeline builds' do + builds_count = subject + .dig('ci_pipelines', 0, 'stages', 0, 'statuses') + .count { |hash| hash['type'] == 'Ci::Build' } + + expect(builds_count).to eq(1) + end + + it 'has no when YML attributes but only the DB column' do + allow_any_instance_of(Ci::Pipeline) + .to receive(:ci_yaml_file) + .and_return(File.read(Rails.root.join('spec/support/gitlab_stubs/gitlab_ci.yml'))) + + expect_any_instance_of(Gitlab::Ci::YamlProcessor).not_to receive(:build_attributes) + + subject + end + + it 'has pipeline commits' do + expect(subject['ci_pipelines']).not_to be_empty + end + + it 'has ci pipeline notes' do + expect(subject['ci_pipelines'].first['notes']).not_to be_empty + end + + it 'has labels with no associations' do + expect(subject['labels']).not_to be_empty + end + + it 'has labels associated to records' do + expect(subject['issues'].first['label_links'].first['label']).not_to be_empty + end + + it 'has project and group labels' do + label_types = subject['issues'].first['label_links'].map { |link| link['label']['type'] } + + expect(label_types).to match_array(%w(ProjectLabel GroupLabel)) + end + + it 'has priorities associated to labels' do + priorities = subject['issues'].first['label_links'].flat_map { |link| link['label']['priorities'] } + + expect(priorities).not_to be_empty + end + + it 'has issue resource label events' do + expect(subject['issues'].first['resource_label_events']).not_to be_empty + end + + it 'has merge request resource label events' do + expect(subject['merge_requests'].first['resource_label_events']).not_to be_empty + end + + it 'saves the correct service type' do + expect(subject['services'].first['type']).to eq('CustomIssueTrackerService') + end + + it 'saves the properties for a service' do + expect(subject['services'].first['properties']).to eq('one' => 'value') + end + + it 'has project feature' do + project_feature = subject['project_feature'] + expect(project_feature).not_to be_empty + expect(project_feature["issues_access_level"]).to eq(ProjectFeature::DISABLED) + expect(project_feature["wiki_access_level"]).to eq(ProjectFeature::ENABLED) + expect(project_feature["builds_access_level"]).to eq(ProjectFeature::PRIVATE) + end + + it 'has custom attributes' do + expect(subject['custom_attributes'].count).to eq(2) + end + + it 'has badges' do + expect(subject['project_badges'].count).to eq(2) + end + + it 'does not complain about non UTF-8 characters in MR diff files' do + ActiveRecord::Base.connection.execute("UPDATE merge_request_diff_files SET diff = '---\n- :diff: !binary |-\n LS0tIC9kZXYvbnVsbAorKysgYi9pbWFnZXMvbnVjb3IucGRmCkBAIC0wLDAg\n KzEsMTY3OSBAQAorJVBERi0xLjUNJeLjz9MNCisxIDAgb2JqDTw8L01ldGFk\n YXR'") + + expect(subject['merge_requests'].first['merge_request_diff']).not_to be_empty + end + + context 'project attributes' do + it 'does not contain the runners token' do + expect(subject).not_to include("runners_token" => 'token') + end + end + + it 'has a board and a list' do + expect(subject['boards'].first['lists']).not_to be_empty + end + + def setup_project + issue = create(:issue, assignees: [user]) + snippet = create(:project_snippet) + release = create(:release) + group = create(:group) + + project = create(:project, + :public, + :repository, + :issues_disabled, + :wiki_enabled, + :builds_private, + description: 'description', + issues: [issue], + snippets: [snippet], + releases: [release], + group: group, + approvals_before_merge: 1 + ) + project_label = create(:label, project: project) + group_label = create(:group_label, group: group) + create(:label_link, label: project_label, target: issue) + create(:label_link, label: group_label, target: issue) + create(:label_priority, label: group_label, priority: 1) + milestone = create(:milestone, project: project) + merge_request = create(:merge_request, source_project: project, milestone: milestone) + + ci_build = create(:ci_build, project: project, when: nil) + ci_build.pipeline.update(project: project) + create(:commit_status, project: project, pipeline: ci_build.pipeline) + + create(:milestone, project: project) + create(:discussion_note, noteable: issue, project: project) + create(:note, noteable: merge_request, project: project) + create(:note, noteable: snippet, project: project) + create(:note_on_commit, + author: user, + project: project, + commit_id: ci_build.pipeline.sha) + + create(:resource_label_event, label: project_label, issue: issue) + create(:resource_label_event, label: group_label, merge_request: merge_request) + + create(:event, :created, target: milestone, project: project, author: user) + create(:service, project: project, type: 'CustomIssueTrackerService', category: 'issue_tracker', properties: { one: 'value' }) + + create(:project_custom_attribute, project: project) + create(:project_custom_attribute, project: project) + + create(:project_badge, project: project) + create(:project_badge, project: project) + + board = create(:board, project: project, name: 'TestBoard') + create(:list, board: board, position: 0, label: project_label) + + project + end +end diff --git a/spec/lib/gitlab/import_export/project_tree_saver_spec.rb b/spec/lib/gitlab/import_export/project_tree_saver_spec.rb index fefbed9331687849c5cd7bdfd19d4015ccc59f85..ff46e062a5dde665a9f7f566804cb20890f5fe0b 100644 --- a/spec/lib/gitlab/import_export/project_tree_saver_spec.rb +++ b/spec/lib/gitlab/import_export/project_tree_saver_spec.rb @@ -23,12 +23,65 @@ describe Gitlab::ImportExport::ProjectTreeSaver do expect(project_tree_saver.save).to be true end + context ':export_fast_serialize feature flag checks' do + before do + expect(Gitlab::ImportExport::Reader).to receive(:new).with(shared: shared).and_return(reader) + expect(reader).to receive(:project_tree).and_return(project_tree) + end + + let(:serializer) { instance_double('Gitlab::ImportExport::FastHashSerializer') } + let(:reader) { instance_double('Gitlab::ImportExport::Reader') } + let(:project_tree) do + { + include: [{ issues: { include: [] } }], + preload: { issues: nil } + } + end + + context 'when :export_fast_serialize feature is enabled' do + before do + stub_feature_flags(export_fast_serialize: true) + end + + it 'uses FastHashSerializer' do + expect(Gitlab::ImportExport::FastHashSerializer) + .to receive(:new) + .with(project, project_tree) + .and_return(serializer) + + expect(serializer).to receive(:execute) + + project_tree_saver.save + end + end + + context 'when :export_fast_serialize feature is disabled' do + before do + stub_feature_flags(export_fast_serialize: false) + end + + it 'is serialized via built-in `as_json`' do + expect(project).to receive(:as_json).with(project_tree) + + project_tree_saver.save + end + end + end + + # It is mostly duplicated in + # `spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb` + # except: + # context 'with description override' do + # context 'group members' do + # ^ These are specific for the ProjectTreeSaver context 'JSON' do let(:saved_project_json) do project_tree_saver.save project_json(project_tree_saver.full_path) end + # It is not duplicated in + # `spec/lib/gitlab/import_export/fast_hash_serializer_spec.rb` context 'with description override' do let(:params) { { description: 'Foo Bar' } } let(:project_tree_saver) { described_class.new(project: project, current_user: user, shared: shared, params: params) } diff --git a/spec/support/import_export/import_export.yml b/spec/support/import_export/import_export.yml index ed2a3243f0d3e588385b4e89fb1915125054b485..116bc8d0b9ca5cc38c52a6b93b3d01fcdffec03a 100644 --- a/spec/support/import_export/import_export.yml +++ b/spec/support/import_export/import_export.yml @@ -13,6 +13,10 @@ tree: group_members: - :user +preloads: + merge_request_diff: + source_project: + included_attributes: merge_requests: - :id