diff --git a/lib/gitlab/encoding_helper.rb b/lib/gitlab/encoding_helper.rb index 8ac756a0b6ac157cea5cb6a7041b7fb7c2718076..c50417f4d4f5839d282497ab7291461ad6bf9e72 100644 --- a/lib/gitlab/encoding_helper.rb +++ b/lib/gitlab/encoding_helper.rb @@ -24,7 +24,7 @@ module Gitlab # return message if message type is binary detect = CharlockHolmes::EncodingDetector.detect(message) - return message.force_encoding("BINARY") if all_binary?(message, detect) + return message.force_encoding("BINARY") if detect_binary?(message, detect) if detect && detect[:encoding] && detect[:confidence] > ENCODING_CONFIDENCE_THRESHOLD # force detected encoding if we have sufficient confidence. @@ -38,17 +38,17 @@ module Gitlab "--broken encoding: #{encoding}" end - def all_binary?(data, detect = nil) + def detect_binary?(data, detect = nil) detect ||= CharlockHolmes::EncodingDetector.detect(data) - detect && detect[:type] == :binary + detect && detect[:type] == :binary && detect[:confidence] == 100 end - def libgit2_binary?(data) + def detect_libgit2_binary?(data) # EncodingDetector checks the first 1024 * 1024 bytes for NUL byte, libgit2 checks # only the first 8000 (https://github.com/libgit2/libgit2/blob/2ed855a9e8f9af211e7274021c2264e600c0f86b/src/filter.h#L15), # which is what we use below to keep a consistent behavior. detect = CharlockHolmes::EncodingDetector.new(8000).detect(data) - all_binary?(data, detect) + detect && detect[:type] == :binary end def encode_utf8(message) diff --git a/lib/gitlab/git/blob.rb b/lib/gitlab/git/blob.rb index e5391b8bf8a0d0725e54b8c609a7d31b591f17e6..8d96826f6ee616da2fc3654d4e64fc1dab5536c2 100644 --- a/lib/gitlab/git/blob.rb +++ b/lib/gitlab/git/blob.rb @@ -58,7 +58,7 @@ module Gitlab end def binary?(data) - EncodingHelper.libgit2_binary?(data) + EncodingHelper.detect_libgit2_binary?(data) end private diff --git a/lib/gitlab/git/diff.rb b/lib/gitlab/git/diff.rb index b3237d5496abb07d90fa606cf982d7dcfac57f9f..a23c8cf0dd1188fbca507d02723f31f04371ec2f 100644 --- a/lib/gitlab/git/diff.rb +++ b/lib/gitlab/git/diff.rb @@ -120,6 +120,8 @@ module Gitlab # Return a binary diff message like: # # "Binary files a/file/path and b/file/path differ\n" + # This is used when we detect that a diff is binary + # using CharlockHolmes when Rugged treats it as text. def binary_message(old_path, new_path) "Binary files #{old_path} and #{new_path} differ\n" end @@ -198,7 +200,7 @@ module Gitlab end def json_safe_diff - return @diff unless all_binary?(@diff) + return @diff unless detect_binary?(@diff) # the diff is binary, let's make a message for it Diff.binary_message(@old_path, @new_path) diff --git a/spec/requests/api/commits_spec.rb b/spec/requests/api/commits_spec.rb index edbfaf510c5ee7d3dd9cc27e45792fc48b0e0d5e..f663719d28c051e74a1d9f90c104a8a32b7220b1 100644 --- a/spec/requests/api/commits_spec.rb +++ b/spec/requests/api/commits_spec.rb @@ -673,6 +673,12 @@ describe API::Commits do it_behaves_like 'ref diff' end end + + context 'when binary diff are treated as text' do + let(:commit_id) { TestEnv::BRANCH_SHA['add-pdf-text-binary'] } + + it_behaves_like 'ref diff' + end end end