diff --git a/Makefile b/Makefile index af7e809ced694635d1281dbcf3f0ab234ce3c137..f0b2299172cf631470e1935869021bebb0e5ea96 100644 --- a/Makefile +++ b/Makefile @@ -259,6 +259,10 @@ all:: # Define OLD_ICONV if your library has an old iconv(), where the second # (input buffer pointer) parameter is declared with type (const char **). # +# Define ICONV_OMITS_BOM if your iconv implementation does not write a +# byte-order mark (BOM) when writing UTF-16 or UTF-32 and always writes in +# big-endian format. +# # Define NO_DEFLATE_BOUND if your zlib does not have deflateBound. # # Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib" @@ -1417,6 +1421,9 @@ ifndef NO_ICONV EXTLIBS += $(ICONV_LINK) -liconv endif endif +ifdef ICONV_OMITS_BOM + BASIC_CFLAGS += -DICONV_OMITS_BOM +endif ifdef NEEDS_LIBGEN EXTLIBS += -lgen endif diff --git a/t/t0028-working-tree-encoding.sh b/t/t0028-working-tree-encoding.sh index e58ecbfc44037fa1d38707b30545e803e1401b4f..500229a9bd6245c8a93ddc10b008bc6afc2d9f71 100755 --- a/t/t0028-working-tree-encoding.sh +++ b/t/t0028-working-tree-encoding.sh @@ -6,6 +6,30 @@ test_description='working-tree-encoding conversion via gitattributes' GIT_TRACE_WORKING_TREE_ENCODING=1 && export GIT_TRACE_WORKING_TREE_ENCODING +test_lazy_prereq NO_UTF16_BOM ' + test $(printf abc | iconv -f UTF-8 -t UTF-16 | wc -c) = 6 +' + +test_lazy_prereq NO_UTF32_BOM ' + test $(printf abc | iconv -f UTF-8 -t UTF-32 | wc -c) = 12 +' + +write_utf16 () { + if test_have_prereq NO_UTF16_BOM + then + printf '\376\377' + fi && + iconv -f UTF-8 -t UTF-16 +} + +write_utf32 () { + if test_have_prereq NO_UTF32_BOM + then + printf '\000\000\376\377' + fi && + iconv -f UTF-8 -t UTF-32 +} + test_expect_success 'setup test files' ' git config core.eol lf && @@ -13,8 +37,8 @@ test_expect_success 'setup test files' ' echo "*.utf16 text working-tree-encoding=utf-16" >.gitattributes && echo "*.utf16lebom text working-tree-encoding=UTF-16LE-BOM" >>.gitattributes && printf "$text" 
>test.utf8.raw && - printf "$text" | iconv -f UTF-8 -t UTF-16 >test.utf16.raw && - printf "$text" | iconv -f UTF-8 -t UTF-32 >test.utf32.raw && + printf "$text" | write_utf16 >test.utf16.raw && + printf "$text" | write_utf32 >test.utf32.raw && printf "\377\376" >test.utf16lebom.raw && printf "$text" | iconv -f UTF-8 -t UTF-32LE >>test.utf16lebom.raw && @@ -124,8 +148,8 @@ do test_when_finished "rm -f crlf.utf${i}.raw lf.utf${i}.raw" && test_when_finished "git reset --hard HEAD^" && - cat lf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >lf.utf${i}.raw && - cat crlf.utf8.raw | iconv -f UTF-8 -t UTF-${i} >crlf.utf${i}.raw && + cat lf.utf8.raw | write_utf${i} >lf.utf${i}.raw && + cat crlf.utf8.raw | write_utf${i} >crlf.utf${i}.raw && cp crlf.utf${i}.raw eol.utf${i} && cat >expectIndexLF <<-EOF && @@ -223,7 +247,7 @@ test_expect_success ICONV_SHIFT_JIS 'check roundtrip encoding' ' text="hallo there!\nroundtrip test here!" && printf "$text" | iconv -f UTF-8 -t SHIFT-JIS >roundtrip.shift && - printf "$text" | iconv -f UTF-8 -t UTF-16 >roundtrip.utf16 && + printf "$text" | write_utf16 >roundtrip.utf16 && echo "*.shift text working-tree-encoding=SHIFT-JIS" >>.gitattributes && # SHIFT-JIS encoded files are round-trip checked by default... diff --git a/utf8.c b/utf8.c index 83824dc2f4ab151a19418c61c46e0c1ffbb0e42c..3b42fadffd7ccb89a5658fdf8d314014f299a769 100644 --- a/utf8.c +++ b/utf8.c @@ -559,6 +559,10 @@ char *reencode_string_len(const char *in, size_t insz, /* * For writing, UTF-16 iconv typically creates "UTF-16BE-BOM" * Some users under Windows want the little endian version + * + * We handle UTF-16 and UTF-32 ourselves only if the platform does not + * provide a BOM (which we require), since we want to match the behavior + * of the system tools and libc as much as possible. 
*/ if (same_utf_encoding("UTF-16LE-BOM", out_encoding)) { bom_str = utf16_le_bom; @@ -568,6 +572,16 @@ char *reencode_string_len(const char *in, size_t insz, bom_str = utf16_be_bom; bom_len = sizeof(utf16_be_bom); out_encoding = "UTF-16BE"; +#ifdef ICONV_OMITS_BOM + } else if (same_utf_encoding("UTF-16", out_encoding)) { + bom_str = utf16_be_bom; + bom_len = sizeof(utf16_be_bom); + out_encoding = "UTF-16BE"; + } else if (same_utf_encoding("UTF-32", out_encoding)) { + bom_str = utf32_be_bom; + bom_len = sizeof(utf32_be_bom); + out_encoding = "UTF-32BE"; +#endif } conv = iconv_open(out_encoding, in_encoding);