From c7e3879c782f5434525639fe585b98e4aa0d9298 Mon Sep 17 00:00:00 2001 From: Matt Pharr Date: Wed, 2 Feb 2022 07:10:03 -0800 Subject: [PATCH] Unicode normalization (#225) Normalize Unicode strings for user-supplied names (objects, materials, media, etc.) Note that there is no need to normalize strings for things like the name of the selected sampler, light source types, or the parameters provided to pbrt objects, as all of the valid ones are plain old ASCII text. We also intentionally do not normalize pathnames, as doing so can cause all sorts of trouble. --- .gitmodules | 3 +++ CMakeLists.txt | 3 +++ src/ext/CMakeLists.txt | 7 +++++++ src/ext/utf8proc | 1 + src/pbrt/scene.cpp | 33 +++++++++++++++++++++++---------- src/pbrt/util/string.cpp | 18 ++++++++++++++++++ src/pbrt/util/string.h | 2 ++ src/pbrt/util/string_test.cpp | 26 ++++++++++++++++++++++++++ 8 files changed, 83 insertions(+), 10 deletions(-) create mode 160000 src/ext/utf8proc create mode 100644 src/pbrt/util/string_test.cpp diff --git a/.gitmodules b/.gitmodules index b299bd0..a908a29 100644 --- a/.gitmodules +++ b/.gitmodules @@ -27,3 +27,6 @@ [submodule "src/ext/lodepng"] path = src/ext/lodepng url = https://github.com/lvandeve/lodepng.git +[submodule "src/ext/utf8proc"] + path = src/ext/utf8proc + url = https://github.com/JuliaStrings/utf8proc.git diff --git a/CMakeLists.txt b/CMakeLists.txt index 4465ed1..1619858 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -65,6 +65,7 @@ check_ext ("filesystem" "filesystem/filesystem" c5f9de30142453eb3c6fe991e82dfc25 check_ext ("libdeflate" "libdeflate/common" 1fd0bea6ca2073c68493632dafc4b1ddda1bcbc3) check_ext ("lodepng" "lodepng/examples" 8c6a9e30576f07bf470ad6f09458a2dcd7a6a84a) check_ext ("stb" "stb/tools" af1a5bc352164740c1cc1354942b1c6b72eacb8a) +check_ext ("utf8proc" "utf8proc/bench" 2484e2ed5e1d9c19edcccf392a7d9920ad90dfaf) check_ext ("zlib" "zlib/doc" 54d591eabf9fe0e84c725638f8d5d8d202a093fa) add_compile_definitions ("$<$:PBRT_DEBUG_BUILD>") @@ 
-861,6 +862,7 @@ set (ALL_PBRT_LIBS ${LIBDEFLATE_LIBRARIES} double-conversion ${PBRT_CUDA_LIB} + utf8proc ) if (PBRT_CUDA_ENABLED) @@ -1024,6 +1026,7 @@ set (PBRT_TEST_SOURCE src/pbrt/util/sampling_test.cpp src/pbrt/util/spectrum_test.cpp src/pbrt/util/splines_test.cpp + src/pbrt/util/string_test.cpp src/pbrt/util/taggedptr_test.cpp src/pbrt/util/transform_test.cpp src/pbrt/util/vecmath_test.cpp diff --git a/src/ext/CMakeLists.txt b/src/ext/CMakeLists.txt index d7f47f6..3968d18 100644 --- a/src/ext/CMakeLists.txt +++ b/src/ext/CMakeLists.txt @@ -139,3 +139,10 @@ set (FLIP_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/flip PARENT_SCOPE) add_library (flip_lib STATIC ${CMAKE_CURRENT_SOURCE_DIR}/flip/flip.cpp) set_property (TARGET flip_lib PROPERTY FOLDER "ext") + +########################################################################### +# utf8proc + +set (UTF8PROC_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/utf8proc PARENT_SCOPE) + +add_subdirectory (utf8proc) diff --git a/src/ext/utf8proc b/src/ext/utf8proc new file mode 160000 index 0000000..2484e2e --- /dev/null +++ b/src/ext/utf8proc @@ -0,0 +1 @@ +Subproject commit 2484e2ed5e1d9c19edcccf392a7d9920ad90dfaf diff --git a/src/pbrt/scene.cpp b/src/pbrt/scene.cpp index 8f99579..91ec026 100644 --- a/src/pbrt/scene.cpp +++ b/src/pbrt/scene.cpp @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -122,11 +123,13 @@ void BasicSceneBuilder::Translate(Float dx, Float dy, Float dz, FileLoc loc) { [=](auto t) { return t * pbrt::Translate(Vector3f(dx, dy, dz)); }); } -void BasicSceneBuilder::CoordinateSystem(const std::string &name, FileLoc loc) { +void BasicSceneBuilder::CoordinateSystem(const std::string &origName, FileLoc loc) { + std::string name = NormalizeUTF8(origName); namedCoordinateSystems[name] = graphicsState.ctm; } -void BasicSceneBuilder::CoordSysTransform(const std::string &name, FileLoc loc) { +void BasicSceneBuilder::CoordSysTransform(const std::string &origName, FileLoc loc) { + std::string name = 
NormalizeUTF8(origName); if (namedCoordinateSystems.find(name) != namedCoordinateSystems.end()) graphicsState.ctm = namedCoordinateSystems[name]; else @@ -230,8 +233,9 @@ void BasicSceneBuilder::WorldBegin(FileLoc loc) { scene->SetOptions(filter, film, camera, sampler, integrator, accelerator); } -void BasicSceneBuilder::MakeNamedMedium(const std::string &name, +void BasicSceneBuilder::MakeNamedMedium(const std::string &origName, ParsedParameterVector params, FileLoc loc) { + std::string name = NormalizeUTF8(origName); // Issue error if medium _name_ is multiply defined if (mediumNames.find(name) != mediumNames.end()) { ErrorExitDeferred(&loc, "Named medium \"%s\" redefined.", name); @@ -302,7 +306,9 @@ void BasicSceneBuilder::Shape(const std::string &name, ParsedParameterVector par } } -void BasicSceneBuilder::ObjectBegin(const std::string &name, FileLoc loc) { +void BasicSceneBuilder::ObjectBegin(const std::string &origName, FileLoc loc) { + std::string name = NormalizeUTF8(origName); + VERIFY_WORLD("ObjectBegin"); pushedGraphicsStates.push_back(graphicsState); @@ -356,7 +362,8 @@ void BasicSceneBuilder::ObjectEnd(FileLoc loc) { activeInstanceDefinition = nullptr; } -void BasicSceneBuilder::ObjectInstance(const std::string &name, FileLoc loc) { +void BasicSceneBuilder::ObjectInstance(const std::string &origName, FileLoc loc) { + std::string name = NormalizeUTF8(origName); VERIFY_WORLD("ObjectInstance"); if (activeInstanceDefinition) { @@ -643,15 +650,19 @@ void BasicSceneBuilder::Integrator(const std::string &name, ParsedParameterVecto integrator = SceneEntity(name, std::move(dict), loc); } -void BasicSceneBuilder::MediumInterface(const std::string &insideName, - const std::string &outsideName, FileLoc loc) { +void BasicSceneBuilder::MediumInterface(const std::string &origInsideName, + const std::string &origOutsideName, FileLoc loc) { + std::string insideName = NormalizeUTF8(origInsideName); + std::string outsideName = NormalizeUTF8(origOutsideName); + 
graphicsState.currentInsideMedium = insideName; graphicsState.currentOutsideMedium = outsideName; } -void BasicSceneBuilder::Texture(const std::string &name, const std::string &type, +void BasicSceneBuilder::Texture(const std::string &origName, const std::string &type, const std::string &texname, ParsedParameterVector params, FileLoc loc) { + std::string name = NormalizeUTF8(origName); VERIFY_WORLD("Texture"); ParameterDictionary dict(std::move(params), graphicsState.textureAttributes, @@ -691,8 +702,9 @@ void BasicSceneBuilder::Material(const std::string &name, ParsedParameterVector graphicsState.currentMaterialName.clear(); } -void BasicSceneBuilder::MakeNamedMaterial(const std::string &name, +void BasicSceneBuilder::MakeNamedMaterial(const std::string &origName, ParsedParameterVector params, FileLoc loc) { + std::string name = NormalizeUTF8(origName); VERIFY_WORLD("MakeNamedMaterial"); ParameterDictionary dict(std::move(params), graphicsState.materialAttributes, @@ -707,7 +719,8 @@ void BasicSceneBuilder::MakeNamedMaterial(const std::string &name, scene->AddNamedMaterial(name, SceneEntity("", std::move(dict), loc)); } -void BasicSceneBuilder::NamedMaterial(const std::string &name, FileLoc loc) { +void BasicSceneBuilder::NamedMaterial(const std::string &origName, FileLoc loc) { + std::string name = NormalizeUTF8(origName); VERIFY_WORLD("NamedMaterial"); graphicsState.currentMaterialName = name; graphicsState.currentMaterialIndex = -1; diff --git a/src/pbrt/util/string.cpp b/src/pbrt/util/string.cpp index 4d10875..6a24862 100644 --- a/src/pbrt/util/string.cpp +++ b/src/pbrt/util/string.cpp @@ -9,6 +9,10 @@ #include #include +#include + +#define UTF8PROC_STATIC +#include #include #include @@ -185,4 +189,18 @@ std::u16string UTF16FromUTF8(std::string str) { return utf16; } +std::string NormalizeUTF8(std::string str) { + utf8proc_option_t options = UTF8PROC_COMPOSE; + + utf8proc_uint8_t *result; + utf8proc_ssize_t length = utf8proc_map((const unsigned char 
*)str.data(), str.size(), + &result, options); + if (length < 0) + ErrorExit("Unicode normalization error: %s: \"%s\"", utf8proc_errmsg(length), str); + + str = std::string(result, result + length); + free(result); + return str; +} + } // namespace pbrt diff --git a/src/pbrt/util/string.h b/src/pbrt/util/string.h index 8db24f2..319e80e 100644 --- a/src/pbrt/util/string.h +++ b/src/pbrt/util/string.h @@ -36,6 +36,8 @@ std::wstring WStringFromUTF8(std::string str); std::string UTF8FromWString(std::wstring str); #endif // PBRT_IS_WINDOWS +std::string NormalizeUTF8(std::string str); + // InternedString Definition class InternedString { public: diff --git a/src/pbrt/util/string_test.cpp b/src/pbrt/util/string_test.cpp new file mode 100644 index 0000000..11fc0cc --- /dev/null +++ b/src/pbrt/util/string_test.cpp @@ -0,0 +1,26 @@ +// pbrt is Copyright(c) 1998-2020 Matt Pharr, Wenzel Jakob, and Greg Humphreys. +// The pbrt source code is licensed under the Apache License, Version 2.0. +// SPDX: Apache-2.0 + +#include + +#include +#include + +#include + +using namespace pbrt; + +TEST(Unicode, BasicNormalization) { + // "Amélie" two ways, via https://en.wikipedia.org/wiki/Unicode_equivalence + std::u16string nfc16(u"\u0041\u006d\u00e9\u006c\u0069\u0065"); + std::u16string nfd16(u"\u0041\u006d\u0065\u0301\u006c\u0069\u0065"); + EXPECT_NE(nfc16, nfd16); + + std::string nfc8 = UTF8FromUTF16(nfc16); + std::string nfd8 = UTF8FromUTF16(nfd16); + EXPECT_NE(nfc8, nfd8); + + EXPECT_EQ(nfc8, NormalizeUTF8(nfc8)); // nfc is already normalized + EXPECT_EQ(nfc8, NormalizeUTF8(nfd8)); // normalizing nfd should make it equal nfc +} -- GitLab