From 421c66b76c3d51e764bd3d8ea25acae89a5b222d Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Wed, 6 May 2009 16:15:21 +0000 Subject: [PATCH] Modify CREATE DATABASE to enforce that the source database's encoding setting must be used for the new database, except when copying from template0. This is the same rule that we now enforce for locale settings, and it has the same motivation: databases other than template0 might contain data that would be invalid according to a different setting. This represents another step in a continuing process of locking down ways in which encoding violations could occur inside the backend. Per discussion of a few days ago. In passing, fix pre-existing breakage of mbregress.sh, and fix up a couple of ereport() calls in dbcommands.c that failed to specify sqlstate codes. --- doc/src/sgml/charset.sgml | 36 +++++++++++++++---------- doc/src/sgml/manage-ag.sgml | 30 +++++++++++++-------- doc/src/sgml/ref/create_database.sgml | 39 ++++++++++++++++----------- src/backend/commands/dbcommands.c | 38 +++++++++++++++++++------- src/test/mb/README | 4 +-- src/test/mb/mbregress.sh | 6 ++--- 6 files changed, 97 insertions(+), 56 deletions(-) diff --git a/doc/src/sgml/charset.sgml b/doc/src/sgml/charset.sgml index 715824f21d..d6ab34f9e3 100644 --- a/doc/src/sgml/charset.sgml +++ b/doc/src/sgml/charset.sgml @@ -1,4 +1,4 @@ - + Localization</> @@ -20,11 +20,9 @@ <listitem> <para> - Providing a number of different character sets defined in the - <productname>PostgreSQL</productname> server, including - multiple-byte character sets, to support storing text in all - kinds of languages, and providing character set translation between - client and server. + Providing a number of different character sets to support storing text + in all kinds of languages, and providing character set translation + between client and server. </para> </listitem> </itemizedlist> @@ -75,8 +73,8 @@ initdb --locale=sv_SE names on your system depends on what was provided by the operating system vendor and what was installed. On most Unix systems, the command <literal>locale -a</> will provide a list of available locales. - Windows uses more verbose names, such as <literal>German_Germany</> - or <literal>Swedish_Sweden.1252</>. + Windows uses more verbose locale names, such as <literal>German_Germany</> + or <literal>Swedish_Sweden.1252</>, but the principles are the same. </para> <para> @@ -133,7 +131,7 @@ initdb --locale=sv_SE fixed when the database is created. You can use different settings for different databases, but once a database is created, you cannot change them for that database anymore. <literal>LC_COLLATE</literal> - and <literal>LC_CTYPE</literal> are those categories. They affect + and <literal>LC_CTYPE</literal> are these categories. They affect the sort order of indexes, so they must be kept fixed, or indexes on text columns will become corrupt. The default values for these categories are determined when <command>initdb</command> is run, and @@ -169,7 +167,7 @@ initdb --locale=sv_SE For a given locale category, say the collation, the following environment variables are consulted in this order until one is found to be set: <envar>LC_ALL</envar>, <envar>LC_COLLATE</envar> - (the variable corresponding to the respective category), + (or the variable corresponding to the respective category), <envar>LANG</envar>. If none of these environment variables are set then the locale defaults to <literal>C</literal>. </para> @@ -186,8 +184,9 @@ initdb --locale=sv_SE <para> To enable messages to be translated to the user's preferred language, - <acronym>NLS</acronym> must have been enabled at build time. This - choice is independent of the other locale support. + <acronym>NLS</acronym> must have been selected at build time + (<literal>configure --enable-nls</>). All other locale support is + built in automatically. </para> </sect2> @@ -325,6 +324,7 @@ initdb --locale=sv_SE <envar>LC_COLLATE</> locale settings. For <literal>C</> or <literal>POSIX</> locale, any character set is allowed, but for other locales there is only one character set that will work correctly. + (On Windows, however, UTF-8 encoding can be used with any locale.) </para> <sect2 id="multibyte-charset-supported"> @@ -752,6 +752,14 @@ createdb -E EUC_KR -T template0 --lc-collate=ko_KR.euckr --lc-ctype=ko_KR.euckr CREATE DATABASE korean WITH ENCODING 'EUC_KR' LC_COLLATE='ko_KR.euckr' LC_CTYPE='ko_KR.euckr' TEMPLATE=template0; </programlisting> + Notice that the above commands specify copying the <literal>template0</> + database. When copying any other database, the encoding and locale + settings cannot be changed from those of the source database, because + that might result in corrupt data. For more information see + <xref linkend="manage-ag-templatedbs">. + </para> + + <para> The encoding for a database is stored in the system catalog <literal>pg_database</literal>. You can see it by using the <option>-l</option> option or the <command>\l</command> command @@ -777,7 +785,7 @@ $ <userinput>psql -l</userinput> <para> On most modern operating systems, <productname>PostgreSQL</productname> can determine which character set is implied by an <envar>LC_CTYPE</> - setting, and it will enforce that only the correct database encoding is + setting, and it will enforce that only the matching database encoding is used. On older systems it is your responsibility to ensure that you use the encoding expected by the locale you have selected. A mistake in this area is likely to lead to strange misbehavior of locale-dependent @@ -1225,7 +1233,7 @@ RESET client_encoding; <listitem> <para> - The web site of the Unicode Consortium + The web site of the Unicode Consortium. </para> </listitem> </varlistentry> diff --git a/doc/src/sgml/manage-ag.sgml b/doc/src/sgml/manage-ag.sgml index cee2f51441..a2c196f1dd 100644 --- a/doc/src/sgml/manage-ag.sgml +++ b/doc/src/sgml/manage-ag.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/manage-ag.sgml,v 2.57 2007/11/08 15:21:03 momjian Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/manage-ag.sgml,v 2.58 2009/05/06 16:15:20 tgl Exp $ --> <chapter id="managing-databases"> <title>Managing Databases @@ -203,8 +203,17 @@ createdb -O rolename dbname template1. This is particularly handy when restoring a pg_dump dump: the dump script should be restored in a virgin database to ensure that one recreates the correct contents - of the dumped database, without any conflicts with additions that - can now be present in template1. + of the dumped database, without any conflicts with objects that + might have been added to template1 later on. + + + + Another common reason for copying template0 instead + of template1 is that new encoding and locale settings + can be specified when copying template0, whereas a copy + of template1 must use the same settings it does. + This is because template1 might contain encoding-specific + or locale-specific data, while template0 is known not to. @@ -238,9 +247,8 @@ createdb -T template0 dbname datallowconn. datistemplate can be set to indicate that a database is intended as a template for CREATE DATABASE. If this flag is set, the database can be - cloned by - any user with CREATEDB privileges; if it is not set, only superusers - and the owner of the database can clone it. + cloned by any user with CREATEDB privileges; if it is not set, + only superusers and the owner of the database can clone it. If datallowconn is false, then no new connections to that database will be allowed (but existing sessions are not killed simply by setting the flag false). The template0 @@ -305,14 +313,14 @@ ALTER DATABASE mydb SET geqo TO off; Destroying a Database - Databases are destroyed with the command + Databases are destroyed with the command :DROP DATABASE DROP DATABASE name; Only the owner of the database, or a superuser, can drop a database. Dropping a database removes all objects - that were + that were contained within the database. The destruction of a database cannot be undone. @@ -403,8 +411,8 @@ CREATE TABLESPACE fastspace LOCATION '/mnt/sda1/postgresql/data'; Tables, indexes, and entire databases can be assigned to particular tablespaces. To do so, a user with the CREATE - privilege on a given tablespace must pass the tablespace name as a - parameter to the relevant command. For example, the following creates + privilege on a given tablespace must pass the tablespace name as a + parameter to the relevant command. For example, the following creates a table in the tablespace space1: CREATE TABLE foo(i int) TABLESPACE space1; @@ -493,7 +501,7 @@ SELECT spcname FROM pg_tablespace; update the pg_tablespace catalog to show the new locations. (If you do not, pg_dump will continue to show the old tablespace locations.) - + diff --git a/doc/src/sgml/ref/create_database.sgml b/doc/src/sgml/ref/create_database.sgml index 5866ca7ef5..786a63b702 100644 --- a/doc/src/sgml/ref/create_database.sgml +++ b/doc/src/sgml/ref/create_database.sgml @@ -1,5 +1,5 @@ @@ -116,11 +116,11 @@ CREATE DATABASE name - collate + lc_collate Collation order (LC_COLLATE) to use in the new database. - This affects the sort order applied to strings, e.g in queries with + This affects the sort order applied to strings, e.g. in queries with ORDER BY, as well as the order used in indexes on text columns. The default is to use the collation order of the template database. See below for additional restrictions. @@ -128,7 +128,7 @@ CREATE DATABASE name - ctype + lc_ctype Character classification (LC_CTYPE) to use in the new @@ -207,25 +207,27 @@ CREATE DATABASE name The character set encoding specified for the new database must be - compatible with the chosen LC_COLLATE and LC_CTYPE settings. - If LC_CTYPE is C (or equivalently + compatible with the chosen locale settings (LC_COLLATE and + LC_CTYPE). If the locale is C (or equivalently POSIX), then all encodings are allowed, but for other locale settings there is only one encoding that will work properly. + (On Windows, however, UTF-8 encoding can be used with any locale.) CREATE DATABASE will allow superusers to specify - SQL_ASCII encoding regardless of the locale setting, + SQL_ASCII encoding regardless of the locale settings, but this choice is deprecated and may result in misbehavior of character-string functions if data that is not encoding-compatible with the locale is stored in the database. - The LC_COLLATE and LC_CTYPE settings must match - those of the template database, except when template0 is used as - template. This is because LC_COLLATE and LC_CTYPE - affects the ordering in indexes, so that any indexes copied from the - template database would be invalid in the new database with different - settings. template0, however, is known to not - contain any indexes that would be affected. + The encoding and locale settings must match those of the template database, + except when template0 is used as template. This is because + other databases might contain data that does not match the specified + encoding, or might contain indexes whose sort ordering is affected by + LC_COLLATE and LC_CTYPE. Copying such data would + result in a database that is corrupt according to the new settings. + template0, however, is known to not contain any data or + indexes that would be affected. @@ -257,12 +259,17 @@ CREATE DATABASE sales OWNER salesapp TABLESPACE salesspace; - To create a database music which supports the ISO-8859-1 + To create a database music which supports the ISO-8859-1 character set: -CREATE DATABASE music ENCODING 'LATIN1'; +CREATE DATABASE music ENCODING 'LATIN1' TEMPLATE template0; + + In this example, the TEMPLATE template0 clause would only + be required if template1's encoding is not ISO-8859-1. + Note that changing encoding might require selecting new + LC_COLLATE and LC_CTYPE settings as well. diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c index cf6e740098..8d5972dfa5 100644 --- a/src/backend/commands/dbcommands.c +++ b/src/backend/commands/dbcommands.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.223 2009/05/05 23:39:55 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/dbcommands.c,v 1.224 2009/05/06 16:15:21 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -361,7 +361,8 @@ createdb(const CreatedbStmt *stmt) #endif (encoding == PG_SQL_ASCII && superuser()))) ereport(ERROR, - (errmsg("encoding %s does not match locale %s", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("encoding %s does not match locale %s", pg_encoding_to_char(encoding), dbctype), errdetail("The chosen LC_CTYPE setting requires encoding %s.", @@ -374,29 +375,45 @@ createdb(const CreatedbStmt *stmt) #endif (encoding == PG_SQL_ASCII && superuser()))) ereport(ERROR, - (errmsg("encoding %s does not match locale %s", + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("encoding %s does not match locale %s", pg_encoding_to_char(encoding), dbcollate), errdetail("The chosen LC_COLLATE setting requires encoding %s.", pg_encoding_to_char(collate_encoding)))); /* - * Check that the new locale is compatible with the source database. + * Check that the new encoding and locale settings match the source + * database. We insist on this because we simply copy the source data --- + * any non-ASCII data would be wrongly encoded, and any indexes sorted + * according to the source locale would be wrong. * - * We know that template0 doesn't contain any indexes that depend on - * collation or ctype, so template0 can be used as template for - * any locale. + * However, we assume that template0 doesn't contain any non-ASCII data + * nor any indexes that depend on collation or ctype, so template0 can be + * used as template for creating a database with any encoding or locale. */ if (strcmp(dbtemplate, "template0") != 0) { + if (encoding != src_encoding) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new encoding (%s) is incompatible with the encoding of the template database (%s)", + pg_encoding_to_char(encoding), + pg_encoding_to_char(src_encoding)), + errhint("Use the same encoding as in the template database, or use template0 as template."))); + if (strcmp(dbcollate, src_collate) != 0) ereport(ERROR, - (errmsg("new collation is incompatible with the collation of the template database (%s)", src_collate), + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new collation (%s) is incompatible with the collation of the template database (%s)", + dbcollate, src_collate), errhint("Use the same collation as in the template database, or use template0 as template."))); if (strcmp(dbctype, src_ctype) != 0) ereport(ERROR, - (errmsg("new LC_CTYPE is incompatible with LC_CTYPE of the template database (%s)", src_ctype), + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("new LC_CTYPE (%s) is incompatible with the LC_CTYPE of the template database (%s)", + dbctype, src_ctype), errhint("Use the same LC_CTYPE as in the template database, or use template0 as template."))); } @@ -1099,7 +1116,8 @@ movedb(const char *dbname, const char *tblspcname) continue; ereport(ERROR, - (errmsg("some relations of database \"%s\" are already in tablespace \"%s\"", + (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), + errmsg("some relations of database \"%s\" are already in tablespace \"%s\"", dbname, tblspcname), errhint("You must move them back to the database's default tablespace before using this command."))); } diff --git a/src/test/mb/README b/src/test/mb/README index c11452f8a9..e7bd757dbd 100644 --- a/src/test/mb/README +++ b/src/test/mb/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/test/mb/README,v 1.3 2008/03/21 13:23:29 momjian Exp $ +$PostgreSQL: pgsql/src/test/mb/README,v 1.4 2009/05/06 16:15:21 tgl Exp $ README for multibyte regression test 1998/7/22 @@ -7,4 +7,4 @@ README for multibyte regression test This directory contains a set of tests for multibyte supporting extentions for PostgreSQL. To run the test, simply type: -% mbregress.sh +% sh mbregress.sh diff --git a/src/test/mb/mbregress.sh b/src/test/mb/mbregress.sh index eb9821284b..8a2aca6267 100644 --- a/src/test/mb/mbregress.sh +++ b/src/test/mb/mbregress.sh @@ -1,5 +1,5 @@ #! /bin/sh -# $PostgreSQL: pgsql/src/test/mb/mbregress.sh,v 1.9 2005/06/24 15:11:59 ishii Exp $ +# $PostgreSQL: pgsql/src/test/mb/mbregress.sh,v 1.10 2009/05/06 16:15:21 tgl Exp $ if echo '\c' | grep -s c >/dev/null 2>&1 then @@ -15,7 +15,7 @@ if [ ! -d results ];then fi dropdb utf8 -createdb -E UTF8 utf8 +createdb -T template0 -l C -E UTF8 utf8 PSQL="psql -n -e -q" tests="euc_jp sjis euc_kr euc_cn euc_tw big5 utf8 mule_internal" @@ -36,7 +36,7 @@ do unset PGCLIENTENCODING else dropdb $i >/dev/null 2>&1 - createdb -E `echo $i | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` $i >/dev/null + createdb -T template0 -l C -E `echo $i | tr 'abcdefghijklmnopqrstuvwxyz' 'ABCDEFGHIJKLMNOPQRSTUVWXYZ'` $i >/dev/null $PSQL $i < sql/${i}.sql > results/${i}.out 2>&1 fi -- GitLab