From 799ac992014374c23a1fc437f4fd9aa413be4920 Mon Sep 17 00:00:00 2001 From: Tom Lane Date: Sun, 27 Sep 2009 03:27:24 +0000 Subject: [PATCH] Sync psql's scanner with recent changes in backend scanner's flex rules. Marko Kreen, Tom Lane --- src/backend/parser/scan.l | 10 ++++----- src/bin/psql/psqlscan.l | 44 ++++++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 7 deletions(-) diff --git a/src/backend/parser/scan.l b/src/backend/parser/scan.l index fdc9513550..150202e77c 100644 --- a/src/backend/parser/scan.l +++ b/src/backend/parser/scan.l @@ -24,7 +24,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.161 2009/09/25 21:13:06 petere Exp $ + * $PostgreSQL: pgsql/src/backend/parser/scan.l,v 1.162 2009/09/27 03:27:23 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -571,18 +571,16 @@ other . BEGIN(xe); } -<xeu>. | -<xeu>\n | +<xeu>. { yyerror("invalid Unicode surrogate pair"); } +<xeu>\n { yyerror("invalid Unicode surrogate pair"); } <xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); } - <xe,xeu>{xeunicodefail} { ereport(ERROR, (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), errmsg("invalid Unicode escape"), errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."), lexer_errposition())); - } - + } <xe>{xeescape} { if (yytext[1] == '\'') { diff --git a/src/bin/psql/psqlscan.l b/src/bin/psql/psqlscan.l index 235fe9599c..894800aaf5 100644 --- a/src/bin/psql/psqlscan.l +++ b/src/bin/psql/psqlscan.l @@ -33,7 +33,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.28 2009/01/01 17:23:55 momjian Exp $ + * $PostgreSQL: pgsql/src/bin/psql/psqlscan.l,v 1.29 2009/09/27 03:27:24 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -117,6 +117,7 @@ static void push_new_buffer(const char *newstr); static YY_BUFFER_STATE 
prepare_buffer(const char *txt, int len, char **txtcopy); static void emit(const char *txt, int len); +static bool is_utf16_surrogate_first(uint32 c); #define ECHO emit(yytext, yyleng) @@ -158,6 +159,7 @@ static void emit(const char *txt, int len); * <xdolq> $foo$ quoted strings * <xui> quoted identifier with Unicode escapes * <xus> quoted string with Unicode escapes + * <xeu> Unicode surrogate pair in extended quoted string */ %x xb @@ -169,6 +171,7 @@ static void emit(const char *txt, int len); %x xdolq %x xui %x xus +%x xeu /* Additional exclusive states for psql only: lex backslash commands */ %x xslashcmd %x xslasharg @@ -192,6 +195,9 @@ static void emit(const char *txt, int len); * did not end with a newline. * * XXX perhaps \f (formfeed) should be treated as a newline as well? + * + * XXX if you change the set of whitespace characters, fix scanner_isspace() + * to agree, and see also the plpgsql lexer. */ space [ \t\n\r\f] @@ -253,6 +259,8 @@ xeinside [^\\']+ xeescape [\\][^0-7] xeoctesc [\\][0-7]{1,3} xehexesc [\\]x[0-9A-Fa-f]{1,2} +xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8}) +xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7}) /* Extended quote * xqdouble implements embedded quote, '''' @@ -334,6 +342,10 @@ identifier {ident_start}{ident_cont}* typecast "::" +/* these two token types are used by PL/pgsql, though not in core SQL */ +dot_dot \.\. +colon_equals ":=" + /* * "self" is the set of chars that should be returned as single-character * tokens. "op_chars" is the set of chars that can make up "Op" tokens, @@ -511,6 +523,22 @@ other . <xe>{xeinside} { ECHO; } +<xe>{xeunicode} { + uint32 c = strtoul(yytext+2, NULL, 16); + + if (is_utf16_surrogate_first(c)) + BEGIN(xeu); + ECHO; + } +<xeu>{xeunicode} { + BEGIN(xe); + ECHO; + } +<xeu>. { ECHO; } +<xeu>\n { ECHO; } +<xe,xeu>{xeunicodefail} { + ECHO; + } <xe>{xeescape} { ECHO; } @@ -605,6 +633,14 @@ other . 
ECHO; } +{dot_dot} { + ECHO; + } + +{colon_equals} { + ECHO; + } + /* * These rules are specific to psql --- they implement parenthesis * counting and detection of command-ending semicolon. These must @@ -1690,3 +1726,9 @@ emit(const char *txt, int len) } } } + +static bool +is_utf16_surrogate_first(uint32 c) +{ + return (c >= 0xD800 && c <= 0xDBFF); /* true iff c is in 0xD800..0xDBFF inclusive, i.e. the first (high) half of a UTF-16 surrogate pair */ +} -- GitLab