diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c index 801d8c5c2969820de2668bc193f1a90875db7fe7..f23da4fa1aa249d4213d5fa20426bc851d79fc9c 100644 --- a/src/backend/utils/adt/varlena.c +++ b/src/backend/utils/adt/varlena.c @@ -43,18 +43,33 @@ int bytea_output = BYTEA_OUTPUT_HEX; typedef struct varlena unknown; typedef struct varlena VarString; +/* + * State for text_position_* functions. + */ typedef struct { - bool use_wchar; /* T if multibyte encoding */ - char *str1; /* use these if not use_wchar */ - char *str2; /* note: these point to original texts */ - pg_wchar *wstr1; /* use these if use_wchar */ - pg_wchar *wstr2; /* note: these are palloc'd */ - int len1; /* string lengths in logical characters */ + bool is_multibyte; /* T if multibyte encoding */ + bool is_multibyte_char_in_char; /* T if matches need a char-boundary check */ + + char *str1; /* haystack string */ + char *str2; /* needle string */ + int len1; /* string lengths in bytes */ int len2; + /* Skip table for Boyer-Moore-Horspool search algorithm: */ int skiptablemask; /* mask for ANDing with skiptable subscripts */ int skiptable[256]; /* skip distance for given mismatched char */ + + char *last_match; /* pointer to last match in 'str1' */ + + /* + * Sometimes we need to convert the byte position of a match to a + * character position. These store the last position that was converted, + * so that on the next call, we can continue from that point, rather than + * count characters from the very beginning. 
+ */ + char *refpoint; /* pointer within original haystack string */ + int refpos; /* 0-based character offset of the same point */ } TextPositionState; typedef struct @@ -109,7 +124,10 @@ static text *text_substring(Datum str, static text *text_overlay(text *t1, text *t2, int sp, int sl); static int text_position(text *t1, text *t2); static void text_position_setup(text *t1, text *t2, TextPositionState *state); -static int text_position_next(int start_pos, TextPositionState *state); +static bool text_position_next(TextPositionState *state); +static char *text_position_next_internal(char *start_ptr, TextPositionState *state); +static char *text_position_get_match_ptr(TextPositionState *state); +static int text_position_get_match_pos(TextPositionState *state); static void text_position_cleanup(TextPositionState *state); static int text_cmp(text *arg1, text *arg2, Oid collid); static bytea *bytea_catenate(bytea *t1, bytea *t2); @@ -1099,8 +1117,19 @@ text_position(text *t1, text *t2) TextPositionState state; int result; + /* Empty needle always matches at position 1, as it did before this patch */ + if (VARSIZE_ANY_EXHDR(t2) < 1) + return 1; + + /* Otherwise, cannot match if haystack is shorter than needle (covers an empty haystack too) */ + if (VARSIZE_ANY_EXHDR(t1) < VARSIZE_ANY_EXHDR(t2)) + return 0; + text_position_setup(t1, t2, &state); - result = text_position_next(1, &state); + if (!text_position_next(&state)) + result = 0; + else + result = text_position_get_match_pos(&state); text_position_cleanup(&state); return result; } @@ -1112,9 +1141,14 @@ text_position(text *t1, text *t2) * * These are broken out so that a string can be efficiently searched for * multiple occurrences of the same pattern. text_position_next may be - * called multiple times with increasing values of start_pos, which is - * the 1-based character position to start the search from. The "state" - * variable is normally just a local variable in the caller. + * called multiple times, and it advances to the next match on each call. + * text_position_get_match_ptr() and text_position_get_match_pos() return + * a pointer or 1-based character position of the last match, respectively. 
+ * + * The "state" variable is normally just a local variable in the caller. + * + * NOTE: text_position_next skips over the matched portion. For example, + * searching for "xx" in "xxx" returns only one match, not two. */ static void @@ -1123,33 +1152,42 @@ text_position_setup(text *t1, text *t2, TextPositionState *state) int len1 = VARSIZE_ANY_EXHDR(t1); int len2 = VARSIZE_ANY_EXHDR(t2); + Assert(len1 > 0); + Assert(len2 > 0); + + /* + * Even with a multi-byte encoding, we perform the search using the raw + * byte sequence, ignoring multibyte issues. For UTF-8, that works fine, + * because in UTF-8 the byte sequence of one character cannot contain + * another character. For other multi-byte encodings, we do the search + * initially as a simple byte search, ignoring multibyte issues, but + * verify afterwards that the match we found is at a character boundary, + * and continue the search if it was a false match. + */ if (pg_database_encoding_max_length() == 1) { - /* simple case - single byte encoding */ - state->use_wchar = false; - state->str1 = VARDATA_ANY(t1); - state->str2 = VARDATA_ANY(t2); - state->len1 = len1; - state->len2 = len2; + state->is_multibyte = false; + state->is_multibyte_char_in_char = false; + } + else if (GetDatabaseEncoding() == PG_UTF8) + { + state->is_multibyte = true; + state->is_multibyte_char_in_char = false; /* UTF-8: one character cannot appear inside another */ } else { - /* not as simple - multibyte encoding */ - pg_wchar *p1, - *p2; - - p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar)); - len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1); - p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar)); - len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2); - - state->use_wchar = true; - state->wstr1 = p1; - state->wstr2 = p2; - state->len1 = len1; - state->len2 = len2; + state->is_multibyte = true; + state->is_multibyte_char_in_char = true; /* byte matches must be checked against character boundaries */ } + state->str1 = VARDATA_ANY(t1); + state->str2 = VARDATA_ANY(t2); + state->len1 = len1; + state->len2 = len2; + state->last_match = 
NULL; + state->refpoint = state->str1; + state->refpos = 0; + /* * Prepare the skip table for Boyer-Moore-Horspool searching. In these * notes we use the terminology that the "haystack" is the string to be @@ -1166,6 +1204,7 @@ text_position_setup(text *t1, text *t2, TextPositionState *state) int skiptablemask; int last; int i; + const char *str2 = state->str2; /* * First we must determine how much of the skip table to use. The @@ -1212,165 +1251,182 @@ text_position_setup(text *t1, text *t2, TextPositionState *state) */ last = len2 - 1; - if (!state->use_wchar) - { - const char *str2 = state->str2; - - for (i = 0; i < last; i++) - state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i; - } - else - { - const pg_wchar *wstr2 = state->wstr2; - - for (i = 0; i < last; i++) - state->skiptable[wstr2[i] & skiptablemask] = last - i; - } + for (i = 0; i < last; i++) + state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i; } } -static int -text_position_next(int start_pos, TextPositionState *state) +/* + * Advance to the next match, starting from the end of the previous match + * (or the beginning of the string, on first call). Returns true if a match + * is found. + */ +static bool +text_position_next(TextPositionState *state) { - int haystack_len = state->len1; int needle_len = state->len2; - int skiptablemask = state->skiptablemask; - - Assert(start_pos > 0); /* else caller error */ + char *start_ptr; + char *matchptr; if (needle_len <= 0) - return start_pos; /* result for empty pattern */ + return false; /* result for empty pattern */ - start_pos--; /* adjust for zero based arrays */ + /* Start from the point right after the previous match. 
+ */ + if (state->last_match) + start_ptr = state->last_match + needle_len; + else + start_ptr = state->str1; - /* Done if the needle can't possibly fit */ - if (haystack_len < start_pos + needle_len) - return 0; +retry: + matchptr = text_position_next_internal(start_ptr, state); - if (!state->use_wchar) + if (!matchptr) + return false; /* byte sequence does not occur again; last_match is left unchanged */ + + /* + * Found a match for the byte sequence. If this is a multibyte encoding, + * where one character's byte sequence can appear inside a longer + * multi-byte character, we need to verify that the match was at a + * character boundary, not in the middle of a multi-byte character. + */ + if (state->is_multibyte_char_in_char) { - /* simple case - single byte encoding */ - const char *haystack = state->str1; - const char *needle = state->str2; - const char *haystack_end = &haystack[haystack_len]; - const char *hptr; + /* Walk one character at a time, until we reach the match. */ - if (needle_len == 1) + /* the search should never move backwards. */ + Assert(state->refpoint <= matchptr); + + while (state->refpoint < matchptr) { - /* No point in using B-M-H for a one-character needle */ - char nchar = *needle; + /* step to next character. */ + state->refpoint += pg_mblen(state->refpoint); + state->refpos++; - hptr = &haystack[start_pos]; - while (hptr < haystack_end) + /* + * If we stepped over the match's start position, then it was a + * false positive, where the byte sequence appeared in the middle + * of a multi-byte character. Skip it, and continue the search at + * the next character boundary. 
+ */ + if (state->refpoint > matchptr) { - if (*hptr == nchar) - return hptr - haystack + 1; - hptr++; + start_ptr = state->refpoint; + goto retry; } } - else - { - const char *needle_last = &needle[needle_len - 1]; + } - /* Start at startpos plus the length of the needle */ - hptr = &haystack[start_pos + needle_len - 1]; - while (hptr < haystack_end) - { - /* Match the needle scanning *backward* */ - const char *nptr; - const char *p; + state->last_match = matchptr; + return true; +} - nptr = needle_last; - p = hptr; - while (*nptr == *p) - { - /* Matched it all? If so, return 1-based position */ - if (nptr == needle) - return p - haystack + 1; - nptr--, p--; - } +/* + * Subroutine of text_position_next(). This searches for the raw byte + * sequence, ignoring any multi-byte encoding issues. Returns the first + * match at or after 'start_ptr', or NULL if no match is found. + */ +static char * +text_position_next_internal(char *start_ptr, TextPositionState *state) +{ + int haystack_len = state->len1; + int needle_len = state->len2; + int skiptablemask = state->skiptablemask; + const char *haystack = state->str1; + const char *needle = state->str2; + const char *haystack_end = &haystack[haystack_len]; + const char *hptr; - /* - * No match, so use the haystack char at hptr to decide how - * far to advance. If the needle had any occurrence of that - * character (or more precisely, one sharing the same - * skiptable entry) before its last character, then we advance - * far enough to align the last such needle character with - * that haystack position. Otherwise we can advance by the - * whole needle length. 
- */ - hptr += state->skiptable[(unsigned char) *hptr & skiptablemask]; - } + Assert(start_ptr >= haystack && start_ptr <= haystack_end); + + if (needle_len == 1) + { + /* No point in using B-M-H for a one-character needle */ + char nchar = *needle; + + hptr = start_ptr; + while (hptr < haystack_end) + { + if (*hptr == nchar) + return (char *) hptr; + hptr++; } } else { - /* The multibyte char version. This works exactly the same way. */ - const pg_wchar *haystack = state->wstr1; - const pg_wchar *needle = state->wstr2; - const pg_wchar *haystack_end = &haystack[haystack_len]; - const pg_wchar *hptr; + const char *needle_last = &needle[needle_len - 1]; - if (needle_len == 1) + /* Start with hptr on the last byte of a needle-sized window at start_ptr */ + hptr = start_ptr + needle_len - 1; + while (hptr < haystack_end) { - /* No point in using B-M-H for a one-character needle */ - pg_wchar nchar = *needle; + /* Match the needle scanning *backward* */ + const char *nptr; + const char *p; - hptr = &haystack[start_pos]; - while (hptr < haystack_end) + nptr = needle_last; + p = hptr; + while (*nptr == *p) { - if (*hptr == nchar) - return hptr - haystack + 1; - hptr++; + /* Matched it all? If so, return a pointer to the start of the match */ + if (nptr == needle) + return (char *) p; + nptr--, p--; } + + /* + * No match, so use the haystack char at hptr to decide how far to + * advance. If the needle had any occurrence of that character + * (or more precisely, one sharing the same skiptable entry) + * before its last character, then we advance far enough to align + * the last such needle character with that haystack position. + * Otherwise we can advance by the whole needle length. 
+ */ + hptr += state->skiptable[(unsigned char) *hptr & skiptablemask]; } - else - { - const pg_wchar *needle_last = &needle[needle_len - 1]; + } - /* Start at startpos plus the length of the needle */ - hptr = &haystack[start_pos + needle_len - 1]; - while (hptr < haystack_end) - { - /* Match the needle scanning *backward* */ - const pg_wchar *nptr; - const pg_wchar *p; + return 0; /* not found (0 is the null pointer here) */ +} - nptr = needle_last; - p = hptr; - while (*nptr == *p) - { - /* Matched it all? If so, return 1-based position */ - if (nptr == needle) - return p - haystack + 1; - nptr--, p--; - } +/* + * Return a pointer to the current match. + * + * The returned pointer points into the correct position in the original + * haystack string. + */ +static char * +text_position_get_match_ptr(TextPositionState *state) +{ + return state->last_match; +} - /* - * No match, so use the haystack char at hptr to decide how - * far to advance. If the needle had any occurrence of that - * character (or more precisely, one sharing the same - * skiptable entry) before its last character, then we advance - * far enough to align the last such needle character with - * that haystack position. Otherwise we can advance by the - * whole needle length. - */ - hptr += state->skiptable[*hptr & skiptablemask]; - } +/* + * Return the offset of the current match. + * + * The offset is in characters, 1-based. + */ +static int +text_position_get_match_pos(TextPositionState *state) +{ + if (!state->is_multibyte) + return state->last_match - state->str1 + 1; + else + { + /* Convert the byte position to char position. 
+ */ + while (state->refpoint < state->last_match) + { + state->refpoint += pg_mblen(state->refpoint); + state->refpos++; } + Assert(state->refpoint == state->last_match); + return state->refpos + 1; } - - return 0; /* not found */ } static void text_position_cleanup(TextPositionState *state) { - if (state->use_wchar) - { - pfree(state->wstr1); - pfree(state->wstr2); - } + /* no cleanup needed */ } /* varstr_cmp() @@ -4050,39 +4106,32 @@ replace_text(PG_FUNCTION_ARGS) int from_sub_text_len; TextPositionState state; text *ret_text; - int start_posn; - int curr_posn; int chunk_len; + char *curr_ptr; char *start_ptr; StringInfoData str; + bool found; - text_position_setup(src_text, from_sub_text, &state); - - /* - * Note: we check the converted string length, not the original, because - * they could be different if the input contained invalid encoding. - */ - src_text_len = state.len1; - from_sub_text_len = state.len2; + src_text_len = VARSIZE_ANY_EXHDR(src_text); + from_sub_text_len = VARSIZE_ANY_EXHDR(from_sub_text); /* Return unmodified source string if empty source or pattern */ if (src_text_len < 1 || from_sub_text_len < 1) { - text_position_cleanup(&state); PG_RETURN_TEXT_P(src_text); } - start_posn = 1; - curr_posn = text_position_next(1, &state); + text_position_setup(src_text, from_sub_text, &state); + + found = text_position_next(&state); /* When the from_sub_text is not found, there is nothing to do. 
+ */ - if (curr_posn == 0) + if (!found) { text_position_cleanup(&state); PG_RETURN_TEXT_P(src_text); } - - /* start_ptr points to the start_posn'th character of src_text */ + curr_ptr = text_position_get_match_ptr(&state); start_ptr = VARDATA_ANY(src_text); initStringInfo(&str); @@ -4092,19 +4141,18 @@ replace_text(PG_FUNCTION_ARGS) CHECK_FOR_INTERRUPTS(); /* copy the data skipped over by last text_position_next() */ - chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn); + chunk_len = curr_ptr - start_ptr; appendBinaryStringInfo(&str, start_ptr, chunk_len); appendStringInfoText(&str, to_sub_text); - start_posn = curr_posn; - start_ptr += chunk_len; - start_posn += from_sub_text_len; - start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len); + start_ptr = curr_ptr + from_sub_text_len; - curr_posn = text_position_next(start_posn, &state); + found = text_position_next(&state); + if (found) + curr_ptr = text_position_get_match_ptr(&state); } - while (curr_posn > 0); + while (found); /* copy trailing data */ chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr; @@ -4405,9 +4453,10 @@ split_text(PG_FUNCTION_ARGS) int inputstring_len; int fldsep_len; TextPositionState state; - int start_posn; - int end_posn; + char *start_ptr; + char *end_ptr; text *result_text; + bool found; /* field number is 1 based */ if (fldnum < 1) @@ -4415,21 +4464,12 @@ split_text(PG_FUNCTION_ARGS) (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("field position must be greater than zero"))); - text_position_setup(inputstring, fldsep, &state); - - /* - * Note: we check the converted string length, not the original, because - * they could be different if the input contained invalid encoding. 
- */ - inputstring_len = state.len1; - fldsep_len = state.len2; + inputstring_len = VARSIZE_ANY_EXHDR(inputstring); + fldsep_len = VARSIZE_ANY_EXHDR(fldsep); /* return empty string for empty input string */ if (inputstring_len < 1) - { - text_position_cleanup(&state); PG_RETURN_TEXT_P(cstring_to_text("")); - } /* empty field separator */ if (fldsep_len < 1) @@ -4442,12 +4482,14 @@ split_text(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(cstring_to_text("")); } + text_position_setup(inputstring, fldsep, &state); + /* identify bounds of first field */ - start_posn = 1; - end_posn = text_position_next(1, &state); + start_ptr = VARDATA_ANY(inputstring); + found = text_position_next(&state); /* special case if fldsep not found at all */ - if (end_posn == 0) + if (!found) { text_position_cleanup(&state); /* if field 1 requested, return input string, else empty string */ @@ -4456,12 +4498,15 @@ split_text(PG_FUNCTION_ARGS) else PG_RETURN_TEXT_P(cstring_to_text("")); } + end_ptr = text_position_get_match_ptr(&state); - while (end_posn > 0 && --fldnum > 0) + while (found && --fldnum > 0) { /* identify bounds of next field */ - start_posn = end_posn + fldsep_len; - end_posn = text_position_next(start_posn, &state); + start_ptr = end_ptr + fldsep_len; + found = text_position_next(&state); + if (found) + end_ptr = text_position_get_match_ptr(&state); } text_position_cleanup(&state); @@ -4471,20 +4516,19 @@ split_text(PG_FUNCTION_ARGS) /* N'th field separator not found */ /* if last field requested, return it, else empty string */ if (fldnum == 1) - result_text = text_substring(PointerGetDatum(inputstring), - start_posn, - -1, - true); + { + int last_len = start_ptr - VARDATA_ANY(inputstring); /* bytes that precede the last field */ + + result_text = cstring_to_text_with_len(start_ptr, + inputstring_len - last_len); + } else result_text = cstring_to_text(""); } else { /* non-last field requested */ - result_text = text_substring(PointerGetDatum(inputstring), - start_posn, - end_posn - start_posn, - false); + result_text = 
cstring_to_text_with_len(start_ptr, end_ptr - start_ptr); } PG_RETURN_TEXT_P(result_text); @@ -4570,26 +4614,14 @@ text_to_array_internal(PG_FUNCTION_ARGS) */ TextPositionState state; int fldnum; - int start_posn; - int end_posn; int chunk_len; - text_position_setup(inputstring, fldsep, &state); - - /* - * Note: we check the converted string length, not the original, - * because they could be different if the input contained invalid - * encoding. - */ - inputstring_len = state.len1; - fldsep_len = state.len2; + inputstring_len = VARSIZE_ANY_EXHDR(inputstring); + fldsep_len = VARSIZE_ANY_EXHDR(fldsep); /* return empty array for empty input string */ if (inputstring_len < 1) - { - text_position_cleanup(&state); PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID)); - } /* * empty field separator: return the input string as a one-element @@ -4602,7 +4634,6 @@ text_to_array_internal(PG_FUNCTION_ARGS) int dims[1]; int lbs[1]; - text_position_cleanup(&state); /* single element can be a NULL too */ is_null = null_string ? 
text_isequal(inputstring, null_string) : false; @@ -4616,17 +4647,19 @@ text_to_array_internal(PG_FUNCTION_ARGS) TEXTOID, -1, false, 'i')); } - start_posn = 1; - /* start_ptr points to the start_posn'th character of inputstring */ + text_position_setup(inputstring, fldsep, &state); + start_ptr = VARDATA_ANY(inputstring); for (fldnum = 1;; fldnum++) /* field number is 1 based */ { - CHECK_FOR_INTERRUPTS(); + bool found; + char *end_ptr; - end_posn = text_position_next(start_posn, &state); + CHECK_FOR_INTERRUPTS(); - if (end_posn == 0) + found = text_position_next(&state); + if (!found) { /* fetch last field */ chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr; @@ -4634,7 +4667,8 @@ text_to_array_internal(PG_FUNCTION_ARGS) else { /* fetch non-last field */ - chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn); + end_ptr = text_position_get_match_ptr(&state); + chunk_len = end_ptr - start_ptr; } /* must build a temp text datum to pass to accumArrayResult */ @@ -4650,13 +4684,10 @@ text_to_array_internal(PG_FUNCTION_ARGS) pfree(result_text); - if (end_posn == 0) + if (!found) break; - start_posn = end_posn; - start_ptr += chunk_len; - start_posn += fldsep_len; - start_ptr += charlen_to_bytelen(start_ptr, fldsep_len); + start_ptr = end_ptr + fldsep_len; } text_position_cleanup(&state);