regexp.c 32.5 KB
Newer Older
1 2
/*-------------------------------------------------------------------------
 *
3
 * regexp.c
4
 *	  Postgres' interface to the regular expression package.
5
 *
6
 * Portions Copyright (c) 1996-2012, PostgreSQL Global Development Group
B
Add:  
Bruce Momjian 已提交
7
 * Portions Copyright (c) 1994, Regents of the University of California
8 9 10
 *
 *
 * IDENTIFICATION
11
 *	  src/backend/utils/adt/regexp.c
12
 *
13 14 15 16 17 18
 *		Alistair Crooks added the code for the regex caching
 *		agc - cached the regular expressions used - there's a good chance
 *		that we'll get a hit, so this saves a compile step for every
 *		attempted match. I haven't actually measured the speed improvement,
 *		but it `looks' a lot quicker visually when watching regression
 *		test output.
19
 *
20 21
 *		agc - incorporated Keith Bostic's Berkeley regex code into
 *		the tree for all ports. To distinguish this regex code from any that
22
 *		is existent on a platform, I've prepended the string "pg_" to
23 24 25 26
 *		the functions regcomp, regerror, regexec and regfree.
 *		Fixed a bug that was originally a typo by me, where `i' was used
 *		instead of `oldest' when compiling regular expressions - benign
 *		results mostly, although occasionally it bit you...
27 28 29
 *
 *-------------------------------------------------------------------------
 */
B
Bruce Momjian 已提交
30
#include "postgres.h"
31

32
#include "catalog/pg_type.h"
33
#include "funcapi.h"
34
#include "regex/regex.h"
T
Tom Lane 已提交
35
#include "utils/array.h"
B
Bruce Momjian 已提交
36
#include "utils/builtins.h"
37

38 39
#define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
	(PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
40 41


42 43 44 45 46
/* all the options of interest for regex functions */
typedef struct pg_re_flags
{
	int			cflags;			/* compile flags for Spencer's regex code */
	bool		glob;			/* do it globally (for each occurrence) */
47
} pg_re_flags;
48 49 50 51 52 53 54 55 56 57 58 59 60 61

/* cross-call state for regexp_matches(), also regexp_split() */
typedef struct regexp_matches_ctx
{
	text	   *orig_str;		/* data string in original TEXT form */
	int			nmatches;		/* number of places where pattern matched */
	int			npatterns;		/* number of capturing subpatterns */
	/* We store start char index and end+1 char index for each match */
	/* so the number of entries in match_locs is nmatches * npatterns * 2 */
	int		   *match_locs;		/* 0-based character indexes */
	int			next_match;		/* 0-based index of next match to process */
	/* workspace for build_regexp_matches_result() */
	Datum	   *elems;			/* has npatterns elements */
	bool	   *nulls;			/* has npatterns elements */
62
} regexp_matches_ctx;
63

64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88
/*
 * We cache precompiled regular expressions using a "self organizing list"
 * structure, in which recently-used items tend to be near the front.
 * Whenever we use an entry, it's moved up to the front of the list.
 * Over time, an item's average position corresponds to its frequency of use.
 *
 * When we first create an entry, it's inserted at the front of
 * the array, dropping the entry at the end of the array if necessary to
 * make room.  (This might seem to be weighting the new entry too heavily,
 * but if we insert new entries further back, we'll be unable to adjust to
 * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
 * never-before-seen items used circularly.  We ought to be able to handle
 * that case, so we have to insert at the front.)
 *
 * Knuth mentions a variant strategy in which a used item is moved up just
 * one place in the list.  Although he says this uses fewer comparisons on
 * average, it seems not to adapt very well to the situation where you have
 * both some reusable patterns and a steady stream of non-reusable patterns.
 * A reusable pattern that isn't used at least as often as non-reusable
 * patterns are seen will "fail to keep up" and will drop off the end of the
 * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
 * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
 */

/* this is the maximum number of cached regular expressions */
89 90 91 92
#ifndef MAX_CACHED_RES
#define MAX_CACHED_RES	32
#endif

93 94
/* this structure describes one cached regular expression */
typedef struct cached_re_str
95
{
96 97
	char	   *cre_pat;		/* original RE (not null terminated!) */
	int			cre_pat_len;	/* length of original RE, in bytes */
98
	int			cre_flags;		/* compile flags: extended,icase etc */
99
	Oid			cre_collation;	/* collation to use */
100
	regex_t		cre_re;			/* the compiled regular expression */
101
} cached_re_str;
102 103

static int	num_res = 0;		/* # of cached re's */
B
Bruce Momjian 已提交
104
static cached_re_str re_array[MAX_CACHED_RES];	/* cached re's */
105

106

107 108
/* Local functions */
static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
B
Bruce Momjian 已提交
109
					 text *flags,
110
					 Oid collation,
B
Bruce Momjian 已提交
111 112 113
					 bool force_glob,
					 bool use_subpatterns,
					 bool ignore_degenerate);
114 115 116
static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
static ArrayType *build_regexp_matches_result(regexp_matches_ctx *matchctx);
static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
117

118

119
/*
120
 * RE_compile_and_cache - compile a RE, caching if possible
121
 *
122
 * Returns regex_t *
123
 *
124
 *	text_re --- the pattern, expressed as a TEXT object
B
Bruce Momjian 已提交
125
 *	cflags --- compile options for the pattern
126
 *	collation --- collation to use for LC_CTYPE-dependent behavior
127
 *
128
 * Pattern is given in the database encoding.  We internally convert to
129
 * an array of pg_wchar, which is what Spencer's regex package wants.
130
 */
131
static regex_t *
132
RE_compile_and_cache(text *text_re, int cflags, Oid collation)
133
{
134 135
	int			text_re_len = VARSIZE_ANY_EXHDR(text_re);
	char	   *text_re_val = VARDATA_ANY(text_re);
136
	pg_wchar   *pattern;
137
	int			pattern_len;
138 139
	int			i;
	int			regcomp_result;
B
Bruce Momjian 已提交
140
	cached_re_str re_temp;
141
	char		errMsg[100];
142

B
Bruce Momjian 已提交
143
	/*
B
Bruce Momjian 已提交
144
	 * Look for a match among previously compiled REs.	Since the data
B
Bruce Momjian 已提交
145 146
	 * structure is self-organizing with most-used entries at the front, our
	 * search strategy can just be to scan from the front.
147
	 */
148
	for (i = 0; i < num_res; i++)
149
	{
150 151
		if (re_array[i].cre_pat_len == text_re_len &&
			re_array[i].cre_flags == cflags &&
152
			re_array[i].cre_collation == collation &&
153
			memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
154
		{
155 156 157 158
			/*
			 * Found a match; move it to front if not there already.
			 */
			if (i > 0)
159
			{
160 161 162
				re_temp = re_array[i];
				memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
				re_array[0] = re_temp;
163 164
			}

165
			return &re_array[0].cre_re;
166
		}
167 168
	}

169 170 171 172 173 174
	/*
	 * Couldn't find it, so try to compile the new RE.  To avoid leaking
	 * resources on failure, we build into the re_temp local.
	 */

	/* Convert pattern string to wide characters */
175 176
	pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
	pattern_len = pg_mb2wchar_with_len(text_re_val,
177
									   pattern,
178
									   text_re_len);
179 180 181 182

	regcomp_result = pg_regcomp(&re_temp.cre_re,
								pattern,
								pattern_len,
183 184
								cflags,
								collation);
185

186 187
	pfree(pattern);

188
	if (regcomp_result != REG_OKAY)
189
	{
190
		/* re didn't compile */
191 192
		pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
		/* XXX should we pg_regfree here? */
193 194 195
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("invalid regular expression: %s", errMsg)));
196 197
	}

198
	/*
199
	 * We use malloc/free for the cre_pat field because the storage has to
B
Bruce Momjian 已提交
200 201 202
	 * persist across transactions, and because we want to get control back on
	 * out-of-memory.  The Max() is because some malloc implementations return
	 * NULL for malloc(0).
203
	 */
204
	re_temp.cre_pat = malloc(Max(text_re_len, 1));
205 206 207
	if (re_temp.cre_pat == NULL)
	{
		pg_regfree(&re_temp.cre_re);
208 209 210
		ereport(ERROR,
				(errcode(ERRCODE_OUT_OF_MEMORY),
				 errmsg("out of memory")));
211
	}
212 213
	memcpy(re_temp.cre_pat, text_re_val, text_re_len);
	re_temp.cre_pat_len = text_re_len;
214
	re_temp.cre_flags = cflags;
215
	re_temp.cre_collation = collation;
216

217
	/*
B
Bruce Momjian 已提交
218 219
	 * Okay, we have a valid new item in re_temp; insert it into the storage
	 * array.  Discard last entry if needed.
220 221 222 223 224 225 226 227
	 */
	if (num_res >= MAX_CACHED_RES)
	{
		--num_res;
		Assert(num_res < MAX_CACHED_RES);
		pg_regfree(&re_array[num_res].cre_re);
		free(re_array[num_res].cre_pat);
	}
228

229 230
	if (num_res > 0)
		memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
231

232 233
	re_array[0] = re_temp;
	num_res++;
234

235
	return &re_array[0].cre_re;
236 237 238
}

/*
239
 * RE_wchar_execute - execute a RE on pg_wchar data
240 241 242
 *
 * Returns TRUE on match, FALSE on no match
 *
243 244 245 246
 *	re --- the compiled pattern as returned by RE_compile_and_cache
 *	data --- the data to match against (need not be null-terminated)
 *	data_len --- the length of the data string
 *	start_search -- the offset in the data to start searching
247 248
 *	nmatch, pmatch	--- optional return area for match details
 *
249 250
 * Data is given as array of pg_wchar which is what Spencer's regex package
 * wants.
251 252
 */
static bool
253
RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
254
				 int start_search, int nmatch, regmatch_t *pmatch)
255 256
{
	int			regexec_result;
B
Bruce Momjian 已提交
257
	char		errMsg[100];
258

259
	/* Perform RE match and return result */
260
	regexec_result = pg_regexec(re,
261 262
								data,
								data_len,
263
								start_search,
B
Bruce Momjian 已提交
264
								NULL,	/* no details */
265 266 267
								nmatch,
								pmatch,
								0);
268

269 270 271
	if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
	{
		/* re failed??? */
272
		pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
273 274 275 276 277 278
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
				 errmsg("regular expression failed: %s", errMsg)));
	}

	return (regexec_result == REG_OKAY);
279 280
}

281 282 283 284 285 286 287 288 289 290
/*
 * RE_execute - execute a RE
 *
 * Returns TRUE on match, FALSE on no match
 *
 *	re --- the compiled pattern as returned by RE_compile_and_cache
 *	dat --- the data to match against (need not be null-terminated)
 *	dat_len --- the length of the data string
 *	nmatch, pmatch	--- optional return area for match details
 *
B
Bruce Momjian 已提交
291
 * Data is given in the database encoding.	We internally
292 293 294 295 296 297 298
 * convert to array of pg_wchar which is what Spencer's regex package wants.
 */
static bool
RE_execute(regex_t *re, char *dat, int dat_len,
		   int nmatch, regmatch_t *pmatch)
{
	pg_wchar   *data;
299
	int			data_len;
300 301 302 303 304 305 306 307
	bool		match;

	/* Convert data string to wide characters */
	data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
	data_len = pg_mb2wchar_with_len(dat, data, dat_len);

	/* Perform RE match and return result */
	match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
308

309 310 311 312 313 314 315 316 317
	pfree(data);
	return match;
}

/*
 * RE_compile_and_execute - compile and execute a RE
 *
 * Returns TRUE on match, FALSE on no match
 *
318
 *	text_re --- the pattern, expressed as a TEXT object
319 320 321
 *	dat --- the data to match against (need not be null-terminated)
 *	dat_len --- the length of the data string
 *	cflags --- compile options for the pattern
322
 *	collation --- collation to use for LC_CTYPE-dependent behavior
323 324 325 326 327 328 329
 *	nmatch, pmatch	--- optional return area for match details
 *
 * Both pattern and data are given in the database encoding.  We internally
 * convert to array of pg_wchar which is what Spencer's regex package wants.
 */
static bool
RE_compile_and_execute(text *text_re, char *dat, int dat_len,
330 331
					   int cflags, Oid collation,
					   int nmatch, regmatch_t *pmatch)
332 333 334 335
{
	regex_t    *re;

	/* Compile RE */
336
	re = RE_compile_and_cache(text_re, cflags, collation);
337 338 339 340

	return RE_execute(re, dat, dat_len, nmatch, pmatch);
}

341 342 343 344 345

/*
 * parse_re_flags - parse the options argument of regexp_matches and friends
 *
 *	flags --- output argument, filled with desired options
346
 *	opts --- TEXT object, or NULL for defaults
347 348 349 350
 *
 * This accepts all the options allowed by any of the callers; callers that
 * don't want some have to reject them after the fact.
 */
351
static void
352
parse_re_flags(pg_re_flags *flags, text *opts)
353
{
354 355
	/* regex flavor is always folded into the compile flags */
	flags->cflags = REG_ADVANCED;
356
	flags->glob = false;
357 358 359

	if (opts)
	{
B
Bruce Momjian 已提交
360 361 362
		char	   *opt_p = VARDATA_ANY(opts);
		int			opt_len = VARSIZE_ANY_EXHDR(opts);
		int			i;
363 364 365 366 367 368 369 370

		for (i = 0; i < opt_len; i++)
		{
			switch (opt_p[i])
			{
				case 'g':
					flags->glob = true;
					break;
B
Bruce Momjian 已提交
371
				case 'b':		/* BREs (but why???) */
372 373
					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
					break;
B
Bruce Momjian 已提交
374
				case 'c':		/* case sensitive */
375 376
					flags->cflags &= ~REG_ICASE;
					break;
B
Bruce Momjian 已提交
377
				case 'e':		/* plain EREs */
378 379 380
					flags->cflags |= REG_EXTENDED;
					flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
					break;
B
Bruce Momjian 已提交
381
				case 'i':		/* case insensitive */
382 383
					flags->cflags |= REG_ICASE;
					break;
B
Bruce Momjian 已提交
384 385
				case 'm':		/* Perloid synonym for n */
				case 'n':		/* \n affects ^ $ . [^ */
386 387
					flags->cflags |= REG_NEWLINE;
					break;
B
Bruce Momjian 已提交
388
				case 'p':		/* ~Perl, \n affects . [^ */
389 390 391
					flags->cflags |= REG_NLSTOP;
					flags->cflags &= ~REG_NLANCH;
					break;
B
Bruce Momjian 已提交
392
				case 'q':		/* literal string */
393 394 395
					flags->cflags |= REG_QUOTE;
					flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
					break;
B
Bruce Momjian 已提交
396
				case 's':		/* single line, \n ordinary */
397 398
					flags->cflags &= ~REG_NEWLINE;
					break;
B
Bruce Momjian 已提交
399
				case 't':		/* tight syntax */
400 401
					flags->cflags &= ~REG_EXPANDED;
					break;
B
Bruce Momjian 已提交
402
				case 'w':		/* weird, \n affects ^ $ only */
403 404 405
					flags->cflags &= ~REG_NLSTOP;
					flags->cflags |= REG_NLANCH;
					break;
B
Bruce Momjian 已提交
406
				case 'x':		/* expanded syntax */
407 408 409 410 411
					flags->cflags |= REG_EXPANDED;
					break;
				default:
					ereport(ERROR,
							(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
412 413
							 errmsg("invalid regexp option: \"%c\"",
									opt_p[i])));
414 415 416 417 418 419
					break;
			}
		}
	}
}

420 421

/*
422
 *	interface routines called by the function manager
423
 */
424 425 426

Datum
nameregexeq(PG_FUNCTION_ARGS)
427
{
428
	Name		n = PG_GETARG_NAME(0);
429
	text	   *p = PG_GETARG_TEXT_PP(1);
430

431
	PG_RETURN_BOOL(RE_compile_and_execute(p,
432
										  NameStr(*n),
433
										  strlen(NameStr(*n)),
434
										  REG_ADVANCED,
435
										  PG_GET_COLLATION(),
436
										  0, NULL));
437
}
438

439 440
Datum
nameregexne(PG_FUNCTION_ARGS)
441
{
442
	Name		n = PG_GETARG_NAME(0);
443
	text	   *p = PG_GETARG_TEXT_PP(1);
444

445
	PG_RETURN_BOOL(!RE_compile_and_execute(p,
446
										   NameStr(*n),
447
										   strlen(NameStr(*n)),
448
										   REG_ADVANCED,
449
										   PG_GET_COLLATION(),
450
										   0, NULL));
451 452
}

453 454
Datum
textregexeq(PG_FUNCTION_ARGS)
455
{
456 457
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
458

459
	PG_RETURN_BOOL(RE_compile_and_execute(p,
460 461
										  VARDATA_ANY(s),
										  VARSIZE_ANY_EXHDR(s),
462
										  REG_ADVANCED,
463
										  PG_GET_COLLATION(),
464
										  0, NULL));
465 466
}

467 468
Datum
textregexne(PG_FUNCTION_ARGS)
469
{
470 471
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
472

473
	PG_RETURN_BOOL(!RE_compile_and_execute(p,
474 475
										   VARDATA_ANY(s),
										   VARSIZE_ANY_EXHDR(s),
476
										   REG_ADVANCED,
477
										   PG_GET_COLLATION(),
478
										   0, NULL));
479 480 481 482
}


/*
B
Bruce Momjian 已提交
483
 *	routines that use the regexp stuff, but ignore the case.
484
 *	for this, we use the REG_ICASE flag to pg_regcomp
485
 */
486 487 488


Datum
489
nameicregexeq(PG_FUNCTION_ARGS)
490
{
491
	Name		n = PG_GETARG_NAME(0);
492
	text	   *p = PG_GETARG_TEXT_PP(1);
493

494
	PG_RETURN_BOOL(RE_compile_and_execute(p,
495
										  NameStr(*n),
496
										  strlen(NameStr(*n)),
497
										  REG_ADVANCED | REG_ICASE,
498
										  PG_GET_COLLATION(),
499
										  0, NULL));
500 501
}

502
Datum
503
nameicregexne(PG_FUNCTION_ARGS)
504
{
505
	Name		n = PG_GETARG_NAME(0);
506
	text	   *p = PG_GETARG_TEXT_PP(1);
507

508
	PG_RETURN_BOOL(!RE_compile_and_execute(p,
509
										   NameStr(*n),
510
										   strlen(NameStr(*n)),
511
										   REG_ADVANCED | REG_ICASE,
512
										   PG_GET_COLLATION(),
513
										   0, NULL));
514 515
}

516
Datum
517
texticregexeq(PG_FUNCTION_ARGS)
518
{
519 520
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
521

522
	PG_RETURN_BOOL(RE_compile_and_execute(p,
523 524
										  VARDATA_ANY(s),
										  VARSIZE_ANY_EXHDR(s),
525
										  REG_ADVANCED | REG_ICASE,
526
										  PG_GET_COLLATION(),
527
										  0, NULL));
528
}
529

530
Datum
531
texticregexne(PG_FUNCTION_ARGS)
532
{
533 534
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
535

536
	PG_RETURN_BOOL(!RE_compile_and_execute(p,
537 538
										   VARDATA_ANY(s),
										   VARSIZE_ANY_EXHDR(s),
539
										   REG_ADVANCED | REG_ICASE,
540
										   PG_GET_COLLATION(),
541
										   0, NULL));
542
}
543 544


545 546 547
/*
 * textregexsubstr()
 *		Return a substring matched by a regular expression.
548 549 550 551
 */
Datum
textregexsubstr(PG_FUNCTION_ARGS)
{
552 553
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
554
	regex_t    *re;
555
	regmatch_t	pmatch[2];
556 557 558 559
	int			so,
				eo;

	/* Compile RE */
560
	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
561

B
Bruce Momjian 已提交
562
	/*
B
Bruce Momjian 已提交
563 564 565 566
	 * We pass two regmatch_t structs to get info about the overall match and
	 * the match for the first parenthesized subexpression (if any). If there
	 * is a parenthesized subexpression, we return what it matched; else
	 * return what the whole regexp matched.
567
	 */
568 569 570 571
	if (!RE_execute(re,
					VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
					2, pmatch))
		PG_RETURN_NULL();		/* definitely no match */
572

573 574 575
	if (re->re_nsub > 0)
	{
		/* has parenthesized subexpressions, use the first one */
576 577
		so = pmatch[1].rm_so;
		eo = pmatch[1].rm_eo;
578
	}
579 580 581 582 583 584 585 586
	else
	{
		/* no parenthesized subexpression, use whole match */
		so = pmatch[0].rm_so;
		eo = pmatch[0].rm_eo;
	}

	/*
587 588 589 590
	 * It is possible to have a match to the whole pattern but no match for a
	 * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
	 * there is no subexpression match.  So this extra test for match failure
	 * is not redundant.
591 592 593
	 */
	if (so < 0 || eo < 0)
		PG_RETURN_NULL();
594

595 596 597 598
	return DirectFunctionCall3(text_substr,
							   PointerGetDatum(s),
							   Int32GetDatum(so + 1),
							   Int32GetDatum(eo - so));
599
}
600

601 602
/*
 * textregexreplace_noopt()
603 604 605 606
 *		Return a string matched by a regular expression, with replacement.
 *
 * This version doesn't have an option argument: we default to case
 * sensitive match, replace the first instance only.
607 608 609 610
 */
Datum
textregexreplace_noopt(PG_FUNCTION_ARGS)
{
611 612 613
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
	text	   *r = PG_GETARG_TEXT_PP(2);
614
	regex_t    *re;
615

616
	re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
617

618
	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
619 620 621 622
}

/*
 * textregexreplace()
623
 *		Return a string matched by a regular expression, with replacement.
624 625 626 627
 */
Datum
textregexreplace(PG_FUNCTION_ARGS)
{
628 629 630 631
	text	   *s = PG_GETARG_TEXT_PP(0);
	text	   *p = PG_GETARG_TEXT_PP(1);
	text	   *r = PG_GETARG_TEXT_PP(2);
	text	   *opt = PG_GETARG_TEXT_PP(3);
632
	regex_t    *re;
633
	pg_re_flags flags;
634

635
	parse_re_flags(&flags, opt);
636

637
	re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
638

639
	PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
640 641
}

642 643
/*
 * similar_escape()
644
 * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by
645 646 647 648 649 650 651 652
 * our regexp engine.
 */
Datum
similar_escape(PG_FUNCTION_ARGS)
{
	text	   *pat_text;
	text	   *esc_text;
	text	   *result;
653
	char	   *p,
654 655 656 657 658
			   *e,
			   *r;
	int			plen,
				elen;
	bool		afterescape = false;
659
	bool		incharclass = false;
660 661 662 663 664
	int			nquotes = 0;

	/* This function is not strict, so must test explicitly */
	if (PG_ARGISNULL(0))
		PG_RETURN_NULL();
665 666 667
	pat_text = PG_GETARG_TEXT_PP(0);
	p = VARDATA_ANY(pat_text);
	plen = VARSIZE_ANY_EXHDR(pat_text);
668 669 670 671 672 673 674 675
	if (PG_ARGISNULL(1))
	{
		/* No ESCAPE clause provided; default to backslash as escape */
		e = "\\";
		elen = 1;
	}
	else
	{
676 677 678
		esc_text = PG_GETARG_TEXT_PP(1);
		e = VARDATA_ANY(esc_text);
		elen = VARSIZE_ANY_EXHDR(esc_text);
679 680 681
		if (elen == 0)
			e = NULL;			/* no escape character */
		else if (elen != 1)
682 683 684
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
					 errmsg("invalid escape string"),
B
Bruce Momjian 已提交
685
				  errhint("Escape string must be empty or one character.")));
686 687
	}

688 689
	/*----------
	 * We surround the transformed input string with
690 691 692 693
	 *			^(?: ... )$
	 * which requires some explanation.  We need "^" and "$" to force
	 * the pattern to match the entire input string as per SQL99 spec.
	 * The "(?:" and ")" are a non-capturing set of parens; we have to have
694 695 696 697 698 699 700 701
	 * parens in case the string contains "|", else the "^" and "$" will
	 * be bound into the first and last alternatives which is not what we
	 * want, and the parens must be non capturing because we don't want them
	 * to count when selecting output for SUBSTRING.
	 *----------
	 */

	/*
702 703
	 * We need room for the prefix/postfix plus as many as 3 output bytes per
	 * input byte; since the input is at most 1GB this can't overflow
704
	 */
705
	result = (text *) palloc(VARHDRSZ + 6 + 3 * plen);
706 707 708
	r = VARDATA(result);

	*r++ = '^';
709 710 711
	*r++ = '(';
	*r++ = '?';
	*r++ = ':';
712 713 714

	while (plen > 0)
	{
B
Bruce Momjian 已提交
715
		char		pchar = *p;
716 717 718

		if (afterescape)
		{
719
			if (pchar == '"' && !incharclass)	/* for SUBSTRING patterns */
720 721 722 723 724 725 726 727 728 729 730 731 732
				*r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
			else
			{
				*r++ = '\\';
				*r++ = pchar;
			}
			afterescape = false;
		}
		else if (e && pchar == *e)
		{
			/* SQL99 escape character; do not send to output */
			afterescape = true;
		}
733 734 735 736 737 738 739 740 741 742 743 744 745
		else if (incharclass)
		{
			if (pchar == '\\')
				*r++ = '\\';
			*r++ = pchar;
			if (pchar == ']')
				incharclass = false;
		}
		else if (pchar == '[')
		{
			*r++ = pchar;
			incharclass = true;
		}
746 747 748 749 750 751 752
		else if (pchar == '%')
		{
			*r++ = '.';
			*r++ = '*';
		}
		else if (pchar == '_')
			*r++ = '.';
753 754 755 756 757 758 759
		else if (pchar == '(')
		{
			/* convert to non-capturing parenthesis */
			*r++ = '(';
			*r++ = '?';
			*r++ = ':';
		}
760 761
		else if (pchar == '\\' || pchar == '.' ||
				 pchar == '^' || pchar == '$')
762 763 764 765 766 767 768 769 770
		{
			*r++ = '\\';
			*r++ = pchar;
		}
		else
			*r++ = pchar;
		p++, plen--;
	}

771
	*r++ = ')';
772
	*r++ = '$';
B
Bruce Momjian 已提交
773

774
	SET_VARSIZE(result, r - ((char *) result));
775 776 777

	PG_RETURN_TEXT_P(result);
}
778

779 780 781 782
/*
 * regexp_matches()
 *		Return a table of matches of a pattern within a string.
 */
783 784 785
Datum
regexp_matches(PG_FUNCTION_ARGS)
{
B
Bruce Momjian 已提交
786 787
	FuncCallContext *funcctx;
	regexp_matches_ctx *matchctx;
788 789 790

	if (SRF_IS_FIRSTCALL())
	{
B
Bruce Momjian 已提交
791 792 793
		text	   *pattern = PG_GETARG_TEXT_PP(1);
		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
		MemoryContext oldcontext;
794 795 796 797 798

		funcctx = SRF_FIRSTCALL_INIT();
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

		/* be sure to copy the input string into the multi-call ctx */
799
		matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
800 801 802
										flags,
										PG_GET_COLLATION(),
										false, true, false);
803 804 805 806

		/* Pre-create workspace that build_regexp_matches_result needs */
		matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
		matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
807 808 809 810 811 812 813 814

		MemoryContextSwitchTo(oldcontext);
		funcctx->user_fctx = (void *) matchctx;
	}

	funcctx = SRF_PERCALL_SETUP();
	matchctx = (regexp_matches_ctx *) funcctx->user_fctx;

815
	if (matchctx->next_match < matchctx->nmatches)
816
	{
B
Bruce Momjian 已提交
817
		ArrayType  *result_ary;
818

819 820 821
		result_ary = build_regexp_matches_result(matchctx);
		matchctx->next_match++;
		SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
822 823
	}

824 825 826
	/* release space in multi-call ctx to avoid intraquery memory leak */
	cleanup_regexp_matches(matchctx);

827 828 829
	SRF_RETURN_DONE(funcctx);
}

830
/* This is separate to keep the opr_sanity regression test from complaining */
831 832 833 834 835 836
Datum
regexp_matches_no_flags(PG_FUNCTION_ARGS)
{
	return regexp_matches(fcinfo);
}

837 838 839 840 841 842 843 844 845 846 847 848
/*
 * setup_regexp_matches --- do the initial matching for regexp_matches()
 *		or regexp_split()
 *
 * To avoid having to re-find the compiled pattern on each call, we do
 * all the matching in one swoop.  The returned regexp_matches_ctx contains
 * the locations of all the substrings matching the pattern.
 *
 * The three bool parameters have only two patterns (one for each caller)
 * but it seems clearer to distinguish the functionality this way than to
 * key it all off one "is_split" flag.
 */
849
static regexp_matches_ctx *
850
setup_regexp_matches(text *orig_str, text *pattern, text *flags,
851
					 Oid collation,
852 853
					 bool force_glob, bool use_subpatterns,
					 bool ignore_degenerate)
854
{
855 856 857 858
	regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
	int			orig_len;
	pg_wchar   *wide_str;
	int			wide_len;
B
Bruce Momjian 已提交
859 860
	pg_re_flags re_flags;
	regex_t    *cpattern;
861 862 863 864 865 866 867 868
	regmatch_t *pmatch;
	int			pmatch_len;
	int			array_len;
	int			array_idx;
	int			prev_match_end;
	int			start_search;

	/* save original string --- we'll extract result substrings from it */
869 870
	matchctx->orig_str = orig_str;

871
	/* convert string to pg_wchar form for matching */
872
	orig_len = VARSIZE_ANY_EXHDR(orig_str);
873
	wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
874
	wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
875

876 877 878 879 880 881 882 883
	/* determine options */
	parse_re_flags(&re_flags, flags);
	if (force_glob)
	{
		/* user mustn't specify 'g' for regexp_split */
		if (re_flags.glob)
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
B
Bruce Momjian 已提交
884
				 errmsg("regexp_split does not support the global option")));
885 886 887
		/* but we find all the matches anyway */
		re_flags.glob = true;
	}
888

889
	/* set up the compiled pattern */
890
	cpattern = RE_compile_and_cache(pattern, re_flags.cflags, collation);
891

892 893
	/* do we want to remember subpatterns? */
	if (use_subpatterns && cpattern->re_nsub > 0)
894
	{
895 896 897 898 899 900 901 902 903
		matchctx->npatterns = cpattern->re_nsub;
		pmatch_len = cpattern->re_nsub + 1;
	}
	else
	{
		use_subpatterns = false;
		matchctx->npatterns = 1;
		pmatch_len = 1;
	}
904

905 906
	/* temporary output space for RE package */
	pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
907

908 909 910 911 912 913 914 915 916 917 918 919 920
	/* the real output space (grown dynamically if needed) */
	array_len = re_flags.glob ? 256 : 32;
	matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
	array_idx = 0;

	/* search for the pattern, perhaps repeatedly */
	prev_match_end = 0;
	start_search = 0;
	while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
							pmatch_len, pmatch))
	{
		/*
		 * If requested, ignore degenerate matches, which are zero-length
B
Bruce Momjian 已提交
921 922
		 * matches occurring at the start or end of a string or just after a
		 * previous match.
923 924 925 926
		 */
		if (!ignore_degenerate ||
			(pmatch[0].rm_so < wide_len &&
			 pmatch[0].rm_eo > prev_match_end))
927
		{
928 929 930 931 932
			/* enlarge output space if needed */
			while (array_idx + matchctx->npatterns * 2 > array_len)
			{
				array_len *= 2;
				matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
B
Bruce Momjian 已提交
933
													sizeof(int) * array_len);
934
			}
935

936 937
			/* save this match's locations */
			if (use_subpatterns)
938
			{
B
Bruce Momjian 已提交
939
				int			i;
940 941 942 943 944 945

				for (i = 1; i <= matchctx->npatterns; i++)
				{
					matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
					matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
				}
946 947 948
			}
			else
			{
949 950
				matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
				matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
951
			}
952
			matchctx->nmatches++;
953
		}
954 955 956 957 958 959 960
		prev_match_end = pmatch[0].rm_eo;

		/* if not glob, stop after one match */
		if (!re_flags.glob)
			break;

		/*
B
Bruce Momjian 已提交
961 962 963 964
		 * Advance search position.  Normally we start just after the end of
		 * the previous match, but always advance at least one character (the
		 * special case can occur if the pattern matches zero characters just
		 * after the prior match or at the end of the string).
965 966 967 968 969 970 971
		 */
		if (start_search < pmatch[0].rm_eo)
			start_search = pmatch[0].rm_eo;
		else
			start_search++;
		if (start_search > wide_len)
			break;
972 973
	}

974 975 976
	/* Clean up temp storage */
	pfree(wide_str);
	pfree(pmatch);
977

978 979 980
	return matchctx;
}

981 982 983 984
/*
 * cleanup_regexp_matches - release memory of a regexp_matches_ctx
 */
static void
985
cleanup_regexp_matches(regexp_matches_ctx *matchctx)
986 987 988 989 990 991 992 993 994 995
{
	pfree(matchctx->orig_str);
	pfree(matchctx->match_locs);
	if (matchctx->elems)
		pfree(matchctx->elems);
	if (matchctx->nulls)
		pfree(matchctx->nulls);
	pfree(matchctx);
}

996 997 998 999
/*
 * build_regexp_matches_result - build output array for current match
 */
static ArrayType *
1000
build_regexp_matches_result(regexp_matches_ctx *matchctx)
1001 1002 1003
{
	Datum	   *elems = matchctx->elems;
	bool	   *nulls = matchctx->nulls;
B
Bruce Momjian 已提交
1004 1005
	int			dims[1];
	int			lbs[1];
1006 1007
	int			loc;
	int			i;
1008

1009 1010 1011 1012
	/* Extract matching substrings from the original string */
	loc = matchctx->next_match * matchctx->npatterns * 2;
	for (i = 0; i < matchctx->npatterns; i++)
	{
B
Bruce Momjian 已提交
1013 1014
		int			so = matchctx->match_locs[loc++];
		int			eo = matchctx->match_locs[loc++];
1015 1016 1017 1018 1019 1020 1021 1022 1023

		if (so < 0 || eo < 0)
		{
			elems[i] = (Datum) 0;
			nulls[i] = true;
		}
		else
		{
			elems[i] = DirectFunctionCall3(text_substr,
B
Bruce Momjian 已提交
1024
										 PointerGetDatum(matchctx->orig_str),
1025 1026 1027 1028
										   Int32GetDatum(so + 1),
										   Int32GetDatum(eo - so));
			nulls[i] = false;
		}
1029 1030
	}

1031 1032 1033
	/* And form an array */
	dims[0] = matchctx->npatterns;
	lbs[0] = 1;
1034
	/* XXX: this hardcodes assumptions about the text type */
1035
	return construct_md_array(elems, nulls, 1, dims, lbs,
1036
							  TEXTOID, -1, false, 'i');
1037 1038
}

1039 1040 1041 1042 1043
/*
 * regexp_split_to_table()
 *		Split the string at matches of the pattern, returning the
 *		split-out substrings as a table.
 */
1044 1045 1046
Datum
regexp_split_to_table(PG_FUNCTION_ARGS)
{
B
Bruce Momjian 已提交
1047
	FuncCallContext *funcctx;
1048
	regexp_matches_ctx *splitctx;
1049 1050 1051

	if (SRF_IS_FIRSTCALL())
	{
B
Bruce Momjian 已提交
1052 1053 1054
		text	   *pattern = PG_GETARG_TEXT_PP(1);
		text	   *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
		MemoryContext oldcontext;
1055 1056 1057 1058

		funcctx = SRF_FIRSTCALL_INIT();
		oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);

1059 1060
		/* be sure to copy the input string into the multi-call ctx */
		splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1061 1062 1063
										flags,
										PG_GET_COLLATION(),
										true, false, true);
1064 1065 1066 1067 1068 1069

		MemoryContextSwitchTo(oldcontext);
		funcctx->user_fctx = (void *) splitctx;
	}

	funcctx = SRF_PERCALL_SETUP();
1070
	splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1071

1072
	if (splitctx->next_match <= splitctx->nmatches)
1073
	{
B
Bruce Momjian 已提交
1074
		Datum		result = build_regexp_split_result(splitctx);
1075 1076 1077

		splitctx->next_match++;
		SRF_RETURN_NEXT(funcctx, result);
1078 1079
	}

1080 1081 1082
	/* release space in multi-call ctx to avoid intraquery memory leak */
	cleanup_regexp_matches(splitctx);

1083
	SRF_RETURN_DONE(funcctx);
1084 1085
}

1086
/* This is separate to keep the opr_sanity regression test from complaining */
B
Bruce Momjian 已提交
1087 1088
Datum
regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
1089 1090 1091 1092
{
	return regexp_split_to_table(fcinfo);
}

1093 1094 1095 1096 1097
/*
 * regexp_split_to_array()
 *		Split the string at matches of the pattern, returning the
 *		split-out substrings as an array.
 */
B
Bruce Momjian 已提交
1098 1099
Datum
regexp_split_to_array(PG_FUNCTION_ARGS)
1100
{
B
Bruce Momjian 已提交
1101 1102
	ArrayBuildState *astate = NULL;
	regexp_matches_ctx *splitctx;
1103

1104 1105 1106
	splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
									PG_GETARG_TEXT_PP(1),
									PG_GETARG_TEXT_PP_IF_EXISTS(2),
1107
									PG_GET_COLLATION(),
1108
									true, false, true);
1109

1110
	while (splitctx->next_match <= splitctx->nmatches)
1111 1112
	{
		astate = accumArrayResult(astate,
1113
								  build_regexp_split_result(splitctx),
1114
								  false,
1115
								  TEXTOID,
1116
								  CurrentMemoryContext);
1117
		splitctx->next_match++;
1118 1119
	}

1120
	/*
B
Bruce Momjian 已提交
1121 1122 1123
	 * We don't call cleanup_regexp_matches here; it would try to pfree the
	 * input string, which we didn't copy.  The space is not in a long-lived
	 * memory context anyway.
1124 1125
	 */

1126 1127 1128
	PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
}

1129
/* This is separate to keep the opr_sanity regression test from complaining */
B
Bruce Momjian 已提交
1130 1131
Datum
regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1132 1133 1134 1135
{
	return regexp_split_to_array(fcinfo);
}

1136 1137 1138 1139 1140 1141
/*
 * build_regexp_split_result - build output string for current match
 *
 * We return the string between the current match and the previous one,
 * or the string after the last match when next_match == nmatches.
 */
1142
static Datum
1143
build_regexp_split_result(regexp_matches_ctx *splitctx)
1144
{
B
Bruce Momjian 已提交
1145 1146
	int			startpos;
	int			endpos;
1147

1148 1149 1150 1151 1152 1153
	if (splitctx->next_match > 0)
		startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
	else
		startpos = 0;
	if (startpos < 0)
		elog(ERROR, "invalid match ending position");
1154

1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166
	if (splitctx->next_match < splitctx->nmatches)
	{
		endpos = splitctx->match_locs[splitctx->next_match * 2];
		if (endpos < startpos)
			elog(ERROR, "invalid match starting position");
		return DirectFunctionCall3(text_substr,
								   PointerGetDatum(splitctx->orig_str),
								   Int32GetDatum(startpos + 1),
								   Int32GetDatum(endpos - startpos));
	}
	else
	{
1167
		/* no more matches, return rest of string */
1168 1169 1170
		return DirectFunctionCall2(text_substr_no_len,
								   PointerGetDatum(splitctx->orig_str),
								   Int32GetDatum(startpos + 1));
1171 1172
	}
}
1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237

/*
 * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
 *
 * The result is NULL if there is no fixed prefix, else a palloc'd string.
 * If it is an exact match, not just a prefix, *exact is returned as TRUE.
 */
char *
regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
					bool *exact)
{
	char	   *result;
	regex_t    *re;
	int			cflags;
	int			re_result;
	pg_wchar   *str;
	size_t		slen;
	size_t		maxlen;
	char		errMsg[100];

	*exact = false;				/* default result */

	/* Compile RE */
	cflags = REG_ADVANCED;
	if (case_insensitive)
		cflags |= REG_ICASE;

	re = RE_compile_and_cache(text_re, cflags, collation);

	/* Examine it to see if there's a fixed prefix */
	re_result = pg_regprefix(re, &str, &slen);

	switch (re_result)
	{
		case REG_NOMATCH:
			return NULL;

		case REG_PREFIX:
			/* continue with wchar conversion */
			break;

		case REG_EXACT:
			*exact = true;
			/* continue with wchar conversion */
			break;

		default:
			/* re failed??? */
			pg_regerror(re_result, re, errMsg, sizeof(errMsg));
			ereport(ERROR,
					(errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
					 errmsg("regular expression failed: %s", errMsg)));
			break;
	}

	/* Convert pg_wchar result back to database encoding */
	maxlen = pg_database_encoding_max_length() * slen + 1;
	result = (char *) palloc(maxlen);
	slen = pg_wchar2mb_with_len(str, result, slen);
	Assert(slen < maxlen);

	free(str);

	return result;
}