like_support.c source code [PostgreSQL/src/backend/utils/adt/like_support.c]

1	/-------------------------------------------------------------------------*
2	*
3	* like_support.c
4	* Planner support functions for LIKE, regex, and related operators.
5	*
6	* These routines handle special optimization of operators that can be
7	* used with index scans even though they are not known to the executor's
8	* indexscan machinery. The key idea is that these operators allow us
9	* to derive approximate indexscan qual clauses, such that any tuples
10	* that pass the operator clause itself must also satisfy the simpler
11	* indexscan condition(s). Then we can use the indexscan machinery
12	* to avoid scanning as much of the table as we'd otherwise have to,
13	* while applying the original operator as a qpqual condition to ensure
14	* we deliver only the tuples we want. (In essence, we're using a regular
15	* index as if it were a lossy index.)
16	*
17	* An example of what we're doing is
18	* textfield LIKE 'abc%def'
19	* from which we can generate the indexscanable conditions
20	* textfield >= 'abc' AND textfield < 'abd'
21	* which allow efficient scanning of an index on textfield.
22	* (In reality, character set and collation issues make the transformation
23	* from LIKE to indexscan limits rather harder than one might think ...
24	* but that's the basic idea.)
25	*
26	* Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
27	* Portions Copyright (c) 1994, Regents of the University of California
28	*
29	*
30	* IDENTIFICATION
31	* src/backend/utils/adt/like_support.c
32	*
33	*-------------------------------------------------------------------------
34	*/
35	#include "postgres.h"
36
37	#include <math.h>
38
39	#include "access/htup_details.h"
40	#include "access/stratnum.h"
41	#include "catalog/pg_collation.h"
42	#include "catalog/pg_opfamily.h"
43	#include "catalog/pg_statistic.h"
44	#include "catalog/pg_type.h"
45	#include "mb/pg_wchar.h"
46	#include "nodes/makefuncs.h"
47	#include "nodes/nodeFuncs.h"
48	#include "nodes/supportnodes.h"
49	#include "utils/builtins.h"
50	#include "utils/datum.h"
51	#include "utils/lsyscache.h"
52	#include "utils/pg_locale.h"
53	#include "utils/selfuncs.h"
54	#include "utils/varlena.h"
55
56
57	typedef enum
58	{
59	Pattern_Type_Like,
60	Pattern_Type_Like_IC,
61	Pattern_Type_Regex,
62	Pattern_Type_Regex_IC,
63	Pattern_Type_Prefix
64	} Pattern_Type;
65
66	typedef enum
67	{
68	Pattern_Prefix_None, Pattern_Prefix_Partial, Pattern_Prefix_Exact
69	} Pattern_Prefix_Status;
70
71	static Node like_regex_support(Node rawreq, Pattern_Type ptype);
72	static List match_pattern_prefix(Node leftop,
73	Node *rightop,
74	Pattern_Type ptype,
75	Oid expr_coll,
76	Oid opfamily,
77	Oid indexcollation);
78	static double patternsel_common(PlannerInfo *root,
79	Oid oprid,
80	Oid opfuncid,
81	List *args,
82	int varRelid,
83	Oid collation,
84	Pattern_Type ptype,
85	bool negate);
86	static Pattern_Prefix_Status pattern_fixed_prefix(Const *patt,
87	Pattern_Type ptype,
88	Oid collation,
89	Const **prefix,
90	Selectivity *rest_selec);
91	static Selectivity prefix_selectivity(PlannerInfo *root,
92	VariableStatData *vardata,
93	Oid vartype, Oid opfamily, Const *prefixcon);
94	static Selectivity like_selectivity(const char patt, int* pattlen,
95	bool case_insensitive);
96	static Selectivity regex_selectivity(const char patt, int* pattlen,
97	bool case_insensitive,
98	int fixed_prefix_len);
99	static int pattern_char_isalpha(char c, bool is_multibyte,
100	pg_locale_t locale, bool locale_is_c);
101	static Const make_greater_string(const* Const str_const, FmgrInfo ltproc,
102	Oid collation);
103	static Datum string_to_datum(const char *str, Oid datatype);
104	static Const string_to_const(const* char *str, Oid datatype);
105	static Const string_to_bytea_const(const* char *str, size_t str_len);
106
107
108	/*
109	* Planner support functions for LIKE, regex, and related operators
110	*/
111	Datum
112	textlike_support(PG_FUNCTION_ARGS)
113	{
114	Node rawreq = (Node ) PG_GETARG_POINTER(`0`);
115
116	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like));
117	}
118
119	Datum
120	texticlike_support(PG_FUNCTION_ARGS)
121	{
122	Node rawreq = (Node ) PG_GETARG_POINTER(`0`);
123
124	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Like_IC));
125	}
126
127	Datum
128	textregexeq_support(PG_FUNCTION_ARGS)
129	{
130	Node rawreq = (Node ) PG_GETARG_POINTER(`0`);
131
132	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex));
133	}
134
135	Datum
136	texticregexeq_support(PG_FUNCTION_ARGS)
137	{
138	Node rawreq = (Node ) PG_GETARG_POINTER(`0`);
139
140	PG_RETURN_POINTER(like_regex_support(rawreq, Pattern_Type_Regex_IC));
141	}
142
143	/ Common code for the above /
144	static Node *
145	like_regex_support(Node *rawreq, Pattern_Type ptype)
146	{
147	Node *ret = NULL;
148
149	if (IsA(rawreq, SupportRequestSelectivity))
150	{
151	/*
152	* Make a selectivity estimate for a function call, just as we'd do if
153	* the call was via the corresponding operator.
154	*/
155	SupportRequestSelectivity req = (SupportRequestSelectivity ) rawreq;
156	Selectivity s1;
157
158	if (req->is_join)
159	{
160	/*
161	* For the moment we just punt. If patternjoinsel is ever
162	* improved to do better, this should be made to call it.
163	*/
164	s1 = DEFAULT_MATCH_SEL;
165	}
166	else
167	{
168	/ Share code with operator restriction selectivity functions /
169	s1 = patternsel_common(req->root,
170	InvalidOid,
171	req->funcid,
172	req->args,
173	req->varRelid,
174	req->inputcollid,
175	ptype,
176	false);
177	}
178	req->selectivity = s1;
179	ret = (Node *) req;
180	}
181	else if (IsA(rawreq, SupportRequestIndexCondition))
182	{
183	/ Try to convert operator/function call to index conditions /
184	SupportRequestIndexCondition req = (SupportRequestIndexCondition ) rawreq;
185
186	/*
187	* Currently we have no "reverse" match operators with the pattern on
188	* the left, so we only need consider cases with the indexkey on the
189	* left.
190	*/
191	if (req->indexarg != `0`)
192	return NULL;
193
194	if (is_opclause(req->node))
195	{
196	OpExpr clause = (OpExpr ) req->node;
197
198	Assert(list_length(clause->args) == `2`);
199	ret = (Node *)
200	match_pattern_prefix((Node *) linitial(clause->args),
201	(Node *) lsecond(clause->args),
202	ptype,
203	clause->inputcollid,
204	req->opfamily,
205	req->indexcollation);
206	}
207	else if (is_funcclause(req->node)) / be paranoid /
208	{
209	FuncExpr clause = (FuncExpr ) req->node;
210
211	Assert(list_length(clause->args) == `2`);
212	ret = (Node *)
213	match_pattern_prefix((Node *) linitial(clause->args),
214	(Node *) lsecond(clause->args),
215	ptype,
216	clause->inputcollid,
217	req->opfamily,
218	req->indexcollation);
219	}
220	}
221
222	return ret;
223	}
224
225	/*
226	* match_pattern_prefix
227	* Try to generate an indexqual for a LIKE or regex operator.
228	*/
229	static List *
230	match_pattern_prefix(Node *leftop,
231	Node *rightop,
232	Pattern_Type ptype,
233	Oid expr_coll,
234	Oid opfamily,
235	Oid indexcollation)
236	{
237	List *result;
238	Const *patt;
239	Const *prefix;
240	Pattern_Prefix_Status pstatus;
241	Oid ldatatype;
242	Oid rdatatype;
243	Oid oproid;
244	Expr *expr;
245	FmgrInfo ltproc;
246	Const *greaterstr;
247
248	/*
249	* Can't do anything with a non-constant or NULL pattern argument.
250	*
251	* Note that since we restrict ourselves to cases with a hard constant on
252	* the RHS, it's a-fortiori a pseudoconstant, and we don't need to worry
253	* about verifying that.
254	*/
255	if (!IsA(rightop, Const) \|\|
256	((Const *) rightop)->constisnull)
257	return NIL;
258	patt = (Const *) rightop;
259
260	/*
261	* Not supported if the expression collation is nondeterministic. The
262	* optimized equality or prefix tests use bytewise comparisons, which is
263	* not consistent with nondeterministic collations. The actual
264	* pattern-matching implementation functions will later error out that
265	* pattern-matching is not supported with nondeterministic collations. (We
266	* could also error out here, but by doing it later we get more precise
267	* error messages.) (It should be possible to support at least
268	* Pattern_Prefix_Exact, but no point as along as the actual
269	* pattern-matching implementations don't support it.)
270	*
271	* expr_coll is not set for a non-collation-aware data type such as bytea.
272	*/
273	if (expr_coll && !get_collation_isdeterministic(expr_coll))
274	return NIL;
275
276	/*
277	* Try to extract a fixed prefix from the pattern.
278	*/
279	pstatus = pattern_fixed_prefix(patt, ptype, expr_coll,
280	&prefix, NULL);
281
282	/ fail if no fixed prefix /
283	if (pstatus == Pattern_Prefix_None)
284	return NIL;
285
286	/*
287	* Must also check that index's opfamily supports the operators we will
288	* want to apply. (A hash index, for example, will not support ">=".)
289	* Currently, only btree and spgist support the operators we need.
290	*
291	* Note: actually, in the Pattern_Prefix_Exact case, we only need "=" so a
292	* hash index would work. Currently it doesn't seem worth checking for
293	* that, however.
294	*
295	* We insist on the opfamily being one of the specific ones we expect,
296	* else we'd do the wrong thing if someone were to make a reverse-sort
297	* opfamily with the same operators.
298	*
299	* The non-pattern opclasses will not sort the way we need in most non-C
300	* locales. We can use such an index anyway for an exact match (simple
301	* equality), but not for prefix-match cases. Note that here we are
302	* looking at the index's collation, not the expression's collation --
303	* this test is not dependent on the LIKE/regex operator's collation.
304	*
305	* While we're at it, identify the type the comparison constant(s) should
306	* have, based on the opfamily.
307	*/
308	switch (opfamily)
309	{
310	case TEXT_BTREE_FAM_OID:
311	if (!(pstatus == Pattern_Prefix_Exact \|\|
312	lc_collate_is_c(indexcollation)))
313	return NIL;
314	rdatatype = TEXTOID;
315	break;
316
317	case TEXT_PATTERN_BTREE_FAM_OID:
318	case TEXT_SPGIST_FAM_OID:
319	rdatatype = TEXTOID;
320	break;
321
322	case BPCHAR_BTREE_FAM_OID:
323	if (!(pstatus == Pattern_Prefix_Exact \|\|
324	lc_collate_is_c(indexcollation)))
325	return NIL;
326	rdatatype = BPCHAROID;
327	break;
328
329	case BPCHAR_PATTERN_BTREE_FAM_OID:
330	rdatatype = BPCHAROID;
331	break;
332
333	case BYTEA_BTREE_FAM_OID:
334	rdatatype = BYTEAOID;
335	break;
336
337	default:
338	return NIL;
339	}
340
341	/ OK, prepare to create the indexqual(s) /
342	ldatatype = exprType(leftop);
343
344	/*
345	* If necessary, coerce the prefix constant to the right type. The given
346	* prefix constant is either text or bytea type, therefore the only case
347	* where we need to do anything is when converting text to bpchar. Those
348	* two types are binary-compatible, so relabeling the Const node is
349	* sufficient.
350	*/
351	if (prefix->consttype != rdatatype)
352	{
353	Assert(prefix->consttype == TEXTOID &&
354	rdatatype == BPCHAROID);
355	prefix->consttype = rdatatype;
356	}
357
358	/*
359	* If we found an exact-match pattern, generate an "=" indexqual.
360	*/
361	if (pstatus == Pattern_Prefix_Exact)
362	{
363	oproid = get_opfamily_member(opfamily, ldatatype, rdatatype,
364	BTEqualStrategyNumber);
365	if (oproid == InvalidOid)
366	elog(ERROR, "no = operator for opfamily %u", opfamily);
367	expr = make_opclause(oproid, BOOLOID, false,
368	(Expr ) leftop, (Expr ) prefix,
369	InvalidOid, indexcollation);
370	result = list_make1(expr);
371	return result;
372	}
373
374	/*
375	* Otherwise, we have a nonempty required prefix of the values.
376	*
377	* We can always say "x >= prefix".
378	*/
379	oproid = get_opfamily_member(opfamily, ldatatype, rdatatype,
380	BTGreaterEqualStrategyNumber);
381	if (oproid == InvalidOid)
382	elog(ERROR, "no >= operator for opfamily %u", opfamily);
383	expr = make_opclause(oproid, BOOLOID, false,
384	(Expr ) leftop, (Expr ) prefix,
385	InvalidOid, indexcollation);
386	result = list_make1(expr);
387
388	/-------*
389	* If we can create a string larger than the prefix, we can say
390	* "x < greaterstr". NB: we rely on make_greater_string() to generate
391	* a guaranteed-greater string, not just a probably-greater string.
392	* In general this is only guaranteed in C locale, so we'd better be
393	* using a C-locale index collation.
394	*-------
395	*/
396	oproid = get_opfamily_member(opfamily, ldatatype, rdatatype,
397	BTLessStrategyNumber);
398	if (oproid == InvalidOid)
399	elog(ERROR, "no < operator for opfamily %u", opfamily);
400	fmgr_info(get_opcode(oproid), &ltproc);
401	greaterstr = make_greater_string(prefix, &ltproc, indexcollation);
402	if (greaterstr)
403	{
404	expr = make_opclause(oproid, BOOLOID, false,
405	(Expr ) leftop, (Expr ) greaterstr,
406	InvalidOid, indexcollation);
407	result = lappend(result, expr);
408	}
409
410	return result;
411	}
412
413
414	/*
415	* patternsel_common - generic code for pattern-match restriction selectivity.
416	*
417	* To support using this from either the operator or function paths, caller
418	* may pass either operator OID or underlying function OID; we look up the
419	* latter from the former if needed. (We could just have patternsel() call
420	* get_opcode(), but the work would be wasted if we don't have a need to
421	* compare a fixed prefix to the pg_statistic data.)
422	*
423	* Note that oprid and/or opfuncid should be for the positive-match operator
424	* even when negate is true.
425	*/
426	static double
427	patternsel_common(PlannerInfo *root,
428	Oid oprid,
429	Oid opfuncid,
430	List *args,
431	int varRelid,
432	Oid collation,
433	Pattern_Type ptype,
434	bool negate)
435	{
436	VariableStatData vardata;
437	Node *other;
438	bool varonleft;
439	Datum constval;
440	Oid consttype;
441	Oid vartype;
442	Oid opfamily;
443	Pattern_Prefix_Status pstatus;
444	Const *patt;
445	Const *prefix = NULL;
446	Selectivity rest_selec = `0`;
447	double nullfrac = `0.0`;
448	double result;
449
450	/*
451	* Initialize result to the appropriate default estimate depending on
452	* whether it's a match or not-match operator.
453	*/
454	if (negate)
455	result = `1.0` - DEFAULT_MATCH_SEL;
456	else
457	result = DEFAULT_MATCH_SEL;
458
459	/*
460	* If expression is not variable op constant, then punt and return the
461	* default estimate.
462	*/
463	if (!get_restriction_variable(root, args, varRelid,
464	&vardata, &other, &varonleft))
465	return result;
466	if (!varonleft \|\| !IsA(other, Const))
467	{
468	ReleaseVariableStats(vardata);
469	return result;
470	}
471
472	/*
473	* If the constant is NULL, assume operator is strict and return zero, ie,
474	* operator will never return TRUE. (It's zero even for a negator op.)
475	*/
476	if (((Const *) other)->constisnull)
477	{
478	ReleaseVariableStats(vardata);
479	return `0.0`;
480	}
481	constval = ((Const *) other)->constvalue;
482	consttype = ((Const *) other)->consttype;
483
484	/*
485	* The right-hand const is type text or bytea for all supported operators.
486	* We do not expect to see binary-compatible types here, since
487	* const-folding should have relabeled the const to exactly match the
488	* operator's declared type.
489	*/
490	if (consttype != TEXTOID && consttype != BYTEAOID)
491	{
492	ReleaseVariableStats(vardata);
493	return result;
494	}
495
496	/*
497	* Similarly, the exposed type of the left-hand side should be one of
498	* those we know. (Do not look at vardata.atttype, which might be
499	* something binary-compatible but different.) We can use it to choose
500	* the index opfamily from which we must draw the comparison operators.
501	*
502	* NOTE: It would be more correct to use the PATTERN opfamilies than the
503	* simple ones, but at the moment ANALYZE will not generate statistics for
504	* the PATTERN operators. But our results are so approximate anyway that
505	* it probably hardly matters.
506	*/
507	vartype = vardata.vartype;
508
509	switch (vartype)
510	{
511	case TEXTOID:
512	case NAMEOID:
513	opfamily = TEXT_BTREE_FAM_OID;
514	break;
515	case BPCHAROID:
516	opfamily = BPCHAR_BTREE_FAM_OID;
517	break;
518	case BYTEAOID:
519	opfamily = BYTEA_BTREE_FAM_OID;
520	break;
521	default:
522	ReleaseVariableStats(vardata);
523	return result;
524	}
525
526	/*
527	* Grab the nullfrac for use below.
528	*/
529	if (HeapTupleIsValid(vardata.statsTuple))
530	{
531	Form_pg_statistic stats;
532
533	stats = (Form_pg_statistic) GETSTRUCT(vardata.statsTuple);
534	nullfrac = stats->stanullfrac;
535	}
536
537	/*
538	* Pull out any fixed prefix implied by the pattern, and estimate the
539	* fractional selectivity of the remainder of the pattern. Unlike many
540	* other selectivity estimators, we use the pattern operator's actual
541	* collation for this step. This is not because we expect the collation
542	* to make a big difference in the selectivity estimate (it seldom would),
543	* but because we want to be sure we cache compiled regexps under the
544	* right cache key, so that they can be re-used at runtime.
545	*/
546	patt = (Const *) other;
547	pstatus = pattern_fixed_prefix(patt, ptype, collation,
548	&prefix, &rest_selec);
549
550	/*
551	* If necessary, coerce the prefix constant to the right type.
552	*/
553	if (prefix && prefix->consttype != vartype)
554	{
555	char *prefixstr;
556
557	switch (prefix->consttype)
558	{
559	case TEXTOID:
560	prefixstr = TextDatumGetCString(prefix->constvalue);
561	break;
562	case BYTEAOID:
563	prefixstr = DatumGetCString(DirectFunctionCall1(byteaout,
564	prefix->constvalue));
565	break;
566	default:
567	elog(ERROR, "unrecognized consttype: %u",
568	prefix->consttype);
569	ReleaseVariableStats(vardata);
570	return result;
571	}
572	prefix = string_to_const(prefixstr, vartype);
573	pfree(prefixstr);
574	}
575
576	if (pstatus == Pattern_Prefix_Exact)
577	{
578	/*
579	* Pattern specifies an exact match, so pretend operator is '='
580	*/
581	Oid eqopr = get_opfamily_member(opfamily, vartype, vartype,
582	BTEqualStrategyNumber);
583
584	if (eqopr == InvalidOid)
585	elog(ERROR, "no = operator for opfamily %u", opfamily);
586	result = var_eq_const(&vardata, eqopr, prefix->constvalue,
587	false, true, false);
588	}
589	else
590	{
591	/*
592	* Not exact-match pattern. If we have a sufficiently large
593	* histogram, estimate selectivity for the histogram part of the
594	* population by counting matches in the histogram. If not, estimate
595	* selectivity of the fixed prefix and remainder of pattern
596	* separately, then combine the two to get an estimate of the
597	* selectivity for the part of the column population represented by
598	* the histogram. (For small histograms, we combine these
599	* approaches.)
600	*
601	* We then add up data for any most-common-values values; these are
602	* not in the histogram population, and we can get exact answers for
603	* them by applying the pattern operator, so there's no reason to
604	* approximate. (If the MCVs cover a significant part of the total
605	* population, this gives us a big leg up in accuracy.)
606	*/
607	Selectivity selec;
608	int hist_size;
609	FmgrInfo opproc;
610	double mcv_selec,
611	sumcommon;
612
613	/ Try to use the histogram entries to get selectivity /
614	if (!OidIsValid(opfuncid))
615	opfuncid = get_opcode(oprid);
616	fmgr_info(opfuncid, &opproc);
617
618	selec = histogram_selectivity(&vardata, &opproc, constval, true,
619	`10`, `1`, &hist_size);
620
621	/ If not at least 100 entries, use the heuristic method /
622	if (hist_size < `100`)
623	{
624	Selectivity heursel;
625	Selectivity prefixsel;
626
627	if (pstatus == Pattern_Prefix_Partial)
628	prefixsel = prefix_selectivity(root, &vardata, vartype,
629	opfamily, prefix);
630	else
631	prefixsel = `1.0`;
632	heursel = prefixsel * rest_selec;
633
634	if (selec < `0`) / fewer than 10 histogram entries? /
635	selec = heursel;
636	else
637	{
638	/*
639	* For histogram sizes from 10 to 100, we combine the
640	* histogram and heuristic selectivities, putting increasingly
641	* more trust in the histogram for larger sizes.
642	*/
643	double hist_weight = hist_size / `100.0`;
644
645	selec = selec * hist_weight + heursel * (`1.0` - hist_weight);
646	}
647	}
648
649	/ In any case, don't believe extremely small or large estimates. /
650	if (selec < `0.0001`)
651	selec = `0.0001`;
652	else if (selec > `0.9999`)
653	selec = `0.9999`;
654
655	/*
656	* If we have most-common-values info, add up the fractions of the MCV
657	* entries that satisfy MCV OP PATTERN. These fractions contribute
658	* directly to the result selectivity. Also add up the total fraction
659	* represented by MCV entries.
660	*/
661	mcv_selec = mcv_selectivity(&vardata, &opproc, constval, true,
662	&sumcommon);
663
664	/*
665	* Now merge the results from the MCV and histogram calculations,
666	* realizing that the histogram covers only the non-null values that
667	* are not listed in MCV.
668	*/
669	selec *= `1.0` - nullfrac - sumcommon;
670	selec += mcv_selec;
671	result = selec;
672	}
673
674	/ now adjust if we wanted not-match rather than match /
675	if (negate)
676	result = `1.0` - result - nullfrac;
677
678	/ result should be in range, but make sure... /
679	CLAMP_PROBABILITY(result);
680
681	if (prefix)
682	{
683	pfree(DatumGetPointer(prefix->constvalue));
684	pfree(prefix);
685	}
686
687	ReleaseVariableStats(vardata);
688
689	return result;
690	}
691
692	/*
693	* Fix impedance mismatch between SQL-callable functions and patternsel_common
694	*/
695	static double
696	patternsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
697	{
698	PlannerInfo root = (PlannerInfo ) PG_GETARG_POINTER(`0`);
699	Oid operator = PG_GETARG_OID(`1`);
700	List args = (List ) PG_GETARG_POINTER(`2`);
701	int varRelid = PG_GETARG_INT32(`3`);
702	Oid collation = PG_GET_COLLATION();
703
704	/*
705	* If this is for a NOT LIKE or similar operator, get the corresponding
706	* positive-match operator and work with that.
707	*/
708	if (negate)
709	{
710	operator = get_negator(operator);
711	if (!OidIsValid(operator))
712	elog(ERROR, "patternsel called for operator without a negator");
713	}
714
715	return patternsel_common(root,
716	operator,
717	InvalidOid,
718	args,
719	varRelid,
720	collation,
721	ptype,
722	negate);
723	}
724
725	/*
726	* regexeqsel - Selectivity of regular-expression pattern match.
727	*/
728	Datum
729	regexeqsel(PG_FUNCTION_ARGS)
730	{
731	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, false));
732	}
733
734	/*
735	* icregexeqsel - Selectivity of case-insensitive regex match.
736	*/
737	Datum
738	icregexeqsel(PG_FUNCTION_ARGS)
739	{
740	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, false));
741	}
742
743	/*
744	* likesel - Selectivity of LIKE pattern match.
745	*/
746	Datum
747	likesel(PG_FUNCTION_ARGS)
748	{
749	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, false));
750	}
751
752	/*
753	* prefixsel - selectivity of prefix operator
754	*/
755	Datum
756	prefixsel(PG_FUNCTION_ARGS)
757	{
758	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Prefix, false));
759	}
760
761	/*
762	*
763	* iclikesel - Selectivity of ILIKE pattern match.
764	*/
765	Datum
766	iclikesel(PG_FUNCTION_ARGS)
767	{
768	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, false));
769	}
770
771	/*
772	* regexnesel - Selectivity of regular-expression pattern non-match.
773	*/
774	Datum
775	regexnesel(PG_FUNCTION_ARGS)
776	{
777	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex, true));
778	}
779
780	/*
781	* icregexnesel - Selectivity of case-insensitive regex non-match.
782	*/
783	Datum
784	icregexnesel(PG_FUNCTION_ARGS)
785	{
786	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Regex_IC, true));
787	}
788
789	/*
790	* nlikesel - Selectivity of LIKE pattern non-match.
791	*/
792	Datum
793	nlikesel(PG_FUNCTION_ARGS)
794	{
795	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like, true));
796	}
797
798	/*
799	* icnlikesel - Selectivity of ILIKE pattern non-match.
800	*/
801	Datum
802	icnlikesel(PG_FUNCTION_ARGS)
803	{
804	PG_RETURN_FLOAT8(patternsel(fcinfo, Pattern_Type_Like_IC, true));
805	}
806
807	/*
808	* patternjoinsel - Generic code for pattern-match join selectivity.
809	*/
810	static double
811	patternjoinsel(PG_FUNCTION_ARGS, Pattern_Type ptype, bool negate)
812	{
813	/ For the moment we just punt. /
814	return negate ? (`1.0` - DEFAULT_MATCH_SEL) : DEFAULT_MATCH_SEL;
815	}
816
817	/*
818	* regexeqjoinsel - Join selectivity of regular-expression pattern match.
819	*/
820	Datum
821	regexeqjoinsel(PG_FUNCTION_ARGS)
822	{
823	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, false));
824	}
825
826	/*
827	* icregexeqjoinsel - Join selectivity of case-insensitive regex match.
828	*/
829	Datum
830	icregexeqjoinsel(PG_FUNCTION_ARGS)
831	{
832	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, false));
833	}
834
835	/*
836	* likejoinsel - Join selectivity of LIKE pattern match.
837	*/
838	Datum
839	likejoinsel(PG_FUNCTION_ARGS)
840	{
841	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, false));
842	}
843
844	/*
845	* prefixjoinsel - Join selectivity of prefix operator
846	*/
847	Datum
848	prefixjoinsel(PG_FUNCTION_ARGS)
849	{
850	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Prefix, false));
851	}
852
853	/*
854	* iclikejoinsel - Join selectivity of ILIKE pattern match.
855	*/
856	Datum
857	iclikejoinsel(PG_FUNCTION_ARGS)
858	{
859	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, false));
860	}
861
862	/*
863	* regexnejoinsel - Join selectivity of regex non-match.
864	*/
865	Datum
866	regexnejoinsel(PG_FUNCTION_ARGS)
867	{
868	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex, true));
869	}
870
871	/*
872	* icregexnejoinsel - Join selectivity of case-insensitive regex non-match.
873	*/
874	Datum
875	icregexnejoinsel(PG_FUNCTION_ARGS)
876	{
877	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Regex_IC, true));
878	}
879
880	/*
881	* nlikejoinsel - Join selectivity of LIKE pattern non-match.
882	*/
883	Datum
884	nlikejoinsel(PG_FUNCTION_ARGS)
885	{
886	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like, true));
887	}
888
889	/*
890	* icnlikejoinsel - Join selectivity of ILIKE pattern non-match.
891	*/
892	Datum
893	icnlikejoinsel(PG_FUNCTION_ARGS)
894	{
895	PG_RETURN_FLOAT8(patternjoinsel(fcinfo, Pattern_Type_Like_IC, true));
896	}
897
898
899	/-------------------------------------------------------------------------*
900	*
901	* Pattern analysis functions
902	*
903	* These routines support analysis of LIKE and regular-expression patterns
904	* by the planner/optimizer. It's important that they agree with the
905	* regular-expression code in backend/regex/ and the LIKE code in
906	* backend/utils/adt/like.c. Also, the computation of the fixed prefix
907	* must be conservative: if we report a string longer than the true fixed
908	* prefix, the query may produce actually wrong answers, rather than just
909	* getting a bad selectivity estimate!
910	*
911	*-------------------------------------------------------------------------
912	*/
913
914	/*
915	* Extract the fixed prefix, if any, for a pattern.
916	*
917	* *prefix is set to a palloc'd prefix string (in the form of a Const node),
918	* or to NULL if no fixed prefix exists for the pattern.
919	* If rest_selec is not NULL, *rest_selec is set to an estimate of the
920	* selectivity of the remainder of the pattern (without any fixed prefix).
921	* The prefix Const has the same type (TEXT or BYTEA) as the input pattern.
922	*
923	* The return value distinguishes no fixed prefix, a partial prefix,
924	* or an exact-match-only pattern.
925	*/
926
927	static Pattern_Prefix_Status
928	like_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
929	Const *prefix_const, Selectivity rest_selec)
930	{
931	char *match;
932	char *patt;
933	int pattlen;
934	Oid typeid = patt_const->consttype;
935	int pos,
936	match_pos;
937	bool is_multibyte = (pg_database_encoding_max_length() > `1`);
938	pg_locale_t locale = `0`;
939	bool locale_is_c = false;
940
941	/ the right-hand const is type text or bytea /
942	Assert(typeid == BYTEAOID \|\| typeid == TEXTOID);
943
944	if (case_insensitive)
945	{
946	if (typeid == BYTEAOID)
947	ereport(ERROR,
948	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
949	errmsg("case insensitive matching not supported on type bytea")));
950
951	/ If case-insensitive, we need locale info /
952	if (lc_ctype_is_c(collation))
953	locale_is_c = true;
954	else if (collation != DEFAULT_COLLATION_OID)
955	{
956	if (!OidIsValid(collation))
957	{
958	/*
959	* This typically means that the parser could not resolve a
960	* conflict of implicit collations, so report it that way.
961	*/
962	ereport(ERROR,
963	(errcode(ERRCODE_INDETERMINATE_COLLATION),
964	errmsg("could not determine which collation to use for ILIKE"),
965	errhint("Use the COLLATE clause to set the collation explicitly.")));
966	}
967	locale = pg_newlocale_from_collation(collation);
968	}
969	}
970
971	if (typeid != BYTEAOID)
972	{
973	patt = TextDatumGetCString(patt_const->constvalue);
974	pattlen = strlen(patt);
975	}
976	else
977	{
978	bytea *bstr = DatumGetByteaPP(patt_const->constvalue);
979
980	pattlen = VARSIZE_ANY_EXHDR(bstr);
981	patt = (char *) palloc(pattlen);
982	memcpy(patt, VARDATA_ANY(bstr), pattlen);
983	Assert((Pointer) bstr == DatumGetPointer(patt_const->constvalue));
984	}
985
986	match = palloc(pattlen + `1`);
987	match_pos = `0`;
988	for (pos = `0`; pos < pattlen; pos++)
989	{
990	/ % and _ are wildcard characters in LIKE /
991	if (patt[pos] == `'%'` \|\|
992	patt[pos] == `'_'`)
993	break;
994
995	/ Backslash escapes the next character /
996	if (patt[pos] == `'\\'`)
997	{
998	pos++;
999	if (pos >= pattlen)
1000	break;
1001	}
1002
1003	/ Stop if case-varying character (it's sort of a wildcard) /
1004	if (case_insensitive &&
1005	pattern_char_isalpha(patt[pos], is_multibyte, locale, locale_is_c))
1006	break;
1007
1008	match[match_pos++] = patt[pos];
1009	}
1010
1011	match[match_pos] = `'\0'`;
1012
1013	if (typeid != BYTEAOID)
1014	*prefix_const = string_to_const(match, typeid);
1015	else
1016	*prefix_const = string_to_bytea_const(match, match_pos);
1017
1018	if (rest_selec != NULL)
1019	*rest_selec = like_selectivity(&patt[pos], pattlen - pos,
1020	case_insensitive);
1021
1022	pfree(patt);
1023	pfree(match);
1024
1025	/ in LIKE, an empty pattern is an exact match! /
1026	if (pos == pattlen)
1027	return Pattern_Prefix_Exact; / reached end of pattern, so exact /
1028
1029	if (match_pos > `0`)
1030	return Pattern_Prefix_Partial;
1031
1032	return Pattern_Prefix_None;
1033	}
1034
1035	static Pattern_Prefix_Status
1036	regex_fixed_prefix(Const *patt_const, bool case_insensitive, Oid collation,
1037	Const *prefix_const, Selectivity rest_selec)
1038	{
1039	Oid typeid = patt_const->consttype;
1040	char *prefix;
1041	bool exact;
1042
1043	/*
1044	* Should be unnecessary, there are no bytea regex operators defined. As
1045	* such, it should be noted that the rest of this function has not been
1046	* made safe for binary (possibly NULL containing) strings.
1047	*/
1048	if (typeid == BYTEAOID)
1049	ereport(ERROR,
1050	(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1051	errmsg("regular-expression matching not supported on type bytea")));
1052
1053	/ Use the regexp machinery to extract the prefix, if any /
1054	prefix = regexp_fixed_prefix(DatumGetTextPP(patt_const->constvalue),
1055	case_insensitive, collation,
1056	&exact);
1057
1058	if (prefix == NULL)
1059	{
1060	*prefix_const = NULL;
1061
1062	if (rest_selec != NULL)
1063	{
1064	char *patt = TextDatumGetCString(patt_const->constvalue);
1065
1066	*rest_selec = regex_selectivity(patt, strlen(patt),
1067	case_insensitive,
1068	`0`);
1069	pfree(patt);
1070	}
1071
1072	return Pattern_Prefix_None;
1073	}
1074
1075	*prefix_const = string_to_const(prefix, typeid);
1076
1077	if (rest_selec != NULL)
1078	{
1079	if (exact)
1080	{
1081	/ Exact match, so there's no additional selectivity /
1082	*rest_selec = `1.0`;
1083	}
1084	else
1085	{
1086	char *patt = TextDatumGetCString(patt_const->constvalue);
1087
1088	*rest_selec = regex_selectivity(patt, strlen(patt),
1089	case_insensitive,
1090	strlen(prefix));
1091	pfree(patt);
1092	}
1093	}
1094
1095	pfree(prefix);
1096
1097	if (exact)
1098	return Pattern_Prefix_Exact; / pattern specifies exact match /
1099	else
1100	return Pattern_Prefix_Partial;
1101	}
1102
1103	static Pattern_Prefix_Status
1104	pattern_fixed_prefix(Const *patt, Pattern_Type ptype, Oid collation,
1105	Const *prefix, Selectivity rest_selec)
1106	{
1107	Pattern_Prefix_Status result;
1108
1109	switch (ptype)
1110	{
1111	case Pattern_Type_Like:
1112	result = like_fixed_prefix(patt, false, collation,
1113	prefix, rest_selec);
1114	break;
1115	case Pattern_Type_Like_IC:
1116	result = like_fixed_prefix(patt, true, collation,
1117	prefix, rest_selec);
1118	break;
1119	case Pattern_Type_Regex:
1120	result = regex_fixed_prefix(patt, false, collation,
1121	prefix, rest_selec);
1122	break;
1123	case Pattern_Type_Regex_IC:
1124	result = regex_fixed_prefix(patt, true, collation,
1125	prefix, rest_selec);
1126	break;
1127	case Pattern_Type_Prefix:
1128	/ Prefix type work is trivial. /
1129	result = Pattern_Prefix_Partial;
1130	rest_selec = `1.0`; /* all /
1131	*prefix = makeConst(patt->consttype,
1132	patt->consttypmod,
1133	patt->constcollid,
1134	patt->constlen,
1135	datumCopy(patt->constvalue,
1136	patt->constbyval,
1137	patt->constlen),
1138	patt->constisnull,
1139	patt->constbyval);
1140	break;
1141	default:
1142	elog(ERROR, "unrecognized ptype: %d", (int) ptype);
1143	result = Pattern_Prefix_None; / keep compiler quiet /
1144	break;
1145	}
1146	return result;
1147	}
1148
1149	/*
1150	* Estimate the selectivity of a fixed prefix for a pattern match.
1151	*
1152	* A fixed prefix "foo" is estimated as the selectivity of the expression
1153	* "variable >= 'foo' AND variable < 'fop'" (see also indxpath.c).
1154	*
1155	* The selectivity estimate is with respect to the portion of the column
1156	* population represented by the histogram --- the caller must fold this
1157	* together with info about MCVs and NULLs.
1158	*
1159	* We use the >= and < operators from the specified btree opfamily to do the
1160	* estimation. The given variable and Const must be of the associated
1161	* datatype.
1162	*
1163	* XXX Note: we make use of the upper bound to estimate operator selectivity
1164	* even if the locale is such that we cannot rely on the upper-bound string.
1165	* The selectivity only needs to be approximately right anyway, so it seems
1166	* more useful to use the upper-bound code than not.
1167	*/
1168	static Selectivity
1169	prefix_selectivity(PlannerInfo root, VariableStatData vardata,
1170	Oid vartype, Oid opfamily, Const *prefixcon)
1171	{
1172	Selectivity prefixsel;
1173	Oid cmpopr;
1174	FmgrInfo opproc;
1175	AttStatsSlot sslot;
1176	Const *greaterstrcon;
1177	Selectivity eq_sel;
1178
1179	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
1180	BTGreaterEqualStrategyNumber);
1181	if (cmpopr == InvalidOid)
1182	elog(ERROR, "no >= operator for opfamily %u", opfamily);
1183	fmgr_info(get_opcode(cmpopr), &opproc);
1184
1185	prefixsel = ineq_histogram_selectivity(root, vardata,
1186	&opproc, true, true,
1187	prefixcon->constvalue,
1188	prefixcon->consttype);
1189
1190	if (prefixsel < `0.0`)
1191	{
1192	/ No histogram is present ... return a suitable default estimate /
1193	return DEFAULT_MATCH_SEL;
1194	}
1195
1196	/-------*
1197	* If we can create a string larger than the prefix, say
1198	* "x < greaterstr". We try to generate the string referencing the
1199	* collation of the var's statistics, but if that's not available,
1200	* use DEFAULT_COLLATION_OID.
1201	*-------
1202	*/
1203	if (HeapTupleIsValid(vardata->statsTuple) &&
1204	get_attstatsslot(&sslot, vardata->statsTuple,
1205	STATISTIC_KIND_HISTOGRAM, InvalidOid, `0`))
1206	/ sslot.stacoll is set up / ;
1207	else
1208	sslot.stacoll = DEFAULT_COLLATION_OID;
1209	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
1210	BTLessStrategyNumber);
1211	if (cmpopr == InvalidOid)
1212	elog(ERROR, "no < operator for opfamily %u", opfamily);
1213	fmgr_info(get_opcode(cmpopr), &opproc);
1214	greaterstrcon = make_greater_string(prefixcon, &opproc, sslot.stacoll);
1215	if (greaterstrcon)
1216	{
1217	Selectivity topsel;
1218
1219	topsel = ineq_histogram_selectivity(root, vardata,
1220	&opproc, false, false,
1221	greaterstrcon->constvalue,
1222	greaterstrcon->consttype);
1223
1224	/ ineq_histogram_selectivity worked before, it shouldn't fail now /
1225	Assert(topsel >= `0.0`);
1226
1227	/*
1228	* Merge the two selectivities in the same way as for a range query
1229	* (see clauselist_selectivity()). Note that we don't need to worry
1230	* about double-exclusion of nulls, since ineq_histogram_selectivity
1231	* doesn't count those anyway.
1232	*/
1233	prefixsel = topsel + prefixsel - `1.0`;
1234	}
1235
1236	/*
1237	* If the prefix is long then the two bounding values might be too close
1238	* together for the histogram to distinguish them usefully, resulting in a
1239	* zero estimate (plus or minus roundoff error). To avoid returning a
1240	* ridiculously small estimate, compute the estimated selectivity for
1241	* "variable = 'foo'", and clamp to that. (Obviously, the resultant
1242	* estimate should be at least that.)
1243	*
1244	* We apply this even if we couldn't make a greater string. That case
1245	* suggests that the prefix is near the maximum possible, and thus
1246	* probably off the end of the histogram, and thus we probably got a very
1247	* small estimate from the >= condition; so we still need to clamp.
1248	*/
1249	cmpopr = get_opfamily_member(opfamily, vartype, vartype,
1250	BTEqualStrategyNumber);
1251	if (cmpopr == InvalidOid)
1252	elog(ERROR, "no = operator for opfamily %u", opfamily);
1253	eq_sel = var_eq_const(vardata, cmpopr, prefixcon->constvalue,
1254	false, true, false);
1255
1256	prefixsel = Max(prefixsel, eq_sel);
1257
1258	return prefixsel;
1259	}
1260
1261
1262	/*
1263	* Estimate the selectivity of a pattern of the specified type.
1264	* Note that any fixed prefix of the pattern will have been removed already,
1265	* so actually we may be looking at just a fragment of the pattern.
1266	*
1267	* For now, we use a very simplistic approach: fixed characters reduce the
1268	* selectivity a good deal, character ranges reduce it a little,
1269	* wildcards (such as % for LIKE or .* for regex) increase it.
1270	*/
1271
1272	#define FIXED_CHAR_SEL 0.20 /* about 1/5 */
1273	#define CHAR_RANGE_SEL 0.25
1274	#define ANY_CHAR_SEL 0.9 /* not 1, since it won't match end-of-string */
1275	#define FULL_WILDCARD_SEL 5.0
1276	#define PARTIAL_WILDCARD_SEL 2.0
1277
1278	static Selectivity
1279	like_selectivity(const char patt, int* pattlen, bool case_insensitive)
1280	{
1281	Selectivity sel = `1.0`;
1282	int pos;
1283
1284	/ Skip any leading wildcard; it's already factored into initial sel /
1285	for (pos = `0`; pos < pattlen; pos++)
1286	{
1287	if (patt[pos] != `'%'` && patt[pos] != `'_'`)
1288	break;
1289	}
1290
1291	for (; pos < pattlen; pos++)
1292	{
1293	/ % and _ are wildcard characters in LIKE /
1294	if (patt[pos] == `'%'`)
1295	sel *= FULL_WILDCARD_SEL;
1296	else if (patt[pos] == `'_'`)
1297	sel *= ANY_CHAR_SEL;
1298	else if (patt[pos] == `'\\'`)
1299	{
1300	/ Backslash quotes the next character /
1301	pos++;
1302	if (pos >= pattlen)
1303	break;
1304	sel *= FIXED_CHAR_SEL;
1305	}
1306	else
1307	sel *= FIXED_CHAR_SEL;
1308	}
1309	/ Could get sel > 1 if multiple wildcards /
1310	if (sel > `1.0`)
1311	sel = `1.0`;
1312	return sel;
1313	}
1314
1315	static Selectivity
1316	regex_selectivity_sub(const char patt, int* pattlen, bool case_insensitive)
1317	{
1318	Selectivity sel = `1.0`;
1319	int paren_depth = `0`;
1320	int paren_pos = `0`; / dummy init to keep compiler quiet /
1321	int pos;
1322
1323	for (pos = `0`; pos < pattlen; pos++)
1324	{
1325	if (patt[pos] == `'('`)
1326	{
1327	if (paren_depth == `0`)
1328	paren_pos = pos; / remember start of parenthesized item /
1329	paren_depth++;
1330	}
1331	else if (patt[pos] == `')'` && paren_depth > `0`)
1332	{
1333	paren_depth--;
1334	if (paren_depth == `0`)
1335	sel *= regex_selectivity_sub(patt + (paren_pos + `1`),
1336	pos - (paren_pos + `1`),
1337	case_insensitive);
1338	}
1339	else if (patt[pos] == `'\|'` && paren_depth == `0`)
1340	{
1341	/*
1342	* If unquoted \| is present at paren level 0 in pattern, we have
1343	* multiple alternatives; sum their probabilities.
1344	*/
1345	sel += regex_selectivity_sub(patt + (pos + `1`),
1346	pattlen - (pos + `1`),
1347	case_insensitive);
1348	break; / rest of pattern is now processed /
1349	}
1350	else if (patt[pos] == `'['`)
1351	{
1352	bool negclass = false;
1353
1354	if (patt[++pos] == `'^'`)
1355	{
1356	negclass = true;
1357	pos++;
1358	}
1359	if (patt[pos] == `']'`) / ']' at start of class is not special /
1360	pos++;
1361	while (pos < pattlen && patt[pos] != `']'`)
1362	pos++;
1363	if (paren_depth == `0`)
1364	sel *= (negclass ? (`1.0` - CHAR_RANGE_SEL) : CHAR_RANGE_SEL);
1365	}
1366	else if (patt[pos] == `'.'`)
1367	{
1368	if (paren_depth == `0`)
1369	sel *= ANY_CHAR_SEL;
1370	}
1371	else if (patt[pos] == `'*'` \|\|
1372	patt[pos] == `'?'` \|\|
1373	patt[pos] == `'+'`)
1374	{
1375	/ Ought to be smarter about quantifiers... /
1376	if (paren_depth == `0`)
1377	sel *= PARTIAL_WILDCARD_SEL;
1378	}
1379	else if (patt[pos] == `'{'`)
1380	{
1381	while (pos < pattlen && patt[pos] != `'}'`)
1382	pos++;
1383	if (paren_depth == `0`)
1384	sel *= PARTIAL_WILDCARD_SEL;
1385	}
1386	else if (patt[pos] == `'\\'`)
1387	{
1388	/ backslash quotes the next character /
1389	pos++;
1390	if (pos >= pattlen)
1391	break;
1392	if (paren_depth == `0`)
1393	sel *= FIXED_CHAR_SEL;
1394	}
1395	else
1396	{
1397	if (paren_depth == `0`)
1398	sel *= FIXED_CHAR_SEL;
1399	}
1400	}
1401	/ Could get sel > 1 if multiple wildcards /
1402	if (sel > `1.0`)
1403	sel = `1.0`;
1404	return sel;
1405	}
1406
1407	static Selectivity
1408	regex_selectivity(const char patt, int* pattlen, bool case_insensitive,
1409	int fixed_prefix_len)
1410	{
1411	Selectivity sel;
1412
1413	/ If patt doesn't end with $, consider it to have a trailing wildcard /
1414	if (pattlen > `0` && patt[pattlen - `1`] == `'$'` &&
1415	(pattlen == `1` \|\| patt[pattlen - `2`] != `'\\'`))
1416	{
1417	/ has trailing $ /
1418	sel = regex_selectivity_sub(patt, pattlen - `1`, case_insensitive);
1419	}
1420	else
1421	{
1422	/ no trailing $ /
1423	sel = regex_selectivity_sub(patt, pattlen, case_insensitive);
1424	sel *= FULL_WILDCARD_SEL;
1425	}
1426
1427	/ If there's a fixed prefix, discount its selectivity /
1428	if (fixed_prefix_len > `0`)
1429	sel /= pow(FIXED_CHAR_SEL, fixed_prefix_len);
1430
1431	/ Make sure result stays in range /
1432	CLAMP_PROBABILITY(sel);
1433	return sel;
1434	}
1435
1436	/*
1437	* Check whether char is a letter (and, hence, subject to case-folding)
1438	*
1439	* In multibyte character sets or with ICU, we can't use isalpha, and it does
1440	* not seem worth trying to convert to wchar_t to use iswalpha or u_isalpha.
1441	* Instead, just assume any non-ASCII char is potentially case-varying, and
1442	* hard-wire knowledge of which ASCII chars are letters.
1443	*/
1444	static int
1445	pattern_char_isalpha(char c, bool is_multibyte,
1446	pg_locale_t locale, bool locale_is_c)
1447	{
1448	if (locale_is_c)
1449	return (c >= `'A'` && c <= `'Z'`) \|\| (c >= `'a'` && c <= `'z'`);
1450	else if (is_multibyte && IS_HIGHBIT_SET(c))
1451	return true;
1452	else if (locale && locale->provider == COLLPROVIDER_ICU)
1453	return IS_HIGHBIT_SET(c) \|\|
1454	(c >= `'A'` && c <= `'Z'`) \|\| (c >= `'a'` && c <= `'z'`);
1455	#ifdef HAVE_LOCALE_T
1456	else if (locale && locale->provider == COLLPROVIDER_LIBC)
1457	return isalpha_l((unsigned char) c, locale->info.lt);
1458	#endif
1459	else
1460	return isalpha((unsigned char) c);
1461	}
1462
1463
1464	/*
1465	* For bytea, the increment function need only increment the current byte
1466	* (there are no multibyte characters to worry about).
1467	*/
1468	static bool
1469	byte_increment(unsigned char ptr, int* len)
1470	{
1471	if (*ptr >= `255`)
1472	return false;
1473	(*ptr)++;
1474	return true;
1475	}
1476
1477	/*
1478	* Try to generate a string greater than the given string or any
1479	* string it is a prefix of. If successful, return a palloc'd string
1480	* in the form of a Const node; else return NULL.
1481	*
1482	* The caller must provide the appropriate "less than" comparison function
1483	* for testing the strings, along with the collation to use.
1484	*
1485	* The key requirement here is that given a prefix string, say "foo",
1486	* we must be able to generate another string "fop" that is greater than
1487	* all strings "foobar" starting with "foo". We can test that we have
1488	* generated a string greater than the prefix string, but in non-C collations
1489	* that is not a bulletproof guarantee that an extension of the string might
1490	* not sort after it; an example is that "foo " is less than "foo!", but it
1491	* is not clear that a "dictionary" sort ordering will consider "foo!" less
1492	* than "foo bar". CAUTION: Therefore, this function should be used only for
1493	* estimation purposes when working in a non-C collation.
1494	*
1495	* To try to catch most cases where an extended string might otherwise sort
1496	* before the result value, we determine which of the strings "Z", "z", "y",
1497	* and "9" is seen as largest by the collation, and append that to the given
1498	* prefix before trying to find a string that compares as larger.
1499	*
1500	* To search for a greater string, we repeatedly "increment" the rightmost
1501	* character, using an encoding-specific character incrementer function.
1502	* When it's no longer possible to increment the last character, we truncate
1503	* off that character and start incrementing the next-to-rightmost.
1504	* For example, if "z" were the last character in the sort order, then we
1505	* could produce "foo" as a string greater than "fonz".
1506	*
1507	* This could be rather slow in the worst case, but in most cases we
1508	* won't have to try more than one or two strings before succeeding.
1509	*
1510	* Note that it's important for the character incrementer not to be too anal
1511	* about producing every possible character code, since in some cases the only
1512	* way to get a larger string is to increment a previous character position.
1513	* So we don't want to spend too much time trying every possible character
1514	* code at the last position. A good rule of thumb is to be sure that we
1515	* don't try more than 256*K values for a K-byte character (and definitely
1516	* not 256^K, which is what an exhaustive search would approach).
1517	*/
1518	static Const *
1519	make_greater_string(const Const str_const, FmgrInfo ltproc, Oid collation)
1520	{
1521	Oid datatype = str_const->consttype;
1522	char *workstr;
1523	int len;
1524	Datum cmpstr;
1525	char *cmptxt = NULL;
1526	mbcharacter_incrementer charinc;
1527
1528	/*
1529	* Get a modifiable copy of the prefix string in C-string format, and set
1530	* up the string we will compare to as a Datum. In C locale this can just
1531	* be the given prefix string, otherwise we need to add a suffix. Type
1532	* BYTEA sorts bytewise so it never needs a suffix either.
1533	*/
1534	if (datatype == BYTEAOID)
1535	{
1536	bytea *bstr = DatumGetByteaPP(str_const->constvalue);
1537
1538	len = VARSIZE_ANY_EXHDR(bstr);
1539	workstr = (char *) palloc(len);
1540	memcpy(workstr, VARDATA_ANY(bstr), len);
1541	Assert((Pointer) bstr == DatumGetPointer(str_const->constvalue));
1542	cmpstr = str_const->constvalue;
1543	}
1544	else
1545	{
1546	if (datatype == NAMEOID)
1547	workstr = DatumGetCString(DirectFunctionCall1(nameout,
1548	str_const->constvalue));
1549	else
1550	workstr = TextDatumGetCString(str_const->constvalue);
1551	len = strlen(workstr);
1552	if (lc_collate_is_c(collation) \|\| len == `0`)
1553	cmpstr = str_const->constvalue;
1554	else
1555	{
1556	/ If first time through, determine the suffix to use /
1557	static char suffixchar = `0`;
1558	static Oid suffixcollation = `0`;
1559
1560	if (!suffixchar \|\| suffixcollation != collation)
1561	{
1562	char *best;
1563
1564	best = "Z";
1565	if (varstr_cmp(best, `1`, "z", `1`, collation) < `0`)
1566	best = "z";
1567	if (varstr_cmp(best, `1`, "y", `1`, collation) < `0`)
1568	best = "y";
1569	if (varstr_cmp(best, `1`, "9", `1`, collation) < `0`)
1570	best = "9";
1571	suffixchar = *best;
1572	suffixcollation = collation;
1573	}
1574
1575	/ And build the string to compare to /
1576	if (datatype == NAMEOID)
1577	{
1578	cmptxt = palloc(len + `2`);
1579	memcpy(cmptxt, workstr, len);
1580	cmptxt[len] = suffixchar;
1581	cmptxt[len + `1`] = `'\0'`;
1582	cmpstr = PointerGetDatum(cmptxt);
1583	}
1584	else
1585	{
1586	cmptxt = palloc(VARHDRSZ + len + `1`);
1587	SET_VARSIZE(cmptxt, VARHDRSZ + len + `1`);
1588	memcpy(VARDATA(cmptxt), workstr, len);
1589	*(VARDATA(cmptxt) + len) = suffixchar;
1590	cmpstr = PointerGetDatum(cmptxt);
1591	}
1592	}
1593	}
1594
1595	/ Select appropriate character-incrementer function /
1596	if (datatype == BYTEAOID)
1597	charinc = byte_increment;
1598	else
1599	charinc = pg_database_encoding_character_incrementer();
1600
1601	/ And search ... /
1602	while (len > `0`)
1603	{
1604	int charlen;
1605	unsigned char *lastchar;
1606
1607	/ Identify the last character --- for bytea, just the last byte /
1608	if (datatype == BYTEAOID)
1609	charlen = `1`;
1610	else
1611	charlen = len - pg_mbcliplen(workstr, len, len - `1`);
1612	lastchar = (unsigned char *) (workstr + len - charlen);
1613
1614	/*
1615	* Try to generate a larger string by incrementing the last character
1616	* (for BYTEA, we treat each byte as a character).
1617	*
1618	* Note: the incrementer function is expected to return true if it's
1619	* generated a valid-per-the-encoding new character, otherwise false.
1620	* The contents of the character on false return are unspecified.
1621	*/
1622	while (charinc(lastchar, charlen))
1623	{
1624	Const *workstr_const;
1625
1626	if (datatype == BYTEAOID)
1627	workstr_const = string_to_bytea_const(workstr, len);
1628	else
1629	workstr_const = string_to_const(workstr, datatype);
1630
1631	if (DatumGetBool(FunctionCall2Coll(ltproc,
1632	collation,
1633	cmpstr,
1634	workstr_const->constvalue)))
1635	{
1636	/ Successfully made a string larger than cmpstr /
1637	if (cmptxt)
1638	pfree(cmptxt);
1639	pfree(workstr);
1640	return workstr_const;
1641	}
1642
1643	/ No good, release unusable value and try again /
1644	pfree(DatumGetPointer(workstr_const->constvalue));
1645	pfree(workstr_const);
1646	}
1647
1648	/*
1649	* No luck here, so truncate off the last character and try to
1650	* increment the next one.
1651	*/
1652	len -= charlen;
1653	workstr[len] = `'\0'`;
1654	}
1655
1656	/ Failed... /
1657	if (cmptxt)
1658	pfree(cmptxt);
1659	pfree(workstr);
1660
1661	return NULL;
1662	}
1663
1664	/*
1665	* Generate a Datum of the appropriate type from a C string.
1666	* Note that all of the supported types are pass-by-ref, so the
1667	* returned value should be pfree'd if no longer needed.
1668	*/
1669	static Datum
1670	string_to_datum(const char *str, Oid datatype)
1671	{
1672	Assert(str != NULL);
1673
1674	/*
1675	* We cheat a little by assuming that CStringGetTextDatum() will do for
1676	* bpchar and varchar constants too...
1677	*/
1678	if (datatype == NAMEOID)
1679	return DirectFunctionCall1(namein, CStringGetDatum(str));
1680	else if (datatype == BYTEAOID)
1681	return DirectFunctionCall1(byteain, CStringGetDatum(str));
1682	else
1683	return CStringGetTextDatum(str);
1684	}
1685
1686	/*
1687	* Generate a Const node of the appropriate type from a C string.
1688	*/
1689	static Const *
1690	string_to_const(const char *str, Oid datatype)
1691	{
1692	Datum conval = string_to_datum(str, datatype);
1693	Oid collation;
1694	int constlen;
1695
1696	/*
1697	* We only need to support a few datatypes here, so hard-wire properties
1698	* instead of incurring the expense of catalog lookups.
1699	*/
1700	switch (datatype)
1701	{
1702	case TEXTOID:
1703	case VARCHAROID:
1704	case BPCHAROID:
1705	collation = DEFAULT_COLLATION_OID;
1706	constlen = -`1`;
1707	break;
1708
1709	case NAMEOID:
1710	collation = C_COLLATION_OID;
1711	constlen = NAMEDATALEN;
1712	break;
1713
1714	case BYTEAOID:
1715	collation = InvalidOid;
1716	constlen = -`1`;
1717	break;
1718
1719	default:
1720	elog(ERROR, "unexpected datatype in string_to_const: %u",
1721	datatype);
1722	return NULL;
1723	}
1724
1725	return makeConst(datatype, -`1`, collation, constlen,
1726	conval, false, false);
1727	}
1728
1729	/*
1730	* Generate a Const node of bytea type from a binary C string and a length.
1731	*/
1732	static Const *
1733	string_to_bytea_const(const char *str, size_t str_len)
1734	{
1735	bytea *bstr = palloc(VARHDRSZ + str_len);
1736	Datum conval;
1737
1738	memcpy(VARDATA(bstr), str, str_len);
1739	SET_VARSIZE(bstr, VARHDRSZ + str_len);
1740	conval = PointerGetDatum(bstr);
1741
1742	return makeConst(BYTEAOID, -`1`, InvalidOid, -`1`, conval, false, false);
1743	}
1744

Browse the source code of PostgreSQL/src/backend/utils/adt/like_support.c