8000 Fix handling of empty ranges and NULLs in BRIN · postgres/postgres@fc7dc72 · GitHub
[go: up one dir, main page]

Skip to content

Commit fc7dc72

Browse files
committed
Fix handling of empty ranges and NULLs in BRIN
BRIN indexes did not properly distinguish between summaries for empty (no rows) and all-NULL ranges, treating them as essentially the same thing. Summaries were initialized with allnulls=true, and opclasses simply reset allnulls to false when processing the first non-NULL value. This however produces incorrect results if the range starts with a NULL value (or a sequence of NULL values), in which case we forget the range contains NULL values when adding the first non-NULL value. This happens because the allnulls flag is used for two separate purposes - to mark empty ranges (not representing any rows yet) and ranges containing only NULL values. Opclasses don't know which of these cases it is, and so don't know whether to set hasnulls=true. Setting the flag in both cases would make it correct, but it would also make BRIN indexes useless for queries with IS NULL clauses. All ranges start empty (and thus allnulls=true), so all ranges would end up with either allnulls=true or hasnulls=true. The severity of the issue is somewhat reduced by the fact that it only happens when adding values to an existing summary with allnulls=true. This can happen e.g. for small tables (because a summary for the first range exists for all BRIN indexes), or for tables with large fraction of NULL values in the indexed columns. Bulk summarization (e.g. during CREATE INDEX or automatic summarization) that processes all values at once is not affected by this issue. In this case the flags were updated in a slightly different way, not forgetting the NULL values. To identify empty ranges we use a new flag, stored in an unused bit in the BRIN tuple header so the on-disk format remains the same. A matching flag is added to BrinMemTuple, into a 3B gap after bt_placeholder. That means there's no risk of ABI breakage, although we don't actually pass the BrinMemTuple to any public API. We could also skip storing index tuples for empty summaries, but then we'd have to always process such ranges - even if there are no rows in large parts of the table (e.g. after a bulk DELETE), it would still require reading the pages etc. So we store them, but ignore them when building the bitmap. Backpatch to 11. The issue exists since BRIN indexes were introduced in 9.5, but older releases are already EOL. Backpatch-through: 11 Reviewed-by: Justin Pryzby, Matthias van de Meent, Alvaro Herrera Discussion: https://postgr.es/m/402430e4-7d9d-6cf1-09ef-464d80afff3b@enterprisedb.com
1 parent b511d73 commit fc7dc72

File tree

5 files changed

+167
-8
lines changed

5 files changed

+167
-8
lines changed

src/backend/access/brin/brin.c

Lines changed: 145 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
#include "storage/bufmgr.h"
3131
#include "storage/freespace.h"
3232
#include "utils/builtins.h"
33+
#include "utils/datum.h"
3334
#include "utils/guc.h"
3435
#include "utils/index_selfuncs.h"
3536
#include "utils/memutils.h"
@@ -164,7 +165,7 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
164165

165166
for (;;)
166167
{
167-
bool need_insert = false;
168+
bool need_insert;
168169
OffsetNumber off;
169170
BrinTuple *brtup;
170171
BrinMemTuple *dtup;
@@ -232,6 +233,9 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
232233

233234
dtup = brin_deform_tuple(bdesc, brtup, NULL);
234235

236+
/* If the range starts empty, we're certainly going to modify it. */
237+
need_insert = dtup->bt_empty_range;
238+
235239
/*
236240
* Compare the key values of the new tuple to the stored index values;
237241
* our deformed tuple will get updated if the new tuple doesn't fit
@@ -244,8 +248,20 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
244248
Datum result;
245249
BrinValues *bval;
246250
FmgrInfo *addValue;
251+
bool has_nulls;
247252

248253
bval = &dtup->bt_columns[keyno];
254+
255+
/*
256+
* Does the range have actual NULL values? Either of the flags can
257+
* be set, but we ignore the state before adding first row.
258+
*
259+
* We have to remember this, because we'll modify the flags and we
260+
* need to know if the range started as empty.
261+
*/
262+
has_nulls = ((!dtup->bt_empty_range) &&
263+
(bval->bv_hasnulls || bval->bv_allnulls));
264+
249265
addValue = index_getprocinfo(idxRel, keyno + 1,
250266
BRIN_PROCNUM_ADDVALUE);
251267
result = FunctionCall4Coll(addValue,
@@ -256,8 +272,33 @@ brininsert(Relation idxRel, Datum *values, bool *nulls,
256272
nulls[keyno]);
257273
/* if that returned true, we need to insert the updated tuple */
258274
need_insert |= DatumGetBool(result);
275+
276+
/*
277+
* If the range was had actual NULL values (i.e. did not start empty),
278+
* make sure we don't forget about the NULL values. Either the allnulls
279+
* flag is still set to true, or (if the opclass cleared it) we need to
280+
* set hasnulls=true.
281+
*
282+
* XXX This can only happen when the opclass modified the tuple, so the
283+
* modified flag should be set.
284+
*/
285+
if (has_nulls && !(bval->bv_hasnulls || bval->bv_allnulls))
286+
{
287+
Assert(need_insert);
288+
bval->bv_hasnulls = true;
289+
}
259290
}
260291

292+
/*
293+
* After updating summaries for all the keys, mark it as not empty.
294+
*
295+
* If we're actually changing the flag value (i.e. tuple start 1E80 ed as
296+
* empty), we should have modified the tuple. So we should not see
297+
* empty range that was not modified.
298+
*/
299+
Assert(!dtup->bt_empty_range || need_insert);
300+
dtup->bt_empty_range = false;
301+
261302
if (!need_insert)
262303
{
263304
/*
@@ -499,6 +540,17 @@ bringetbitmap(IndexScanDesc scan, TIDBitmap *tbm)
499540
CurrentMemoryContext);
500541
}
501542

543+
/*
544+
* If the BRIN tuple indicates that this range is empty,
545+
* we can skip it: there's nothing to match. We don't
546+
* need to examine the next columns.
547+
*/
548+
if (dtup->bt_empty_range)
549+
{
550+
addrange = false;
551+
break;
552+
}
553+
502554
/*
503555
* Check whether the scan key is consistent with the page
504556
* range values; if so, have the pages in the range added
@@ -636,8 +688,24 @@ brinbuildCallback(Relation index,
636688
FmgrInfo *addValue;
637689
BrinValues *col;
638690
Form_pg_attribute attr = TupleDescAttr(state->bs_bdesc->bd_tupdesc, i);
691+
bool has_nulls;
639692

640693
col = &state->bs_dtuple->bt_columns[i];
694+
695+
/*
696+
* Does the range have actual NULL values? Either of the flags can
697+
* be set, but we ignore the state before adding first row.
698+
*
699+
* We have to remember this, because we'll modify the flags and we
700+
* need to know if the range started as empty.
701+
*/
702+
has_nulls = ((!state->bs_dtuple->bt_empty_range) &&
703+
(col->bv_hasnulls || col->bv_allnulls));
704+
705+
/*
706+
* Call the BRIN_PROCNUM_ADDVALUE procedure. We do this even for NULL
707+
* values, because who knows what the opclass is doing.
708+
*/
641709
addValue = index_getprocinfo(index, i + 1,
642710
BRIN_PROCNUM_ADDVALUE);
643711

@@ -649,7 +717,25 @@ brinbuildCallback(Relation index,
649717
PointerGetDatum(state->bs_bdesc),
650718
PointerGetDatum(col),
651719
values[i], isnull[i]);
720+
721+
/*
722+
* If the range was had actual NULL values (i.e. did not start empty),
723+
* make sure we don't forget about the NULL values. Either the allnulls
724+
* flag is still set to true, or (if the opclass cleared it) we need to
725+
* set hasnulls=true.
726+
*/
727+
if (has_nulls && !(col->bv_hasnulls || col->bv_allnulls))
728+
col->bv_hasnulls = true;
652729
}
730+
731+
/*
732+
* After updating summaries for all the keys, mark it as not empty.
733+
*
734+
* If we're actually changing the flag value (i.e. tuple started as
735+
* empty), we should have modified the tuple. So we should not see
736+
* empty range that was not modified.
737+
*/
738+
state->bs_dtuple->bt_empty_range = false;
653739
}
654740

655741
/*
@@ -1469,6 +1555,64 @@ union_tuples(BrinDesc *bdesc, BrinMemTuple *a, BrinTuple *b)
14691555
db = brin_deform_tuple(bdesc, b, NULL);
14701556
MemoryContextSwitchTo(oldcxt);
14711557

1558+
/*
1559+
* Check if the ranges are empty.
1560+
*
1561+
* If at least one of them is empty, we don't need to call per-key union
1562+
* functions at all. If "b" is empty, we just use "a" as the result (it
1563+
* might be empty fine, but that's fine). If "a" is empty but "b" is not,
1564+
* we use "b" as the result (but we have to copy the data into "a" first).
1565+
*
1566+
* Only when both ranges are non-empty, we actually do the per-key merge.
1567+
*/
1568< 10000 code class="diff-text syntax-highlighted-line addition">+
1569+
/* If "b" is empty - ignore it and just use "a" (even if it's empty etc.). */
1570+
if (db->bt_empty_range)
1571+
{
1572+
/* skip the per-key merge */
1573+
MemoryContextDelete(cxt);
1574+
return;
1575+
}
1576+
1577+
/*
1578+
* Now we know "b" is not empty. If "a" is empty, then "b" is the result.
1579+
* But we need to copy the data from "b" to "a" first, because that's how
1580+
* we pass result out.
1581+
*
1582+
* We have to copy all the global/per-key flags etc. too.
1583+
*/
1584+
if (a->bt_empty_range)
1585+
{
1586+
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
1587+
{
1588+
int i;
1589+
BrinValues *col_a = &a->bt_columns[keyno];
1590+
BrinValues *col_b = &db->bt_columns[keyno];
1591+
BrinOpcInfo *opcinfo = bdesc->bd_info[keyno];
1592+
1593+
col_a->bv_allnulls = col_b->bv_allnulls;
1594+
col_a->bv_hasnulls = col_b->bv_hasnulls;
1595+
1596+
/* If "b" has no data, we're done. */
1597+
if (col_b->bv_allnulls)
1598+
continue;
1599+
1600+
for (i = 0; i < opcinfo->oi_nstored; i++)
1601+
col_a->bv_values[i] =
1602+
datumCopy(col_b->bv_values[i],
1603+
opcinfo->oi_typcache[i]->typbyval,
1604+
opcinfo->oi_typcache[i]->typlen);
1605+
}
1606+
1607+
/* "a" started empty, but "b" was not empty, so remember that */
1608+
a->bt_empty_range = false;
1609+
1610+
/* skip the per-key merge */
1611+
MemoryContextDelete(cxt);
1612+
return;
1613+
}
1614+
1615+
/* Neither range is empty, so call the union proc. */
14721616
for (keyno = 0; keyno < bdesc->bd_tupdesc->natts; keyno++)
14731617
{
14741618
FmgrInfo *unionFn;

src/backend/access/brin/brin_tuple.c

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -339,6 +339,9 @@ brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno, BrinMemTuple *tuple,
339339
if (tuple->bt_placeholder)
340340
rettuple->bt_info |= BRIN_PLACEHOLDER_MASK;
341341

342+
if (tuple->bt_empty_range)
343+
rettuple->bt_info |= BRIN_EMPTY_RANGE_MASK;
344+
342345
*size = len;
343346
return rettuple;
344347
}
@@ -366,7 +369,7 @@ brin_form_placeholder_tuple(BrinDesc *brdesc, BlockNumber blkno, Size *size)
366369
rettuple = palloc0(len);
367370
rettuple->bt_blkno = blkno;
368371
rettuple->bt_info = hoff;
369-
rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK;
372+
rettuple->bt_info |= BRIN_NULLS_MASK | BRIN_PLACEHOLDER_MASK | BRIN_EMPTY_RANGE_MASK;
370373

371374
bitP = ((bits8 *) ((char *) rettuple + SizeOfBrinTuple)) - 1;
372375
bitmask = HIGHBIT;
@@ -456,6 +459,8 @@ brin_new_memtuple(BrinDesc *brdesc)
456459
dtup->bt_allnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
457460
dtup->bt_hasnulls = palloc(sizeof(bool) * brdesc->bd_tupdesc->natts);
458461

462+
dtup->bt_empty_range = true;
463+
459464
dtup->bt_context = AllocSetContextCreate(CurrentMemoryContext,
460465
"brin dtuple",
461466
ALLOCSET_DEFAULT_SIZES);
@@ -489,6 +494,8 @@ brin_memtuple_initialize(BrinMemTuple *dtuple, BrinDesc *brdesc)
489494
currdatum += sizeof(Datum) * brdesc->bd_info[i]->oi_nstored;
490495
}
491496

497+
dtuple->bt_empty_range = true;
498+
492499
return dtuple;
493500
}
494501

@@ -522,6 +529,11 @@ brin_deform_tuple(BrinDesc *brdesc, BrinTuple *tuple, BrinMemTuple *dMemtuple)
522529

523530
if (BrinTupleIsPlaceholder(tuple))
524531
dtup->bt_placeholder = true;
532+
533+
/* ranges start as empty, depends on the BrinTuple */
534+
if (!BrinTupleIsEmptyRange(tuple))
535+
dtup->bt_empty_range = false;
536+
525537
dtup->bt_blkno = tuple->bt_blkno;
526538

527539
values = dtup->bt_values;

src/include/access/brin_tuple.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ typedef struct BrinValues
3636
typedef struct BrinMemTuple
3737
{
3838
bool bt_placeholder; /* this is a placeholder tuple */
39+
bool bt_empty_range; /* range represents no tuples */
3940
BlockNumber bt_blkno; /* heap blkno that the tuple is for */
4041
MemoryContext bt_context; /* memcxt holding the bt_columns values */
4142
/* output arrays for brin_deform_tuple: */
@@ -61,7 +62,7 @@ typedef struct BrinTuple
6162
*
6263
* 7th (high) bit: has nulls
6364
* 6th bit: is placeholder tuple
64-
* 5th bit: unused
65+
* 5th bit: range is empty
6566
* 4-0 bit: offset of data
6667
* ---------------
6768
*/
@@ -74,13 +75,14 @@ typedef struct BrinTuple
7475
* bt_info manipulation macros
7576
*/
7677
#define BRIN_OFFSET_MASK 0x1F
77-
/* bit 0x20 is not used at present */
78+
#define BRIN_EMPTY_RANGE_MASK 0x20
7879
#define BRIN_PLACEHOLDER_MASK 0x40
7980
#define BRIN_NULLS_MASK 0x80
8081

8182
#define BrinTupleDataOffset(tup) ((Size) (((BrinTuple *) (tup))->bt_info & BRIN_OFFSET_MASK))
8283
#define BrinTupleHasNulls(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_NULLS_MASK)) != 0)
8384
#define BrinTupleIsPlaceholder(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_PLACEHOLDER_MASK)) != 0)
85+
#define BrinTupleIsEmptyRange(tup) (((((BrinTuple *) (tup))->bt_info & BRIN_EMPTY_RANGE_MASK)) != 0)
8486

8587

8688
extern BrinTuple *brin_form_tuple(BrinDesc *brdesc, BlockNumber blkno,

src/test/modules/brin/expected/summarization-and-inprogress-insertion.out

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@ starting permutation: s2check s1b s2b s1i s2summ s1c s2c s2check
44
step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
55
itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value
66
----------+------+------+--------+--------+-----------+--------
7-
1| 0| 1|f |f |f |{1 .. 1}
7+
1| 0| 1|f |t |f |{1 .. 1}
88
(1 row)
99

1010
step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
@@ -26,7 +26,7 @@ step s2c: COMMIT;
2626
step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
2727
itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value
2828
----------+------+------+--------+--------+-----------+-----------
29-
1| 0| 1|f |f |f |{1 .. 1}
29+
1| 0| 1|f |t |f |{1 .. 1}
3030
2| 1| 1|f |f |f |{1 .. 1000}
3131
(2 rows)
3232

@@ -35,7 +35,7 @@ starting permutation: s2check s1b s1i s2vacuum s1c s2check
3535
step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
3636
itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value
3737
----------+------+------+--------+--------+-----------+--------
38-
1| 0| 1|f |f |f |{1 .. 1}
38+
1| 0| 1|f |t |f |{1 .. 1}
3939
(1 row)
4040

4141
step s1b: BEGIN ISOLATION LEVEL REPEATABLE READ;
@@ -45,7 +45,7 @@ step s1c: COMMIT;
4545
step s2check: SELECT * FROM brin_page_items(get_raw_page('brinidx', 2), 'brinidx'::regclass);
4646
itemoffset|blknum|attnum|allnulls|hasnulls|placeholder|value
4747
----------+------+------+--------+--------+-----------+-----------
48-
1| 0| 1|f |f |f |{1 .. 1}
48+
1| 0| 1|f |t |f |{1 .. 1}
4949
2| 1| 1|f |f |f |{1 .. 1000}
5050
(2 rows)
5151

src/test/modules/brin/specs/summarization-and-inprogress-insertion.spec

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ setup
99
) WITH (fillfactor=10);
1010
CREATE INDEX brinidx ON brin_iso USING brin (value) WITH (pages_per_range=1);
1111
-- this fills the first page
12+
INSERT INTO brin_iso VALUES (NULL);
1213
DO $$
1314
DECLARE curtid tid;
1415
BEGIN

0 commit comments

Comments
 (0)
0