/*-------------------------------------------------------------------------
*
* visibilitymap.c
* bitmap for tracking visibility of heap tuples
*
* Portions Copyright (c) 1996-2011, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
*
* IDENTIFICATION
* src/backend/access/heap/visibilitymap.c
*
* INTERFACE ROUTINES
* visibilitymap_clear - clear a bit in the visibility map
* visibilitymap_pin - pin a map page for setting a bit
* visibilitymap_pin_ok - check whether correct map page is already pinned
* visibilitymap_set - set a bit in a previously pinned page
* visibilitymap_test - test if a bit is set
* visibilitymap_truncate - truncate the visibility map
*
* NOTES
*
* The visibility map is a bitmap with one bit per heap page. A set bit means
* that all tuples on the page are known visible to all transactions, and
* therefore the page doesn't need to be vacuumed. The map is conservative in
* the sense that we make sure that whenever a bit is set, we know the
* condition is true, but if a bit is not set, it might or might not be true.
*
* There's no explicit WAL logging in the functions in this file. The callers
* must make sure that whenever a bit is cleared, the bit is cleared on WAL
* replay of the updating operation as well. Setting bits during recovery
* isn't necessary for correctness.
*
* Currently, the visibility map is only used as a hint, to speed up VACUUM.
* A corrupted visibility map won't cause data corruption, although it can
* make VACUUM skip pages that need vacuuming, until the next anti-wraparound
* vacuum. The visibility map is not used for anti-wraparound vacuums, because
* an anti-wraparound vacuum needs to freeze tuples and observe the latest xid
* present in the table, even on pages that don't have any dead tuples.
*
* Although the visibility map is just a hint at the moment, the PD_ALL_VISIBLE
* flag on heap pages *must* be correct, because it is used to skip visibility
* checking.
*
* LOCKING
*
* In heapam.c, whenever a page is modified so that not all tuples on the
* page are visible to everyone anymore, the corresponding bit in the
* visibility map is cleared. The bit in the visibility map is cleared
* after releasing the lock on the heap page, to avoid holding the lock
* over possible I/O to read in the visibility map page.
*
* To set a bit, you need to hold a lock on the heap page. That prevents
* the race condition where VACUUM sees that all tuples on the page are
* visible to everyone, but another backend modifies the page before VACUUM
* sets the bit in the visibility map.
*
* When a bit is set, the LSN of the visibility map page is updated to make
* sure that the visibility map update doesn't get written to disk before the
* WAL record of the changes that made it possible to set the bit is flushed.
* But when a bit is cleared, we don't have to do that because it's always
* safe to clear a bit in the map from a correctness point of view.
*
* TODO
*
* It would be nice to use the visibility map to skip visibility checks in
* index scans.
*
*-------------------------------------------------------------------------
*/
#include "postgres.h"

#include "access/heapam.h"
#include "access/visibilitymap.h"
#include "miscadmin.h"
#include "storage/bufmgr.h"
#include "storage/bufpage.h"
#include "storage/lmgr.h"
#include "storage/smgr.h"

/*#define TRACE_VISIBILITYMAP */
/*
 * Size of the bitmap on each visibility map page, in bytes. There are no
 * extra headers, so the whole page minus the standard page header is
 * used for the bitmap.
 */
#define MAPSIZE (BLCKSZ - MAXALIGN(SizeOfPageHeaderData))

/* Number of bits allocated for each heap block. */
#define BITS_PER_HEAPBLOCK 1

/* Number of heap blocks we can represent in one byte. */
#define HEAPBLOCKS_PER_BYTE 8

/* Number of heap blocks we can represent in one visibility map page. */
#define HEAPBLOCKS_PER_PAGE (MAPSIZE * HEAPBLOCKS_PER_BYTE)

/* Mapping from heap block number to the right bit in the visibility map */
#define HEAPBLK_TO_MAPBLOCK(x) ((x) / HEAPBLOCKS_PER_PAGE)
#define HEAPBLK_TO_MAPBYTE(x) (((x) % HEAPBLOCKS_PER_PAGE) / HEAPBLOCKS_PER_BYTE)
#define HEAPBLK_TO_MAPBIT(x) ((x) % HEAPBLOCKS_PER_BYTE)
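/*
 * Worked example (illustrative, assuming the default 8192-byte BLCKSZ and a
 * 24-byte MAXALIGN'd page header): MAPSIZE = 8192 - 24 = 8168 bytes, so
 * HEAPBLOCKS_PER_PAGE = 8168 * 8 = 65344, and one map page covers roughly
 * 510 MB of heap. For heapBlk = 100000:
 *
 *   HEAPBLK_TO_MAPBLOCK(100000) = 100000 / 65344       = 1
 *   HEAPBLK_TO_MAPBYTE(100000)  = (100000 % 65344) / 8 = 4332
 *   HEAPBLK_TO_MAPBIT(100000)   = 100000 % 8           = 0
 */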
/* prototypes for internal routines */
static Buffer vm_readbuf(Relation rel, BlockNumber blkno, bool extend);
static void vm_extend(Relation rel, BlockNumber nvmblocks);

/*
* visibilitymap_clear - clear a bit in visibility map
*
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
visibilitymap_clear(Relation rel, BlockNumber heapBlk, Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
int mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
int mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
uint8 mask = 1 << mapBit;
char *map;
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_clear %s %d", RelationGetRelationName(rel), heapBlk);
#endif
if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
elog(ERROR, "wrong buffer passed to visibilitymap_clear");
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
map = PageGetContents(BufferGetPage(buf));
if (map[mapByte] & mask)
{
map[mapByte] &= ~mask;
MarkBufferDirty(buf);
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}

/*
* visibilitymap_pin - pin a map page for setting a bit
*
* Setting a bit in the visibility map is a two-phase operation. First, call
* visibilitymap_pin, to pin the visibility map page containing the bit for
* the heap page. Because that can require I/O to read the map page, you
* shouldn't hold a lock on the heap page while doing that. Then, call
* visibilitymap_set to actually set the bit.
*
* On entry, *buf should be InvalidBuffer or a valid buffer returned by
* an earlier call to visibilitymap_pin or visibilitymap_test on the same
* relation. On return, *buf is a valid buffer with the map page containing
* the bit for heapBlk.
*
* If the page doesn't exist in the map file yet, it is extended.
*/
void
visibilitymap_pin(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
/* Reuse the old pinned buffer if possible */
if (BufferIsValid(*buf))
{
if (BufferGetBlockNumber(*buf) == mapBlock)
return;
ReleaseBuffer(*buf);
}
*buf = vm_readbuf(rel, mapBlock, true);
}
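/*
 * Illustrative caller pattern for the two-phase protocol (a sketch of how
 * these routines fit together, not code copied from heapam.c or vacuum;
 * "all_visible" stands for a hypothetical caller-side visibility check).
 * The map page is pinned before the heap page is locked, because the pin
 * can require I/O, and the bit is set while the heap lock is held:
 *
 *   Buffer vmbuffer = InvalidBuffer;
 *
 *   visibilitymap_pin(rel, heapBlk, &vmbuffer);
 *   LockBuffer(heapbuf, BUFFER_LOCK_EXCLUSIVE);
 *   if (all_visible)
 *       visibilitymap_set(rel, heapBlk, InvalidXLogRecPtr, vmbuffer);
 *   LockBuffer(heapbuf, BUFFER_LOCK_UNLOCK);
 *   if (BufferIsValid(vmbuffer))
 *       ReleaseBuffer(vmbuffer);
 */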
/*
* visibilitymap_pin_ok - do we already have the correct page pinned?
*
* On entry, buf should be InvalidBuffer or a valid buffer returned by
* an earlier call to visibilitymap_pin or visibilitymap_test on the same
* relation. The return value indicates whether the buffer covers the
* given heapBlk.
*/
bool
visibilitymap_pin_ok(BlockNumber heapBlk, Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
return BufferIsValid(buf) && BufferGetBlockNumber(buf) == mapBlock;
}

/*
* visibilitymap_set - set a bit on a previously pinned page
*
* recptr is the LSN of the XLOG record we're replaying, if we're in recovery,
* or InvalidXLogRecPtr in normal running. The page LSN is advanced to the
* one provided; in normal running, we generate a new XLOG record and set the
* page LSN to that value.
*
* You must pass a buffer containing the correct map page to this function.
* Call visibilitymap_pin first to pin the right one. This function doesn't do
* any I/O.
*/
void
visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
Buffer buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
Page page;
char *map;
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_set %s %d", RelationGetRelationName(rel), heapBlk);
#endif
Assert(InRecovery || XLogRecPtrIsInvalid(recptr));
/* Check that we have the right page pinned */
if (!BufferIsValid(buf) || BufferGetBlockNumber(buf) != mapBlock)
elog(ERROR, "wrong buffer passed to visibilitymap_set");
page = BufferGetPage(buf);
map = PageGetContents(page);
LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
if (!(map[mapByte] & (1 << mapBit)))
{
START_CRIT_SECTION();
map[mapByte] |= (1 << mapBit);
MarkBufferDirty(buf);
if (RelationNeedsWAL(rel))
{
if (XLogRecPtrIsInvalid(recptr))
recptr = log_heap_visible(rel->rd_node, heapBlk, buf);
PageSetLSN(page, recptr);
PageSetTLI(page, ThisTimeLineID);
}
END_CRIT_SECTION();
}
LockBuffer(buf, BUFFER_LOCK_UNLOCK);
}

/*
* visibilitymap_test - test if a bit is set
*
* Are all tuples on heapBlk visible to all, according to the visibility map?
*
* On entry, *buf should be InvalidBuffer or a valid buffer returned by an
* earlier call to visibilitymap_pin or visibilitymap_test on the same
* relation. On return, *buf is a valid buffer with the map page containing
* the bit for heapBlk, or InvalidBuffer. The caller is responsible for
* releasing *buf after it's done testing and setting bits.
*/
bool
visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
{
BlockNumber mapBlock = HEAPBLK_TO_MAPBLOCK(heapBlk);
uint32 mapByte = HEAPBLK_TO_MAPBYTE(heapBlk);
uint8 mapBit = HEAPBLK_TO_MAPBIT(heapBlk);
bool result;
char *map;
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_test %s %d", RelationGetRelationName(rel), heapBlk);
#endif
/* Reuse the old pinned buffer if possible */
if (BufferIsValid(*buf))
{
if (BufferGetBlockNumber(*buf) != mapBlock)
{
ReleaseBuffer(*buf);
*buf = InvalidBuffer;
}
}
if (!BufferIsValid(*buf))
{
*buf = vm_readbuf(rel, mapBlock, false);
if (!BufferIsValid(*buf))
return false;
}
map = PageGetContents(BufferGetPage(*buf));
/*
* We don't need to lock the page, as we're only looking at a single bit.
*/
result = (map[mapByte] & (1 << mapBit)) ? true : false;
return result;
}
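/*
 * Illustrative use (a sketch): because *buf is carried across calls, a
 * sequential pass over the heap touches each map page only once, since one
 * map page covers tens of thousands of heap blocks:
 *
 *   Buffer vmbuffer = InvalidBuffer;
 *   BlockNumber blkno;
 *
 *   for (blkno = 0; blkno < nblocks; blkno++)
 *   {
 *       if (visibilitymap_test(rel, blkno, &vmbuffer))
 *           ... all tuples on blkno are known visible; skip it ...
 *   }
 *   if (BufferIsValid(vmbuffer))
 *       ReleaseBuffer(vmbuffer);
 */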
/*
* visibilitymap_truncate - truncate the visibility map
*
* The caller must hold AccessExclusiveLock on the relation, to ensure that
* other backends receive the smgr invalidation event that this function sends
* before they access the VM again.
*
* nheapblocks is the new size of the heap.
*/
void
visibilitymap_truncate(Relation rel, BlockNumber nheapblocks)
{
BlockNumber newnblocks;
/* last remaining block, byte, and bit */
BlockNumber truncBlock = HEAPBLK_TO_MAPBLOCK(nheapblocks);
uint32 truncByte = HEAPBLK_TO_MAPBYTE(nheapblocks);
uint8 truncBit = HEAPBLK_TO_MAPBIT(nheapblocks);
#ifdef TRACE_VISIBILITYMAP
elog(DEBUG1, "vm_truncate %s %d", RelationGetRelationName(rel), nheapblocks);
#endif
RelationOpenSmgr(rel);
/*
* If no visibility map has been created yet for this relation, there's
* nothing to truncate.
*/
if (!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
return;
/*
* Unless the new size is exactly at a visibility map page boundary, the
* tail bits in the last remaining map page, representing truncated heap
* blocks, need to be cleared. This is not only tidy, but also necessary
* because we don't get a chance to clear the bits if the heap is extended
* again.
*/
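/*
 * Worked example (illustrative): with nheapblocks = 100, truncBlock = 0,
 * truncByte = 12 and truncBit = 4. The mask (1 << 4) - 1 = 0x0F keeps the
 * low four bits of byte 12 (heap blocks 96-99), and bytes 13 onward are
 * zeroed below.
 */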
if (truncByte != 0 || truncBit != 0)
{
Buffer mapBuffer;
Page page;
char *map;
newnblocks = truncBlock + 1;
mapBuffer = vm_readbuf(rel, truncBlock, false);
if (!BufferIsValid(mapBuffer))
{
/* nothing to do, the file was already smaller */
return;
}
page = BufferGetPage(mapBuffer);
map = PageGetContents(page);
LockBuffer(mapBuffer, BUFFER_LOCK_EXCLUSIVE);
/* Clear out the unwanted bytes. */
MemSet(&map[truncByte + 1], 0, MAPSIZE - (truncByte + 1));
/*
* Mask out the unwanted bits of the last remaining byte.
*
* ((1 << 0) - 1) = 00000000
* ((1 << 1) - 1) = 00000001
* ...
* ((1 << 6) - 1) = 00111111
* ((1 << 7) - 1) = 01111111
*/
map[truncByte] &= (1 << truncBit) - 1;
MarkBufferDirty(mapBuffer);
UnlockReleaseBuffer(mapBuffer);
}
else
newnblocks = truncBlock;
if (smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM) <= newnblocks)
{
/* nothing to do, the file was already smaller than requested size */
return;
}
/* Truncate the unused VM pages, and send smgr inval message */
smgrtruncate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, newnblocks);
/*
* We might as well update the local smgr_vm_nblocks setting. smgrtruncate
* sent an smgr cache inval message, which will cause other backends to
* invalidate their copy of smgr_vm_nblocks, and this one too at the next
* command boundary. But this ensures it isn't outright wrong until then.
*/
if (rel->rd_smgr)
rel->rd_smgr->smgr_vm_nblocks = newnblocks;
}

/*
* Read a visibility map page.
*
* If the page doesn't exist, InvalidBuffer is returned, unless 'extend' is
* true, in which case the visibility map file is extended first.
*/
static Buffer
vm_readbuf(Relation rel, BlockNumber blkno, bool extend)
{
Buffer buf;
RelationOpenSmgr(rel);
/*
* If we haven't cached the size of the visibility map fork yet, check it
* first. Also recheck if the requested block seems to be past end, since
* our cached value might be stale. (We send smgr inval messages on
* truncation, but not on extension.)
*/
if (rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber ||
blkno >= rel->rd_smgr->smgr_vm_nblocks)
{
if (smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
rel->rd_smgr->smgr_vm_nblocks = smgrnblocks(rel->rd_smgr,
VISIBILITYMAP_FORKNUM);
else
rel->rd_smgr->smgr_vm_nblocks = 0;
}
/* Handle requests beyond EOF */
if (blkno >= rel->rd_smgr->smgr_vm_nblocks)
{
if (extend)
vm_extend(rel, blkno + 1);
else
return InvalidBuffer;
}
/*
* Use ZERO_ON_ERROR mode, and initialize the page if necessary. It's
* always safe to clear bits, so it's better to clear corrupt pages than
* error out.
*/
buf = ReadBufferExtended(rel, VISIBILITYMAP_FORKNUM, blkno,
RBM_ZERO_ON_ERROR, NULL);
if (PageIsNew(BufferGetPage(buf)))
PageInit(BufferGetPage(buf), BLCKSZ, 0);
return buf;
}

/*
* Ensure that the visibility map fork is at least vm_nblocks long, extending
* it if necessary with zeroed pages.
*/
static void
vm_extend(Relation rel, BlockNumber vm_nblocks)
{
BlockNumber vm_nblocks_now;
Page pg;
pg = (Page) palloc(BLCKSZ);
PageInit(pg, BLCKSZ, 0);
/*
* We use the relation extension lock to lock out other backends trying to
* extend the visibility map at the same time. It also locks out extension
* of the main fork, unnecessarily, but extending the visibility map
* happens seldom enough that it doesn't seem worthwhile to have a
* separate lock tag type for it.
*
* Note that another backend might have extended or created the relation
* by the time we get the lock.
*/
LockRelationForExtension(rel, ExclusiveLock);
/* Might have to re-open if a cache flush happened */
RelationOpenSmgr(rel);
/*
* Create the file first if it doesn't exist. If smgr_vm_nblocks is
* positive then it must exist, no need for an smgrexists call.
*/
if ((rel->rd_smgr->smgr_vm_nblocks == 0 ||
rel->rd_smgr->smgr_vm_nblocks == InvalidBlockNumber) &&
!smgrexists(rel->rd_smgr, VISIBILITYMAP_FORKNUM))
smgrcreate(rel->rd_smgr, VISIBILITYMAP_FORKNUM, false);
vm_nblocks_now = smgrnblocks(rel->rd_smgr, VISIBILITYMAP_FORKNUM);
while (vm_nblocks_now < vm_nblocks)
{
smgrextend(rel->rd_smgr, VISIBILITYMAP_FORKNUM, vm_nblocks_now,
(char *) pg, false);
vm_nblocks_now++;
}
/* Update local cache with the up-to-date size */
rel->rd_smgr->smgr_vm_nblocks = vm_nblocks_now;
UnlockRelationForExtension(rel, ExclusiveLock);
pfree(pg);
}