/*-------------------------------------------------------------------------
 *
 * freelist.c
 *	  routines for managing the buffer pool's replacement strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  src/backend/storage/buffer/freelist.c
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"

#include "port/atomics.h"
#include "storage/buf_internals.h"
#include "storage/bufmgr.h"
#include "storage/proc.h"

#define INT_ACCESS_ONCE(var)	((int) (*((volatile int *) &(var))))


/*
 * The shared freelist control information.
 */
typedef struct
{
	/* Spinlock: protects the values below */
	slock_t		buffer_strategy_lock;

	/*
	 * Clock sweep hand: index of next buffer to consider grabbing. Note that
	 * this isn't a concrete buffer - we only ever increase the value. So, to
	 * get an actual buffer, it needs to be used modulo NBuffers.
	 */
	pg_atomic_uint32 nextVictimBuffer;

	int			firstFreeBuffer;	/* Head of list of unused buffers */
	int			lastFreeBuffer; /* Tail of list of unused buffers */

	/*
	 * NOTE: lastFreeBuffer is undefined when firstFreeBuffer is -1 (that is,
	 * when the list is empty)
	 */

	/*
	 * Statistics.  These counters should be wide enough that they can't
	 * overflow during a single bgwriter cycle.
	 */
	uint32		completePasses; /* Complete cycles of the clock sweep */
	pg_atomic_uint32 numBufferAllocs;	/* Buffers allocated since last reset */

	/*
	 * Bgworker process to be notified upon activity or -1 if none. See
	 * StrategyNotifyBgWriter.
	 */
	int			bgwprocno;
} BufferStrategyControl;

/* Pointers to shared state */
static BufferStrategyControl *StrategyControl = NULL;

/*
 * Private (non-shared) state for managing a ring of shared buffers to re-use.
 * This is currently the only kind of BufferAccessStrategy object, but someday
 * we might have more kinds.
 */
typedef struct BufferAccessStrategyData
{
	/* Overall strategy type */
	BufferAccessStrategyType btype;
	/* Number of elements in buffers[] array */
	int			ring_size;

	/*
	 * Index of the "current" slot in the ring, ie, the one most recently
	 * returned by GetBufferFromRing.
	 */
	int			current;

	/*
	 * True if the buffer just returned by StrategyGetBuffer had been in the
	 * ring already.
	 */
	bool		current_was_in_ring;

	/*
	 * Array of buffer numbers.  InvalidBuffer (that is, zero) indicates we
	 * have not yet selected a buffer for this ring slot.  For allocation
	 * simplicity this is palloc'd together with the fixed fields of the
	 * struct.
	 */
	Buffer		buffers[FLEXIBLE_ARRAY_MEMBER];
} BufferAccessStrategyData;


/* Prototypes for internal functions */
static BufferDesc *GetBufferFromRing(BufferAccessStrategy strategy,
									 uint32 *buf_state);
static void AddBufferToRing(BufferAccessStrategy strategy,
							BufferDesc *buf);

/*
 * ClockSweepTick - Helper routine for StrategyGetBuffer()
 *
 * Move the clock hand one buffer ahead of its current position and return the
 * id of the buffer now under the hand.
 */
static inline uint32
ClockSweepTick(void)
{
	uint32		victim;

	/*
	 * Atomically move the hand ahead one buffer - if several processes are
	 * doing this concurrently, buffers can be returned slightly out of
	 * apparent order.
	 */
	victim =
		pg_atomic_fetch_add_u32(&StrategyControl->nextVictimBuffer, 1);

	if (victim >= NBuffers)
	{
		uint32		originalVictim = victim;

		/* always wrap what we look up in BufferDescriptors */
		victim = victim % NBuffers;

		/*
		 * If we're the one that just caused a wraparound, force
		 * completePasses to be incremented while holding the spinlock. We
		 * need the spinlock so StrategySyncStart() can return a consistent
		 * value consisting of nextVictimBuffer and completePasses.
		 */
		if (victim == 0)
		{
			uint32		expected;
			uint32		wrapped;
			bool		success = false;

			expected = originalVictim + 1;

			while (!success)
			{
				/*
				 * Acquire the spinlock while increasing completePasses. That
				 * allows other readers to read nextVictimBuffer and
				 * completePasses in a consistent manner which is required for
				 * StrategySyncStart().  In theory delaying the increment
				 * could lead to an overflow of nextVictimBuffer, but that's
				 * highly unlikely and wouldn't be particularly harmful.
				 */
				SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

				wrapped = expected % NBuffers;

				success = pg_atomic_compare_exchange_u32(&StrategyControl->nextVictimBuffer,
														 &expected, wrapped);
				if (success)
					StrategyControl->completePasses++;
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
			}
		}
	}
	return victim;
}
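
/*
 * Illustrative walk-through of the wraparound path above (added commentary,
 * with NBuffers = 128 assumed purely for the example): a backend whose
 * fetch-add returns 128 computes victim = 0, so it is the one that wrapped.
 * It then tries to CAS nextVictimBuffer from expected = 129 down to
 * 129 % 128 = 1.  If another backend has advanced the counter to 130 in the
 * meantime, the CAS fails, pg_atomic_compare_exchange_u32() stores the
 * current value 130 back into expected, and the loop retries with
 * wrapped = 2, until the exchange succeeds and completePasses is bumped
 * exactly once for this wraparound.
 */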

/*
 * have_free_buffer -- a lockless check to see if there is a free buffer in
 *					   the buffer pool.
 *
 * A true result can become stale as soon as other backends take buffers off
 * the freelist, so callers that strictly need a free buffer should not rely
 * on this check.
 */
bool
have_free_buffer(void)
{
	if (StrategyControl->firstFreeBuffer >= 0)
		return true;
	else
		return false;
}
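
/*
 * Example caller pattern (a sketch only; load_next_block() is a hypothetical
 * helper, not part of PostgreSQL): a prewarming worker that only wants to
 * fill otherwise-unused buffers can stop as soon as the freelist looks
 * empty, accepting that the check may already be stale by then:
 *
 *		while (have_free_buffer())
 *		{
 *			if (!load_next_block())
 *				break;
 *		}
 */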

/*
 * StrategyGetBuffer
 *
 *	Called by the bufmgr to get the next candidate buffer to use in
 *	BufferAlloc(). The only hard requirement BufferAlloc() has is that
 *	the selected buffer must not currently be pinned by anyone.
 *
 *	strategy is a BufferAccessStrategy object, or NULL for default strategy.
 *
 *	To ensure that no one else can pin the buffer before we do, we must
 *	return the buffer with the buffer header spinlock still held.
 */
BufferDesc *
StrategyGetBuffer(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	int			bgwprocno;
	int			trycounter;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/*
	 * If given a strategy object, see whether it can select a buffer. We
	 * assume strategy objects don't need buffer_strategy_lock.
	 */
	if (strategy != NULL)
	{
		buf = GetBufferFromRing(strategy, buf_state);
		if (buf != NULL)
			return buf;
	}

	/*
	 * If asked, we need to waken the bgwriter. Since we don't want to rely on
	 * a spinlock for this we force a read from shared memory once, and then
	 * set the latch based on that value. We need to go through that length
	 * because otherwise bgwprocno might be reset while/after we check because
	 * the compiler might just reread from memory.
	 *
	 * This can possibly set the latch of the wrong process if the bgwriter
	 * dies in the wrong moment. But since PGPROC->procLatch is never
	 * deallocated the worst consequence of that is that we set the latch of
	 * some arbitrary process.
	 */
	bgwprocno = INT_ACCESS_ONCE(StrategyControl->bgwprocno);
	if (bgwprocno != -1)
	{
		/* reset bgwprocno first, before setting the latch */
		StrategyControl->bgwprocno = -1;

		/*
		 * Not acquiring ProcArrayLock here which is slightly icky. It's
		 * actually fine because procLatch isn't ever freed, so we just can
		 * potentially set the wrong process' (or no process') latch.
		 */
		SetLatch(&ProcGlobal->allProcs[bgwprocno].procLatch);
	}

	/*
	 * We count buffer allocation requests so that the bgwriter can estimate
	 * the rate of buffer consumption.  Note that buffers recycled by a
	 * strategy object are intentionally not counted here.
	 */
	pg_atomic_fetch_add_u32(&StrategyControl->numBufferAllocs, 1);

	/*
	 * First check, without acquiring the lock, whether there are buffers in
	 * the freelist. Since we otherwise don't require the spinlock in every
	 * StrategyGetBuffer() invocation, it'd be sad to acquire it here -
	 * uselessly in most cases. That obviously leaves a race where a buffer is
	 * put on the freelist but we don't see the store yet - but that's pretty
	 * harmless, it'll just get used during the next buffer acquisition.
	 *
	 * If there are buffers on the freelist, acquire the spinlock to pop one
	 * buffer off the freelist. Then check whether that buffer is usable and
	 * repeat if not.
	 *
	 * Note that the freeNext fields are considered to be protected by the
	 * buffer_strategy_lock, not the individual buffer spinlocks, so it's OK
	 * to manipulate them without holding the buffer spinlock.
	 */
	if (StrategyControl->firstFreeBuffer >= 0)
	{
		while (true)
		{
			/* Acquire the spinlock to remove element from the freelist */
			SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

			if (StrategyControl->firstFreeBuffer < 0)
			{
				SpinLockRelease(&StrategyControl->buffer_strategy_lock);
				break;
			}

			buf = GetBufferDescriptor(StrategyControl->firstFreeBuffer);
			Assert(buf->freeNext != FREENEXT_NOT_IN_LIST);

			/* Unconditionally remove buffer from freelist */
			StrategyControl->firstFreeBuffer = buf->freeNext;
			buf->freeNext = FREENEXT_NOT_IN_LIST;

			/*
			 * Release the lock so someone else can access the freelist while
			 * we check out this buffer.
			 */
			SpinLockRelease(&StrategyControl->buffer_strategy_lock);

			/*
			 * If the buffer is pinned or has a nonzero usage_count, we cannot
			 * use it; discard it and retry.  (This can only happen if VACUUM
			 * put a valid buffer in the freelist and then someone else used
			 * it before we got to it.  It's probably impossible altogether as
			 * of 8.3, but we'd better check anyway.)
			 */
			local_buf_state = LockBufHdr(buf);
			if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
				&& BUF_STATE_GET_USAGECOUNT(local_buf_state) == 0)
			{
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
			UnlockBufHdr(buf, local_buf_state);
		}
	}

	/* Nothing on the freelist, so run the "clock sweep" algorithm */
	trycounter = NBuffers;
	for (;;)
	{
		buf = GetBufferDescriptor(ClockSweepTick());

		/*
		 * If the buffer is pinned or has a nonzero usage_count, we cannot use
		 * it; decrement the usage_count (unless pinned) and keep scanning.
		 */
		local_buf_state = LockBufHdr(buf);

		if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0)
		{
			if (BUF_STATE_GET_USAGECOUNT(local_buf_state) != 0)
			{
				local_buf_state -= BUF_USAGECOUNT_ONE;

				trycounter = NBuffers;
			}
			else
			{
				/* Found a usable buffer */
				if (strategy != NULL)
					AddBufferToRing(strategy, buf);
				*buf_state = local_buf_state;
				return buf;
			}
		}
		else if (--trycounter == 0)
		{
			/*
			 * We've scanned all the buffers without making any state changes,
			 * so all the buffers are pinned (or were when we looked at them).
			 * We could hope that someone will free one eventually, but it's
			 * probably better to fail than to risk getting stuck in an
			 * infinite loop.
			 */
			UnlockBufHdr(buf, local_buf_state);
			elog(ERROR, "no unpinned buffers available");
		}
		UnlockBufHdr(buf, local_buf_state);
	}
}
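
/*
 * Sketch of the caller contract (condensed from BufferAlloc() in bufmgr.c;
 * see that function for the authoritative sequence): the buffer comes back
 * with its header spinlock held, so the caller must pin it, or give it up,
 * before doing anything that might sleep:
 *
 *		buf = StrategyGetBuffer(strategy, &buf_state);
 *		Assert(BUF_STATE_GET_REFCOUNT(buf_state) == 0);
 *		oldFlags = buf_state & BUF_FLAG_MASK;
 *		PinBuffer_Locked(buf);
 *
 * PinBuffer_Locked() both pins the buffer and releases the header spinlock.
 */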

/*
 * StrategyFreeBuffer: put a buffer on the freelist
 */
void
StrategyFreeBuffer(BufferDesc *buf)
{
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);

	/*
	 * It is possible that we are told to put something in the freelist that
	 * is already in it; don't screw up the list if so.
	 */
	if (buf->freeNext == FREENEXT_NOT_IN_LIST)
	{
		buf->freeNext = StrategyControl->firstFreeBuffer;
		if (buf->freeNext < 0)
			StrategyControl->lastFreeBuffer = buf->buf_id;
		StrategyControl->firstFreeBuffer = buf->buf_id;
	}

	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}

/*
 * StrategySyncStart -- tell BufferSync where to start syncing
 *
 * The result is the buffer index of the best buffer to sync first.
 * BufferSync() will proceed circularly around the buffer array from there.
 *
 * In addition, we return the completed-pass count (which is effectively
 * the higher-order bits of nextVictimBuffer) and the count of recent buffer
 * allocs if non-NULL pointers are passed.  The alloc count is reset after
 * being read.
 */
int
StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc)
{
	uint32		nextVictimBuffer;
	int			result;

	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	nextVictimBuffer = pg_atomic_read_u32(&StrategyControl->nextVictimBuffer);
	result = nextVictimBuffer % NBuffers;

	if (complete_passes)
	{
		*complete_passes = StrategyControl->completePasses;

		/*
		 * Additionally add the number of wraparounds that happened before
		 * completePasses could be incremented.  C.f. ClockSweepTick().
		 */
		*complete_passes += nextVictimBuffer / NBuffers;
	}
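
	/*
	 * Worked example (illustrative numbers only): with NBuffers = 1000, a
	 * stored completePasses of 2 and nextVictimBuffer = 1300 means one
	 * wraparound has happened but has not yet been folded into
	 * completePasses by ClockSweepTick(), so we report 2 + 1300 / 1000 = 3
	 * complete passes and a starting index of 1300 % 1000 = 300.
	 */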

	if (num_buf_alloc)
	{
		*num_buf_alloc = pg_atomic_exchange_u32(&StrategyControl->numBufferAllocs, 0);
	}
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
	return result;
}

/*
 * StrategyNotifyBgWriter -- set or clear allocation notification latch
 *
 * If bgwprocno isn't -1, the next invocation of StrategyGetBuffer will
 * set that latch.  Pass -1 to clear the pending notification before it
 * happens.  This feature is used by the bgwriter process to wake itself up
 * from hibernation, and is not meant for anybody else to use.
 */
void
StrategyNotifyBgWriter(int bgwprocno)
{
	/*
	 * We acquire buffer_strategy_lock just to ensure that the store appears
	 * atomic to StrategyGetBuffer.  The bgwriter should call this rather
	 * infrequently, so there's no performance penalty from being safe.
	 */
	SpinLockAcquire(&StrategyControl->buffer_strategy_lock);
	StrategyControl->bgwprocno = bgwprocno;
	SpinLockRelease(&StrategyControl->buffer_strategy_lock);
}


/*
 * StrategyShmemSize
 *
 * estimate the size of shared memory used by the freelist-related structures.
 *
 * Note: for somewhat historical reasons, the buffer lookup hashtable size
 * is also determined here.
 */
Size
StrategyShmemSize(void)
{
	Size		size = 0;

	/* size of lookup hash table ... see comment in StrategyInitialize */
	size = add_size(size, BufTableShmemSize(NBuffers + NUM_BUFFER_PARTITIONS));

	/* size of the shared replacement strategy control block */
	size = add_size(size, MAXALIGN(sizeof(BufferStrategyControl)));

	return size;
}

/*
 * StrategyInitialize -- initialize the buffer cache replacement
 *		strategy.
 *
 * Assumes: All of the buffers are already built into a linked list.
 *		Only called by postmaster and only during initialization.
 */
void
StrategyInitialize(bool init)
{
	bool		found;

	/*
	 * Initialize the shared buffer lookup hashtable.
	 *
	 * Since we can't tolerate running out of lookup table entries, we must be
	 * sure to specify an adequate table size here.  The maximum steady-state
	 * usage is of course NBuffers entries, but BufferAlloc() tries to insert
	 * a new entry before deleting the old.  In principle this could be
	 * happening in each partition concurrently, so we could need as many as
	 * NBuffers + NUM_BUFFER_PARTITIONS entries.
	 */
	InitBufTable(NBuffers + NUM_BUFFER_PARTITIONS);

	/*
	 * Get or create the shared strategy control block
	 */
	StrategyControl = (BufferStrategyControl *)
		ShmemInitStruct("Buffer Strategy Status",
						sizeof(BufferStrategyControl),
						&found);

	if (!found)
	{
		/*
		 * Only done once, usually in postmaster
		 */
		Assert(init);

		SpinLockInit(&StrategyControl->buffer_strategy_lock);

		/*
		 * Grab the whole linked list of free buffers for our strategy. We
		 * assume it was previously set up by InitBufferPool().
		 */
		StrategyControl->firstFreeBuffer = 0;
		StrategyControl->lastFreeBuffer = NBuffers - 1;

		/* Initialize the clock sweep pointer */
		pg_atomic_init_u32(&StrategyControl->nextVictimBuffer, 0);

		/* Clear statistics */
		StrategyControl->completePasses = 0;
		pg_atomic_init_u32(&StrategyControl->numBufferAllocs, 0);

		/* No pending notification */
		StrategyControl->bgwprocno = -1;
	}
	else
		Assert(!init);
}


/* ----------------------------------------------------------------
 *				Backend-private buffer ring management
 * ----------------------------------------------------------------
 */


/*
 * GetAccessStrategy -- create a BufferAccessStrategy object
 *
 * The object is allocated in the current memory context.
 */
BufferAccessStrategy
GetAccessStrategy(BufferAccessStrategyType btype)
{
	BufferAccessStrategy strategy;
	int			ring_size;

	/*
	 * Select ring size to use.  See buffer/README for rationales.
	 *
	 * Note: if you change the ring size for BAS_BULKREAD, see also
	 * SYNC_SCAN_REPORT_INTERVAL in access/heap/syncscan.c.
	 */
	switch (btype)
	{
		case BAS_NORMAL:
			/* if someone asks for NORMAL, just give 'em a "default" object */
			return NULL;

		case BAS_BULKREAD:
			ring_size = 256 * 1024 / BLCKSZ;
			break;
		case BAS_BULKWRITE:
			ring_size = 16 * 1024 * 1024 / BLCKSZ;
			break;
		case BAS_VACUUM:
			ring_size = 256 * 1024 / BLCKSZ;
			break;

		default:
			elog(ERROR, "unrecognized buffer access strategy: %d",
				 (int) btype);
			return NULL;		/* keep compiler quiet */
	}

	/* Make sure ring isn't an undue fraction of shared buffers */
	ring_size = Min(NBuffers / 8, ring_size);
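
	/*
	 * For illustration (assuming the default BLCKSZ of 8192): BAS_BULKREAD
	 * and BAS_VACUUM get 256kB / 8kB = 32 buffers, BAS_BULKWRITE gets
	 * 16MB / 8kB = 2048 buffers, and the Min() above then caps the ring at
	 * one eighth of shared_buffers, so e.g. with NBuffers = 128 a bulk-write
	 * ring shrinks to 16 buffers.
	 */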

	/* Allocate the object and initialize all elements to zeroes */
	strategy = (BufferAccessStrategy)
		palloc0(offsetof(BufferAccessStrategyData, buffers) +
				ring_size * sizeof(Buffer));

	/* Set fields that don't start out zero */
	strategy->btype = btype;
	strategy->ring_size = ring_size;

	return strategy;
}
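
/*
 * Usage sketch (illustrative only; rel, nblocks and blkno are placeholder
 * variables, and real callers such as VACUUM or COPY have more elaborate
 * loops): a caller doing a large sequential read might set up and tear down
 * a ring like this:
 *
 *		BufferAccessStrategy bstrategy = GetAccessStrategy(BAS_BULKREAD);
 *
 *		for (blkno = 0; blkno < nblocks; blkno++)
 *		{
 *			Buffer		buf = ReadBufferExtended(rel, MAIN_FORKNUM, blkno,
 *												 RBM_NORMAL, bstrategy);
 *
 *			... examine the page ...
 *			ReleaseBuffer(buf);
 *		}
 *		FreeAccessStrategy(bstrategy);
 */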

/*
 * FreeAccessStrategy -- release a BufferAccessStrategy object
 *
 * A simple pfree would do at the moment, but we would prefer that callers
 * don't assume that much about the representation of BufferAccessStrategy.
 */
void
FreeAccessStrategy(BufferAccessStrategy strategy)
{
	/* don't crash if called on a "default" strategy */
	if (strategy != NULL)
		pfree(strategy);
}

/*
 * GetBufferFromRing -- returns a buffer from the ring, or NULL if the
 *		ring is empty.
 *
 * The bufhdr spin lock is held on the returned buffer.
 */
static BufferDesc *
GetBufferFromRing(BufferAccessStrategy strategy, uint32 *buf_state)
{
	BufferDesc *buf;
	Buffer		bufnum;
	uint32		local_buf_state;	/* to avoid repeated (de-)referencing */

	/* Advance to next ring slot */
	if (++strategy->current >= strategy->ring_size)
		strategy->current = 0;

	/*
	 * If the slot hasn't been filled yet, tell the caller to allocate a new
	 * buffer with the normal allocation strategy.  He will then fill this
	 * slot by calling AddBufferToRing with the new buffer.
	 */
	bufnum = strategy->buffers[strategy->current];
	if (bufnum == InvalidBuffer)
	{
		strategy->current_was_in_ring = false;
		return NULL;
	}

	/*
	 * If the buffer is pinned we cannot use it under any circumstances.
	 *
	 * If usage_count is 0 or 1 then the buffer is fair game (we expect 1,
	 * since our own previous usage of the ring element would have left it
	 * there, but it might've been decremented by clock sweep since then). A
	 * higher usage_count indicates someone else has touched the buffer, so we
	 * shouldn't re-use it.
	 */
	buf = GetBufferDescriptor(bufnum - 1);
	local_buf_state = LockBufHdr(buf);
	if (BUF_STATE_GET_REFCOUNT(local_buf_state) == 0
		&& BUF_STATE_GET_USAGECOUNT(local_buf_state) <= 1)
	{
		strategy->current_was_in_ring = true;
		*buf_state = local_buf_state;
		return buf;
	}
	UnlockBufHdr(buf, local_buf_state);

	/*
	 * Tell caller to allocate a new buffer with the normal allocation
	 * strategy.  He'll then replace this ring element via AddBufferToRing.
	 */
	strategy->current_was_in_ring = false;
	return NULL;
}

/*
 * AddBufferToRing -- add a buffer to the buffer ring
 *
 * Caller must hold the buffer header spinlock on the buffer.  Since this
 * is called with the spinlock held, it had better be quite cheap.
 */
static void
AddBufferToRing(BufferAccessStrategy strategy, BufferDesc *buf)
{
	strategy->buffers[strategy->current] = BufferDescriptorGetBuffer(buf);
}

/*
 * StrategyRejectBuffer -- consider rejecting a dirty buffer
 *
 * When a nondefault strategy is used, the buffer manager calls this function
 * when it turns out that the buffer selected by StrategyGetBuffer needs to
 * be written out and doing so would require flushing WAL too.  This gives us
 * a chance to choose a different victim.
 *
 * Returns true if buffer manager should ask for a new victim, and false
 * if this buffer should be written and re-used.
 */
bool
StrategyRejectBuffer(BufferAccessStrategy strategy, BufferDesc *buf)
{
	/* We only do this in bulkread mode */
	if (strategy->btype != BAS_BULKREAD)
		return false;

	/* Don't muck with behavior of normal buffer-replacement strategy */
	if (!strategy->current_was_in_ring ||
		strategy->buffers[strategy->current] != BufferDescriptorGetBuffer(buf))
		return false;

	/*
	 * Remove the dirty buffer from the ring; necessary to prevent infinite
	 * loop if all ring members are dirty.
	 */
	strategy->buffers[strategy->current] = InvalidBuffer;

	return true;
}