/*-------------------------------------------------------------------------
 *
 * buf_internals.h
 *	  Internal definitions for buffer manager and the buffer replacement
 *	  strategy.
 *
 *
 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 * src/include/storage/buf_internals.h
 *
 *-------------------------------------------------------------------------
 */
#ifndef BUFMGR_INTERNALS_H
#define BUFMGR_INTERNALS_H

#include "storage/buf.h"
#include "storage/bufmgr.h"
#include "storage/latch.h"
#include "storage/lwlock.h"
#include "storage/shmem.h"
#include "storage/smgr.h"
#include "port/atomics.h"
#include "storage/spin.h"
#include "utils/relcache.h"


/*
 * Buffer state is a single 32-bit variable that combines the following
 * data:
 *
 * - 18 bits refcount
 * - 4 bits usage count
 * - 10 bits of flags
 *
 * Combining these values into one variable allows us to perform some
 * operations without locking the buffer header, by modifying them together
 * in a single CAS loop.
 *
 * The definition of buffer state components is below.
 */
#define BUF_REFCOUNT_ONE 1
#define BUF_REFCOUNT_MASK ((1U << 18) - 1)
#define BUF_USAGECOUNT_MASK 0x003C0000U
#define BUF_USAGECOUNT_ONE (1U << 18)
#define BUF_USAGECOUNT_SHIFT 18
#define BUF_FLAG_MASK 0xFFC00000U

/* Get refcount and usagecount from buffer state */
#define BUF_STATE_GET_REFCOUNT(state) ((state) & BUF_REFCOUNT_MASK)
#define BUF_STATE_GET_USAGECOUNT(state) (((state) & BUF_USAGECOUNT_MASK) >> BUF_USAGECOUNT_SHIFT)
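
/*
 * Illustrative sketch (not part of this header): how a buffer can be
 * pinned without taking the header lock, by bumping the refcount in a
 * compare-and-swap loop.  This is loosely modeled on PinBuffer() in
 * bufmgr.c; the real code also adjusts the usage count and uses a spin
 * delay while BM_LOCKED is set:
 *
 *		uint32		old_buf_state = pg_atomic_read_u32(&buf->state);
 *
 *		for (;;)
 *		{
 *			uint32		buf_state = old_buf_state;
 *
 *			if (buf_state & BM_LOCKED)
 *			{
 *				// spin: re-read until the header lock is released
 *				old_buf_state = pg_atomic_read_u32(&buf->state);
 *				continue;
 *			}
 *			buf_state += BUF_REFCOUNT_ONE;
 *			if (pg_atomic_compare_exchange_u32(&buf->state,
 *											   &old_buf_state, buf_state))
 *				break;			// CAS succeeded; the buffer is pinned
 *		}
 */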

/*
 * Flags for buffer descriptors
 *
 * Note: BM_TAG_VALID essentially means that there is a buffer hashtable
 * entry associated with the buffer's tag.
 */
#define BM_LOCKED				(1U << 22)	/* buffer header is locked */
#define BM_DIRTY				(1U << 23)	/* data needs writing */
#define BM_VALID				(1U << 24)	/* data is valid */
#define BM_TAG_VALID			(1U << 25)	/* tag is assigned */
#define BM_IO_IN_PROGRESS		(1U << 26)	/* read or write in progress */
#define BM_IO_ERROR				(1U << 27)	/* previous I/O failed */
#define BM_JUST_DIRTIED			(1U << 28)	/* dirtied since write started */
#define BM_PIN_COUNT_WAITER		(1U << 29)	/* have waiter for sole pin */
#define BM_CHECKPOINT_NEEDED	(1U << 30)	/* must write for checkpoint */
#define BM_PERMANENT			(1U << 31)	/* permanent buffer (not unlogged,
											 * or init fork) */
/*
 * The maximum allowed value of usage_count represents a tradeoff between
 * accuracy and speed of the clock-sweep buffer management algorithm.  A
 * large value (comparable to NBuffers) would approximate LRU semantics.
 * But it can take as many as BM_MAX_USAGE_COUNT+1 complete cycles of
 * clock sweeps to find a free buffer, so in practice we don't want the
 * value to be very large.
 */
#define BM_MAX_USAGE_COUNT	5
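
/*
 * Illustrative sketch (not part of this header): how the clock sweep in
 * StrategyGetBuffer() (freelist.c) uses the usage count.  Each pass over a
 * buffer decrements its usage count; a buffer is only reclaimed once both
 * its refcount and usage count are zero.  Roughly:
 *
 *		buf_state = LockBufHdr(buf);
 *		if (BUF_STATE_GET_REFCOUNT(buf_state) == 0)
 *		{
 *			if (BUF_STATE_GET_USAGECOUNT(buf_state) != 0)
 *			{
 *				buf_state -= BUF_USAGECOUNT_ONE;	// tick of the clock hand
 *				UnlockBufHdr(buf, buf_state);
 *			}
 *			else
 *				return buf;		// victim found; returned with header locked
 *		}
 *		else
 *			UnlockBufHdr(buf, buf_state);
 */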

/*
 * Buffer tag identifies which disk block the buffer contains.
 *
 * Note: the BufferTag data must be sufficient to determine where to write the
 * block, without reference to pg_class or pg_tablespace entries.  It's
 * possible that the backend flushing the buffer doesn't even believe the
 * relation is visible yet (its xact may have started before the xact that
 * created the rel).  The storage manager must be able to cope anyway.
 *
 * Note: if there are any pad bytes in the struct, INIT_BUFFERTAG will have
 * to be fixed to zero them, since this struct is used as a hash key.
 */
typedef struct buftag
{
	RelFileNode rnode;			/* physical relation identifier */
	ForkNumber	forkNum;
	BlockNumber blockNum;		/* blknum relative to begin of reln */
} BufferTag;

#define CLEAR_BUFFERTAG(a) \
( \
	(a).rnode.spcNode = InvalidOid, \
	(a).rnode.dbNode = InvalidOid, \
	(a).rnode.relNode = InvalidOid, \
	(a).forkNum = InvalidForkNumber, \
	(a).blockNum = InvalidBlockNumber \
)

#define INIT_BUFFERTAG(a,xx_rnode,xx_forkNum,xx_blockNum) \
( \
	(a).rnode = (xx_rnode), \
	(a).forkNum = (xx_forkNum), \
	(a).blockNum = (xx_blockNum) \
)

#define BUFFERTAGS_EQUAL(a,b) \
( \
	RelFileNodeEquals((a).rnode, (b).rnode) && \
	(a).blockNum == (b).blockNum && \
	(a).forkNum == (b).forkNum \
)
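
/*
 * Illustrative sketch (not part of this header): building and comparing
 * tags with the macros above, assuming an SMgrRelation "smgr" is at hand
 * (loosely modeled on code in bufmgr.c):
 *
 *		BufferTag	newTag;
 *
 *		INIT_BUFFERTAG(newTag, smgr->smgr_rnode.node, forkNum, blockNum);
 *		if (BUFFERTAGS_EQUAL(newTag, buf->tag))
 *			... the buffer already contains the desired block ...
 */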

/*
 * The shared buffer mapping table is partitioned to reduce contention.
 * To determine which partition lock a given tag requires, compute the tag's
 * hash code with BufTableHashCode(), then apply BufMappingPartitionLock().
 * NB: NUM_BUFFER_PARTITIONS must be a power of 2!
 */
#define BufTableHashPartition(hashcode) \
	((hashcode) % NUM_BUFFER_PARTITIONS)
#define BufMappingPartitionLock(hashcode) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + \
					  BufTableHashPartition(hashcode)].lock)
#define BufMappingPartitionLockByIndex(i) \
	(&MainLWLockArray[BUFFER_MAPPING_LWLOCK_OFFSET + (i)].lock)
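
/*
 * Illustrative sketch (not part of this header): the canonical lookup
 * protocol, as used by BufferAlloc() in bufmgr.c:
 *
 *		uint32		newHash = BufTableHashCode(&newTag);
 *		LWLock	   *newPartitionLock = BufMappingPartitionLock(newHash);
 *
 *		LWLockAcquire(newPartitionLock, LW_SHARED);
 *		buf_id = BufTableLookup(&newTag, newHash);
 *		...
 *		LWLockRelease(newPartitionLock);
 */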

/*
 * BufferDesc -- shared descriptor/state data for a single shared buffer.
 *
 * Note: Buffer header lock (BM_LOCKED flag) must be held to examine or change
 * the tag, state or wait_backend_pid fields.  In general, the buffer header
 * lock is a spinlock which is combined with flags, refcount and usagecount
 * into a single atomic variable.  This layout allows us to do some operations
 * in a single atomic operation, without actually acquiring and releasing the
 * spinlock; for instance, increasing or decreasing the refcount.  The buf_id
 * field never changes after initialization, so it does not need locking.
 * freeNext is protected by the buffer_strategy_lock, not the buffer header
 * lock.  The LWLock can take care of itself.  The buffer header lock is *not*
 * used to control access to the data in the buffer!
 *
 * It's assumed that nobody changes the state field while the buffer header
 * lock is held.  Thus the buffer header lock holder can do complex updates of
 * the state variable in a single write, simultaneously with lock release
 * (clearing the BM_LOCKED flag).  On the other hand, updating the state
 * without holding the buffer header lock is restricted to CAS, which ensures
 * that the BM_LOCKED flag is not set.  Atomic increment/decrement, OR/AND
 * etc. are not allowed.
 *
 * An exception is that if we have the buffer pinned, its tag can't change
 * underneath us, so we can examine the tag without locking the buffer header.
 * Also, in places we do one-time reads of the flags without bothering to
 * lock the buffer header; this is generally for situations where we don't
 * expect the flag bit being tested to be changing.
 *
 * We can't physically remove items from a disk page if another backend has
 * the buffer pinned.  Hence, a backend may need to wait for all other pins
 * to go away.  This is signaled by storing its own PID into
 * wait_backend_pid and setting flag bit BM_PIN_COUNT_WAITER.  At present,
 * there can be only one such waiter per buffer.
 *
 * We use this same struct for local buffer headers, but the locks are not
 * used and not all of the flag bits are useful either.  To avoid unnecessary
 * overhead, manipulations of the state field should be done without actual
 * atomic operations (i.e. only pg_atomic_read_u32() and
 * pg_atomic_unlocked_write_u32()).
 *
 * Be careful to avoid increasing the size of the struct when adding or
 * reordering members.  Keeping it below 64 bytes (the most common CPU
 * cache line size) is fairly important for performance.
 */
typedef struct BufferDesc
{
	BufferTag	tag;			/* ID of page contained in buffer */
	int			buf_id;			/* buffer's index number (from 0) */

	/* state of the tag, containing flags, refcount and usagecount */
	pg_atomic_uint32 state;

	int			wait_backend_pid;	/* backend PID of pin-count waiter */
	int			freeNext;		/* link in freelist chain */

	LWLock		content_lock;	/* to lock access to buffer contents */
} BufferDesc;
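
/*
 * Illustrative sketch (not part of this header): an unlocked one-time read
 * of the flags, as described above.  This is only appropriate where a
 * momentarily stale answer is acceptable:
 *
 *		uint32		buf_state = pg_atomic_read_u32(&buf->state);
 *
 *		if (buf_state & BM_DIRTY)
 *			... the buffer was dirty at the instant of the read ...
 */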

/*
 * Concurrent access to buffer headers has proven to be more efficient if
 * they're cache line aligned.  So we force the start of the BufferDescriptors
 * array to be on a cache line boundary and force the elements to be cache
 * line sized.
 *
 * XXX: As this primarily matters in highly concurrent workloads, which these
 * days probably all run on 64-bit platforms, and the space wastage would be
 * a bit more noticeable on 32-bit systems, we don't force the stride to be
 * cache line sized on those.  If somebody does actual performance testing,
 * we can reevaluate.
 *
 * Note that local buffer descriptors aren't forced to be aligned - as there's
 * no concurrent access to those it's unlikely to be beneficial.
 *
 * We use 64 bytes as the cache line size here, because that's the most
 * common size.  Making it bigger would waste memory.  Even on a platform
 * with a 32- or 128-byte line size, aligning to 64-byte boundaries still
 * reduces false sharing.
 */
#define BUFFERDESC_PAD_TO_SIZE	(SIZEOF_VOID_P == 8 ? 64 : 1)

typedef union BufferDescPadded
{
	BufferDesc	bufferdesc;
	char		pad[BUFFERDESC_PAD_TO_SIZE];
} BufferDescPadded;

#define GetBufferDescriptor(id) (&BufferDescriptors[(id)].bufferdesc)
#define GetLocalBufferDescriptor(id) (&LocalBufferDescriptors[(id)])

#define BufferDescriptorGetBuffer(bdesc) ((bdesc)->buf_id + 1)

#define BufferDescriptorGetIOLock(bdesc) \
	(&(BufferIOLWLockArray[(bdesc)->buf_id]).lock)
#define BufferDescriptorGetContentLock(bdesc) \
	((LWLock*) (&(bdesc)->content_lock))

extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray;

/*
 * The freeNext field is either the index of the next freelist entry,
 * or one of these special values:
 */
#define FREENEXT_END_OF_LIST	(-1)
#define FREENEXT_NOT_IN_LIST	(-2)

/*
 * Functions for acquiring/releasing a shared buffer header's spinlock.  Do
 * not apply these to local buffers!
 */
extern uint32 LockBufHdr(BufferDesc *desc);
#define UnlockBufHdr(desc, s)	\
	do {	\
		pg_write_barrier(); \
		pg_atomic_write_u32(&(desc)->state, (s) & (~BM_LOCKED)); \
	} while (0)
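
/*
 * Illustrative sketch (not part of this header): the usual pairing.
 * LockBufHdr() returns the locked state value; the caller edits that value
 * locally and then publishes it, minus BM_LOCKED, in a single write via
 * UnlockBufHdr():
 *
 *		uint32		buf_state = LockBufHdr(buf);
 *
 *		buf_state |= BM_DIRTY | BM_JUST_DIRTIED;
 *		UnlockBufHdr(buf, buf_state);
 */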


/*
 * The PendingWriteback & WritebackContext structures are used to keep
 * information about pending flush requests to be issued to the OS.
 */
typedef struct PendingWriteback
{
	/* could store different types of pending flushes here */
	BufferTag	tag;
} PendingWriteback;

/* struct forward declared in bufmgr.h */
typedef struct WritebackContext
{
	/* pointer to the max number of writeback requests to coalesce */
	int		   *max_pending;

	/* current number of pending writeback requests */
	int			nr_pending;

	/* pending requests */
	PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES];
} WritebackContext;
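
/*
 * Illustrative sketch (not part of this header): a checkpoint-style user
 * of this machinery, loosely modeled on BufferSync() in bufmgr.c, where
 * checkpoint_flush_after is the GUC variable supplying the coalescing
 * limit:
 *
 *		WritebackContext wb_context;
 *
 *		WritebackContextInit(&wb_context, &checkpoint_flush_after);
 *		... for each buffer written out ...
 *		ScheduleBufferTagForWriteback(&wb_context, &bufHdr->tag);
 *		...
 *		IssuePendingWritebacks(&wb_context);
 */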

/* in buf_init.c */
extern PGDLLIMPORT BufferDescPadded *BufferDescriptors;
extern PGDLLIMPORT WritebackContext BackendWritebackContext;

/* in localbuf.c */
extern BufferDesc *LocalBufferDescriptors;

/* in bufmgr.c */

/*
 * Structure to sort buffers per file on checkpoints.
 *
 * This structure is allocated per buffer in shared memory, so it should be
 * kept as small as possible.
 */
typedef struct CkptSortItem
{
	Oid			tsId;
	Oid			relNode;
	ForkNumber	forkNum;
	BlockNumber blockNum;
	int			buf_id;
} CkptSortItem;

extern CkptSortItem *CkptBufferIds;

/*
 * Internal buffer management routines
 */
/* bufmgr.c */
extern void WritebackContextInit(WritebackContext *context, int *max_pending);
extern void IssuePendingWritebacks(WritebackContext *context);
extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag);

/* freelist.c */
extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy,
									 uint32 *buf_state);
extern void StrategyFreeBuffer(BufferDesc *buf);
extern bool StrategyRejectBuffer(BufferAccessStrategy strategy,
								 BufferDesc *buf);

extern int	StrategySyncStart(uint32 *complete_passes, uint32 *num_buf_alloc);
extern void StrategyNotifyBgWriter(int bgwprocno);

extern Size StrategyShmemSize(void);
extern void StrategyInitialize(bool init);
extern bool have_free_buffer(void);

/* buf_table.c */
extern Size BufTableShmemSize(int size);
extern void InitBufTable(int size);
extern uint32 BufTableHashCode(BufferTag *tagPtr);
extern int	BufTableLookup(BufferTag *tagPtr, uint32 hashcode);
extern int	BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id);
extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode);
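
/*
 * Illustrative sketch (not part of this header): entering a new mapping.
 * The caller must hold the tag's partition lock exclusively.  BufTableInsert
 * returns -1 on success, or the existing buffer id if another backend
 * already inserted the same tag (a race the caller must handle):
 *
 *		LWLockAcquire(newPartitionLock, LW_EXCLUSIVE);
 *		buf_id = BufTableInsert(&newTag, newHash, buf->buf_id);
 *		if (buf_id >= 0)
 *			... lost the race; use the existing buffer instead ...
 */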

/* localbuf.c */
extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum,
								BlockNumber blockNum);
extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum,
									BlockNumber blockNum, bool *foundPtr);
extern void MarkLocalBufferDirty(Buffer buffer);
extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum,
										BlockNumber firstDelBlock);
extern void DropRelFileNodeAllLocalBuffers(RelFileNode rnode);
extern void AtEOXact_LocalBuffers(bool isCommit);

#endif							/* BUFMGR_INTERNALS_H */