read0read.cc source code [MariaDB/storage/innobase/read/read0read.cc]

1	/*****************************************************************************
2
3	Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved.
4	Copyright (c) 2018, MariaDB Corporation.
5
6	This program is free software; you can redistribute it and/or modify it under
7	the terms of the GNU General Public License as published by the Free Software
8	Foundation; version 2 of the License.
9
10	This program is distributed in the hope that it will be useful, but WITHOUT
11	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
12	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
13
14	You should have received a copy of the GNU General Public License along with
15	this program; if not, write to the Free Software Foundation, Inc.,
16	51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA
17
18	*****************************************************************************/
19
20	/************************************************//**
21	@file read/read0read.cc
22	Cursor read
23
24	Created 2/16/1997 Heikki Tuuri
25	*******************************************************/
26
27	#include "read0types.h"
28
29	#include "srv0srv.h"
30	#include "trx0sys.h"
31	#include "trx0purge.h"
32
33	/*
34	-------------------------------------------------------------------------------
35	FACT A: Cursor read view on a secondary index sees only committed versions
36	-------
37	of the records in the secondary index or those versions of rows created
38	by transaction which created a cursor before cursor was created even
39	if transaction which created the cursor has changed that clustered index page.
40
41	PROOF: We must show that read goes always to the clustered index record
42	to see that record is visible in the cursor read view. Consider e.g.
43	following table and SQL-clauses:
44
45	create table t1(a int not null, b int, primary key(a), index(b));
46	insert into t1 values (1,1),(2,2);
47	commit;
48
49	Now consider that we have a cursor for a query
50
51	select b from t1 where b >= 1;
52
53	This query will use secondary key on the table t1. Now after the first fetch
54	on this cursor if we do a update:
55
56	update t1 set b = 5 where b = 2;
57
58	Now second fetch of the cursor should not see record (2,5) instead it should
59	see record (2,2).
60
61	We also should show that if we have delete t1 where b = 5; we still
62	can see record (2,2).
63
64	When we access a secondary key record maximum transaction id is fetched
65	from this record and this trx_id is compared to up_limit_id in the view.
66	If trx_id in the record is greater than or equal to up_limit_id in the view,
67	the clustered record is accessed. Because trx_id of the creating
68	transaction is stored when this view was created to the list of
69	trx_ids not seen by this read view, a previous version of the
70	record is requested to be built. This is built using the clustered record.
71	If the secondary key record is delete-marked, its corresponding
72	clustered record can already be purged only if the record's
73	trx_id < low_limit_no. Purge can't remove any record deleted by a
74	transaction which was active when cursor was created. But, we still
75	may have a deleted secondary key record but no clustered record. But,
76	this is not a problem because this case is handled in
77	row_sel_get_clust_rec() function which is called
78	whenever we note that this read view does not see trx_id in the
79	record. Thus, we see correct version. Q. E. D.
80
81	-------------------------------------------------------------------------------
82	FACT B: Cursor read view on a clustered index sees only committed versions
83	-------
84	of the records in the clustered index or those versions of rows created
85	by transaction which created a cursor before cursor was created even
86	if transaction which created the cursor has changed that clustered index page.
87
88	PROOF: Consider e.g. the following table and SQL-clauses:
89
90	create table t1(a int not null, b int, primary key(a));
91	insert into t1 values (1),(2);
92	commit;
93
94	Now consider that we have a cursor for a query
95
96	select a from t1 where a >= 1;
97
98	This query will use clustered key on the table t1. Now after the first fetch
99	on this cursor if we do a update:
100
101	update t1 set a = 5 where a = 2;
102
103	Now second fetch of the cursor should not see record (5) instead it should
104	see record (2).
105
106	We also should show that if we execute delete t1 where a = 5; after
107	the cursor is opened we still can see record (2).
108
109	When accessing clustered record we always check if this read view sees
110	trx_id stored to clustered record. By default we don't see any changes
111	if record trx_id >= low_limit_id, i.e. the change was made by a transaction
112	which started after transaction which created the cursor. If row
113	was changed by the future transaction a previous version of the
114	clustered record is created. Thus we see only committed version in
115	this case. We see all changes made by committed transactions i.e.
116	record trx_id < up_limit_id. In this case we don't need to do anything,
117	we already see correct version of the record. We don't see any changes
118	made by active transaction except creating transaction. We have stored
119	trx_id of creating transaction to list of trx_ids when this view was
120	created. Thus we can easily see if this record was changed by the
121	creating transaction. Because we already have clustered record we can
122	access roll_ptr. Using this roll_ptr we can fetch undo record.
123	We can now check that undo_no of the undo record is less than undo_no of the
124	transaction which created a view when cursor was created. We see this
125	clustered record only in case when record undo_no is less than undo_no
126	in the view. If this is not true we build based on undo_rec previous
127	version of the record. This record is found because purge can't remove
128	records accessed by active transaction. Thus we see correct version. Q. E. D.
129	-------------------------------------------------------------------------------
130	FACT C: Purge does not remove any delete-marked row that is visible
131	-------
132	in any cursor read view.
133
134	PROOF: We know that:
135	1: Currently active read views in trx_sys_t::view_list are ordered by
136	ReadView::low_limit_no in descending order, that is,
137	newest read view first.
138
139	2: Purge clones the oldest read view and uses that to determine whether there
140	are any active transactions that can see the to be purged records.
141
142	Therefore any joining or active transaction will not have a view older
143	than the purge view, according to 1.
144
145	When purge needs to remove a delete-marked row from a secondary index,
146	it will first check that the DB_TRX_ID value of the corresponding
147	record in the clustered index is older than the purge view. It will
148	also check if there is a newer version of the row (clustered index
149	record) that is not delete-marked in the secondary index. If such a
150	row exists and is collation-equal to the delete-marked secondary index
151	record then purge will not remove the secondary index record.
152
153	Delete-marked clustered index records will be removed by
154	row_purge_remove_clust_if_poss(), unless the clustered index record
155	(and its DB_ROLL_PTR) has been updated. Every new version of the
156	clustered index record will update DB_ROLL_PTR, pointing to a new UNDO
157	log entry that allows the old version to be reconstructed. The
158	DB_ROLL_PTR in the oldest remaining version in the old-version chain
159	may be pointing to garbage (an undo log record discarded by purge),
160	but it will never be dereferenced, because the purge view is older
161	than any active transaction.
162
163	For details see: row_vers_old_has_index_entry() and row_purge_poss_sec()
164
165	Some additional issues:
166
167	What if trx_sys.view_list == NULL and some transaction T1 and Purge both
168	try to open read_view at same time. Only one can acquire trx_sys.mutex.
169	In which order will the views be opened? Should it matter? If no, why?
170
171	The order does not matter. No new transactions can be created and no running
172	RW transaction can commit or rollback (or free views). AC-NL-RO transactions
173	will mark their views as closed but not actually free their views.
174	*/
175
176
177	/**
178	Creates a snapshot where exactly the transactions serialized before this
179	point in time are seen in the view.
180
181	@param[in,out] trx transaction
182	*/
183	inline void ReadView::snapshot(trx_t *trx)
184	{
185	trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no);
186	std::sort(m_ids.begin(), m_ids.end());
187	m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front();
188	ut_ad(m_up_limit_id <= m_low_limit_id);
189	}
190
191
192	/**
193	Opens a read view where exactly the transactions serialized before this
194	point in time are seen in the view.
195
196	View becomes visible to purge thread.
197
198	@param[in,out] trx transaction
199	*/
200	void ReadView::open(trx_t *trx)
201	{
202	ut_ad(this == &trx->read_view);
203	switch (m_state)
204	{
205	case READ_VIEW_STATE_OPEN:
206	ut_ad(!srv_read_only_mode);
207	return;
208	case READ_VIEW_STATE_CLOSED:
209	if (srv_read_only_mode)
210	return;
211	/*
212	Reuse closed view if there were no read-write transactions since (and at)
213	its creation time.
214
215	Original comment states: there is an inherent race here between purge
216	and this thread.
217
218	To avoid this race we should've checked trx_sys.get_max_trx_id() and
219	set state to READ_VIEW_STATE_OPEN atomically under trx_sys.mutex
220	protection. But we're cutting edges to achieve great scalability.
221
222	There're at least two types of concurrent threads interested in this
223	value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and
224	InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()).
225
226	What bad things can happen because we allow this race?
227
228	Speculative execution may reorder state change before get_max_trx_id().
229	In this case purge thread has short gap to clone outdated view. Which is
230	probably not that bad: it just won't be able to purge things that it was
231	actually allowed to purge for a short while.
232
233	This thread may as well get suspended after trx_sys.get_max_trx_id() and
234	before state is set to READ_VIEW_STATE_OPEN. New read-write transaction
235	may get started, committed and purged meanwhile. It is acceptable as
236	well, since this view doesn't see it.
237	*/
238	if (trx_is_autocommit_non_locking(trx) && m_ids.empty() &&
239	m_low_limit_id == trx_sys.get_max_trx_id())
240	goto reopen;
241
242	/*
243	Can't reuse view, take new snapshot.
244
245	Alas this empty critical section is simplest way to make sure concurrent
246	purge thread completed snapshot copy. Of course purge thread may come
247	again and try to copy once again after we release this mutex, but in
248	this case it is guaranteed to see READ_VIEW_STATE_REGISTERED and thus
249	it'll skip this view.
250
251	This critical section can be replaced with new state, which purge thread
252	would set to inform us to wait until it completes snapshot. However it'd
253	complicate m_state even further.
254	*/
255	mutex_enter(&trx_sys.mutex);
256	mutex_exit(&trx_sys.mutex);
257	my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_SNAPSHOT,
258	MY_MEMORY_ORDER_RELAXED);
259	break;
260	default:
261	ut_ad(`0`);
262	}
263
264	snapshot(trx);
265	reopen:
266	m_creator_trx_id= trx->id;
267	my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_OPEN,
268	MY_MEMORY_ORDER_RELEASE);
269	}
270
271
272	/**
273	Clones the oldest view and stores it in view.
274
275	No need to call ReadView::close(). The caller owns the view that is passed
276	in. This function is called by purge thread to determine whether it should
277	purge the delete marked record or not.
278	*/
279	void trx_sys_t::clone_oldest_view()
280	{
281	purge_sys.view.snapshot(`0`);
282	mutex_enter(&mutex);
283	/ Find oldest view. /
284	for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx;
285	trx= UT_LIST_GET_NEXT(trx_list, trx))
286	{
287	int32_t state;
288
289	while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT)
290	ut_delay(`1`);
291
292	if (state == READ_VIEW_STATE_OPEN)
293	purge_sys.view.copy(trx->read_view);
294	}
295	mutex_exit(&mutex);
296	}
297

Browse the source code of MariaDB/storage/innobase/read/read0read.cc