| 1 | /***************************************************************************** |
| 2 | |
| 3 | Copyright (c) 1996, 2015, Oracle and/or its affiliates. All Rights Reserved. |
| 4 | Copyright (c) 2018, MariaDB Corporation. |
| 5 | |
| 6 | This program is free software; you can redistribute it and/or modify it under |
| 7 | the terms of the GNU General Public License as published by the Free Software |
| 8 | Foundation; version 2 of the License. |
| 9 | |
| 10 | This program is distributed in the hope that it will be useful, but WITHOUT |
| 11 | ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS |
| 12 | FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details. |
| 13 | |
| 14 | You should have received a copy of the GNU General Public License along with |
| 15 | this program; if not, write to the Free Software Foundation, Inc., |
| 16 | 51 Franklin Street, Suite 500, Boston, MA 02110-1335 USA |
| 17 | |
| 18 | *****************************************************************************/ |
| 19 | |
| 20 | /**************************************************//** |
| 21 | @file read/read0read.cc |
| 22 | Cursor read |
| 23 | |
| 24 | Created 2/16/1997 Heikki Tuuri |
| 25 | *******************************************************/ |
| 26 | |
| 27 | #include "read0types.h" |
| 28 | |
| 29 | #include "srv0srv.h" |
| 30 | #include "trx0sys.h" |
| 31 | #include "trx0purge.h" |
| 32 | |
| 33 | /* |
| 34 | ------------------------------------------------------------------------------- |
| 35 | FACT A: Cursor read view on a secondary index sees only committed versions |
| 36 | ------- |
| 37 | of the records in the secondary index or those versions of rows created |
| 38 | by transaction which created a cursor before cursor was created even |
| 39 | if transaction which created the cursor has changed that clustered index page. |
| 40 | |
| 41 | PROOF: We must show that read goes always to the clustered index record |
| 42 | to see that record is visible in the cursor read view. Consider e.g. |
| 43 | following table and SQL-clauses: |
| 44 | |
| 45 | create table t1(a int not null, b int, primary key(a), index(b)); |
| 46 | insert into t1 values (1,1),(2,2); |
| 47 | commit; |
| 48 | |
| 49 | Now consider that we have a cursor for a query |
| 50 | |
| 51 | select b from t1 where b >= 1; |
| 52 | |
| 53 | This query will use secondary key on the table t1. Now after the first fetch |
| 54 | on this cursor if we do a update: |
| 55 | |
| 56 | update t1 set b = 5 where b = 2; |
| 57 | |
| 58 | Now second fetch of the cursor should not see record (2,5) instead it should |
| 59 | see record (2,2). |
| 60 | |
| 61 | We also should show that if we have delete t1 where b = 5; we still |
| 62 | can see record (2,2). |
| 63 | |
| 64 | When we access a secondary key record maximum transaction id is fetched |
| 65 | from this record and this trx_id is compared to up_limit_id in the view. |
| 66 | If trx_id in the record is greater than or equal to up_limit_id in the view |
| 67 | cluster record is accessed. Because trx_id of the creating |
| 68 | transaction is stored when this view was created to the list of |
| 69 | trx_ids not seen by this read view previous version of the |
| 70 | record is requested to be built. This is built using the clustered record. |
| 71 | If the secondary key record is delete-marked, its corresponding |
| 72 | clustered record can already be purged only if the record's |
| 73 | trx_id < low_limit_no. Purge can't remove any record deleted by a |
| 74 | transaction which was active when cursor was created. But, we still |
| 75 | may have a deleted secondary key record but no clustered record. But, |
| 76 | this is not a problem because this case is handled in |
| 77 | row_sel_get_clust_rec() function which is called |
| 78 | whenever we note that this read view does not see trx_id in the |
| 79 | record. Thus, we see correct version. Q. E. D. |
| 80 | |
| 81 | ------------------------------------------------------------------------------- |
| 82 | FACT B: Cursor read view on a clustered index sees only committed versions |
| 83 | ------- |
| 84 | of the records in the clustered index or those versions of rows created |
| 85 | by transaction which created a cursor before cursor was created even |
| 86 | if transaction which created the cursor has changed that clustered index page. |
| 87 | |
| 88 | PROOF: Consider e.g. the following table and SQL-clauses: |
| 89 | |
| 90 | create table t1(a int not null, b int, primary key(a)); |
| 91 | insert into t1 values (1),(2); |
| 92 | commit; |
| 93 | |
| 94 | Now consider that we have a cursor for a query |
| 95 | |
| 96 | select a from t1 where a >= 1; |
| 97 | |
| 98 | This query will use clustered key on the table t1. Now after the first fetch |
| 99 | on this cursor if we do a update: |
| 100 | |
| 101 | update t1 set a = 5 where a = 2; |
| 102 | |
| 103 | Now second fetch of the cursor should not see record (5) instead it should |
| 104 | see record (2). |
| 105 | |
| 106 | We also should show that if we execute delete t1 where a = 5; after |
| 107 | the cursor is opened we still can see record (2). |
| 108 | |
| 109 | When accessing clustered record we always check if this read view sees |
| 110 | trx_id stored to clustered record. By default we don't see any changes |
| 111 | if record trx_id >= low_limit_id i.e. change was made by a transaction |
| 112 | which started after transaction which created the cursor. If row |
| 113 | was changed by the future transaction a previous version of the |
| 114 | clustered record is created. Thus we see only committed version in |
| 115 | this case. We see all changes made by committed transactions i.e. |
| 116 | record trx_id < up_limit_id. In this case we don't need to do anything, |
| 117 | we already see correct version of the record. We don't see any changes |
| 118 | made by active transaction except creating transaction. We have stored |
| 119 | trx_id of creating transaction to list of trx_ids when this view was |
| 120 | created. Thus we can easily see if this record was changed by the |
| 121 | creating transaction. Because we already have clustered record we can |
| 122 | access roll_ptr. Using this roll_ptr we can fetch undo record. |
| 123 | We can now check that undo_no of the undo record is less than undo_no of the |
| 124 | transaction which created a view when cursor was created. We see this |
| 125 | clustered record only in case when record undo_no is less than undo_no |
| 126 | in the view. If this is not true we build based on undo_rec previous |
| 127 | version of the record. This record is found because purge can't remove |
| 128 | records accessed by active transaction. Thus we see correct version. Q. E. D. |
| 129 | ------------------------------------------------------------------------------- |
| 130 | FACT C: Purge does not remove any delete-marked row that is visible |
| 131 | ------- |
| 132 | in any cursor read view. |
| 133 | |
| 134 | PROOF: We know that: |
| 135 | 1: Currently active read views in trx_sys_t::view_list are ordered by |
| 136 | ReadView::low_limit_no in descending order, that is, |
| 137 | newest read view first. |
| 138 | |
| 139 | 2: Purge clones the oldest read view and uses that to determine whether there |
| 140 | are any active transactions that can see the to be purged records. |
| 141 | |
| 142 | Therefore any joining or active transaction will not have a view older |
| 143 | than the purge view, according to 1. |
| 144 | |
| 145 | When purge needs to remove a delete-marked row from a secondary index, |
| 146 | it will first check that the DB_TRX_ID value of the corresponding |
| 147 | record in the clustered index is older than the purge view. It will |
| 148 | also check if there is a newer version of the row (clustered index |
| 149 | record) that is not delete-marked in the secondary index. If such a |
| 150 | row exists and is collation-equal to the delete-marked secondary index |
| 151 | record then purge will not remove the secondary index record. |
| 152 | |
| 153 | Delete-marked clustered index records will be removed by |
| 154 | row_purge_remove_clust_if_poss(), unless the clustered index record |
| 155 | (and its DB_ROLL_PTR) has been updated. Every new version of the |
| 156 | clustered index record will update DB_ROLL_PTR, pointing to a new UNDO |
| 157 | log entry that allows the old version to be reconstructed. The |
| 158 | DB_ROLL_PTR in the oldest remaining version in the old-version chain |
| 159 | may be pointing to garbage (an undo log record discarded by purge), |
| 160 | but it will never be dereferenced, because the purge view is older |
| 161 | than any active transaction. |
| 162 | |
| 163 | For details see: row_vers_old_has_index_entry() and row_purge_poss_sec() |
| 164 | |
| 165 | Some additional issues: |
| 166 | |
| 167 | What if trx_sys.view_list == NULL and some transaction T1 and Purge both |
| 168 | try to open read_view at same time. Only one can acquire trx_sys.mutex. |
| 169 | In which order will the views be opened? Should it matter? If no, why? |
| 170 | |
| 171 | The order does not matter. No new transactions can be created and no running |
| 172 | RW transaction can commit or rollback (or free views). AC-NL-RO transactions |
| 173 | will mark their views as closed but not actually free their views. |
| 174 | */ |
| 175 | |
| 176 | |
| 177 | /** |
| 178 | Creates a snapshot where exactly the transactions serialized before this |
| 179 | point in time are seen in the view. |
| 180 | |
| 181 | @param[in,out] trx transaction |
| 182 | */ |
| 183 | inline void ReadView::snapshot(trx_t *trx) |
| 184 | { |
| 185 | trx_sys.snapshot_ids(trx, &m_ids, &m_low_limit_id, &m_low_limit_no); |
| 186 | std::sort(m_ids.begin(), m_ids.end()); |
| 187 | m_up_limit_id= m_ids.empty() ? m_low_limit_id : m_ids.front(); |
| 188 | ut_ad(m_up_limit_id <= m_low_limit_id); |
| 189 | } |
| 190 | |
| 191 | |
| 192 | /** |
| 193 | Opens a read view where exactly the transactions serialized before this |
| 194 | point in time are seen in the view. |
| 195 | |
| 196 | View becomes visible to purge thread. |
| 197 | |
| 198 | @param[in,out] trx transaction |
| 199 | */ |
| 200 | void ReadView::open(trx_t *trx) |
| 201 | { |
| 202 | ut_ad(this == &trx->read_view); |
| 203 | switch (m_state) |
| 204 | { |
| 205 | case READ_VIEW_STATE_OPEN: |
| 206 | ut_ad(!srv_read_only_mode); |
| 207 | return; |
| 208 | case READ_VIEW_STATE_CLOSED: |
| 209 | if (srv_read_only_mode) |
| 210 | return; |
| 211 | /* |
| 212 | Reuse closed view if there were no read-write transactions since (and at) |
| 213 | its creation time. |
| 214 | |
| 215 | Original comment states: there is an inherent race here between purge |
| 216 | and this thread. |
| 217 | |
| 218 | To avoid this race we should've checked trx_sys.get_max_trx_id() and |
| 219 | set state to READ_VIEW_STATE_OPEN atomically under trx_sys.mutex |
| 220 | protection. But we're cutting edges to achieve great scalability. |
| 221 | |
| 222 | There're at least two types of concurrent threads interested in this |
| 223 | value: purge coordinator thread (see trx_sys_t::clone_oldest_view()) and |
| 224 | InnoDB monitor thread (see lock_trx_print_wait_and_mvcc_state()). |
| 225 | |
| 226 | What bad things can happen because we allow this race? |
| 227 | |
| 228 | Speculative execution may reorder state change before get_max_trx_id(). |
| 229 | In this case purge thread has short gap to clone outdated view. Which is |
| 230 | probably not that bad: it just won't be able to purge things that it was |
| 231 | actually allowed to purge for a short while. |
| 232 | |
| 233 | This thread may as well get suspended after trx_sys.get_max_trx_id() and |
| 234 | before state is set to READ_VIEW_STATE_OPEN. New read-write transaction |
| 235 | may get started, committed and purged meanwhile. It is acceptable as |
| 236 | well, since this view doesn't see it. |
| 237 | */ |
| 238 | if (trx_is_autocommit_non_locking(trx) && m_ids.empty() && |
| 239 | m_low_limit_id == trx_sys.get_max_trx_id()) |
| 240 | goto reopen; |
| 241 | |
| 242 | /* |
| 243 | Can't reuse view, take new snapshot. |
| 244 | |
| 245 | Alas this empty critical section is simplest way to make sure concurrent |
| 246 | purge thread completed snapshot copy. Of course purge thread may come |
| 247 | again and try to copy once again after we release this mutex, but in |
| 248 | this case it is guaranteed to see READ_VIEW_STATE_REGISTERED and thus |
| 249 | it'll skip this view. |
| 250 | |
| 251 | This critical section can be replaced with new state, which purge thread |
| 252 | would set to inform us to wait until it completes snapshot. However it'd |
| 253 | complicate m_state even further. |
| 254 | */ |
| 255 | mutex_enter(&trx_sys.mutex); |
| 256 | mutex_exit(&trx_sys.mutex); |
| 257 | my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_SNAPSHOT, |
| 258 | MY_MEMORY_ORDER_RELAXED); |
| 259 | break; |
| 260 | default: |
| 261 | ut_ad(0); |
| 262 | } |
| 263 | |
| 264 | snapshot(trx); |
| 265 | reopen: |
| 266 | m_creator_trx_id= trx->id; |
| 267 | my_atomic_store32_explicit(&m_state, READ_VIEW_STATE_OPEN, |
| 268 | MY_MEMORY_ORDER_RELEASE); |
| 269 | } |
| 270 | |
| 271 | |
| 272 | /** |
| 273 | Clones the oldest view and stores it in view. |
| 274 | |
| 275 | No need to call ReadView::close(). The caller owns the view that is passed |
| 276 | in. This function is called by purge thread to determine whether it should |
| 277 | purge the delete marked record or not. |
| 278 | */ |
| 279 | void trx_sys_t::clone_oldest_view() |
| 280 | { |
| 281 | purge_sys.view.snapshot(0); |
| 282 | mutex_enter(&mutex); |
| 283 | /* Find oldest view. */ |
| 284 | for (const trx_t *trx= UT_LIST_GET_FIRST(trx_list); trx; |
| 285 | trx= UT_LIST_GET_NEXT(trx_list, trx)) |
| 286 | { |
| 287 | int32_t state; |
| 288 | |
| 289 | while ((state= trx->read_view.get_state()) == READ_VIEW_STATE_SNAPSHOT) |
| 290 | ut_delay(1); |
| 291 | |
| 292 | if (state == READ_VIEW_STATE_OPEN) |
| 293 | purge_sys.view.copy(trx->read_view); |
| 294 | } |
| 295 | mutex_exit(&mutex); |
| 296 | } |
| 297 | |