diff --git a/mysql-test/suite/innodb/r/index_lock_upgrade.result b/mysql-test/suite/innodb/r/index_lock_upgrade.result new file mode 100644 index 0000000000000..4b582dbac0a51 --- /dev/null +++ b/mysql-test/suite/innodb/r/index_lock_upgrade.result @@ -0,0 +1,49 @@ +# +# MDEV-38814: btr_cur_need_opposite_intention() should not +# trigger index lock upgrade on the root page +# +# Test with simple table structure: +# Single INT primary key +# DATETIME column dt that starts NULL and gets updated to a +# distinct timestamp per row +# +# The NULL-to-DATETIME growth makes btr_cur_optimistic_update() +# return DB_UNDERFLOW (TODO: Why? DB_OVERFLOW was expected), +# falling through to the pessimistic path (BTR_MODIFY_TREE). +CREATE TABLE t1 ( +id INT NOT NULL, +dt DATETIME NULL, +PRIMARY KEY (id) +) ENGINE=InnoDB; +INSERT INTO t1 (id, dt) SELECT seq, NULL FROM seq_1_to_1000; +# Rows inserted with NULL DATETIME +SELECT COUNT(*) AS rows_inserted FROM t1; +rows_inserted +1000 +SELECT COUNT(DISTINCT dt) AS distinct_timestamps FROM t1; +distinct_timestamps +0 +# Index lock upgrades during inserts (no reduction w.r.t. root page checks) +SELECT @u01 AS index_lock_upgrades_insert; +index_lock_upgrades_insert +5 +SELECT @r01 AS need_opposite_intention_root_insert; +need_opposite_intention_root_insert +5 +# Now update all rows (batch update) +# This changes DATETIME from NULL to a distinct timestamp per row, +# triggering the pessimistic path due to record size change. +UPDATE t1 JOIN seq_1_to_1000 s ON t1.id = s.seq +SET t1.dt = '2025-01-01' + INTERVAL s.seq SECOND; +# Rows updated +SELECT COUNT(DISTINCT dt) AS distinct_timestamps FROM t1; +distinct_timestamps +1000 +# Index lock upgrades during updates (with reduction w.r.t. 
root page checks) +SELECT @u12 AS index_lock_upgrades_update; +index_lock_upgrades_update +120 +SELECT @r12 AS need_opposite_intention_root_update; +need_opposite_intention_root_update +414 +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/r/innodb_status_variables.result b/mysql-test/suite/innodb/r/innodb_status_variables.result index c6f4d4f27c45a..c9051dd4efb58 100644 --- a/mysql-test/suite/innodb/r/innodb_status_variables.result +++ b/mysql-test/suite/innodb/r/innodb_status_variables.result @@ -3,7 +3,8 @@ WHERE variable_name LIKE 'INNODB_%' AND variable_name NOT IN ('INNODB_ADAPTIVE_HASH_HASH_SEARCHES','INNODB_ADAPTIVE_HASH_NON_HASH_SEARCHES', 'INNODB_MEM_ADAPTIVE_HASH', -'INNODB_BUFFERED_AIO_SUBMITTED','INNODB_BUFFER_POOL_PAGES_LATCHED'); +'INNODB_BUFFERED_AIO_SUBMITTED','INNODB_BUFFER_POOL_PAGES_LATCHED', +'INNODB_BTR_INDEX_LOCK_UPGRADES','INNODB_BTR_NEED_OPPOSITE_INTENTION_ROOT'); variable_name INNODB_ASYNC_READS_PENDING INNODB_ASYNC_READS_TASKS_RUNNING diff --git a/mysql-test/suite/innodb/t/index_lock_upgrade.test b/mysql-test/suite/innodb/t/index_lock_upgrade.test new file mode 100644 index 0000000000000..c24f739f0b706 --- /dev/null +++ b/mysql-test/suite/innodb/t/index_lock_upgrade.test @@ -0,0 +1,74 @@ +--source include/have_innodb.inc +--source include/have_innodb_4k.inc +--source include/have_sequence.inc +--source include/have_debug.inc + +--echo # +--echo # MDEV-38814: btr_cur_need_opposite_intention() should not +--echo # trigger index lock upgrade on the root page +--echo # + +--echo # Test with simple table structure: +--echo # Single INT primary key +--echo # DATETIME column dt that starts NULL and gets updated to a +--echo # distinct timestamp per row +--echo # +--echo # The NULL-to-DATETIME growth makes btr_cur_optimistic_update() +--echo # return DB_UNDERFLOW (TODO: Why? DB_OVERFLOW was expected), +--echo # falling through to the pessimistic path (BTR_MODIFY_TREE). 
+CREATE TABLE t1 ( + id INT NOT NULL, + dt DATETIME NULL, + PRIMARY KEY (id) +) ENGINE=InnoDB; + +let $u0 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'Innodb_btr_index_lock_upgrades'`; +let $r0 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'Innodb_btr_need_opposite_intention_root'`; + +INSERT INTO t1 (id, dt) SELECT seq, NULL FROM seq_1_to_1000; + +--echo # Rows inserted with NULL DATETIME +SELECT COUNT(*) AS rows_inserted FROM t1; +SELECT COUNT(DISTINCT dt) AS distinct_timestamps FROM t1; + +let $u1 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'Innodb_btr_index_lock_upgrades'`; +let $r1 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'Innodb_btr_need_opposite_intention_root'`; + +--disable_query_log +eval SET @u01 = $u1 - $u0; +eval SET @r01 = $r1 - $r0; +--enable_query_log + +--echo # Index lock upgrades during inserts (no reduction w.r.t. root page checks) +SELECT @u01 AS index_lock_upgrades_insert; +SELECT @r01 AS need_opposite_intention_root_insert; + +--echo # Now update all rows (batch update) +--echo # This changes DATETIME from NULL to a distinct timestamp per row, +--echo # triggering the pessimistic path due to record size change. + +UPDATE t1 JOIN seq_1_to_1000 s ON t1.id = s.seq +SET t1.dt = '2025-01-01' + INTERVAL s.seq SECOND; + +--echo # Rows updated +SELECT COUNT(DISTINCT dt) AS distinct_timestamps FROM t1; + +let $u2 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'Innodb_btr_index_lock_upgrades'`; +let $r2 = `SELECT VARIABLE_VALUE FROM INFORMATION_SCHEMA.GLOBAL_STATUS + WHERE VARIABLE_NAME = 'Innodb_btr_need_opposite_intention_root'`; + +--disable_query_log +eval SET @u12 = $u2 - $u1; +eval SET @r12 = $r2 - $r1; +--enable_query_log + +--echo # Index lock upgrades during updates (with reduction w.r.t. 
root page checks) +SELECT @u12 AS index_lock_upgrades_update; +SELECT @r12 AS need_opposite_intention_root_update; + +DROP TABLE t1; diff --git a/mysql-test/suite/innodb/t/innodb_status_variables.test b/mysql-test/suite/innodb/t/innodb_status_variables.test index 6746a94530fcc..c4dfbb84ae073 100644 --- a/mysql-test/suite/innodb/t/innodb_status_variables.test +++ b/mysql-test/suite/innodb/t/innodb_status_variables.test @@ -4,4 +4,5 @@ WHERE variable_name LIKE 'INNODB_%' AND variable_name NOT IN ('INNODB_ADAPTIVE_HASH_HASH_SEARCHES','INNODB_ADAPTIVE_HASH_NON_HASH_SEARCHES', 'INNODB_MEM_ADAPTIVE_HASH', - 'INNODB_BUFFERED_AIO_SUBMITTED','INNODB_BUFFER_POOL_PAGES_LATCHED'); + 'INNODB_BUFFERED_AIO_SUBMITTED','INNODB_BUFFER_POOL_PAGES_LATCHED', + 'INNODB_BTR_INDEX_LOCK_UPGRADES','INNODB_BTR_NEED_OPPOSITE_INTENTION_ROOT'); diff --git a/storage/innobase/btr/btr0cur.cc b/storage/innobase/btr/btr0cur.cc index 40a7305062ad5..788a49796e7dc 100644 --- a/storage/innobase/btr/btr0cur.cc +++ b/storage/innobase/btr/btr0cur.cc @@ -105,6 +105,10 @@ ulint btr_cur_n_sea_old; #ifdef UNIV_DEBUG /* Flag to limit optimistic insert records */ uint btr_cur_limit_optimistic_insert_debug; +/** Number of times index lock was upgraded from SX to X */ +Atomic_counter btr_cur_n_index_lock_upgrades; +/** Number of times btr_cur_need_opposite_intention() was called on the root */ +Atomic_counter btr_cur_n_need_opposite_intention_root; #endif /* UNIV_DEBUG */ /** In the optimistic insert, if the insert does not fit, but this much space @@ -777,22 +781,43 @@ btr_cur_will_modify_tree( /** Detects whether the modifying record might need a opposite modification to the intention. 
@param bpage buffer pool page -@param is_clust whether this is a clustered index +@param index index tree @param lock_intention lock intention for the tree operation @param node_ptr_max_size the maximum size of a node pointer @param compress_limit BTR_CUR_PAGE_COMPRESS_LIMIT(index) @param rec record (current node_ptr) @return true if tree modification is needed */ static bool btr_cur_need_opposite_intention(const buf_page_t &bpage, - bool is_clust, + const dict_index_t &index, btr_intention_t lock_intention, ulint node_ptr_max_size, ulint compress_limit, const rec_t *rec) { ut_ad(bpage.frame == page_align(rec)); + /* Every check below detects whether a structural change at this + page (e.g. a page split inserting a node pointer into the parent, + a merge with a sibling removing one, or a boundary change updating + one) could cascade into its parent. Handling such a structural change + would also require latching sibling pages at this level, which cannot + be safely acquired under SX without risking deadlock with concurrent + readers; hence the caller must upgrade the index lock from SX to X. + + The root has no parent and no siblings, so none of those cascades + apply and no sibling latches are needed. + + A root page split (btr_root_raise_and_insert()) is a special case + that changes the tree height, but it never cascades upward into a + parent. See the comment and assertion in + btr_cur_search_to_nth_level() for why cascades cannot reach the + unlatched root under SX. 
*/ + if (bpage.id().page_no() == index.page) + { + ut_d(++btr_cur_n_need_opposite_intention_root); + return false; + } if (UNIV_LIKELY_NULL(bpage.zip.data) && - !page_zip_available(&bpage.zip, is_clust, node_ptr_max_size, 1)) + !page_zip_available(&bpage.zip, index.is_clust(), node_ptr_max_size, 1)) return true; const page_t *const page= bpage.frame; if (lock_intention != BTR_INTENTION_INSERT) @@ -1416,7 +1441,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, ut_ad(low_match != uint16_t(~0U) || mode != PAGE_CUR_LE); if (latch_mode == BTR_MODIFY_TREE && - btr_cur_need_opposite_intention(block->page, index()->is_clust(), + btr_cur_need_opposite_intention(block->page, *index(), lock_intention, node_ptr_max_size, compress_limit, page_cur.rec)) @@ -1449,7 +1474,7 @@ dberr_t btr_cur_t::search_leaf(const dtuple_t *tuple, page_cur_mode_t mode, default: break; case BTR_MODIFY_TREE: - if (btr_cur_need_opposite_intention(block->page, index()->is_clust(), + if (btr_cur_need_opposite_intention(block->page, *index(), lock_intention, node_ptr_max_size, compress_limit, page_cur.rec)) @@ -1599,6 +1624,7 @@ ATTRIBUTE_COLD void mtr_t::index_lock_upgrade() index_lock *lock= static_cast(slot.object); lock->u_x_upgrade(SRW_LOCK_CALL); slot.type= MTR_MEMO_X_LOCK; + ut_d(++btr_cur_n_index_lock_upgrades); } /** Mark a non-leaf page "least recently used", but avoid invoking @@ -1803,14 +1829,21 @@ dberr_t btr_cur_search_to_nth_level(ulint level, if (buf_block_t *b= mtr->get_already_latched(page_id, mtr_memo_type_t(rw_latch))) block= b; - else if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch, - block, BUF_GET, mtr, &err))) - { - btr_read_failed(err, *index); - goto func_exit; - } else { + /* Freshly acquiring the root page is only permitted under X-lock + or when no page latch is held (brand-new descent). Under SX with + existing page latches, the root must already be latched in the + mtr, retained from the original search_leaf() descent. 
*/ + ut_ad(height != ULINT_UNDEFINED || + mtr->memo_contains_flagged(&index->lock, MTR_MEMO_X_LOCK) || + (mtr->get_savepoint() == 1 && !mtr->block_at_savepoint(0))); + if (!(block= buf_page_get_gen(page_id, zip_size, rw_latch, + block, BUF_GET, mtr, &err))) + { + btr_read_failed(err, *index); + goto func_exit; + } btr_search_drop_page_hash_index(block, index); btr_cur_nonleaf_make_young(&block->page); } @@ -1992,7 +2025,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, break; if (!index->lock.have_x() && - btr_cur_need_opposite_intention(block->page, index->is_clust(), + btr_cur_need_opposite_intention(block->page, *index, lock_intention, node_ptr_max_size, compress_limit, page_cur.rec)) @@ -2040,7 +2073,7 @@ dberr_t btr_cur_t::open_leaf(bool first, dict_index_t *index, if (!height && first && first_access) buf_read_ahead_linear(page_id_t(block->page.id().space(), page)); } - else if (btr_cur_need_opposite_intention(block->page, index->is_clust(), + else if (btr_cur_need_opposite_intention(block->page, *index, lock_intention, node_ptr_max_size, compress_limit, page_cur.rec)) diff --git a/storage/innobase/handler/ha_innodb.cc b/storage/innobase/handler/ha_innodb.cc index 2e905e8500f8f..2d3d256910fdb 100644 --- a/storage/innobase/handler/ha_innodb.cc +++ b/storage/innobase/handler/ha_innodb.cc @@ -1071,6 +1071,13 @@ static SHOW_VAR innodb_status_variables[]= { /* InnoDB bulk operations */ {"bulk_operations", &export_vars.innodb_bulk_operations, SHOW_SIZE_T}, +#ifdef UNIV_DEBUG + {"btr_index_lock_upgrades", + &btr_cur_n_index_lock_upgrades, SHOW_SIZE_T}, + {"btr_need_opposite_intention_root", + &btr_cur_n_need_opposite_intention_root, SHOW_SIZE_T}, +#endif /* UNIV_DEBUG */ + {NullS, NullS, SHOW_LONG} }; diff --git a/storage/innobase/include/btr0cur.h b/storage/innobase/include/btr0cur.h index 53f88cc8ca1f5..94d1899400ada 100644 --- a/storage/innobase/include/btr0cur.h +++ b/storage/innobase/include/btr0cur.h @@ -833,6 +833,10 @@ extern ulint 
btr_cur_n_sea_old;
 #ifdef UNIV_DEBUG
 /* Flag to limit optimistic insert records */
 extern uint btr_cur_limit_optimistic_insert_debug;
+/** Number of times index lock was upgraded from SX to X */
+extern Atomic_counter<ulint> btr_cur_n_index_lock_upgrades;
+/** Number of times btr_cur_need_opposite_intention() was called on the root */
+extern Atomic_counter<ulint> btr_cur_n_need_opposite_intention_root;
 #endif /* UNIV_DEBUG */
 #include "btr0cur.inl"