summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorRobin H. Johnson <robbat2@gentoo.org>2010-08-09 00:21:00 +0000
committerRobin H. Johnson <robbat2@gentoo.org>2010-08-09 00:21:00 +0000
commit4b509e569a5d958c4a81e18dedd3df31a6092391 (patch)
tree18750b10edc3f28dc63d67ed9549ec1fb53bdd04 /percona
parentUpdated 07110 patch for mysql-5.1.49. (diff)
downloadmysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.tar.gz
mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.tar.bz2
mysql-extras-4b509e569a5d958c4a81e18dedd3df31a6092391.zip
Adding latest Percona patches.
Diffstat (limited to 'percona')
-rw-r--r--percona/5.0.91-b22-20100522/CHECKSUM.MD528
-rw-r--r--percona/5.0.91-b22-20100522/README-GENTOO8
-rw-r--r--percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch275
-rw-r--r--percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch633
-rw-r--r--percona/5.0.91-b22-20100522/innodb_extra_rseg.patch243
-rw-r--r--percona/5.0.91-b22-20100522/innodb_extra_status.patch747
-rw-r--r--percona/5.0.91-b22-20100522/innodb_fsync_source.patch594
-rw-r--r--percona/5.0.91-b22-20100522/innodb_io_patches.patch1379
-rw-r--r--percona/5.0.91-b22-20100522/innodb_io_pattern.patch693
-rw-r--r--percona/5.0.91-b22-20100522/innodb_io_tune.patch1823
-rw-r--r--percona/5.0.91-b22-20100522/innodb_locks_held.patch219
-rw-r--r--percona/5.0.91-b22-20100522/innodb_misc_patch.patch64
-rw-r--r--percona/5.0.91-b22-20100522/innodb_recovery_patches.patch217
-rw-r--r--percona/5.0.91-b22-20100522/innodb_rw_lock.patch2480
-rw-r--r--percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch1357
-rw-r--r--percona/5.0.91-b22-20100522/innodb_show_bp.patch453
-rw-r--r--percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch275
-rw-r--r--percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch264
-rw-r--r--percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch1914
-rw-r--r--percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch389
-rw-r--r--percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch265
-rw-r--r--percona/5.0.91-b22-20100522/microsec_process.patch282
-rw-r--r--percona/5.0.91-b22-20100522/microslow_innodb.patch2492
-rw-r--r--percona/5.0.91-b22-20100522/mirror_binlog.patch2694
-rw-r--r--percona/5.0.91-b22-20100522/mysql-test.patch140
-rw-r--r--percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch127
-rw-r--r--percona/5.0.91-b22-20100522/profiling_slow.patch271
-rw-r--r--percona/5.0.91-b22-20100522/series22
-rw-r--r--percona/5.0.91-b22-20100522/show_patches.patch288
-rw-r--r--percona/5.0.91-b22-20100522/userstatv2.patch4406
30 files changed, 25042 insertions, 0 deletions
diff --git a/percona/5.0.91-b22-20100522/CHECKSUM.MD5 b/percona/5.0.91-b22-20100522/CHECKSUM.MD5
new file mode 100644
index 0000000..87cc289
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/CHECKSUM.MD5
@@ -0,0 +1,28 @@
+f94d0861b72103d54a8ff3800847fe02 series
+7738fd1556b03bd512cfab64432b8d96 userstatv2.patch
+b40d08e599ffbb8b3d654190bd278f28 show_patches.patch
+c89ae66aad25d102b1b57ccbea0883dc profiling_slow.patch
+23b1125d15f1624bd920ab94333e4ec6 mysqld_safe_syslog.patch
+d53be78eae8f3680c4ed90ba1122cbb0 mysql-test.patch
+043ffa3cdc3d4f65ed0cc7626e667945 mirror_binlog.patch
+1d4b5e4be5d4ec9af9d991579bddfef7 microslow_innodb.patch
+ea02d7cf5de508217194d846b1877fa3 microsec_process.patch
+74c970feb2f4d7997ee9d10ee0151c8c innodb_use_sys_malloc.patch
+824e96231eae8adf47abf3cc2dc7f06b innodb_thread_concurrency_timer_based.patch
+10b228f7c1df9441bef5951c4fdcfb33 innodb_split_buf_pool_mutex.patch
+6e85bdacf5192de313f4959e1b77441d innodb_show_hashed_memory_standalone.patch
+15e9cf8e77330df9dc0f9fed0542cf93 innodb_show_hashed_memory.patch
+01397a91f4dede07f869c320342fdb3c innodb_show_bp.patch
+4050142f7c8cc1d5857f972c24e4390c innodb_rw_lock_old.patch
+75ca0cd1f878afe6360746ecba82726b innodb_rw_lock.patch
+7cef98cb62b4620de17955e0f371211b innodb_recovery_patches.patch
+f5e5492fa8e2608c29ef781a9448af3e innodb_misc_patch.patch
+5de06fbcbb7c2f8562d43670be84c4d7 innodb_locks_held.patch
+bf1e0ce08175b3aff68e36e468817cc3 innodb_io_tune.patch
+05f6558f5d85308a78e24661807aa95a innodb_io_pattern.patch
+0d868a2f57fa762bceb24749ca819190 innodb_io_patches.patch
+e6eb72d8c4bc5a922c390530858b69b8 innodb_fsync_source.patch
+e7ec26dfed29892247434ac51e432ce6 innodb_extra_status.patch
+640f4bf96bec774576648e019c595e4b innodb_extra_rseg.patch
+df9f80c668652720a7a89675a153a99a innodb_dict_size_limit.patch
+3ca5baf8836512e28e24e5fa3210d903 innodb_check_fragmentation.patch
diff --git a/percona/5.0.91-b22-20100522/README-GENTOO b/percona/5.0.91-b22-20100522/README-GENTOO
new file mode 100644
index 0000000..a4e2724
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/README-GENTOO
@@ -0,0 +1,8 @@
+The following patches, while distributed by Percona, are NOT applied in their
+specfile. As such, we do not apply them in Gentoo either:
+=========
+innodb_extra_status.patch
+innodb_io_tune.patch
+innodb_rw_lock_old.patch
+innodb_show_hashed_memory_standalone.patch
+mirror_binlog.patch
diff --git a/percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch b/percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch
new file mode 100644
index 0000000..4b16731
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_check_fragmentation.patch
@@ -0,0 +1,275 @@
+diff -r 936d427a9a15 innobase/btr/btr0cur.c
+--- a/innobase/btr/btr0cur.c Mon Dec 22 00:33:03 2008 -0800
++++ b/innobase/btr/btr0cur.c Mon Dec 22 00:33:11 2008 -0800
+@@ -516,6 +516,14 @@
+ == index->table->comp);
+ }
+
++ if (level == 0) {
++ /* Initializes status counters */
++ innobase_mysql_thd_init_innodb_scan_cont();
++ innobase_mysql_thd_init_innodb_scan_jump();
++ innobase_mysql_thd_init_innodb_scan_data();
++ innobase_mysql_thd_init_innodb_scan_garbage();
++ }
++
+ break;
+ }
+
+@@ -663,6 +671,12 @@
+ btr_cur_add_path_info(cursor, height,
+ root_height);
+ }
++
++ /* Initializes status counters */
++ innobase_mysql_thd_init_innodb_scan_cont();
++ innobase_mysql_thd_init_innodb_scan_jump();
++ innobase_mysql_thd_init_innodb_scan_data();
++ innobase_mysql_thd_init_innodb_scan_garbage();
+
+ break;
+ }
+diff -r 936d427a9a15 innobase/btr/btr0pcur.c
+--- a/innobase/btr/btr0pcur.c Mon Dec 22 00:33:03 2008 -0800
++++ b/innobase/btr/btr0pcur.c Mon Dec 22 00:33:11 2008 -0800
+@@ -381,6 +381,7 @@
+ last record of the current page */
+ mtr_t* mtr) /* in: mtr */
+ {
++ ulint page_no;
+ ulint next_page_no;
+ ulint space;
+ page_t* page;
+@@ -393,11 +394,22 @@
+ cursor->old_stored = BTR_PCUR_OLD_NOT_STORED;
+
+ page = btr_pcur_get_page(cursor);
++ page_no = buf_frame_get_page_no(page);
+
+ next_page_no = btr_page_get_next(page, mtr);
+ space = buf_frame_get_space_id(page);
+
+ ut_ad(next_page_no != FIL_NULL);
++
++ if (next_page_no - page_no == 1) {
++ innobase_mysql_thd_increment_innodb_scan_cont(1);
++ } else {
++ innobase_mysql_thd_increment_innodb_scan_jump(1);
++ }
++ innobase_mysql_thd_increment_innodb_scan_data(
++ page_get_data_size(page));
++ innobase_mysql_thd_increment_innodb_scan_garbage(
++ page_header_get_field(page, PAGE_GARBAGE));
+
+ next_page = btr_page_get(space, next_page_no, cursor->latch_mode, mtr);
+ ut_a(page_is_comp(next_page) == page_is_comp(page));
+@@ -427,6 +439,7 @@
+ record of the current page */
+ mtr_t* mtr) /* in: mtr */
+ {
++ ulint page_no;
+ ulint prev_page_no;
+ ulint space;
+ page_t* page;
+@@ -462,9 +475,20 @@
+ btr_pcur_restore_position(latch_mode2, cursor, mtr);
+
+ page = btr_pcur_get_page(cursor);
++ page_no = buf_frame_get_page_no(page);
+
+ prev_page_no = btr_page_get_prev(page, mtr);
+ space = buf_frame_get_space_id(page);
++
++ if (page_no - prev_page_no == 1) {
++ innobase_mysql_thd_increment_innodb_scan_cont(1);
++ } else {
++ innobase_mysql_thd_increment_innodb_scan_jump(1);
++ }
++ innobase_mysql_thd_increment_innodb_scan_data(
++ page_get_data_size(page));
++ innobase_mysql_thd_increment_innodb_scan_garbage(
++ page_header_get_field(page, PAGE_GARBAGE));
+
+ if (btr_pcur_is_before_first_on_page(cursor, mtr)
+ && (prev_page_no != FIL_NULL)) {
+diff -r 936d427a9a15 innobase/btr/btr0sea.c
+--- a/innobase/btr/btr0sea.c Mon Dec 22 00:33:03 2008 -0800
++++ b/innobase/btr/btr0sea.c Mon Dec 22 00:33:11 2008 -0800
+@@ -861,6 +861,12 @@
+
+ buf_pool->n_page_gets++;
+
++ /* Initializes status counters */
++ innobase_mysql_thd_init_innodb_scan_cont();
++ innobase_mysql_thd_init_innodb_scan_jump();
++ innobase_mysql_thd_init_innodb_scan_data();
++ innobase_mysql_thd_init_innodb_scan_garbage();
++
+ return(TRUE);
+
+ /*-------------------------------------------*/
+diff -r 936d427a9a15 innobase/include/btr0cur.h
+--- a/innobase/include/btr0cur.h Mon Dec 22 00:33:03 2008 -0800
++++ b/innobase/include/btr0cur.h Mon Dec 22 00:33:11 2008 -0800
+@@ -697,6 +697,17 @@
+ extern ulint btr_cur_n_non_sea_old;
+ extern ulint btr_cur_n_sea_old;
+
++/*--------------------------------------*/
++/* Prototypes for the Innodb_scan_* status counter helpers defined in sql/ha_innodb.cc: each init function resets one counter to 0, each increment function adds length to it. */
++void innobase_mysql_thd_init_innodb_scan_cont();
++void innobase_mysql_thd_increment_innodb_scan_cont(ulong length);
++void innobase_mysql_thd_init_innodb_scan_jump();
++void innobase_mysql_thd_increment_innodb_scan_jump(ulong length);
++void innobase_mysql_thd_init_innodb_scan_data();
++void innobase_mysql_thd_increment_innodb_scan_data(ulong length);
++void innobase_mysql_thd_init_innodb_scan_garbage();
++void innobase_mysql_thd_increment_innodb_scan_garbage(ulong length);
++
+ #ifndef UNIV_NONINL
+ #include "btr0cur.ic"
+ #endif
+diff -r 936d427a9a15 patch_info/innodb_check_fragmentation.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_check_fragmentation.info Mon Dec 22 00:33:11 2008 -0800
+@@ -0,0 +1,6 @@
++File=innodb_check_fragmentation.patch
++Name=Session status to check fragmentation of the last InnoDB scan
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=The names are Innodb_scan_*
+diff -r 936d427a9a15 sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Mon Dec 22 00:33:03 2008 -0800
++++ b/sql/ha_innodb.cc Mon Dec 22 00:33:11 2008 -0800
+@@ -760,6 +760,102 @@
+ }
+
+ /*************************************************************************
++Resets status_var.innodb_scan_cont (shown as Innodb_scan_pages_contiguous) to 0 for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_init_innodb_scan_cont()
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_cont = 0;
++ }
++}
++
++/*************************************************************************
++Adds length to status_var.innodb_scan_cont (shown as Innodb_scan_pages_contiguous) for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_increment_innodb_scan_cont(ulong length)
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_cont+= length;
++ }
++}
++
++/*************************************************************************
++Resets status_var.innodb_scan_jump (shown as Innodb_scan_pages_jumpy) to 0 for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_init_innodb_scan_jump()
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_jump = 0;
++ }
++}
++
++/*************************************************************************
++Adds length to status_var.innodb_scan_jump (shown as Innodb_scan_pages_jumpy) for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_increment_innodb_scan_jump(ulong length)
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_jump+= length;
++ }
++}
++
++/*************************************************************************
++Resets status_var.innodb_scan_data (shown as Innodb_scan_data_in_pages) to 0 for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_init_innodb_scan_data()
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_data = 0;
++ }
++}
++
++/*************************************************************************
++Adds length to status_var.innodb_scan_data (shown as Innodb_scan_data_in_pages) for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_increment_innodb_scan_data(ulong length)
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_data+= length;
++ }
++}
++
++/*************************************************************************
++Resets status_var.innodb_scan_garbage (shown as Innodb_scan_garbages_in_pages) to 0 for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_init_innodb_scan_garbage()
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_garbage = 0;
++ }
++}
++
++/*************************************************************************
++Adds length to status_var.innodb_scan_garbage (shown as Innodb_scan_garbages_in_pages) for current_thd; does nothing when current_thd is null. */
++extern "C"
++void
++innobase_mysql_thd_increment_innodb_scan_garbage(ulong length)
++{
++ THD *thd=current_thd;
++ if (likely(thd != 0)) {
++ thd->status_var.innodb_scan_garbage+= length;
++ }
++}
++
++/*************************************************************************
+ Gets the InnoDB transaction handle for a MySQL handler object, creates
+ an InnoDB transaction struct if the corresponding MySQL thread struct still
+ lacks one. */
+diff -r 936d427a9a15 sql/mysqld.cc
+--- a/sql/mysqld.cc Mon Dec 22 00:33:03 2008 -0800
++++ b/sql/mysqld.cc Mon Dec 22 00:33:11 2008 -0800
+@@ -6673,6 +6673,10 @@
+ {"Handler_write", (char*) offsetof(STATUS_VAR, ha_write_count), SHOW_LONG_STATUS},
+ #ifdef HAVE_INNOBASE_DB
+ {"Innodb_", (char*) &innodb_status_variables, SHOW_VARS},
++ {"Innodb_scan_pages_contiguous",(char*) offsetof(STATUS_VAR, innodb_scan_cont), SHOW_LONGLONG_STATUS},
++ {"Innodb_scan_pages_jumpy", (char*) offsetof(STATUS_VAR, innodb_scan_jump), SHOW_LONGLONG_STATUS},
++ {"Innodb_scan_data_in_pages",(char*) offsetof(STATUS_VAR, innodb_scan_data), SHOW_LONGLONG_STATUS},
++ {"Innodb_scan_garbages_in_pages",(char*) offsetof(STATUS_VAR, innodb_scan_garbage), SHOW_LONGLONG_STATUS},
+ #endif /*HAVE_INNOBASE_DB*/
+ {"Key_blocks_not_flushed", (char*) &dflt_key_cache_var.global_blocks_changed, SHOW_KEY_CACHE_LONG},
+ {"Key_blocks_unused", (char*) &dflt_key_cache_var.blocks_unused, SHOW_KEY_CACHE_CONST_LONG},
+diff -r 936d427a9a15 sql/sql_class.h
+--- a/sql/sql_class.h Mon Dec 22 00:33:03 2008 -0800
++++ b/sql/sql_class.h Mon Dec 22 00:33:11 2008 -0800
+@@ -729,6 +729,10 @@
+ sense to add to the /global/ status variable counter.
+ */
+ double last_query_cost;
++ ulonglong innodb_scan_cont;
++ ulonglong innodb_scan_jump;
++ ulonglong innodb_scan_data;
++ ulonglong innodb_scan_garbage;
+ } STATUS_VAR;
+
+ /*
diff --git a/percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch b/percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch
new file mode 100644
index 0000000..ced1aec
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_dict_size_limit.patch
@@ -0,0 +1,633 @@
+diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c
+--- a/innobase/btr/btr0sea.c 2009-08-27 18:42:17.000000000 +0900
++++ b/innobase/btr/btr0sea.c 2009-08-27 18:43:11.000000000 +0900
+@@ -1077,6 +1077,124 @@
+ }
+
+ /************************************************************************
++Drops the adaptive hash index entries of every block on the buffer pool LRU list that is hashed for the given index; holds btr_search_latch in x-mode and buf_pool->mutex for the whole scan. */
++
++void
++btr_search_drop_page_hash_index_on_index(
++/*=====================================*/
++ dict_index_t* index) /* in: index whose hashed pages are to be dropped */
++{
++ page_t* page;
++ hash_table_t* table;
++ buf_block_t* block;
++ ulint n_fields;
++ ulint n_bytes;
++ rec_t* rec;
++ ulint fold;
++ ulint prev_fold;
++ dulint tree_id;
++ ulint n_cached;
++ ulint n_recs;
++ ulint* folds;
++ ulint i;
++ mem_heap_t* heap = NULL;
++ ulint* offsets;
++
++ rw_lock_x_lock(&btr_search_latch);
++ mutex_enter(&buf_pool->mutex);
++
++ table = btr_search_sys->hash_index;
++
++ block = UT_LIST_GET_LAST(buf_pool->LRU);
++
++ while (block != NULL) {
++ if (block->index == index && block->is_hashed) {
++ page = block->frame;
++
++ /* from btr_search_drop_page_hash_index() */
++ n_fields = block->curr_n_fields;
++ n_bytes = block->curr_n_bytes;
++
++ ut_a(n_fields + n_bytes > 0);
++
++ n_recs = page_get_n_recs(page);
++
++ /* Calculate and cache fold values into an array for fast deletion
++ from the hash index */
++
++ folds = mem_alloc(n_recs * sizeof(ulint));
++
++ n_cached = 0;
++
++ rec = page_get_infimum_rec(page);
++ rec = page_rec_get_next(rec);
++
++ tree_id = btr_page_get_index_id(page);
++
++ ut_a(0 == ut_dulint_cmp(tree_id, index->id));
++
++ prev_fold = 0;
++
++ offsets = NULL;
++
++ while (!page_rec_is_supremum(rec)) {
++ /* FIXME: in a mixed tree, not all records may have enough
++ ordering fields: */
++ offsets = rec_get_offsets(rec, index, offsets,
++ n_fields + (n_bytes > 0), &heap);
++ ut_a(rec_offs_n_fields(offsets) == n_fields + (n_bytes > 0));
++ fold = rec_fold(rec, offsets, n_fields, n_bytes, tree_id);
++
++ if (fold == prev_fold && prev_fold != 0) {
++
++ goto next_rec;
++ }
++
++ /* Remember this fold value; the hash nodes pointing to this page
++ are removed from the hash chain in the loop after this scan */
++
++ folds[n_cached] = fold;
++ n_cached++;
++next_rec:
++ rec = page_rec_get_next(rec);
++ prev_fold = fold;
++ }
++
++ for (i = 0; i < n_cached; i++) {
++
++ ha_remove_all_nodes_to_page(table, folds[i], page);
++ }
++
++ ut_a(index->search_info->ref_count > 0);
++ index->search_info->ref_count--;
++
++ block->is_hashed = FALSE;
++ block->index = NULL;
++
++ if (UNIV_UNLIKELY(block->n_pointers)) {
++ /* Corruption: block->n_pointers is still nonzero after the removal above */
++ ut_print_timestamp(stderr);
++ fprintf(stderr,
++" InnoDB: Corruption of adaptive hash index. After dropping\n"
++"InnoDB: the hash index to a page of %s, still %lu hash nodes remain.\n",
++ index->name, (ulong) block->n_pointers);
++ }
++
++ mem_free(folds);
++ }
++
++ block = UT_LIST_GET_PREV(LRU, block);
++ }
++
++ mutex_exit(&buf_pool->mutex);
++ rw_lock_x_unlock(&btr_search_latch);
++
++ if (UNIV_LIKELY_NULL(heap)) {
++ mem_heap_free(heap);
++ }
++}
++
++/************************************************************************
+ Drops a page hash index when a page is freed from a fseg to the file system.
+ Drops possible hash index if the page happens to be in the buffer pool. */
+
+diff -ruN a/innobase/dict/dict0boot.c b/innobase/dict/dict0boot.c
+--- a/innobase/dict/dict0boot.c 2009-07-07 21:53:58.000000000 +0900
++++ b/innobase/dict/dict0boot.c 2009-08-27 18:42:59.000000000 +0900
+@@ -247,6 +247,7 @@
+ system tables */
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_TABLES", DICT_HDR_SPACE, 8, FALSE);
++ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, "NAME", DATA_BINARY, 0, 0, 0);
+ dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0);
+@@ -283,6 +284,7 @@
+ ut_a(success);
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_COLUMNS", DICT_HDR_SPACE, 7, FALSE);
++ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY,0,0,0);
+ dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0);
+@@ -309,6 +311,7 @@
+ ut_a(success);
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_INDEXES", DICT_HDR_SPACE, 7, FALSE);
++ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, "TABLE_ID", DATA_BINARY, 0,0,0);
+ dict_mem_table_add_col(table, "ID", DATA_BINARY, 0, 0, 0);
+@@ -345,6 +348,7 @@
+ ut_a(success);
+ /*-------------------------*/
+ table = dict_mem_table_create("SYS_FIELDS", DICT_HDR_SPACE, 3, FALSE);
++ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, "INDEX_ID", DATA_BINARY, 0,0,0);
+ dict_mem_table_add_col(table, "POS", DATA_INT, 0, 4, 0);
+diff -ruN a/innobase/dict/dict0crea.c b/innobase/dict/dict0crea.c
+--- a/innobase/dict/dict0crea.c 2009-07-07 21:53:58.000000000 +0900
++++ b/innobase/dict/dict0crea.c 2009-08-27 18:42:59.000000000 +0900
+@@ -1178,6 +1178,9 @@
+ /* Foreign constraint system tables have already been
+ created, and they are ok */
+
++ table1->n_mysql_handles_opened = 1; /* for pin */
++ table2->n_mysql_handles_opened = 1; /* for pin */
++
+ mutex_exit(&(dict_sys->mutex));
+
+ return(DB_SUCCESS);
+@@ -1267,6 +1270,11 @@
+
+ trx->op_info = "";
+
++ table1 = dict_table_get_low("SYS_FOREIGN");
++ table2 = dict_table_get_low("SYS_FOREIGN_COLS");
++ table1->n_mysql_handles_opened = 1; /* for pin */
++ table2->n_mysql_handles_opened = 1; /* for pin */
++
+ row_mysql_unlock_data_dictionary(trx);
+
+ trx_free_for_mysql(trx);
+diff -ruN a/innobase/dict/dict0dict.c b/innobase/dict/dict0dict.c
+--- a/innobase/dict/dict0dict.c 2009-07-07 21:53:58.000000000 +0900
++++ b/innobase/dict/dict0dict.c 2009-08-27 18:43:11.000000000 +0900
+@@ -638,6 +638,8 @@
+ mutex_enter(&(dict_sys->mutex));
+
+ table = dict_table_get_on_id_low(table_id, trx);
++
++ dict_table_LRU_trim(table);
+
+ mutex_exit(&(dict_sys->mutex));
+
+@@ -752,6 +754,8 @@
+
+ table = dict_table_get_low(table_name);
+
++ dict_table_LRU_trim(table);
++
+ mutex_exit(&(dict_sys->mutex));
+
+ if (table != NULL) {
+@@ -787,6 +791,8 @@
+ table->n_mysql_handles_opened++;
+ }
+
++ dict_table_LRU_trim(table);
++
+ mutex_exit(&(dict_sys->mutex));
+
+ if (table != NULL) {
+@@ -1267,20 +1273,64 @@
+ too much space. Currently not used! */
+
+ void
+-dict_table_LRU_trim(void)
+-/*=====================*/
++dict_table_LRU_trim(
++/*================*/
++ dict_table_t* self)
+ {
+ dict_table_t* table;
+ dict_table_t* prev_table;
++ dict_foreign_t* foreign;
++ ulint n_removed;
++ ulint n_have_parent;
++ ulint cached_foreign_tables;
+
+- ut_error;
++ //ut_error;
+
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
++retry:
++ n_removed = n_have_parent = 0;
+ table = UT_LIST_GET_LAST(dict_sys->table_LRU);
+
++ while ( srv_dict_size_limit && table
++ && ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)
++ + dict_sys->size) > srv_dict_size_limit ) {
++ prev_table = UT_LIST_GET_PREV(table_LRU, table);
++
++ if (table == self || table->n_mysql_handles_opened)
++ goto next_loop;
++
++ cached_foreign_tables = 0;
++ foreign = UT_LIST_GET_FIRST(table->foreign_list);
++ while (foreign != NULL) {
++ if (foreign->referenced_table)
++ cached_foreign_tables++;
++ foreign = UT_LIST_GET_NEXT(foreign_list, foreign);
++ }
++
++ /* TODO: use table->mem_fix also, if it becomes exact. */
++
++ if (cached_foreign_tables == 0) {
++ dict_table_remove_from_cache(table);
++ n_removed++;
++ } else {
++ n_have_parent++;
++ }
++next_loop:
++ table = prev_table;
++ }
++
++ if ( srv_dict_size_limit && n_have_parent && n_removed
++ && ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)
++ + dict_sys->size) > srv_dict_size_limit )
++ goto retry;
++/*
+ while (table && (dict_sys->size >
+ buf_pool_get_max_size() / DICT_POOL_PER_VARYING)) {
+
+@@ -1292,6 +1342,7 @@
+
+ table = prev_table;
+ }
++*/
+ }
+
+ /**************************************************************************
+@@ -1565,6 +1616,10 @@
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(dict_sys->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
++ /* remove all entries of the index from the adaptive hash index,
++ because removing them from the adaptive hash index needs the dict_index */
++ if (srv_use_adaptive_hash_indexes && srv_dict_size_limit)
++ btr_search_drop_page_hash_index_on_index(index);
+
+ /* We always create search info whether or not adaptive
+ hash index is enabled or not. */
+diff -ruN a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
+--- a/innobase/ibuf/ibuf0ibuf.c 2009-08-27 18:42:17.000000000 +0900
++++ b/innobase/ibuf/ibuf0ibuf.c 2009-08-27 18:42:59.000000000 +0900
+@@ -535,6 +535,7 @@
+ sprintf(buf, "SYS_IBUF_TABLE_%lu", (ulong) space);
+ /* use old-style record format for the insert buffer */
+ table = dict_mem_table_create(buf, space, 2, FALSE);
++ table->n_mysql_handles_opened = 1; /* for pin */
+
+ dict_mem_table_add_col(table, "PAGE_NO", DATA_BINARY, 0, 0, 0);
+ dict_mem_table_add_col(table, "TYPES", DATA_BINARY, 0, 0, 0);
+diff -ruN a/innobase/include/btr0sea.h b/innobase/include/btr0sea.h
+--- a/innobase/include/btr0sea.h 2009-07-07 21:54:00.000000000 +0900
++++ b/innobase/include/btr0sea.h 2009-08-27 18:43:11.000000000 +0900
+@@ -97,6 +97,13 @@
+ /*============================*/
+ page_t* page); /* in: index page, s- or x-latched */
+ /************************************************************************
++Drops a page hash index based on index */
++
++void
++btr_search_drop_page_hash_index_on_index(
++/*=====================================*/
++ dict_index_t* index); /* in: record descriptor */
++/************************************************************************
+ Drops a page hash index when a page is freed from a fseg to the file system.
+ Drops possible hash index if the page happens to be in the buffer pool. */
+
+diff -ruN a/innobase/include/dict0dict.h b/innobase/include/dict0dict.h
+--- a/innobase/include/dict0dict.h 2009-07-07 21:54:01.000000000 +0900
++++ b/innobase/include/dict0dict.h 2009-08-27 18:42:59.000000000 +0900
+@@ -938,6 +938,11 @@
+ const char* ptr, /* in: scan from */
+ const char* string);/* in: look for this */
+
++void
++dict_table_LRU_trim(
++/*================*/
++ dict_table_t* self);
++
+ /* Buffers for storing detailed information about the latest foreign key
+ and unique key errors */
+ extern FILE* dict_foreign_err_file;
+diff -ruN a/innobase/include/dict0dict.ic b/innobase/include/dict0dict.ic
+--- a/innobase/include/dict0dict.ic 2009-07-07 21:54:01.000000000 +0900
++++ b/innobase/include/dict0dict.ic 2009-08-27 18:42:59.000000000 +0900
+@@ -533,6 +533,13 @@
+
+ HASH_SEARCH(name_hash, dict_sys->table_hash, table_fold, table,
+ ut_strcmp(table->name, table_name) == 0);
++
++ /* make young in table_LRU */
++ if (table) {
++ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
++ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
++ }
++
+ return(table);
+ }
+
+@@ -592,6 +599,10 @@
+ if (table != NULL) {
+ table->mem_fix++;
+
++ /* make young in table_LRU */
++ UT_LIST_REMOVE(table_LRU, dict_sys->table_LRU, table);
++ UT_LIST_ADD_FIRST(table_LRU, dict_sys->table_LRU, table);
++
+ /* lock_push(trx, table, LOCK_DICT_MEM_FIX) */
+ }
+
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h 2009-08-27 18:42:17.000000000 +0900
++++ b/innobase/include/srv0srv.h 2009-08-27 18:42:59.000000000 +0900
+@@ -147,6 +147,8 @@
+ extern uint srv_read_ahead;
+ extern uint srv_adaptive_checkpoint;
+
++extern ulint srv_dict_size_limit;
++
+ extern volatile ibool srv_io_pattern;
+ extern ulong srv_io_pattern_trace;
+ extern ulong srv_io_pattern_trace_running;
+@@ -552,6 +554,7 @@
+ ulint innodb_data_writes;
+ ulint innodb_data_written;
+ ulint innodb_data_reads;
++ ulint innodb_dict_tables;
+ ulint innodb_buffer_pool_pages_total;
+ ulint innodb_buffer_pool_pages_data;
+ ulint innodb_buffer_pool_pages_dirty;
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-08-27 18:42:17.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-08-27 18:42:59.000000000 +0900
+@@ -353,6 +353,8 @@
+ uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
+ uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
+
++ulint srv_dict_size_limit = 0;
++
+ volatile ibool srv_io_pattern = FALSE;
+ ulint srv_io_pattern_trace = 0;
+ ulint srv_io_pattern_trace_running = 0;
+@@ -1953,6 +1955,7 @@
+ export_vars.innodb_data_reads= os_n_file_reads;
+ export_vars.innodb_data_writes= os_n_file_writes;
+ export_vars.innodb_data_written= srv_data_written;
++ export_vars.innodb_dict_tables= (dict_sys ? UT_LIST_GET_LEN(dict_sys->table_LRU) : 0);
+ export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets;
+ export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests;
+ export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free;
+diff -ruN a/mysql-test/r/innodb_dict_size_limit.result b/mysql-test/r/innodb_dict_size_limit.result
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/mysql-test/r/innodb_dict_size_limit.result 2009-08-27 18:42:59.000000000 +0900
+@@ -0,0 +1,60 @@
++DROP TABLE IF EXISTS `test_5`;
++DROP TABLE IF EXISTS `test_4`;
++DROP TABLE IF EXISTS `test_3`;
++DROP TABLE IF EXISTS `test_2`;
++DROP TABLE IF EXISTS `test_1`;
++SET storage_engine=InnoDB;
++SET GLOBAL innodb_dict_size_limit=1;
++FLUSH TABLES;
++CREATE TABLE `test_1` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_2` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_3` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_4` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_5` (`a` int, `b` int, PRIMARY KEY (`a`));
++ALTER TABLE `test_5` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_4`(`a`);
++ALTER TABLE `test_4` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_3`(`a`);
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 9
++FLUSH TABLES;
++SELECT * FROM `test_1`;
++a b
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 8
++SELECT * FROM `test_3`;
++a b
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 11
++FLUSH TABLES;
++SELECT * FROM `test_2`;
++a b
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 8
++SELECT * FROM `test_1`;
++a b
++FLUSH TABLES;
++SELECT * FROM `test_4`;
++a b
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 9
++SELECT * FROM `test_3`;
++a b
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 10
++SET GLOBAL innodb_dict_size_limit=0;
++FLUSH TABLES;
++SELECT * FROM `test_2`;
++a b
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++Variable_name Value
++Innodb_dict_tables 11
++DROP TABLE `test_5`;
++DROP TABLE `test_4`;
++DROP TABLE `test_3`;
++DROP TABLE `test_2`;
++DROP TABLE `test_1`;
+diff -ruN a/mysql-test/t/innodb_dict_size_limit.test b/mysql-test/t/innodb_dict_size_limit.test
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/mysql-test/t/innodb_dict_size_limit.test 2009-08-27 18:42:59.000000000 +0900
+@@ -0,0 +1,63 @@
++#
++# Test for new variable innodb_dict_size_limit;
++#
++-- source include/have_innodb.inc
++
++--disable_warnings
++DROP TABLE IF EXISTS `test_5`;
++DROP TABLE IF EXISTS `test_4`;
++DROP TABLE IF EXISTS `test_3`;
++DROP TABLE IF EXISTS `test_2`;
++DROP TABLE IF EXISTS `test_1`;
++--enable_warnings
++
++SET storage_engine=InnoDB;
++SET GLOBAL innodb_dict_size_limit=1;
++
++FLUSH TABLES;
++
++CREATE TABLE `test_1` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_2` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_3` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_4` (`a` int, `b` int, PRIMARY KEY (`a`));
++CREATE TABLE `test_5` (`a` int, `b` int, PRIMARY KEY (`a`));
++
++ALTER TABLE `test_5` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_4`(`a`);
++ALTER TABLE `test_4` ADD CONSTRAINT FOREIGN KEY(`b`) REFERENCES `test_3`(`a`);
++
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++FLUSH TABLES;
++SELECT * FROM `test_1`;
++
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++SELECT * FROM `test_3`;
++
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++FLUSH TABLES;
++SELECT * FROM `test_2`;
++
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++SELECT * FROM `test_1`;
++FLUSH TABLES;
++SELECT * FROM `test_4`;
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++SELECT * FROM `test_3`;
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++SET GLOBAL innodb_dict_size_limit=0;
++FLUSH TABLES;
++SELECT * FROM `test_2`;
++
++SHOW GLOBAL STATUS LIKE 'Innodb_dict_tables';
++
++DROP TABLE `test_5`;
++DROP TABLE `test_4`;
++DROP TABLE `test_3`;
++DROP TABLE `test_2`;
++DROP TABLE `test_1`;
++
+diff -ruN a/patch_info/innodb_dict_size_limit.info b/patch_info/innodb_dict_size_limit.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_dict_size_limit.info 2009-08-27 18:42:59.000000000 +0900
+@@ -0,0 +1,9 @@
++File=innodb_dict_size_limit.patch
++Name=Limit dictionary cache size
++Version=1.0
++Author=Percona
++License=GPL
++Comment=Variable innodb_dict_size_limit in bytes
++ChangeLog=
++2009-01-26
++YK: Initial release
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-08-27 18:42:17.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-08-27 18:42:59.000000000 +0900
+@@ -288,6 +288,8 @@
+ (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG},
+ {"dblwr_writes",
+ (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG},
++ {"dict_tables",
++ (char*) &export_vars.innodb_dict_tables, SHOW_LONG},
+ {"log_waits",
+ (char*) &export_vars.innodb_log_waits, SHOW_LONG},
+ {"log_write_requests",
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h 2009-08-27 18:42:17.000000000 +0900
++++ b/sql/ha_innodb.h 2009-08-27 18:42:59.000000000 +0900
+@@ -243,6 +243,7 @@
+ extern ulong srv_enable_unsafe_group_commit;
+ extern uint srv_read_ahead;
+ extern uint srv_adaptive_checkpoint;
++extern ulong srv_dict_size_limit;
+ extern ulong srv_show_locks_held;
+ extern ulong srv_show_verbose_locks;
+ extern ulong srv_io_pattern_trace;
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc 2009-08-27 18:42:17.000000000 +0900
++++ b/sql/mysqld.cc 2009-08-27 18:42:59.000000000 +0900
+@@ -5101,6 +5101,7 @@
+ OPT_INNODB_ADAPTIVE_CHECKPOINT,
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
++ OPT_INNODB_DICT_SIZE_LIMIT,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
+ OPT_FEDERATED,
+ OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM
+@@ -5464,6 +5465,10 @@
+ "Number of background write I/O threads in InnoDB.",
+ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_dict_size_limit", OPT_INNODB_DICT_SIZE_LIMIT,
++ "Limit the allocated memory for dictionary cache. (0: unlimited)",
++ (gptr*) &srv_dict_size_limit, (gptr*) &srv_dict_size_limit, 0,
++ GET_ULONG, REQUIRED_ARG, 0, 0, ULONG_MAX, 0, 0 ,0},
+ {"innodb_io_pattern_trace", OPT_INNODB_IO_PATTERN_TRACE,
+ "Create/Drop the internal hash table for IO pattern tracing.",
+ (gptr*) &srv_io_pattern_trace, (gptr*) &srv_io_pattern_trace,
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc 2009-08-27 18:42:17.000000000 +0900
++++ b/sql/set_var.cc 2009-08-27 18:42:59.000000000 +0900
+@@ -540,6 +540,8 @@
+ sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint",
+ &srv_adaptive_checkpoint,
+ &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint);
++sys_var_long_ptr sys_innodb_dict_size_limit("innodb_dict_size_limit",
++ &srv_dict_size_limit);
+ sys_var_long_ptr sys_innodb_show_locks_held(
+ "innodb_show_locks_held",
+ &srv_show_locks_held);
+@@ -930,6 +932,7 @@
+ &sys_innodb_read_ahead,
+ &sys_innodb_enable_unsafe_group_commit,
+ &sys_innodb_adaptive_checkpoint,
++ &sys_innodb_dict_size_limit,
+ &sys_innodb_show_locks_held,
+ &sys_innodb_show_verbose_locks,
+ &sys_innodb_io_pattern_trace,
+@@ -1084,6 +1087,7 @@
+ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
++ {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS},
+ {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS},
+ {sys_innodb_io_pattern_trace_running.name, (char*) &sys_innodb_io_pattern_trace_running, SHOW_SYS},
+ {sys_innodb_io_pattern_size_limit.name, (char*) &sys_innodb_io_pattern_size_limit, SHOW_SYS},
diff --git a/percona/5.0.91-b22-20100522/innodb_extra_rseg.patch b/percona/5.0.91-b22-20100522/innodb_extra_rseg.patch
new file mode 100644
index 0000000..cab3b26
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_extra_rseg.patch
@@ -0,0 +1,243 @@
+diff -r 85e7025cf2d1 innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Fri Jul 03 15:41:41 2009 -0700
++++ b/innobase/include/srv0srv.h Fri Jul 03 15:41:47 2009 -0700
+@@ -146,6 +146,8 @@
+ extern ulint srv_enable_unsafe_group_commit;
+ extern uint srv_read_ahead;
+ extern uint srv_adaptive_checkpoint;
++
++extern ulint srv_extra_rsegments;
+
+ extern ulint srv_dict_size_limit;
+
+diff -r 85e7025cf2d1 innobase/include/trx0sys.h
+--- a/innobase/include/trx0sys.h Fri Jul 03 15:41:41 2009 -0700
++++ b/innobase/include/trx0sys.h Fri Jul 03 15:41:47 2009 -0700
+@@ -105,6 +105,13 @@
+ void
+ trx_sys_create(void);
+ /*================*/
++/*********************************************************************
++Create extra rollback segments when create_new_db */
++
++void
++trx_sys_create_extra_rseg(
++/*======================*/
++ ulint num); /* in: number of extra user rollback segments */
+ /********************************************************************
+ Looks for a free slot for a rollback segment in the trx system file copy. */
+
+diff -r 85e7025cf2d1 innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Fri Jul 03 15:41:41 2009 -0700
++++ b/innobase/srv/srv0srv.c Fri Jul 03 15:41:47 2009 -0700
+@@ -352,6 +352,8 @@
+
+ uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
+ uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
++
++ulint srv_extra_rsegments = 0; /* extra rseg for users */
+
+ ulint srv_dict_size_limit = 0;
+
+diff -r 85e7025cf2d1 innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c Fri Jul 03 15:41:41 2009 -0700
++++ b/innobase/srv/srv0start.c Fri Jul 03 15:41:47 2009 -0700
+@@ -1418,6 +1418,8 @@
+ dict_create();
+ srv_startup_is_before_trx_rollback_phase = FALSE;
+
++ if (srv_extra_rsegments)
++ trx_sys_create_extra_rseg(srv_extra_rsegments);
+ #ifdef UNIV_LOG_ARCHIVE
+ } else if (srv_archive_recovery) {
+ fprintf(stderr,
+diff -r 85e7025cf2d1 innobase/trx/trx0sys.c
+--- a/innobase/trx/trx0sys.c Fri Jul 03 15:41:41 2009 -0700
++++ b/innobase/trx/trx0sys.c Fri Jul 03 15:41:47 2009 -0700
+@@ -944,3 +944,28 @@
+
+ trx_sys_init_at_db_start();
+ }
++
++/*********************************************************************
++Creates extra user rollback segments in TRX_SYS_SPACE when a new database is created. */
++
++void
++trx_sys_create_extra_rseg(
++/*======================*/
++ ulint num) /* in: number of extra user rollback segments */
++{
++ mtr_t mtr;
++ ulint slot_no;
++ ulint i;
++
++ /* Create the extra rollback segments between one mtr_start/mtr_commit pair; stop at the first failure */
++ mtr_start(&mtr);
++ for (i = 1; i < num + 1; i++) {
++ if(!trx_rseg_create(TRX_SYS_SPACE, ULINT_MAX, &slot_no, &mtr)) {
++ fprintf(stderr,
++"InnoDB: Warning: Failed to create extra rollback segments.\n");
++ break;
++ }
++ ut_a(slot_no == i); /* slots are expected to be handed out sequentially, starting at 1 */
++ }
++ mtr_commit(&mtr);
++}
+diff -r 85e7025cf2d1 patch_info/innodb_extra_rseg.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_extra_rseg.info Fri Jul 03 15:41:47 2009 -0700
+@@ -0,0 +1,6 @@
++File=innodb_extra_rseg.patch
++Name=allow to create extra rollback segments
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
+diff -r 85e7025cf2d1 sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Fri Jul 03 15:41:41 2009 -0700
++++ b/sql/ha_innodb.cc Fri Jul 03 15:41:47 2009 -0700
+@@ -152,6 +152,7 @@
+ innobase_open_files;
+
+ long innobase_read_io_threads, innobase_write_io_threads;
++long innobase_extra_rsegments;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+
+ /* The default values for the following char* start-up parameters
+@@ -1521,6 +1522,8 @@
+ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
+ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
++ srv_extra_rsegments = (ulint) innobase_extra_rsegments;
++
+ srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ srv_force_recovery = (ulint) innobase_force_recovery;
+
+diff -r 85e7025cf2d1 sql/ha_innodb.h
+--- a/sql/ha_innodb.h Fri Jul 03 15:41:41 2009 -0700
++++ b/sql/ha_innodb.h Fri Jul 03 15:41:47 2009 -0700
+@@ -205,6 +205,7 @@
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
+ extern long innobase_read_io_threads, innobase_write_io_threads;
++extern long innobase_extra_rsegments;
+ extern long innobase_force_recovery;
+ extern long innobase_open_files;
+ extern char *innobase_data_home_dir, *innobase_data_file_path;
+diff -r 85e7025cf2d1 sql/mysqld.cc
+--- a/sql/mysqld.cc Fri Jul 03 15:41:41 2009 -0700
++++ b/sql/mysqld.cc Fri Jul 03 15:41:47 2009 -0700
+@@ -5101,6 +5101,7 @@
+ OPT_INNODB_ADAPTIVE_CHECKPOINT,
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
++ OPT_INNODB_EXTRA_RSEGMENTS,
+ OPT_INNODB_DICT_SIZE_LIMIT,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
+ OPT_FEDERATED,
+@@ -5465,6 +5466,10 @@
+ "Number of background write I/O threads in InnoDB.",
+ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_extra_rsegments", OPT_INNODB_EXTRA_RSEGMENTS,
++ "Number of extra user rollback segments when create new database.",
++ (gptr*) &innobase_extra_rsegments, (gptr*) &innobase_extra_rsegments,
++ 0, GET_LONG, REQUIRED_ARG, 0, 0, 127, 0, 0, 0},
+ {"innodb_dict_size_limit", OPT_INNODB_DICT_SIZE_LIMIT,
+ "Limit the allocated memory for dictionary cache. (0: unlimited)",
+ (gptr*) &srv_dict_size_limit, (gptr*) &srv_dict_size_limit, 0,
+diff -r 85e7025cf2d1 sql/set_var.cc
+--- a/sql/set_var.cc Fri Jul 03 15:41:41 2009 -0700
++++ b/sql/set_var.cc Fri Jul 03 15:41:47 2009 -0700
+@@ -1087,6 +1087,7 @@
+ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
++ {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG},
+ {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS},
+ {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS},
+ {sys_innodb_io_pattern_trace_running.name, (char*) &sys_innodb_io_pattern_trace_running, SHOW_SYS},
+diff -r 85e7025cf2d1 sql/sql_show.cc
+--- a/sql/sql_show.cc Fri Jul 03 15:41:41 2009 -0700
++++ b/sql/sql_show.cc Fri Jul 03 15:41:47 2009 -0700
+@@ -39,6 +39,8 @@
+ #include "srv0srv.h"
+ #include "buf0buf.h"
+ #include "dict0dict.h"
++#include "trx0rseg.h" /* for trx_rseg_struct */
++#include "trx0sys.h" /* for trx_sys */
+ }
+ /* We need to undef it in InnoDB */
+ #undef byte
+@@ -4180,6 +4182,45 @@
+ DBUG_RETURN(returnable);
+ }
+
++int /* out: 0 on success or when access is denied, 1 if a row could not be stored */
++innodb_rseg_fill(
++/*=================*/
++ THD* thd, /* in: thread */
++ TABLE_LIST* tables, /* in/out: tables to fill */
++ COND* cond) /* in: condition (ignored) */
++{
++ TABLE* table = (TABLE *) tables->table;
++ int status = 0;
++ trx_rseg_t* rseg;
++
++ DBUG_ENTER("innodb_rseg_fill");
++
++ /* deny access to users without the PROCESS privilege; no rows are stored for them */
++ if (check_global_access(thd, PROCESS_ACL)) {
++
++ DBUG_RETURN(0);
++ }
++
++ rseg = UT_LIST_GET_FIRST(trx_sys->rseg_list);
++
++ while (rseg) { /* store one row per rollback segment in trx_sys->rseg_list */
++ table->field[0]->store(rseg->id);
++ table->field[1]->store(rseg->space);
++ table->field[2]->store(rseg->page_no);
++ table->field[3]->store(rseg->max_size);
++ table->field[4]->store(rseg->curr_size);
++
++ if (schema_table_store_record(thd, table)) {
++ status = 1; /* report the failure and stop walking the list */
++ break;
++ }
++
++ rseg = UT_LIST_GET_NEXT(rseg_list, rseg);
++ }
++
++ DBUG_RETURN(status);
++}
++
+ /*
+ Find schema_tables elment by name
+
+@@ -4996,6 +5037,16 @@
+ {"INDEX_NAME", 32, MYSQL_TYPE_STRING, 0, 0, "index name"},
+ {"N_READ", 11, MYSQL_TYPE_LONG, 0, 0, "read ios"},
+ {"N_WRITE", 11, MYSQL_TYPE_LONG, 0, 0, "write ios"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
++};
++
++ST_FIELD_INFO innodb_rseg_fields_info[]=
++{
++ {"RSEG_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""},
++ {"SPACE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""},
++ {"PAGE_NO", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""},
++ {"MAX_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""},
++ {"CURR_SIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, ""},
+ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
+ };
+ #endif
+@@ -5177,6 +5228,8 @@
+ #ifdef HAVE_INNOBASE_DB
+ {"INNODB_IO_PATTERN", innodb_io_pattern_field_info, create_schema_table,
+ innodb_io_pattern_fill_table, 0, 0, -1, -1, 0},
++ {"INNODB_RSEG", innodb_rseg_fields_info, create_schema_table,
++ innodb_rseg_fill, 0, 0, -1, -1, 0},
+ #endif
+ {0, 0, 0, 0, 0, 0, 0, 0, 0}
+ };
diff --git a/percona/5.0.91-b22-20100522/innodb_extra_status.patch b/percona/5.0.91-b22-20100522/innodb_extra_status.patch
new file mode 100644
index 0000000..adc1642
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_extra_status.patch
@@ -0,0 +1,747 @@
+diff -r b059d02ec814 innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/buf/buf0buf.c Mon Nov 03 05:09:34 2008 -0800
+@@ -2353,6 +2353,7 @@
+ "AWE: Database pages and free buffers mapped in frames %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
++ if (file) {
+ fprintf(file,
+ "Buffer pool size %lu\n"
+ "Free buffers %lu\n"
+@@ -2371,11 +2372,13 @@
+ + buf_pool->init_flush[BUF_FLUSH_LIST],
+ (ulong) buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE]);
+
++ } // if (file)
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time,
+ buf_pool->last_printout_time);
+ buf_pool->last_printout_time = current_time;
+
++ if (file) {
+ fprintf(file,
+ "Pages read %lu, created %lu, written %lu\n"
+ "%.2f reads/s, %.2f creates/s, %.2f writes/s\n",
+@@ -2405,6 +2408,7 @@
+ } else {
+ fputs("No buffer pool page gets since the last printout\n",
+ file);
++ }
+ }
+
+ buf_pool->n_page_gets_old = buf_pool->n_page_gets;
+diff -r b059d02ec814 innobase/ibuf/ibuf0ibuf.c
+--- a/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/ibuf/ibuf0ibuf.c Mon Nov 03 05:09:34 2008 -0800
+@@ -3519,9 +3519,15 @@
+
+ mutex_enter(&ibuf_mutex);
+
++ inno_ibuf_size = 0;
++ inno_ibuf_inserts = 0;
++ inno_ibuf_merged_recs = 0;
++ inno_ibuf_merges = 0;
++
+ data = UT_LIST_GET_FIRST(ibuf->data_list);
+
+ while (data) {
++ if (file) {
+ fprintf(file,
+ "Ibuf: size %lu, free list len %lu, seg size %lu,\n"
+ "%lu inserts, %lu merged recs, %lu merges\n",
+@@ -3542,6 +3548,12 @@
+ }
+ }
+ #endif
++ } // if (file)
++ inno_ibuf_size += (ulong) data->size;
++ inno_ibuf_inserts += (ulong) data->n_inserts;
++ inno_ibuf_merged_recs += (ulong) data->n_merged_recs;
++ inno_ibuf_merges += (ulong) data->n_merges;
++
+ data = UT_LIST_GET_NEXT(data_list, data);
+ }
+
+diff -r b059d02ec814 innobase/include/lock0lock.h
+--- a/innobase/include/lock0lock.h Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/include/lock0lock.h Mon Nov 03 05:09:34 2008 -0800
+@@ -24,6 +24,10 @@
+ #endif /* UNIV_DEBUG */
+ /* Buffer for storing information about the most recent deadlock error */
+ extern FILE* lock_latest_err_file;
++
++/* number of deadlocks happened so far */
++extern ulint innodb_deadlocks;
++
+
+ /*************************************************************************
+ Gets the size of a lock struct. */
+diff -r b059d02ec814 innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/include/srv0srv.h Mon Nov 03 05:09:34 2008 -0800
+@@ -261,6 +261,12 @@
+ /* variable to count the number of random read-aheads were done */
+ extern ulint srv_read_ahead_rnd;
+
++/* variable to identify if there is currently a long semaphore wait */
++extern ibool srv_long_lock_wait;
++
++/* variable to count the number of long semaphore waits noticed */
++extern ulint srv_long_lock_waits;
++
+ /* Number of IO operations read/write done for all threads */
+ extern ulint os_aio_read_requests;
+ extern ulint os_aio_write_requests;
+@@ -278,6 +284,26 @@
+ extern ulint inno_pending_ibuf_aio_reads;
+ extern ulint inno_pending_log_ios;
+ extern ulint inno_pending_sync_ios;
++
++/* all 24 innodb status variables, exported to status */
++extern ulint inno_transaction_count;
++extern ulint inno_transaction_purge_count;
++extern ulint inno_transaction_purge_lag;
++extern ulint inno_num_active_transactions;
++extern ulint inno_summed_transaction_age;
++extern ulint inno_longest_transaction_age;
++extern ulint inno_lock_wait_timeouts;
++extern ulint inno_num_lock_waiters;
++extern ulint inno_summed_lock_wait_time;
++extern ulint inno_longest_lock_wait;
++extern ulint inno_os_reads;
++extern ulint inno_os_writes;
++extern ulint inno_os_fsyncs;
++extern ulint inno_ibuf_size;
++extern ulint inno_ibuf_inserts;
++extern ulint inno_ibuf_merged_recs;
++extern ulint inno_ibuf_merges;
++extern ulint inno_log_ios_done;
+
+ /* In this structure we store status variables to be passed to MySQL */
+ typedef struct export_var_struct export_struc;
+@@ -552,6 +578,7 @@
+ ulint innodb_data_writes;
+ ulint innodb_data_written;
+ ulint innodb_data_reads;
++ ulint innodb_dict_size;
+ ulint innodb_buffer_pool_pages_total;
+ ulint innodb_buffer_pool_pages_data;
+ ulint innodb_buffer_pool_pages_dirty;
+@@ -587,6 +614,43 @@
+ ulint innodb_rows_inserted;
+ ulint innodb_rows_updated;
+ ulint innodb_rows_deleted;
++ ibool innodb_long_lock_wait;
++ ulint innodb_long_lock_waits;
++
++ ulint innodb_os_aio_read_requests;
++ ulint innodb_os_aio_write_requests;
++ ulint innodb_os_aio_pages_read;
++ ulint innodb_os_aio_pages_written;
++ ib_longlong innodb_os_aio_read_time;
++ ib_longlong innodb_os_aio_write_time;
++ ib_longlong innodb_os_aio_read_time_avg;
++ ib_longlong innodb_os_aio_write_time_avg;
++ ulint innodb_deadlocks;
++
++ // the following 24 variables are exported to "show status"
++ ulint inno_transaction_count;
++ ulint inno_transaction_purge_count;
++ ulint inno_transaction_purge_lag;
++ ulint inno_num_active_transactions;
++ ulint inno_summed_transaction_age;
++ ulint inno_longest_transaction_age;
++ ulint inno_lock_wait_timeouts;
++ ulint inno_num_lock_waiters;
++ ulint inno_summed_lock_wait_time;
++ ulint inno_longest_lock_wait;
++ ulint inno_pending_normal_aio_reads;
++ ulint inno_pending_normal_aio_writes;
++ ulint inno_pending_ibuf_aio_reads;
++ ulint inno_pending_log_ios;
++ ulint inno_pending_sync_ios;
++ ulint inno_os_reads;
++ ulint inno_os_writes;
++ ulint inno_os_fsyncs;
++ ulint inno_ibuf_size;
++ ulint inno_ibuf_inserts;
++ ulint inno_ibuf_merged_recs;
++ ulint inno_ibuf_merges;
++ ulint inno_log_ios_done;
+ };
+
+ /* The server system struct */
+diff -r b059d02ec814 innobase/lock/lock0lock.c
+--- a/innobase/lock/lock0lock.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/lock/lock0lock.c Mon Nov 03 05:09:34 2008 -0800
+@@ -360,6 +360,9 @@
+ ibool lock_deadlock_found = FALSE;
+ FILE* lock_latest_err_file;
+
++/* number of deadlocks happened so far */
++ulint innodb_deadlocks = 0;
++
+ /* Flags for recursive deadlock search */
+ #define LOCK_VICTIM_IS_START 1
+ #define LOCK_VICTIM_IS_OTHER 2
+@@ -3304,6 +3307,7 @@
+
+ FILE* ef = lock_latest_err_file;
+
++ innodb_deadlocks++;
+ rewind(ef);
+ ut_print_timestamp(ef);
+
+@@ -4238,6 +4242,7 @@
+ innobase_mysql_prepare_print_arbitrary_thd();
+ lock_mutex_enter_kernel();
+
++ if (file) {
+ if (lock_deadlock_found) {
+ fputs(
+ "------------------------\n"
+@@ -4269,6 +4274,12 @@
+ fprintf(file,
+ "Total number of lock structs in row lock hash table %lu\n",
+ (ulong) lock_get_n_rec_locks());
++ } // if (file)
++ inno_transaction_purge_count =
++ (ulong) ut_dulint_get_low(purge_sys->purge_trx_no);
++ inno_transaction_count =
++ (ulong) ut_dulint_get_low(trx_sys->max_trx_id);
++ inno_transaction_purge_lag = (ulong) trx_sys->rseg_history_len;
+ }
+
+ /*************************************************************************
+@@ -4289,7 +4300,17 @@
+ ulint i;
+ mtr_t mtr;
+ trx_t* trx;
+-
++ time_t current_time = time(NULL);
++
++ /* init all counters to be updated */
++ inno_num_lock_waiters = 0;
++ inno_summed_lock_wait_time = 0;
++ inno_longest_lock_wait = 0;
++ inno_num_active_transactions = 0;
++ inno_summed_transaction_age = 0;
++ inno_longest_transaction_age = 0;
++
++ if (file) {
+ fprintf(file, "LIST OF TRANSACTIONS FOR EACH SESSION:\n");
+
+ /* First print info on non-active transactions */
+@@ -4304,6 +4325,7 @@
+
+ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
+ }
++ } // if (file)
+
+ loop:
+ trx = UT_LIST_GET_FIRST(trx_sys->trx_list);
+@@ -4330,6 +4352,7 @@
+ }
+
+ if (nth_lock == 0) {
++ if (file) {
+ fputs("---", file);
+ trx_print(file, trx, 600);
+
+@@ -4341,11 +4364,27 @@
+ (ulong) ut_dulint_get_high(trx->read_view->up_limit_id),
+ (ulong) ut_dulint_get_low(trx->read_view->up_limit_id));
+ }
++ } // if (file)
++
++ if (trx->conc_state == TRX_ACTIVE) { /* accumulate age statistics over active transactions */
++ ulong trx_age = (ulong)difftime(time(NULL), trx->start_time);
++ inno_num_active_transactions++;
++ inno_summed_transaction_age += trx_age;
++ if (inno_longest_transaction_age < trx_age) /* keep the maximum; the counter is reset to 0 on entry */
++ inno_longest_transaction_age = trx_age;
++ }
+
+ if (trx->que_state == TRX_QUE_LOCK_WAIT) {
++ ulong wait_time = (ulong)difftime(current_time,
++ trx->wait_started);
++ inno_num_lock_waiters++;
++ inno_summed_lock_wait_time += wait_time;
++ if (inno_longest_lock_wait < wait_time)
++ inno_longest_lock_wait = wait_time;
++ if (file) {
+ fprintf(file,
+ "------- TRX HAS BEEN WAITING %lu SEC FOR THIS LOCK TO BE GRANTED:\n",
+- (ulong)difftime(time(NULL), trx->wait_started));
++ wait_time);
+
+ if (lock_get_type(trx->wait_lock) == LOCK_REC) {
+ lock_rec_print(file, trx->wait_lock);
+@@ -4354,10 +4393,16 @@
+ }
+
+ fputs("------------------\n", file);
+- }
+- }
+-
+- if (!srv_print_innodb_lock_monitor) {
++ } // if (file)
++ }
++ }
++
++ /* don't print locks per transaction if either
++ 1) srv_print_innodb_lock_monitor is NOT set,
++ ie no magic table innodb_lock_monitor is created, or
++ 2) file == NULL, ie, at counter updating stage from "show status"
++ */
++ if (!srv_print_innodb_lock_monitor || !file) {
+ nth_trx++;
+ goto loop;
+ }
+diff -r b059d02ec814 innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800
++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:09:34 2008 -0800
+@@ -267,6 +267,35 @@
+ ulint inno_pending_log_ios = 0;
+ ulint inno_pending_sync_ios = 0;
+
++/* variable to identify if there is currently a long semaphore wait */
++ibool srv_long_lock_wait = FALSE;
++
++/* variable to count the number of long semaphore waits noticed */
++ulint srv_long_lock_waits = 0;
++
++/* minimum time interval in seconds between calls to the *_print functions made from srv_export_innodb_status() */
++extern long innobase_min_status_update_time_interval;
++
++/* all 24 innodb status variables, exported to status */
++ulint inno_transaction_count = 0;
++ulint inno_transaction_purge_count = 0;
++ulint inno_transaction_purge_lag = 0;
++ulint inno_num_active_transactions = 0;
++ulint inno_summed_transaction_age = 0;
++ulint inno_longest_transaction_age = 0;
++ulint inno_lock_wait_timeouts = 0; /* Counts number of lock wait timeouts. */
++ulint inno_num_lock_waiters = 0;
++ulint inno_summed_lock_wait_time = 0;
++ulint inno_longest_lock_wait = 0;
++ulint inno_os_reads = 0;
++ulint inno_os_writes = 0;
++ulint inno_os_fsyncs = 0;
++ulint inno_ibuf_size = 0;
++ulint inno_ibuf_inserts = 0;
++ulint inno_ibuf_merged_recs = 0;
++ulint inno_ibuf_merges = 0;
++ulint inno_log_ios_done = 0;
++
+ /* structure to pass status variables to MySQL */
+ export_struc export_vars;
+
+@@ -419,6 +448,10 @@
+ const char* srv_io_thread_function[SRV_MAX_N_IO_THREADS];
+
+ time_t srv_last_monitor_time;
++
++/* last time innodb status were updated thru show status */
++time_t srv_last_innodb_status_time = 0;
++
+
+ mutex_t srv_innodb_monitor_mutex;
+
+@@ -677,6 +710,24 @@
+
+ ulint srv_n_threads_active[SRV_MASTER + 1];
+ ulint srv_n_threads[SRV_MASTER + 1];
++
++/*************************************************************************
++Prints the srv_master_thread loop counters and its sync/async log flush counters. */
++
++static
++void
++srv_print_extra(
++/*===================*/
++ FILE *file) /* in: output stream; passed to fprintf unchecked, so it must not be NULL */
++{
++ fprintf(file, "srv_master_thread loops: %lu 1_second, %lu sleeps, "
++ "%lu 10_second, %lu background, %lu flush\n",
++ srv_main_1_second_loops, srv_main_sleeps,
++ srv_main_10_second_loops, srv_main_background_loops,
++ srv_main_flush_loops);
++ fprintf(file, "srv_master_thread log flush: %lu sync, %lu async\n",
++ srv_sync_flush, srv_async_flush);
++}
+
+ /*************************************************************************
+ Sets the info describing an i/o thread current state. */
+@@ -1685,12 +1736,13 @@
+ fputs("----------\n"
+ "BACKGROUND THREAD\n"
+ "----------\n", file);
++ srv_print_extra(file);
+ fil_print(file);
+-
+
+ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
++ fprintf(file, "Lock wait timeouts %lu\n", inno_lock_wait_timeouts);
+ sync_print(file);
+
+ /* Conceptually, srv_innodb_monitor_mutex has a very high latching
+@@ -1709,24 +1761,6 @@
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+- lock_print_info_summary(file);
+- if (trx_start) {
+- long t = ftell(file);
+- if (t < 0) {
+- *trx_start = ULINT_UNDEFINED;
+- } else {
+- *trx_start = (ulint) t;
+- }
+- }
+- lock_print_info_all_transactions(file);
+- if (trx_end) {
+- long t = ftell(file);
+- if (t < 0) {
+- *trx_end = ULINT_UNDEFINED;
+- } else {
+- *trx_end = (ulint) t;
+- }
+- }
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+@@ -1815,6 +1849,27 @@
+ (srv_n_rows_read - srv_n_rows_read_old)
+ / time_elapsed);
+
++ /* Print open transaction details */
++ lock_print_info_summary(file);
++
++ if (trx_start) {
++ long t = ftell(file);
++ if (t < 0) {
++ *trx_start = ULINT_UNDEFINED;
++ } else {
++ *trx_start = (ulint) t;
++ }
++ }
++ lock_print_info_all_transactions(file);
++ if (trx_end) {
++ long t = ftell(file);
++ if (t < 0) {
++ *trx_end = ULINT_UNDEFINED;
++ } else {
++ *trx_end = (ulint) t;
++ }
++ }
++
+ srv_n_rows_inserted_old = srv_n_rows_inserted;
+ srv_n_rows_updated_old = srv_n_rows_updated;
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+@@ -1833,7 +1888,8 @@
+ void
+ srv_export_innodb_status(void)
+ {
+-
++ long time_elapsed;
++ time_t current_time;
+ mutex_enter(&srv_innodb_monitor_mutex);
+ export_vars.innodb_data_pending_reads= os_n_pending_reads;
+ export_vars.innodb_data_pending_writes= os_n_pending_writes;
+@@ -1844,6 +1900,7 @@
+ export_vars.innodb_data_reads= os_n_file_reads;
+ export_vars.innodb_data_writes= os_n_file_writes;
+ export_vars.innodb_data_written= srv_data_written;
++ export_vars.innodb_dict_size= dict_sys->size;
+ export_vars.innodb_buffer_pool_read_requests= buf_pool->n_page_gets;
+ export_vars.innodb_buffer_pool_write_requests= srv_buf_pool_write_requests;
+ export_vars.innodb_buffer_pool_wait_free= srv_buf_pool_wait_free;
+@@ -1854,10 +1911,12 @@
+ export_vars.innodb_buffer_pool_pages_data= UT_LIST_GET_LEN(buf_pool->LRU);
+ export_vars.innodb_buffer_pool_pages_dirty= UT_LIST_GET_LEN(buf_pool->flush_list);
+ export_vars.innodb_buffer_pool_pages_free= UT_LIST_GET_LEN(buf_pool->free);
+- export_vars.innodb_buffer_pool_pages_latched= buf_get_latched_pages_number();
++ /* This function uses too much CPU for large buffer caches. */
++ export_vars.innodb_buffer_pool_pages_latched= 1; /* buf_get_latched_pages_number(); */
+ export_vars.innodb_buffer_pool_pages_total= buf_pool->curr_size;
+ export_vars.innodb_buffer_pool_pages_misc= buf_pool->max_size -
+ UT_LIST_GET_LEN(buf_pool->LRU) - UT_LIST_GET_LEN(buf_pool->free);
++
+ export_vars.innodb_page_size= UNIV_PAGE_SIZE;
+ export_vars.innodb_log_waits= srv_log_waits;
+ export_vars.innodb_os_log_written= srv_os_log_written;
+@@ -1885,6 +1944,103 @@
+ export_vars.innodb_rows_inserted= srv_n_rows_inserted;
+ export_vars.innodb_rows_updated= srv_n_rows_updated;
+ export_vars.innodb_rows_deleted= srv_n_rows_deleted;
++ export_vars.innodb_long_lock_wait = srv_long_lock_wait;
++ export_vars.innodb_long_lock_waits = srv_long_lock_waits;
++
++ export_vars.innodb_os_aio_read_requests = os_aio_read_requests;
++ export_vars.innodb_os_aio_write_requests = os_aio_write_requests;
++
++ export_vars.innodb_os_aio_pages_read = os_aio_pages_read;
++ export_vars.innodb_os_aio_pages_written = os_aio_pages_written;
++
++ export_vars.innodb_os_aio_read_time = os_aio_read_time;
++ export_vars.innodb_os_aio_write_time = os_aio_write_time;
++
++ if (os_aio_read_requests > 0 ) {
++ export_vars.innodb_os_aio_read_time_avg
++ = os_aio_read_time / os_aio_read_requests;
++ } else {
++ export_vars.innodb_os_aio_read_time_avg = 0;
++ }
++ if (os_aio_write_requests > 0 ) {
++ export_vars.innodb_os_aio_write_time_avg
++ = os_aio_write_time / os_aio_write_requests;
++ } else {
++ export_vars.innodb_os_aio_write_time_avg = 0;
++ }
++
++ export_vars.innodb_deadlocks = innodb_deadlocks;
++
++ // simulate srv_printf_innodb_monitor, invoked by innodb_show_status
++ // 0. direct printout inno_lock_wait_timeouts, declared in srv0srv.c
++ // total # of variable(s) updated: 1
++ export_vars.inno_lock_wait_timeouts = inno_lock_wait_timeouts;
++
++ // *_print functions are allowed to be called once every
++ // some seconds to prevent too frequent invocation.
++ // the number is innobase_min_status_update_time_interval
++ current_time = time(NULL);
++ time_elapsed = difftime(current_time, srv_last_innodb_status_time);
++ if (time_elapsed >= innobase_min_status_update_time_interval) {
++ os_aio_print(NULL);
++ ibuf_print(NULL);
++ buf_print_io(NULL);
++ lock_print_info_summary(NULL);
++ lock_print_info_all_transactions(NULL);
++
++ srv_last_innodb_status_time = current_time;
++ }
++
++ // 1. os_aio_print
++ // the following were filled by calling os_aio_print
++ // total # of variable(s) updated: 8
++
++ export_vars.inno_pending_normal_aio_reads =
++ inno_pending_normal_aio_reads;
++ export_vars.inno_pending_normal_aio_writes =
++ inno_pending_normal_aio_writes;
++ export_vars.inno_pending_ibuf_aio_reads = inno_pending_ibuf_aio_reads;
++ export_vars.inno_pending_log_ios = inno_pending_log_ios;
++ export_vars.inno_pending_sync_ios = inno_pending_sync_ios;
++ export_vars.inno_os_reads = os_n_file_reads;
++ export_vars.inno_os_writes = os_n_file_writes;
++ export_vars.inno_os_fsyncs = os_n_fsyncs;
++
++ // 2. ibuf_print()
++ // total # of variable(s) updated: 4
++
++ export_vars.inno_ibuf_size = inno_ibuf_size;
++ export_vars.inno_ibuf_inserts = inno_ibuf_inserts;
++ export_vars.inno_ibuf_merged_recs = inno_ibuf_merged_recs;
++ export_vars.inno_ibuf_merges = inno_ibuf_merges;
++
++ // 3. log_print
++ // total # of variable(s) updated: 1
++ export_vars.inno_log_ios_done = (ulong) log_sys->n_log_ios;
++
++ // 5. lock_print_info_summary
++ // it enters the mutexes
++ // 1) innobase_mysql_prepare_print_arbitrary_thd()
++ // 2) lock_mutex_enter_kernel()
++ // total # of variable(s) updated: 3
++
++ export_vars.inno_transaction_count = inno_transaction_count;
++ export_vars.inno_transaction_purge_count =
++ inno_transaction_purge_count;
++ export_vars.inno_transaction_purge_lag = inno_transaction_purge_lag;
++
++ // 6. lock_print_info_all_transactions(NULL)
++ // it exits two mutexes entered from lock_print_info_summary(NULL)
++ // total # of variable(s) updated: 6
++
++ export_vars.inno_num_active_transactions = inno_num_active_transactions;
++ export_vars.inno_summed_transaction_age = inno_summed_transaction_age;
++ export_vars.inno_longest_transaction_age = inno_longest_transaction_age;
++
++ export_vars.inno_num_lock_waiters = inno_num_lock_waiters;
++ export_vars.inno_summed_lock_wait_time = inno_summed_lock_wait_time;
++ export_vars.inno_longest_lock_wait = inno_longest_lock_wait;
++
+ mutex_exit(&srv_innodb_monitor_mutex);
+
+ }
+@@ -2026,6 +2182,7 @@
+ if (thr_get_trx(slot->thr)->wait_lock) {
+ lock_cancel_waiting_and_release(
+ thr_get_trx(slot->thr)->wait_lock);
++ ++inno_lock_wait_timeouts;
+ }
+ }
+ }
+diff -r b059d02ec814 patch_info/innodb_extra_status.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_extra_status.info Mon Nov 03 05:09:34 2008 -0800
+@@ -0,0 +1,9 @@
++File=innodb_extra_status.patch
++Name=Adds additional information of InnoDB counters into SHOW STATUS
++Version=1.0
++Author=Google
++License=GPL
++Comment=
++ChangeLog=
++2008-11-03
++VT: Initial porting
+diff -r b059d02ec814 sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/ha_innodb.cc Mon Nov 03 05:09:34 2008 -0800
+@@ -299,12 +299,36 @@
+ (char*) &export_vars.innodb_dblwr_pages_written, SHOW_LONG},
+ {"dblwr_writes",
+ (char*) &export_vars.innodb_dblwr_writes, SHOW_LONG},
++ {"dict_size",
++ (char*) &export_vars.innodb_dict_size, SHOW_LONG},
+ {"log_waits",
+ (char*) &export_vars.innodb_log_waits, SHOW_LONG},
+ {"log_write_requests",
+ (char*) &export_vars.innodb_log_write_requests, SHOW_LONG},
+ {"log_writes",
+ (char*) &export_vars.innodb_log_writes, SHOW_LONG},
++ {"long_lock_wait",
++ (char*) &export_vars.innodb_long_lock_wait, SHOW_BOOL},
++ {"long_lock_waits",
++ (char*) &export_vars.innodb_long_lock_waits, SHOW_LONG},
++
++ {"os_read_requests",
++ (char*) &export_vars.innodb_os_aio_read_requests, SHOW_LONG},
++ {"os_write_requests",
++ (char*) &export_vars.innodb_os_aio_write_requests, SHOW_LONG},
++ {"os_pages_read",
++ (char*) &export_vars.innodb_os_aio_pages_read, SHOW_LONG},
++ {"os_pages_written",
++ (char*) &export_vars.innodb_os_aio_pages_written, SHOW_LONG},
++ {"os_read_time",
++ (char*) &export_vars.innodb_os_aio_read_time, SHOW_LONGLONG},
++ {"os_write_time",
++ (char*) &export_vars.innodb_os_aio_write_time, SHOW_LONGLONG},
++ {"time_per_read",
++ (char*) &export_vars.innodb_os_aio_read_time_avg, SHOW_LONGLONG},
++ {"time_per_write",
++ (char*) &export_vars.innodb_os_aio_write_time_avg, SHOW_LONGLONG},
++
+ {"os_log_fsyncs",
+ (char*) &export_vars.innodb_os_log_fsyncs, SHOW_LONG},
+ {"os_log_pending_fsyncs",
+@@ -339,6 +363,56 @@
+ (char*) &export_vars.innodb_rows_read, SHOW_LONG},
+ {"rows_updated",
+ (char*) &export_vars.innodb_rows_updated, SHOW_LONG},
++ {"deadlocks",
++ (char*) &export_vars.innodb_deadlocks, SHOW_LONG},
++
++ /* 24 innodb status variables exported to status */
++ {"transaction_count",
++ (char*) &export_vars.inno_transaction_count, SHOW_LONG},
++ {"transaction_purge_count",
++ (char*) &export_vars.inno_transaction_purge_count, SHOW_LONG},
++ {"transaction_purge_lag",
++ (char*) &export_vars.inno_transaction_purge_lag, SHOW_LONG},
++ {"active_transactions",
++ (char*) &export_vars.inno_num_active_transactions, SHOW_LONG},
++ {"summed_transaction_age",
++ (char*) &export_vars.inno_summed_transaction_age, SHOW_LONG},
++ {"longest_transaction_age",
++ (char*) &export_vars.inno_longest_transaction_age, SHOW_LONG},
++ {"lock_wait_timeouts",
++ (char*) &export_vars.inno_lock_wait_timeouts, SHOW_LONG},
++ {"lock_waiters",
++ (char*) &export_vars.inno_num_lock_waiters, SHOW_LONG},
++ {"summed_lock_wait_time",
++ (char*) &export_vars.inno_summed_lock_wait_time, SHOW_LONG},
++ {"longest_lock_wait",
++ (char*) &export_vars.inno_longest_lock_wait, SHOW_LONG},
++ {"pending_normal_aio_reads",
++ (char*) &export_vars.inno_pending_normal_aio_reads, SHOW_LONG},
++ {"pending_normal_aio_writes",
++ (char*) &export_vars.inno_pending_normal_aio_writes, SHOW_LONG},
++ {"pending_ibuf_aio_reads",
++ (char*) &export_vars.inno_pending_ibuf_aio_reads, SHOW_LONG},
++ {"pending_log_ios",
++ (char*) &export_vars.inno_pending_log_ios, SHOW_LONG},
++ {"pending_sync_ios",
++ (char*) &export_vars.inno_pending_sync_ios, SHOW_LONG},
++ {"os_reads",
++ (char*) &export_vars.inno_os_reads, SHOW_LONG},
++ {"os_writes",
++ (char*) &export_vars.inno_os_writes, SHOW_LONG},
++ {"os_fsyncs",
++ (char*) &export_vars.inno_os_fsyncs, SHOW_LONG},
++ {"ibuf_inserts", /* insert buffer inserts, summed in ibuf_print() */
++ (char*) &export_vars.inno_ibuf_inserts, SHOW_LONG},
++ {"ibuf_size", /* insert buffer size, summed in ibuf_print() */
++ (char*) &export_vars.inno_ibuf_size, SHOW_LONG},
++ {"ibuf_merged_recs",
++ (char*) &export_vars.inno_ibuf_merged_recs, SHOW_LONG},
++ {"ibuf_merges",
++ (char*) &export_vars.inno_ibuf_merges, SHOW_LONG},
++ {"log_ios_done",
++ (char*) &export_vars.inno_log_ios_done, SHOW_LONG},
+ {NullS, NullS, SHOW_LONG}};
+
+ /* General functions */
+diff -r b059d02ec814 sql/ha_innodb.h
+--- a/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/ha_innodb.h Mon Nov 03 05:09:34 2008 -0800
+@@ -198,6 +198,7 @@
+ extern struct show_var_st innodb_status_variables[];
+ extern ulong innobase_fast_shutdown;
+ extern long innobase_max_merged_io;
++extern long innobase_min_status_update_time_interval;
+ extern ulong innobase_large_page_size;
+ extern long innobase_mirrored_log_groups, innobase_log_files_in_group;
+ extern longlong innobase_buffer_pool_size, innobase_log_file_size;
+diff -r b059d02ec814 sql/mysqld.cc
+--- a/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/mysqld.cc Mon Nov 03 05:09:34 2008 -0800
+@@ -4950,6 +4950,7 @@
+ OPT_INNODB_SYNC_SPIN_LOOPS,
+ OPT_INNODB_CONCURRENCY_TICKETS,
+ OPT_INNODB_THREAD_SLEEP_DELAY,
++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL,
+ OPT_BDB_CACHE_SIZE,
+ OPT_BDB_LOG_BUFFER_SIZE,
+ OPT_BDB_MAX_LOCK,
+@@ -6031,6 +6032,14 @@
+ (gptr*) &srv_thread_sleep_delay,
+ (gptr*) &srv_thread_sleep_delay,
+ 0, GET_ULONG, REQUIRED_ARG, 10000L, 0L, ULONG_MAX, 0, 1L, 0},
++ {"innodb_status_update_interval",
++ OPT_INNODB_MIN_STATUS_UPDATE_TIME_INTERVAL,
++ "Minimum time interval in seconds before InnoDB status counters "
++ "are updated during SHOW STATUS. "
++ "InnoDB counters are always updated during SHOW INNODB STATUS.",
++ (gptr*) &innobase_min_status_update_time_interval,
++ (gptr*) &innobase_min_status_update_time_interval,
++ 0, GET_LONG, REQUIRED_ARG, 30, 0, 3600, 0, 1, 0},
+ #endif /* HAVE_INNOBASE_DB */
+ {"interactive_timeout", OPT_INTERACTIVE_TIMEOUT,
+ "The number of seconds the server waits for activity on an interactive connection before closing it.",
+diff -r b059d02ec814 sql/set_var.cc
+--- a/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800
++++ b/sql/set_var.cc Mon Nov 03 05:09:34 2008 -0800
+@@ -948,6 +948,8 @@
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG },
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG },
+ {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG},
++ {"innodb_status_update_interval",
++ (char*) &innobase_min_status_update_time_interval, SHOW_LONG},
+ #endif
+ {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS},
+ {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS},
diff --git a/percona/5.0.91-b22-20100522/innodb_fsync_source.patch b/percona/5.0.91-b22-20100522/innodb_fsync_source.patch
new file mode 100644
index 0000000..2961b78
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_fsync_source.patch
@@ -0,0 +1,594 @@
+diff -r ef44d8017b6b innobase/buf/buf0flu.c
+--- a/innobase/buf/buf0flu.c Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/buf/buf0flu.c Fri Jul 03 15:41:32 2009 -0700
+@@ -341,7 +341,7 @@
+
+ /* Now flush the doublewrite buffer data to disk */
+
+- fil_flush(TRX_SYS_SPACE);
++ fil_flush(TRX_SYS_SPACE, FLUSH_FROM_DIRTY_BUFFER);
+
+ /* We know that the writes have been flushed to disk now
+ and in recovery we will find them in the doublewrite buffer
+@@ -381,7 +381,7 @@
+
+ /* Now we flush the data to disk (for example, with fsync) */
+
+- fil_flush_file_spaces(FIL_TABLESPACE);
++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_DIRTY_BUFFER);
+
+ /* We can now reuse the doublewrite memory buffer: */
+
+@@ -501,7 +501,8 @@
+ }
+ #else
+ /* Force the log to the disk before writing the modified block */
+- log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE);
++ log_write_up_to(block->newest_modification, LOG_WAIT_ALL_GROUPS, TRUE,
++ LOG_WRITE_FROM_DIRTY_BUFFER);
+ #endif
+ buf_flush_init_for_writing(block->frame, block->newest_modification,
+ block->space, block->offset);
+diff -r ef44d8017b6b innobase/fil/fil0fil.c
+--- a/innobase/fil/fil0fil.c Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/fil/fil0fil.c Fri Jul 03 15:41:32 2009 -0700
+@@ -245,6 +245,7 @@
+ request */
+ UT_LIST_BASE_NODE_T(fil_space_t) space_list;
+ /* list of all file spaces */
++ ulint flush_types[FLUSH_FROM_NUMBER];/* calls to fil_flush by caller */
+ };
+
+ /* The tablespace memory cache. This variable is NULL before the module is
+@@ -849,7 +850,7 @@
+ /* Flush tablespaces so that we can close modified files in the LRU
+ list */
+
+- fil_flush_file_spaces(FIL_TABLESPACE);
++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER);
+
+ count++;
+
+@@ -1309,7 +1310,10 @@
+
+ UT_LIST_INIT(system->unflushed_spaces);
+ UT_LIST_INIT(system->space_list);
+-
++ {
++ int x;
++ for (x = 0; x < FLUSH_FROM_NUMBER; ++x) system->flush_types[x] = 0;
++ }
+ return(system);
+ }
+
+@@ -1437,6 +1441,23 @@
+ }
+
+ mutex_exit(&(system->mutex));
++}
++
++/********************************************************************
++Prints internal counters */
++
++void
++fil_print(FILE *file)
++{
++ fprintf(file,
++ "fsync callers: %lu buffer pool, %lu other, %lu checkpoint, "
++ "%lu log aio, %lu log sync, %lu archive\n",
++ fil_system->flush_types[FLUSH_FROM_DIRTY_BUFFER],
++ fil_system->flush_types[FLUSH_FROM_OTHER],
++ fil_system->flush_types[FLUSH_FROM_CHECKPOINT],
++ fil_system->flush_types[FLUSH_FROM_LOG_IO_COMPLETE],
++ fil_system->flush_types[FLUSH_FROM_LOG_WRITE_UP_TO],
++ fil_system->flush_types[FLUSH_FROM_ARCHIVE]);
+ }
+
+ /********************************************************************
+@@ -2256,7 +2277,7 @@
+
+ os_thread_sleep(20000);
+
+- fil_flush(id);
++ fil_flush(id, FLUSH_FROM_OTHER);
+
+ goto retry;
+
+@@ -3574,7 +3595,7 @@
+ size_after_extend, *actual_size); */
+ mutex_exit(&(system->mutex));
+
+- fil_flush(space_id);
++ fil_flush(space_id, FLUSH_FROM_OTHER);
+
+ return(success);
+ }
+@@ -4167,8 +4188,9 @@
+ void
+ fil_flush(
+ /*======*/
+- ulint space_id) /* in: file space id (this can be a group of
++ ulint space_id, /* in: file space id (this can be a group of
+ log files or a tablespace of the database) */
++ flush_from_type flush_type)/* in: identifies the caller */
+ {
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+@@ -4177,7 +4199,7 @@
+ ib_longlong old_mod_counter;
+
+ mutex_enter(&(system->mutex));
+-
++ system->flush_types[flush_type]++;
+ HASH_SEARCH(hash, system->spaces, space_id, space,
+ space->id == space_id);
+ if (!space || space->is_being_deleted) {
+@@ -4282,7 +4304,8 @@
+ void
+ fil_flush_file_spaces(
+ /*==================*/
+- ulint purpose) /* in: FIL_TABLESPACE, FIL_LOG */
++ ulint purpose, /* in: FIL_TABLESPACE, FIL_LOG */
++ flush_from_type flush_type)/* in: identifies the caller */
+ {
+ fil_system_t* system = fil_system;
+ fil_space_t* space;
+@@ -4323,7 +4346,7 @@
+ a non-existing space id. */
+ for (i = 0; i < n_space_ids; i++) {
+
+- fil_flush(space_ids[i]);
++ fil_flush(space_ids[i], flush_type);
+ }
+
+ mem_free(space_ids);
+diff -r ef44d8017b6b innobase/include/fil0fil.h
+--- a/innobase/include/fil0fil.h Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/include/fil0fil.h Fri Jul 03 15:41:32 2009 -0700
+@@ -197,6 +197,13 @@
+ fil_init(
+ /*=====*/
+ ulint max_n_open); /* in: max number of open files */
++/********************************************************************
++ * Prints internal counters. */
++
++void
++fil_print(
++ /*=====*/
++ FILE* file); /* in: output stream */
+ /***********************************************************************
+ Opens all log files and system tablespace data files. They stay open until the
+ database server shutdown. This should be called at a server startup after the
+@@ -625,14 +632,26 @@
+ ulint segment); /* in: the number of the segment in the aio
+ array to wait for */
+ /**************************************************************************
++Identifies the caller of fil_flush. */
++typedef enum {
++ FLUSH_FROM_DIRTY_BUFFER,
++ FLUSH_FROM_OTHER,
++ FLUSH_FROM_CHECKPOINT,
++ FLUSH_FROM_LOG_IO_COMPLETE,
++ FLUSH_FROM_LOG_WRITE_UP_TO,
++ FLUSH_FROM_ARCHIVE,
++ FLUSH_FROM_NUMBER
++} flush_from_type;
++/**************************************************************************
+ Flushes to disk possible writes cached by the OS. If the space does not exist
+ or is being dropped, does not do anything. */
+
+ void
+ fil_flush(
+ /*======*/
+- ulint space_id); /* in: file space id (this can be a group of
++ ulint space_id, /* in: file space id (this can be a group of
+ log files or a tablespace of the database) */
++ flush_from_type flush_type);/* in: identifies the caller */
+ /**************************************************************************
+ Flushes to disk writes in file spaces of the given type possibly cached by
+ the OS. */
+@@ -640,7 +659,8 @@
+ void
+ fil_flush_file_spaces(
+ /*==================*/
+- ulint purpose); /* in: FIL_TABLESPACE, FIL_LOG */
++ ulint purpose, /* in: FIL_TABLESPACE, FIL_LOG */
++ flush_from_type flush_type);/* in: identifies the caller */
+ /**********************************************************************
+ Checks the consistency of the tablespace cache. */
+
+diff -r ef44d8017b6b innobase/include/log0log.h
+--- a/innobase/include/log0log.h Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/include/log0log.h Fri Jul 03 15:41:32 2009 -0700
+@@ -146,6 +146,22 @@
+ log_io_complete(
+ /*============*/
+ log_group_t* group); /* in: log group */
++
++/**********************************************************
++Describes the caller of log_write_up_to. */
++
++typedef enum {
++ LOG_WRITE_FROM_DIRTY_BUFFER,
++ LOG_WRITE_FROM_BACKGROUND_SYNC,
++ LOG_WRITE_FROM_BACKGROUND_ASYNC,
++ LOG_WRITE_FROM_INTERNAL,
++ LOG_WRITE_FROM_CHECKPOINT_SYNC,
++ LOG_WRITE_FROM_CHECKPOINT_ASYNC,
++ LOG_WRITE_FROM_LOG_ARCHIVE,
++ LOG_WRITE_FROM_COMMIT_SYNC,
++ LOG_WRITE_FROM_COMMIT_ASYNC,
++ LOG_WRITE_FROM_NUMBER
++} log_sync_type;
+ /**********************************************************
+ This function is called, e.g., when a transaction wants to commit. It checks
+ that the log has been written to the log file up to the last log entry written
+@@ -159,14 +175,21 @@
+ be written, ut_dulint_max if not specified */
+ ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+- ibool flush_to_disk);
+- /* in: TRUE if we want the written log also to be
+- flushed to disk */
++ ibool flush_to_disk,
++ /* in: TRUE if we want the written log also to be flushed to disk */
++ log_sync_type caller);/* in: identifies the caller */
+ /********************************************************************
+ Does a syncronous flush of the log buffer to disk. */
+
+ void
+ log_buffer_flush_to_disk(void);
++/*==========================*/
++/********************************************************************
++Flushes the log buffer. Forces it to disk depending on the value of
++the configuration parameter innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void);
+ /*==========================*/
+ /********************************************************************
+ Flushes the log buffer. Forces it to disk depending on the value of
+@@ -751,6 +774,12 @@
+ AND flushed to disk */
+ ulint n_pending_writes;/* number of currently pending flushes
+ or writes */
++ ulint log_sync_callers[LOG_WRITE_FROM_NUMBER];
++ /* counts calls to log_write_up_to */
++ ulint log_sync_syncers[LOG_WRITE_FROM_NUMBER];
++ /* counts calls to log_write_up_to when log file is sync'd */
++ ulint n_syncs; /* number of fsyncs done for log file */
++ ulint n_checkpoints; /* number of calls to log_checkpoint */
+ /* NOTE on the 'flush' in names of the fields below: starting from
+ 4.0.14, we separate the write of the log file and the actual fsync()
+ or other method to flush it to disk. The names below shhould really
+diff -r ef44d8017b6b innobase/log/log0log.c
+--- a/innobase/log/log0log.c Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/log/log0log.c Fri Jul 03 15:41:32 2009 -0700
+@@ -782,6 +782,15 @@
+ log_sys->written_to_all_lsn = log_sys->lsn;
+
+ log_sys->n_pending_writes = 0;
++ {
++ int x;
++ for (x = 0; x < LOG_WRITE_FROM_NUMBER; ++x) {
++ log_sys->log_sync_callers[x] = 0;
++ log_sys->log_sync_syncers[x] = 0;
++ }
++ }
++ log_sys->n_syncs = 0;
++ log_sys->n_checkpoints = 0;
+
+ log_sys->no_flush_event = os_event_create(NULL);
+
+@@ -1066,7 +1075,7 @@
+ if (srv_unix_file_flush_method != SRV_UNIX_O_DSYNC
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+
+- fil_flush(group->space_id);
++ fil_flush(group->space_id, FLUSH_FROM_LOG_IO_COMPLETE);
+ }
+
+ #ifdef UNIV_DEBUG
+@@ -1088,7 +1097,7 @@
+ && srv_unix_file_flush_method != SRV_UNIX_NOSYNC
+ && srv_flush_log_at_trx_commit != 2) {
+
+- fil_flush(group->space_id);
++ fil_flush(group->space_id, FLUSH_FROM_LOG_IO_COMPLETE);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+@@ -1303,9 +1312,10 @@
+ be written, ut_dulint_max if not specified */
+ ulint wait, /* in: LOG_NO_WAIT, LOG_WAIT_ONE_GROUP,
+ or LOG_WAIT_ALL_GROUPS */
+- ibool flush_to_disk)
++ ibool flush_to_disk,
+ /* in: TRUE if we want the written log also to be
+ flushed to disk */
++ log_sync_type caller) /* in: identifies caller */
+ {
+ log_group_t* group;
+ ulint start_offset;
+@@ -1315,6 +1325,7 @@
+ ulint loop_count;
+ ulint unlock;
+
++ log_sys->log_sync_callers[caller]++;
+ if (recv_no_ibuf_operations) {
+ /* Recovery is running and no operations on the log files are
+ allowed yet (the variable name .._no_ibuf_.. is misleading) */
+@@ -1465,13 +1476,17 @@
+ so we have also flushed to disk what we have written */
+
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
++ log_sys->n_syncs++;
++ log_sys->log_sync_syncers[caller]++;
+
+ } else if (flush_to_disk) {
+
+ group = UT_LIST_GET_FIRST(log_sys->log_groups);
+
+- fil_flush(group->space_id);
++ fil_flush(group->space_id, FLUSH_FROM_LOG_WRITE_UP_TO);
+ log_sys->flushed_to_disk_lsn = log_sys->write_lsn;
++ log_sys->n_syncs++;
++ log_sys->log_sync_syncers[caller]++;
+ }
+
+ mutex_enter(&(log_sys->mutex));
+@@ -1520,7 +1535,8 @@
+
+ mutex_exit(&(log_sys->mutex));
+
+- log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE);
++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
++ LOG_WRITE_FROM_BACKGROUND_SYNC);
+ }
+
+ /********************************************************************
+@@ -1574,7 +1590,7 @@
+ mutex_exit(&(log->mutex));
+
+ if (do_flush) {
+- log_write_up_to(lsn, LOG_NO_WAIT, FALSE);
++ log_write_up_to(lsn, LOG_NO_WAIT, FALSE, LOG_WRITE_FROM_INTERNAL);
+ }
+ }
+
+@@ -1944,11 +1960,11 @@
+ }
+
+ if (srv_unix_file_flush_method != SRV_UNIX_NOSYNC) {
+- fil_flush_file_spaces(FIL_TABLESPACE);
++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_CHECKPOINT);
+ }
+
+ mutex_enter(&(log_sys->mutex));
+-
++ log_sys->n_checkpoints++;
+ oldest_lsn = log_buf_pool_get_oldest_modification();
+
+ mutex_exit(&(log_sys->mutex));
+@@ -1961,7 +1977,8 @@
+ write-ahead-logging algorithm ensures that the log has been flushed
+ up to oldest_lsn. */
+
+- log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
++ log_write_up_to(oldest_lsn, LOG_WAIT_ALL_GROUPS, TRUE,
++ LOG_WRITE_FROM_CHECKPOINT_SYNC);
+
+ mutex_enter(&(log_sys->mutex));
+
+@@ -2589,7 +2606,7 @@
+
+ mutex_exit(&(log_sys->mutex));
+
+- fil_flush(group->archive_space_id);
++ fil_flush(group->archive_space_id, FLUSH_FROM_ARCHIVE);
+
+ mutex_enter(&(log_sys->mutex));
+
+@@ -2670,7 +2687,8 @@
+
+ mutex_exit(&(log_sys->mutex));
+
+- log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE);
++ log_write_up_to(limit_lsn, LOG_WAIT_ALL_GROUPS, TRUE,
++ LOG_WRITE_FROM_LOG_ARCHIVE);
+
+ calc_new_limit = FALSE;
+
+@@ -3207,8 +3225,8 @@
+ }
+ mutex_exit(&kernel_mutex);
+
+- fil_flush_file_spaces(FIL_TABLESPACE);
+- fil_flush_file_spaces(FIL_LOG);
++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER);
++ fil_flush_file_spaces(FIL_LOG, FLUSH_FROM_OTHER);
+
+ /* The call fil_write_flushed_lsn_to_data_files() will pass the buffer
+ pool: therefore it is essential that the buffer pool has been
+@@ -3241,7 +3259,7 @@
+
+ fil_write_flushed_lsn_to_data_files(lsn, arch_log_no);
+
+- fil_flush_file_spaces(FIL_TABLESPACE);
++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER);
+
+ fil_close_all_files();
+
+@@ -3363,15 +3381,45 @@
+ time_elapsed = 0.001 + difftime(current_time,
+ log_sys->last_printout_time);
+ fprintf(file,
+- "%lu pending log writes, %lu pending chkp writes\n"
+- "%lu log i/o's done, %.2f log i/o's/second\n",
+- (ulong) log_sys->n_pending_writes,
+- (ulong) log_sys->n_pending_checkpoint_writes,
+- (ulong) log_sys->n_log_ios,
+- ((log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed));
++ "%lu pending log writes, %lu pending chkp writes\n"
++ "%lu log i/o's done, %.2f log i/o's/second, %lu syncs, %lu checkpoints\n",
++ (ulong) log_sys->n_pending_writes,
++ (ulong) log_sys->n_pending_checkpoint_writes,
++ (ulong) log_sys->n_log_ios,
++ (log_sys->n_log_ios - log_sys->n_log_ios_old) / time_elapsed,
++ log_sys->n_syncs,
++ log_sys->n_checkpoints);
+
+ log_sys->n_log_ios_old = log_sys->n_log_ios;
+ log_sys->last_printout_time = current_time;
++
++ fprintf(file,
++ "log sync callers: %lu buffer pool, background %lu sync and %lu async, "
++ "%lu internal, checkpoint %lu sync and %lu async, %lu archive, "
++ "commit %lu sync and %lu async\n",
++ log_sys->log_sync_callers[LOG_WRITE_FROM_DIRTY_BUFFER],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_BACKGROUND_SYNC],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_BACKGROUND_ASYNC],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_INTERNAL],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_CHECKPOINT_SYNC],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_CHECKPOINT_ASYNC],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_LOG_ARCHIVE],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_COMMIT_SYNC],
++ log_sys->log_sync_callers[LOG_WRITE_FROM_COMMIT_ASYNC]);
++
++ fprintf(file,
++ "log sync syncers: %lu buffer pool, background %lu sync and %lu async, "
++ "%lu internal, checkpoint %lu sync and %lu async, %lu archive, "
++ "commit %lu sync and %lu async\n",
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_DIRTY_BUFFER],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_BACKGROUND_SYNC],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_BACKGROUND_ASYNC],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_INTERNAL],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_CHECKPOINT_SYNC],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_CHECKPOINT_ASYNC],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_LOG_ARCHIVE],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_COMMIT_SYNC],
++ log_sys->log_sync_syncers[LOG_WRITE_FROM_COMMIT_ASYNC]);
+
+ mutex_exit(&(log_sys->mutex));
+ }
+diff -r ef44d8017b6b innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/srv/srv0srv.c Fri Jul 03 15:41:32 2009 -0700
+@@ -1698,6 +1698,12 @@
+ (ulong)time_elapsed);
+
+ fputs("----------\n"
++ "BACKGROUND THREAD\n"
++ "----------\n", file);
++ fil_print(file);
++
++
++ fputs("----------\n"
+ "SEMAPHORES\n"
+ "----------\n", file);
+ sync_print(file);
+diff -r ef44d8017b6b innobase/trx/trx0sys.c
+--- a/innobase/trx/trx0sys.c Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/trx/trx0sys.c Fri Jul 03 15:41:32 2009 -0700
+@@ -511,7 +511,7 @@
+ page += UNIV_PAGE_SIZE;
+ }
+
+- fil_flush_file_spaces(FIL_TABLESPACE);
++ fil_flush_file_spaces(FIL_TABLESPACE, FLUSH_FROM_OTHER);
+
+ leave_func:
+ ut_free(unaligned_read_buf);
+diff -r ef44d8017b6b innobase/trx/trx0trx.c
+--- a/innobase/trx/trx0trx.c Fri Jul 03 15:41:25 2009 -0700
++++ b/innobase/trx/trx0trx.c Fri Jul 03 15:41:32 2009 -0700
+@@ -942,19 +942,21 @@
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+- FALSE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE,
++ LOG_WRITE_FROM_COMMIT_ASYNC);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE,
++ LOG_WRITE_FROM_COMMIT_SYNC);
+ }
+ } else if (srv_flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE,
++ LOG_WRITE_FROM_COMMIT_ASYNC);
+ } else {
+ ut_error;
+ }
+@@ -1701,18 +1703,21 @@
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE,
++ LOG_WRITE_FROM_COMMIT_ASYNC);
+ } else {
+ /* Write the log to the log files AND flush them to
+ disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE,
++ LOG_WRITE_FROM_COMMIT_SYNC);
+ }
+ } else if (srv_flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE,
++ LOG_WRITE_FROM_COMMIT_ASYNC);
+ } else {
+ ut_error;
+ }
+@@ -1948,19 +1953,21 @@
+ if (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) {
+ /* Write the log but do not flush it to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP,
+- FALSE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE,
++ LOG_WRITE_FROM_COMMIT_ASYNC);
+ } else {
+ /* Write the log to the log files AND flush
+ them to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, TRUE,
++ LOG_WRITE_FROM_COMMIT_SYNC);
+ }
+ } else if (srv_flush_log_at_trx_commit == 2) {
+
+ /* Write the log but do not flush it to disk */
+
+- log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE);
++ log_write_up_to(lsn, LOG_WAIT_ONE_GROUP, FALSE,
++ LOG_WRITE_FROM_COMMIT_ASYNC);
+ } else {
+ ut_error;
+ }
+diff -r ef44d8017b6b patch_info/innodb_fsync_source.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_fsync_source.info Fri Jul 03 15:41:32 2009 -0700
+@@ -0,0 +1,9 @@
++File=innodb_fsync_source.patch
++Name=Information of fsync callers in InnoDB
++Version=1.0
++Author=Google
++License=GPL
++Comment=
++ChangeLog=
++2008-11-01
++VT: Initial porting
diff --git a/percona/5.0.91-b22-20100522/innodb_io_patches.patch b/percona/5.0.91-b22-20100522/innodb_io_patches.patch
new file mode 100644
index 0000000..aaef29a
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_io_patches.patch
@@ -0,0 +1,1379 @@
+diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
+--- a/innobase/buf/buf0flu.c 2009-05-08 06:12:03.000000000 +0900
++++ b/innobase/buf/buf0flu.c 2009-07-02 16:44:49.000000000 +0900
+@@ -898,10 +898,17 @@
+
+ old_page_count = page_count;
+
++ if (srv_flush_neighbor_pages) {
+ /* Try to flush also all the neighbors */
+ page_count +=
+ buf_flush_try_neighbors(space, offset,
+ flush_type);
++ } else {
++ /* Try to flush the page only */
++ page_count +=
++ buf_flush_try_page(space, offset,
++ flush_type);
++ }
+ /* fprintf(stderr,
+ "Flush type %lu, page no %lu, neighb %lu\n",
+ flush_type, offset,
+diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
+--- a/innobase/buf/buf0rea.c 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/buf/buf0rea.c 2009-07-02 16:44:49.000000000 +0900
+@@ -20,6 +20,7 @@
+ #include "os0file.h"
+ #include "srv0start.h"
+
++extern uint srv_read_ahead;
+ extern ulint srv_read_ahead_rnd;
+ extern ulint srv_read_ahead_seq;
+ extern ulint srv_buf_pool_reads;
+@@ -189,6 +190,10 @@
+ ulint err;
+ ulint i;
+
++ if (!(srv_read_ahead & 1)) {
++ return(0);
++ }
++
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+@@ -396,6 +401,10 @@
+ ulint err;
+ ulint i;
+
++ if (!(srv_read_ahead & 2)) {
++ return(0);
++ }
++
+ if (srv_startup_is_before_trx_rollback_phase) {
+ /* No read-ahead to avoid thread deadlocks */
+ return(0);
+diff -ruN a/innobase/ibuf/ibuf0ibuf.c b/innobase/ibuf/ibuf0ibuf.c
+--- a/innobase/ibuf/ibuf0ibuf.c 2009-05-08 06:12:04.000000000 +0900
++++ b/innobase/ibuf/ibuf0ibuf.c 2009-07-02 16:44:49.000000000 +0900
+@@ -370,8 +370,9 @@
+ grow in size, as the references on the upper levels of the tree can
+ change */
+
+- ibuf->max_size = buf_pool_get_curr_size() / UNIV_PAGE_SIZE
+- / IBUF_POOL_SIZE_PER_MAX_SIZE;
++ ibuf->max_size = ut_min( buf_pool_get_curr_size() / UNIV_PAGE_SIZE
++ / IBUF_POOL_SIZE_PER_MAX_SIZE, (ulint) srv_ibuf_max_size / UNIV_PAGE_SIZE);
++ srv_ibuf_max_size = (long long) ibuf->max_size * UNIV_PAGE_SIZE;
+ ibuf->meter = IBUF_THRESHOLD + 1;
+
+ UT_LIST_INIT(ibuf->data_list);
+@@ -2258,11 +2259,13 @@
+
+ mutex_enter(&ibuf_mutex);
+
++ if (!srv_ibuf_active_contract) {
+ if (ibuf->size < ibuf->max_size + IBUF_CONTRACT_ON_INSERT_NON_SYNC) {
+ mutex_exit(&ibuf_mutex);
+
+ return;
+ }
++ }
+
+ sync = FALSE;
+
+diff -ruN a/innobase/include/log0log.h b/innobase/include/log0log.h
+--- a/innobase/include/log0log.h 2009-05-08 06:12:06.000000000 +0900
++++ b/innobase/include/log0log.h 2009-07-02 16:44:49.000000000 +0900
+@@ -169,6 +169,13 @@
+ log_buffer_flush_to_disk(void);
+ /*==========================*/
+ /********************************************************************
++Flushes the log buffer. Forces it to disk depending on the value of
++the configuration parameter innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void);
++/*=============================*/
++/********************************************************************
+ Advances the smallest lsn for which there are unflushed dirty blocks in the
+ buffer pool and also may make a new checkpoint. NOTE: this function may only
+ be called if the calling thread owns no synchronization objects! */
+diff -ruN a/innobase/include/os0file.h b/innobase/include/os0file.h
+--- a/innobase/include/os0file.h 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/include/os0file.h 2009-07-02 16:44:49.000000000 +0900
+@@ -551,8 +551,10 @@
+ /*========*/
+ ulint n, /* in: maximum number of pending aio operations
+ allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++// ulint n_segments, /* in: combined number of segments in the four
++// first aio arrays; must be >= 4 */
++ ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads */
++ ulint n_write_threads, /**/
+ ulint n_slots_sync); /* in: number of slots in the sync aio array */
+ /***********************************************************************
+ Requests an asynchronous i/o operation. */
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/include/srv0srv.h 2009-07-02 18:02:38.000000000 +0900
+@@ -89,6 +89,8 @@
+ extern ulint srv_lock_table_size;
+
+ extern ulint srv_n_file_io_threads;
++extern ulint srv_n_read_io_threads;
++extern ulint srv_n_write_io_threads;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ extern ibool srv_log_archive_on;
+@@ -133,6 +135,15 @@
+ extern ulong srv_max_purge_lag;
+ extern ibool srv_use_awe;
+ extern ibool srv_use_adaptive_hash_indexes;
++
++extern ulint srv_io_capacity;
++extern long long srv_ibuf_max_size;
++extern ulint srv_ibuf_active_contract;
++extern ulint srv_ibuf_accel_rate;
++extern ulint srv_flush_neighbor_pages;
++extern ulint srv_enable_unsafe_group_commit;
++extern uint srv_read_ahead;
++extern uint srv_adaptive_checkpoint;
+ /*-------------------------------------------*/
+
+ extern ulint srv_n_rows_inserted;
+diff -ruN a/innobase/log/log0log.c b/innobase/log/log0log.c
+--- a/innobase/log/log0log.c 2009-05-08 06:12:10.000000000 +0900
++++ b/innobase/log/log0log.c 2009-07-02 16:44:49.000000000 +0900
+@@ -1524,6 +1524,29 @@
+ }
+
+ /********************************************************************
++Flush the log buffer. Force it to disk depending on the value of
++innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void)
++/*=============================*/
++{
++ dulint lsn;
++
++ mutex_enter(&(log_sys->mutex));
++
++ lsn = log_sys->lsn;
++
++ mutex_exit(&(log_sys->mutex));
++
++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */
++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS,
++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE,
++ srv_flush_log_at_trx_commit == 1 ?
++ LOG_WRITE_FROM_BACKGROUND_SYNC :
++ LOG_WRITE_FROM_BACKGROUND_ASYNC);
++}
++/********************************************************************
+ Tries to establish a big enough margin of free space in the log buffer, such
+ that a new log entry can be catenated without an immediate need for a flush. */
+ static
+@@ -3326,6 +3349,15 @@
+ (ulong) ut_dulint_get_high(log_sys->last_checkpoint_lsn),
+ (ulong) ut_dulint_get_low(log_sys->last_checkpoint_lsn));
+
++ fprintf(file,
++ "Max checkpoint age %lu\n"
++ "Modified age %lu\n"
++ "Checkpoint age %lu\n",
++ (ulong) log_sys->max_checkpoint_age,
++ (ulong) ut_dulint_minus(log_sys->lsn,
++ log_buf_pool_get_oldest_modification()),
++ (ulong) ut_dulint_minus(log_sys->lsn, log_sys->last_checkpoint_lsn));
++
+ current_time = time(NULL);
+
+ time_elapsed = 0.001 + difftime(current_time,
+diff -ruN a/innobase/os/os0file.c b/innobase/os/os0file.c
+--- a/innobase/os/os0file.c 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/os/os0file.c 2009-07-02 16:44:49.000000000 +0900
+@@ -66,6 +66,28 @@
+
+ ibool os_aio_print_debug = FALSE;
+
++/* Status of an IO request in simulated AIO.
++ Protocol for simulated aio:
++ client requests IO: find slot with reserved = FALSE. Add entry with
++ status = OS_AIO_NOT_ISSUED.
++ IO thread wakes: find adjacent slots with reserved = TRUE and status =
++ OS_AIO_NOT_ISSUED. Change status for slots to
++ OS_AIO_ISSUED.
++ IO operation completes: set status for slots to OS_AIO_DONE. set status
++ for the first slot to OS_AIO_CLAIMED and return
++ result for that slot.
++ When there are multiple read and write threads, they all compete to execute
++ the requests in the array (os_aio_array_t). This avoids the need to load
++ balance requests at the time the request is made at the cost of waking all
++ threads when a request is available.
++*/
++typedef enum {
++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
++ OS_AIO_ISSUED, /* Being processed by an IO thread. */
++ OS_AIO_DONE, /* Request processed. */
++ OS_AIO_CLAIMED /* Result being returned to client. */
++} os_aio_status;
++
+ /* The aio array slot structure */
+ typedef struct os_aio_slot_struct os_aio_slot_t;
+
+@@ -74,6 +96,8 @@
+ ulint pos; /* index of the slot in the aio
+ array */
+ ibool reserved; /* TRUE if this slot is reserved */
++ os_aio_status status; /* Status for current request. Valid when reserved
++ is TRUE. Used only in simulated aio. */
+ time_t reservation_time;/* time when reserved */
+ ulint len; /* length of the block to read or
+ write */
+@@ -84,11 +108,11 @@
+ ulint offset_high; /* 32 high bits of file offset */
+ os_file_t file; /* file where to read or write */
+ const char* name; /* file name or path */
+- ibool io_already_done;/* used only in simulated aio:
+- TRUE if the physical i/o already
+- made and only the slot message
+- needs to be passed to the caller
+- of os_aio_simulated_handle */
++// ibool io_already_done;/* used only in simulated aio:
++// TRUE if the physical i/o already
++// made and only the slot message
++// needs to be passed to the caller
++// of os_aio_simulated_handle */
+ fil_node_t* message1; /* message which is given by the */
+ void* message2; /* the requester of an aio operation
+ and which can be used to identify
+@@ -137,6 +161,13 @@
+ /* Array of events used in simulated aio */
+ os_event_t* os_aio_segment_wait_events = NULL;
+
++/* Number for the first global segment for reading. */
++const ulint os_aio_first_read_segment = 2;
++
++/* Number for the first global segment for writing. Set to
++os_aio_first_read_segment + n_read_threads when the aio arrays are created. */
++ulint os_aio_first_write_segment = 0;
++
+ /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+ are NULL when the module has not yet been initialized. */
+ static os_aio_array_t* os_aio_read_array = NULL;
+@@ -145,11 +176,17 @@
+ static os_aio_array_t* os_aio_log_array = NULL;
+ static os_aio_array_t* os_aio_sync_array = NULL;
+
++/* Per thread buffer used for merged IO requests. Used by
++os_aio_simulated_handle so that a buffer doesn't have to be allocated
++for each request. */
++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
++
+ static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+ /* If the following is TRUE, read i/o handler threads try to
+ wait until a batch of new read requests have been posted */
+-static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+
+ ulint os_n_file_reads = 0;
+ ulint os_bytes_read_since_printout = 0;
+@@ -2878,8 +2915,10 @@
+ /*========*/
+ ulint n, /* in: maximum number of pending aio operations
+ allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++// ulint n_segments, /* in: combined number of segments in the four
++// first aio arrays; must be >= 4 */
++ ulint n_read_threads, /* n_segments == 2 + n_read_threads + n_write_threads*/
++ ulint n_write_threads, /**/
+ ulint n_slots_sync) /* in: number of slots in the sync aio array */
+ {
+ ulint n_read_segs;
+@@ -2889,6 +2928,8 @@
+ #ifdef POSIX_ASYNC_IO
+ sigset_t sigset;
+ #endif
++ ulint n_segments = 2 + n_read_threads + n_write_threads;
++
+ ut_ad(n % n_segments == 0);
+ ut_ad(n_segments >= 4);
+
+@@ -2896,14 +2937,17 @@
+
+ for (i = 0; i < n_segments; i++) {
+ srv_set_io_thread_op_info(i, "not started yet");
++ os_aio_thread_buffer[i] = 0;
++ os_aio_thread_buffer_size[i] = 0;
+ }
+
+ n_per_seg = n / n_segments;
+- n_write_segs = (n_segments - 2) / 2;
+- n_read_segs = n_segments - 2 - n_write_segs;
++ n_write_segs = n_write_threads;
++ n_read_segs = n_read_threads;
+
+ /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+
++ os_aio_first_write_segment = os_aio_first_read_segment + n_read_threads;
+ os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
+
+ srv_io_thread_function[0] = "insert buffer thread";
+@@ -2912,14 +2956,14 @@
+
+ srv_io_thread_function[1] = "log thread";
+
+- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
++ os_aio_read_array = os_aio_array_create(n_per_seg,
+ n_read_segs);
+ for (i = 2; i < 2 + n_read_segs; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+ srv_io_thread_function[i] = "read thread";
+ }
+
+- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
++ os_aio_write_array = os_aio_array_create(n_per_seg,
+ n_write_segs);
+ for (i = 2 + n_read_segs; i < n_segments; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+@@ -3181,6 +3225,13 @@
+ struct aiocb* control;
+ #endif
+ ulint i;
++ ulint prim_segment;
++ ulint n;
++
++ n = array->n_slots / array->n_segments;
++ /* Stripe requests across the segments in runs of 64 blocks (aligned to max(BUF_READ_AHEAD_AREA)) */
++ prim_segment = ( offset >> (UNIV_PAGE_SIZE_SHIFT + 6) ) % (array->n_segments);
++
+ loop:
+ os_mutex_enter(array->mutex);
+
+@@ -3199,6 +3250,16 @@
+ goto loop;
+ }
+
++ for (i = prim_segment * n; i < array->n_slots; i++) {
++ slot = os_aio_array_get_nth_slot(array, i);
++
++ if (slot->reserved == FALSE) {
++ break;
++ }
++ }
++
++ if (slot->reserved == TRUE){
++ /* Not found after the intended segment. So we should search before. */
+ for (i = 0;; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+@@ -3206,6 +3267,7 @@
+ break;
+ }
+ }
++ }
+
+ array->n_reserved++;
+
+@@ -3228,7 +3290,8 @@
+ slot->buf = buf;
+ slot->offset = offset;
+ slot->offset_high = offset_high;
+- slot->io_already_done = FALSE;
++// slot->io_already_done = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ #ifdef WIN_ASYNC_IO
+ control = &(slot->control);
+@@ -3281,6 +3344,7 @@
+ ut_ad(slot->reserved);
+
+ slot->reserved = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ array->n_reserved--;
+
+@@ -3317,16 +3381,18 @@
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+- n = array->n_slots / array->n_segments;
++ n = array->n_slots;
+
+ /* Look through n slots after the segment * n'th slot */
+
+ os_mutex_enter(array->mutex);
+
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+- if (slot->reserved) {
++ if (slot->reserved &&
++ (slot->status == OS_AIO_NOT_ISSUED ||
++ slot->status == OS_AIO_DONE)) {
+ /* Found an i/o request */
+
+ break;
+@@ -3336,7 +3402,25 @@
+ os_mutex_exit(array->mutex);
+
+ if (i < n) {
+- os_event_set(os_aio_segment_wait_events[global_segment]);
++ if (array == os_aio_ibuf_array) {
++ os_event_set(os_aio_segment_wait_events[0]);
++
++ } else if (array == os_aio_log_array) {
++ os_event_set(os_aio_segment_wait_events[1]);
++
++ } else if (array == os_aio_read_array) {
++ ulint x;
++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else if (array == os_aio_write_array) {
++ ulint x;
++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else {
++ ut_a(0);
++ }
+ }
+ }
+
+@@ -3347,8 +3431,6 @@
+ os_aio_simulated_wake_handler_threads(void)
+ /*=======================================*/
+ {
+- ulint i;
+-
+ if (os_aio_use_native_aio) {
+ /* We do not use simulated aio: do nothing */
+
+@@ -3357,9 +3439,10 @@
+
+ os_aio_recommend_sleep_for_read_threads = FALSE;
+
+- for (i = 0; i < os_aio_n_segments; i++) {
+- os_aio_simulated_wake_handler_thread(i);
+- }
++ os_aio_simulated_wake_handler_thread(0);
++ os_aio_simulated_wake_handler_thread(1);
++ os_aio_simulated_wake_handler_thread(os_aio_first_read_segment);
++ os_aio_simulated_wake_handler_thread(os_aio_first_write_segment);
+ }
+
+ /**************************************************************************
+@@ -3640,7 +3723,7 @@
+ ut_ad(os_aio_validate());
+ ut_ad(segment < array->n_segments);
+
+- n = array->n_slots / array->n_segments;
++ n = array->n_slots;
+
+ if (array == os_aio_sync_array) {
+ os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+@@ -3648,12 +3731,12 @@
+ } else {
+ srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+ i = os_event_wait_multiple(n,
+- (array->native_events) + segment * n);
++ (array->native_events));
+ }
+
+ os_mutex_enter(array->mutex);
+
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+ ut_a(slot->reserved);
+
+@@ -3830,10 +3913,13 @@
+ os_aio_slot_t* slot;
+ os_aio_slot_t* slot2;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
++ os_aio_slot_t* lowest_request;
++ os_aio_slot_t* oldest_request;
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ ulint lowest_offset;
++ ulint oldest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+@@ -3841,6 +3927,7 @@
+ ibool ret;
+ ulint n;
+ ulint i;
++ time_t now;
+
+ segment = os_aio_get_array_and_local_segment(&array, global_segment);
+
+@@ -3853,7 +3940,7 @@
+ ut_ad(os_aio_validate());
+ ut_ad(segment < array->n_segments);
+
+- n = array->n_slots / array->n_segments;
++ n = array->n_slots;
+
+ /* Look through n slots after the segment * n'th slot */
+
+@@ -3875,9 +3962,9 @@
+ done */
+
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+- if (slot->reserved && slot->io_already_done) {
++ if (slot->reserved && slot->status == OS_AIO_DONE) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+@@ -3897,67 +3984,57 @@
+ then pick the one at the lowest offset. */
+
+ biggest_age = 0;
+- lowest_offset = ULINT_MAX;
++ now = time(NULL);
++ oldest_request = lowest_request = NULL;
++ oldest_offset = lowest_offset = ULINT_MAX;
+
++ /* Find the oldest request and the request with the smallest offset */
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot = os_aio_array_get_nth_slot(array, i);
+
+- if (slot->reserved) {
+- age = (ulint)difftime(time(NULL),
+- slot->reservation_time);
++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
++ age = (ulint)difftime(now, slot->reservation_time);
+
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+- && slot->offset < lowest_offset)) {
++ && slot->offset < oldest_offset)) {
+
+ /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
+ biggest_age = age;
+- lowest_offset = slot->offset;
++ oldest_request = slot;
++ oldest_offset = slot->offset;
+ }
+- }
+- }
+-
+- if (n_consecutive == 0) {
+- /* There were no old requests. Look for an i/o request at the
+- lowest offset in the array (we ignore the high 32 bits of the
+- offset in these heuristics) */
+-
+- lowest_offset = ULINT_MAX;
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array,
+- i + segment * n);
+-
+- if (slot->reserved && slot->offset < lowest_offset) {
+
++ /* Look for an i/o request at the lowest offset in the array
++ * (we ignore the high 32 bits of the offset) */
++ if (slot->offset < lowest_offset) {
+ /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
++ lowest_request = slot;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+- if (n_consecutive == 0) {
++ if (!lowest_request && !oldest_request) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+- slot = consecutive_ios[0];
++ if (oldest_request) {
++ slot = oldest_request;
++ } else {
++ slot = lowest_request;
++ }
++ consecutive_ios[0] = slot;
++ n_consecutive = 1;
+
+ /* Check if there are several consecutive blocks to read or write */
+
+ consecutive_loop:
+ for (i = 0; i < n; i++) {
+- slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot2 = os_aio_array_get_nth_slot(array, i);
+
+ if (slot2->reserved && slot2 != slot
+ && slot2->offset == slot->offset + slot->len
+@@ -3965,7 +4042,8 @@
+ sum does not wrap over */
+ && slot2->offset_high == slot->offset_high
+ && slot2->type == slot->type
+- && slot2->file == slot->file) {
++ && slot2->file == slot->file
++ && slot2->status == OS_AIO_NOT_ISSUED) {
+
+ /* Found a consecutive i/o request */
+
+@@ -3994,6 +4072,8 @@
+
+ for (i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_ISSUED;
+ }
+
+ if (n_consecutive == 1) {
+@@ -4001,7 +4081,14 @@
+ combined_buf = slot->buf;
+ combined_buf2 = NULL;
+ } else {
+- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
++ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
++ if (os_aio_thread_buffer[global_segment])
++ ut_free(os_aio_thread_buffer[global_segment]);
++
++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
++ }
++ combined_buf2 = os_aio_thread_buffer[global_segment];
+
+ ut_a(combined_buf2);
+
+@@ -4012,6 +4099,9 @@
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
++ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_ISSUED);
++
+ os_mutex_exit(array->mutex);
+
+ if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+@@ -4081,16 +4171,13 @@
+ }
+ }
+
+- if (combined_buf2) {
+- ut_free(combined_buf2);
+- }
+-
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (i = 0; i < n_consecutive; i++) {
+- consecutive_ios[i]->io_already_done = TRUE;
++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_DONE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+@@ -4100,6 +4187,8 @@
+ slot_io_done:
+
+ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_DONE);
++ slot->status = OS_AIO_CLAIMED;
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-07-02 16:43:23.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-07-02 18:36:54.000000000 +0900
+@@ -167,6 +167,8 @@
+ ulint srv_lock_table_size = ULINT_MAX;
+
+ ulint srv_n_file_io_threads = ULINT_MAX;
++ulint srv_n_read_io_threads = 1;
++ulint srv_n_write_io_threads = 1;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ ibool srv_log_archive_on = FALSE;
+@@ -330,6 +332,24 @@
+ ibool srv_use_awe = FALSE;
+ ibool srv_use_adaptive_hash_indexes = TRUE;
+
++ulint srv_io_capacity = 100; /* number of IO operations per second the server can do */
++
++/* Returns the number of IO operations that is X percent of the capacity.
++PCT_IO(5) -> returns the number of IO operations that is 5% of the max
++where max is srv_io_capacity. pct is parenthesized so expressions are safe. */
++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) (pct) / 100.0)))
++
++long long srv_ibuf_max_size = 0; /* maximum size of the insert buffer, in bytes */
++ulint srv_ibuf_active_contract = 0; /* 0:disable 1:enable */
++ulint srv_ibuf_accel_rate = 100; /* percentage applied on top of srv_io_capacity by PCT_IBUF_IO */
++#define PCT_IBUF_IO(pct) ((ulint) (srv_io_capacity * srv_ibuf_accel_rate * ((double) (pct) / 10000.0)))
++
++ulint srv_flush_neighbor_pages = 1; /* 0:disable 1:enable */
++
++ulint srv_enable_unsafe_group_commit = 0; /* 0:disable 1:enable */
++
++uint srv_read_ahead = 3; /* 0: none 1: random 2: linear 3: both */
++uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
+ /*-------------------------------------------*/
+ ulong srv_n_spin_wait_rounds = 20;
+ ulong srv_n_free_tickets_to_enter = 500;
+@@ -2228,6 +2248,10 @@
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
++
++ dulint lsn_old;
++
++ dulint oldest_lsn;
+
+ #ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+@@ -2244,6 +2268,9 @@
+
+ mutex_exit(&kernel_mutex);
+
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
+ os_event_set(srv_sys->operational);
+ loop:
+ /*****************************************************************/
+@@ -2279,6 +2306,18 @@
+ if (!skip_sleep) {
+
+ os_thread_sleep(1000000);
++ /*
++ mutex_enter(&(log_sys->mutex));
++ oldest_lsn = buf_pool_get_oldest_modification();
++ dulint lsn = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++
++ if (!ut_dulint_is_zero(oldest_lsn))
++ fprintf(stderr,
++ "InnoDB flush: age pct: %lu, lsn progress: %lu\n",
++ ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age,
++ ut_dulint_minus(lsn, lsn_old));
++ */
+ }
+
+ skip_sleep = FALSE;
+@@ -2317,13 +2356,14 @@
+ + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
++ if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
+ }
+
+ if (buf_get_modified_ratio_pct() >
+@@ -2332,7 +2372,7 @@
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ ut_dulint_max);
+
+ /* If we had to do the flush, it may have taken
+@@ -2341,6 +2381,140 @@
+ iteration of this loop. */
+
+ skip_sleep = TRUE;
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++ } else if (srv_adaptive_checkpoint == 1) {
++
++ /* Try to keep the modified age from exceeding the
++ max_checkpoint_age * 7/8 line */
++
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ oldest_lsn = buf_pool_get_oldest_modification();
++ if (ut_dulint_is_zero(oldest_lsn)) {
++
++ mutex_exit(&(log_sys->mutex));
++
++ } else {
++ if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
++ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
++ /* We should not flush from here. */
++ mutex_exit(&(log_sys->mutex));
++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 4)) {
++
++ /* 2nd defence line (max_checkpoint_age * 3/4) */
++
++ mutex_exit(&(log_sys->mutex));
++
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
++ ut_dulint_max);
++ skip_sleep = TRUE;
++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age)/2 ) {
++
++ /* 1st defence line (max_checkpoint_age * 1/2) */
++
++ mutex_exit(&(log_sys->mutex));
++
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
++ ut_dulint_max);
++ skip_sleep = TRUE;
++ } else {
++ mutex_exit(&(log_sys->mutex));
++ }
++ }
++ } else if (srv_adaptive_checkpoint == 2) {
++
++ /* Try to keep the modified age from exceeding the
++ max_checkpoint_age * 7/8 line */
++
++ mutex_enter(&(log_sys->mutex));
++
++ oldest_lsn = buf_pool_get_oldest_modification();
++ if (ut_dulint_is_zero(oldest_lsn)) {
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++
++ } else {
++ if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age) - ((log_sys->max_checkpoint_age) / 8)) {
++ /* LOG_POOL_PREFLUSH_RATIO_ASYNC is exceeded. */
++ /* We should not flush from here. */
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++ } else if (ut_dulint_minus(log_sys->lsn, oldest_lsn)
++ > (log_sys->max_checkpoint_age)/2 ) {
++
++ /* defence line (max_checkpoint_age * 1/2) */
++ dulint lsn = log_sys->lsn;
++
++ mutex_exit(&(log_sys->mutex));
++
++ ib_longlong level, bpl;
++ buf_block_t* bpage;
++
++ mutex_enter(&buf_pool->mutex);
++
++ level = 0;
++ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
++
++ while (bpage != NULL) {
++ dulint oldest_modification = bpage->oldest_modification;
++ if (!ut_dulint_is_zero(oldest_modification)) {
++ level += log_sys->max_checkpoint_age
++ - ut_dulint_minus(lsn, oldest_modification);
++ }
++ bpage = UT_LIST_GET_NEXT(flush_list, bpage);
++ }
++
++ if (level) {
++ bpl = ((ib_longlong) UT_LIST_GET_LEN(buf_pool->flush_list)
++ * UT_LIST_GET_LEN(buf_pool->flush_list)
++ * ut_dulint_minus(lsn, lsn_old)) / level;
++ } else {
++ bpl = 0;
++ }
++
++ mutex_exit(&buf_pool->mutex);
++
++ if (!srv_use_doublewrite_buf) {
++ /* flushing is faster when the doublewrite buffer is not used */
++ bpl = (bpl * 3) / 4;
++ }
++
++ if(bpl) {
++retry_flush_batch:
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
++ bpl,
++ ut_dulint_add(oldest_lsn,
++ ut_dulint_minus(lsn,
++ lsn_old)));
++ if (n_pages_flushed == ULINT_UNDEFINED) {
++ os_thread_sleep(5000);
++ goto retry_flush_batch;
++ }
++ }
++
++ lsn_old = lsn;
++ /*
++ fprintf(stderr,
++ "InnoDB flush: age pct: %lu, lsn progress: %lu, blocks to flush:%llu\n",
++ ut_dulint_minus(lsn, oldest_lsn) * 100 / log_sys->max_checkpoint_age,
++ ut_dulint_minus(lsn, lsn_old), bpl);
++ */
++ } else {
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
++ }
++ }
++
++ } else {
++ mutex_enter(&(log_sys->mutex));
++ lsn_old = log_sys->lsn;
++ mutex_exit(&(log_sys->mutex));
+ }
+
+ if (srv_activity_count == old_activity_count) {
+@@ -2367,23 +2541,25 @@
+ n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
++ if (n_pend_ios < 3 && (n_ios - n_ios_very_old < PCT_IO(200))) {
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
+ }
+
+ /* We run a batch of insert buffer merge every 10 seconds,
+ even if the server were active */
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
+
+ /* We run a full purge every 10 seconds, even if the server
+ were active */
+@@ -2422,14 +2598,14 @@
+ (> 70 %), we assume we can afford reserving the disk(s) for
+ the time it requires to flush 100 pages */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ ut_dulint_max);
+ } else {
+ /* Otherwise, we only flush a small number of pages so that
+ we do not unnecessarily use much disk i/o capacity from
+ other work */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+ ut_dulint_max);
+ }
+
+@@ -2518,7 +2694,7 @@
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+ n_bytes_merged = 0;
+ } else {
+- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IBUF_IO(100));
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+@@ -2535,7 +2711,7 @@
+
+ if (srv_fast_shutdown < 2) {
+ n_pages_flushed =
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+ } else {
+ /* In the fastest shutdown we do not flush the buffer pool
+ to data files: we set n_pages_flushed to 0 artificially. */
+@@ -2557,7 +2733,14 @@
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ current_time = time(NULL);
++ if (difftime(current_time, last_flush_time) > 1) {
++ log_buffer_flush_to_disk();
++ last_flush_time = current_time;
++ } else {
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ }
+
+ srv_main_thread_op_info = "making checkpoint";
+
+diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c 2009-05-08 06:12:12.000000000 +0900
++++ b/innobase/srv/srv0start.c 2009-07-02 16:44:49.000000000 +0900
+@@ -1205,24 +1205,28 @@
+ return(DB_ERROR);
+ }
+
++ /* overwrite innodb_file_io_threads */
++ srv_n_file_io_threads = 2 + srv_n_read_io_threads + srv_n_write_io_threads;
++
+ /* Restrict the maximum number of file i/o threads */
+ if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+
+ srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
++ srv_n_read_io_threads = srv_n_write_io_threads = (SRV_MAX_N_IO_THREADS - 2) / 2;
+ }
+
+ if (!os_aio_use_native_aio) {
+ /* In simulated aio we currently have use only for 4 threads */
+- srv_n_file_io_threads = 4;
++ /*srv_n_file_io_threads = 4;*/
+
+ os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
+ * srv_n_file_io_threads,
+- srv_n_file_io_threads,
++ srv_n_read_io_threads, srv_n_write_io_threads,
+ SRV_MAX_N_PENDING_SYNC_IOS);
+ } else {
+ os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
+ * srv_n_file_io_threads,
+- srv_n_file_io_threads,
++ srv_n_read_io_threads, srv_n_write_io_threads,
+ SRV_MAX_N_PENDING_SYNC_IOS);
+ }
+
+diff -ruN a/patch_info/innodb_io_patches.info b/patch_info/innodb_io_patches.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_io_patches.info 2009-07-02 16:44:49.000000000 +0900
+@@ -0,0 +1,11 @@
++File=innodb_io_patches.patch
++Name=Cluster of past InnoDB IO patches
++Version=1.1
++Author=Percona
++License=GPL
++Comment=This patch contains fixed (control_flush_and_merge_and_read, control_io-threads, adaptive_flush)
++ChangeLog=
++2008-11-06
++YK: Initial release
++2009-01-09
++YK: Some parameters are added
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-07-02 16:44:49.000000000 +0900
+@@ -149,6 +149,7 @@
+ innobase_lock_wait_timeout, innobase_force_recovery,
+ innobase_open_files;
+
++long innobase_read_io_threads, innobase_write_io_threads;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+
+ /* The default values for the following char* start-up parameters
+@@ -1417,6 +1418,8 @@
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
++ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
++ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ srv_force_recovery = (ulint) innobase_force_recovery;
+@@ -7330,6 +7333,10 @@
+ trx_t* trx = check_trx_exists(thd);
+
+ if (thd->lex->sql_command != SQLCOM_XA_PREPARE) {
++ if (srv_enable_unsafe_group_commit && !thd->variables.innodb_support_xa) {
++ /* choose group commit rather than binlog order */
++ return(0);
++ }
+
+ /* For ibbackup to work the order of transactions in binlog
+ and InnoDB must be the same. Consider the situation
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/ha_innodb.h 2009-07-02 18:10:51.000000000 +0900
+@@ -204,6 +204,7 @@
+ extern long innobase_additional_mem_pool_size;
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
++extern long innobase_read_io_threads, innobase_write_io_threads;
+ extern long innobase_force_recovery;
+ extern long innobase_open_files;
+ extern char *innobase_data_home_dir, *innobase_data_file_path;
+@@ -234,6 +235,15 @@
+ extern ulong srv_thread_concurrency;
+ extern ulong srv_commit_concurrency;
+ extern ulong srv_flush_log_at_trx_commit;
++extern ulong srv_io_capacity;
++extern long long srv_ibuf_max_size;
++extern ulong srv_ibuf_active_contract;
++extern ulong srv_ibuf_accel_rate;
++extern ulong srv_flush_neighbor_pages;
++extern ulong srv_enable_unsafe_group_commit;
++extern uint srv_read_ahead;
++extern uint srv_adaptive_checkpoint;
++
+ /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does
+ NOT update cardinality for indexes of InnoDB table". By default we are
+ running with the fix disabled because MySQL 5.1 is frozen for such
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/mysqld.cc 2009-07-02 18:00:04.000000000 +0900
+@@ -5086,6 +5086,16 @@
+ OPT_INNODB_ROLLBACK_ON_TIMEOUT,
+ OPT_SECURE_FILE_PRIV,
+ OPT_KEEP_FILES_ON_CREATE,
++ OPT_INNODB_IO_CAPACITY,
++ OPT_INNODB_IBUF_MAX_SIZE,
++ OPT_INNODB_IBUF_ACTIVE_CONTRACT,
++ OPT_INNODB_IBUF_ACCEL_RATE,
++ OPT_INNODB_FLUSH_NEIGHBOR_PAGES,
++ OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT,
++ OPT_INNODB_READ_AHEAD,
++ OPT_INNODB_ADAPTIVE_CHECKPOINT,
++ OPT_INNODB_READ_IO_THREADS,
++ OPT_INNODB_WRITE_IO_THREADS,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
+ OPT_FEDERATED,
+ OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM
+@@ -5403,6 +5413,44 @@
+ (gptr*) &srv_use_legacy_cardinality_algorithm,
+ (gptr*) &srv_use_legacy_cardinality_algorithm,
+ 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
++ "Number of IO operations per second the server can do. Tunes background IO rate.",
++ (gptr*) &srv_io_capacity, (gptr*) &srv_io_capacity,
++ 0, GET_ULONG, REQUIRED_ARG, 200, 100, 999999999, 0, 0, 0},
++ {"innodb_ibuf_max_size", OPT_INNODB_IBUF_MAX_SIZE,
++ "The maximum size of the insert buffer. (in bytes)",
++ (gptr*) &srv_ibuf_max_size, (gptr*) &srv_ibuf_max_size, 0,
++ GET_LL, REQUIRED_ARG, LONGLONG_MAX, 0, LONGLONG_MAX, 0, 0, 0},
++ {"innodb_ibuf_active_contract", OPT_INNODB_IBUF_ACTIVE_CONTRACT,
++ "Enable/Disable active_contract of insert buffer. 0:disable 1:enable",
++ (gptr*) &srv_ibuf_active_contract, (gptr*) &srv_ibuf_active_contract,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++ {"innodb_ibuf_accel_rate", OPT_INNODB_IBUF_ACCEL_RATE,
++ "Tunes amount of insert buffer processing of background, in addition to innodb_io_capacity. (in percentage)",
++ (gptr*) &srv_ibuf_accel_rate, (gptr*) &srv_ibuf_accel_rate,
++ 0, GET_ULONG, REQUIRED_ARG, 100, 100, 999999999, 0, 0, 0},
++ {"innodb_flush_neighbor_pages", OPT_INNODB_FLUSH_NEIGHBOR_PAGES,
++ "Enable/Disable flushing also neighbor pages. 0:disable 1:enable",
++ (gptr*) &srv_flush_neighbor_pages, (gptr*) &srv_flush_neighbor_pages,
++ 0, GET_ULONG, REQUIRED_ARG, 1, 0, 1, 0, 0, 0},
++ {"innodb_read_ahead", OPT_INNODB_READ_AHEAD,
++ "Control read ahead activity. (none, random, linear, [both])",
++ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++ {"innodb_adaptive_checkpoint", OPT_INNODB_ADAPTIVE_CHECKPOINT,
++ "Enable/Disable flushing along modified age. ([none], reflex, estimate)",
++ 0, 0, 0, GET_ULONG, REQUIRED_ARG, 0, 0, 0, 0, 0, 0}, /* no value pointer: the argument is parsed in the OPT_INNODB_ADAPTIVE_CHECKPOINT case */
++ {"innodb_enable_unsafe_group_commit", OPT_INNODB_ENABLE_UNSAFE_GROUP_COMMIT,
++ "Enable/Disable unsafe group commit when support_xa=OFF and use with binlog or other XA storage engine.",
++ (gptr*) &srv_enable_unsafe_group_commit, (gptr*) &srv_enable_unsafe_group_commit,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
++ "Number of background read I/O threads in InnoDB.",
++ (gptr*) &innobase_read_io_threads, (gptr*) &innobase_read_io_threads,
++ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
++ "Number of background write I/O threads in InnoDB.",
++ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
++ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
+ #endif /* End HAVE_INNOBASE_DB */
+ {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
+ (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
+@@ -7644,6 +7692,38 @@
+ case OPT_INNODB_LOG_ARCHIVE:
+ innobase_log_archive= argument ? test(atoi(argument)) : 1;
+ break;
++ case OPT_INNODB_READ_AHEAD:
++ if (argument == disabled_my_option)
++ srv_read_ahead = 0;
++ else if (! argument)
++ srv_read_ahead = 3;
++ else
++ {
++ int type;
++ if ((type=find_type(argument, &innodb_read_ahead_typelib, 2)) <= 0)
++ {
++ fprintf(stderr,"Unknown innodb_read_ahead type: %s\n",argument);
++ exit(1);
++ }
++ srv_read_ahead = (uint) ((type - 1) & 3);
++ }
++ break;
++ case OPT_INNODB_ADAPTIVE_CHECKPOINT:
++ if (argument == disabled_my_option)
++ srv_adaptive_checkpoint = 0;
++ else if (! argument)
++ srv_adaptive_checkpoint = 0;
++ else
++ {
++ int type;
++ if ((type=find_type(argument, &innodb_adaptive_checkpoint_typelib, 2)) <= 0)
++ {
++ fprintf(stderr,"Unknown innodb_adaptive_checkpoint type: %s\n",argument);
++ exit(1);
++ }
++ srv_adaptive_checkpoint = (uint) ((type - 1) % 3);
++ }
++ break;
+ #endif /* HAVE_INNOBASE_DB */
+ case OPT_MYISAM_RECOVER:
+ {
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/set_var.cc 2009-07-02 17:45:29.000000000 +0900
+@@ -489,6 +489,57 @@
+ sys_var_long_ptr sys_innodb_flush_log_at_trx_commit(
+ "innodb_flush_log_at_trx_commit",
+ &srv_flush_log_at_trx_commit);
++sys_var_long_ptr sys_innodb_io_capacity("innodb_io_capacity",
++ &srv_io_capacity);
++sys_var_long_ptr sys_innodb_ibuf_active_contract("innodb_ibuf_active_contract",
++ &srv_ibuf_active_contract);
++sys_var_long_ptr sys_innodb_ibuf_accel_rate("innodb_ibuf_accel_rate",
++ &srv_ibuf_accel_rate);
++sys_var_long_ptr sys_innodb_flush_neighbor_pages("innodb_flush_neighbor_pages",
++ &srv_flush_neighbor_pages);
++
++const char *innodb_read_ahead_names[]=
++{
++ "none", /* 0 */
++ "random",
++ "linear",
++ "both", /* 3 */
++ /* For compatibility of the older patch */
++ "0", /* 4 ("none" + 4) */
++ "1",
++ "2",
++ "3", /* 7 ("both" + 4) */
++ NullS
++};
++TYPELIB innodb_read_ahead_typelib=
++{
++ array_elements(innodb_read_ahead_names) - 1, "innodb_read_ahead_typelib",
++ innodb_read_ahead_names, NULL
++};
++sys_var_enum sys_innodb_read_ahead("innodb_read_ahead", &srv_read_ahead,
++ &innodb_read_ahead_typelib, fix_innodb_read_ahead);
++sys_var_long_ptr sys_innodb_enable_unsafe_group_commit("innodb_enable_unsafe_group_commit",
++ &srv_enable_unsafe_group_commit);
++
++const char *innodb_adaptive_checkpoint_names[]=
++{
++ "none", /* 0 */
++ "reflex", /* 1 */
++ "estimate", /* 2 */
++ /* For compatibility of the older patch */
++ "0", /* 3 ("none" + 3) */
++ "1", /* 4 ("reflex" + 3) */
++ "2", /* 5 ("estimate" + 3) */
++ NullS
++};
++TYPELIB innodb_adaptive_checkpoint_typelib=
++{
++ array_elements(innodb_adaptive_checkpoint_names) - 1, "innodb_adaptive_checkpoint_typelib",
++ innodb_adaptive_checkpoint_names, NULL
++};
++sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint",
++ &srv_adaptive_checkpoint,
++ &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint);
+ sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path",
+ &innobase_data_file_path);
+ sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir",
+@@ -860,6 +911,13 @@
+ &sys_innodb_thread_concurrency,
+ &sys_innodb_commit_concurrency,
+ &sys_innodb_flush_log_at_trx_commit,
++ &sys_innodb_io_capacity,
++ &sys_innodb_ibuf_active_contract,
++ &sys_innodb_ibuf_accel_rate,
++ &sys_innodb_flush_neighbor_pages,
++ &sys_innodb_read_ahead,
++ &sys_innodb_enable_unsafe_group_commit,
++ &sys_innodb_adaptive_checkpoint,
+ #endif
+ &sys_trust_routine_creators,
+ &sys_trust_function_creators,
+@@ -997,6 +1055,16 @@
+ {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
+ {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
+ {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
++ {sys_innodb_io_capacity.name, (char*) &sys_innodb_io_capacity, SHOW_SYS},
++ {"innodb_ibuf_max_size", (char*) &srv_ibuf_max_size, SHOW_LONGLONG},
++ {sys_innodb_ibuf_active_contract.name, (char*) &sys_innodb_ibuf_active_contract, SHOW_SYS},
++ {sys_innodb_ibuf_accel_rate.name, (char*) &sys_innodb_ibuf_accel_rate, SHOW_SYS},
++ {sys_innodb_flush_neighbor_pages.name, (char*) &sys_innodb_flush_neighbor_pages, SHOW_SYS},
++ {sys_innodb_read_ahead.name, (char*) &sys_innodb_read_ahead, SHOW_SYS},
++ {sys_innodb_enable_unsafe_group_commit.name, (char*) &sys_innodb_enable_unsafe_group_commit, SHOW_SYS},
++ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
+ {sys_innodb_use_legacy_cardinality_algorithm.name,
+ (char*) &sys_innodb_use_legacy_cardinality_algorithm, SHOW_SYS},
+ #endif
+@@ -1459,6 +1527,18 @@
+ }
+ }
+
++#ifdef HAVE_INNOBASE_DB
++extern void fix_innodb_read_ahead(THD *thd, enum_var_type type) /* after-update hook passed to sys_innodb_read_ahead */
++{
++ srv_read_ahead &= 3; /* fold the compatibility names "0".."3" (typelib indexes 4-7) onto 0-3 */
++}
++
++extern void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type) /* after-update hook passed to sys_innodb_adaptive_checkpoint */
++{
++ srv_adaptive_checkpoint %= 3; /* fold the compatibility names "0".."2" (typelib indexes 3-5) onto 0-2 */
++}
++#endif /* HAVE_INNOBASE_DB */
++
+ static void fix_max_binlog_size(THD *thd, enum_var_type type)
+ {
+ DBUG_ENTER("fix_max_binlog_size");
+diff -ruN a/sql/set_var.h b/sql/set_var.h
+--- a/sql/set_var.h 2009-07-02 16:43:23.000000000 +0900
++++ b/sql/set_var.h 2009-07-02 17:35:17.000000000 +0900
+@@ -31,6 +31,11 @@
+
+ extern TYPELIB bool_typelib, delay_key_write_typelib, sql_mode_typelib;
+
++#ifdef HAVE_INNOBASE_DB
++extern TYPELIB innodb_read_ahead_typelib;
++extern TYPELIB innodb_adaptive_checkpoint_typelib;
++#endif /* HAVE_INNOBASE_DB */
++
+ typedef int (*sys_check_func)(THD *, set_var *);
+ typedef bool (*sys_update_func)(THD *, set_var *);
+ typedef void (*sys_after_update_func)(THD *,enum_var_type);
+@@ -1148,6 +1153,10 @@
+ int sql_set_variables(THD *thd, List<set_var_base> *var_list);
+ bool not_all_support_one_shot(List<set_var_base> *var_list);
+ void fix_delay_key_write(THD *thd, enum_var_type type);
++#ifdef HAVE_INNOBASE_DB
++void fix_innodb_read_ahead(THD *thd, enum_var_type type);
++void fix_innodb_adaptive_checkpoint(THD *thd, enum_var_type type);
++#endif /* HAVE_INNOBASE_DB */
+ ulong fix_sql_mode(ulong sql_mode);
+ extern sys_var_const_str sys_charset_system;
+ extern sys_var_str sys_init_connect;
diff --git a/percona/5.0.91-b22-20100522/innodb_io_pattern.patch b/percona/5.0.91-b22-20100522/innodb_io_pattern.patch
new file mode 100644
index 0000000..d9e60e9
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_io_pattern.patch
@@ -0,0 +1,693 @@
+diff -r d4826c0a98c2 include/mysql_com.h
+--- a/include/mysql_com.h Wed Jul 29 09:58:58 2009 -0700
++++ b/include/mysql_com.h Wed Jul 29 10:00:12 2009 -0700
+@@ -122,6 +122,9 @@
+ #define REFRESH_DES_KEY_FILE 0x40000L
+ #define REFRESH_USER_RESOURCES 0x80000L
+
++/* TRUNCATE INFORMATION_SCHEMA.INNODB_IO_PATTERN */
++#define REFRESH_INNODB_IO_PATTERN 0x1000000L
++
+ #define CLIENT_LONG_PASSWORD 1 /* new more secure passwords */
+ #define CLIENT_FOUND_ROWS 2 /* Found instead of affected rows */
+ #define CLIENT_LONG_FLAG 4 /* Get all column flags */
+diff -r d4826c0a98c2 innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c Wed Jul 29 09:58:58 2009 -0700
++++ b/innobase/buf/buf0buf.c Wed Jul 29 10:00:12 2009 -0700
+@@ -654,6 +654,9 @@
+ }
+
+ buf_pool->page_hash = hash_create(2 * max_size);
++ buf_pool->io_counter_hash = NULL;
++ buf_pool->io_counter_heap = NULL;
++ buf_pool->io_counters = 0;
+
+ buf_pool->n_pend_reads = 0;
+
+@@ -1967,6 +1970,9 @@
+ ulint io_type;
+ ulint read_page_no;
+
++ buf_io_counter_t* io_counter;
++ ulint fold;
++
+ ut_ad(block);
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+@@ -2068,6 +2074,26 @@
+ buf_pool->n_pages_read++;
+
+ rw_lock_x_unlock_gen(&(block->lock), BUF_IO_READ);
++ /* io_counter here */
++ if (srv_io_pattern && srv_io_pattern_trace_running) {
++ fold = buf_page_address_fold(block->space, block->offset);
++ HASH_SEARCH(hash, buf_pool->io_counter_hash, fold, io_counter,
++ (io_counter->space == block->space) && (io_counter->offset == block->offset));
++ if (io_counter == NULL && buf_pool->io_counters < srv_io_pattern_size_limit) {
++ io_counter = mem_heap_alloc(buf_pool->io_counter_heap,(sizeof(buf_io_counter_t)));
++ io_counter->space = block->space;
++ io_counter->offset = block->offset;
++ io_counter->n_read = 0;
++ io_counter->n_write = 0;
++ HASH_INSERT(buf_io_counter_t, hash, buf_pool->io_counter_hash,
++ buf_page_address_fold(block->space, block->offset), io_counter);
++ buf_pool->io_counters++;
++ }
++ if (io_counter != NULL) {
++ io_counter->index_id = ut_dulint_get_low(btr_page_get_index_id(buf_block_get_frame(block)));
++ io_counter->n_read++;
++ }
++ }
+
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+@@ -2083,6 +2109,26 @@
+ buf_flush_write_complete(block);
+
+ rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE);
++ /* io_counter here */
++ if (srv_io_pattern && srv_io_pattern_trace_running) {
++ fold = buf_page_address_fold(block->space, block->offset);
++ HASH_SEARCH(hash, buf_pool->io_counter_hash, fold, io_counter,
++ (io_counter->space == block->space) && (io_counter->offset == block->offset));
++ if (io_counter == NULL && buf_pool->io_counters < srv_io_pattern_size_limit) {
++ io_counter = mem_heap_alloc(buf_pool->io_counter_heap,(sizeof(buf_io_counter_t)));
++ io_counter->space = block->space;
++ io_counter->offset = block->offset;
++ io_counter->n_read = 0;
++ io_counter->n_write = 0;
++ HASH_INSERT(buf_io_counter_t, hash, buf_pool->io_counter_hash,
++ buf_page_address_fold(block->space, block->offset), io_counter);
++ buf_pool->io_counters++;
++ }
++ if (io_counter != NULL) {
++ io_counter->index_id = ut_dulint_get_low(btr_page_get_index_id(buf_block_get_frame(block)));
++ io_counter->n_write++;
++ }
++ }
+
+ buf_pool->n_pages_written++;
+
+@@ -2657,3 +2703,58 @@
+ return buf_pool_get_nth_block(buf_pool, i);
+
+ }
++
++/*************************************************************************
++Creates or frees the internal hash table used for IO pattern tracing,
++according to the current value of srv_io_pattern_trace. */
++
++void
++buf_io_counter_control(void)
++/*========================*/
++{
++ ulint n;
++
++ mutex_enter(&(buf_pool->mutex));
++ if (srv_io_pattern_trace) {
++ if (buf_pool->io_counter_hash == NULL) {
++ /* estimating (buf_pool * 10) */
++ buf_pool->io_counter_hash = hash_create(20 * buf_pool->max_size);
++ buf_pool->io_counter_heap = mem_heap_create(4096 * 1024);
++ buf_pool->io_counters = 0;
++
++ srv_io_pattern = TRUE; /* set last: hash and heap are ready for use */
++ }
++ } else {
++ if (buf_pool->io_counter_hash != NULL) {
++ srv_io_pattern = FALSE; /* cleared first, before the hash is torn down */
++
++ for (n = 0; n < buf_pool->io_counter_hash->n_cells; n++) {
++ (buf_pool->io_counter_hash->array + n)->node = NULL; /* detach the chain; counters are allocated from io_counter_heap */
++ }
++ mem_heap_free(buf_pool->io_counter_heap);
++ buf_pool->io_counter_heap = NULL;
++ buf_pool->io_counters = 0;
++
++ hash_table_free(buf_pool->io_counter_hash);
++ buf_pool->io_counter_hash = NULL;
++ }
++ }
++ mutex_exit(&(buf_pool->mutex));
++}
++
++void
++buf_io_counter_clear(void)
++/*======================*/ /* Discards all recorded IO counters; the hash table itself stays allocated. */
++{
++ ulint n;
++
++ mutex_enter(&(buf_pool->mutex));
++ if (buf_pool->io_counter_hash != NULL) {
++ for (n = 0; n < buf_pool->io_counter_hash->n_cells; n++) {
++ (buf_pool->io_counter_hash->array + n)->node = NULL; /* detach the chain before its nodes are released below */
++ }
++ mem_heap_empty(buf_pool->io_counter_heap); /* heap is emptied, not freed, so it can be reused */
++ buf_pool->io_counters = 0;
++ }
++ mutex_exit(&(buf_pool->mutex));
++}
+diff -r d4826c0a98c2 innobase/include/buf0buf.h
+--- a/innobase/include/buf0buf.h Wed Jul 29 09:58:58 2009 -0700
++++ b/innobase/include/buf0buf.h Wed Jul 29 10:00:12 2009 -0700
+@@ -709,6 +709,18 @@
+ void buf_pool_dump(void);
+ buf_block_t* buf_pool_get_nth_block_no_inline(buf_pool_t* pool, ulint i);
+
++
++/*************************************************************************
++Creates or frees the internal hash table for IO pattern tracing
++according to the innodb_io_pattern_trace value. */
++
++void
++buf_io_counter_control(void);
++/*=========================*/
++
++void
++buf_io_counter_clear(void);
++/*=======================*/
+
+ /* The buffer control block structure */
+
+@@ -930,6 +942,9 @@
+ ulint curr_size; /* current pool size in pages;
+ currently always the same as
+ max_size */
++ hash_table_t* io_counter_hash;
++ mem_heap_t* io_counter_heap;
++ ulint io_counters;
+ hash_table_t* page_hash; /* hash table of the file pages */
+
+ ulint n_pend_reads; /* number of pending read operations */
+@@ -1015,6 +1030,15 @@
+ locki table, are not in this list */
+ };
+
++struct buf_io_counter_struct{
++ ulint space;
++ ulint offset;
++ buf_io_counter_t* hash;
++ ulint index_id;
++ ulint n_read;
++ ulint n_write;
++};
++
+ /* States of a control block */
+ #define BUF_BLOCK_NOT_USED 211 /* is in the free list */
+ #define BUF_BLOCK_READY_FOR_USE 212 /* when buf_get_free_block returns
+diff -r d4826c0a98c2 innobase/include/buf0types.h
+--- a/innobase/include/buf0types.h Wed Jul 29 09:58:58 2009 -0700
++++ b/innobase/include/buf0types.h Wed Jul 29 10:00:12 2009 -0700
+@@ -12,6 +12,8 @@
+ typedef struct buf_block_struct buf_block_t;
+ typedef struct buf_pool_struct buf_pool_t;
+
++typedef struct buf_io_counter_struct buf_io_counter_t;
++
+ /* The 'type' used of a buffer frame */
+ typedef byte buf_frame_t;
+
+diff -r d4826c0a98c2 innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Wed Jul 29 09:58:58 2009 -0700
++++ b/innobase/include/srv0srv.h Wed Jul 29 10:00:12 2009 -0700
+@@ -146,6 +146,11 @@
+ extern ulint srv_enable_unsafe_group_commit;
+ extern uint srv_read_ahead;
+ extern uint srv_adaptive_checkpoint;
++
++extern volatile ibool srv_io_pattern;
++extern ulong srv_io_pattern_trace;
++extern ulong srv_io_pattern_trace_running;
++extern ulong srv_io_pattern_size_limit;
+ /*-------------------------------------------*/
+
+ extern ulint srv_n_rows_inserted;
+diff -r d4826c0a98c2 innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Wed Jul 29 09:58:58 2009 -0700
++++ b/innobase/srv/srv0srv.c Wed Jul 29 10:00:12 2009 -0700
+@@ -352,6 +352,11 @@
+
+ uint srv_read_ahead = 3; /* 1: random 2: linear 3: Both */
+ uint srv_adaptive_checkpoint = 0; /* 0: none 1: reflex 2: estimate */
++
++volatile ibool srv_io_pattern = FALSE; /* TRUE while the IO counter hash table exists */
++ulong srv_io_pattern_trace = 0; /* ulong, matching the extern declarations and the MySQL option/sys_var code */
++ulong srv_io_pattern_trace_running = 0; /* nonzero: completed page reads/writes are counted */
++ulong srv_io_pattern_size_limit = (ulong) ULINT_MAX - (1024 * 1024); /* max number of counter entries */
+ /*-------------------------------------------*/
+ ulong srv_n_spin_wait_rounds = 20;
+ ulong srv_n_free_tickets_to_enter = 500;
+diff -r d4826c0a98c2 mysql-test/r/information_schema.result
+--- a/mysql-test/r/information_schema.result Wed Jul 29 09:58:58 2009 -0700
++++ b/mysql-test/r/information_schema.result Wed Jul 29 10:00:12 2009 -0700
+@@ -59,6 +59,7 @@
+ USER_PRIVILEGES
+ USER_STATISTICS
+ VIEWS
++INNODB_IO_PATTERN
+ columns_priv
+ db
+ func
+@@ -742,7 +743,7 @@
+ CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1;
+ CREATE VIEW a2 AS SELECT t_CRASHME FROM a1;
+ count(*)
+-108
++109
+ drop view a2, a1;
+ drop table t_crashme;
+ select table_schema,table_name, column_name from
+@@ -812,12 +813,13 @@
+ TABLE_PRIVILEGES TABLE_NAME select
+ TABLE_STATISTICS TABLE_NAME select
+ VIEWS TABLE_NAME select
++INNODB_IO_PATTERN TABLE_NAME select
+ delete from mysql.user where user='mysqltest_4';
+ delete from mysql.db where user='mysqltest_4';
+ flush privileges;
+ SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA;
+ table_schema count(*)
+-information_schema 23
++information_schema 24
+ mysql 17
+ create table t1 (i int, j int);
+ create trigger trg1 before insert on t1 for each row
+@@ -1225,6 +1227,7 @@
+ USER_PRIVILEGES GRANTEE
+ USER_STATISTICS USER
+ VIEWS TABLE_SCHEMA
++INNODB_IO_PATTERN SPACE
+ SELECT t.table_name, c1.column_name
+ FROM information_schema.tables t
+ INNER JOIN
+@@ -1263,6 +1266,7 @@
+ USER_PRIVILEGES GRANTEE
+ USER_STATISTICS USER
+ VIEWS TABLE_SCHEMA
++INNODB_IO_PATTERN SPACE
+ SELECT MAX(table_name) FROM information_schema.tables;
+ MAX(table_name)
+ VIEWS
+@@ -1337,6 +1341,7 @@
+ COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1
+ INDEX_STATISTICS information_schema.INDEX_STATISTICS 1
+ INNODB_BUFFER_POOL_CONTENT information_schema.INNODB_BUFFER_POOL_CONTENT 1
++INNODB_IO_PATTERN information_schema.INNODB_IO_PATTERN 1
+ KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
+ PROCESSLIST information_schema.PROCESSLIST 1
+ PROFILING information_schema.PROFILING 1
+diff -r d4826c0a98c2 mysql-test/r/information_schema_db.result
+--- a/mysql-test/r/information_schema_db.result Wed Jul 29 09:58:58 2009 -0700
++++ b/mysql-test/r/information_schema_db.result Wed Jul 29 10:00:12 2009 -0700
+@@ -28,6 +28,7 @@
+ USER_PRIVILEGES
+ USER_STATISTICS
+ VIEWS
++INNODB_IO_PATTERN
+ show tables from INFORMATION_SCHEMA like 'T%';
+ Tables_in_information_schema (T%)
+ TABLES
+diff -r d4826c0a98c2 mysql-test/r/mysqlshow.result
+--- a/mysql-test/r/mysqlshow.result Wed Jul 29 09:58:58 2009 -0700
++++ b/mysql-test/r/mysqlshow.result Wed Jul 29 10:00:12 2009 -0700
+@@ -102,6 +102,7 @@
+ | USER_PRIVILEGES |
+ | USER_STATISTICS |
+ | VIEWS |
++| INNODB_IO_PATTERN |
+ +---------------------------------------+
+ Database: INFORMATION_SCHEMA
+ +---------------------------------------+
+@@ -130,6 +131,7 @@
+ | USER_PRIVILEGES |
+ | USER_STATISTICS |
+ | VIEWS |
++| INNODB_IO_PATTERN |
+ +---------------------------------------+
+ Wildcard: inf_rmation_schema
+ +--------------------+
+diff -r d4826c0a98c2 patch_info/innodb_io_pattern.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_io_pattern.info Wed Jul 29 10:00:12 2009 -0700
+@@ -0,0 +1,8 @@
++File=innodb_io_pattern.patch
++Name=Information schema table of InnoDB IO counts for each datafile pages
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=INFORMATION_SCHEMA.INNODB_IO_PATTERN
++2008-12-01
++YK: fix for mysql-test
+diff -r d4826c0a98c2 sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/ha_innodb.cc Wed Jul 29 10:00:12 2009 -0700
+@@ -1583,6 +1583,8 @@
+ pthread_cond_init(&commit_cond, NULL);
+ innodb_inited= 1;
+
++ buf_io_counter_control();
++
+ /* If this is a replication slave and we needed to do a crash recovery,
+ set the master binlog position to what InnoDB internally knew about
+ how far we got transactions durable inside InnoDB. There is a
+@@ -6551,6 +6553,28 @@
+ }
+
+ /****************************************************************************
++Creates or frees the internal hash table for IO pattern tracing
++according to the innodb_io_pattern_trace value. */
++
++void
++innodb_io_pattern_control(void)
++/*===========================*/
++{
++ if (innodb_inited) {
++ buf_io_counter_control();
++ }
++}
++
++void
++innodb_io_pattern_clear(void)
++/*=========================*/
++{
++ if (innodb_inited) {
++ buf_io_counter_clear();
++ }
++}
++
++/****************************************************************************
+ Implements the SHOW INNODB STATUS command. Sends the output of the InnoDB
+ Monitor to the client. */
+
+diff -r d4826c0a98c2 sql/ha_innodb.h
+--- a/sql/ha_innodb.h Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/ha_innodb.h Wed Jul 29 10:00:12 2009 -0700
+@@ -245,6 +245,9 @@
+ extern uint srv_adaptive_checkpoint;
+ extern ulong srv_show_locks_held;
+ extern ulong srv_show_verbose_locks;
++extern ulong srv_io_pattern_trace;
++extern ulong srv_io_pattern_trace_running;
++extern ulong srv_io_pattern_size_limit;
+
+ /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does
+ NOT update cardinality for indexes of InnoDB table". By default we are
+@@ -278,6 +281,9 @@
+ bool innodb_mutex_show_status(THD* thd);
+ void innodb_export_status(void);
+
++void innodb_io_pattern_control(void);
++void innodb_io_pattern_clear(void);
++
+ void innobase_release_temporary_latches(THD *thd);
+
+ void innobase_store_binlog_offset_and_flush_log(char *binlog_name,longlong offset);
+diff -r d4826c0a98c2 sql/lex.h
+--- a/sql/lex.h Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/lex.h Wed Jul 29 10:00:12 2009 -0700
+@@ -244,6 +244,7 @@
+ { "INNER", SYM(INNER_SYM)},
+ { "INNOBASE", SYM(INNOBASE_SYM)},
+ { "INNODB", SYM(INNOBASE_SYM)},
++ { "INNODB_IO_PATTERN", SYM(INNODB_IO_PATTERN)},
+ { "INOUT", SYM(INOUT_SYM)},
+ { "INSENSITIVE", SYM(INSENSITIVE_SYM)},
+ { "INSERT", SYM(INSERT)},
+diff -r d4826c0a98c2 sql/mysqld.cc
+--- a/sql/mysqld.cc Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/mysqld.cc Wed Jul 29 10:00:12 2009 -0700
+@@ -5029,6 +5029,9 @@
+ OPT_INNODB_SYNC_SPIN_LOOPS,
+ OPT_INNODB_CONCURRENCY_TICKETS,
+ OPT_INNODB_THREAD_SLEEP_DELAY,
++ OPT_INNODB_IO_PATTERN_TRACE,
++ OPT_INNODB_IO_PATTERN_TRACE_RUNNING,
++ OPT_INNODB_IO_PATTERN_SIZE_LIMIT,
+ OPT_BDB_CACHE_SIZE,
+ OPT_BDB_LOG_BUFFER_SIZE,
+ OPT_BDB_MAX_LOCK,
+@@ -5461,6 +5464,18 @@
+ "Number of background write I/O threads in InnoDB.",
+ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_io_pattern_trace", OPT_INNODB_IO_PATTERN_TRACE,
++ "Create/Drop the internal hash table for IO pattern tracing.",
++ (gptr*) &srv_io_pattern_trace, (gptr*) &srv_io_pattern_trace,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++ {"innodb_io_pattern_trace_running", OPT_INNODB_IO_PATTERN_TRACE_RUNNING,
++ "Control IO pattern trace running or not.",
++ (gptr*) &srv_io_pattern_trace_running, (gptr*) &srv_io_pattern_trace_running,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, 1, 0, 0, 0},
++ {"innodb_io_pattern_size_limit", OPT_INNODB_IO_PATTERN_SIZE_LIMIT,
++ "Set max number of counters per data pages. (0 = disable counting).",
++ (gptr*) &srv_io_pattern_size_limit, (gptr*) &srv_io_pattern_size_limit,
++ 0, GET_ULONG, REQUIRED_ARG, 0, 0, ULONG_MAX - (1024 * 1024), 0, 0, 0},
+ #endif /* End HAVE_INNOBASE_DB */
+ {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
+ (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
+diff -r d4826c0a98c2 sql/set_var.cc
+--- a/sql/set_var.cc Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/set_var.cc Wed Jul 29 10:00:12 2009 -0700
+@@ -546,6 +546,12 @@
+ sys_var_long_ptr sys_innodb_show_verbose_locks(
+ "innodb_show_verbose_locks",
+ &srv_show_verbose_locks);
++sys_var_innodb_io_pattern_trace sys_innodb_io_pattern_trace("innodb_io_pattern_trace",
++ &srv_io_pattern_trace);
++sys_var_long_ptr sys_innodb_io_pattern_trace_running("innodb_io_pattern_trace_running",
++ &srv_io_pattern_trace_running);
++sys_var_long_ptr sys_innodb_io_pattern_size_limit("innodb_io_pattern_size_limit",
++ &srv_io_pattern_size_limit);
+ sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path",
+ &innobase_data_file_path);
+ sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir",
+@@ -926,6 +932,9 @@
+ &sys_innodb_adaptive_checkpoint,
+ &sys_innodb_show_locks_held,
+ &sys_innodb_show_verbose_locks,
++ &sys_innodb_io_pattern_trace,
++ &sys_innodb_io_pattern_trace_running,
++ &sys_innodb_io_pattern_size_limit,
+ #endif
+ &sys_trust_routine_creators,
+ &sys_trust_function_creators,
+@@ -1075,6 +1084,9 @@
+ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
++ {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS},
++ {sys_innodb_io_pattern_trace_running.name, (char*) &sys_innodb_io_pattern_trace_running, SHOW_SYS},
++ {sys_innodb_io_pattern_size_limit.name, (char*) &sys_innodb_io_pattern_size_limit, SHOW_SYS},
+ {sys_innodb_use_legacy_cardinality_algorithm.name,
+ (char*) &sys_innodb_use_legacy_cardinality_algorithm, SHOW_SYS},
+ #endif
+@@ -3210,6 +3222,19 @@
+ thd->variables.lc_time_names= global_system_variables.lc_time_names;
+ }
+
++#ifdef HAVE_INNOBASE_DB
++bool sys_var_innodb_io_pattern_trace::update(THD *thd, set_var *var)
++{
++ bool ret;
++
++ ret = sys_var_long_ptr_global::update(thd, var);
++
++ innodb_io_pattern_control();
++
++ return ret;
++}
++#endif /* HAVE_INNOBASE_DB */
++
+ /*
+ Functions to update thd->options bits
+ */
+diff -r d4826c0a98c2 sql/set_var.h
+--- a/sql/set_var.h Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/set_var.h Wed Jul 29 10:00:12 2009 -0700
+@@ -1012,6 +1012,17 @@
+ virtual void set_default(THD *thd, enum_var_type type);
+ };
+
++#ifdef HAVE_INNOBASE_DB
++/* sys_var_innodb_io_pattern_trace */
++class sys_var_innodb_io_pattern_trace :public sys_var_long_ptr
++{
++public:
++ sys_var_innodb_io_pattern_trace(const char *name_arg, ulong *value_ptr_arg)
++ :sys_var_long_ptr(name_arg,value_ptr_arg) {}
++ bool update(THD *thd, set_var *var);
++};
++#endif /* HAVE_INNOBASE_DB */
++
+ /****************************************************************************
+ Classes for parsing of the SET command
+ ****************************************************************************/
+diff -r d4826c0a98c2 sql/sql_parse.cc
+--- a/sql/sql_parse.cc Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/sql_parse.cc Wed Jul 29 10:00:12 2009 -0700
+@@ -8104,6 +8104,13 @@
+ }
+ pthread_mutex_unlock(&LOCK_global_user_client_stats);
+ }
++#ifdef HAVE_INNOBASE_DB
++ if (options & REFRESH_INNODB_IO_PATTERN)
++ {
++ tmp_write_to_binlog= 0;
++ innodb_io_pattern_clear();
++ }
++#endif /* HAVE_INNOBASE_DB */
+ *write_to_binlog= tmp_write_to_binlog;
+ return result;
+ }
+diff -r d4826c0a98c2 sql/sql_show.cc
+--- a/sql/sql_show.cc Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/sql_show.cc Wed Jul 29 10:00:12 2009 -0700
+@@ -33,6 +33,17 @@
+ #include "ha_innodb.h"
+ #endif
+
++#ifdef HAVE_INNOBASE_DB
++#define INSIDE_HA_INNOBASE_CC
++extern "C" {
++#include "srv0srv.h"
++#include "buf0buf.h"
++#include "dict0dict.h"
++}
++/* We need to undef it in InnoDB */
++#undef byte
++#endif /* HAVE_INNOBASE_DB */
++
+ #ifndef NO_EMBEDDED_ACCESS_CHECKS
+ static const char *grant_names[]={
+ "select","insert","update","delete","create","drop","reload","shutdown",
+@@ -4108,6 +4119,72 @@
+ DBUG_RETURN(res);
+ }
+
++#ifdef HAVE_INNOBASE_DB
++/* Fills INFORMATION_SCHEMA.INNODB_IO_PATTERN with one row per IO counter; returns 1 if a row cannot be stored, else 0. */
++int innodb_io_pattern_fill_table(THD *thd, TABLE_LIST *tables, COND *cond)
++{
++ TABLE *table= (TABLE *) tables->table;
++ buf_io_counter_t* io_counter;
++ dict_index_t* index;
++
++ DBUG_ENTER("innodb_io_pattern_fill_table");
++ int returnable= 0;
++
++ /* users without the PROCESS privilege get an empty result */
++ if (check_global_access(thd, PROCESS_ACL)) {
++ DBUG_RETURN(0);
++ }
++
++ /* We cannot use inline functions of InnoDB here */
++ /* !!!!!ATTENTION!!!!!: This function is not protected by mutex for performance. */
++ /* Avoid running "DROP TABLE innodb_io_pattern" and reading INFORMATION_SCHEMA.INNODB_IO_PATTERN */
++ /* at the same time whenever possible. */
++
++ if (srv_io_pattern) {
++ for (ulint n=0; n < buf_pool->io_counter_hash->n_cells; n++) {
++ if (!srv_io_pattern)
++ goto end_func;
++
++ io_counter = (buf_io_counter_t*)(buf_pool->io_counter_hash->array + n)->node;
++ while (io_counter) {
++ if (!srv_io_pattern)
++ goto end_func;
++
++ if (dict_sys != NULL) {
++ dulint id;
++ id.high = 0;
++ id.low = io_counter->index_id;
++ index = dict_index_find_on_id_low(id);
++ } else {
++ index = NULL;
++ }
++
++ table->field[0]->store(io_counter->space);
++ table->field[1]->store(io_counter->offset);
++ table->field[2]->store(io_counter->index_id);
++ if (index != NULL) {
++ table->field[3]->store(index->table_name,strlen(index->table_name),system_charset_info);
++ table->field[4]->store(index->name,strlen(index->name),system_charset_info);
++ } else {
++ table->field[3]->store("",0,system_charset_info);
++ table->field[4]->store("",0,system_charset_info);
++ }
++ table->field[5]->store(io_counter->n_read);
++ table->field[6]->store(io_counter->n_write);
++ if (schema_table_store_record(thd, table))
++ {
++ returnable= 1;
++ goto end_func;
++ }
++ io_counter = io_counter->hash;
++ }
++ }
++ }
++
++ end_func:
++ DBUG_RETURN(returnable);
++}
++#endif /* HAVE_INNOBASE_DB */
+ /*
+ Find schema_tables elment by name
+
+@@ -4914,6 +4986,19 @@
+ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
+ };
+
++#ifdef HAVE_INNOBASE_DB
++ST_FIELD_INFO innodb_io_pattern_field_info[]=
++{
++ {"SPACE", 11, MYSQL_TYPE_LONG, 0, 0, "space_id"},
++ {"OFFSET", 11, MYSQL_TYPE_LONG, 0, 0, "offset"},
++ {"INDEX_ID", 11, MYSQL_TYPE_LONG, 0, 0, "index id"},
++ {"TABLE_NAME", 32, MYSQL_TYPE_STRING, 0, 0, "table name"},
++ {"INDEX_NAME", 32, MYSQL_TYPE_STRING, 0, 0, "index name"},
++ {"N_READ", 11, MYSQL_TYPE_LONG, 0, 0, "read ios"},
++ {"N_WRITE", 11, MYSQL_TYPE_LONG, 0, 0, "write ios"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
++};
++#endif
+
+ ST_FIELD_INFO variables_fields_info[]=
+ {
+@@ -5089,6 +5174,10 @@
+ make_old_format, 0, -1, -1, 1},
+ {"VIEWS", view_fields_info, create_schema_table,
+ get_all_tables, 0, get_schema_views_record, 1, 2, 0},
++#ifdef HAVE_INNOBASE_DB
++ {"INNODB_IO_PATTERN", innodb_io_pattern_field_info, create_schema_table,
++ innodb_io_pattern_fill_table, 0, 0, -1, -1, 0},
++#endif
+ {0, 0, 0, 0, 0, 0, 0, 0, 0}
+ };
+
+diff -r d4826c0a98c2 sql/sql_yacc.yy
+--- a/sql/sql_yacc.yy Wed Jul 29 09:58:58 2009 -0700
++++ b/sql/sql_yacc.yy Wed Jul 29 10:00:12 2009 -0700
+@@ -685,6 +685,7 @@
+ %token INFILE
+ %token INNER_SYM
+ %token INNOBASE_SYM
++%token INNODB_IO_PATTERN
+ %token INOUT_SYM
+ %token INSENSITIVE_SYM
+ %token INSERT
+@@ -8500,6 +8501,7 @@
+ | MASTER_SYM { Lex->type|= REFRESH_MASTER; }
+ | DES_KEY_FILE { Lex->type|= REFRESH_DES_KEY_FILE; }
+ | RESOURCES { Lex->type|= REFRESH_USER_RESOURCES; }
++ | INNODB_IO_PATTERN { Lex->type|= REFRESH_INNODB_IO_PATTERN; }
+ | CLIENT_STATS_SYM { Lex->type|= REFRESH_CLIENT_STATS; }
+ | USER_STATS_SYM { Lex->type|= REFRESH_USER_STATS; }
+ | TABLE_STATS_SYM { Lex->type|= REFRESH_TABLE_STATS; }
+@@ -9552,6 +9554,7 @@
+ | ISOLATION {}
+ | ISSUER_SYM {}
+ | INNOBASE_SYM {}
++ | INNODB_IO_PATTERN {}
+ | INSERT_METHOD {}
+ | IO_SYM {}
+ | IPC_SYM {}
diff --git a/percona/5.0.91-b22-20100522/innodb_io_tune.patch b/percona/5.0.91-b22-20100522/innodb_io_tune.patch
new file mode 100644
index 0000000..3953e1d
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_io_tune.patch
@@ -0,0 +1,1823 @@
+diff -r 322370200e6a innobase/include/os0file.h
+--- a/innobase/include/os0file.h Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/include/os0file.h Mon Nov 03 05:08:52 2008 -0800
+@@ -532,21 +532,16 @@
+ FALSE otherwise */
+ const char* path); /* in: path name */
+ /****************************************************************************
+-Initializes the asynchronous io system. Creates separate aio array for
+-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
+-segment, two aio arrays for log reads and writes with one segment, and a
+-synchronous aio array of the specified size. The combined number of segments
+-in the three first aio arrays is the parameter n_segments given to the
+-function. The caller must create an i/o handler thread for each segment in
+-the four first arrays, but not for the sync aio array. */
++Initializes the asynchronous io system. */
+
+-void
++ulint
+ os_aio_init(
+ /*========*/
+- ulint n, /* in: maximum number of pending aio operations
+- allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++ /* out: number of AIO handler threads */
++ ulint ios_per_array, /* in: maximum number of pending aio operations
++ allowed per IO array */
++ ulint n_read_threads, /* in: number of read threads */
++ ulint n_write_threads, /* in: number of write threads */
+ ulint n_slots_sync); /* in: number of slots in the sync aio array */
+ /***********************************************************************
+ Requests an asynchronous i/o operation. */
+diff -r 322370200e6a innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/include/srv0srv.h Mon Nov 03 05:08:52 2008 -0800
+@@ -87,6 +87,14 @@
+ extern ulint srv_lock_table_size;
+
+ extern ulint srv_n_file_io_threads;
++extern ulint srv_n_read_io_threads;
++extern ulint srv_n_write_io_threads;
++
++/* Number of IO operations per second the server can do */
++extern ulint srv_io_capacity;
++
++/* Flush dirty pages when below max dirty percent */
++extern ibool srv_extra_dirty_writes;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ extern ibool srv_log_archive_on;
+@@ -252,6 +260,24 @@
+
+ /* variable to count the number of random read-aheads were done */
+ extern ulint srv_read_ahead_rnd;
++
++/* Number of IO operations read/write done for all threads */
++extern ulint os_aio_read_requests;
++extern ulint os_aio_write_requests;
++
++/* Number of pages read/written done for all threads */
++extern ulint os_aio_pages_read;
++extern ulint os_aio_pages_written;
++
++/* time usec used to perform read/write for all threads */
++extern ib_longlong os_aio_read_time;
++extern ib_longlong os_aio_write_time;
++
++extern ulint inno_pending_normal_aio_reads;
++extern ulint inno_pending_normal_aio_writes;
++extern ulint inno_pending_ibuf_aio_reads;
++extern ulint inno_pending_log_ios;
++extern ulint inno_pending_sync_ios;
+
+ /* In this structure we store status variables to be passed to MySQL */
+ typedef struct export_var_struct export_struc;
+diff -r 322370200e6a innobase/log/log0log.c
+--- a/innobase/log/log0log.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/log/log0log.c Mon Nov 03 05:08:52 2008 -0800
+@@ -1537,6 +1537,30 @@
+
+ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS, TRUE,
+ LOG_WRITE_FROM_BACKGROUND_SYNC);
++}
++
++/********************************************************************
++Flush the log buffer. Force it to disk depending on the value of
++innodb_flush_log_at_trx_commit. */
++
++void
++log_buffer_flush_maybe_sync(void)
++/*==========================*/
++{
++ dulint lsn;
++
++ mutex_enter(&(log_sys->mutex));
++
++ lsn = log_sys->lsn;
++
++ mutex_exit(&(log_sys->mutex));
++
++ /* Force log buffer to disk when innodb_flush_log_at_trx_commit = 1. */
++ log_write_up_to(lsn, LOG_WAIT_ALL_GROUPS,
++ srv_flush_log_at_trx_commit == 1 ? TRUE : FALSE,
++ srv_flush_log_at_trx_commit == 1 ?
++ LOG_WRITE_FROM_BACKGROUND_SYNC :
++ LOG_WRITE_FROM_BACKGROUND_ASYNC);
+ }
+
+ /********************************************************************
+diff -r 322370200e6a innobase/os/os0file.c
+--- a/innobase/os/os0file.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/os/os0file.c Mon Nov 03 05:08:52 2008 -0800
+@@ -22,6 +22,8 @@
+ #include <errno.h>
+ #endif /* UNIV_HOTBACKUP */
+
++extern long innobase_max_merged_io;
++
+ #undef HAVE_FDATASYNC
+
+ #ifdef POSIX_ASYNC_IO
+@@ -63,6 +65,28 @@
+ ibool os_aio_use_native_aio = FALSE;
+
+ ibool os_aio_print_debug = FALSE;
++
++/* Status of an IO request in simulated AIO.
++ Protocol for simulated aio:
++ client requests IO: find slot with reserved = FALSE. Add entry with
++ status = OS_AIO_NOT_ISSUED.
++ IO thread wakes: find adjacent slots with reserved = TRUE and status =
++ OS_AIO_NOT_ISSUED. Change status for slots to
++ OS_AIO_ISSUED.
++ IO operation completes: set status for slots to OS_AIO_DONE. set status
++ for the first slot to OS_AIO_CLAIMED and return
++ result for that slot.
++ When there are multiple read and write threads, they all compete to execute
++ the requests in the array (os_aio_array_t). This avoids the need to load
++ balance requests at the time the request is made at the cost of waking all
++ threads when a request is available.
++*/
++typedef enum {
++ OS_AIO_NOT_ISSUED, /* Available to be processed by an IO thread. */
++ OS_AIO_ISSUED, /* Being processed by an IO thread. */
++ OS_AIO_DONE, /* Request processed. */
++ OS_AIO_CLAIMED /* Result being returned to client. */
++} os_aio_status;
+
+ /* The aio array slot structure */
+ typedef struct os_aio_slot_struct os_aio_slot_t;
+@@ -72,6 +96,8 @@
+ ulint pos; /* index of the slot in the aio
+ array */
+ ibool reserved; /* TRUE if this slot is reserved */
++ os_aio_status status; /* Status for current request. Valid when reserved
++ is TRUE. Used only in simulated aio. */
+ time_t reservation_time;/* time when reserved */
+ ulint len; /* length of the block to read or
+ write */
+@@ -82,11 +108,6 @@
+ ulint offset_high; /* 32 high bits of file offset */
+ os_file_t file; /* file where to read or write */
+ const char* name; /* file name or path */
+- ibool io_already_done;/* used only in simulated aio:
+- TRUE if the physical i/o already
+- made and only the slot message
+- needs to be passed to the caller
+- of os_aio_simulated_handle */
+ fil_node_t* message1; /* message which is given by the */
+ void* message2; /* the requester of an aio operation
+ and which can be used to identify
+@@ -116,9 +137,6 @@
+ in this array */
+ ulint n_slots; /* Total number of slots in the aio array.
+ This must be divisible by n_threads. */
+- ulint n_segments;/* Number of segments in the aio array of
+- pending aio requests. A thread can wait
+- separately for any one of the segments. */
+ ulint n_reserved;/* Number of reserved slots in the
+ aio array outside the ibuf segment */
+ os_aio_slot_t* slots; /* Pointer to the slots in the array */
+@@ -134,6 +152,17 @@
+
+ /* Array of events used in simulated aio */
+ os_event_t* os_aio_segment_wait_events = NULL;
++
++/* Number of threads for reading and writing. */
++ulint os_aio_read_threads = 0;
++ulint os_aio_write_threads = 0;
++
++/* Number for the first global segment for reading. */
++const ulint os_aio_first_read_segment = 2;
++
++/* Number for the first global segment for writing. Set to
++2 + os_aio_read_threads. */
++ulint os_aio_first_write_segment = 0;
+
+ /* The aio arrays for non-ibuf i/o and ibuf i/o, as well as sync aio. These
+ are NULL when the module has not yet been initialized. */
+@@ -143,11 +172,39 @@
+ static os_aio_array_t* os_aio_log_array = NULL;
+ static os_aio_array_t* os_aio_sync_array = NULL;
+
++/* Per thread buffer used for merged IO requests. Used by
++os_aio_simulated_handle so that a buffer doesn't have to be allocated
++for each request. */
++static char* os_aio_thread_buffer[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_buffer_size[SRV_MAX_N_IO_THREADS];
++
++/* Count pages read and written per thread */
++static ulint os_aio_thread_io_reads[SRV_MAX_N_IO_THREADS];
++static ulint os_aio_thread_io_writes[SRV_MAX_N_IO_THREADS];
++
++/* Number of IO operations done. One request can be for N pages. */
++static ulint os_aio_thread_io_requests[SRV_MAX_N_IO_THREADS];
++
++/* usecs spent blocked on an IO request */
++static double os_aio_thread_io_wait[SRV_MAX_N_IO_THREADS];
++/* max usecs spent blocked on an IO request */
++static double os_aio_thread_max_io_wait[SRV_MAX_N_IO_THREADS];
++
++/* Number of IO global segments. An IO handler thread is created for each
++global segment, except for the segment associated with os_aio_sync_array.
++Several segments can be associated with os_aio_{read,write}_array. One
++segment is created for each of the other arrays. This is also the number
++of valid entries in os_aio_thread_io_reads, os_aio_thread_io_writes,
++srv_io_thread_op_info, srv_io_thread_function and os_aio_segment_wait_events. */
+ static ulint os_aio_n_segments = ULINT_UNDEFINED;
+
+-/* If the following is TRUE, read i/o handler threads try to
+-wait until a batch of new read requests have been posted */
+-static ibool os_aio_recommend_sleep_for_read_threads = FALSE;
++/* Set to TRUE to temporarily block reads from being scheduled while a batch
++of read requests is added to allow them to be merged by the IO handler thread
++if they are adjacent. Declared volatile because we don't want this to be
++read from a register in a loop when another thread may change the value in
++memory.
++*/
++static volatile ibool os_aio_recommend_sleep_for_read_threads = FALSE;
+
+ ulint os_n_file_reads = 0;
+ ulint os_bytes_read_since_printout = 0;
+@@ -166,6 +223,19 @@
+ ulint os_file_n_pending_pwrites = 0;
+ ulint os_n_pending_writes = 0;
+ ulint os_n_pending_reads = 0;
++
++/* TODO -- does InnoDB provide a portable method for this? */
++static double time_usecs() {
++#ifdef __WIN__
++ return 0.0;
++#else
++ struct timeval tv;
++ if (gettimeofday(&tv, NULL))
++ return 0;
++ else
++ return tv.tv_sec * 1000000.0 + tv.tv_usec;
++#endif
++}
+
+ /***************************************************************************
+ Gets the operating system version. Currently works only on Windows. */
+@@ -1351,6 +1421,8 @@
+ /* We disable OS caching (O_DIRECT) only on data files */
+ if (type != OS_LOG_FILE
+ && srv_unix_file_flush_method == SRV_UNIX_O_DIRECT) {
++
++ fprintf(stderr, "Using O_DIRECT for file %s\n", name);
+
+ os_file_set_nocache(file, name, mode_str);
+ }
+@@ -1798,6 +1870,32 @@
+ #endif /* __WIN__ */
+ }
+
++#ifndef __WIN__
++/***************************************************************************
++Calls fsync() on a file unless the flush method is SRV_UNIX_NOSYNC. */
++
++ibool
++os_maybe_fsync(
++/*==========*/
++ /* out: 0 if success, error code otherwise */
++ os_file_t file) /* in, own: handle to a file */
++{
++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 0 : fsync(file);
++}
++
++/***************************************************************************
++Calls fdatasync() on a file unless the flush method is SRV_UNIX_NOSYNC. */
++
++ibool
++os_maybe_fdatasync(
++/*==========*/
++ /* out: 0 if success, error code otherwise */
++ os_file_t file) /* in, own: handle to a file */
++{
++ return (srv_unix_file_flush_method == SRV_UNIX_NOSYNC) ? 0 : fdatasync(file);
++}
++#endif
++
+ /***************************************************************************
+ Flushes the write buffers of a given file to the disk. */
+
+@@ -1855,21 +1953,21 @@
+ /* If we are not on an operating system that supports this,
+ then fall back to a plain fsync. */
+
+- ret = fsync(file);
++ ret = os_maybe_fsync(file);
+ } else {
+ ret = fcntl(file, F_FULLFSYNC, NULL);
+
+ if (ret) {
+ /* If we are not on a file system that supports this,
+ then fall back to a plain fsync. */
+- ret = fsync(file);
++ ret = os_maybe_fsync(file);
+ }
+ }
+ #elif HAVE_FDATASYNC
+- ret = fdatasync(file);
++ ret = os_maybe_fdatasync(file);
+ #else
+ /* fprintf(stderr, "Flushing to file %p\n", file); */
+- ret = fsync(file);
++ ret = os_maybe_fsync(file);
+ #endif
+ os_n_fsyncs++;
+
+@@ -2298,6 +2396,9 @@
+
+ return(TRUE);
+ }
++ fprintf(stderr,
++"InnoDB: error: os_file_pread wanted %lu and got %lu.\n",
++ (ulint) n, (ulint) ret);
+ #endif
+ #ifdef __WIN__
+ error_handling:
+@@ -2784,9 +2885,8 @@
+ os_aio_array_create(
+ /*================*/
+ /* out, own: aio array */
+- ulint n, /* in: maximum number of pending aio operations
+- allowed; n must be divisible by n_segments */
+- ulint n_segments) /* in: number of segments in the aio array */
++ ulint n) /* in: maximum number of pending aio operations
++ allowed */
+ {
+ os_aio_array_t* array;
+ ulint i;
+@@ -2795,7 +2895,6 @@
+ OVERLAPPED* over;
+ #endif
+ ut_a(n > 0);
+- ut_a(n_segments > 0);
+
+ array = ut_malloc(sizeof(os_aio_array_t));
+
+@@ -2806,7 +2905,6 @@
+ os_event_set(array->is_empty);
+
+ array->n_slots = n;
+- array->n_segments = n_segments;
+ array->n_reserved = 0;
+ array->slots = ut_malloc(n * sizeof(os_aio_slot_t));
+ #ifdef __WIN__
+@@ -2833,70 +2931,75 @@
+
+ /****************************************************************************
+ Initializes the asynchronous io system. Calls also os_io_init_simple.
+-Creates a separate aio array for
+-non-ibuf read and write, a third aio array for the ibuf i/o, with just one
+-segment, two aio arrays for log reads and writes with one segment, and a
+-synchronous aio array of the specified size. The combined number of segments
+-in the three first aio arrays is the parameter n_segments given to the
+-function. The caller must create an i/o handler thread for each segment in
+-the four first arrays, but not for the sync aio array. */
+-
+-void
++Creates an aio array for each of non-ibuf read, non-ibuf write, ibuf IO,
++log IO, and synchronous IO. The caller must create i/o handler threads for all
++but the synchronous aio array. Multiple threads can access the same array for
++the non-ibuf read (prefetch) and write (flush dirty buffer pages) arrays.
++Returns the number of AIO handler threads. */
++
++ulint
+ os_aio_init(
+ /*========*/
+- ulint n, /* in: maximum number of pending aio operations
+- allowed; n must be divisible by n_segments */
+- ulint n_segments, /* in: combined number of segments in the four
+- first aio arrays; must be >= 4 */
++ ulint ios_per_array, /* in: maximum number of pending aio operations
++ allowed per array */
++ ulint n_read_threads, /* in: number of read threads */
++ ulint n_write_threads, /* in: number of write threads */
+ ulint n_slots_sync) /* in: number of slots in the sync aio array */
+ {
+- ulint n_read_segs;
+- ulint n_write_segs;
+- ulint n_per_seg;
+- ulint i;
++ ulint i;
++ ulint n_segments = 2 + n_read_threads + n_write_threads;
+ #ifdef POSIX_ASYNC_IO
+ sigset_t sigset;
+ #endif
+- ut_ad(n % n_segments == 0);
+- ut_ad(n_segments >= 4);
++ ut_a(ios_per_array >= OS_AIO_N_PENDING_IOS_PER_THREAD);
++ ut_a(n_read_threads >= 1 && n_read_threads <= 64);
++ ut_a(n_write_threads >= 1 && n_write_threads <= 64);
++ ut_a(n_segments < SRV_MAX_N_IO_THREADS);
+
+ os_io_init_simple();
+
+ for (i = 0; i < n_segments; i++) {
+ srv_set_io_thread_op_info(i, "not started yet");
+- }
+-
+- n_per_seg = n / n_segments;
+- n_write_segs = (n_segments - 2) / 2;
+- n_read_segs = n_segments - 2 - n_write_segs;
+-
+- /* fprintf(stderr, "Array n per seg %lu\n", n_per_seg); */
+-
+- os_aio_ibuf_array = os_aio_array_create(n_per_seg, 1);
++ os_aio_thread_io_reads[i] = 0;
++ os_aio_thread_io_writes[i] = 0;
++ os_aio_thread_io_requests[i] = 0;
++ os_aio_thread_buffer[i] = 0;
++ os_aio_thread_buffer_size[i] = 0;
++ os_aio_thread_io_wait[i] = 0;
++ os_aio_thread_max_io_wait[i] = 0;
++ }
++
++ os_aio_read_threads = n_read_threads;
++ os_aio_write_threads = n_write_threads;
++ os_aio_first_write_segment = os_aio_first_read_segment + os_aio_read_threads;
++
++ fprintf(stderr,
++ "InnoDB: ios_per_array %lu read threads %lu write threads %lu\n",
++ ios_per_array, os_aio_read_threads, os_aio_write_threads);
++
++ os_aio_ibuf_array = os_aio_array_create(ios_per_array);
+
+ srv_io_thread_function[0] = "insert buffer thread";
+
+- os_aio_log_array = os_aio_array_create(n_per_seg, 1);
++ os_aio_log_array = os_aio_array_create(ios_per_array);
+
+ srv_io_thread_function[1] = "log thread";
+
+- os_aio_read_array = os_aio_array_create(n_read_segs * n_per_seg,
+- n_read_segs);
+- for (i = 2; i < 2 + n_read_segs; i++) {
++ os_aio_read_array = os_aio_array_create(ios_per_array);
++ for (i = os_aio_first_read_segment; i < os_aio_first_write_segment; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+- srv_io_thread_function[i] = "read thread";
+- }
+-
+- os_aio_write_array = os_aio_array_create(n_write_segs * n_per_seg,
+- n_write_segs);
+- for (i = 2 + n_read_segs; i < n_segments; i++) {
++ srv_io_thread_function[i] = "read thread";
++ }
++
++ os_aio_write_array = os_aio_array_create(ios_per_array);
++ for (i = os_aio_first_write_segment; i < n_segments; i++) {
+ ut_a(i < SRV_MAX_N_IO_THREADS);
+- srv_io_thread_function[i] = "write thread";
+- }
+-
+- os_aio_sync_array = os_aio_array_create(n_slots_sync, 1);
+-
+- os_aio_n_segments = n_segments;
++ srv_io_thread_function[i] = "write thread";
++ }
++
++ os_aio_sync_array = os_aio_array_create(n_slots_sync);
++
++ os_aio_n_segments = 2 + os_aio_read_threads + os_aio_write_threads;
+
+ os_aio_validate();
+
+@@ -2924,6 +3027,7 @@
+
+ pthread_sigmask(SIG_BLOCK, &sigset, NULL); */
+ #endif
++ return os_aio_n_segments;
+ }
+
+ #ifdef WIN_ASYNC_IO
+@@ -2981,77 +3085,32 @@
+ os_event_wait(os_aio_write_array->is_empty);
+ }
+
+-/**************************************************************************
+-Calculates segment number for a slot. */
+-static
+-ulint
+-os_aio_get_segment_no_from_slot(
+-/*============================*/
+- /* out: segment number (which is the number
+- used by, for example, i/o-handler threads) */
+- os_aio_array_t* array, /* in: aio wait array */
+- os_aio_slot_t* slot) /* in: slot in this array */
+-{
+- ulint segment;
+- ulint seg_len;
+-
+- if (array == os_aio_ibuf_array) {
+- segment = 0;
+-
+- } else if (array == os_aio_log_array) {
+- segment = 1;
+-
+- } else if (array == os_aio_read_array) {
+- seg_len = os_aio_read_array->n_slots /
+- os_aio_read_array->n_segments;
+-
+- segment = 2 + slot->pos / seg_len;
+- } else {
+- ut_a(array == os_aio_write_array);
+- seg_len = os_aio_write_array->n_slots /
+- os_aio_write_array->n_segments;
+-
+- segment = os_aio_read_array->n_segments + 2
+- + slot->pos / seg_len;
+- }
+-
+- return(segment);
+-}
+-
+-/**************************************************************************
+-Calculates local segment number and aio array from global segment number. */
+-static
+-ulint
+-os_aio_get_array_and_local_segment(
++
++/**************************************************************************
++Calculates aio array from global segment number. */
++static
++os_aio_array_t*
++os_aio_get_array(
+ /*===============================*/
+- /* out: local segment number within
+- the aio array */
+- os_aio_array_t** array, /* out: aio wait array */
++ /* out: aio wait array */
+ ulint global_segment)/* in: global segment number */
+ {
+- ulint segment;
+
+ ut_a(global_segment < os_aio_n_segments);
+
+ if (global_segment == 0) {
+- *array = os_aio_ibuf_array;
+- segment = 0;
++ return os_aio_ibuf_array;
+
+ } else if (global_segment == 1) {
+- *array = os_aio_log_array;
+- segment = 0;
+-
+- } else if (global_segment < os_aio_read_array->n_segments + 2) {
+- *array = os_aio_read_array;
+-
+- segment = global_segment - 2;
+- } else {
+- *array = os_aio_write_array;
+-
+- segment = global_segment - (os_aio_read_array->n_segments + 2);
+- }
+-
+- return(segment);
++ return os_aio_log_array;
++
++ } else if (global_segment < os_aio_first_write_segment) {
++ return os_aio_read_array;
++
++ } else {
++ return os_aio_write_array;
++
++ }
+ }
+
+ /***********************************************************************
+@@ -3160,7 +3219,7 @@
+
+ os_aio_simulated_wake_handler_threads();
+ }
+-
++
+ os_event_wait(array->not_full);
+
+ goto loop;
+@@ -3173,7 +3232,7 @@
+ break;
+ }
+ }
+-
++ ut_a(i < array->n_slots);
+ array->n_reserved++;
+
+ if (array->n_reserved == 1) {
+@@ -3195,7 +3254,7 @@
+ slot->buf = buf;
+ slot->offset = offset;
+ slot->offset_high = offset_high;
+- slot->io_already_done = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ #ifdef WIN_ASYNC_IO
+ control = &(slot->control);
+@@ -3246,8 +3305,9 @@
+ os_mutex_enter(array->mutex);
+
+ ut_ad(slot->reserved);
+-
++
+ slot->reserved = FALSE;
++ slot->status = OS_AIO_NOT_ISSUED;
+
+ array->n_reserved--;
+
+@@ -3266,36 +3326,40 @@
+ }
+
+ /**************************************************************************
+-Wakes up a simulated aio i/o-handler thread if it has something to do. */
++Wake up the simulated aio i/o-handler threads for a given array if there
++is work to do. */
+ static
+ void
+ os_aio_simulated_wake_handler_thread(
+ /*=================================*/
+- ulint global_segment) /* in: the number of the segment in the aio
+- arrays */
+-{
+- os_aio_array_t* array;
+- os_aio_slot_t* slot;
+- ulint segment;
++ os_aio_array_t* array) /* in: aio array for which wakeup is done */
++{
++ os_aio_slot_t* slot;
+ ulint n;
+ ulint i;
+
+ ut_ad(!os_aio_use_native_aio);
+
+- segment = os_aio_get_array_and_local_segment(&array, global_segment);
+-
+- n = array->n_slots / array->n_segments;
+-
+- /* Look through n slots after the segment * n'th slot */
+-
+- os_mutex_enter(array->mutex);
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
+-
+- if (slot->reserved) {
+- /* Found an i/o request */
+-
++ n = array->n_slots;
++
++ /* Look through n slots */
++
++ os_mutex_enter(array->mutex);
++
++ for (i = 0; i < n; i++) {
++ slot = os_aio_array_get_nth_slot(array, i );
++
++ if (slot->reserved &&
++ (slot->status == OS_AIO_NOT_ISSUED ||
++ slot->status == OS_AIO_DONE)) {
++ /* Found an i/o request.
++ * OS_AIO_NOT_ISSUED means the read or write request has
++ * yet to be done. OS_AIO_DONE means the request has been
++ * done but it was part of a set of requests merged into
++ * one read or write call and was not the first block in
++ * the request, so the handling of the IO completion for
++ * that block has not been done. */
++
+ break;
+ }
+ }
+@@ -3303,7 +3367,25 @@
+ os_mutex_exit(array->mutex);
+
+ if (i < n) {
+- os_event_set(os_aio_segment_wait_events[global_segment]);
++ if (array == os_aio_ibuf_array) {
++ os_event_set(os_aio_segment_wait_events[0]);
++
++ } else if (array == os_aio_log_array) {
++ os_event_set(os_aio_segment_wait_events[1]);
++
++ } else if (array == os_aio_read_array) {
++ ulint x;
++ for (x = os_aio_first_read_segment; x < os_aio_first_write_segment; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else if (array == os_aio_write_array) {
++ ulint x;
++ for (x = os_aio_first_write_segment; x < os_aio_n_segments; x++)
++ os_event_set(os_aio_segment_wait_events[x]);
++
++ } else {
++ ut_a(0);
++ }
+ }
+ }
+
+@@ -3320,13 +3402,14 @@
+ /* We do not use simulated aio: do nothing */
+
+ return;
+- }
+-
+- os_aio_recommend_sleep_for_read_threads = FALSE;
+-
+- for (i = 0; i < os_aio_n_segments; i++) {
+- os_aio_simulated_wake_handler_thread(i);
+- }
++ }
++
++ os_aio_recommend_sleep_for_read_threads = FALSE;
++
++ os_aio_simulated_wake_handler_thread(os_aio_ibuf_array);
++ os_aio_simulated_wake_handler_thread(os_aio_log_array);
++ os_aio_simulated_wake_handler_thread(os_aio_read_array);
++ os_aio_simulated_wake_handler_thread(os_aio_write_array);
+ }
+
+ /**************************************************************************
+@@ -3339,18 +3422,13 @@
+ os_aio_simulated_put_read_threads_to_sleep(void)
+ /*============================================*/
+ {
+- os_aio_array_t* array;
+ ulint g;
+
++ /* TODO(mcallaghan): provide similar function for write? */
+ os_aio_recommend_sleep_for_read_threads = TRUE;
+
+- for (g = 0; g < os_aio_n_segments; g++) {
+- os_aio_get_array_and_local_segment(&array, g);
+-
+- if (array == os_aio_read_array) {
+-
+- os_event_reset(os_aio_segment_wait_events[g]);
+- }
++ for (g = os_aio_first_read_segment; g < os_aio_first_write_segment; g++) {
++ os_event_reset(os_aio_segment_wait_events[g]);
+ }
+ }
+
+@@ -3480,8 +3558,7 @@
+ #endif
+ } else {
+ if (!wake_later) {
+- os_aio_simulated_wake_handler_thread(
+- os_aio_get_segment_no_from_slot(array, slot));
++ os_aio_simulated_wake_handler_thread(array);
+ }
+ }
+ } else if (type == OS_FILE_WRITE) {
+@@ -3497,8 +3574,7 @@
+ #endif
+ } else {
+ if (!wake_later) {
+- os_aio_simulated_wake_handler_thread(
+- os_aio_get_segment_no_from_slot(array, slot));
++ os_aio_simulated_wake_handler_thread(array);
+ }
+ }
+ } else {
+@@ -3561,7 +3637,7 @@
+ os_aio_windows_handle(
+ /*==================*/
+ /* out: TRUE if the aio operation succeeded */
+- ulint segment, /* in: the number of the segment in the aio
++ ulint global_segment, /* in: the number of the segment in the aio
+ arrays to wait for; segment 0 is the ibuf
+ i/o thread, segment 1 the log i/o thread,
+ then follow the non-ibuf read threads, and as
+@@ -3579,7 +3655,6 @@
+ void** message2,
+ ulint* type) /* out: OS_FILE_WRITE or ..._READ */
+ {
+- ulint orig_seg = segment;
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+ ulint n;
+@@ -3588,33 +3663,30 @@
+ BOOL ret;
+ DWORD len;
+
+- if (segment == ULINT_UNDEFINED) {
++ if (global_segment == ULINT_UNDEFINED) {
+ array = os_aio_sync_array;
+- segment = 0;
+- } else {
+- segment = os_aio_get_array_and_local_segment(&array, segment);
++ } else {
++ array = os_aio_get_array(global_segment);
+ }
+
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+ we do not have to acquire the protecting mutex yet */
+
+ ut_ad(os_aio_validate());
+- ut_ad(segment < array->n_segments);
+-
+- n = array->n_slots / array->n_segments;
++
++ n = array->n_slots;
+
+ if (array == os_aio_sync_array) {
+ os_event_wait(os_aio_array_get_nth_slot(array, pos)->event);
+ i = pos;
+ } else {
+- srv_set_io_thread_op_info(orig_seg, "wait Windows aio");
+- i = os_event_wait_multiple(n,
+- (array->native_events) + segment * n);
+- }
+-
+- os_mutex_enter(array->mutex);
+-
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
++ srv_set_io_thread_op_info(global_segment, "wait Windows aio");
++ i = os_event_wait_multiple(n, (array->native_events));
++ }
++
++ os_mutex_enter(array->mutex);
++
++ slot = os_aio_array_get_nth_slot(array, i);
+
+ ut_a(slot->reserved);
+
+@@ -3787,14 +3859,16 @@
+ ulint* type) /* out: OS_FILE_WRITE or ..._READ */
+ {
+ os_aio_array_t* array;
+- ulint segment;
+ os_aio_slot_t* slot;
+ os_aio_slot_t* slot2;
+ os_aio_slot_t* consecutive_ios[OS_AIO_MERGE_N_CONSECUTIVE];
++ os_aio_slot_t* lowest_request;
++ os_aio_slot_t* oldest_request;
+ ulint n_consecutive;
+ ulint total_len;
+ ulint offs;
+ ulint lowest_offset;
++ ulint oldest_offset;
+ ulint biggest_age;
+ ulint age;
+ byte* combined_buf;
+@@ -3802,8 +3876,10 @@
+ ibool ret;
+ ulint n;
+ ulint i;
+-
+- segment = os_aio_get_array_and_local_segment(&array, global_segment);
++
++ double start_usecs, stop_usecs, elapsed_usecs;
++ time_t now;
++ array = os_aio_get_array(global_segment);
+
+ restart:
+ /* NOTE! We only access constant fields in os_aio_array. Therefore
+@@ -3812,11 +3888,10 @@
+ srv_set_io_thread_op_info(global_segment,
+ "looking for i/o requests (a)");
+ ut_ad(os_aio_validate());
+- ut_ad(segment < array->n_segments);
+-
+- n = array->n_slots / array->n_segments;
+-
+- /* Look through n slots after the segment * n'th slot */
++
++ n = array->n_slots;
++
++ /* Look through n slots */
+
+ if (array == os_aio_read_array
+ && os_aio_recommend_sleep_for_read_threads) {
+@@ -3836,9 +3911,9 @@
+ done */
+
+ for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
+-
+- if (slot->reserved && slot->io_already_done) {
++ slot = os_aio_array_get_nth_slot(array, i);
++
++ if (slot->reserved && slot->status == OS_AIO_DONE) {
+
+ if (os_aio_print_debug) {
+ fprintf(stderr,
+@@ -3846,79 +3921,66 @@
+ }
+
+ ret = TRUE;
+-
++
+ goto slot_io_done;
+ }
+ }
+
+- n_consecutive = 0;
+-
+- /* If there are at least 2 seconds old requests, then pick the oldest
+- one to prevent starvation. If several requests have the same age,
+- then pick the one at the lowest offset. */
+-
+ biggest_age = 0;
+- lowest_offset = ULINT_MAX;
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array, i + segment * n);
+-
+- if (slot->reserved) {
+- age = (ulint)difftime(time(NULL),
+- slot->reservation_time);
+-
++ now = time(NULL);
++ oldest_request = lowest_request = NULL;
++ oldest_offset = lowest_offset = ULINT_MAX;
++
++ /* Find the oldest request and the request with the smallest offset */
++ for (i = 0; i < n; i++) {
++ slot = os_aio_array_get_nth_slot(array, i);
++
++ if (slot->reserved && slot->status == OS_AIO_NOT_ISSUED) {
++ age = (ulint)difftime(now, slot->reservation_time);
++
++ /* If there are at least 2 seconds old requests, then pick the oldest
++ one to prevent starvation. If several requests have the same age,
++ then pick the one at the lowest offset. */
+ if ((age >= 2 && age > biggest_age)
+ || (age >= 2 && age == biggest_age
+- && slot->offset < lowest_offset)) {
++ && slot->offset < oldest_offset)) {
+
+ /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
+ biggest_age = age;
++ oldest_request = slot;
++ oldest_offset = slot->offset;
++ }
++
++ /* Look for an i/o request at the lowest offset in the array
++ * (we ignore the high 32 bits of the offset) */
++ if (slot->offset < lowest_offset) {
++ /* Found an i/o request */
++ lowest_request = slot;
+ lowest_offset = slot->offset;
+ }
+ }
+ }
+
+- if (n_consecutive == 0) {
+- /* There were no old requests. Look for an i/o request at the
+- lowest offset in the array (we ignore the high 32 bits of the
+- offset in these heuristics) */
+-
+- lowest_offset = ULINT_MAX;
+-
+- for (i = 0; i < n; i++) {
+- slot = os_aio_array_get_nth_slot(array,
+- i + segment * n);
+-
+- if (slot->reserved && slot->offset < lowest_offset) {
+-
+- /* Found an i/o request */
+- consecutive_ios[0] = slot;
+-
+- n_consecutive = 1;
+-
+- lowest_offset = slot->offset;
+- }
+- }
+- }
+-
+- if (n_consecutive == 0) {
++ if (!lowest_request && !oldest_request) {
+
+ /* No i/o requested at the moment */
+
+ goto wait_for_io;
+ }
+
+- slot = consecutive_ios[0];
++ if (oldest_request) {
++ slot = oldest_request;
++ } else {
++ slot = lowest_request;
++ }
++ consecutive_ios[0] = slot;
++ n_consecutive = 1;
+
+ /* Check if there are several consecutive blocks to read or write */
+
+ consecutive_loop:
+ for (i = 0; i < n; i++) {
+- slot2 = os_aio_array_get_nth_slot(array, i + segment * n);
++ slot2 = os_aio_array_get_nth_slot(array, i);
+
+ if (slot2->reserved && slot2 != slot
+ && slot2->offset == slot->offset + slot->len
+@@ -3926,7 +3988,8 @@
+ sum does not wrap over */
+ && slot2->offset_high == slot->offset_high
+ && slot2->type == slot->type
+- && slot2->file == slot->file) {
++ && slot2->file == slot->file
++ && slot2->status == OS_AIO_NOT_ISSUED) {
+
+ /* Found a consecutive i/o request */
+
+@@ -3935,7 +3998,8 @@
+
+ slot = slot2;
+
+- if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE) {
++ if (n_consecutive < OS_AIO_MERGE_N_CONSECUTIVE
++ && n_consecutive < innobase_max_merged_io) {
+
+ goto consecutive_loop;
+ } else {
+@@ -3955,6 +4019,8 @@
+
+ for (i = 0; i < n_consecutive; i++) {
+ total_len += consecutive_ios[i]->len;
++ ut_a(consecutive_ios[i]->status == OS_AIO_NOT_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_ISSUED;
+ }
+
+ if (n_consecutive == 1) {
+@@ -3962,7 +4028,16 @@
+ combined_buf = slot->buf;
+ combined_buf2 = NULL;
+ } else {
+- combined_buf2 = ut_malloc(total_len + UNIV_PAGE_SIZE);
++ if ((total_len + UNIV_PAGE_SIZE) > os_aio_thread_buffer_size[global_segment]) {
++
++ if (os_aio_thread_buffer[global_segment])
++ ut_free(os_aio_thread_buffer[global_segment]);
++
++ os_aio_thread_buffer[global_segment] = ut_malloc(total_len + UNIV_PAGE_SIZE);
++
++ os_aio_thread_buffer_size[global_segment] = total_len + UNIV_PAGE_SIZE;
++ }
++ combined_buf2 = os_aio_thread_buffer[global_segment];
+
+ ut_a(combined_buf2);
+
+@@ -3973,6 +4048,9 @@
+ this assumes that there is just one i/o-handler thread serving
+ a single segment of slots! */
+
++ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_ISSUED);
++
+ os_mutex_exit(array->mutex);
+
+ if (slot->type == OS_FILE_WRITE && n_consecutive > 1) {
+@@ -3998,6 +4076,7 @@
+
+ /* Do the i/o with ordinary, synchronous i/o functions: */
+ if (slot->type == OS_FILE_WRITE) {
++ os_aio_thread_io_writes[global_segment] += n_consecutive;
+ if (array == os_aio_write_array) {
+ if ((total_len % UNIV_PAGE_SIZE != 0)
+ || (slot->offset % UNIV_PAGE_SIZE != 0)) {
+@@ -4012,16 +4091,34 @@
+ os_file_check_page_trailers(combined_buf, total_len);
+ }
+
++ start_usecs = time_usecs();
+ ret = os_file_write(slot->name, slot->file, combined_buf,
+ slot->offset, slot->offset_high, total_len);
+-
++ stop_usecs = time_usecs();
++ elapsed_usecs = stop_usecs - start_usecs;
++ if (elapsed_usecs < 0) elapsed_usecs = 0;
+ if (array == os_aio_write_array) {
+ os_file_check_page_trailers(combined_buf, total_len);
+ }
+- } else {
++ os_aio_write_requests++;
++ os_aio_pages_written += n_consecutive;
++ os_aio_write_time += (ib_longlong)elapsed_usecs;
++ } else {
++ start_usecs = time_usecs();
++ os_aio_thread_io_reads[global_segment] += n_consecutive;
+ ret = os_file_read(slot->file, combined_buf,
+ slot->offset, slot->offset_high, total_len);
+- }
++ stop_usecs = time_usecs();
++ elapsed_usecs = stop_usecs - start_usecs;
++ if (elapsed_usecs < 0) elapsed_usecs = 0;
++ os_aio_read_requests++;
++ os_aio_pages_read += n_consecutive;
++ os_aio_read_time += (ib_longlong)elapsed_usecs;
++ }
++ if (elapsed_usecs > os_aio_thread_max_io_wait[global_segment])
++ os_aio_thread_max_io_wait[global_segment] = elapsed_usecs;
++ os_aio_thread_io_wait[global_segment] += elapsed_usecs;
++ os_aio_thread_io_requests[global_segment]++;
+
+ ut_a(ret);
+ srv_set_io_thread_op_info(global_segment, "file i/o done");
+@@ -4042,16 +4139,13 @@
+ }
+ }
+
+- if (combined_buf2) {
+- ut_free(combined_buf2);
+- }
+-
+ os_mutex_enter(array->mutex);
+
+ /* Mark the i/os done in slots */
+
+ for (i = 0; i < n_consecutive; i++) {
+- consecutive_ios[i]->io_already_done = TRUE;
++ ut_a(consecutive_ios[i]->status == OS_AIO_ISSUED);
++ consecutive_ios[i]->status = OS_AIO_DONE;
+ }
+
+ /* We return the messages for the first slot now, and if there were
+@@ -4061,6 +4155,8 @@
+ slot_io_done:
+
+ ut_a(slot->reserved);
++ ut_a(slot->status == OS_AIO_DONE);
++ slot->status = OS_AIO_CLAIMED;
+
+ *message1 = slot->message1;
+ *message2 = slot->message2;
+@@ -4070,7 +4166,8 @@
+ os_mutex_exit(array->mutex);
+
+ os_aio_array_free_slot(array, slot);
+-
++ srv_set_io_thread_op_info(global_segment, "exited handler");
++
+ return(ret);
+
+ wait_for_io:
+@@ -4115,7 +4212,6 @@
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+- ut_a(array->n_segments > 0);
+
+ for (i = 0; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+@@ -4165,11 +4261,20 @@
+ double time_elapsed;
+ double avg_bytes_read;
+ ulint i;
+-
+- for (i = 0; i < srv_n_file_io_threads; i++) {
+- fprintf(file, "I/O thread %lu state: %s (%s)", (ulong) i,
+- srv_io_thread_op_info[i],
+- srv_io_thread_function[i]);
++ ulint num_issued, num_done, num_claimed;
++
++ if (file) {
++ for (i = 0; i < os_aio_n_segments; i++) {
++ fprintf(file,
++ "I/O thread %lu state: %s (%s) reads %lu writes %lu "
++ "requests %lu io secs %lf io msecs/request %lf max_io_wait %lf",
++ i, srv_io_thread_op_info[i], srv_io_thread_function[i],
++ os_aio_thread_io_reads[i], os_aio_thread_io_writes[i],
++ os_aio_thread_io_requests[i],
++ os_aio_thread_io_wait[i] / 1000000.0,
++ os_aio_thread_io_requests[i] ?
++ os_aio_thread_io_wait[i] / os_aio_thread_io_requests[i] / 1000.0 : 0.0,
++ os_aio_thread_max_io_wait[i] / 1000.0);
+
+ #ifndef __WIN__
+ if (os_aio_segment_wait_events[i]->is_set) {
+@@ -4181,6 +4286,7 @@
+ }
+
+ fputs("Pending normal aio reads:", file);
++ } // if (file)
+
+ array = os_aio_read_array;
+ loop:
+@@ -4189,14 +4295,23 @@
+ os_mutex_enter(array->mutex);
+
+ ut_a(array->n_slots > 0);
+- ut_a(array->n_segments > 0);
+
+ n_reserved = 0;
++ num_done = num_issued = num_claimed = 0;
+
+ for (i = 0; i < array->n_slots; i++) {
+ slot = os_aio_array_get_nth_slot(array, i);
+
+ if (slot->reserved) {
++ if (slot->status == OS_AIO_ISSUED)
++ num_issued++;
++ else if (slot->status == OS_AIO_DONE)
++ num_done++;
++ else {
++ ut_ad(slot->status == OS_AIO_CLAIMED);
++ num_claimed++;
++ }
++
+ n_reserved++;
+ /* fprintf(stderr, "Reserved slot, messages %p %p\n",
+ slot->message1, slot->message2); */
+@@ -4206,42 +4321,56 @@
+
+ ut_a(array->n_reserved == n_reserved);
+
+- fprintf(file, " %lu", (ulong) n_reserved);
+-
++ if (file) fprintf(file, " %lu", (ulong) n_reserved);
++
+ os_mutex_exit(array->mutex);
+
+ if (array == os_aio_read_array) {
+- fputs(", aio writes:", file);
+-
++ inno_pending_normal_aio_reads = (ulong) n_reserved;
++ if (file) fputs(", aio writes:", file);
+ array = os_aio_write_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_write_array) {
+- fputs(",\n ibuf aio reads:", file);
++ inno_pending_normal_aio_writes = (ulong) n_reserved;
++ if (file) fputs(",\n ibuf aio reads:", file);
+ array = os_aio_ibuf_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_ibuf_array) {
+- fputs(", log i/o's:", file);
++ inno_pending_ibuf_aio_reads = (ulong) n_reserved;
++ if (file) fputs(", log i/o's:", file);
+ array = os_aio_log_array;
+
+ goto loop;
+ }
+
+ if (array == os_aio_log_array) {
+- fputs(", sync i/o's:", file);
++ inno_pending_log_ios = (ulong) n_reserved;
++ if (file) fputs(", sync i/o's:", file);
+ array = os_aio_sync_array;
+
+ goto loop;
+ }
+
+- putc('\n', file);
++ if (array == os_aio_sync_array) {
++ inno_pending_sync_ios = (ulong) n_reserved;
++ }
++
+ current_time = time(NULL);
+ time_elapsed = 0.001 + difftime(current_time, os_last_printout);
++
++ if (file) {
++ putc('\n', file);
++ fprintf(file,
++ "Summary of background IO slot status: %lu issued, "
++ "%lu done, %lu claimed, sleep set %d\n",
++ num_issued, num_done, num_claimed,
++ os_aio_recommend_sleep_for_read_threads);
+
+ fprintf(file,
+ "Pending flushes (fsync) log: %lu; buffer pool: %lu\n"
+@@ -4274,6 +4403,7 @@
+ / time_elapsed,
+ (os_n_fsyncs - os_n_fsyncs_old)
+ / time_elapsed);
++ } // if (file)
+
+ os_n_file_reads_old = os_n_file_reads;
+ os_n_file_writes_old = os_n_file_writes;
+diff -r 322370200e6a innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/srv/srv0srv.c Mon Nov 03 05:08:52 2008 -0800
+@@ -164,7 +164,17 @@
+ ulint srv_mem_pool_size = ULINT_MAX; /* size in bytes */
+ ulint srv_lock_table_size = ULINT_MAX;
+
++ulint srv_io_capacity = ULINT_MAX; /* Number of IO operations per
++ second the server can do */
++
++ibool srv_extra_dirty_writes = TRUE; /* Write dirty pages to disk when pct
++ dirty < max dirty pct */
++
++/* Deprecated by srv_n_{read,write}_io_threads */
+ ulint srv_n_file_io_threads = ULINT_MAX;
++/* Number of background IO threads for read and write requests */
++ulint srv_n_read_io_threads = ULINT_MAX;
++ulint srv_n_write_io_threads = ULINT_MAX;
+
+ #ifdef UNIV_LOG_ARCHIVE
+ ibool srv_log_archive_on = FALSE;
+@@ -238,6 +248,24 @@
+
+ /* variable to count the number of random read-aheads */
+ ulint srv_read_ahead_rnd = 0;
++
++/* Number of read/write IO requests done by all threads */
++ulint os_aio_read_requests = 0;
++ulint os_aio_write_requests = 0;
++
++/* Number of pages read/written by all threads */
++ulint os_aio_pages_read = 0;
++ulint os_aio_pages_written = 0;
++
++/* Time in usecs spent performing reads/writes by all threads */
++ib_longlong os_aio_read_time = 0;
++ib_longlong os_aio_write_time = 0;
++
++ulint inno_pending_normal_aio_reads = 0;
++ulint inno_pending_normal_aio_writes = 0;
++ulint inno_pending_ibuf_aio_reads = 0;
++ulint inno_pending_log_ios = 0;
++ulint inno_pending_sync_ios = 0;
+
+ /* structure to pass status variables to MySQL */
+ export_struc export_vars;
+@@ -413,6 +441,23 @@
+
+ ulint srv_main_thread_process_no = 0;
+ ulint srv_main_thread_id = 0;
++
++// The following count work done by srv_master_thread.
++
++// Iterations by the 'once per second' loop.
++ulint srv_main_1_second_loops = 0;
++// Calls to sleep by the 'once per second' loop.
++ulint srv_main_sleeps = 0;
++// Iterations by the 'once per 10 seconds' loop.
++ulint srv_main_10_second_loops = 0;
++// Iterations of the loop bounded by the 'background_loop' label.
++ulint srv_main_background_loops = 0;
++// Iterations of the loop bounded by the 'flush_loop' label.
++ulint srv_main_flush_loops = 0;
++// Calls to log_buffer_flush_to_disk.
++ulint srv_sync_flush = 0;
++// Calls to log_buffer_flush_maybe_sync.
++ulint srv_async_flush = 0;
+
+ /*
+ IMPLEMENTATION OF THE SERVER MAIN PROGRAM
+@@ -2170,7 +2215,12 @@
+ }
+
+ /*************************************************************************
+-The master thread controlling the server. */
++Returns the number of IO operations that is pct percent of srv_io_capacity.
++
++PCT_IO(5) -> the number of IO operations that is 5% of srv_io_capacity.
++pct may be any arithmetic expression; it is parenthesized in the expansion.
++*/
++#define PCT_IO(pct) ((ulint) (srv_io_capacity * ((double) (pct) / 100.0)))
+
+ #ifndef __WIN__
+ void*
+@@ -2199,11 +2249,15 @@
+ ulint n_pend_ios;
+ ibool skip_sleep = FALSE;
+ ulint i;
++
+
+ #ifdef UNIV_DEBUG_THREAD_CREATION
+ fprintf(stderr, "Master thread starts, id %lu\n",
+ os_thread_pf(os_thread_get_curr_id()));
+ #endif
++ fprintf(stderr, "InnoDB master thread running with io_capacity %lu\n",
++ srv_io_capacity);
++
+ srv_main_thread_process_no = os_proc_get_number();
+ srv_main_thread_id = os_thread_pf(os_thread_get_curr_id());
+
+@@ -2275,26 +2329,28 @@
+
+ srv_main_thread_op_info = "flushing log";
+ log_buffer_flush_to_disk();
++ srv_sync_flush++;
+
+ srv_main_thread_op_info = "making checkpoint";
+ log_free_check();
+
+- /* If there were less than 5 i/os during the
+- one second sleep, we assume that there is free
+- disk i/o capacity available, and it makes sense to
+- do an insert buffer merge. */
++ /* If i/os during one second sleep were less than 5% of
++ capacity, we assume that there is free disk i/o capacity
++ available, and it makes sense to do an insert buffer merge. */
+
+ n_pend_ios = buf_get_n_pending_ios()
+ + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_old < 5)) {
++ if (n_pend_ios < PCT_IO(3) && (n_ios - n_ios_old < PCT_IO(5))) {
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
+ }
+
+ if (buf_get_modified_ratio_pct() >
+@@ -2303,7 +2359,8 @@
+ /* Try to keep the number of modified pages in the
+ buffer pool under the limit wished by the user */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST,
++ PCT_IO(100),
+ ut_dulint_max);
+
+ /* If we had to do the flush, it may have taken
+@@ -2325,36 +2382,47 @@
+
+ /* ---- We perform the following code approximately once per
+ 10 seconds when there is database activity */
++ srv_main_10_second_loops++;
+
+ #ifdef MEM_PERIODIC_CHECK
+ /* Check magic numbers of every allocated mem block once in 10
+ seconds */
+ mem_validate_all_blocks();
+ #endif
+- /* If there were less than 200 i/os during the 10 second period,
+- we assume that there is free disk i/o capacity available, and it
+- makes sense to flush 100 pages. */
++ /* If i/os during the 10 second period were less than 200% of
++ capacity, we assume that there is free disk i/o capacity
++ available, and it makes sense to flush srv_io_capacity pages.
++
++ Note that this is done regardless of the fraction of dirty
++ pages relative to the max requested by the user. The one second
++ loop above requests writes for that case. The writes done here
++ are not required, and may be disabled. */
+
+ n_pend_ios = buf_get_n_pending_ios() + log_sys->n_pending_writes;
+ n_ios = log_sys->n_log_ios + buf_pool->n_pages_read
+ + buf_pool->n_pages_written;
+- if (n_pend_ios < 3 && (n_ios - n_ios_very_old < 200)) {
++ if (srv_extra_dirty_writes &&
++ n_pend_ios < PCT_IO(3) && (n_ios - n_ios_very_old < PCT_IO(200))) {
+
+ srv_main_thread_op_info = "flushing buffer pool pages";
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
+ }
+
+ /* We run a batch of insert buffer merge every 10 seconds,
+ even if the server were active */
+
+ srv_main_thread_op_info = "doing insert buffer merge";
+- ibuf_contract_for_n_pages(TRUE, 5);
++ ibuf_contract_for_n_pages(TRUE, PCT_IO(5));
+
+ srv_main_thread_op_info = "flushing log";
+- log_buffer_flush_to_disk();
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
+
+ /* We run a full purge every 10 seconds, even if the server
+ were active */
+@@ -2378,8 +2446,9 @@
+ if (difftime(current_time, last_flush_time) > 1) {
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ log_buffer_flush_to_disk();
+ last_flush_time = current_time;
++ srv_sync_flush++;
+ }
+ }
+
+@@ -2393,14 +2462,14 @@
+ (> 70 %), we assume we can afford reserving the disk(s) for
+ the time it requires to flush 100 pages */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 100,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100),
+ ut_dulint_max);
+ } else {
+ /* Otherwise, we only flush a small number of pages so that
+ we do not unnecessarily use much disk i/o capacity from
+ other work */
+
+- n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, 10,
++ n_pages_flushed = buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(10),
+ ut_dulint_max);
+ }
+
+@@ -2434,7 +2503,7 @@
+
+ /* The server has been quiet for a while: start running background
+ operations */
+-
++ srv_main_background_loops++;
+ srv_main_thread_op_info = "doing background drop tables";
+
+ n_tables_to_drop = row_drop_tables_for_mysql_in_background();
+@@ -2472,6 +2541,7 @@
+
+ log_buffer_flush_to_disk();
+ last_flush_time = current_time;
++ srv_sync_flush++;
+ }
+ }
+
+@@ -2487,9 +2557,13 @@
+ srv_main_thread_op_info = "doing insert buffer merge";
+
+ if (srv_fast_shutdown && srv_shutdown_state > 0) {
+- n_bytes_merged = 0;
++ n_bytes_merged = 0;
+ } else {
+- n_bytes_merged = ibuf_contract_for_n_pages(TRUE, 20);
++ /* This should do an amount of IO similar to the number of
++ * dirty pages that will be flushed in the call to
++ * buf_flush_batch below. Otherwise, the system favors
++ * clean pages over cleanup throughput. */
++ n_bytes_merged = ibuf_contract_for_n_pages(TRUE, PCT_IO(100));
+ }
+
+ srv_main_thread_op_info = "reserving kernel mutex";
+@@ -2503,10 +2577,11 @@
+
+ flush_loop:
+ srv_main_thread_op_info = "flushing buffer pool pages";
++ srv_main_flush_loops++;
+
+ if (srv_fast_shutdown < 2) {
+ n_pages_flushed =
+- buf_flush_batch(BUF_FLUSH_LIST, 100, ut_dulint_max);
++ buf_flush_batch(BUF_FLUSH_LIST, PCT_IO(100), ut_dulint_max);
+ } else {
+ /* In the fastest shutdown we do not flush the buffer pool
+ to data files: we set n_pages_flushed to 0 artificially. */
+@@ -2528,7 +2603,17 @@
+
+ srv_main_thread_op_info = "flushing log";
+
+- log_buffer_flush_to_disk();
++ current_time = time(NULL);
++ if (difftime(current_time, last_flush_time) > 1) {
++ srv_main_thread_op_info = (char*) "flushing log";
++ log_buffer_flush_to_disk();
++ last_flush_time = current_time;
++ srv_sync_flush++;
++ } else {
++ /* No fsync when srv_flush_log_at_trx_commit != 1 */
++ log_buffer_flush_maybe_sync();
++ srv_async_flush++;
++ }
+
+ srv_main_thread_op_info = "making checkpoint";
+
+diff -r 322370200e6a innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c Mon Nov 03 05:07:57 2008 -0800
++++ b/innobase/srv/srv0start.c Mon Nov 03 05:08:52 2008 -0800
+@@ -973,6 +973,7 @@
+ ulint i;
+ ibool srv_file_per_table_original_value = srv_file_per_table;
+ mtr_t mtr;
++ ulint n_threads;
+ #ifdef HAVE_DARWIN_THREADS
+ # ifdef F_FULLFSYNC
+ /* This executable has been compiled on Mac OS X 10.3 or later.
+@@ -1206,24 +1207,32 @@
+ }
+
+ /* Restrict the maximum number of file i/o threads */
+- if (srv_n_file_io_threads > SRV_MAX_N_IO_THREADS) {
+-
+- srv_n_file_io_threads = SRV_MAX_N_IO_THREADS;
++ if ((srv_n_read_io_threads + srv_n_write_io_threads) > SRV_MAX_N_IO_THREADS) {
++ /* The thread counts are ulint: print them with %lu and a cast, not %d. */
++ fprintf(stderr, "InnoDB: requested too many read(%lu) or write(%lu) IO threads, max is %lu\n",
++ (unsigned long) srv_n_read_io_threads, (unsigned long) srv_n_write_io_threads, (unsigned long) SRV_MAX_N_IO_THREADS);
++ return(DB_ERROR);
+ }
+
+ if (!os_aio_use_native_aio) {
+- /* In simulated aio we currently have use only for 4 threads */
+- srv_n_file_io_threads = 4;
++ /* More than 4 threads are now supported. */
++ n_threads = os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD,
++ srv_n_read_io_threads,
++ srv_n_write_io_threads,
++ SRV_MAX_N_PENDING_SYNC_IOS);
++ } else {
++ /* Might need more slots here. Alas, I don't do windows. */
++ n_threads = os_aio_init(SRV_N_PENDING_IOS_PER_THREAD,
++ srv_n_read_io_threads,
++ srv_n_write_io_threads,
++ SRV_MAX_N_PENDING_SYNC_IOS);
++ }
+
+- os_aio_init(8 * SRV_N_PENDING_IOS_PER_THREAD
+- * srv_n_file_io_threads,
+- srv_n_file_io_threads,
+- SRV_MAX_N_PENDING_SYNC_IOS);
+- } else {
+- os_aio_init(SRV_N_PENDING_IOS_PER_THREAD
+- * srv_n_file_io_threads,
+- srv_n_file_io_threads,
+- SRV_MAX_N_PENDING_SYNC_IOS);
++ if (n_threads > SRV_MAX_N_IO_THREADS) {
++ /* n_threads is ulint: print it with %lu and a cast, not %d. */
++ fprintf(stderr, "InnoDB: requested too many IO threads(%lu), max is %lu\n",
++ (unsigned long) n_threads, (unsigned long) SRV_MAX_N_IO_THREADS);
++ return(DB_ERROR);
+ }
+
+ fil_init(srv_max_n_open_files);
+@@ -1259,11 +1268,11 @@
+
+ /* Create i/o-handler threads: */
+
+- for (i = 0; i < srv_n_file_io_threads; i++) {
++ for (i = 0; i < n_threads; i++) {
+ n[i] = i;
+
+ os_thread_create(io_handler_thread, n + i, thread_ids + i);
+- }
++ }
+
+ #ifdef UNIV_LOG_ARCHIVE
+ if (0 != ut_strcmp(srv_log_group_home_dirs[0], srv_arch_dir)) {
+diff -r 322370200e6a patch_info/innodb_io_tune.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_io_tune.info Mon Nov 03 05:08:52 2008 -0800
+@@ -0,0 +1,9 @@
++File=innodb_io_tune.patch
++Name=Tune InnoDB IO settings
++Version=1.0
++Author=Google
++License=GPL
++Comment=
++ChangeLog=
++2008-11-01
++VT: Initial porting
+diff -r 322370200e6a sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/ha_innodb.cc Mon Nov 03 05:08:52 2008 -0800
+@@ -147,7 +147,7 @@
+ innobase_additional_mem_pool_size, innobase_file_io_threads,
+ innobase_lock_wait_timeout, innobase_force_recovery,
+ innobase_open_files;
+-
++long innobase_read_io_threads, innobase_write_io_threads;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+
+ /* The default values for the following char* start-up parameters
+@@ -175,6 +175,23 @@
+ my_bool innobase_rollback_on_timeout = FALSE;
+ my_bool innobase_create_status_file = FALSE;
+ my_bool innobase_adaptive_hash_index = TRUE;
++
++/* Max number of IO requests merged to perform large IO in background
++ IO threads.
++*/
++long innobase_max_merged_io = 64;
++
++/* Minimum time interval, in seconds, between calls to the innodb_show_status functions */
++long innobase_min_status_update_time_interval = 30;
++
++
++/* Default number of IO per second supported by server. Tunes background
++ IO rate
++*/
++long innobase_io_capacity = 100;
++
++/* Write dirty pages when pct dirty is less than max pct dirty */
++my_bool innobase_extra_dirty_writes = TRUE;
+
+ static char *internal_innobase_data_file_path = NULL;
+
+@@ -1372,7 +1389,11 @@
+
+ srv_mem_pool_size = (ulint) innobase_additional_mem_pool_size;
+
++ srv_io_capacity = (ulint) innobase_io_capacity;
++ srv_extra_dirty_writes = (ibool) innobase_extra_dirty_writes;
+ srv_n_file_io_threads = (ulint) innobase_file_io_threads;
++ srv_n_read_io_threads = (ulint) innobase_read_io_threads;
++ srv_n_write_io_threads = (ulint) innobase_write_io_threads;
+
+ srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ srv_force_recovery = (ulint) innobase_force_recovery;
+diff -r 322370200e6a sql/ha_innodb.h
+--- a/sql/ha_innodb.h Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/ha_innodb.h Mon Nov 03 05:08:52 2008 -0800
+@@ -197,6 +197,7 @@
+
+ extern struct show_var_st innodb_status_variables[];
+ extern ulong innobase_fast_shutdown;
++extern long innobase_max_merged_io;
+ extern ulong innobase_large_page_size;
+ extern long innobase_mirrored_log_groups, innobase_log_files_in_group;
+ extern longlong innobase_buffer_pool_size, innobase_log_file_size;
+@@ -205,10 +206,14 @@
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
+ extern long innobase_force_recovery;
++extern long innobase_read_io_threads, innobase_write_io_threads;
+ extern long innobase_open_files;
+ extern char *innobase_data_home_dir, *innobase_data_file_path;
+ extern char *innobase_log_group_home_dir, *innobase_log_arch_dir;
+ extern char *innobase_unix_file_flush_method;
++extern long innobase_io_capacity;
++extern my_bool innobase_extra_dirty_writes;
++
+ /* The following variables have to be my_bool for SHOW VARIABLES to work */
+ extern my_bool innobase_log_archive,
+ innobase_use_doublewrite,
+diff -r 322370200e6a sql/mysqld.cc
+--- a/sql/mysqld.cc Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/mysqld.cc Mon Nov 03 05:08:52 2008 -0800
+@@ -4932,6 +4932,11 @@
+ OPT_INNODB_ADDITIONAL_MEM_POOL_SIZE,
+ OPT_INNODB_MAX_PURGE_LAG,
+ OPT_INNODB_FILE_IO_THREADS,
++ OPT_INNODB_READ_IO_THREADS,
++ OPT_INNODB_WRITE_IO_THREADS,
++ OPT_INNODB_MAX_MERGED_IO,
++ OPT_INNODB_IO_CAPACITY,
++ OPT_INNODB_EXTRA_DIRTY_WRITES,
+ OPT_INNODB_LOCK_WAIT_TIMEOUT,
+ OPT_INNODB_THREAD_CONCURRENCY,
+ OPT_INNODB_COMMIT_CONCURRENCY,
+@@ -5302,6 +5307,25 @@
+ (gptr*) &global_system_variables.innodb_table_locks,
+ (gptr*) &global_system_variables.innodb_table_locks,
+ 0, GET_BOOL, OPT_ARG, 1, 0, 0, 0, 0, 0},
++ {"innodb_max_merged_io", OPT_INNODB_MAX_MERGED_IO,
++ "Max number of IO requests merged to issue large IO from background IO threads.",
++ (gptr*) &innobase_max_merged_io,
++ (gptr*) &innobase_max_merged_io, 0, GET_LONG, REQUIRED_ARG, 64, 1, 64, 0, 0, 0},
++ {"innodb_read_io_threads", OPT_INNODB_READ_IO_THREADS,
++ "Number of background read I/O threads in InnoDB.", (gptr*) &innobase_read_io_threads,
++ (gptr*) &innobase_read_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
++ {"innodb_write_io_threads", OPT_INNODB_WRITE_IO_THREADS,
++ "Number of background write I/O threads in InnoDB.", (gptr*) &innobase_write_io_threads,
++ (gptr*) &innobase_write_io_threads, 0, GET_LONG, REQUIRED_ARG, 1, 1, 64, 0, 1, 0},
++ {"innodb_io_capacity", OPT_INNODB_IO_CAPACITY,
++ "Number of IO operations per second the server can do. Tunes background IO rate.",
++ (gptr*) &innobase_io_capacity,
++ (gptr*) &innobase_io_capacity, 0, GET_LONG,
++ REQUIRED_ARG, 100, 100, 999999999, 0, 1, 0},
++ {"innodb_extra_dirty_writes", OPT_INNODB_EXTRA_DIRTY_WRITES,
++ "When set, flush dirty buffer pages when dirty pct is less than max dirty pct. ",
++ (gptr*) &innobase_extra_dirty_writes, (gptr*) &innobase_extra_dirty_writes,
++ 0, GET_BOOL, NO_ARG, 1, 0, 1, 0, 1, 0},
+ #endif /* End HAVE_INNOBASE_DB */
+ {"isam", OPT_ISAM, "Obsolete. ISAM storage engine is no longer supported.",
+ (gptr*) &opt_isam, (gptr*) &opt_isam, 0, GET_BOOL, NO_ARG, 0, 0, 0,
+diff -r 322370200e6a sql/set_var.cc
+--- a/sql/set_var.cc Mon Nov 03 05:07:57 2008 -0800
++++ b/sql/set_var.cc Mon Nov 03 05:08:52 2008 -0800
+@@ -919,12 +919,14 @@
+ {"innodb_data_home_dir", (char*) &innobase_data_home_dir, SHOW_CHAR_PTR},
+ {"innodb_adaptive_hash_index", (char*) &innobase_adaptive_hash_index, SHOW_MY_BOOL},
+ {"innodb_doublewrite", (char*) &innobase_use_doublewrite, SHOW_MY_BOOL},
++ {"innodb_extra_dirty_writes", (char*) &innobase_extra_dirty_writes, SHOW_MY_BOOL},
+ {sys_innodb_fast_shutdown.name,(char*) &sys_innodb_fast_shutdown, SHOW_SYS},
+ {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG },
+ {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL},
+ {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS},
+ {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR},
+ {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG },
++ {"innodb_io_capacity", (char*) &innobase_io_capacity, SHOW_LONG },
+ {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG },
+ {"innodb_locks_unsafe_for_binlog", (char*) &innobase_locks_unsafe_for_binlog, SHOW_MY_BOOL},
+ {"innodb_log_arch_dir", (char*) &innobase_log_arch_dir, SHOW_CHAR_PTR},
+@@ -943,6 +945,9 @@
+ {sys_innodb_table_locks.name, (char*) &sys_innodb_table_locks, SHOW_SYS},
+ {sys_innodb_thread_concurrency.name, (char*) &sys_innodb_thread_concurrency, SHOW_SYS},
+ {sys_innodb_thread_sleep_delay.name, (char*) &sys_innodb_thread_sleep_delay, SHOW_SYS},
++ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG },
++ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG },
++ {"innodb_max_merged_io", (char*) &innobase_max_merged_io, SHOW_LONG},
+ #endif
+ {sys_interactive_timeout.name,(char*) &sys_interactive_timeout, SHOW_SYS},
+ {sys_join_buffer_size.name, (char*) &sys_join_buffer_size, SHOW_SYS},
diff --git a/percona/5.0.91-b22-20100522/innodb_locks_held.patch b/percona/5.0.91-b22-20100522/innodb_locks_held.patch
new file mode 100644
index 0000000..062fa47
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_locks_held.patch
@@ -0,0 +1,219 @@
+diff -r e9fb5b8bcf78 innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Mon Jun 01 00:36:33 2009 -0700
++++ b/innobase/include/srv0srv.h Mon Jun 01 00:36:41 2009 -0700
+@@ -80,6 +80,8 @@
+ extern ulint srv_log_file_size;
+ extern ulint srv_log_buffer_size;
+ extern ulong srv_flush_log_at_trx_commit;
++extern ulong srv_show_locks_held;
++extern ulong srv_show_verbose_locks;
+
+ extern byte srv_latin1_ordering[256];/* The sort order table of the latin1
+ character set */
+diff -r e9fb5b8bcf78 innobase/lock/lock0lock.c
+--- a/innobase/lock/lock0lock.c Mon Jun 01 00:36:33 2009 -0700
++++ b/innobase/lock/lock0lock.c Mon Jun 01 00:36:41 2009 -0700
+@@ -4181,6 +4181,7 @@
+ #endif /* UNIV_SYNC_DEBUG */
+ }
+
++ if ( srv_show_verbose_locks ) {
+ for (i = 0; i < lock_rec_get_n_bits(lock); i++) {
+
+ if (lock_rec_get_nth_bit(lock, i)) {
+@@ -4198,6 +4199,7 @@
+ putc('\n', file);
+ }
+ }
++ } /* srv_show_verbose_locks */
+
+ mtr_commit(&mtr);
+ if (UNIV_LIKELY_NULL(heap)) {
+@@ -4369,7 +4371,7 @@
+ }
+ }
+
+- if (!srv_print_innodb_lock_monitor) {
++ if (!srv_print_innodb_lock_monitor && !srv_show_locks_held) {
+ nth_trx++;
+ goto loop;
+ }
+@@ -4426,9 +4428,9 @@
+
+ nth_lock++;
+
+- if (nth_lock >= 10) {
++ if (nth_lock >= srv_show_locks_held) {
+ fputs(
+- "10 LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n",
++ "TOO MANY LOCKS PRINTED FOR THIS TRX: SUPPRESSING FURTHER PRINTS\n",
+ file);
+
+ nth_trx++;
+diff -r e9fb5b8bcf78 innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Mon Jun 01 00:36:33 2009 -0700
++++ b/innobase/srv/srv0srv.c Mon Jun 01 00:36:41 2009 -0700
+@@ -116,6 +116,8 @@
+ ulint srv_log_file_size = ULINT_MAX; /* size in database pages */
+ ulint srv_log_buffer_size = ULINT_MAX; /* size in database pages */
+ ulong srv_flush_log_at_trx_commit = 1;
++ulong srv_show_locks_held = 10; /* locks printed per trx before further prints are suppressed; ulong to match the extern declarations */
++ulong srv_show_verbose_locks = 0; /* nonzero: also print the locked records of each record lock; ulong to match the extern declarations */
+
+ byte srv_latin1_ordering[256] /* The sort order table of the latin1
+ character set. The following table is
+@@ -1711,24 +1713,6 @@
+
+ mutex_exit(&dict_foreign_err_mutex);
+
+- lock_print_info_summary(file);
+- if (trx_start) {
+- long t = ftell(file);
+- if (t < 0) {
+- *trx_start = ULINT_UNDEFINED;
+- } else {
+- *trx_start = (ulint) t;
+- }
+- }
+- lock_print_info_all_transactions(file);
+- if (trx_end) {
+- long t = ftell(file);
+- if (t < 0) {
+- *trx_end = ULINT_UNDEFINED;
+- } else {
+- *trx_end = (ulint) t;
+- }
+- }
+ fputs("--------\n"
+ "FILE I/O\n"
+ "--------\n", file);
+@@ -1822,6 +1806,25 @@
+ srv_n_rows_deleted_old = srv_n_rows_deleted;
+ srv_n_rows_read_old = srv_n_rows_read;
+
++ lock_print_info_summary(file);
++ if (trx_start) {
++ long t = ftell(file);
++ if (t < 0) {
++ *trx_start = ULINT_UNDEFINED;
++ } else {
++ *trx_start = (ulint) t;
++ }
++ }
++ lock_print_info_all_transactions(file);
++ if (trx_end) {
++ long t = ftell(file);
++ if (t < 0) {
++ *trx_end = ULINT_UNDEFINED;
++ } else {
++ *trx_end = (ulint) t;
++ }
++ }
++
+ fputs("----------------------------\n"
+ "END OF INNODB MONITOR OUTPUT\n"
+ "============================\n", file);
+diff -r e9fb5b8bcf78 libmysqld/set_var.cc
+--- a/libmysqld/set_var.cc Mon Jun 01 00:36:33 2009 -0700
++++ b/libmysqld/set_var.cc Mon Jun 01 00:36:41 2009 -0700
+@@ -825,6 +825,8 @@
+ &sys_innodb_thread_concurrency,
+ &sys_innodb_commit_concurrency,
+ &sys_innodb_flush_log_at_trx_commit,
++ &sys_innodb_show_locks_held,
++ &sys_innodb_show_verbose_locks,
+ #endif
+ &sys_trust_routine_creators,
+ &sys_trust_function_creators,
+@@ -942,6 +944,8 @@
+ {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG },
+ {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL},
+ {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS},
++ {sys_innodb_show_locks_held.name, (char*) &sys_innodb_show_locks_held, SHOW_SYS },
++ {sys_innodb_show_verbose_locks.name, (char*) &sys_innodb_show_verbose_locks, SHOW_SYS },
+ {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR},
+ {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG },
+ {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG },
+diff -r e9fb5b8bcf78 patch_info/innodb_locks_held.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_locks_held.info Mon Jun 01 00:36:41 2009 -0700
+@@ -0,0 +1,6 @@
++File=innodb_locks_held.patch
++Name=Add locks held, remove locked records in SHOW INNODB STATUS
++Version=1.0
++Author=Baron Schwartz <baron@xaprb.com>
++License=GPL
++Comment=Bug #29126 fix
+diff -r e9fb5b8bcf78 sql/ha_innodb.h
+--- a/sql/ha_innodb.h Mon Jun 01 00:36:33 2009 -0700
++++ b/sql/ha_innodb.h Mon Jun 01 00:36:41 2009 -0700
+@@ -243,6 +243,8 @@
+ extern ulong srv_enable_unsafe_group_commit;
+ extern uint srv_read_ahead;
+ extern uint srv_adaptive_checkpoint;
++extern ulong srv_show_locks_held;
++extern ulong srv_show_verbose_locks;
+
+ /* An option to enable the fix for "Bug#43660 SHOW INDEXES/ANALYZE does
+ NOT update cardinality for indexes of InnoDB table". By default we are
+diff -r e9fb5b8bcf78 sql/mysqld.cc
+--- a/sql/mysqld.cc Mon Jun 01 00:36:33 2009 -0700
++++ b/sql/mysqld.cc Mon Jun 01 00:36:41 2009 -0700
+@@ -5016,6 +5016,8 @@
+ OPT_INNODB_MAX_PURGE_LAG,
+ OPT_INNODB_FILE_IO_THREADS,
+ OPT_INNODB_LOCK_WAIT_TIMEOUT,
++ OPT_INNODB_SHOW_LOCKS_HELD,
++ OPT_INNODB_SHOW_VERBOSE_LOCKS,
+ OPT_INNODB_THREAD_CONCURRENCY,
+ OPT_INNODB_COMMIT_CONCURRENCY,
+ OPT_INNODB_FORCE_RECOVERY,
+@@ -5364,6 +5366,14 @@
+ (gptr*) &srv_flush_log_at_trx_commit,
+ (gptr*) &srv_flush_log_at_trx_commit,
+ 0, GET_ULONG, OPT_ARG, 1, 0, 2, 0, 0, 0},
++ {"innodb_show_locks_held", OPT_INNODB_SHOW_LOCKS_HELD,
++ "Number of locks held to print for each InnoDB transaction in SHOW INNODB STATUS.",
++ (gptr*) &srv_show_locks_held, (gptr*) &srv_show_locks_held,
++ 0, GET_LONG, OPT_ARG, 10, 0, 1000, 0, 1, 0},
++ {"innodb_show_verbose_locks", OPT_INNODB_SHOW_VERBOSE_LOCKS,
++ "Whether to show records locked in SHOW INNODB STATUS.",
++ (gptr*) &srv_show_verbose_locks, (gptr*) &srv_show_verbose_locks,
++ 0, GET_LONG, OPT_ARG, 0, 0, 1, 0, 1, 0},
+ {"innodb_flush_method", OPT_INNODB_FLUSH_METHOD,
+ "With which method to flush data.", (gptr*) &innobase_unix_file_flush_method,
+ (gptr*) &innobase_unix_file_flush_method, 0, GET_STR, REQUIRED_ARG, 0, 0, 0,
+diff -r e9fb5b8bcf78 sql/set_var.cc
+--- a/sql/set_var.cc Mon Jun 01 00:36:33 2009 -0700
++++ b/sql/set_var.cc Mon Jun 01 00:36:41 2009 -0700
+@@ -527,6 +527,12 @@
+ sys_var_enum sys_innodb_adaptive_checkpoint("innodb_adaptive_checkpoint",
+ &srv_adaptive_checkpoint,
+ &innodb_adaptive_checkpoint_typelib, fix_innodb_adaptive_checkpoint);
++sys_var_long_ptr sys_innodb_show_locks_held(
++ "innodb_show_locks_held",
++ &srv_show_locks_held);
++sys_var_long_ptr sys_innodb_show_verbose_locks(
++ "innodb_show_verbose_locks",
++ &srv_show_verbose_locks);
+ sys_var_const_os_str_ptr sys_innodb_data_file_path("innodb_data_file_path",
+ &innobase_data_file_path);
+ sys_var_const_os_str_ptr sys_innodb_data_home_dir("innodb_data_home_dir",
+@@ -906,6 +912,8 @@
+ &sys_innodb_read_ahead,
+ &sys_innodb_enable_unsafe_group_commit,
+ &sys_innodb_adaptive_checkpoint,
++ &sys_innodb_show_locks_held,
++ &sys_innodb_show_verbose_locks,
+ #endif
+ &sys_trust_routine_creators,
+ &sys_trust_function_creators,
+@@ -1023,6 +1031,8 @@
+ {"innodb_file_io_threads", (char*) &innobase_file_io_threads, SHOW_LONG },
+ {"innodb_file_per_table", (char*) &innobase_file_per_table, SHOW_MY_BOOL},
+ {sys_innodb_flush_log_at_trx_commit.name, (char*) &sys_innodb_flush_log_at_trx_commit, SHOW_SYS},
++ {sys_innodb_show_locks_held.name, (char*) &sys_innodb_show_locks_held, SHOW_SYS },
++ {sys_innodb_show_verbose_locks.name, (char*) &sys_innodb_show_verbose_locks, SHOW_SYS },
+ {"innodb_flush_method", (char*) &innobase_unix_file_flush_method, SHOW_CHAR_PTR},
+ {"innodb_force_recovery", (char*) &innobase_force_recovery, SHOW_LONG },
+ {"innodb_lock_wait_timeout", (char*) &innobase_lock_wait_timeout, SHOW_LONG },
diff --git a/percona/5.0.91-b22-20100522/innodb_misc_patch.patch b/percona/5.0.91-b22-20100522/innodb_misc_patch.patch
new file mode 100644
index 0000000..4f4faf3
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_misc_patch.patch
@@ -0,0 +1,64 @@
+diff -ru mysql-5.0.84_p_orig/innobase/row/row0sel.c mysql-5.0.84/innobase/row/row0sel.c
+--- mysql-5.0.84_p_orig/innobase/row/row0sel.c 2009-07-07 21:54:10.000000000 +0900
++++ mysql-5.0.84/innobase/row/row0sel.c 2009-08-28 09:28:56.000000000 +0900
+@@ -2988,6 +2988,15 @@
+ return(SEL_FOUND);
+ }
+
++/**********************************************************************
++Returns true if the thread is executing a SELECT statement.
++(Prototype for global functions in ha_innodb.cc) */
++ibool
++thd_is_select(
++/*==========*/
++ /* out: true if thd is executing SELECT */
++ const void* thd); /* in: thread handle (THD*) */
++
+ /************************************************************************
+ Searches for rows in the database. This is used in the interface to
+ MySQL. This function opens a cursor, and also implements fetch next
+@@ -3361,20 +3370,12 @@
+
+ if (trx->isolation_level <= TRX_ISO_READ_COMMITTED
+ && prebuilt->select_lock_type != LOCK_NONE
+- && trx->mysql_query_str) {
+-
+- /* Scan the MySQL query string; check if SELECT is the first
+- word there */
+- ibool success;
+-
+- dict_accept(*trx->mysql_query_str, "SELECT", &success);
+-
+- if (success) {
++ && trx->mysql_thd != NULL
++ && thd_is_select(trx->mysql_thd)) {
+ /* It is a plain locking SELECT and the isolation
+ level is low: do not lock gaps */
+
+ set_also_gap_locks = FALSE;
+- }
+ }
+
+ /* Note that if the search mode was GE or G, then the cursor
+diff -ru mysql-5.0.84_p_orig/sql/ha_innodb.cc mysql-5.0.84/sql/ha_innodb.cc
+--- mysql-5.0.84_p_orig/sql/ha_innodb.cc 2009-08-27 16:06:21.000000000 +0900
++++ mysql-5.0.84/sql/ha_innodb.cc 2009-08-28 09:33:38.000000000 +0900
+@@ -394,6 +394,18 @@
+ }
+ }
+
++/**********************************************************************
++Returns true if the thread's current statement is a SELECT (lex->sql_command == SQLCOM_SELECT). */
++extern "C"
++ibool
++thd_is_select(
++/*==========*/
++ /* out: true if thd is executing SELECT */
++ const void* thd) /* in: thread handle (THD*); dereferenced without a check, so must not be NULL */
++{
++ return(((const THD*) thd)->lex->sql_command == SQLCOM_SELECT);
++}
++
+ /************************************************************************
+ Call this function when mysqld passes control to the client. That is to
+ avoid deadlocks on the adaptive hash S-latch possibly held by thd. For more
diff --git a/percona/5.0.91-b22-20100522/innodb_recovery_patches.patch b/percona/5.0.91-b22-20100522/innodb_recovery_patches.patch
new file mode 100644
index 0000000..3d3e567
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_recovery_patches.patch
@@ -0,0 +1,217 @@
+diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
+--- a/innobase/buf/buf0flu.c 2009-08-04 16:53:42.000000000 +0900
++++ b/innobase/buf/buf0flu.c 2009-08-04 17:02:36.000000000 +0900
+@@ -85,6 +85,22 @@
+ prev_b = NULL;
+ b = UT_LIST_GET_FIRST(buf_pool->flush_list);
+
++ if (srv_fast_recovery) {
++ /* speed hack */
++ if (b == NULL || (ut_dulint_cmp(b->oldest_modification,
++ block->oldest_modification) < 0)) {
++ UT_LIST_ADD_FIRST(flush_list, buf_pool->flush_list, block);
++ } else {
++ b = UT_LIST_GET_LAST(buf_pool->flush_list);
++ if (ut_dulint_cmp(b->oldest_modification,
++ block->oldest_modification) < 0) {
++ /* align oldest_modification not to sort */
++ block->oldest_modification = b->oldest_modification;
++ }
++ UT_LIST_ADD_LAST(flush_list, buf_pool->flush_list, block);
++ }
++ } else {
++ /* normal */
+ while (b && (ut_dulint_cmp(b->oldest_modification,
+ block->oldest_modification) > 0)) {
+ prev_b = b;
+@@ -97,6 +113,7 @@
+ UT_LIST_INSERT_AFTER(flush_list, buf_pool->flush_list, prev_b,
+ block);
+ }
++ }
+
+ ut_ad(buf_flush_validate_low());
+ }
+diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
+--- a/innobase/buf/buf0rea.c 2009-08-04 16:53:42.000000000 +0900
++++ b/innobase/buf/buf0rea.c 2009-08-04 17:11:41.000000000 +0900
+@@ -127,6 +127,46 @@
+ block = buf_page_init_for_read(err, mode, space, tablespace_version,
+ offset);
+ if (block == NULL) {
++ /* bugfix: http://bugs.mysql.com/bug.php?id=43948 */
++ if (recv_recovery_is_on() && *err == DB_TABLESPACE_DELETED) {
++ /* hashed log recs must be treated here */
++ recv_addr_t* recv_addr;
++
++ mutex_enter(&(recv_sys->mutex));
++
++ if (recv_sys->apply_log_recs == FALSE) {
++ mutex_exit(&(recv_sys->mutex));
++ goto not_to_recover;
++ }
++
++ /* recv_get_fil_addr_struct() */
++ recv_addr = HASH_GET_FIRST(recv_sys->addr_hash,
++ hash_calc_hash(ut_fold_ulint_pair(space, offset),
++ recv_sys->addr_hash));
++ while (recv_addr) {
++ if ((recv_addr->space == space)
++ && (recv_addr->page_no == offset)) {
++ break;
++ }
++ recv_addr = HASH_GET_NEXT(addr_hash, recv_addr);
++ }
++
++ if ((recv_addr == NULL)
++ || (recv_addr->state == RECV_BEING_PROCESSED)
++ || (recv_addr->state == RECV_PROCESSED)) {
++ mutex_exit(&(recv_sys->mutex));
++ goto not_to_recover;
++ }
++
++ fprintf(stderr, " (space:%lu is deleted)", space);
++ recv_addr->state = RECV_PROCESSED;
++
++ ut_a(recv_sys->n_addrs);
++ recv_sys->n_addrs--;
++
++ mutex_exit(&(recv_sys->mutex));
++ }
++not_to_recover:
+
+ return(0);
+ }
+@@ -697,11 +737,11 @@
+ while (buf_pool->n_pend_reads >= recv_n_pool_free_frames / 2) {
+
+ os_aio_simulated_wake_handler_threads();
+- os_thread_sleep(500000);
++ os_thread_sleep(10000);
+
+ count++;
+
+- if (count > 100) {
++ if (count > 5000) {
+ fprintf(stderr,
+ "InnoDB: Error: InnoDB has waited for 50 seconds for pending\n"
+ "InnoDB: reads to the buffer pool to be finished.\n"
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h 2009-08-04 16:53:42.000000000 +0900
++++ b/innobase/include/srv0srv.h 2009-08-04 17:39:51.000000000 +0900
+@@ -59,6 +59,8 @@
+ extern ibool srv_file_per_table;
+ extern ibool srv_locks_unsafe_for_binlog;
+
++extern ibool srv_fast_recovery;
++
+ extern ulint srv_n_data_files;
+ extern char** srv_data_file_names;
+ extern ulint* srv_data_file_sizes;
+diff -ruN a/innobase/log/log0recv.c b/innobase/log/log0recv.c
+--- a/innobase/log/log0recv.c 2009-07-07 21:54:08.000000000 +0900
++++ b/innobase/log/log0recv.c 2009-08-04 17:15:15.000000000 +0900
+@@ -101,7 +101,7 @@
+ use these free frames to read in pages when we start applying the
+ log records to the database. */
+
+-ulint recv_n_pool_free_frames = 256;
++ulint recv_n_pool_free_frames = 1024;
+
+ /* The maximum lsn we see for a page during the recovery process. If this
+ is bigger than the lsn we are able to scan up to, that is an indication that
+@@ -1135,6 +1135,8 @@
+ recv_addr = recv_get_fil_addr_struct(space, page_no);
+
+ if ((recv_addr == NULL)
++ /* bugfix: http://bugs.mysql.com/bug.php?id=44140 */
++ || (recv_addr->state == RECV_BEING_READ && !just_read_in)
+ || (recv_addr->state == RECV_BEING_PROCESSED)
+ || (recv_addr->state == RECV_PROCESSED)) {
+
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-08-04 16:53:42.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-08-04 17:41:05.000000000 +0900
+@@ -88,6 +88,8 @@
+ i.e. do not use next-key locking
+ except on duplicate key checking and
+ foreign key checking */
++ibool srv_fast_recovery = FALSE;
++
+ ulint srv_n_data_files = 0;
+ char** srv_data_file_names = NULL;
+ ulint* srv_data_file_sizes = NULL; /* size in database pages */
+diff -ruN a/patch_info/innodb_recovery_patches.info b/patch_info/innodb_recovery_patches.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_recovery_patches.info 2009-08-04 16:58:07.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_recovery_patches.patch
++Name=Bugfixes and adjustments about recovery process
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-08-04 16:53:42.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-08-04 17:35:44.000000000 +0900
+@@ -182,6 +182,7 @@
+ my_bool innobase_rollback_on_timeout = FALSE;
+ my_bool innobase_create_status_file = FALSE;
+ my_bool innobase_adaptive_hash_index = TRUE;
++my_bool innobase_fast_recovery = FALSE;
+
+ static char *internal_innobase_data_file_path = NULL;
+
+@@ -1534,6 +1535,8 @@
+ srv_lock_wait_timeout = (ulint) innobase_lock_wait_timeout;
+ srv_force_recovery = (ulint) innobase_force_recovery;
+
++ srv_fast_recovery = (ibool) innobase_fast_recovery;
++
+ srv_use_doublewrite_buf = (ibool) innobase_use_doublewrite;
+ srv_use_checksums = (ibool) innobase_use_checksums;
+
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h 2009-08-04 16:53:42.000000000 +0900
++++ b/sql/ha_innodb.h 2009-08-04 17:37:18.000000000 +0900
+@@ -220,6 +220,7 @@
+ innobase_use_large_pages,
+ innobase_use_native_aio,
+ innobase_file_per_table, innobase_locks_unsafe_for_binlog,
++ innobase_fast_recovery,
+ innobase_rollback_on_timeout,
+ innobase_create_status_file,
+ innobase_adaptive_hash_index;
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc 2009-08-04 16:53:42.000000000 +0900
++++ b/sql/mysqld.cc 2009-08-04 17:48:25.000000000 +0900
+@@ -5102,6 +5102,7 @@
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
+ OPT_INNODB_USE_SYS_MALLOC,
++ OPT_INNODB_FAST_RECOVERY,
+ OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED,
+ OPT_INNODB_EXTRA_RSEGMENTS,
+ OPT_INNODB_DICT_SIZE_LIMIT,
+@@ -5347,6 +5348,10 @@
+ {"innodb_doublewrite", OPT_INNODB_DOUBLEWRITE, "Enable InnoDB doublewrite buffer (enabled by default). \
+ Disable with --skip-innodb-doublewrite.", (gptr*) &innobase_use_doublewrite,
+ (gptr*) &innobase_use_doublewrite, 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
++ {"innodb_fast_recovery", OPT_INNODB_FAST_RECOVERY,
++ "Enable to use speed hack of recovery avoiding flush list sorting.",
++ (gptr*) &innobase_fast_recovery, (gptr*) &innobase_fast_recovery,
++ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"innodb_fast_shutdown", OPT_INNODB_FAST_SHUTDOWN,
+ "Speeds up the shutdown process of the InnoDB storage engine. Possible "
+ "values are 0, 1 (faster)"
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc 2009-08-04 16:53:42.000000000 +0900
++++ b/sql/set_var.cc 2009-08-04 17:51:49.000000000 +0900
+@@ -1088,6 +1088,7 @@
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
+ {"innodb_use_sys_malloc", (char*) &innobase_use_sys_malloc, SHOW_MY_BOOL},
++ {"innodb_fast_recovery", (char*) &innobase_fast_recovery, SHOW_MY_BOOL},
+ {"innodb_thread_concurrency_timer_based", (char*) &innobase_thread_concurrency_timer_based, SHOW_MY_BOOL},
+ {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG},
+ {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS},
diff --git a/percona/5.0.91-b22-20100522/innodb_rw_lock.patch b/percona/5.0.91-b22-20100522/innodb_rw_lock.patch
new file mode 100644
index 0000000..a509f70
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_rw_lock.patch
@@ -0,0 +1,2480 @@
+diff -ruN a/innobase/btr/btr0cur.c b/innobase/btr/btr0cur.c
+--- a/innobase/btr/btr0cur.c 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/btr/btr0cur.c 2009-10-22 15:18:44.000000000 +0900
+@@ -313,7 +313,7 @@
+ #ifdef UNIV_SEARCH_PERF_STAT
+ info->n_searches++;
+ #endif
+- if (btr_search_latch.writer == RW_LOCK_NOT_LOCKED
++ if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_NOT_LOCKED
+ && latch_mode <= BTR_MODIFY_LEAF && info->last_hash_succ
+ && !estimate
+ #ifdef PAGE_CUR_LE_OR_EXTENDS
+diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c
+--- a/innobase/btr/btr0sea.c 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/btr/btr0sea.c 2009-10-22 15:18:44.000000000 +0900
+@@ -773,8 +773,8 @@
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+- ut_ad(btr_search_latch.writer != RW_LOCK_EX);
+- ut_ad(btr_search_latch.reader_count > 0);
++ ut_ad(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_EX);
++ ut_ad(rw_lock_get_reader_count(&btr_search_latch) > 0);
+
+ rec = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+
+diff -ruN a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/buf/buf0buf.c 2009-10-22 15:18:44.000000000 +0900
+@@ -1292,7 +1292,7 @@
+
+ if (mode == BUF_GET_NOWAIT) {
+ if (rw_latch == RW_S_LATCH) {
+- success = rw_lock_s_lock_func_nowait(&(block->lock),
++ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+@@ -1442,7 +1442,7 @@
+ ut_ad(!ibuf_inside() || ibuf_page(block->space, block->offset));
+
+ if (rw_latch == RW_S_LATCH) {
+- success = rw_lock_s_lock_func_nowait(&(block->lock),
++ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+@@ -1596,7 +1596,7 @@
+ ut_ad(!ibuf_inside() || (mode == BUF_KEEP_OLD));
+
+ if (rw_latch == RW_S_LATCH) {
+- success = rw_lock_s_lock_func_nowait(&(block->lock),
++ success = rw_lock_s_lock_nowait(&(block->lock),
+ file, line);
+ fix_type = MTR_MEMO_PAGE_S_FIX;
+ } else {
+diff -ruN a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic
+--- a/innobase/include/buf0buf.ic 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/include/buf0buf.ic 2009-10-22 16:12:25.000000000 +0900
+@@ -523,7 +523,7 @@
+ #ifdef UNIV_SYNC_DEBUG
+ ibool ret;
+
+- ret = rw_lock_s_lock_func_nowait(&(block->debug_latch), file, line);
++ ret = rw_lock_s_lock_nowait(&(block->debug_latch), file, line);
+
+ ut_ad(ret == TRUE);
+ ut_ad(mutex_own(&block->mutex));
+diff -ruN a/innobase/include/os0sync.h b/innobase/include/os0sync.h
+--- a/innobase/include/os0sync.h 2009-09-10 04:02:59.000000000 +0900
++++ b/innobase/include/os0sync.h 2009-10-22 15:18:44.000000000 +0900
+@@ -1,11 +1,35 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ The interface to the operating system
+ synchronization primitives.
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/6/1995 Heikki Tuuri
+ *******************************************************/
++
+ #ifndef os0sync_h
+ #define os0sync_h
+
+@@ -261,6 +285,23 @@
+ /*===============*/
+ os_fast_mutex_t* fast_mutex); /* in: mutex to free */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++/**************************************************************
++Atomic compare-and-swap for InnoDB. Currently requires GCC atomic builtins.
++Returns true if swapped, ptr is pointer to target, old_val is value to
++compare to, new_val is the value to swap in. */
++#define os_compare_and_swap(ptr, old_val, new_val) \
++ __sync_bool_compare_and_swap(ptr, old_val, new_val)
++
++/**************************************************************
++Atomic increment for InnoDB. Currently requires GCC atomic builtins.
++Returns the resulting value, ptr is pointer to target, amount is the
++amount of increment. */
++#define os_atomic_increment(ptr, amount) \
++ __sync_add_and_fetch(ptr, amount)
++
++#endif /* HAVE_ATOMIC_BUILTINS */
++
+ #ifndef UNIV_NONINL
+ #include "os0sync.ic"
+ #endif
+diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h
+--- a/innobase/include/sync0rw.h 2009-09-10 04:02:59.000000000 +0900
++++ b/innobase/include/sync0rw.h 2009-10-22 15:18:44.000000000 +0900
+@@ -1,8 +1,31 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ The read-write lock (for threads, not for database transactions)
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/11/1995 Heikki Tuuri
+ *******************************************************/
+
+@@ -24,6 +47,12 @@
+ #define RW_X_LATCH 2
+ #define RW_NO_LATCH 3
+
++/* We decrement lock_word by this amount for each x_lock. It is also the
++start value for the lock_word, meaning that it limits the maximum number
++of concurrent read locks before the rw_lock breaks. The current value of
++0x00100000 allows 1,048,575 concurrent readers and 2047 recursive writers.*/
++#define X_LOCK_DECR 0x00100000
++
+ typedef struct rw_lock_struct rw_lock_t;
+ #ifdef UNIV_SYNC_DEBUG
+ typedef struct rw_lock_debug_struct rw_lock_debug_t;
+@@ -47,14 +76,14 @@
+ there may be waiters for the event */
+ #endif /* UNIV_SYNC_DEBUG */
+
+-extern ulint rw_s_system_call_count;
+-extern ulint rw_s_spin_wait_count;
+-extern ulint rw_s_exit_count;
+-extern ulint rw_s_os_wait_count;
+-extern ulint rw_x_system_call_count;
+-extern ulint rw_x_spin_wait_count;
+-extern ulint rw_x_os_wait_count;
+-extern ulint rw_x_exit_count;
++extern ib_longlong rw_s_spin_wait_count;
++extern ib_longlong rw_s_spin_round_count;
++extern ib_longlong rw_s_exit_count;
++extern ib_longlong rw_s_os_wait_count;
++extern ib_longlong rw_x_spin_wait_count;
++extern ib_longlong rw_x_spin_round_count;
++extern ib_longlong rw_x_os_wait_count;
++extern ib_longlong rw_x_exit_count;
+
+ /**********************************************************************
+ Creates, or rather, initializes an rw-lock object in a specified memory
+@@ -116,8 +145,22 @@
+ NOTE! The following macros should be used in rw s-locking, not the
+ corresponding function. */
+
+-#define rw_lock_s_lock_nowait(M) rw_lock_s_lock_func_nowait(\
+- (M), __FILE__, __LINE__)
++#define rw_lock_s_lock_nowait(M, F, L) rw_lock_s_lock_low(\
++ (M), 0, (F), (L))
++/**********************************************************************
++Low-level function which tries to lock an rw-lock in s-mode. Performs no
++spinning. */
++UNIV_INLINE
++ibool
++rw_lock_s_lock_low(
++/*===============*/
++ /* out: TRUE if success */
++ rw_lock_t* lock, /* in: pointer to rw-lock */
++ ulint pass __attribute__((unused)),
++ /* in: pass value; != 0, if the lock will be
++ passed to another thread to unlock */
++ const char* file_name, /* in: file name where lock requested */
++ ulint line); /* in: line where requested */
+ /**********************************************************************
+ NOTE! Use the corresponding macro, not directly this function, except if
+ you supply the file name and line number. Lock an rw-lock in shared mode
+@@ -135,18 +178,6 @@
+ const char* file_name,/* in: file name where lock requested */
+ ulint line); /* in: line where requested */
+ /**********************************************************************
+-NOTE! Use the corresponding macro, not directly this function, except if
+-you supply the file name and line number. Lock an rw-lock in shared mode
+-for the current thread if the lock can be acquired immediately. */
+-UNIV_INLINE
+-ibool
+-rw_lock_s_lock_func_nowait(
+-/*=======================*/
+- /* out: TRUE if success */
+- rw_lock_t* lock, /* in: pointer to rw-lock */
+- const char* file_name,/* in: file name where lock requested */
+- ulint line); /* in: line where requested */
+-/**********************************************************************
+ NOTE! Use the corresponding macro, not directly this function! Lock an
+ rw-lock in exclusive mode for the current thread if the lock can be
+ obtained immediately. */
+@@ -338,6 +369,41 @@
+ rw_lock_get_reader_count(
+ /*=====================*/
+ rw_lock_t* lock);
++/**********************************************************************
++Decrements lock_word the specified amount if it is greater than 0.
++This is used by both s_lock and x_lock operations. */
++UNIV_INLINE
++ibool
++rw_lock_lock_word_decr(
++/*===================*/
++ /* out: TRUE if decr occurs */
++ rw_lock_t* lock, /* in: rw-lock */
++ ulint amount); /* in: amount to decrement */
++/**********************************************************************
++Increments lock_word the specified amount and returns new value. */
++UNIV_INLINE
++lint
++rw_lock_lock_word_incr(
++/*===================*/
++ /* out: lock->lock_word after increment */
++ rw_lock_t* lock, /* in: rw-lock */
++ ulint amount); /* in: amount of increment */
++/**********************************************************************
++This function sets the lock->writer_thread and lock->recursive fields.
++For platforms where we are using atomic builtins instead of lock->mutex
++it sets the lock->writer_thread field using atomics to ensure memory
++ordering. Note that it is assumed that the caller of this function
++effectively owns the lock i.e.: nobody else is allowed to modify
++lock->writer_thread at this point in time.
++The protocol is that lock->writer_thread MUST be updated BEFORE the
++lock->recursive flag is set. */
++UNIV_INLINE
++void
++rw_lock_set_writer_id_and_recursion_flag(
++/*=====================================*/
++ rw_lock_t* lock, /* in/out: lock to work on */
++ ibool recursive); /* in: TRUE if recursion
++ allowed */
+ #ifdef UNIV_SYNC_DEBUG
+ /**********************************************************************
+ Checks if the thread has locked the rw-lock in the specified mode, with
+@@ -417,47 +483,33 @@
+ field. Then no new readers are allowed in. */
+
+ struct rw_lock_struct {
++ volatile lint lock_word;
++ /* Holds the state of the lock. */
++ volatile ulint waiters;/* 1: there are waiters */
++ volatile ibool recursive;/* Default value FALSE which means the lock
++ is non-recursive. The value is typically set
++ to TRUE making normal rw_locks recursive. In
++ case of asynchronous IO, when a non-zero
++ value of 'pass' is passed then we keep the
++ lock non-recursive.
++ This flag also tells us about the state of
++ writer_thread field. If this flag is set
++ then writer_thread MUST contain the thread
++ id of the current x-holder or wait-x thread.
++ This flag must be reset in x_unlock
++ functions before incrementing the lock_word */
++ volatile os_thread_id_t writer_thread;
++ /* Thread id of writer thread. Is only
++ guaranteed to have sane and non-stale
++ value iff recursive flag is set. */
+ os_event_t event; /* Used by sync0arr.c for thread queueing */
+-
+-#ifdef __WIN__
+- os_event_t wait_ex_event; /* This windows specific event is
+- used by the thread which has set the
+- lock state to RW_LOCK_WAIT_EX. The
+- rw_lock design guarantees that this
+- thread will be the next one to proceed
+- once the current the event gets
+- signalled. See LEMMA 2 in sync0sync.c */
+-#endif
+-
+- ulint reader_count; /* Number of readers who have locked this
+- lock in the shared mode */
+- ulint writer; /* This field is set to RW_LOCK_EX if there
+- is a writer owning the lock (in exclusive
+- mode), RW_LOCK_WAIT_EX if a writer is
+- queueing for the lock, and
+- RW_LOCK_NOT_LOCKED, otherwise. */
+- os_thread_id_t writer_thread;
+- /* Thread id of a possible writer thread */
+- ulint writer_count; /* Number of times the same thread has
+- recursively locked the lock in the exclusive
+- mode */
++ os_event_t wait_ex_event;
++ /* Event for next-writer to wait on. A thread
++ must decrement lock_word before waiting. */
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_t mutex; /* The mutex protecting rw_lock_struct */
+- ulint pass; /* Default value 0. This is set to some
+- value != 0 given by the caller of an x-lock
+- operation, if the x-lock is to be passed to
+- another thread to unlock (which happens in
+- asynchronous i/o). */
+- ulint waiters; /* This ulint is set to 1 if there are
+- waiters (readers or writers) in the global
+- wait array, waiting for this rw_lock.
+- Otherwise, == 0. */
+- ibool writer_is_wait_ex;
+- /* This is TRUE if the writer field is
+- RW_LOCK_WAIT_EX; this field is located far
+- from the memory update hotspot fields which
+- are at the start of this struct, thus we can
+- peek this field without causing much memory
+- bus traffic */
++#endif /* HAVE_ATOMIC_BUILTINS */
++
+ UT_LIST_NODE_T(rw_lock_t) list;
+ /* All allocated rw locks are put into a
+ list */
+@@ -465,15 +517,23 @@
+ UT_LIST_BASE_NODE_T(rw_lock_debug_t) debug_list;
+ /* In the debug version: pointer to the debug
+ info list of the lock */
++ ulint level; /* Level in the global latching order. */
+ #endif /* UNIV_SYNC_DEBUG */
+- ulint level; /* Level in the global latching
+- order; default SYNC_LEVEL_NONE */
++ ulint count_os_wait; /* Count of os_waits. May not be accurate */
+ const char* cfile_name;/* File name where lock created */
+- ulint cline; /* Line where created */
++ /* last s-lock file/line is not guaranteed to be correct */
+ const char* last_s_file_name;/* File name where last s-locked */
+ const char* last_x_file_name;/* File name where last x-locked */
+- ulint last_s_line; /* Line number where last time s-locked */
+- ulint last_x_line; /* Line number where last time x-locked */
++ ibool writer_is_wait_ex;
++ /* This is TRUE if the writer field is
++ RW_LOCK_WAIT_EX; this field is located far
++ from the memory update hotspot fields which
++ are at the start of this struct, thus we can
++ peek this field without causing much memory
++ bus traffic */
++ unsigned cline:14; /* Line where created */
++ unsigned last_s_line:14; /* Line number where last time s-locked */
++ unsigned last_x_line:14; /* Line number where last time x-locked */
+ ulint magic_n;
+ };
+
+diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic
+--- a/innobase/include/sync0rw.ic 2009-09-10 04:02:59.000000000 +0900
++++ b/innobase/include/sync0rw.ic 2009-10-22 15:18:44.000000000 +0900
+@@ -1,8 +1,31 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ The read-write lock (for threads)
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/11/1995 Heikki Tuuri
+ *******************************************************/
+
+@@ -49,53 +72,88 @@
+ ulint
+ rw_lock_get_waiters(
+ /*================*/
+- rw_lock_t* lock)
++ /* out: 1 if waiters, 0 otherwise */
++ rw_lock_t* lock) /* in: rw-lock */
+ {
+ return(lock->waiters);
+ }
++
++/************************************************************************
++Sets lock->waiters to 1. It is not an error if lock->waiters is already
++1. On platforms where ATOMIC builtins are used this function enforces a
++memory barrier. */
+ UNIV_INLINE
+ void
+-rw_lock_set_waiters(
+-/*================*/
+- rw_lock_t* lock,
+- ulint flag)
++rw_lock_set_waiter_flag(
++/*====================*/
++ rw_lock_t* lock) /* in: rw-lock */
+ {
+- lock->waiters = flag;
++#ifdef HAVE_ATOMIC_BUILTINS
++ os_compare_and_swap(&lock->waiters, 0, 1);
++#else /* HAVE_ATOMIC_BUILTINS */
++ lock->waiters = 1;
++#endif /* HAVE_ATOMIC_BUILTINS */
+ }
++
++/************************************************************************
++Resets lock->waiters to 0. It is not an error if lock->waiters is already
++0. On platforms where ATOMIC builtins are used this function enforces a
++memory barrier. */
+ UNIV_INLINE
+-ulint
+-rw_lock_get_writer(
+-/*===============*/
+- rw_lock_t* lock)
++void
++rw_lock_reset_waiter_flag(
++/*======================*/
++ rw_lock_t* lock) /* in: rw-lock */
+ {
+- return(lock->writer);
++#ifdef HAVE_ATOMIC_BUILTINS
++ os_compare_and_swap(&lock->waiters, 1, 0);
++#else /* HAVE_ATOMIC_BUILTINS */
++ lock->waiters = 0;
++#endif /* HAVE_ATOMIC_BUILTINS */
+ }
++
++/**********************************************************************
++Returns the write-status of the lock - this function made more sense
++with the old rw_lock implementation. */
+ UNIV_INLINE
+-void
+-rw_lock_set_writer(
++ulint
++rw_lock_get_writer(
+ /*===============*/
+- rw_lock_t* lock,
+- ulint flag)
++ rw_lock_t* lock)
+ {
+- lock->writer = flag;
++ lint lock_word = lock->lock_word;
++ if(lock_word > 0) {
++ /* return NOT_LOCKED in s-lock state, like the writer
++ member of the old lock implementation. */
++ return(RW_LOCK_NOT_LOCKED);
++ } else if (((-lock_word) % X_LOCK_DECR) == 0) {
++ return(RW_LOCK_EX);
++ } else {
++ ut_ad(lock_word > -X_LOCK_DECR);
++ return(RW_LOCK_WAIT_EX);
++ }
+ }
++
++/**********************************************************************
++Returns number of readers. */
+ UNIV_INLINE
+ ulint
+ rw_lock_get_reader_count(
+ /*=====================*/
+ rw_lock_t* lock)
+ {
+- return(lock->reader_count);
+-}
+-UNIV_INLINE
+-void
+-rw_lock_set_reader_count(
+-/*=====================*/
+- rw_lock_t* lock,
+- ulint count)
+-{
+- lock->reader_count = count;
++ lint lock_word = lock->lock_word;
++ if(lock_word > 0) {
++ /* s-locked, no x-waiters */
++ return(X_LOCK_DECR - lock_word);
++ } else if (lock_word < 0 && lock_word > -X_LOCK_DECR) {
++ /* s-locked, with x-waiters */
++ return((ulint)(-lock_word));
++ }
++ return(0);
+ }
++
++#ifndef HAVE_ATOMIC_BUILTINS
+ UNIV_INLINE
+ mutex_t*
+ rw_lock_get_mutex(
+@@ -104,6 +162,7 @@
+ {
+ return(&(lock->mutex));
+ }
++#endif
+
+ /**********************************************************************
+ Returns the value of writer_count for the lock. Does not reserve the lock
+@@ -115,7 +174,126 @@
+ /* out: value of writer_count */
+ rw_lock_t* lock) /* in: rw-lock */
+ {
+- return(lock->writer_count);
++ lint lock_copy = lock->lock_word;
++ /* If there is a reader, lock_word is not divisible by X_LOCK_DECR */
++ if(lock_copy > 0 || (-lock_copy) % X_LOCK_DECR != 0) {
++ return(0);
++ }
++ return(((-lock_copy) / X_LOCK_DECR) + 1);
++}
++
++/**********************************************************************
++Two different implementations for decrementing the lock_word of a rw_lock:
++one for systems supporting atomic operations, one for others. This does
++not support recursive x-locks: they should be handled by the caller and
++need not be atomic since they are performed by the current lock holder.
++Returns true if the decrement was made, false if not. */
++UNIV_INLINE
++ibool
++rw_lock_lock_word_decr(
++/*===================*/
++ /* out: TRUE if decr occurs */
++ rw_lock_t* lock, /* in: rw-lock */
++ ulint amount) /* in: amount of decrement */
++{
++
++#ifdef HAVE_ATOMIC_BUILTINS
++
++ lint local_lock_word = lock->lock_word;
++ while (local_lock_word > 0) {
++ if(os_compare_and_swap(&(lock->lock_word),
++ local_lock_word,
++ local_lock_word - amount)) {
++ return(TRUE);
++ }
++ local_lock_word = lock->lock_word;
++ }
++ return(FALSE);
++
++#else /* HAVE_ATOMIC_BUILTINS */
++
++ ibool success = FALSE;
++ mutex_enter(&(lock->mutex));
++ if(lock->lock_word > 0) {
++ lock->lock_word -= amount;
++ success = TRUE;
++ }
++ mutex_exit(&(lock->mutex));
++ return(success);
++
++#endif /* HAVE_ATOMIC_BUILTINS */
++}
++
++/**********************************************************************
++Two different implementations for incrementing the lock_word of a rw_lock:
++one for systems supporting atomic operations, one for others.
++Returns the value of lock_word after increment. */
++UNIV_INLINE
++lint
++rw_lock_lock_word_incr(
++/*===================*/
++ /* out: lock->lock_word after increment */
++ rw_lock_t* lock, /* in: rw-lock */
++ ulint amount) /* in: amount of increment */
++{
++
++#ifdef HAVE_ATOMIC_BUILTINS
++
++ return(os_atomic_increment(&(lock->lock_word), amount));
++
++#else /* HAVE_ATOMIC_BUILTINS */
++
++ lint local_lock_word;
++
++ mutex_enter(&(lock->mutex));
++
++ lock->lock_word += amount;
++ local_lock_word = lock->lock_word;
++
++ mutex_exit(&(lock->mutex));
++
++ return(local_lock_word);
++
++#endif /* HAVE_ATOMIC_BUILTINS */
++}
++
++/**********************************************************************
++This function sets the lock->writer_thread and lock->recursive fields.
++For platforms where we are using atomic builtins instead of lock->mutex
++it sets the lock->writer_thread field using atomics to ensure memory
++ordering. Note that it is assumed that the caller of this function
++effectively owns the lock i.e.: nobody else is allowed to modify
++lock->writer_thread at this point in time.
++The protocol is that lock->writer_thread MUST be updated BEFORE the
++lock->recursive flag is set. */
++UNIV_INLINE
++void
++rw_lock_set_writer_id_and_recursion_flag(
++/*=====================================*/
++ rw_lock_t* lock, /* in/out: lock to work on */
++ ibool recursive) /* in: TRUE if recursion
++ allowed */
++{
++ os_thread_id_t curr_thread = os_thread_get_curr_id();
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ os_thread_id_t local_thread;
++ ibool success;
++
++ local_thread = lock->writer_thread;
++ success = os_compare_and_swap(&lock->writer_thread,
++ local_thread, curr_thread);
++ ut_a(success);
++ lock->recursive = recursive;
++
++#else /* HAVE_ATOMIC_BUILTINS */
++
++ mutex_enter(&lock->mutex);
++ lock->writer_thread = curr_thread;
++ lock->recursive = recursive;
++ mutex_exit(&lock->mutex);
++
++#endif /* HAVE_ATOMIC_BUILTINS */
+ }
+
+ /**********************************************************************
+@@ -133,26 +311,21 @@
+ const char* file_name, /* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
+-#ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+-#endif /* UNIV_SYNC_DEBUG */
+- /* Check if the writer field is free */
+-
+- if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) {
+- /* Set the shared lock by incrementing the reader count */
+- lock->reader_count++;
++ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */
++ if (!rw_lock_lock_word_decr(lock, 1)) {
++ /* Locking did not succeed */
++ return(FALSE);
++ }
+
+ #ifdef UNIV_SYNC_DEBUG
+- rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name,
+- line);
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name, line);
+ #endif
+- lock->last_s_file_name = file_name;
+- lock->last_s_line = line;
+-
+- return(TRUE); /* locking succeeded */
+- }
++ /* These debugging values are not set safely: they may be incorrect
++ or even refer to a line that is invalid for the file name. */
++ lock->last_s_file_name = file_name;
++ lock->last_s_line = line;
+
+- return(FALSE); /* locking did not succeed */
++ return(TRUE); /* locking succeeded */
+ }
+
+ /**********************************************************************
+@@ -167,11 +340,10 @@
+ const char* file_name, /* in: file name where requested */
+ ulint line) /* in: line where lock requested */
+ {
+- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED);
+- ut_ad(rw_lock_get_reader_count(lock) == 0);
++ ut_ad(lock->lock_word == X_LOCK_DECR);
+
+- /* Set the shared lock by incrementing the reader count */
+- lock->reader_count++;
++ /* Indicate there is a new reader by decrementing lock_word */
++ lock->lock_word--;
+
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+@@ -194,13 +366,11 @@
+ ulint line) /* in: line where lock requested */
+ {
+ ut_ad(rw_lock_validate(lock));
+- ut_ad(rw_lock_get_reader_count(lock) == 0);
+- ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
++ ut_ad(lock->lock_word == X_LOCK_DECR);
+
+- rw_lock_set_writer(lock, RW_LOCK_EX);
++ lock->lock_word -= X_LOCK_DECR;
+ lock->writer_thread = os_thread_get_curr_id();
+- lock->writer_count++;
+- lock->pass = 0;
++ lock->recursive = TRUE;
+
+ lock->last_x_file_name = file_name;
+ lock->last_x_line = line;
+@@ -241,15 +411,12 @@
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+ #endif /* UNIV_SYNC_DEBUG */
+
+- mutex_enter(rw_lock_get_mutex(lock));
+-
+- if (UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) {
+- mutex_exit(rw_lock_get_mutex(lock));
++ /* TODO: study performance of UNIV_LIKELY branch prediction hints. */
++ if (rw_lock_s_lock_low(lock, pass, file_name, line)) {
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
+- mutex_exit(rw_lock_get_mutex(lock));
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+@@ -259,86 +426,60 @@
+
+ /**********************************************************************
+ NOTE! Use the corresponding macro, not directly this function! Lock an
+-rw-lock in shared mode for the current thread if the lock can be acquired
+-immediately. */
++rw-lock in exclusive mode for the current thread if the lock can be
++obtained immediately. */
+ UNIV_INLINE
+ ibool
+-rw_lock_s_lock_func_nowait(
++rw_lock_x_lock_func_nowait(
+ /*=======================*/
+ /* out: TRUE if success */
+ rw_lock_t* lock, /* in: pointer to rw-lock */
+ const char* file_name,/* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
+- ibool success = FALSE;
+-
+- mutex_enter(rw_lock_get_mutex(lock));
+-
+- if (lock->writer == RW_LOCK_NOT_LOCKED) {
+- /* Set the shared lock by incrementing the reader count */
+- lock->reader_count++;
++ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+-#ifdef UNIV_SYNC_DEBUG
+- rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name,
+- line);
+-#endif
++ ibool success;
+
+- lock->last_s_file_name = file_name;
+- lock->last_s_line = line;
++#ifdef HAVE_ATOMIC_BUILTINS
++ success = os_compare_and_swap(&(lock->lock_word), X_LOCK_DECR, 0);
++#else
+
++ success = FALSE;
++ mutex_enter(&(lock->mutex));
++ if (lock->lock_word == X_LOCK_DECR) {
++ lock->lock_word = 0;
+ success = TRUE;
+ }
++ mutex_exit(&(lock->mutex));
+
+- mutex_exit(rw_lock_get_mutex(lock));
+-
+- return(success);
+-}
++#endif
++ if (success) {
++ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+
+-/**********************************************************************
+-NOTE! Use the corresponding macro, not directly this function! Lock an
+-rw-lock in exclusive mode for the current thread if the lock can be
+-obtained immediately. */
+-UNIV_INLINE
+-ibool
+-rw_lock_x_lock_func_nowait(
+-/*=======================*/
+- /* out: TRUE if success */
+- rw_lock_t* lock, /* in: pointer to rw-lock */
+- const char* file_name,/* in: file name where lock requested */
+- ulint line) /* in: line where requested */
+-{
+- ibool success = FALSE;
+- os_thread_id_t curr_thread = os_thread_get_curr_id();
+- mutex_enter(rw_lock_get_mutex(lock));
++ } else if (lock->recursive
++ && os_thread_eq(lock->writer_thread, curr_thread)) {
++ /* Relock: this lock_word modification is safe since no other
++ threads can modify (lock, unlock, or reserve) lock_word while
++ there is an exclusive writer and this is the writer thread. */
++ lock->lock_word -= X_LOCK_DECR;
+
+- if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) {
+- } else if (UNIV_LIKELY(rw_lock_get_writer(lock)
+- == RW_LOCK_NOT_LOCKED)) {
+- rw_lock_set_writer(lock, RW_LOCK_EX);
+- lock->writer_thread = curr_thread;
+- lock->pass = 0;
+- relock:
+- lock->writer_count++;
++ ut_ad(((-lock->lock_word) % X_LOCK_DECR) == 0);
+
++ } else {
++ /* Failure */
++ return(FALSE);
++ }
+ #ifdef UNIV_SYNC_DEBUG
+- rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
+ #endif
+
+- lock->last_x_file_name = file_name;
+- lock->last_x_line = line;
+-
+- success = TRUE;
+- } else if (rw_lock_get_writer(lock) == RW_LOCK_EX
+- && lock->pass == 0
+- && os_thread_eq(lock->writer_thread, curr_thread)) {
+- goto relock;
+- }
+-
+- mutex_exit(rw_lock_get_mutex(lock));
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
+
+ ut_ad(rw_lock_validate(lock));
+
+- return(success);
++ return(TRUE);
+ }
+
+ /**********************************************************************
+@@ -354,39 +495,21 @@
+ #endif
+ )
+ {
+- mutex_t* mutex = &(lock->mutex);
+- ibool sg = FALSE;
+-
+- /* Acquire the mutex protecting the rw-lock fields */
+- mutex_enter(mutex);
+-
+- /* Reset the shared lock by decrementing the reader count */
+-
+- ut_a(lock->reader_count > 0);
+- lock->reader_count--;
++ ut_ad((lock->lock_word % X_LOCK_DECR) != 0);
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+ #endif
+
+- /* If there may be waiters and this was the last s-lock,
+- signal the object */
++ /* Increment lock_word to indicate 1 less reader */
++ if (rw_lock_lock_word_incr(lock, 1) == 0) {
+
+- if (UNIV_UNLIKELY(lock->waiters)
+- && lock->reader_count == 0) {
+- sg = TRUE;
+-
+- rw_lock_set_waiters(lock, 0);
+- }
+-
+- mutex_exit(mutex);
+-
+- if (UNIV_UNLIKELY(sg)) {
+-#ifdef __WIN__
++ /* wait_ex waiter exists. It may not be asleep, but we signal
++ anyway. We do not wake other waiters, because they can't
++ exist without wait_ex waiter and wait_ex waiter goes first.*/
+ os_event_set(lock->wait_ex_event);
+-#endif
+- os_event_set(lock->event);
+ sync_array_object_signalled(sync_primary_wait_array);
++
+ }
+
+ ut_ad(rw_lock_validate(lock));
+@@ -405,16 +528,15 @@
+ /*====================*/
+ rw_lock_t* lock) /* in: rw-lock */
+ {
+- /* Reset the shared lock by decrementing the reader count */
+-
+- ut_ad(lock->reader_count > 0);
+-
+- lock->reader_count--;
++ ut_ad(lock->lock_word < X_LOCK_DECR);
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+ #endif
+
++ /* Decrease reader count by incrementing lock_word */
++ lock->lock_word++;
++
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+ #ifdef UNIV_SYNC_PERF_STAT
+@@ -435,42 +557,32 @@
+ #endif
+ )
+ {
+- ibool sg = FALSE;
+-
+- /* Acquire the mutex protecting the rw-lock fields */
+- mutex_enter(&(lock->mutex));
+-
+- /* Reset the exclusive lock if this thread no longer has an x-mode
+- lock */
+-
+- ut_ad(lock->writer_count > 0);
++ ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+- lock->writer_count--;
+-
+- if (lock->writer_count == 0) {
+- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
++ /* lock->recursive flag also indicates if lock->writer_thread is
++ valid or stale. If we are the last of the recursive callers
++ then we must unset lock->recursive flag to indicate that the
++ lock->writer_thread is now stale.
++ Note that since we still hold the x-lock we can safely read the
++ lock_word. */
++ if (lock->lock_word == 0) {
++ /* Last caller in a possible recursive chain. */
++ lock->recursive = FALSE;
+ }
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+ #endif
+
+- /* If there may be waiters, signal the lock */
+- if (UNIV_UNLIKELY(lock->waiters)
+- && lock->writer_count == 0) {
+-
+- sg = TRUE;
+- rw_lock_set_waiters(lock, 0);
+- }
+-
+- mutex_exit(&(lock->mutex));
+-
+- if (UNIV_UNLIKELY(sg)) {
+-#ifdef __WIN__
+- os_event_set(lock->wait_ex_event);
+-#endif
+- os_event_set(lock->event);
+- sync_array_object_signalled(sync_primary_wait_array);
++ if (rw_lock_lock_word_incr(lock, X_LOCK_DECR) == X_LOCK_DECR) {
++ /* Lock is now free. May have to signal read/write waiters.
++ We do not need to signal wait_ex waiters, since they cannot
++ exist when there is a writer. */
++ if (lock->waiters) {
++ rw_lock_reset_waiter_flag(lock);
++ os_event_set(lock->event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
+ }
+
+ ut_ad(rw_lock_validate(lock));
+@@ -492,18 +604,18 @@
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+- ut_ad(lock->writer_count > 0);
+-
+- lock->writer_count--;
+-
+- if (lock->writer_count == 0) {
+- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+- }
++ ut_ad((lock->lock_word % X_LOCK_DECR) == 0);
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+ #endif
+
++ if (lock->lock_word == 0) {
++ lock->recursive = FALSE;
++ }
++
++ lock->lock_word += X_LOCK_DECR;
++
+ ut_ad(!lock->waiters);
+ ut_ad(rw_lock_validate(lock));
+
+diff -ruN a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
+--- a/innobase/include/sync0sync.h 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/include/sync0sync.h 2009-10-22 15:18:44.000000000 +0900
+@@ -1,8 +1,31 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ Mutex, the basic synchronization primitive
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/5/1995 Heikki Tuuri
+ *******************************************************/
+
+@@ -465,8 +488,11 @@
+ struct mutex_struct {
+ os_event_t event; /* Used by sync0arr.c for the wait queue */
+ ulint lock_word; /* This ulint is the target of the atomic
+- test-and-set instruction in Win32 */
+-#if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER)
++ test-and-set instruction in Win32 and
++ x86 32/64 with GCC 4.1.0 or later version */
++#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER)
++#elif defined(HAVE_ATOMIC_BUILTINS)
++#else
+ os_fast_mutex_t
+ os_fast_mutex; /* In other systems we use this OS mutex
+ in place of lock_word */
+@@ -525,8 +551,7 @@
+ /* The number of system calls made in this module. Intended for performance
+ monitoring. */
+
+-extern ulint mutex_system_call_count;
+-extern ulint mutex_exit_count;
++extern ib_longlong mutex_exit_count;
+
+ /* Latching order checks start when this is set TRUE */
+ extern ibool sync_order_checks_on;
+diff -ruN a/innobase/include/sync0sync.ic b/innobase/include/sync0sync.ic
+--- a/innobase/include/sync0sync.ic 2009-09-10 04:02:59.000000000 +0900
++++ b/innobase/include/sync0sync.ic 2009-10-22 15:18:44.000000000 +0900
+@@ -1,21 +1,34 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ Mutex, the basic synchronization primitive
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/5/1995 Heikki Tuuri
+ *******************************************************/
+
+-#if defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86)
+-/* %z0: Use the size of operand %0 which in our case is *m to determine
+-instruction size, it should end up as xchgl. "1" in the input constraint,
+-says that "in" has to go in the same place as "out".*/
+-#define TAS(m, in, out) \
+- asm volatile ("xchg%z0 %2, %0" \
+- : "=g" (*(m)), "=r" (out) \
+- : "1" (in)) /* Note: "1" here refers to "=r" (out) */
+-#endif
+-
+ /**********************************************************************
+ Sets the waiters field in a mutex. */
+
+@@ -94,12 +107,8 @@
+ /* mutex_fence(); */
+
+ return(res);
+-#elif defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86)
+- ulint res;
+-
+- TAS(&mutex->lock_word, 1, res);
+-
+- return(res);
++#elif defined(HAVE_ATOMIC_BUILTINS)
++ return __sync_lock_test_and_set(&(mutex->lock_word), 1);
+ #else
+ ibool ret;
+
+@@ -136,10 +145,11 @@
+ __asm MOV EDX, 0
+ __asm MOV ECX, lw
+ __asm XCHG EDX, DWORD PTR [ECX]
+-#elif defined(not_defined) && defined(__GNUC__) && defined(UNIV_INTEL_X86)
+- ulint res;
+-
+- TAS(&mutex->lock_word, 0, res);
++#elif defined(HAVE_ATOMIC_BUILTINS)
++ /* In theory __sync_lock_release should be used to release the lock.
++ Unfortunately, it does not work properly alone. The workaround is
++ that more conservative __sync_lock_test_and_set is used instead. */
++ __sync_lock_test_and_set(&(mutex->lock_word), 0);
+ #else
+ mutex->lock_word = 0;
+
+diff -ruN a/innobase/row/row0sel.c b/innobase/row/row0sel.c
+--- a/innobase/row/row0sel.c 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/row/row0sel.c 2009-10-22 15:18:44.000000000 +0900
+@@ -1178,7 +1178,7 @@
+ rw_lock_s_lock(&btr_search_latch);
+
+ search_latch_locked = TRUE;
+- } else if (btr_search_latch.writer_is_wait_ex) {
++ } else if (rw_lock_get_writer(&btr_search_latch) == RW_LOCK_WAIT_EX) {
+
+ /* There is an x-latch request waiting: release the
+ s-latch for a moment; as an s-latch here is often
+@@ -3123,7 +3123,7 @@
+ /* PHASE 0: Release a possible s-latch we are holding on the
+ adaptive hash index latch if there is someone waiting behind */
+
+- if (UNIV_UNLIKELY(btr_search_latch.writer != RW_LOCK_NOT_LOCKED)
++ if (UNIV_UNLIKELY(rw_lock_get_writer(&btr_search_latch) != RW_LOCK_NOT_LOCKED)
+ && trx->has_search_latch) {
+
+ /* There is an x-latch request on the adaptive hash index:
+diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c
+--- a/innobase/sync/sync0arr.c 2009-09-10 04:03:01.000000000 +0900
++++ b/innobase/sync/sync0arr.c 2009-10-22 15:18:44.000000000 +0900
+@@ -1,8 +1,31 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ The wait array used in synchronization primitives
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/5/1995 Heikki Tuuri
+ *******************************************************/
+
+@@ -297,25 +320,21 @@
+ }
+
+ /***********************************************************************
+-Puts the cell event in reset state. */
++Returns the event that the thread owning the cell waits for. */
+ static
+-ib_longlong
+-sync_cell_event_reset(
+-/*==================*/
+- /* out: value of signal_count
+- at the time of reset. */
+- ulint type, /* in: lock type mutex/rw_lock */
+- void* object) /* in: the rw_lock/mutex object */
++os_event_t
++sync_cell_get_event(
++/*================*/
++ sync_cell_t* cell) /* in: non-empty sync array cell */
+ {
++ ulint type = cell->request_type;
++
+ if (type == SYNC_MUTEX) {
+- return(os_event_reset(((mutex_t *) object)->event));
+-#ifdef __WIN__
++ return(((mutex_t *) cell->wait_object)->event);
+ } else if (type == RW_LOCK_WAIT_EX) {
+- return(os_event_reset(
+- ((rw_lock_t *) object)->wait_ex_event));
+-#endif
+- } else {
+- return(os_event_reset(((rw_lock_t *) object)->event));
++ return(((rw_lock_t *) cell->wait_object)->wait_ex_event);
++ } else { /* RW_LOCK_SHARED and RW_LOCK_EX wait on the same event */
++ return(((rw_lock_t *) cell->wait_object)->event);
+ }
+ }
+
+@@ -334,6 +353,7 @@
+ ulint* index) /* out: index of the reserved cell */
+ {
+ sync_cell_t* cell;
++ os_event_t event;
+ ulint i;
+
+ ut_a(object);
+@@ -372,8 +392,8 @@
+ /* Make sure the event is reset and also store
+ the value of signal_count at which the event
+ was reset. */
+- cell->signal_count = sync_cell_event_reset(type,
+- object);
++ event = sync_cell_get_event(cell);
++ cell->signal_count = os_event_reset(event);
+
+ cell->reservation_time = time(NULL);
+
+@@ -413,19 +433,7 @@
+ ut_a(!cell->waiting);
+ ut_ad(os_thread_get_curr_id() == cell->thread);
+
+- if (cell->request_type == SYNC_MUTEX) {
+- event = ((mutex_t*) cell->wait_object)->event;
+-#ifdef __WIN__
+- /* On windows if the thread about to wait is the one which
+- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then
+- it waits on a special event i.e.: wait_ex_event. */
+- } else if (cell->request_type == RW_LOCK_WAIT_EX) {
+- event = ((rw_lock_t*) cell->wait_object)->wait_ex_event;
+-#endif
+- } else {
+- event = ((rw_lock_t*) cell->wait_object)->event;
+- }
+-
++ event = sync_cell_get_event(cell);
+ cell->waiting = TRUE;
+
+ #ifdef UNIV_SYNC_DEBUG
+@@ -464,6 +472,7 @@
+ mutex_t* mutex;
+ rw_lock_t* rwlock;
+ ulint type;
++ ulint writer;
+
+ type = cell->request_type;
+
+@@ -492,9 +501,7 @@
+ (ulong) mutex->waiters);
+
+ } else if (type == RW_LOCK_EX
+-#ifdef __WIN__
+ || type == RW_LOCK_WAIT_EX
+-#endif
+ || type == RW_LOCK_SHARED) {
+
+ fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file);
+@@ -505,21 +512,24 @@
+ " RW-latch at %p created in file %s line %lu\n",
+ rwlock, rwlock->cfile_name,
+ (ulong) rwlock->cline);
+- if (rwlock->writer != RW_LOCK_NOT_LOCKED) {
++ writer = rw_lock_get_writer(rwlock);
++ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+- rwlock->writer == RW_LOCK_EX
++ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+- "number of readers %lu, waiters flag %lu\n"
++ "number of readers %lu, waiters flag %lu, "
++ "lock_word: %lx\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+- (ulong) rwlock->reader_count,
++ (ulong) rw_lock_get_reader_count(rwlock),
+ (ulong) rwlock->waiters,
++ rwlock->lock_word,
+ rwlock->last_s_file_name,
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+@@ -773,28 +783,30 @@
+ return(TRUE);
+ }
+
+- } else if (cell->request_type == RW_LOCK_EX
+- || cell->request_type == RW_LOCK_WAIT_EX) {
++ } else if (cell->request_type == RW_LOCK_EX) {
+
+ lock = cell->wait_object;
+
+- if (rw_lock_get_reader_count(lock) == 0
+- && rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
++ if (lock->lock_word > 0) {
++ /* Either unlocked or only read locked. */
+
+ return(TRUE);
+ }
+
+- if (rw_lock_get_reader_count(lock) == 0
+- && rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX
+- && os_thread_eq(lock->writer_thread, cell->thread)) {
++ } else if (cell->request_type == RW_LOCK_WAIT_EX) {
++
++ lock = cell->wait_object;
++
++ /* lock_word == 0 means all readers have left */
++ if (lock->lock_word == 0) {
+
+ return(TRUE);
+ }
+-
+ } else if (cell->request_type == RW_LOCK_SHARED) {
+ lock = cell->wait_object;
+
+- if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
++ /* lock_word > 0 means no writer or reserved writer */
++ if (lock->lock_word > 0) {
+
+ return(TRUE);
+ }
+@@ -839,11 +851,15 @@
+ /*========================*/
+ sync_array_t* arr) /* in: wait array */
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ (void) os_atomic_increment(&arr->sg_count, 1);
++#else
+ sync_array_enter(arr);
+
+ arr->sg_count++;
+
+ sync_array_exit(arr);
++#endif
+ }
+
+ /**************************************************************************
+@@ -859,6 +875,7 @@
+ sync_cell_t* cell;
+ ulint count;
+ ulint i;
++ os_event_t event;
+
+ sync_array_enter(arr);
+
+@@ -868,36 +885,20 @@
+ while (count < arr->n_reserved) {
+
+ cell = sync_array_get_nth_cell(arr, i);
++ i++;
+
+- if (cell->wait_object != NULL) {
+-
++ if (cell->wait_object == NULL) {
++ continue;
++ }
+ count++;
+
+ if (sync_arr_cell_can_wake_up(cell)) {
+
+- if (cell->request_type == SYNC_MUTEX) {
+- mutex_t* mutex;
++ event = sync_cell_get_event(cell);
+
+- mutex = cell->wait_object;
+- os_event_set(mutex->event);
+-#ifdef __WIN__
+- } else if (cell->request_type
+- == RW_LOCK_WAIT_EX) {
+- rw_lock_t* lock;
+-
+- lock = cell->wait_object;
+- os_event_set(lock->wait_ex_event);
+-#endif
+- } else {
+- rw_lock_t* lock;
+-
+- lock = cell->wait_object;
+- os_event_set(lock->event);
+- }
+- }
++ os_event_set(event);
+ }
+
+- i++;
+ }
+
+ sync_array_exit(arr);
+@@ -1014,4 +1015,3 @@
+
+ sync_array_exit(arr);
+ }
+-
+diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c
+--- a/innobase/sync/sync0rw.c 2009-09-10 04:03:01.000000000 +0900
++++ b/innobase/sync/sync0rw.c 2009-10-22 15:18:44.000000000 +0900
+@@ -1,8 +1,31 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ The read-write lock (for thread synchronization)
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/11/1995 Heikki Tuuri
+ *******************************************************/
+
+@@ -15,17 +38,110 @@
+ #include "mem0mem.h"
+ #include "srv0srv.h"
+
+-ulint rw_s_system_call_count = 0;
+-ulint rw_s_spin_wait_count = 0;
+-ulint rw_s_os_wait_count = 0;
++/*
++ IMPLEMENTATION OF THE RW_LOCK
++ =============================
++The status of a rw_lock is held in lock_word. The initial value of lock_word is
++X_LOCK_DECR. lock_word is decremented by 1 for each s-lock and by X_LOCK_DECR
++for each x-lock. This describes the lock state for each value of lock_word:
++
++lock_word == X_LOCK_DECR: Unlocked.
++0 < lock_word < X_LOCK_DECR: Read locked, no waiting writers.
++ (X_LOCK_DECR - lock_word) is the
++ number of readers that hold the lock.
++lock_word == 0: Write locked
++-X_LOCK_DECR < lock_word < 0: Read locked, with a waiting writer.
++ (-lock_word) is the number of readers
++ that hold the lock.
++lock_word <= -X_LOCK_DECR: Recursively write locked. lock_word has been
++ decremented by X_LOCK_DECR once for each lock,
++ so the number of locks is:
++ ((-lock_word) / X_LOCK_DECR) + 1
++When lock_word <= -X_LOCK_DECR, we also know that lock_word % X_LOCK_DECR == 0:
++other values of lock_word are invalid.
++
++The lock_word is always read and updated atomically and consistently, so that
++it always represents the state of the lock, and the state of the lock changes
++with a single atomic operation. This lock_word holds all of the information
++that a thread needs in order to determine if it is eligible to gain the lock
++or if it must spin or sleep. The one exception to this is that writer_thread
++must be verified before recursive write locks: to solve this scenario, we make
++writer_thread readable by all threads, but only writeable by the x-lock holder.
++
++The other members of the lock obey the following rules to remain consistent:
++
++recursive: This and the writer_thread field together control the
++ behaviour of recursive x-locking.
++ lock->recursive must be FALSE in following states:
++ 1) The writer_thread contains garbage i.e.: the
++ lock has just been initialized.
++ 2) The lock is not x-held and there is no
++ x-waiter waiting on WAIT_EX event.
++ 3) The lock is x-held or there is an x-waiter
++ waiting on WAIT_EX event but the 'pass' value
++ is non-zero.
++ lock->recursive is TRUE iff:
++ 1) The lock is x-held or there is an x-waiter
++ waiting on WAIT_EX event and the 'pass' value
++ is zero.
++ This flag must be set after the writer_thread field
++ has been updated with a memory ordering barrier.
++ It is unset before the lock_word has been incremented.
++writer_thread: Is used only in recursive x-locking. Can only be safely
++ read iff lock->recursive flag is TRUE.
++ This field is uninitialized at lock creation time and
++ is updated atomically when x-lock is acquired or when
++ move_ownership is called. A thread is only allowed to
++ set the value of this field to its thread_id i.e.: a
++ thread cannot set writer_thread to some other thread's
++ id.
++waiters: May be set to 1 anytime, but to avoid unnecessary wake-up
++ signals, it should only be set to 1 when there are threads
++ waiting on event. Must be 1 when a writer starts waiting to
++ ensure the current x-locking thread sends a wake-up signal
++ during unlock. May only be reset to 0 immediately before a
++ wake-up signal is sent to event. On most platforms, a
++ memory barrier is required after waiters is set, and before
++ verifying lock_word is still held, to ensure some unlocker
++ really does see the flag's new value.
++event: Threads wait on event for read or write lock when another
++ thread has an x-lock or an x-lock reservation (wait_ex). A
++ thread may only wait on event after performing the following
++ actions in order:
++ (1) Record the counter value of event (with os_event_reset).
++ (2) Set waiters to 1.
++ (3) Verify lock_word <= 0.
++ (1) must come before (2) to ensure signal is not missed.
++ (2) must come before (3) to ensure a signal is sent.
++ These restrictions force the above ordering.
++ Immediately before sending the wake-up signal, we should:
++ (1) Verify lock_word == X_LOCK_DECR (unlocked)
++ (2) Reset waiters to 0.
++wait_ex_event: A thread may only wait on the wait_ex_event after it has
++ performed the following actions in order:
++ (1) Decrement lock_word by X_LOCK_DECR.
++ (2) Record counter value of wait_ex_event (os_event_reset,
++ called from sync_array_reserve_cell).
++ (3) Verify that lock_word < 0.
++ (1) must come first to ensure no other threads become reader
++ or next writer, and notifies unlocker that signal must be sent.
++ (2) must come before (3) to ensure the signal is not missed.
++ These restrictions force the above ordering.
++ Immediately before sending the wake-up signal, we should:
++ Verify lock_word == 0 (waiting thread holds x_lock)
++*/
++
++ib_longlong rw_s_spin_wait_count = 0;
++ib_longlong rw_s_spin_round_count = 0;
++ib_longlong rw_s_os_wait_count = 0;
++
++ib_longlong rw_s_exit_count = 0;
++
++ib_longlong rw_x_spin_wait_count = 0;
++ib_longlong rw_x_spin_round_count = 0;
++ib_longlong rw_x_os_wait_count = 0;
+
+-ulint rw_s_exit_count = 0;
+-
+-ulint rw_x_system_call_count = 0;
+-ulint rw_x_spin_wait_count = 0;
+-ulint rw_x_os_wait_count = 0;
+-
+-ulint rw_x_exit_count = 0;
++ib_longlong rw_x_exit_count = 0;
+
+ /* The global list of rw-locks */
+ rw_lock_list_t rw_lock_list;
+@@ -99,22 +215,30 @@
+ object is created, then the following call initializes
+ the sync system. */
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_create(rw_lock_get_mutex(lock));
+ mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK);
+
+ lock->mutex.cfile_name = cfile_name;
+ lock->mutex.cline = cline;
+-#if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP
++# if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP
+ lock->mutex.cmutex_name = cmutex_name;
+ lock->mutex.mutex_type = 1;
+-#endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
++# endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+
+- rw_lock_set_waiters(lock, 0);
+- rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+- lock->writer_count = 0;
+- rw_lock_set_reader_count(lock, 0);
+-
+- lock->writer_is_wait_ex = FALSE;
++#else /* HAVE_ATOMIC_BUILTINS */
++# ifdef UNIV_DEBUG
++ UT_NOT_USED(cmutex_name);
++# endif
++#endif /* HAVE_ATOMIC_BUILTINS */
++
++ lock->lock_word = X_LOCK_DECR;
++ lock->waiters = 0;
++
++ /* We set this value to signify that lock->writer_thread
++ contains garbage at initialization and cannot be used for
++ recursive x-locking. */
++ lock->recursive = FALSE;
+
+ #ifdef UNIV_SYNC_DEBUG
+ UT_LIST_INIT(lock->debug_list);
+@@ -126,15 +250,13 @@
+ lock->cfile_name = cfile_name;
+ lock->cline = cline;
+
++ lock->count_os_wait = 0;
+ lock->last_s_file_name = "not yet reserved";
+ lock->last_x_file_name = "not yet reserved";
+ lock->last_s_line = 0;
+ lock->last_x_line = 0;
+ lock->event = os_event_create(NULL);
+-
+-#ifdef __WIN__
+ lock->wait_ex_event = os_event_create(NULL);
+-#endif
+
+ mutex_enter(&rw_lock_list_mutex);
+
+@@ -158,23 +280,17 @@
+ /*=========*/
+ rw_lock_t* lock) /* in: rw-lock */
+ {
+-#ifdef UNIV_DEBUG
+ ut_a(rw_lock_validate(lock));
+-#endif /* UNIV_DEBUG */
+- ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
+- ut_a(rw_lock_get_waiters(lock) == 0);
+- ut_a(rw_lock_get_reader_count(lock) == 0);
++ ut_a(lock->lock_word == X_LOCK_DECR);
+
+- lock->magic_n = 0;
+-
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_free(rw_lock_get_mutex(lock));
++#endif /* HAVE_ATOMIC_BUILTINS */
+
+ mutex_enter(&rw_lock_list_mutex);
+ os_event_free(lock->event);
+
+-#ifdef __WIN__
+ os_event_free(lock->wait_ex_event);
+-#endif
+
+ if (UT_LIST_GET_PREV(list, lock)) {
+ ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+@@ -186,6 +302,8 @@
+ UT_LIST_REMOVE(list, rw_lock_list, lock);
+
+ mutex_exit(&rw_lock_list_mutex);
++
++ lock->magic_n = 0;
+ }
+
+ /**********************************************************************
+@@ -199,19 +317,12 @@
+ {
+ ut_a(lock);
+
+- mutex_enter(rw_lock_get_mutex(lock));
++ ulint waiters = rw_lock_get_waiters(lock);
++ lint lock_word = lock->lock_word;
+
+ ut_a(lock->magic_n == RW_LOCK_MAGIC_N);
+- ut_a((rw_lock_get_reader_count(lock) == 0)
+- || (rw_lock_get_writer(lock) != RW_LOCK_EX));
+- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX)
+- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX)
+- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED));
+- ut_a((rw_lock_get_waiters(lock) == 0)
+- || (rw_lock_get_waiters(lock) == 1));
+- ut_a((lock->writer != RW_LOCK_EX) || (lock->writer_count > 0));
+-
+- mutex_exit(rw_lock_get_mutex(lock));
++ ut_a(waiters == 0 || waiters == 1);
++ ut_a(lock_word > -X_LOCK_DECR ||(-lock_word) % X_LOCK_DECR == 0);
+
+ return(TRUE);
+ }
+@@ -232,18 +343,15 @@
+ ulint line) /* in: line where requested */
+ {
+ ulint index; /* index of the reserved wait cell */
+- ulint i; /* spin round count */
++ ulint i = 0; /* spin round count */
+
+ ut_ad(rw_lock_validate(lock));
+
++ rw_s_spin_wait_count++; /* Count calls to this function */
+ lock_loop:
+- rw_s_spin_wait_count++;
+
+ /* Spin waiting for the writer field to become free */
+- i = 0;
+-
+- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED
+- && i < SYNC_SPIN_ROUNDS) {
++ while (i < SYNC_SPIN_ROUNDS && lock->lock_word <= 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+@@ -262,28 +370,32 @@
+ lock->cfile_name, (ulong) lock->cline, (ulong) i);
+ }
+
+- mutex_enter(rw_lock_get_mutex(lock));
+-
+ /* We try once again to obtain the lock */
+-
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
+- mutex_exit(rw_lock_get_mutex(lock));
++ rw_s_spin_round_count += i;
+
+ return; /* Success */
+ } else {
+- /* If we get here, locking did not succeed, we may
+- suspend the thread to wait in the wait array */
+
+- rw_s_system_call_count++;
++ if (i < SYNC_SPIN_ROUNDS) {
++ goto lock_loop;
++ }
++
++ rw_s_spin_round_count += i;
+
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock, RW_LOCK_SHARED,
+ file_name, line,
+ &index);
+
+- rw_lock_set_waiters(lock, 1);
+-
+- mutex_exit(rw_lock_get_mutex(lock));
++ /* Set waiters before checking lock_word to ensure wake-up
++ signal is sent. This may lead to some unnecessary signals. */
++ rw_lock_set_waiter_flag(lock);
++
++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
++ sync_array_free_cell(sync_primary_wait_array, index);
++ return; /* Success */
++ }
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+@@ -292,11 +404,13 @@
+ lock, lock->cfile_name, (ulong) lock->cline);
+ }
+
+- rw_s_system_call_count++;
++ /* these stats may not be accurate */
++ lock->count_os_wait++;
+ rw_s_os_wait_count++;
+
+ sync_array_wait_event(sync_primary_wait_array, index);
+
++ i = 0;
+ goto lock_loop;
+ }
+ }
+@@ -318,114 +432,130 @@
+ {
+ ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX));
+
+- mutex_enter(&(lock->mutex));
+-
+- lock->writer_thread = os_thread_get_curr_id();
+-
+- lock->pass = 0;
+-
+- mutex_exit(&(lock->mutex));
++ rw_lock_set_writer_id_and_recursion_flag(lock, TRUE);
+ }
+
+ /**********************************************************************
+-Low-level function for acquiring an exclusive lock. */
++Function for the next writer to call. Waits for readers to exit.
++The caller must have already decremented lock_word by X_LOCK_DECR.*/
+ UNIV_INLINE
+-ulint
+-rw_lock_x_lock_low(
+-/*===============*/
+- /* out: RW_LOCK_NOT_LOCKED if did
+- not succeed, RW_LOCK_EX if success,
+- RW_LOCK_WAIT_EX, if got wait reservation */
++void
++rw_lock_x_lock_wait(
++/*================*/
+ rw_lock_t* lock, /* in: pointer to rw-lock */
++#ifdef UNIV_SYNC_DEBUG
+ ulint pass, /* in: pass value; != 0, if the lock will
+ be passed to another thread to unlock */
++#endif
+ const char* file_name,/* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
+-#ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+-#endif /* UNIV_SYNC_DEBUG */
+- if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
++ ulint index;
++ ulint i = 0;
+
+- if (rw_lock_get_reader_count(lock) == 0) {
++ ut_ad(lock->lock_word <= 0);
++
++ while (lock->lock_word < 0) {
++ if (srv_spin_wait_delay) {
++ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
++ }
++ if(i < SYNC_SPIN_ROUNDS) {
++ i++;
++ continue;
++ }
+
+- rw_lock_set_writer(lock, RW_LOCK_EX);
+- lock->writer_thread = os_thread_get_curr_id();
+- lock->writer_count++;
+- lock->pass = pass;
++ /* If there is still a reader, then go to sleep.*/
++ rw_x_spin_round_count += i;
++ i = 0;
++ sync_array_reserve_cell(sync_primary_wait_array,
++ lock,
++ RW_LOCK_WAIT_EX,
++ file_name, line,
++ &index);
++ /* Check lock_word to ensure wake-up isn't missed.*/
++ if(lock->lock_word < 0) {
+
++ /* these stats may not be accurate */
++ lock->count_os_wait++;
++ rw_x_os_wait_count++;
++
++ /* Add debug info as it is needed to detect possible
++ deadlock. We must add info for WAIT_EX thread for
++ deadlock detection to work properly. */
+ #ifdef UNIV_SYNC_DEBUG
+- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
+ file_name, line);
+ #endif
+- lock->last_x_file_name = file_name;
+- lock->last_x_line = line;
+-
+- /* Locking succeeded, we may return */
+- return(RW_LOCK_EX);
+- } else {
+- /* There are readers, we have to wait */
+- rw_lock_set_writer(lock, RW_LOCK_WAIT_EX);
+- lock->writer_thread = os_thread_get_curr_id();
+- lock->pass = pass;
+- lock->writer_is_wait_ex = TRUE;
+
++ sync_array_wait_event(sync_primary_wait_array,
++ index);
+ #ifdef UNIV_SYNC_DEBUG
+- rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
+- file_name, line);
++ rw_lock_remove_debug_info(lock, pass,
++ RW_LOCK_WAIT_EX);
+ #endif
+-
+- return(RW_LOCK_WAIT_EX);
++ /* It is possible to wake when lock_word < 0.
++ We must pass the while-loop check to proceed.*/
++ } else {
++ sync_array_free_cell(sync_primary_wait_array,
++ index);
+ }
++ }
++ rw_x_spin_round_count += i;
++}
+
+- } else if ((rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX)
+- && os_thread_eq(lock->writer_thread,
+- os_thread_get_curr_id())) {
++/**********************************************************************
++Low-level function for acquiring an exclusive lock. */
++UNIV_INLINE
++ibool
++rw_lock_x_lock_low(
++/*===============*/
++ /* out: FALSE if did not succeed,
++ TRUE if success. */
++ rw_lock_t* lock, /* in: pointer to rw-lock */
++ ulint pass, /* in: pass value; != 0, if the lock will
++ be passed to another thread to unlock */
++ const char* file_name,/* in: file name where lock requested */
++ ulint line) /* in: line where requested */
++{
++ os_thread_id_t curr_thread = os_thread_get_curr_id();
+
+- if (rw_lock_get_reader_count(lock) == 0) {
++ if (rw_lock_lock_word_decr(lock, X_LOCK_DECR)) {
+
+- rw_lock_set_writer(lock, RW_LOCK_EX);
+- lock->writer_count++;
+- lock->pass = pass;
+- lock->writer_is_wait_ex = FALSE;
++ /* lock->recursive also tells us if the writer_thread
++ field is stale or active. As we are going to write
++ our own thread id in that field it must be that the
++ current writer_thread value is not active. */
++ ut_a(!lock->recursive);
+
++ /* Decrement occurred: we are writer or next-writer. */
++ rw_lock_set_writer_id_and_recursion_flag(lock,
++ pass ? FALSE : TRUE);
++
++ rw_lock_x_lock_wait(lock,
+ #ifdef UNIV_SYNC_DEBUG
+- rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX);
+- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
+- file_name, line);
++ pass,
+ #endif
++ file_name, line);
+
+- lock->last_x_file_name = file_name;
+- lock->last_x_line = line;
+-
+- /* Locking succeeded, we may return */
+- return(RW_LOCK_EX);
++ } else {
++ /* Decrement failed: relock or failed lock */
++ if (!pass && lock->recursive
++ && os_thread_eq(lock->writer_thread, curr_thread)) {
++ /* Relock */
++ lock->lock_word -= X_LOCK_DECR;
++ } else {
++ /* Another thread locked before us */
++ return(FALSE);
+ }
+-
+- return(RW_LOCK_WAIT_EX);
+-
+- } else if ((rw_lock_get_writer(lock) == RW_LOCK_EX)
+- && os_thread_eq(lock->writer_thread,
+- os_thread_get_curr_id())
+- && (lock->pass == 0)
+- && (pass == 0)) {
+-
+- lock->writer_count++;
+-
++ }
+ #ifdef UNIV_SYNC_DEBUG
+- rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name,
+- line);
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
++ file_name, line);
+ #endif
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = (unsigned int) line;
+
+- lock->last_x_file_name = file_name;
+- lock->last_x_line = line;
+-
+- /* Locking succeeded, we may return */
+- return(RW_LOCK_EX);
+- }
+-
+- /* Locking did not succeed */
+- return(RW_LOCK_NOT_LOCKED);
++ return(TRUE);
+ }
+
+ /**********************************************************************
+@@ -448,47 +578,30 @@
+ ulint line) /* in: line where requested */
+ {
+ ulint index; /* index of the reserved wait cell */
+- ulint state; /* lock state acquired */
+ ulint i; /* spin round count */
++ ibool spinning = FALSE;
+
+ ut_ad(rw_lock_validate(lock));
+
+-lock_loop:
+- /* Acquire the mutex protecting the rw-lock fields */
+- mutex_enter_fast(&(lock->mutex));
+-
+- state = rw_lock_x_lock_low(lock, pass, file_name, line);
++ i = 0;
+
+- mutex_exit(&(lock->mutex));
++lock_loop:
+
+- if (state == RW_LOCK_EX) {
++ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
++ rw_x_spin_round_count += i;
+
+ return; /* Locking succeeded */
+
+- } else if (state == RW_LOCK_NOT_LOCKED) {
+-
+- /* Spin waiting for the writer field to become free */
+- i = 0;
+-
+- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED
+- && i < SYNC_SPIN_ROUNDS) {
+- if (srv_spin_wait_delay) {
+- ut_delay(ut_rnd_interval(0,
+- srv_spin_wait_delay));
+- }
++ } else {
+
+- i++;
+- }
+- if (i == SYNC_SPIN_ROUNDS) {
+- os_thread_yield();
++ if (!spinning) {
++ spinning = TRUE;
++ rw_x_spin_wait_count++;
+ }
+- } else if (state == RW_LOCK_WAIT_EX) {
+
+- /* Spin waiting for the reader count field to become zero */
+- i = 0;
+-
+- while (rw_lock_get_reader_count(lock) != 0
+- && i < SYNC_SPIN_ROUNDS) {
++ /* Spin waiting for the lock_word to become free */
++ while (i < SYNC_SPIN_ROUNDS
++ && lock->lock_word <= 0) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+ srv_spin_wait_delay));
+@@ -498,12 +611,13 @@
+ }
+ if (i == SYNC_SPIN_ROUNDS) {
+ os_thread_yield();
++ } else {
++ goto lock_loop;
+ }
+- } else {
+- i = 0; /* Eliminate a compiler warning */
+- ut_error;
+ }
+
++ rw_x_spin_round_count += i;
++
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+ "Thread %lu spin wait rw-x-lock at %p cfile %s cline %lu rnds %lu\n",
+@@ -511,39 +625,20 @@
+ lock->cfile_name, (ulong) lock->cline, (ulong) i);
+ }
+
+- rw_x_spin_wait_count++;
+-
+- /* We try once again to obtain the lock. Acquire the mutex protecting
+- the rw-lock fields */
+-
+- mutex_enter(rw_lock_get_mutex(lock));
+-
+- state = rw_lock_x_lock_low(lock, pass, file_name, line);
+-
+- if (state == RW_LOCK_EX) {
+- mutex_exit(rw_lock_get_mutex(lock));
+-
+- return; /* Locking succeeded */
+- }
+-
+- rw_x_system_call_count++;
+-
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock,
+-#ifdef __WIN__
+- /* On windows RW_LOCK_WAIT_EX signifies
+- that this thread should wait on the
+- special wait_ex_event. */
+- (state == RW_LOCK_WAIT_EX)
+- ? RW_LOCK_WAIT_EX :
+-#endif
+ RW_LOCK_EX,
+ file_name, line,
+ &index);
+
+- rw_lock_set_waiters(lock, 1);
+-
+- mutex_exit(rw_lock_get_mutex(lock));
++ /* Waiters must be set before checking lock_word, to ensure signal
++ is sent. This could lead to a few unnecessary wake-up signals. */
++ rw_lock_set_waiter_flag(lock);
++
++ if (rw_lock_x_lock_low(lock, pass, file_name, line)) {
++ sync_array_free_cell(sync_primary_wait_array, index);
++ return; /* Locking succeeded */
++ }
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+@@ -552,11 +647,13 @@
+ lock->cfile_name, (ulong) lock->cline);
+ }
+
+- rw_x_system_call_count++;
++ /* these stats may not be accurate */
++ lock->count_os_wait++;
+ rw_x_os_wait_count++;
+
+ sync_array_wait_event(sync_primary_wait_array, index);
+
++ i = 0;
+ goto lock_loop;
+ }
+
+@@ -697,7 +794,9 @@
+ rw_lock_t* lock, /* in: rw-lock */
+ ulint level) /* in: level */
+ {
++#ifdef UNIV_SYNC_DEBUG
+ lock->level = level;
++#endif /* UNIV_SYNC_DEBUG */
+ }
+
+ #ifdef UNIV_SYNC_DEBUG
+@@ -718,7 +817,7 @@
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+- mutex_enter(&(lock->mutex));
++ rw_lock_debug_mutex_enter();
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+@@ -728,7 +827,7 @@
+ && (info->pass == 0)
+ && (info->lock_type == lock_type)) {
+
+- mutex_exit(&(lock->mutex));
++ rw_lock_debug_mutex_exit();
+ /* Found! */
+
+ return(TRUE);
+@@ -736,7 +835,7 @@
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+- mutex_exit(&(lock->mutex));
++ rw_lock_debug_mutex_exit();
+
+ return(FALSE);
+ }
+@@ -758,22 +857,18 @@
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
+- mutex_enter(&(lock->mutex));
+-
+ if (lock_type == RW_LOCK_SHARED) {
+- if (lock->reader_count > 0) {
++ if (rw_lock_get_reader_count(lock) > 0) {
+ ret = TRUE;
+ }
+ } else if (lock_type == RW_LOCK_EX) {
+- if (lock->writer == RW_LOCK_EX) {
++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) {
+ ret = TRUE;
+ }
+ } else {
+ ut_error;
+ }
+
+- mutex_exit(&(lock->mutex));
+-
+ return(ret);
+ }
+
+@@ -801,11 +896,10 @@
+
+ count++;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
+-
+- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+- || (rw_lock_get_reader_count(lock) != 0)
+- || (rw_lock_get_waiters(lock) != 0)) {
++#endif
++ if (lock->lock_word != X_LOCK_DECR) {
+
+ fprintf(stderr, "RW-LOCK: %p ", lock);
+
+@@ -821,8 +915,10 @@
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ }
+-
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
++
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+@@ -845,9 +941,10 @@
+ "RW-LATCH INFO\n"
+ "RW-LATCH: %p ", lock);
+
+- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+- || (rw_lock_get_reader_count(lock) != 0)
+- || (rw_lock_get_waiters(lock) != 0)) {
++#ifndef HAVE_ATOMIC_BUILTINS
++ mutex_enter(&(lock->mutex));
++#endif
++ if (lock->lock_word != X_LOCK_DECR) {
+
+ if (rw_lock_get_waiters(lock)) {
+ fputs(" Waiters for the lock exist\n", stderr);
+@@ -861,6 +958,9 @@
+ info = UT_LIST_GET_NEXT(list, info);
+ }
+ }
++#ifndef HAVE_ATOMIC_BUILTINS
++ mutex_exit(&(lock->mutex));
++#endif
+ }
+
+ /*************************************************************************
+@@ -909,14 +1009,11 @@
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
+- mutex_enter(rw_lock_get_mutex(lock));
+
+- if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+- || (rw_lock_get_reader_count(lock) != 0)) {
++ if (lock->lock_word != X_LOCK_DECR) {
+ count++;
+ }
+
+- mutex_exit(rw_lock_get_mutex(lock));
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+diff -ruN a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c
+--- a/innobase/sync/sync0sync.c 2009-10-22 15:15:05.000000000 +0900
++++ b/innobase/sync/sync0sync.c 2009-10-22 15:18:44.000000000 +0900
+@@ -1,8 +1,31 @@
++/*****************************************************************************
++
++Copyright (c) 1995, 2009, Innobase Oy. All Rights Reserved.
++Copyright (c) 2008, Google Inc.
++
++Portions of this file contain modifications contributed and copyrighted by
++Google, Inc. Those modifications are gratefully acknowledged and are described
++briefly in the InnoDB documentation. The contributions by Google are
++incorporated with their permission, and subject to the conditions contained in
++the file COPYING.Google.
++
++This program is free software; you can redistribute it and/or modify it under
++the terms of the GNU General Public License as published by the Free Software
++Foundation; version 2 of the License.
++
++This program is distributed in the hope that it will be useful, but WITHOUT
++ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
++FOR A PARTICULAR PURPOSE. See the GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License along with
++this program; if not, write to the Free Software Foundation, Inc., 59 Temple
++Place, Suite 330, Boston, MA 02111-1307 USA
++
++*****************************************************************************/
++
+ /******************************************************
+ Mutex, the basic synchronization primitive
+
+-(c) 1995 Innobase Oy
+-
+ Created 9/5/1995 Heikki Tuuri
+ *******************************************************/
+
+@@ -140,17 +163,12 @@
+
+ ulint sync_dummy = 0;
+
+-/* The number of system calls made in this module. Intended for performance
+-monitoring. */
+-
+-ulint mutex_system_call_count = 0;
+-
+ /* Number of spin waits on mutexes: for performance monitoring */
+
+-ulint mutex_spin_round_count = 0;
+-ulint mutex_spin_wait_count = 0;
+-ulint mutex_os_wait_count = 0;
+-ulint mutex_exit_count = 0;
++ib_longlong mutex_spin_round_count = 0;
++ib_longlong mutex_spin_wait_count = 0;
++ib_longlong mutex_os_wait_count = 0;
++ib_longlong mutex_exit_count = 0;
+
+ /* The global array of wait cells for implementation of the database's own
+ mutexes and read-write locks */
+@@ -240,6 +258,8 @@
+ {
+ #if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER)
+ mutex_reset_lock_word(mutex);
++#elif defined(HAVE_ATOMIC_BUILTINS)
++ mutex_reset_lock_word(mutex);
+ #else
+ os_fast_mutex_init(&(mutex->os_fast_mutex));
+ mutex->lock_word = 0;
+@@ -325,7 +345,9 @@
+
+ os_event_free(mutex->event);
+
+-#if !defined(_WIN32) || !defined(UNIV_CAN_USE_X86_ASSEMBLER)
++#if defined(_WIN32) && defined(UNIV_CAN_USE_X86_ASSEMBLER)
++#elif defined(HAVE_ATOMIC_BUILTINS)
++#else
+ os_fast_mutex_free(&(mutex->os_fast_mutex));
+ #endif
+ /* If we free the mutex protecting the mutex list (freeing is
+@@ -421,6 +443,12 @@
+ #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+ ut_ad(mutex);
+
++ /* This update is not thread safe, but we don't mind if the count
++ isn't exact. Moved out of the ifdef that follows because we are
++ willing to pay the cost of counting this, as the data is valuable.
++ Count the number of calls to mutex_spin_wait. */
++ mutex_spin_wait_count++;
++
+ mutex_loop:
+
+ i = 0;
+@@ -433,7 +461,6 @@
+
+ spin_loop:
+ #if defined UNIV_DEBUG && !defined UNIV_HOTBACKUP
+- mutex_spin_wait_count++;
+ mutex->count_spin_loop++;
+ #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
+
+@@ -502,8 +529,6 @@
+ sync_array_reserve_cell(sync_primary_wait_array, mutex,
+ SYNC_MUTEX, file_name, line, &index);
+
+- mutex_system_call_count++;
+-
+ /* The memory order of the array reservation and the change in the
+ waiters field is important: when we suspend a thread, we first
+ reserve the cell and then set waiters field to 1. When threads are
+@@ -551,7 +576,6 @@
+ mutex->cfile_name, (ulong) mutex->cline, (ulong) i);
+ #endif
+
+- mutex_system_call_count++;
+ mutex_os_wait_count++;
+
+ #ifndef UNIV_HOTBACKUP
+@@ -1368,20 +1392,31 @@
+ FILE* file) /* in: file where to print */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- fprintf(stderr, "Mutex exits %lu, rws exits %lu, rwx exits %lu\n",
++ fprintf(file, "Mutex exits %llu, rws exits %llu, rwx exits %llu\n",
+ mutex_exit_count, rw_s_exit_count, rw_x_exit_count);
+ #endif
+
+ fprintf(file,
+-"Mutex spin waits %lu, rounds %lu, OS waits %lu\n"
+-"RW-shared spins %lu, OS waits %lu; RW-excl spins %lu, OS waits %lu\n",
+- (ulong) mutex_spin_wait_count,
+- (ulong) mutex_spin_round_count,
+- (ulong) mutex_os_wait_count,
+- (ulong) rw_s_spin_wait_count,
+- (ulong) rw_s_os_wait_count,
+- (ulong) rw_x_spin_wait_count,
+- (ulong) rw_x_os_wait_count);
++ "Mutex spin waits %llu, rounds %llu, OS waits %llu\n"
++ "RW-shared spins %llu, OS waits %llu;"
++ " RW-excl spins %llu, OS waits %llu\n",
++ mutex_spin_wait_count,
++ mutex_spin_round_count,
++ mutex_os_wait_count,
++ rw_s_spin_wait_count,
++ rw_s_os_wait_count,
++ rw_x_spin_wait_count,
++ rw_x_os_wait_count);
++
++ fprintf(file,
++ "Spin rounds per wait: %.2f mutex, %.2f RW-shared, "
++ "%.2f RW-excl\n",
++ (double) mutex_spin_round_count /
++ (mutex_spin_wait_count ? mutex_spin_wait_count : 1),
++ (double) rw_s_spin_round_count /
++ (rw_s_spin_wait_count ? rw_s_spin_wait_count : 1),
++ (double) rw_x_spin_round_count /
++ (rw_x_spin_wait_count ? rw_x_spin_wait_count : 1));
+ }
+
+ /***********************************************************************
+diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_rw_lock.info 2009-10-22 15:18:30.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_rw_lock.patch
++Name=Fix of InnoDB rw_locks ported from InnoDB Plugin
++Version=1.0
++Author=InnoBase Oy.
++License=GPL
++Comment=
diff --git a/percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch b/percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch
new file mode 100644
index 0000000..b4a1a79
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_rw_lock_old.patch
@@ -0,0 +1,1357 @@
+diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c
+--- a/innobase/btr/btr0sea.c 2009-05-20 14:21:44.000000000 +0900
++++ b/innobase/btr/btr0sea.c 2009-05-20 14:39:34.000000000 +0900
+@@ -773,7 +773,7 @@
+ rw_lock_s_lock(&btr_search_latch);
+ }
+
+- ut_ad(btr_search_latch.writer != RW_LOCK_EX);
++ ut_ad(btr_search_latch.writer_count == 0);
+ ut_ad(btr_search_latch.reader_count > 0);
+
+ rec = ha_search_and_get_data(btr_search_sys->hash_index, fold);
+diff -ruN a/innobase/include/sync0rw.h b/innobase/include/sync0rw.h
+--- a/innobase/include/sync0rw.h 2009-01-30 06:42:20.000000000 +0900
++++ b/innobase/include/sync0rw.h 2009-04-16 16:15:28.000000000 +0900
+@@ -325,7 +325,17 @@
+ Accessor functions for rw lock. */
+ UNIV_INLINE
+ ulint
+-rw_lock_get_waiters(
++rw_lock_get_s_waiters(
++/*==================*/
++ rw_lock_t* lock);
++UNIV_INLINE
++ulint
++rw_lock_get_x_waiters(
++/*==================*/
++ rw_lock_t* lock);
++UNIV_INLINE
++ulint
++rw_lock_get_wx_waiters(
+ /*================*/
+ rw_lock_t* lock);
+ UNIV_INLINE
+@@ -408,6 +418,17 @@
+ rw_lock_debug_t* info); /* in: debug struct */
+ #endif /* UNIV_SYNC_DEBUG */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++/* This value means NOT_LOCKED */
++#define RW_LOCK_BIAS 0x00100000
++#else
++#error HAVE_ATOMIC_BUILTINS is not defined. Do you use enough new GCC or compatibles?
++#error Or do you use exact options for CFLAGS?
++#error e.g. (for x86_32): "-m32 -march=i586 -mtune=i686"
++#error e.g. (for Sparc_64): "-m64 -mcpu=v9"
++#error Otherwise, this build may be slower than normal version.
++#endif
++
+ /* NOTE! The structure appears here only for the compiler to know its size.
+ Do not use its fields directly! The structure used in the spin lock
+ implementation of a read-write lock. Several threads may have a shared lock
+@@ -417,9 +438,9 @@
+ field. Then no new readers are allowed in. */
+
+ struct rw_lock_struct {
+- os_event_t event; /* Used by sync0arr.c for thread queueing */
+-
+-#ifdef __WIN__
++ /* Used by sync0arr.c for thread queueing */
++ os_event_t s_event; /* Used for s_lock */
++ os_event_t x_event; /* Used for x_lock */
+ os_event_t wait_ex_event; /* This windows specific event is
+ used by the thread which has set the
+ lock state to RW_LOCK_WAIT_EX. The
+@@ -427,31 +448,35 @@
+ thread will be the next one to proceed
+ once the current the event gets
+ signalled. See LEMMA 2 in sync0sync.c */
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ volatile lint lock_word; /* Used by using atomic builtin */
+ #endif
+
+- ulint reader_count; /* Number of readers who have locked this
++ volatile ulint reader_count; /* Number of readers who have locked this
+ lock in the shared mode */
+- ulint writer; /* This field is set to RW_LOCK_EX if there
++ volatile ulint writer; /* This field is set to RW_LOCK_EX if there
+ is a writer owning the lock (in exclusive
+ mode), RW_LOCK_WAIT_EX if a writer is
+ queueing for the lock, and
+ RW_LOCK_NOT_LOCKED, otherwise. */
+- os_thread_id_t writer_thread;
++ volatile os_thread_id_t writer_thread;
+ /* Thread id of a possible writer thread */
+- ulint writer_count; /* Number of times the same thread has
++ volatile ulint writer_count; /* Number of times the same thread has
+ recursively locked the lock in the exclusive
+ mode */
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_t mutex; /* The mutex protecting rw_lock_struct */
++#endif
+ ulint pass; /* Default value 0. This is set to some
+ value != 0 given by the caller of an x-lock
+ operation, if the x-lock is to be passed to
+ another thread to unlock (which happens in
+ asynchronous i/o). */
+- ulint waiters; /* This ulint is set to 1 if there are
+- waiters (readers or writers) in the global
+- wait array, waiting for this rw_lock.
+- Otherwise, == 0. */
+- ibool writer_is_wait_ex;
++ volatile ulint s_waiters; /* 1: there are waiters (s_lock) */
++ volatile ulint x_waiters; /* 1: there are waiters (x_lock) */
++ volatile ulint wait_ex_waiters; /* 1: there are waiters (wait_ex) */
++ volatile ibool writer_is_wait_ex;
+ /* This is TRUE if the writer field is
+ RW_LOCK_WAIT_EX; this field is located far
+ from the memory update hotspot fields which
+diff -ruN a/innobase/include/sync0rw.ic b/innobase/include/sync0rw.ic
+--- a/innobase/include/sync0rw.ic 2009-01-30 06:42:20.000000000 +0900
++++ b/innobase/include/sync0rw.ic 2009-04-16 17:06:53.000000000 +0900
+@@ -47,20 +47,64 @@
+ Accessor functions for rw lock. */
+ UNIV_INLINE
+ ulint
+-rw_lock_get_waiters(
++rw_lock_get_s_waiters(
+ /*================*/
+ rw_lock_t* lock)
+ {
+- return(lock->waiters);
++ return(lock->s_waiters);
+ }
+ UNIV_INLINE
+-void
+-rw_lock_set_waiters(
++ulint
++rw_lock_get_x_waiters(
+ /*================*/
++ rw_lock_t* lock)
++{
++ return(lock->x_waiters);
++}
++UNIV_INLINE
++ulint
++rw_lock_get_wx_waiters(
++/*================*/
++ rw_lock_t* lock)
++{
++ return(lock->wait_ex_waiters);
++}
++UNIV_INLINE
++void
++rw_lock_set_s_waiters(
+ rw_lock_t* lock,
+ ulint flag)
+ {
+- lock->waiters = flag;
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_lock_test_and_set(&lock->s_waiters, flag);
++#else
++ lock->s_waiters = flag;
++#endif
++}
++UNIV_INLINE
++void
++rw_lock_set_x_waiters(
++ rw_lock_t* lock,
++ ulint flag)
++{
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_lock_test_and_set(&lock->x_waiters, flag);
++#else
++ lock->x_waiters = flag;
++#endif
++}
++UNIV_INLINE
++void
++rw_lock_set_wx_waiters(
++/*================*/
++ rw_lock_t* lock,
++ ulint flag)
++{
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_lock_test_and_set(&lock->wait_ex_waiters, flag);
++#else
++ lock->wait_ex_waiters = flag;
++#endif
+ }
+ UNIV_INLINE
+ ulint
+@@ -68,7 +112,19 @@
+ /*===============*/
+ rw_lock_t* lock)
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (lock->writer == RW_LOCK_NOT_LOCKED) {
++ return(RW_LOCK_NOT_LOCKED);
++ }
++
++ if (lock->writer_is_wait_ex) {
++ return(RW_LOCK_WAIT_EX);
++ } else {
++ return(RW_LOCK_EX);
++ }
++#else
+ return(lock->writer);
++#endif
+ }
+ UNIV_INLINE
+ void
+@@ -96,6 +152,7 @@
+ {
+ lock->reader_count = count;
+ }
++#ifndef HAVE_ATOMIC_BUILTINS
+ UNIV_INLINE
+ mutex_t*
+ rw_lock_get_mutex(
+@@ -104,6 +161,7 @@
+ {
+ return(&(lock->mutex));
+ }
++#endif
+
+ /**********************************************************************
+ Returns the value of writer_count for the lock. Does not reserve the lock
+@@ -133,14 +191,26 @@
+ const char* file_name, /* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
+-#ifdef UNIV_SYNC_DEBUG
++#if defined(UNIV_SYNC_DEBUG) && !defined(HAVE_ATOMIC_BUILTINS)
+ ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+ #endif /* UNIV_SYNC_DEBUG */
+ /* Check if the writer field is free */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (UNIV_LIKELY(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED)) {
++ /* try s-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) {
++ /* fail */
++ __sync_fetch_and_add(&(lock->lock_word),1);
++ return(FALSE); /* locking did not succeed */
++ }
++ /* success */
++ __sync_fetch_and_add(&(lock->reader_count),1);
++#else
+ if (UNIV_LIKELY(lock->writer == RW_LOCK_NOT_LOCKED)) {
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, pass, RW_LOCK_SHARED, file_name,
+@@ -167,11 +237,15 @@
+ const char* file_name, /* in: file name where requested */
+ ulint line) /* in: line where lock requested */
+ {
+- ut_ad(lock->writer == RW_LOCK_NOT_LOCKED);
++ ut_ad(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
+ ut_ad(rw_lock_get_reader_count(lock) == 0);
+
+ /* Set the shared lock by incrementing the reader count */
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_fetch_and_add(&(lock->reader_count),1);
++#else
+ lock->reader_count++;
++#endif
+
+ lock->last_s_file_name = file_name;
+ lock->last_s_line = line;
+@@ -199,7 +273,11 @@
+
+ rw_lock_set_writer(lock, RW_LOCK_EX);
+ lock->writer_thread = os_thread_get_curr_id();
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_fetch_and_add(&(lock->writer_count),1);
++#else
+ lock->writer_count++;
++#endif
+ lock->pass = 0;
+
+ lock->last_x_file_name = file_name;
+@@ -241,15 +319,21 @@
+ ut_ad(!rw_lock_own(lock, RW_LOCK_SHARED)); /* see NOTE above */
+ #endif /* UNIV_SYNC_DEBUG */
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ if (UNIV_LIKELY(rw_lock_s_lock_low(lock, pass, file_name, line))) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return; /* Success */
+ } else {
+ /* Did not succeed, try spin wait */
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ rw_lock_s_lock_spin(lock, pass, file_name, line);
+
+@@ -272,11 +356,23 @@
+ {
+ ibool success = FALSE;
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED) {
++ /* try s-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),1) <= 0) {
++ /* fail */
++ __sync_fetch_and_add(&(lock->lock_word),1);
++ return(FALSE); /* locking did not succeed */
++ }
++ /* success */
++ __sync_fetch_and_add(&(lock->reader_count),1);
++#else
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if (lock->writer == RW_LOCK_NOT_LOCKED) {
+ /* Set the shared lock by incrementing the reader count */
+ lock->reader_count++;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_add_debug_info(lock, 0, RW_LOCK_SHARED, file_name,
+@@ -289,7 +385,9 @@
+ success = TRUE;
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return(success);
+ }
+@@ -309,6 +407,54 @@
+ {
+ ibool success = FALSE;
+ os_thread_id_t curr_thread = os_thread_get_curr_id();
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (lock->reader_count == 0) {
++ /* try to lock writer */
++ if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX)
++ == RW_LOCK_NOT_LOCKED) {
++ /* success */
++retry_x_lock:
++ /* try x-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),
++ RW_LOCK_BIAS) == 0) {
++ /* success */
++ lock->writer_thread = curr_thread;
++ lock->pass = 0;
++ lock->writer_is_wait_ex = FALSE;
++ /* next function may work as memory barrier */
++ relock:
++ __sync_fetch_and_add(&(lock->writer_count),1);
++
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_add_debug_info(lock, 0, RW_LOCK_EX, file_name, line);
++#endif
++
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
++
++ ut_ad(rw_lock_validate(lock));
++
++ return(TRUE);
++ } else {
++ /* fail (x-lock) */
++ if (__sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS)
++ == 0)
++ goto retry_x_lock;
++ }
++
++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED);
++ }
++ }
++
++ if (lock->pass == 0
++ && os_thread_eq(lock->writer_thread, curr_thread)) {
++ goto relock;
++ }
++
++ //ut_ad(rw_lock_validate(lock));
++
++ return(FALSE);
++#else
+ mutex_enter(rw_lock_get_mutex(lock));
+
+ if (UNIV_UNLIKELY(rw_lock_get_reader_count(lock) != 0)) {
+@@ -339,6 +485,7 @@
+ ut_ad(rw_lock_validate(lock));
+
+ return(success);
++#endif
+ }
+
+ /**********************************************************************
+@@ -354,16 +501,33 @@
+ #endif
+ )
+ {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_t* mutex = &(lock->mutex);
+- ibool sg = FALSE;
++#endif
++ ibool x_sg = FALSE;
++ ibool wx_sg = FALSE;
++#ifdef HAVE_ATOMIC_BUILTINS
++ ibool last = FALSE;
++#endif
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter(mutex);
++#endif
+
+ /* Reset the shared lock by decrementing the reader count */
+
+ ut_a(lock->reader_count > 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* unlock lock_word */
++ __sync_fetch_and_add(&(lock->lock_word),1);
++
++ if(__sync_sub_and_fetch(&(lock->reader_count),1) == 0) {
++ last = TRUE;
++ }
++#else
+ lock->reader_count--;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_SHARED);
+@@ -372,22 +536,39 @@
+ /* If there may be waiters and this was the last s-lock,
+ signal the object */
+
+- if (UNIV_UNLIKELY(lock->waiters)
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0))) {
++ os_event_set(lock->wait_ex_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++ else if (UNIV_UNLIKELY(last && __sync_lock_test_and_set(&lock->x_waiters, 0))) {
++ os_event_set(lock->x_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++#else
++ if (UNIV_UNLIKELY(lock->wait_ex_waiters)
+ && lock->reader_count == 0) {
+- sg = TRUE;
++ wx_sg = TRUE;
+
+- rw_lock_set_waiters(lock, 0);
++ rw_lock_set_wx_waiters(lock, 0);
++ }
++ else if (UNIV_UNLIKELY(lock->x_waiters)
++ && lock->reader_count == 0) {
++ x_sg = TRUE;
++
++ rw_lock_set_x_waiters(lock, 0);
+ }
+
+ mutex_exit(mutex);
+
+- if (UNIV_UNLIKELY(sg)) {
+-#ifdef __WIN__
++ if (UNIV_UNLIKELY(wx_sg)) {
+ os_event_set(lock->wait_ex_event);
+-#endif
+- os_event_set(lock->event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ } else if (UNIV_UNLIKELY(x_sg)) {
++ os_event_set(lock->x_event);
+ sync_array_object_signalled(sync_primary_wait_array);
+ }
++#endif
+
+ ut_ad(rw_lock_validate(lock));
+
+@@ -409,13 +590,22 @@
+
+ ut_ad(lock->reader_count > 0);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_sub_and_fetch(&(lock->reader_count),1);
++#else
+ lock->reader_count--;
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_SHARED);
+ #endif
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ ut_ad(!lock->s_waiters);
++ ut_ad(!lock->x_waiters);
++#else
+ ut_ad(!lock->waiters);
++#endif
+ ut_ad(rw_lock_validate(lock));
+ #ifdef UNIV_SYNC_PERF_STAT
+ rw_s_exit_count++;
+@@ -435,41 +625,83 @@
+ #endif
+ )
+ {
+- ibool sg = FALSE;
++#ifdef HAVE_ATOMIC_BUILTINS
++ ibool last = FALSE;
++#endif
++ ibool s_sg = FALSE;
++ ibool x_sg = FALSE;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter(&(lock->mutex));
++#endif
+
+ /* Reset the exclusive lock if this thread no longer has an x-mode
+ lock */
+
+ ut_ad(lock->writer_count > 0);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) {
++ last = TRUE;
++ }
++
++ if (last) {
++ /* unlock lock_word */
++ __sync_fetch_and_add(&(lock->lock_word),RW_LOCK_BIAS);
++
++ /* FIXME: storing -1 in a thread id is bad manners under pthreads,
++ but we must not keep the id of a thread that is no longer the owner. */
++ lock->writer_thread = -1;
++ __sync_lock_test_and_set(&(lock->writer),RW_LOCK_NOT_LOCKED);
++ }
++#else
+ lock->writer_count--;
+
+ if (lock->writer_count == 0) {
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ }
++#endif
+
+ #ifdef UNIV_SYNC_DEBUG
+ rw_lock_remove_debug_info(lock, pass, RW_LOCK_EX);
+ #endif
+
+ /* If there may be waiters, signal the lock */
+- if (UNIV_UNLIKELY(lock->waiters)
+- && lock->writer_count == 0) {
+-
+- sg = TRUE;
+- rw_lock_set_waiters(lock, 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (last) {
++ if(__sync_lock_test_and_set(&lock->s_waiters, 0)){
++ s_sg = TRUE;
++ }
++ if(__sync_lock_test_and_set(&lock->x_waiters, 0)){
++ x_sg = TRUE;
++ }
++ }
++#else
++ if (lock->writer_count == 0) {
++ if(lock->s_waiters){
++ s_sg = TRUE;
++ rw_lock_set_s_waiters(lock, 0);
++ }
++ if(lock->x_waiters){
++ x_sg = TRUE;
++ rw_lock_set_x_waiters(lock, 0);
++ }
+ }
+
+ mutex_exit(&(lock->mutex));
++#endif
+
+- if (UNIV_UNLIKELY(sg)) {
++ if (UNIV_UNLIKELY(s_sg)) {
++ os_event_set(lock->s_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++ if (UNIV_UNLIKELY(x_sg)) {
+ #ifdef __WIN__
++ /* I doubt the necessity of it. */
+ os_event_set(lock->wait_ex_event);
+ #endif
+- os_event_set(lock->event);
++ os_event_set(lock->x_event);
+ sync_array_object_signalled(sync_primary_wait_array);
+ }
+
+@@ -494,9 +726,13 @@
+
+ ut_ad(lock->writer_count > 0);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if(__sync_sub_and_fetch(&(lock->writer_count),1) == 0) {
++#else
+ lock->writer_count--;
+
+ if (lock->writer_count == 0) {
++#endif
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ }
+
+@@ -504,7 +740,12 @@
+ rw_lock_remove_debug_info(lock, 0, RW_LOCK_EX);
+ #endif
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ ut_ad(!lock->s_waiters);
++ ut_ad(!lock->x_waiters);
++#else
+ ut_ad(!lock->waiters);
++#endif
+ ut_ad(rw_lock_validate(lock));
+
+ #ifdef UNIV_SYNC_PERF_STAT
+diff -ruN a/innobase/sync/sync0arr.c b/innobase/sync/sync0arr.c
+--- a/innobase/sync/sync0arr.c 2009-01-30 06:42:24.000000000 +0900
++++ b/innobase/sync/sync0arr.c 2009-04-16 16:15:28.000000000 +0900
+@@ -309,13 +309,13 @@
+ {
+ if (type == SYNC_MUTEX) {
+ return(os_event_reset(((mutex_t *) object)->event));
+-#ifdef __WIN__
+ } else if (type == RW_LOCK_WAIT_EX) {
+ return(os_event_reset(
+ ((rw_lock_t *) object)->wait_ex_event));
+-#endif
+- } else {
+- return(os_event_reset(((rw_lock_t *) object)->event));
++ } else if (type == RW_LOCK_SHARED) {
++ return(os_event_reset(((rw_lock_t *) object)->s_event));
++ } else { /* RW_LOCK_EX */
++ return(os_event_reset(((rw_lock_t *) object)->x_event));
+ }
+ }
+
+@@ -415,15 +415,12 @@
+
+ if (cell->request_type == SYNC_MUTEX) {
+ event = ((mutex_t*) cell->wait_object)->event;
+-#ifdef __WIN__
+- /* On windows if the thread about to wait is the one which
+- has set the state of the rw_lock to RW_LOCK_WAIT_EX, then
+- it waits on a special event i.e.: wait_ex_event. */
+ } else if (cell->request_type == RW_LOCK_WAIT_EX) {
+ event = ((rw_lock_t*) cell->wait_object)->wait_ex_event;
+-#endif
+- } else {
+- event = ((rw_lock_t*) cell->wait_object)->event;
++ } else if (cell->request_type == RW_LOCK_SHARED) {
++ event = ((rw_lock_t*) cell->wait_object)->s_event;
++ } else {
++ event = ((rw_lock_t*) cell->wait_object)->x_event;
+ }
+
+ cell->waiting = TRUE;
+@@ -464,6 +461,7 @@
+ mutex_t* mutex;
+ rw_lock_t* rwlock;
+ ulint type;
++ ulint writer;
+
+ type = cell->request_type;
+
+@@ -492,12 +490,10 @@
+ (ulong) mutex->waiters);
+
+ } else if (type == RW_LOCK_EX
+-#ifdef __WIN__
+ || type == RW_LOCK_WAIT_EX
+-#endif
+ || type == RW_LOCK_SHARED) {
+
+- fputs(type == RW_LOCK_EX ? "X-lock on" : "S-lock on", file);
++ fputs(type == RW_LOCK_SHARED ? "S-lock on" : "X-lock on", file);
+
+ rwlock = cell->old_wait_rw_lock;
+
+@@ -505,21 +501,23 @@
+ " RW-latch at %p created in file %s line %lu\n",
+ rwlock, rwlock->cfile_name,
+ (ulong) rwlock->cline);
+- if (rwlock->writer != RW_LOCK_NOT_LOCKED) {
++ writer = rw_lock_get_writer(rwlock);
++ if (writer != RW_LOCK_NOT_LOCKED) {
+ fprintf(file,
+ "a writer (thread id %lu) has reserved it in mode %s",
+ (ulong) os_thread_pf(rwlock->writer_thread),
+- rwlock->writer == RW_LOCK_EX
++ writer == RW_LOCK_EX
+ ? " exclusive\n"
+ : " wait exclusive\n");
+ }
+
+ fprintf(file,
+- "number of readers %lu, waiters flag %lu\n"
++ "number of readers %lu, s_waiters flag %lu, x_waiters flag %lu\n"
+ "Last time read locked in file %s line %lu\n"
+ "Last time write locked in file %s line %lu\n",
+ (ulong) rwlock->reader_count,
+- (ulong) rwlock->waiters,
++ (ulong) rwlock->s_waiters,
++ (ulong) (rwlock->x_waiters || rwlock->wait_ex_waiters),
+ rwlock->last_s_file_name,
+ (ulong) rwlock->last_s_line,
+ rwlock->last_x_file_name,
+@@ -839,11 +837,15 @@
+ /*========================*/
+ sync_array_t* arr) /* in: wait array */
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ __sync_fetch_and_add(&(arr->sg_count),1);
++#else
+ sync_array_enter(arr);
+
+ arr->sg_count++;
+
+ sync_array_exit(arr);
++#endif
+ }
+
+ /**************************************************************************
+@@ -880,19 +882,23 @@
+
+ mutex = cell->wait_object;
+ os_event_set(mutex->event);
+-#ifdef __WIN__
+ } else if (cell->request_type
+ == RW_LOCK_WAIT_EX) {
+ rw_lock_t* lock;
+
+ lock = cell->wait_object;
+ os_event_set(lock->wait_ex_event);
+-#endif
+- } else {
++ } else if (cell->request_type
++ == RW_LOCK_SHARED) {
+ rw_lock_t* lock;
+
+ lock = cell->wait_object;
+- os_event_set(lock->event);
++ os_event_set(lock->s_event);
++ } else {
++ rw_lock_t* lock;
++
++ lock = cell->wait_object;
++ os_event_set(lock->x_event);
+ }
+ }
+ }
+diff -ruN a/innobase/sync/sync0rw.c b/innobase/sync/sync0rw.c
+--- a/innobase/sync/sync0rw.c 2009-01-30 06:42:24.000000000 +0900
++++ b/innobase/sync/sync0rw.c 2009-04-16 17:33:59.000000000 +0900
+@@ -99,6 +99,7 @@
+ object is created, then the following call initializes
+ the sync system. */
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_create(rw_lock_get_mutex(lock));
+ mutex_set_level(rw_lock_get_mutex(lock), SYNC_NO_ORDER_CHECK);
+
+@@ -108,8 +109,14 @@
+ lock->mutex.cmutex_name = cmutex_name;
+ lock->mutex.mutex_type = 1;
+ #endif /* UNIV_DEBUG && !UNIV_HOTBACKUP */
++#endif /* !HAVE_ATOMIC_BUILTINS */
+
+- rw_lock_set_waiters(lock, 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ lock->lock_word = RW_LOCK_BIAS;
++#endif
++ rw_lock_set_s_waiters(lock, 0);
++ rw_lock_set_x_waiters(lock, 0);
++ rw_lock_set_wx_waiters(lock, 0);
+ rw_lock_set_writer(lock, RW_LOCK_NOT_LOCKED);
+ lock->writer_count = 0;
+ rw_lock_set_reader_count(lock, 0);
+@@ -130,11 +137,9 @@
+ lock->last_x_file_name = "not yet reserved";
+ lock->last_s_line = 0;
+ lock->last_x_line = 0;
+- lock->event = os_event_create(NULL);
+-
+-#ifdef __WIN__
++ lock->s_event = os_event_create(NULL);
++ lock->x_event = os_event_create(NULL);
+ lock->wait_ex_event = os_event_create(NULL);
+-#endif
+
+ mutex_enter(&rw_lock_list_mutex);
+
+@@ -162,19 +167,21 @@
+ ut_a(rw_lock_validate(lock));
+ #endif /* UNIV_DEBUG */
+ ut_a(rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED);
+- ut_a(rw_lock_get_waiters(lock) == 0);
++ ut_a(rw_lock_get_s_waiters(lock) == 0);
++ ut_a(rw_lock_get_x_waiters(lock) == 0);
++ ut_a(rw_lock_get_wx_waiters(lock) == 0);
+ ut_a(rw_lock_get_reader_count(lock) == 0);
+
+ lock->magic_n = 0;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_free(rw_lock_get_mutex(lock));
++#endif
+
+ mutex_enter(&rw_lock_list_mutex);
+- os_event_free(lock->event);
+-
+-#ifdef __WIN__
++ os_event_free(lock->s_event);
++ os_event_free(lock->x_event);
+ os_event_free(lock->wait_ex_event);
+-#endif
+
+ if (UT_LIST_GET_PREV(list, lock)) {
+ ut_a(UT_LIST_GET_PREV(list, lock)->magic_n == RW_LOCK_MAGIC_N);
+@@ -192,26 +199,43 @@
+ Checks that the rw-lock has been initialized and that there are no
+ simultaneous shared and exclusive locks. */
+
++/* MEMO: If HAVE_ATOMIC_BUILTINS, we should use this function statically. */
++
+ ibool
+ rw_lock_validate(
+ /*=============*/
+ rw_lock_t* lock)
+ {
++ ulint test;
+ ut_a(lock);
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ ut_a(lock->magic_n == RW_LOCK_MAGIC_N);
++#ifndef HAVE_ATOMIC_BUILTINS
+ ut_a((rw_lock_get_reader_count(lock) == 0)
+ || (rw_lock_get_writer(lock) != RW_LOCK_EX));
+- ut_a((rw_lock_get_writer(lock) == RW_LOCK_EX)
+- || (rw_lock_get_writer(lock) == RW_LOCK_WAIT_EX)
+- || (rw_lock_get_writer(lock) == RW_LOCK_NOT_LOCKED));
+- ut_a((rw_lock_get_waiters(lock) == 0)
+- || (rw_lock_get_waiters(lock) == 1));
++#endif
++ test = rw_lock_get_writer(lock);
++ ut_a((test == RW_LOCK_EX)
++ || (test == RW_LOCK_WAIT_EX)
++ || (test == RW_LOCK_NOT_LOCKED));
++ test = rw_lock_get_s_waiters(lock);
++ ut_a((test == 0)
++ || (test == 1));
++ test = rw_lock_get_x_waiters(lock);
++ ut_a((test == 0)
++ || (test == 1));
++ test = rw_lock_get_wx_waiters(lock);
++ ut_a((test == 0)
++ || (test == 1));
++#ifndef HAVE_ATOMIC_BUILTINS
+ ut_a((lock->writer != RW_LOCK_EX) || (lock->writer_count > 0));
+
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return(TRUE);
+ }
+@@ -237,13 +261,14 @@
+ ut_ad(rw_lock_validate(lock));
+
+ lock_loop:
++ i = 0;
++spin_loop:
+ rw_s_spin_wait_count++;
+
+ /* Spin waiting for the writer field to become free */
+- i = 0;
+
+- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED
+- && i < SYNC_SPIN_ROUNDS) {
++ while (i < SYNC_SPIN_ROUNDS
++ && rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0, srv_spin_wait_delay));
+ }
+@@ -262,15 +287,27 @@
+ lock->cfile_name, (ulong) lock->cline, (ulong) i);
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ /* We try once again to obtain the lock */
+
+ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return; /* Success */
+ } else {
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* as sync0sync.c does */
++ i++;
++
++ if (i < SYNC_SPIN_ROUNDS) {
++ goto spin_loop;
++ }
++#endif
+ /* If we get here, locking did not succeed, we may
+ suspend the thread to wait in the wait array */
+
+@@ -281,9 +318,26 @@
+ file_name, line,
+ &index);
+
+- rw_lock_set_waiters(lock, 1);
++ rw_lock_set_s_waiters(lock, 1);
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* as sync0sync.c does */
++ for (i = 0; i < 4; i++) {
++ if (TRUE == rw_lock_s_lock_low(lock, pass, file_name, line)) {
++ sync_array_free_cell(sync_primary_wait_array, index);
++ return; /* Success */
++ }
++ }
+
++ /* If a wait_ex waiter is stalled, wake it. */
++ if (lock->reader_count == 0
++ && __sync_lock_test_and_set(&lock->wait_ex_waiters, 0)) {
++ os_event_set(lock->wait_ex_event);
++ sync_array_object_signalled(sync_primary_wait_array);
++ }
++#else
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+@@ -318,13 +372,19 @@
+ {
+ ut_ad(rw_lock_is_locked(lock, RW_LOCK_EX));
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ lock->writer_thread = os_thread_get_curr_id();
+
+ lock->pass = 0;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#else
++ __sync_synchronize();
++#endif
+ }
+
+ /**********************************************************************
+@@ -342,6 +402,89 @@
+ const char* file_name,/* in: file name where lock requested */
+ ulint line) /* in: line where requested */
+ {
++#ifdef HAVE_ATOMIC_BUILTINS
++ os_thread_id_t curr_thread = os_thread_get_curr_id();
++retry_writer:
++ /* try to lock writer */
++ if(__sync_lock_test_and_set(&(lock->writer),RW_LOCK_EX)
++ == RW_LOCK_NOT_LOCKED) {
++ /* success */
++ /* obtain RW_LOCK_WAIT_EX right */
++ lock->writer_thread = curr_thread;
++ lock->pass = pass;
++ lock->writer_is_wait_ex = TRUE;
++ /* atomic operation may be safer about memory order. */
++ __sync_synchronize();
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_WAIT_EX,
++ file_name, line);
++#endif
++ }
++
++ if (!os_thread_eq(lock->writer_thread, curr_thread)) {
++ return(RW_LOCK_NOT_LOCKED);
++ }
++
++ switch(rw_lock_get_writer(lock)) {
++ case RW_LOCK_WAIT_EX:
++ /* have right to try x-lock */
++retry_x_lock:
++ /* try x-lock */
++ if(__sync_sub_and_fetch(&(lock->lock_word),
++ RW_LOCK_BIAS) == 0) {
++ /* success */
++ lock->pass = pass;
++ lock->writer_is_wait_ex = FALSE;
++ __sync_fetch_and_add(&(lock->writer_count),1);
++
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_remove_debug_info(lock, pass, RW_LOCK_WAIT_EX);
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX,
++ file_name, line);
++#endif
++
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
++
++ /* Locking succeeded, we may return */
++ return(RW_LOCK_EX);
++ } else if(__sync_fetch_and_add(&(lock->lock_word),
++ RW_LOCK_BIAS) == 0) {
++ /* retry x-lock */
++ goto retry_x_lock;
++ }
++
++ /* There are readers, we have to wait */
++ return(RW_LOCK_WAIT_EX);
++
++ break;
++
++ case RW_LOCK_EX:
++ /* already have x-lock */
++ if ((lock->pass == 0)&&(pass == 0)) {
++ __sync_fetch_and_add(&(lock->writer_count),1);
++
++#ifdef UNIV_SYNC_DEBUG
++ rw_lock_add_debug_info(lock, pass, RW_LOCK_EX, file_name,
++ line);
++#endif
++
++ lock->last_x_file_name = file_name;
++ lock->last_x_line = line;
++
++ /* Locking succeeded, we may return */
++ return(RW_LOCK_EX);
++ }
++
++ return(RW_LOCK_NOT_LOCKED);
++
++ break;
++
++ default: /* RW_LOCK_NOT_LOCKED? maybe impossible */
++ goto retry_writer;
++ }
++#else /* HAVE_ATOMIC_BUILTINS */
++
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(rw_lock_get_mutex(lock)));
+ #endif /* UNIV_SYNC_DEBUG */
+@@ -423,6 +566,7 @@
+ /* Locking succeeded, we may return */
+ return(RW_LOCK_EX);
+ }
++#endif /* HAVE_ATOMIC_BUILTINS */
+
+ /* Locking did not succeed */
+ return(RW_LOCK_NOT_LOCKED);
+@@ -448,19 +592,33 @@
+ ulint line) /* in: line where requested */
+ {
+ ulint index; /* index of the reserved wait cell */
+- ulint state; /* lock state acquired */
++ ulint state = RW_LOCK_NOT_LOCKED; /* lock state acquired */
++#ifdef HAVE_ATOMIC_BUILTINS
++ ulint prev_state = RW_LOCK_NOT_LOCKED;
++#endif
+ ulint i; /* spin round count */
+
+ ut_ad(rw_lock_validate(lock));
+
+ lock_loop:
++ i = 0;
++
++#ifdef HAVE_ATOMIC_BUILTINS
++ prev_state = state;
++#else
+ /* Acquire the mutex protecting the rw-lock fields */
+ mutex_enter_fast(&(lock->mutex));
++#endif
+
+ state = rw_lock_x_lock_low(lock, pass, file_name, line);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (state != prev_state) i=0; /* if progress, reset counter. */
++#else
+ mutex_exit(&(lock->mutex));
++#endif
+
++spin_loop:
+ if (state == RW_LOCK_EX) {
+
+ return; /* Locking succeeded */
+@@ -468,10 +626,9 @@
+ } else if (state == RW_LOCK_NOT_LOCKED) {
+
+ /* Spin waiting for the writer field to become free */
+- i = 0;
+
+- while (rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED
+- && i < SYNC_SPIN_ROUNDS) {
++ while (i < SYNC_SPIN_ROUNDS
++ && lock->lock_word != RW_LOCK_BIAS) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+ srv_spin_wait_delay));
+@@ -485,9 +642,12 @@
+ } else if (state == RW_LOCK_WAIT_EX) {
+
+ /* Spin waiting for the reader count field to become zero */
+- i = 0;
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ while (lock->lock_word != RW_LOCK_BIAS
++#else
+ while (rw_lock_get_reader_count(lock) != 0
++#endif
+ && i < SYNC_SPIN_ROUNDS) {
+ if (srv_spin_wait_delay) {
+ ut_delay(ut_rnd_interval(0,
+@@ -500,7 +660,6 @@
+ os_thread_yield();
+ }
+ } else {
+- i = 0; /* Eliminate a compiler warning */
+ ut_error;
+ }
+
+@@ -516,34 +675,69 @@
+ /* We try once again to obtain the lock. Acquire the mutex protecting
+ the rw-lock fields */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ prev_state = state;
++#else
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ state = rw_lock_x_lock_low(lock, pass, file_name, line);
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (state != prev_state) i=0; /* if progress, reset counter. */
++#endif
++
+ if (state == RW_LOCK_EX) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ return; /* Locking succeeded */
+ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* as sync0sync.c does */
++ i++;
++
++ if (i < SYNC_SPIN_ROUNDS) {
++ goto spin_loop;
++ }
++#endif
++
+ rw_x_system_call_count++;
+
+ sync_array_reserve_cell(sync_primary_wait_array,
+ lock,
+-#ifdef __WIN__
+- /* On windows RW_LOCK_WAIT_EX signifies
+- that this thread should wait on the
+- special wait_ex_event. */
+ (state == RW_LOCK_WAIT_EX)
+ ? RW_LOCK_WAIT_EX :
+-#endif
+ RW_LOCK_EX,
+ file_name, line,
+ &index);
+
+- rw_lock_set_waiters(lock, 1);
++ if (state == RW_LOCK_WAIT_EX) {
++ rw_lock_set_wx_waiters(lock, 1);
++ } else {
++ rw_lock_set_x_waiters(lock, 1);
++ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ /* as sync0sync.c does */
++ for (i = 0; i < 4; i++) {
++ prev_state = state;
++ state = rw_lock_x_lock_low(lock, pass, file_name, line);
++ if (state == RW_LOCK_EX) {
++ sync_array_free_cell(sync_primary_wait_array, index);
++ return; /* Locking succeeded */
++ }
++ if (state != prev_state) {
++ /* retry! */
++ sync_array_free_cell(sync_primary_wait_array, index);
++ goto lock_loop;
++ }
++ }
++#else
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+
+ if (srv_print_latch_waits) {
+ fprintf(stderr,
+@@ -718,7 +912,9 @@
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ info = UT_LIST_GET_FIRST(lock->debug_list);
+
+@@ -728,7 +924,9 @@
+ && (info->pass == 0)
+ && (info->lock_type == lock_type)) {
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+ /* Found! */
+
+ return(TRUE);
+@@ -736,7 +934,9 @@
+
+ info = UT_LIST_GET_NEXT(list, info);
+ }
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+
+ return(FALSE);
+ }
+@@ -758,21 +958,25 @@
+ ut_ad(lock);
+ ut_ad(rw_lock_validate(lock));
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ if (lock_type == RW_LOCK_SHARED) {
+ if (lock->reader_count > 0) {
+ ret = TRUE;
+ }
+ } else if (lock_type == RW_LOCK_EX) {
+- if (lock->writer == RW_LOCK_EX) {
++ if (rw_lock_get_writer(lock) == RW_LOCK_EX) {
+ ret = TRUE;
+ }
+ } else {
+ ut_error;
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+
+ return(ret);
+ }
+@@ -801,16 +1005,26 @@
+
+ count++;
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(&(lock->mutex));
++#endif
+
+ if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+ || (rw_lock_get_reader_count(lock) != 0)
+- || (rw_lock_get_waiters(lock) != 0)) {
++ || (rw_lock_get_s_waiters(lock) != 0)
++ || (rw_lock_get_x_waiters(lock) != 0)
++ || (rw_lock_get_wx_waiters(lock) != 0)) {
+
+ fprintf(stderr, "RW-LOCK: %p ", lock);
+
+- if (rw_lock_get_waiters(lock)) {
+- fputs(" Waiters for the lock exist\n", stderr);
++ if (rw_lock_get_s_waiters(lock)) {
++ fputs(" s_waiters for the lock exist,", stderr);
++ }
++ if (rw_lock_get_x_waiters(lock)) {
++ fputs(" x_waiters for the lock exist\n", stderr);
++ }
++ if (rw_lock_get_wx_waiters(lock)) {
++ fputs(" wait_ex_waiters for the lock exist\n", stderr);
+ } else {
+ putc('\n', stderr);
+ }
+@@ -822,7 +1036,9 @@
+ }
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(&(lock->mutex));
++#endif
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+@@ -847,10 +1063,18 @@
+
+ if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+ || (rw_lock_get_reader_count(lock) != 0)
+- || (rw_lock_get_waiters(lock) != 0)) {
++ || (rw_lock_get_s_waiters(lock) != 0)
++ || (rw_lock_get_x_waiters(lock) != 0)
++ || (rw_lock_get_wx_waiters(lock) != 0)) {
+
+- if (rw_lock_get_waiters(lock)) {
+- fputs(" Waiters for the lock exist\n", stderr);
++ if (rw_lock_get_s_waiters(lock)) {
++ fputs(" s_waiters for the lock exist,", stderr);
++ }
++ if (rw_lock_get_x_waiters(lock)) {
++ fputs(" x_waiters for the lock exist\n", stderr);
++ }
++ if (rw_lock_get_wx_waiters(lock)) {
++ fputs(" wait_ex_waiters for the lock exist\n", stderr);
+ } else {
+ putc('\n', stderr);
+ }
+@@ -909,14 +1133,18 @@
+ lock = UT_LIST_GET_FIRST(rw_lock_list);
+
+ while (lock != NULL) {
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_enter(rw_lock_get_mutex(lock));
++#endif
+
+ if ((rw_lock_get_writer(lock) != RW_LOCK_NOT_LOCKED)
+ || (rw_lock_get_reader_count(lock) != 0)) {
+ count++;
+ }
+
++#ifndef HAVE_ATOMIC_BUILTINS
+ mutex_exit(rw_lock_get_mutex(lock));
++#endif
+ lock = UT_LIST_GET_NEXT(list, lock);
+ }
+
+diff -ruN a/patch_info/innodb_rw_lock.info b/patch_info/innodb_rw_lock.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_rw_lock.info 2009-04-16 16:15:28.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_rw_lock.patch
++Name=Fix of InnoDB rw_locks
++Version=1.0
++Author=Yasufumi Kinoshita
++License=BSD
++Comment=
diff --git a/percona/5.0.91-b22-20100522/innodb_show_bp.patch b/percona/5.0.91-b22-20100522/innodb_show_bp.patch
new file mode 100644
index 0000000..d964785
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_show_bp.patch
@@ -0,0 +1,453 @@
+diff -r fe944d2c6e1f innobase/btr/btr0btr.c
+--- a/innobase/btr/btr0btr.c Mon Nov 10 19:47:27 2008 -0800
++++ b/innobase/btr/btr0btr.c Mon Nov 10 19:48:24 2008 -0800
+@@ -2989,3 +2989,11 @@
+
+ return(TRUE);
+ }
++
++dulint
++btr_page_get_index_id_noninline(
++/*============================*/
++ page_t* page) /* in: index page */
++{
++ return(btr_page_get_index_id(page));
++}
+diff -r fe944d2c6e1f innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c Mon Nov 10 19:47:27 2008 -0800
++++ b/innobase/buf/buf0buf.c Mon Nov 10 19:48:24 2008 -0800
+@@ -2629,3 +2629,13 @@
+ buf_block_print(block);
+ }
+
++buf_block_t*
++buf_pool_get_nth_block_no_inline(
++/*=============================*/
++ /* out: pointer to block */
++ buf_pool_t* buf_pool,/* in: buf_pool */
++ ulint i) /* in: index of the block */
++{
++ /* Plain (non-UNIV_INLINE) entry point; used by sql/ha_innodb.cc. */
++ return(buf_pool_get_nth_block(buf_pool, i));
++}
+diff -r fe944d2c6e1f innobase/include/btr0btr.h
+--- a/innobase/include/btr0btr.h Mon Nov 10 19:47:27 2008 -0800
++++ b/innobase/include/btr0btr.h Mon Nov 10 19:48:24 2008 -0800
+@@ -69,6 +69,12 @@
+ UNIV_INLINE
+ dulint
+ btr_page_get_index_id(
++/*==================*/
++ /* out: index id */
++ page_t* page); /* in: index page */
++
++dulint
++btr_page_get_index_id_noninline(
+ /*==================*/
+ /* out: index id */
+ page_t* page); /* in: index page */
+diff -r fe944d2c6e1f innobase/include/buf0buf.h
+--- a/innobase/include/buf0buf.h Mon Nov 10 19:47:27 2008 -0800
++++ b/innobase/include/buf0buf.h Mon Nov 10 19:48:24 2008 -0800
+@@ -703,6 +703,8 @@
+ buf_get_free_list_len(void);
+ /*=======================*/
+
++void buf_pool_dump(void);
++buf_block_t* buf_pool_get_nth_block_no_inline(buf_pool_t* pool, ulint i);
+
+
+ /* The buffer control block structure */
+diff -r fe944d2c6e1f innobase/include/page0page.h
+--- a/innobase/include/page0page.h Mon Nov 10 19:47:27 2008 -0800
++++ b/innobase/include/page0page.h Mon Nov 10 19:48:24 2008 -0800
+@@ -260,6 +260,12 @@
+ /*============*/
+ /* out: number of user records */
+ page_t* page); /* in: index page */
++
++ulint
++page_get_n_recs_noninline(
++/*============*/
++ /* out: number of user records */
++ page_t* page); /* in: index page */
+ /*******************************************************************
+ Returns the number of records before the given record in chain.
+ The number includes infimum and supremum records. */
+@@ -519,6 +525,12 @@
+ UNIV_INLINE
+ ulint
+ page_get_data_size(
++/*===============*/
++ /* out: data in bytes */
++ page_t* page); /* in: index page */
++
++ulint
++page_get_data_size_noninline(
+ /*===============*/
+ /* out: data in bytes */
+ page_t* page); /* in: index page */
+diff -r fe944d2c6e1f innobase/page/page0page.c
+--- a/innobase/page/page0page.c Mon Nov 10 19:47:27 2008 -0800
++++ b/innobase/page/page0page.c Mon Nov 10 19:48:24 2008 -0800
+@@ -1994,3 +1994,25 @@
+ page_cur_move_to_next(&cur);
+ }
+ }
++
++ulint
++page_get_n_recs_noninline(
++/*======================*/
++ /* out: number of user records */
++ page_t* page) /* in: index page */
++{
++ return(page_get_n_recs(page));
++}
++
++
++ulint
++page_get_data_size_noninline(
++/*=========================*/
++ /* out: data in bytes */
++ page_t* page) /* in: index page */
++{
++ return page_get_data_size(page);
++}
++
++
++
+diff -r fe944d2c6e1f mysql-test/r/information_schema.result
+--- a/mysql-test/r/information_schema.result Mon Nov 10 19:47:27 2008 -0800
++++ b/mysql-test/r/information_schema.result Mon Nov 10 19:48:25 2008 -0800
+@@ -42,6 +42,7 @@
+ COLLATION_CHARACTER_SET_APPLICABILITY
+ COLUMNS
+ COLUMN_PRIVILEGES
++INNODB_BUFFER_POOL_CONTENT
+ INDEX_STATISTICS
+ KEY_COLUMN_USAGE
+ PROCESSLIST
+@@ -741,7 +742,7 @@
+ CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1;
+ CREATE VIEW a2 AS SELECT t_CRASHME FROM a1;
+ count(*)
+-107
++108
+ drop view a2, a1;
+ drop table t_crashme;
+ select table_schema,table_name, column_name from
+@@ -802,6 +803,7 @@
+ TABLE_NAME COLUMN_NAME PRIVILEGES
+ COLUMNS TABLE_NAME select
+ COLUMN_PRIVILEGES TABLE_NAME select
++INNODB_BUFFER_POOL_CONTENT TABLE_NAME select
+ INDEX_STATISTICS TABLE_NAME select
+ KEY_COLUMN_USAGE TABLE_NAME select
+ STATISTICS TABLE_NAME select
+@@ -815,7 +817,7 @@
+ flush privileges;
+ SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA;
+ table_schema count(*)
+-information_schema 22
++information_schema 23
+ mysql 17
+ create table t1 (i int, j int);
+ create trigger trg1 before insert on t1 for each row
+@@ -1206,6 +1208,7 @@
+ COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME
+ COLUMNS TABLE_SCHEMA
+ COLUMN_PRIVILEGES TABLE_SCHEMA
++INNODB_BUFFER_POOL_CONTENT TABLE_SCHEMA
+ INDEX_STATISTICS TABLE_SCHEMA
+ KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
+ PROCESSLIST ID
+@@ -1243,6 +1246,7 @@
+ COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME
+ COLUMNS TABLE_SCHEMA
+ COLUMN_PRIVILEGES TABLE_SCHEMA
++INNODB_BUFFER_POOL_CONTENT TABLE_SCHEMA
+ INDEX_STATISTICS TABLE_SCHEMA
+ KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
+ PROCESSLIST ID
+@@ -1332,6 +1336,7 @@
+ COLUMNS information_schema.COLUMNS 1
+ COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1
+ INDEX_STATISTICS information_schema.INDEX_STATISTICS 1
++INNODB_BUFFER_POOL_CONTENT information_schema.INNODB_BUFFER_POOL_CONTENT 1
+ KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
+ PROCESSLIST information_schema.PROCESSLIST 1
+ PROFILING information_schema.PROFILING 1
+diff -r fe944d2c6e1f mysql-test/r/information_schema_db.result
+--- a/mysql-test/r/information_schema_db.result Mon Nov 10 19:47:27 2008 -0800
++++ b/mysql-test/r/information_schema_db.result Mon Nov 10 19:48:25 2008 -0800
+@@ -11,6 +11,7 @@
+ COLLATION_CHARACTER_SET_APPLICABILITY
+ COLUMNS
+ COLUMN_PRIVILEGES
++INNODB_BUFFER_POOL_CONTENT
+ INDEX_STATISTICS
+ KEY_COLUMN_USAGE
+ PROCESSLIST
+diff -r fe944d2c6e1f mysql-test/r/mysqlshow.result
+--- a/mysql-test/r/mysqlshow.result Mon Nov 10 19:47:27 2008 -0800
++++ b/mysql-test/r/mysqlshow.result Mon Nov 10 19:48:25 2008 -0800
+@@ -85,6 +85,7 @@
+ | COLLATION_CHARACTER_SET_APPLICABILITY |
+ | COLUMNS |
+ | COLUMN_PRIVILEGES |
++| INNODB_BUFFER_POOL_CONTENT |
+ | INDEX_STATISTICS |
+ | KEY_COLUMN_USAGE |
+ | PROCESSLIST |
+@@ -112,6 +113,7 @@
+ | COLLATION_CHARACTER_SET_APPLICABILITY |
+ | COLUMNS |
+ | COLUMN_PRIVILEGES |
++| INNODB_BUFFER_POOL_CONTENT |
+ | INDEX_STATISTICS |
+ | KEY_COLUMN_USAGE |
+ | PROCESSLIST |
+diff -r fe944d2c6e1f patch_info/innodb_show_bp.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/innodb_show_bp.info Mon Nov 10 19:48:25 2008 -0800
+@@ -0,0 +1,6 @@
++File=innodb_show_bp.patch
++Name=show innodb buffer pool content
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
+diff -r fe944d2c6e1f sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Mon Nov 10 19:47:27 2008 -0800
++++ b/sql/ha_innodb.cc Mon Nov 10 19:48:25 2008 -0800
+@@ -128,10 +128,12 @@
+ #include "../innobase/include/lock0lock.h"
+ #include "../innobase/include/dict0crea.h"
+ #include "../innobase/include/btr0cur.h"
++#include "../innobase/include/buf0buf.h"
+ #include "../innobase/include/btr0btr.h"
+ #include "../innobase/include/fsp0fsp.h"
+ #include "../innobase/include/sync0sync.h"
+ #include "../innobase/include/fil0fil.h"
++#include "../innobase/include/page0page.h"
+ #include "../innobase/include/trx0xa.h"
+ }
+
+@@ -6483,6 +6485,116 @@
+ DBUG_RETURN(FALSE);
+ }
+
++/****************************************************************************
++Fills INFORMATION_SCHEMA.INNODB_BUFFER_POOL_CONTENT: stores one row (through
++thd) into tables->table for every buffer pool block whose frame has a
++non-zero FIL page type. Index, schema and table names are filled in only
++for index pages whose index is in the dictionary cache. Returns 0 on
++success and 1 as soon as a row cannot be stored.
++NOTE(review): no buffer pool mutex is taken here -- confirm the unlocked
++scan of the blocks is intended. */
++bool
++innodb_I_S_buffer_pool_content(THD* thd, TABLE_LIST *tables)
++{
++	ulint		size;
++	ulint		i;
++	ulint		page_type_id;
++	dulint		id;
++	buf_frame_t*	frame;
++	dict_index_t*	index;
++	buf_block_t*	block;
++	TABLE*		table;
++	const char*	page_type;
++	const char*	tbl;
++	const char*	sep;
++
++	DBUG_ENTER("innodb_I_S_buffer_pool_content");
++
++	size = buf_pool->curr_size;
++	table = tables->table;
++
++	/* Visit every block position below buf_pool->curr_size. */
++	for (i = 0; i < size; i++) {
++		block = buf_pool_get_nth_block_no_inline(buf_pool, i);
++		frame = block->frame;
++		page_type_id = fil_page_get_type(frame);
++
++		/* Blocks whose frame has page type 0 produce no row. */
++		if (page_type_id == 0) {
++			continue;
++		}
++
++		switch (page_type_id) {
++		case FIL_PAGE_INDEX:
++			page_type = "index";
++			break;
++		case FIL_PAGE_UNDO_LOG:
++			page_type = "undo_log";
++			break;
++		case FIL_PAGE_INODE:
++			page_type = "inode";
++			break;
++		case FIL_PAGE_IBUF_FREE_LIST:
++			page_type = "ibuf_free_list";
++			break;
++		default:
++			/* The numeric type is still reported in field 8. */
++			page_type = "unknown";
++		}
++
++		table->field[0]->store((longlong)i, TRUE);
++		table->field[1]->store((longlong)block->space, TRUE);
++		table->field[2]->store((longlong)block->offset, TRUE);
++		table->field[3]->store((longlong)page_get_n_recs_noninline(frame), TRUE);
++		/* The data size is only computed for index pages. */
++		table->field[4]->store((page_type_id == FIL_PAGE_INDEX)
++			? (longlong)page_get_data_size_noninline(frame)
++			: 0, TRUE);
++		table->field[5]->store((longlong)block->flush_type, TRUE);
++		table->field[6]->store((longlong)block->buf_fix_count, TRUE);
++		table->field[7]->store((longlong)block->LRU_position, TRUE);
++		table->field[8]->store((longlong)page_type_id, TRUE);
++		table->field[9]->store(page_type, strlen(page_type),
++				       system_charset_info);
++
++		/* Names are looked up for index pages only; index stays NULL otherwise. */
++		index = NULL;
++		if (page_type_id == FIL_PAGE_INDEX) {
++			id = btr_page_get_index_id_noninline(frame);
++			index = dict_index_get_if_in_cache(id);
++		}
++
++		if (index) {
++			table->field[10]->store(index->name, strlen(index->name),
++						system_charset_info);
++
++			/* index->table_name is split at its first '/': the
++			part before it goes to field 11, the rest to field 12.
++			Both are stored with explicit lengths straight from
++			the name, so no fixed-size stack copies are needed. */
++			tbl = index->table_name;
++			sep = strchr(tbl, '/');
++			if (sep) {
++				table->field[11]->store(tbl, (uint)(sep - tbl), system_charset_info);
++				tbl = sep + 1;
++			} else {
++				table->field[11]->store(NULL, 0, system_charset_info);
++			}
++			table->field[12]->store(tbl, strlen(tbl), system_charset_info);
++		} else {
++			table->field[10]->store(NULL, 0, system_charset_info);
++			table->field[11]->store(NULL, 0, system_charset_info);
++			table->field[12]->store(NULL, 0, system_charset_info);
++		}
++
++		if (schema_table_store_record(thd, table)) {
++			DBUG_RETURN(1);
++		}
++	}
++
++	DBUG_RETURN(0);
++}
++
+ /****************************************************************************
+ Implements the SHOW MUTEX STATUS command. . */
+
+diff -r fe944d2c6e1f sql/ha_innodb.h
+--- a/sql/ha_innodb.h Mon Nov 10 19:47:27 2008 -0800
++++ b/sql/ha_innodb.h Mon Nov 10 19:48:25 2008 -0800
+@@ -263,6 +263,7 @@
+
+ int innobase_drop_database(char *path);
+ bool innodb_show_status(THD* thd);
++bool innodb_I_S_buffer_pool_content(THD* thd, TABLE_LIST *tables);
+ bool innodb_mutex_show_status(THD* thd);
+ void innodb_export_status(void);
+
+diff -r fe944d2c6e1f sql/sql_parse.cc
+--- a/sql/sql_parse.cc Mon Nov 10 19:47:27 2008 -0800
++++ b/sql/sql_parse.cc Mon Nov 10 19:48:25 2008 -0800
+@@ -2926,6 +2926,7 @@
+ case SCH_COLUMN_PRIVILEGES:
+ case SCH_TABLE_CONSTRAINTS:
+ case SCH_KEY_COLUMN_USAGE:
++ case SCH_INNODB_I_S_BUFFER_POOL_CONTENT:
+ default:
+ break;
+ }
+diff -r fe944d2c6e1f sql/sql_show.cc
+--- a/sql/sql_show.cc Mon Nov 10 19:47:27 2008 -0800
++++ b/sql/sql_show.cc Mon Nov 10 19:48:25 2008 -0800
+@@ -27,6 +27,10 @@
+
+ #ifdef HAVE_BERKELEY_DB
+ #include "ha_berkeley.h" // For berkeley_show_logs
++#endif
++
++#ifdef HAVE_INNOBASE_DB
++#include "ha_innodb.h"
+ #endif
+
+ #ifndef NO_EMBEDDED_ACCESS_CHECKS
+@@ -4042,6 +4046,19 @@
+ DBUG_RETURN(res);
+ }
+
++/* Fill callback of INNODB_BUFFER_POOL_CONTENT; cond is not used. */
++int fill_innodb_bp_content(THD *thd, TABLE_LIST *tables, COND *cond)
++{
++  DBUG_ENTER("fill_innodb_bp_content");
++
++  /* Callers lacking PROCESS_ACL get 0 back and no rows are added. */
++  if (check_global_access(thd, PROCESS_ACL)) {
++    DBUG_RETURN(0);
++  }
++
++  /* Report a row-store failure instead of always returning success. */
++  DBUG_RETURN(innodb_I_S_buffer_pool_content(thd, tables) ? 1 : 0);
++}
+
+ /*
+ Find schema_tables elment by name
+@@ -4951,6 +4962,24 @@
+ };
+
+
++ST_FIELD_INFO innodb_bp_content_fields_info[]= /* column list used by the INNODB_BUFFER_POOL_CONTENT schema table */
++{ /* entry order matches the table->field[0..12] indices written by innodb_I_S_buffer_pool_content() */
++ {"BLOCK_NUM", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Block_num"},
++ {"SPACE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Space"},
++ {"OFFSET", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Offset"},
++ {"RECORDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Records"},
++ {"DATASIZE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Datasize"},
++ {"FLUSH_TYPE", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Flush_type"},
++ {"FIX_COUNT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Fix_count"},
++ {"LRU_POSITION", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "LRU_position"},
++ {"PAGE_TYPE_ID", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Page_type_id"},
++ {"PAGE_TYPE", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Page_type"},
++ {"INDEX_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Index_name"},
++ {"TABLE_SCHEMA", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_schem"}, /* NOTE(review): "Table_schem" looks like a truncated "Table_schema" -- confirm before changing this runtime string */
++ {"TABLE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_name"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} /* entry with a null name; presumably the end-of-list marker -- confirm */
++};
++
+ /*
+ Description of ST_FIELD_INFO in table.h
+ */
+@@ -4969,6 +4998,8 @@
+ get_all_tables, make_columns_old_format, get_schema_column_record, 1, 2, 0},
+ {"COLUMN_PRIVILEGES", column_privileges_fields_info, create_schema_table,
+ fill_schema_column_privileges, 0, 0, -1, -1, 0},
++ {"INNODB_BUFFER_POOL_CONTENT", innodb_bp_content_fields_info, create_schema_table,
++ fill_innodb_bp_content, 0, 0, -1, -1, 0},
+ {"INDEX_STATISTICS", index_stats_fields_info, create_schema_table,
+ fill_schema_index_stats, make_old_format, 0, -1, -1, 0},
+ {"KEY_COLUMN_USAGE", key_column_usage_fields_info, create_schema_table,
+diff -r fe944d2c6e1f sql/table.h
+--- a/sql/table.h Mon Nov 10 19:47:27 2008 -0800
++++ b/sql/table.h Mon Nov 10 19:48:25 2008 -0800
+@@ -375,6 +375,7 @@
+ SCH_COLLATION_CHARACTER_SET_APPLICABILITY,
+ SCH_COLUMNS,
+ SCH_COLUMN_PRIVILEGES,
++ SCH_INNODB_I_S_BUFFER_POOL_CONTENT,
+ SCH_INDEX_STATS,
+ SCH_KEY_COLUMN_USAGE,
+ SCH_OPEN_TABLES,
diff --git a/percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch
new file mode 100644
index 0000000..191193e
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory.patch
@@ -0,0 +1,275 @@
+diff -ruN mysql-5.0.67_highperf/innobase/buf/buf0buf.c mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c
+--- mysql-5.0.67_highperf/innobase/buf/buf0buf.c 2008-11-12 09:25:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c 2008-11-12 09:27:52.000000000 +0900
+@@ -2454,13 +2454,15 @@
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
+ fprintf(file,
+- "Buffer pool size %lu\n"
+- "Free buffers %lu\n"
+- "Database pages %lu\n"
+- "Modified db pages %lu\n"
++ "Buffer pool size %lu\n"
++ "Buffer pool size, bytes %lu\n"
++ "Free buffers %lu\n"
++ "Database pages %lu\n"
++ "Modified db pages %lu\n"
+ "Pending reads %lu\n"
+ "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ (ulong) size,
++ (ulong) size * UNIV_PAGE_SIZE,
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+diff -ruN mysql-5.0.67_highperf/innobase/fil/fil0fil.c mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c
+--- mysql-5.0.67_highperf/innobase/fil/fil0fil.c 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c 2008-11-12 09:27:52.000000000 +0900
+@@ -4472,3 +4472,30 @@
+
+ return(mach_read_from_2(page + FIL_PAGE_TYPE));
+ }
++
++/*************************************************************************
++Returns the combined number of cells of the two fil_system hash tables
++(spaces and name_hash), or 0 when fil_system is NULL. */
++
++ulint
++fil_system_hash_cells(void)
++/*=======================*/
++{
++	if (!fil_system) {
++		return(0);
++	}
++
++	return(fil_system->spaces->n_cells + fil_system->name_hash->n_cells);
++}
++
++ulint
++fil_system_hash_nodes(void)
++/*=======================*/
++{
++	/* space_list length times (fil_space_t + MEM_BLOCK_HEADER_SIZE) */
++	ulint	n_spaces = 0;
++	if (fil_system) {
++		n_spaces = UT_LIST_GET_LEN(fil_system->space_list);
++	}
++	return(n_spaces * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
++}
+diff -ruN mysql-5.0.67_highperf/innobase/include/fil0fil.h mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h
+--- mysql-5.0.67_highperf/innobase/include/fil0fil.h 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h 2008-11-12 09:27:52.000000000 +0900
+@@ -701,6 +701,16 @@
+ written to page, the return value not defined */
+ byte* page); /* in: file page */
+
++/*************************************************************************
++Return size information about the fil_system hash tables. */
++
++ulint
++fil_system_hash_cells(void);
++/*========================*/
++
++ulint
++fil_system_hash_nodes(void);
++/*========================*/
+
+ typedef struct fil_space_struct fil_space_t;
+
+diff -ruN mysql-5.0.67_highperf/innobase/include/thr0loc.h mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h
+--- mysql-5.0.67_highperf/innobase/include/thr0loc.h 2008-11-12 09:24:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h 2008-11-12 09:27:52.000000000 +0900
+@@ -77,6 +77,17 @@
+ /*=============================*/
+ /* out: pointer to the in_ibuf field */
+
++/*************************************************************************
++Return size information about the thread local hash table. */
++
++ulint
++thr_local_hash_cells(void);
++/*=======================*/
++
++ulint
++thr_local_hash_nodes(void);
++/*=======================*/
++
+ #ifndef UNIV_NONINL
+ #include "thr0loc.ic"
+ #endif
+diff -ruN mysql-5.0.67_highperf/innobase/srv/srv0srv.c mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c
+--- mysql-5.0.67_highperf/innobase/srv/srv0srv.c 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c 2008-11-12 09:54:19.000000000 +0900
+@@ -1645,6 +1645,14 @@
+ time_t current_time;
+ ulint n_reserved;
+
++ ulint btr_search_sys_subtotal;
++ ulint lock_sys_subtotal;
++ ulint recv_sys_subtotal;
++ ulint io_counter_subtotal;
++
++ ulint i;
++ trx_t* trx;
++
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+@@ -1747,6 +1755,91 @@
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+
++	/* Calculate reserved memory; a NULL btr_search_sys contributes 0. */
++	btr_search_sys_subtotal = 0;
++	if (btr_search_sys && btr_search_sys->hash_index->heap) {
++		btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap);
++	} else if (btr_search_sys) {
++		for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) {
++			btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]);
++		}
++	}
++
++ lock_sys_subtotal = 0;
++ if (trx_sys) {
++ mutex_enter(&kernel_mutex);
++ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
++ while (trx) {
++ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
++ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
++ }
++ mutex_exit(&kernel_mutex);
++ }
++
++ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
++ ? mem_heap_get_size(recv_sys->heap) : 0);
++
++ io_counter_subtotal = ((buf_pool->io_counter_heap)
++ ? mem_heap_get_size(buf_pool->io_counter_heap) : 0);
++
++ fprintf(file,
++ "Internal hash tables (constant factor + variable factor)\n"
++ " Adaptive hash index %lu \t(%lu + %lu)\n"
++ " Page hash %lu\n"
++ " Dictionary cache %lu \t(%lu + %lu)\n"
++ " File system %lu \t(%lu + %lu)\n"
++ " Lock system %lu \t(%lu + %lu)\n"
++ " Recovery system %lu \t(%lu + %lu)\n"
++ " Threads %lu \t(%lu + %lu)\n"
++ " innodb_io_pattern %lu \t(%lu + %lu)\n",
++
++ (ulong) (btr_search_sys
++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0)
++ + btr_search_sys_subtotal,
++ (ulong) (btr_search_sys
++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) btr_search_sys_subtotal,
++
++ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)),
++
++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)
++ + dict_sys->size) : 0),
++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)) : 0),
++ (ulong) (dict_sys ? (dict_sys->size) : 0),
++
++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
++ + fil_system_hash_nodes()),
++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
++ (ulong) fil_system_hash_nodes(),
++
++ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + lock_sys_subtotal),
++ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) lock_sys_subtotal,
++
++ (ulong) (((recv_sys && recv_sys->addr_hash)
++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + recv_sys_subtotal),
++ (ulong) ((recv_sys && recv_sys->addr_hash)
++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) recv_sys_subtotal,
++
++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)
++ + thr_local_hash_nodes()),
++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)),
++ (ulong) thr_local_hash_nodes(),
++
++ (ulong) (((buf_pool->io_counter_hash) /* needs &(buf_pool->mutex) ? */
++ ? (buf_pool->io_counter_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + io_counter_subtotal),
++ (ulong) ((buf_pool->io_counter_hash) /* needs &(buf_pool->mutex) ? */
++ ? (buf_pool->io_counter_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) io_counter_subtotal);
++
+ if (srv_use_awe) {
+ fprintf(file,
+ "In addition to that %lu MB of AWE memory allocated\n",
+diff -ruN mysql-5.0.67_highperf/innobase/thr/thr0loc.c mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c
+--- mysql-5.0.67_highperf/innobase/thr/thr0loc.c 2008-11-12 09:24:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c 2008-11-12 09:27:52.000000000 +0900
+@@ -32,6 +32,7 @@
+
+ /* The hash table. The module is not yet initialized when it is NULL. */
+ hash_table_t* thr_local_hash = NULL;
++ulint thr_local_hash_n_nodes = 0;
+
+ /* The private data for each thread should be put to
+ the structure below and the accessor functions written
+@@ -223,6 +224,7 @@
+ HASH_INSERT(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(os_thread_get_curr_id()),
+ local);
++ thr_local_hash_n_nodes++;
+
+ mutex_exit(&thr_local_mutex);
+ }
+@@ -251,6 +253,7 @@
+
+ HASH_DELETE(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(id), local);
++ thr_local_hash_n_nodes--;
+
+ mutex_exit(&thr_local_mutex);
+
+@@ -274,3 +277,29 @@
+ mutex_create(&thr_local_mutex);
+ mutex_set_level(&thr_local_mutex, SYNC_THR_LOCAL);
+ }
++
++/*************************************************************************
++Returns the number of cells in thr_local_hash, or 0 when that table
++pointer is NULL. */
++
++ulint
++thr_local_hash_cells(void)
++/*======================*/
++{
++	if (thr_local_hash == NULL) {
++		return(0);
++	}
++	return(thr_local_hash->n_cells);
++}
++
++ulint
++thr_local_hash_nodes(void)
++/*======================*/
++{
++	/* node count times (thr_local_t + MEM_BLOCK_HEADER_SIZE) */
++	ulint	n_nodes = 0;
++	if (thr_local_hash) {
++		n_nodes = thr_local_hash_n_nodes;
++	}
++	return(n_nodes * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE));
++}
+diff -ruN mysql-5.0.67_highperf/patch_info/innodb_show_hashed_memory.info mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info 2008-11-12 09:27:52.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_show_hashed_memory.patch
++Name=Adds additional information of InnoDB internal hash table memories in SHOW INNODB STATUS
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
diff --git a/percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch
new file mode 100644
index 0000000..bf8f6b4
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_show_hashed_memory_standalone.patch
@@ -0,0 +1,264 @@
+diff -ruN mysql-5.0.67_highperf/innobase/buf/buf0buf.c mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c
+--- mysql-5.0.67_highperf/innobase/buf/buf0buf.c 2008-11-12 09:25:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/buf/buf0buf.c 2008-11-12 09:27:52.000000000 +0900
+@@ -2454,13 +2454,15 @@
+ (ulong) UT_LIST_GET_LEN(buf_pool->awe_LRU_free_mapped));
+ }
+ fprintf(file,
+- "Buffer pool size %lu\n"
+- "Free buffers %lu\n"
+- "Database pages %lu\n"
+- "Modified db pages %lu\n"
++ "Buffer pool size %lu\n"
++ "Buffer pool size, bytes %lu\n"
++ "Free buffers %lu\n"
++ "Database pages %lu\n"
++ "Modified db pages %lu\n"
+ "Pending reads %lu\n"
+ "Pending writes: LRU %lu, flush list %lu, single page %lu\n",
+ (ulong) size,
++ (ulong) size * UNIV_PAGE_SIZE,
+ (ulong) UT_LIST_GET_LEN(buf_pool->free),
+ (ulong) UT_LIST_GET_LEN(buf_pool->LRU),
+ (ulong) UT_LIST_GET_LEN(buf_pool->flush_list),
+diff -ruN mysql-5.0.67_highperf/innobase/fil/fil0fil.c mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c
+--- mysql-5.0.67_highperf/innobase/fil/fil0fil.c 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/fil/fil0fil.c 2008-11-12 09:27:52.000000000 +0900
+@@ -4472,3 +4472,30 @@
+
+ return(mach_read_from_2(page + FIL_PAGE_TYPE));
+ }
++
++/*************************************************************************
++Returns the combined number of cells of the two fil_system hash tables
++(spaces and name_hash), or 0 when fil_system is NULL. */
++
++ulint
++fil_system_hash_cells(void)
++/*=======================*/
++{
++	if (!fil_system) {
++		return(0);
++	}
++
++	return(fil_system->spaces->n_cells + fil_system->name_hash->n_cells);
++}
++
++ulint
++fil_system_hash_nodes(void)
++/*=======================*/
++{
++	/* space_list length times (fil_space_t + MEM_BLOCK_HEADER_SIZE) */
++	ulint	n_spaces = 0;
++	if (fil_system) {
++		n_spaces = UT_LIST_GET_LEN(fil_system->space_list);
++	}
++	return(n_spaces * (sizeof(fil_space_t) + MEM_BLOCK_HEADER_SIZE));
++}
+diff -ruN mysql-5.0.67_highperf/innobase/include/fil0fil.h mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h
+--- mysql-5.0.67_highperf/innobase/include/fil0fil.h 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/include/fil0fil.h 2008-11-12 09:27:52.000000000 +0900
+@@ -701,6 +701,16 @@
+ written to page, the return value not defined */
+ byte* page); /* in: file page */
+
++/*************************************************************************
++Return size information about the fil_system hash tables. */
++
++ulint
++fil_system_hash_cells(void);
++/*========================*/
++
++ulint
++fil_system_hash_nodes(void);
++/*========================*/
+
+ typedef struct fil_space_struct fil_space_t;
+
+diff -ruN mysql-5.0.67_highperf/innobase/include/thr0loc.h mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h
+--- mysql-5.0.67_highperf/innobase/include/thr0loc.h 2008-11-12 09:24:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/include/thr0loc.h 2008-11-12 09:27:52.000000000 +0900
+@@ -77,6 +77,17 @@
+ /*=============================*/
+ /* out: pointer to the in_ibuf field */
+
++/*************************************************************************
++Return size information about the thread local hash table. */
++
++ulint
++thr_local_hash_cells(void);
++/*=======================*/
++
++ulint
++thr_local_hash_nodes(void);
++/*=======================*/
++
+ #ifndef UNIV_NONINL
+ #include "thr0loc.ic"
+ #endif
+diff -ruN mysql-5.0.67_highperf/innobase/srv/srv0srv.c mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c
+--- mysql-5.0.67_highperf/innobase/srv/srv0srv.c 2008-11-12 09:26:07.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/srv/srv0srv.c 2008-11-12 09:54:19.000000000 +0900
+@@ -1645,6 +1645,14 @@
+ time_t current_time;
+ ulint n_reserved;
+
++ ulint btr_search_sys_subtotal;
++ ulint lock_sys_subtotal;
++ ulint recv_sys_subtotal;
++ ulint io_counter_subtotal;
++
++ ulint i;
++ trx_t* trx;
++
+ mutex_enter(&srv_innodb_monitor_mutex);
+
+ current_time = time(NULL);
+@@ -1747,6 +1755,80 @@
+ ut_total_allocated_memory,
+ mem_pool_get_reserved(mem_comm_pool));
+
++	/* Calculate reserved memory; a NULL btr_search_sys contributes 0. */
++	btr_search_sys_subtotal = 0;
++	if (btr_search_sys && btr_search_sys->hash_index->heap) {
++		btr_search_sys_subtotal = mem_heap_get_size(btr_search_sys->hash_index->heap);
++	} else if (btr_search_sys) {
++		for (i=0; i < btr_search_sys->hash_index->n_mutexes; i++) {
++			btr_search_sys_subtotal += mem_heap_get_size(btr_search_sys->hash_index->heaps[i]);
++		}
++	}
++
++ lock_sys_subtotal = 0;
++ if (trx_sys) {
++ mutex_enter(&kernel_mutex);
++ trx = UT_LIST_GET_FIRST(trx_sys->mysql_trx_list);
++ while (trx) {
++ lock_sys_subtotal += ((trx->lock_heap) ? mem_heap_get_size(trx->lock_heap) : 0);
++ trx = UT_LIST_GET_NEXT(mysql_trx_list, trx);
++ }
++ mutex_exit(&kernel_mutex);
++ }
++
++ recv_sys_subtotal = ((recv_sys && recv_sys->addr_hash)
++ ? mem_heap_get_size(recv_sys->heap) : 0);
++
++ fprintf(file,
++ "Internal hash tables (constant factor + variable factor)\n"
++ " Adaptive hash index %lu \t(%lu + %lu)\n"
++ " Page hash %lu\n"
++ " Dictionary cache %lu \t(%lu + %lu)\n"
++ " File system %lu \t(%lu + %lu)\n"
++ " Lock system %lu \t(%lu + %lu)\n"
++ " Recovery system %lu \t(%lu + %lu)\n"
++ " Threads %lu \t(%lu + %lu)\n",
++
++ (ulong) (btr_search_sys
++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0)
++ + btr_search_sys_subtotal,
++ (ulong) (btr_search_sys
++ ? (btr_search_sys->hash_index->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) btr_search_sys_subtotal,
++
++ (ulong) (buf_pool->page_hash->n_cells * sizeof(hash_cell_t)),
++
++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)
++ + dict_sys->size) : 0),
++ (ulong) (dict_sys ? ((dict_sys->table_hash->n_cells
++ + dict_sys->table_id_hash->n_cells
++ + dict_sys->col_hash->n_cells) * sizeof(hash_cell_t)) : 0),
++ (ulong) (dict_sys ? (dict_sys->size) : 0),
++
++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)
++ + fil_system_hash_nodes()),
++ (ulong) (fil_system_hash_cells() * sizeof(hash_cell_t)),
++ (ulong) fil_system_hash_nodes(),
++
++ (ulong) ((lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + lock_sys_subtotal),
++ (ulong) (lock_sys ? (lock_sys->rec_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) lock_sys_subtotal,
++
++ (ulong) (((recv_sys && recv_sys->addr_hash)
++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0)
++ + recv_sys_subtotal),
++ (ulong) ((recv_sys && recv_sys->addr_hash)
++ ? (recv_sys->addr_hash->n_cells * sizeof(hash_cell_t)) : 0),
++ (ulong) recv_sys_subtotal,
++
++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)
++ + thr_local_hash_nodes()),
++ (ulong) (thr_local_hash_cells() * sizeof(hash_cell_t)),
++ (ulong) thr_local_hash_nodes());
++
+ if (srv_use_awe) {
+ fprintf(file,
+ "In addition to that %lu MB of AWE memory allocated\n",
+diff -ruN mysql-5.0.67_highperf/innobase/thr/thr0loc.c mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c
+--- mysql-5.0.67_highperf/innobase/thr/thr0loc.c 2008-11-12 09:24:58.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/innobase/thr/thr0loc.c 2008-11-12 09:27:52.000000000 +0900
+@@ -32,6 +32,7 @@
+
+ /* The hash table. The module is not yet initialized when it is NULL. */
+ hash_table_t* thr_local_hash = NULL;
++ulint thr_local_hash_n_nodes = 0;
+
+ /* The private data for each thread should be put to
+ the structure below and the accessor functions written
+@@ -223,6 +224,7 @@
+ HASH_INSERT(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(os_thread_get_curr_id()),
+ local);
++ thr_local_hash_n_nodes++;
+
+ mutex_exit(&thr_local_mutex);
+ }
+@@ -251,6 +253,7 @@
+
+ HASH_DELETE(thr_local_t, hash, thr_local_hash,
+ os_thread_pf(id), local);
++ thr_local_hash_n_nodes--;
+
+ mutex_exit(&thr_local_mutex);
+
+@@ -274,3 +277,29 @@
+ mutex_create(&thr_local_mutex);
+ mutex_set_level(&thr_local_mutex, SYNC_THR_LOCAL);
+ }
++
++/*************************************************************************
++Returns the number of cells in thr_local_hash, or 0 when that table
++pointer is NULL. */
++
++ulint
++thr_local_hash_cells(void)
++/*======================*/
++{
++	if (thr_local_hash == NULL) {
++		return(0);
++	}
++	return(thr_local_hash->n_cells);
++}
++
++ulint
++thr_local_hash_nodes(void)
++/*======================*/
++{
++	/* node count times (thr_local_t + MEM_BLOCK_HEADER_SIZE) */
++	ulint	n_nodes = 0;
++	if (thr_local_hash) {
++		n_nodes = thr_local_hash_n_nodes;
++	}
++	return(n_nodes * (sizeof(thr_local_t) + MEM_BLOCK_HEADER_SIZE));
++}
+diff -ruN mysql-5.0.67_highperf/patch_info/innodb_show_hashed_memory.info mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ mysql-5.0.67_highperf_tmp/patch_info/innodb_show_hashed_memory.info 2008-11-12 09:27:52.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_show_hashed_memory.patch
++Name=Adds additional information of InnoDB internal hash table memories in SHOW INNODB STATUS
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
diff --git a/percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch b/percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch
new file mode 100644
index 0000000..a23c1e9
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_split_buf_pool_mutex.patch
@@ -0,0 +1,1914 @@
+diff -ruN a/innobase/btr/btr0sea.c b/innobase/btr/btr0sea.c
+--- a/innobase/btr/btr0sea.c 2009-08-28 11:08:16.000000000 +0900
++++ b/innobase/btr/btr0sea.c 2009-08-28 11:06:20.000000000 +0900
+@@ -1101,7 +1101,7 @@
+ ulint* offsets;
+
+ rw_lock_x_lock(&btr_search_latch);
+- mutex_enter(&buf_pool->mutex);
++ mutex_enter(&buf_pool->LRU_mutex);
+
+ table = btr_search_sys->hash_index;
+
+@@ -1186,7 +1186,7 @@
+ block = UT_LIST_GET_PREV(LRU, block);
+ }
+
+- mutex_exit(&buf_pool->mutex);
++ mutex_exit(&buf_pool->LRU_mutex);
+ rw_lock_x_unlock(&btr_search_latch);
+
+ if (UNIV_LIKELY_NULL(heap)) {
+diff -ruN a/innobase/buf/buf0buf.c b/innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c 2009-08-28 11:08:16.000000000 +0900
++++ b/innobase/buf/buf0buf.c 2009-08-28 11:06:30.000000000 +0900
+@@ -549,6 +549,17 @@
+ mutex_create(&(buf_pool->mutex));
+ mutex_set_level(&(buf_pool->mutex), SYNC_BUF_POOL);
+
++ mutex_create(&(buf_pool->LRU_mutex));
++ mutex_set_level(&(buf_pool->LRU_mutex), SYNC_BUF_LRU_LIST);
++ rw_lock_create(&(buf_pool->hash_latch));
++ rw_lock_set_level(&(buf_pool->hash_latch), SYNC_BUF_PAGE_HASH);
++ mutex_create(&(buf_pool->free_mutex));
++ mutex_set_level(&(buf_pool->free_mutex), SYNC_BUF_FREE_LIST);
++ mutex_create(&(buf_pool->flush_list_mutex));
++ mutex_set_level(&(buf_pool->flush_list_mutex), SYNC_BUF_FLUSH_LIST);
++
++ mutex_enter(&(buf_pool->LRU_mutex));
++ rw_lock_x_lock(&(buf_pool->hash_latch));
+ mutex_enter(&(buf_pool->mutex));
+
+ if (srv_use_awe) {
+@@ -724,6 +735,8 @@
+ block->in_free_list = TRUE;
+ }
+
++ mutex_exit(&(buf_pool->LRU_mutex));
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+ mutex_exit(&(buf_pool->mutex));
+
+ if (srv_use_adaptive_hash_indexes) {
+@@ -753,6 +766,7 @@
+ {
+ buf_block_t* bck;
+
++ ut_error; /* don't support AWE */
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&(buf_pool->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+@@ -851,7 +865,7 @@
+ buf_block_t* block) /* in: block to make younger */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(!mutex_own(&(buf_pool->mutex)));
++ ut_ad(!mutex_own(&(buf_pool->LRU_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ /* Note that we read freed_page_clock's without holding any mutex:
+@@ -860,12 +874,12 @@
+ if (buf_pool->freed_page_clock >= block->freed_page_clock
+ + 1 + (buf_pool->curr_size / 4)) {
+
+- mutex_enter(&buf_pool->mutex);
++ mutex_enter(&buf_pool->LRU_mutex);
+ /* There has been freeing activity in the LRU list:
+ best to move to the head of the LRU list */
+
+ buf_LRU_make_block_young(block);
+- mutex_exit(&buf_pool->mutex);
++ mutex_exit(&buf_pool->LRU_mutex);
+ }
+ }
+
+@@ -881,7 +895,7 @@
+ {
+ buf_block_t* block;
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
+
+ block = buf_block_align(frame);
+
+@@ -889,7 +903,7 @@
+
+ buf_LRU_make_block_young(block);
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+ }
+
+ /************************************************************************
+@@ -900,7 +914,7 @@
+ /*===========*/
+ buf_block_t* block) /* in, own: block to be freed */
+ {
+- mutex_enter(&(buf_pool->mutex));
++ //mutex_enter(&(buf_pool->mutex));
+
+ mutex_enter(&block->mutex);
+
+@@ -910,7 +924,7 @@
+
+ mutex_exit(&block->mutex);
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
+ }
+
+ /*************************************************************************
+@@ -951,11 +965,11 @@
+ {
+ buf_block_t* block;
+
+- mutex_enter_fast(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ return(block);
+ }
+@@ -972,7 +986,7 @@
+ {
+ buf_block_t* block;
+
+- mutex_enter_fast(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+@@ -980,7 +994,7 @@
+ block->check_index_page_at_flush = FALSE;
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+ }
+
+ /************************************************************************
+@@ -999,7 +1013,7 @@
+ buf_block_t* block;
+ ibool is_hashed;
+
+- mutex_enter_fast(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+@@ -1009,7 +1023,7 @@
+ is_hashed = block->is_hashed;
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ return(is_hashed);
+ }
+@@ -1051,7 +1065,7 @@
+ {
+ buf_block_t* block;
+
+- mutex_enter_fast(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+@@ -1059,7 +1073,7 @@
+ block->file_page_was_freed = TRUE;
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ return(block);
+ }
+@@ -1080,7 +1094,7 @@
+ {
+ buf_block_t* block;
+
+- mutex_enter_fast(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+@@ -1088,7 +1102,7 @@
+ block->file_page_was_freed = FALSE;
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ return(block);
+ }
+@@ -1167,26 +1181,33 @@
+ buf_pool->n_page_gets++;
+ loop:
+ block = NULL;
+- mutex_enter_fast(&(buf_pool->mutex));
++ //mutex_enter_fast(&(buf_pool->mutex));
+
+ if (guess) {
+ block = buf_block_align(guess);
+
++ mutex_enter(&block->mutex);
+ if ((offset != block->offset) || (space != block->space)
+ || (block->state != BUF_BLOCK_FILE_PAGE)) {
+
++ mutex_exit(&block->mutex);
+ block = NULL;
+ }
+ }
+
+ if (block == NULL) {
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+ block = buf_page_hash_get(space, offset);
++ if(block) {
++ mutex_enter(&block->mutex);
++ }
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+ }
+
+ if (block == NULL) {
+ /* Page not in buf_pool: needs to be read from file */
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
+
+ if (mode == BUF_GET_IF_IN_POOL) {
+
+@@ -1205,7 +1226,7 @@
+ goto loop;
+ }
+
+- mutex_enter(&block->mutex);
++ //mutex_enter(&block->mutex);
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
+@@ -1217,7 +1238,7 @@
+
+ if (mode == BUF_GET_IF_IN_POOL) {
+ /* The page is only being read to buffer */
+- mutex_exit(&buf_pool->mutex);
++ //mutex_exit(&buf_pool->mutex);
+ mutex_exit(&block->mutex);
+
+ return(NULL);
+@@ -1242,7 +1263,7 @@
+ #else
+ buf_block_buf_fix_inc(block);
+ #endif
+- mutex_exit(&buf_pool->mutex);
++ //mutex_exit(&buf_pool->mutex);
+
+ /* Check if this is the first access to the page */
+
+@@ -1685,7 +1706,7 @@
+ buf_block_t* block) /* in: block to init */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ ut_ad(mutex_own(&(block->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state != BUF_BLOCK_FILE_PAGE);
+@@ -1792,7 +1813,8 @@
+
+ ut_a(block);
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
++ rw_lock_x_lock(&(buf_pool->hash_latch));
+ mutex_enter(&block->mutex);
+
+ if (fil_tablespace_deleted_or_being_deleted_in_mem(space,
+@@ -1807,7 +1829,8 @@
+ being deleted, or the page is already in buf_pool, return */
+
+ mutex_exit(&block->mutex);
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ buf_block_free(block);
+
+@@ -1822,10 +1845,14 @@
+ ut_ad(block);
+
+ buf_page_init(space, offset, block);
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ /* The block must be put to the LRU list, to the old blocks */
+
+ buf_LRU_add_block(block, TRUE); /* TRUE == to old blocks */
++ mutex_exit(&(buf_pool->LRU_mutex));
++
++ mutex_enter(&(buf_pool->mutex)); /* for consistency about aio */
+
+ block->io_fix = BUF_IO_READ;
+
+@@ -1874,7 +1901,8 @@
+
+ free_block = buf_LRU_get_free_block();
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
++ rw_lock_x_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+@@ -1885,7 +1913,8 @@
+ block->file_page_was_freed = FALSE;
+
+ /* Page can be found in buf_pool */
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ buf_block_free(free_block);
+
+@@ -1908,6 +1937,7 @@
+ mutex_enter(&block->mutex);
+
+ buf_page_init(space, offset, block);
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ /* The block must be put to the LRU list */
+ buf_LRU_add_block(block, FALSE);
+@@ -1919,7 +1949,7 @@
+ #endif
+ buf_pool->n_pages_created++;
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+
+ mtr_memo_push(mtr, block, MTR_MEMO_BUF_FIX);
+
+@@ -1933,7 +1963,7 @@
+ ibuf_merge_or_delete_for_page(NULL, space, offset, TRUE);
+
+ /* Flush pages from the end of the LRU list if necessary */
+- buf_flush_free_margin();
++ buf_flush_free_margin(FALSE);
+
+ frame = block->frame;
+
+@@ -1969,6 +1999,7 @@
+ {
+ ulint io_type;
+ ulint read_page_no;
++ ulint flush_type;
+
+ buf_io_counter_t* io_counter;
+ ulint fold;
+@@ -2051,9 +2082,6 @@
+ }
+ }
+
+- mutex_enter(&(buf_pool->mutex));
+- mutex_enter(&block->mutex);
+-
+ #ifdef UNIV_IBUF_DEBUG
+ ut_a(ibuf_count_get(block->space, block->offset) == 0);
+ #endif
+@@ -2062,9 +2090,12 @@
+ removes the newest lock debug record, without checking the thread
+ id. */
+
+- block->io_fix = 0;
+-
+ if (io_type == BUF_IO_READ) {
++ mutex_enter(&block->mutex);
++ mutex_enter(&(buf_pool->mutex));
++
++ block->io_fix = 0;
++
+ /* NOTE that the call to ibuf may have moved the ownership of
+ the x-latch to this OS thread: do not let this confuse you in
+ debugging! */
+@@ -2095,6 +2126,8 @@
+ }
+ }
+
++ mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&block->mutex);
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fputs("Has read ", stderr);
+@@ -2103,11 +2136,24 @@
+ } else {
+ ut_ad(io_type == BUF_IO_WRITE);
+
++ flush_type = block->flush_type;
++ if (flush_type == BUF_FLUSH_LRU) {
++ mutex_enter(&(buf_pool->LRU_mutex));
++ }
++ mutex_enter(&block->mutex);
++ mutex_enter(&(buf_pool->mutex));
++
++ block->io_fix = 0;
++
+ /* Write means a flush operation: call the completion
+ routine in the flush system */
+
+ buf_flush_write_complete(block);
+
++ if (flush_type == BUF_FLUSH_LRU) {
++ mutex_exit(&(buf_pool->LRU_mutex));
++ }
++
+ rw_lock_s_unlock_gen(&(block->lock), BUF_IO_WRITE);
+ /* io_counter here */
+ if (srv_io_pattern && srv_io_pattern_trace_running) {
+@@ -2132,6 +2178,9 @@
+
+ buf_pool->n_pages_written++;
+
++ mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&block->mutex);
++
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fputs("Has written ", stderr);
+@@ -2139,9 +2188,6 @@
+ #endif /* UNIV_DEBUG */
+ }
+
+- mutex_exit(&block->mutex);
+- mutex_exit(&(buf_pool->mutex));
+-
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+ fprintf(stderr, "page space %lu page no %lu\n",
+@@ -2169,11 +2215,11 @@
+ freed = buf_LRU_search_and_free_block(100);
+ }
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
+
+ ut_ad(UT_LIST_GET_LEN(buf_pool->LRU) == 0);
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+ }
+
+ /*************************************************************************
+@@ -2195,7 +2241,10 @@
+
+ ut_ad(buf_pool);
+
+- mutex_enter(&(buf_pool->mutex));
++ //mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
++ rw_lock_x_lock(&(buf_pool->hash_latch));
++ /* to keep the new latch order, it cannot validate correctly... */
+
+ for (i = 0; i < buf_pool->curr_size; i++) {
+
+@@ -2256,18 +2305,26 @@
+ }
+
+ ut_a(UT_LIST_GET_LEN(buf_pool->LRU) == n_lru);
++ /* because of latching order with block->mutex, we cannot get free_mutex before that */
++/*
+ if (UT_LIST_GET_LEN(buf_pool->free) != n_free) {
+ fprintf(stderr, "Free list len %lu, free blocks %lu\n",
+ (ulong) UT_LIST_GET_LEN(buf_pool->free), (ulong) n_free);
+ ut_error;
+ }
++*/
++ /* because of latching order with block->mutex, we cannot get flush_list_mutex before that */
++/*
+ ut_a(UT_LIST_GET_LEN(buf_pool->flush_list) == n_flush);
+
+ ut_a(buf_pool->n_flush[BUF_FLUSH_SINGLE_PAGE] == n_single_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LIST] == n_list_flush);
+ ut_a(buf_pool->n_flush[BUF_FLUSH_LRU] == n_lru_flush);
++*/
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ ut_a(buf_LRU_validate());
+ ut_a(buf_flush_validate());
+@@ -2299,7 +2356,9 @@
+ index_ids = mem_alloc(sizeof(dulint) * size);
+ counts = mem_alloc(sizeof(ulint) * size);
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
++ mutex_enter(&(buf_pool->free_mutex));
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ fprintf(stderr,
+ "buf_pool size %lu\n"
+@@ -2352,7 +2411,9 @@
+ }
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ mutex_exit(&(buf_pool->free_mutex));
++ mutex_exit(&(buf_pool->flush_list_mutex));
+
+ for (i = 0; i < n_found; i++) {
+ index = dict_index_get_if_in_cache(index_ids[i]);
+@@ -2387,7 +2448,7 @@
+ ulint i;
+ ulint fixed_pages_number = 0;
+
+- mutex_enter(&(buf_pool->mutex));
++ //mutex_enter(&(buf_pool->mutex));
+
+ for (i = 0; i < buf_pool->curr_size; i++) {
+
+@@ -2404,7 +2465,7 @@
+ }
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
+ return fixed_pages_number;
+ }
+ #endif /* UNIV_DEBUG */
+@@ -2432,7 +2493,7 @@
+ {
+ ulint ratio;
+
+- mutex_enter(&(buf_pool->mutex));
++ //mutex_enter(&(buf_pool->mutex)); /* optimistic */
+
+ ratio = (100 * UT_LIST_GET_LEN(buf_pool->flush_list))
+ / (1 + UT_LIST_GET_LEN(buf_pool->LRU)
+@@ -2440,7 +2501,7 @@
+
+ /* 1 + is there to avoid division by zero */
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex)); /* optimistic */
+
+ return(ratio);
+ }
+@@ -2460,7 +2521,10 @@
+ ut_ad(buf_pool);
+ size = buf_pool->curr_size;
+
++ mutex_enter(&(buf_pool->LRU_mutex));
++ mutex_enter(&(buf_pool->free_mutex));
+ mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ if (srv_use_awe) {
+ fprintf(stderr,
+@@ -2533,7 +2597,10 @@
+ buf_pool->n_pages_written_old = buf_pool->n_pages_written;
+ buf_pool->n_pages_awe_remapped_old = buf_pool->n_pages_awe_remapped;
+
++ mutex_exit(&(buf_pool->LRU_mutex));
++ mutex_exit(&(buf_pool->free_mutex));
+ mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->flush_list_mutex));
+ }
+
+ /**************************************************************************
+@@ -2563,7 +2630,7 @@
+
+ ut_ad(buf_pool);
+
+- mutex_enter(&(buf_pool->mutex));
++ //mutex_enter(&(buf_pool->mutex)); /* optimistic */
+
+ for (i = 0; i < buf_pool->curr_size; i++) {
+
+@@ -2586,7 +2653,7 @@
+ mutex_exit(&block->mutex);
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex)); /* optimistic */
+
+ return(TRUE);
+ }
+@@ -2626,11 +2693,11 @@
+ {
+ ulint len;
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->free_mutex));
+
+ len = UT_LIST_GET_LEN(buf_pool->free);
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->free_mutex));
+
+ return(len);
+ }
+diff -ruN a/innobase/buf/buf0flu.c b/innobase/buf/buf0flu.c
+--- a/innobase/buf/buf0flu.c 2009-08-28 11:08:17.000000000 +0900
++++ b/innobase/buf/buf0flu.c 2009-08-28 11:06:30.000000000 +0900
+@@ -49,7 +49,9 @@
+ buf_block_t* block) /* in: block which is modified */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&block->mutex));
++ ut_ad(mutex_own(&(buf_pool->flush_list_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+@@ -79,7 +81,9 @@
+ buf_block_t* b;
+
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&block->mutex));
++ ut_ad(mutex_own(&(buf_pool->flush_list_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ prev_b = NULL;
+@@ -130,16 +134,18 @@
+ BUF_BLOCK_FILE_PAGE and in the LRU list */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
+ #endif /* UNIV_SYNC_DEBUG */
+- if (block->state != BUF_BLOCK_FILE_PAGE) {
++ if (!block->in_LRU_list || block->state != BUF_BLOCK_FILE_PAGE) {
++ /* permitted not to own LRU_mutex.. */
++/*
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+ " InnoDB: Error: buffer block state %lu in the LRU list!\n",
+ (ulong)block->state);
+ ut_print_buf(stderr, (byte*)block, sizeof(buf_block_t));
+-
++*/
+ return(FALSE);
+ }
+
+@@ -165,12 +171,13 @@
+ ulint flush_type)/* in: BUF_FLUSH_LRU or BUF_FLUSH_LIST */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&(block->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+- ut_a(block->state == BUF_BLOCK_FILE_PAGE);
++ //ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
+- if ((ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
++ if (block->state == BUF_BLOCK_FILE_PAGE
++ && (ut_dulint_cmp(block->oldest_modification, ut_dulint_zero) > 0)
+ && (block->io_fix == 0)) {
+ if (flush_type != BUF_FLUSH_LRU) {
+
+@@ -199,15 +206,17 @@
+ {
+ ut_ad(block);
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
++ mutex_enter(&(buf_pool->flush_list_mutex));
+ block->oldest_modification = ut_dulint_zero;
+
+ UT_LIST_REMOVE(flush_list, buf_pool->flush_list, block);
+
+ ut_d(UT_LIST_VALIDATE(flush_list, buf_block_t, buf_pool->flush_list));
++ mutex_exit(&(buf_pool->flush_list_mutex));
+
+ (buf_pool->n_flush[block->flush_type])--;
+
+@@ -553,18 +562,20 @@
+ ut_ad(flush_type == BUF_FLUSH_LRU || flush_type == BUF_FLUSH_LIST
+ || flush_type == BUF_FLUSH_SINGLE_PAGE);
+
+- mutex_enter(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ block = buf_page_hash_get(space, offset);
+
+ ut_a(!block || block->state == BUF_BLOCK_FILE_PAGE);
+
+ if (!block) {
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+ return(0);
+ }
+
+ mutex_enter(&block->mutex);
++ mutex_enter(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ if (flush_type == BUF_FLUSH_LIST
+ && buf_flush_ready_for_flush(block, flush_type)) {
+@@ -761,7 +772,7 @@
+ high = fil_space_get_size(space);
+ }
+
+- mutex_enter(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ for (i = low; i < high; i++) {
+
+@@ -795,7 +806,7 @@
+
+ mutex_exit(&block->mutex);
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ /* Note: as we release the buf_pool mutex
+ above, in buf_flush_try_page we cannot be sure
+@@ -806,14 +817,14 @@
+ count += buf_flush_try_page(space, i,
+ flush_type);
+
+- mutex_enter(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+ } else {
+ mutex_exit(&block->mutex);
+ }
+ }
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ return(count);
+ }
+@@ -848,6 +859,7 @@
+ ulint space;
+ ulint offset;
+ ibool found;
++ ulint remaining = 0;
+
+ ut_ad((flush_type == BUF_FLUSH_LRU)
+ || (flush_type == BUF_FLUSH_LIST));
+@@ -866,6 +878,12 @@
+ }
+
+ (buf_pool->init_flush)[flush_type] = TRUE;
++
++ mutex_exit(&(buf_pool->mutex));
++
++ if (flush_type == BUF_FLUSH_LRU) {
++ mutex_enter(&(buf_pool->LRU_mutex));
++ }
+
+ for (;;) {
+ /* If we have flushed enough, leave the loop */
+@@ -882,7 +900,10 @@
+ } else {
+ ut_ad(flush_type == BUF_FLUSH_LIST);
+
++ mutex_enter(&(buf_pool->flush_list_mutex));
++ remaining = UT_LIST_GET_LEN(buf_pool->flush_list);
+ block = UT_LIST_GET_LAST(buf_pool->flush_list);
++ mutex_exit(&(buf_pool->flush_list_mutex));
+ if (!block
+ || (ut_dulint_cmp(block->oldest_modification,
+ lsn_limit) >= 0)) {
+@@ -912,7 +933,9 @@
+ offset = block->offset;
+
+ mutex_exit(&block->mutex);
+- mutex_exit(&(buf_pool->mutex));
++ if (flush_type == BUF_FLUSH_LRU) {
++ mutex_exit(&(buf_pool->LRU_mutex));
++ }
+
+ old_page_count = page_count;
+
+@@ -932,7 +955,9 @@
+ flush_type, offset,
+ page_count - old_page_count); */
+
+- mutex_enter(&(buf_pool->mutex));
++ if (flush_type == BUF_FLUSH_LRU) {
++ mutex_enter(&(buf_pool->LRU_mutex));
++ }
+
+ } else if (flush_type == BUF_FLUSH_LRU) {
+
+@@ -944,17 +969,26 @@
+
+ mutex_exit(&block->mutex);
+
++ mutex_enter(&(buf_pool->flush_list_mutex));
+ block = UT_LIST_GET_PREV(flush_list, block);
++ mutex_exit(&(buf_pool->flush_list_mutex));
++ remaining--;
+ }
+ }
+
+ /* If we could not find anything to flush, leave the loop */
+
+- if (!found) {
++ if (!found && !remaining) {
+ break;
+ }
+ }
+
++ if (flush_type == BUF_FLUSH_LRU) {
++ mutex_exit(&(buf_pool->LRU_mutex));
++ }
++
++ mutex_enter(&(buf_pool->mutex));
++
+ (buf_pool->init_flush)[flush_type] = FALSE;
+
+ if ((buf_pool->n_flush[flush_type] == 0)
+@@ -1013,11 +1047,15 @@
+ buf_block_t* block;
+ ulint n_replaceable;
+ ulint distance = 0;
++ ibool optimistic = TRUE;
+
+- mutex_enter(&(buf_pool->mutex));
+-
++ //mutex_enter(&(buf_pool->mutex));
++retry:
+ n_replaceable = UT_LIST_GET_LEN(buf_pool->free);
+
++ if (!optimistic)
++ mutex_enter(&(buf_pool->LRU_mutex));
++
+ block = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while ((block != NULL)
+@@ -1025,6 +1063,12 @@
+ + BUF_FLUSH_EXTRA_MARGIN)
+ && (distance < BUF_LRU_FREE_SEARCH_LEN)) {
+
++ if (!block->in_LRU_list) {
++ /* restart. but it is very optimistic */
++ block = UT_LIST_GET_LAST(buf_pool->LRU);
++ continue;
++ }
++
+ mutex_enter(&block->mutex);
+
+ if (buf_flush_ready_for_replace(block)) {
+@@ -1038,11 +1082,17 @@
+ block = UT_LIST_GET_PREV(LRU, block);
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
++ if (!optimistic)
++ mutex_exit(&(buf_pool->LRU_mutex));
+
+ if (n_replaceable >= BUF_FLUSH_FREE_BLOCK_MARGIN) {
+
+ return(0);
++ } else if (optimistic) {
++ /* confirm it again with LRU_mutex for exactness */
++ optimistic = FALSE;
++ goto retry;
+ }
+
+ return(BUF_FLUSH_FREE_BLOCK_MARGIN + BUF_FLUSH_EXTRA_MARGIN
+@@ -1057,8 +1107,9 @@
+ immediately, without waiting. */
+
+ void
+-buf_flush_free_margin(void)
++buf_flush_free_margin(
+ /*=======================*/
++ ibool wait)
+ {
+ ulint n_to_flush;
+ ulint n_flushed;
+@@ -1068,7 +1119,7 @@
+ if (n_to_flush > 0) {
+ n_flushed = buf_flush_batch(BUF_FLUSH_LRU, n_to_flush,
+ ut_dulint_zero);
+- if (n_flushed == ULINT_UNDEFINED) {
++ if (wait && n_flushed == ULINT_UNDEFINED) {
+ /* There was an LRU type flush batch already running;
+ let us wait for it to end */
+
+@@ -1118,11 +1169,11 @@
+ {
+ ibool ret;
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ ret = buf_flush_validate_low();
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->flush_list_mutex));
+
+ return(ret);
+ }
+diff -ruN a/innobase/buf/buf0lru.c b/innobase/buf/buf0lru.c
+--- a/innobase/buf/buf0lru.c 2009-07-07 21:53:57.000000000 +0900
++++ b/innobase/buf/buf0lru.c 2009-08-28 11:06:30.000000000 +0900
+@@ -108,7 +108,7 @@
+
+ page_arr = ut_malloc(sizeof(ulint)
+ * BUF_LRU_DROP_SEARCH_HASH_SIZE);
+- mutex_enter(&buf_pool->mutex);
++ mutex_enter(&buf_pool->LRU_mutex);
+
+ scan_again:
+ num_entries = 0;
+@@ -147,12 +147,12 @@
+ }
+ /* Array full. We release the buf_pool->mutex to
+ obey the latching order. */
+- mutex_exit(&buf_pool->mutex);
++ mutex_exit(&buf_pool->LRU_mutex);
+
+ buf_LRU_drop_page_hash_batch(id, page_arr,
+ num_entries);
+ num_entries = 0;
+- mutex_enter(&buf_pool->mutex);
++ mutex_enter(&buf_pool->LRU_mutex);
+ } else {
+ mutex_exit(&block->mutex);
+ }
+@@ -177,7 +177,7 @@
+ }
+ }
+
+- mutex_exit(&buf_pool->mutex);
++ mutex_exit(&buf_pool->LRU_mutex);
+
+ /* Drop any remaining batch of search hashed pages. */
+ buf_LRU_drop_page_hash_batch(id, page_arr, num_entries);
+@@ -206,7 +206,8 @@
+ buf_LRU_drop_page_hash_for_tablespace(id);
+
+ scan_again:
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
++ rw_lock_x_lock(&(buf_pool->hash_latch));
+
+ all_freed = TRUE;
+
+@@ -244,7 +245,8 @@
+
+ mutex_exit(&block->mutex);
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ /* Note that the following call will acquire
+ an S-latch on the page */
+@@ -274,7 +276,8 @@
+ block = UT_LIST_GET_PREV(LRU, block);
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+ if (!all_freed) {
+ os_thread_sleep(20000);
+@@ -297,14 +300,14 @@
+ ulint len;
+ ulint limit;
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
+
+ len = UT_LIST_GET_LEN(buf_pool->LRU);
+
+ if (len < BUF_LRU_OLD_MIN_LEN) {
+ /* The LRU list is too short to do read-ahead */
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+
+ return(0);
+ }
+@@ -313,7 +316,7 @@
+
+ limit = block->LRU_position - len / BUF_LRU_INITIAL_RATIO;
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+
+ return(limit);
+ }
+@@ -337,13 +340,15 @@
+ ulint distance = 0;
+ ibool freed;
+
+- mutex_enter(&(buf_pool->mutex));
++ /* optimistic search... */
++ //mutex_enter(&(buf_pool->mutex));
+
++retry:
+ freed = FALSE;
+ block = UT_LIST_GET_LAST(buf_pool->LRU);
+
+ while (block != NULL) {
+- ut_a(block->in_LRU_list);
++ //ut_a(block->in_LRU_list); /* optimistic */
+
+ mutex_enter(&block->mutex);
+
+@@ -358,9 +363,17 @@
+ }
+ #endif /* UNIV_DEBUG */
+
++ mutex_exit(&block->mutex);
++
++ mutex_enter(&(buf_pool->LRU_mutex));/* optimistic */
++
++ rw_lock_x_lock(&(buf_pool->hash_latch));
++ mutex_enter(&block->mutex);
++ if(block->in_LRU_list && buf_flush_ready_for_replace(block)) {
+ buf_LRU_block_remove_hashed_page(block);
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+ mutex_exit(&block->mutex);
+
+ /* Remove possible adaptive hash index built on the
+@@ -373,7 +386,6 @@
+
+ ut_a(block->buf_fix_count == 0);
+
+- mutex_enter(&(buf_pool->mutex));
+ mutex_enter(&block->mutex);
+
+ buf_LRU_block_free_hashed_page(block);
+@@ -381,6 +393,16 @@
+ mutex_exit(&block->mutex);
+
+ break;
++ } else { /* someone may interrupt...??? */
++ mutex_exit(&(buf_pool->LRU_mutex));/* optimistic */
++
++ rw_lock_x_unlock(&(buf_pool->hash_latch));
++
++ if (!(block->in_LRU_list)) {
++ mutex_exit(&block->mutex);
++ goto retry;
++ }
++ }
+ }
+
+ mutex_exit(&block->mutex);
+@@ -391,6 +413,7 @@
+ if (!freed && n_iterations <= 10
+ && distance > 100 + (n_iterations * buf_pool->curr_size)
+ / 10) {
++ mutex_enter(&(buf_pool->mutex));
+ buf_pool->LRU_flush_ended = 0;
+
+ mutex_exit(&(buf_pool->mutex));
+@@ -398,6 +421,8 @@
+ return(FALSE);
+ }
+ }
++
++ mutex_enter(&(buf_pool->mutex));
+ if (buf_pool->LRU_flush_ended > 0) {
+ buf_pool->LRU_flush_ended--;
+ }
+@@ -449,7 +474,8 @@
+ {
+ ibool ret = FALSE;
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
++ mutex_enter(&(buf_pool->free_mutex));
+
+ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 4) {
+@@ -457,7 +483,8 @@
+ ret = TRUE;
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
++ mutex_exit(&(buf_pool->free_mutex));
+
+ return(ret);
+ }
+@@ -480,7 +507,7 @@
+ ibool mon_value_was = FALSE;
+ ibool started_monitor = FALSE;
+ loop:
+- mutex_enter(&(buf_pool->mutex));
++ //mutex_enter(&(buf_pool->mutex)); /* optimistic */
+
+ if (!recv_recovery_on && UT_LIST_GET_LEN(buf_pool->free)
+ + UT_LIST_GET_LEN(buf_pool->LRU) < buf_pool->max_size / 20) {
+@@ -536,10 +563,16 @@
+ /* If there is a block in the free list, take it */
+ if (UT_LIST_GET_LEN(buf_pool->free) > 0) {
+
+- block = UT_LIST_GET_FIRST(buf_pool->free);
++ mutex_enter(&(buf_pool->free_mutex));
++ block = UT_LIST_GET_LAST(buf_pool->free);
++ if (!block) {
++ mutex_exit(&(buf_pool->free_mutex));
++ goto no_block;
++ }
+ ut_a(block->in_free_list);
+ UT_LIST_REMOVE(free, buf_pool->free, block);
+ block->in_free_list = FALSE;
++ mutex_exit(&(buf_pool->free_mutex));
+ ut_a(block->state != BUF_BLOCK_FILE_PAGE);
+ ut_a(!block->in_LRU_list);
+
+@@ -564,7 +597,7 @@
+
+ mutex_exit(&block->mutex);
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
+
+ if (started_monitor) {
+ srv_print_innodb_monitor = mon_value_was;
+@@ -572,11 +605,12 @@
+
+ return(block);
+ }
++no_block:
+
+ /* If no block was in the free list, search from the end of the LRU
+ list and try to free a block there */
+
+- mutex_exit(&(buf_pool->mutex));
++ //mutex_exit(&(buf_pool->mutex));
+
+ freed = buf_LRU_search_and_free_block(n_iterations);
+
+@@ -613,7 +647,7 @@
+
+ /* No free block was found: try to flush the LRU list */
+
+- buf_flush_free_margin();
++ buf_flush_free_margin(TRUE);
+ ++srv_buf_pool_wait_free;
+
+ os_aio_simulated_wake_handler_threads();
+@@ -655,7 +689,7 @@
+
+ ut_a(buf_pool->LRU_old);
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+ ut_ad(3 * (BUF_LRU_OLD_MIN_LEN / 8) > BUF_LRU_OLD_TOLERANCE + 5);
+
+@@ -730,7 +764,7 @@
+ ut_ad(buf_pool);
+ ut_ad(block);
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+@@ -796,7 +830,7 @@
+ ut_ad(buf_pool);
+ ut_ad(block);
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+@@ -861,7 +895,7 @@
+ ut_ad(buf_pool);
+ ut_ad(block);
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+@@ -964,7 +998,7 @@
+ buf_block_t* block) /* in: block, must not contain a file page */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
+ #endif /* UNIV_SYNC_DEBUG */
+ ut_ad(block);
+@@ -981,8 +1015,10 @@
+ /* Wipe contents of page to reveal possible stale pointers to it */
+ memset(block->frame, '\0', UNIV_PAGE_SIZE);
+ #endif
++ mutex_enter(&(buf_pool->free_mutex));
+ UT_LIST_ADD_FIRST(free, buf_pool->free, block);
+ block->in_free_list = TRUE;
++ mutex_exit(&(buf_pool->free_mutex));
+
+ if (srv_use_awe && block->frame) {
+ /* Add to the list of mapped pages */
+@@ -1004,7 +1040,7 @@
+ may or may not be a hash index to the page */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ ut_ad(mutex_own(&block->mutex));
+ #endif /* UNIV_SYNC_DEBUG */
+ ut_ad(block);
+@@ -1062,7 +1098,7 @@
+ be in a state where it can be freed */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
+ ut_ad(mutex_own(&block->mutex));
+ #endif /* UNIV_SYNC_DEBUG */
+ ut_a(block->state == BUF_BLOCK_REMOVE_HASH);
+@@ -1085,7 +1121,7 @@
+ ulint LRU_pos;
+
+ ut_ad(buf_pool);
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
+
+ if (UT_LIST_GET_LEN(buf_pool->LRU) >= BUF_LRU_OLD_MIN_LEN) {
+
+@@ -1130,6 +1166,9 @@
+ ut_a(buf_pool->LRU_old_len == old_len);
+ }
+
++ mutex_exit(&(buf_pool->LRU_mutex));
++ mutex_enter(&(buf_pool->free_mutex));
++
+ UT_LIST_VALIDATE(free, buf_block_t, buf_pool->free);
+
+ block = UT_LIST_GET_FIRST(buf_pool->free);
+@@ -1140,7 +1179,7 @@
+ block = UT_LIST_GET_NEXT(free, block);
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->free_mutex));
+ return(TRUE);
+ }
+
+@@ -1156,7 +1195,7 @@
+ ulint len;
+
+ ut_ad(buf_pool);
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&(buf_pool->LRU_mutex));
+
+ fprintf(stderr, "Pool ulint clock %lu\n", (ulong) buf_pool->ulint_clock);
+
+@@ -1200,5 +1239,5 @@
+ }
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->LRU_mutex));
+ }
+diff -ruN a/innobase/buf/buf0rea.c b/innobase/buf/buf0rea.c
+--- a/innobase/buf/buf0rea.c 2009-08-28 11:08:17.000000000 +0900
++++ b/innobase/buf/buf0rea.c 2009-08-28 11:06:30.000000000 +0900
+@@ -277,10 +277,12 @@
+
+ return(0);
+ }
++ mutex_exit(&(buf_pool->mutex));
+
+ /* Count how many blocks in the area have been recently accessed,
+ that is, reside near the start of the LRU list. */
+
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+ for (i = low; i < high; i++) {
+ block = buf_page_hash_get(space, i);
+
+@@ -292,7 +294,7 @@
+ }
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ if (recent_blocks < BUF_READ_AHEAD_RANDOM_THRESHOLD) {
+ /* Do nothing */
+@@ -388,7 +390,7 @@
+ }
+
+ /* Flush pages from the end of the LRU list if necessary */
+- buf_flush_free_margin();
++ buf_flush_free_margin(FALSE);
+
+ return(count + count2);
+ }
+@@ -491,6 +493,7 @@
+
+ return(0);
+ }
++ mutex_exit(&(buf_pool->mutex));
+
+ /* Check that almost all pages in the area have been accessed; if
+ offset == low, the accesses must be in a descending order, otherwise,
+@@ -504,6 +507,7 @@
+
+ fail_count = 0;
+
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+ for (i = low; i < high; i++) {
+ block = buf_page_hash_get(space, i);
+
+@@ -520,23 +524,23 @@
+ pred_block = block;
+ }
+ }
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ if (fail_count > BUF_READ_AHEAD_LINEAR_AREA -
+ BUF_READ_AHEAD_LINEAR_THRESHOLD) {
+ /* Too many failures: return */
+
+- mutex_exit(&(buf_pool->mutex));
+-
+ return(0);
+ }
+
+ /* If we got this far, we know that enough pages in the area have
+ been accessed in the right order: linear read-ahead can be sensible */
+
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+ block = buf_page_hash_get(space, offset);
+
+ if (block == NULL) {
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ return(0);
+ }
+@@ -552,7 +556,7 @@
+ pred_offset = fil_page_get_prev(frame);
+ succ_offset = fil_page_get_next(frame);
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ if ((offset == low) && (succ_offset == offset + 1)) {
+
+@@ -628,7 +632,7 @@
+ os_aio_simulated_wake_handler_threads();
+
+ /* Flush pages from the end of the LRU list if necessary */
+- buf_flush_free_margin();
++ buf_flush_free_margin(FALSE);
+
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints && (count > 0)) {
+@@ -696,7 +700,7 @@
+ os_aio_simulated_wake_handler_threads();
+
+ /* Flush pages from the end of the LRU list if necessary */
+- buf_flush_free_margin();
++ buf_flush_free_margin(FALSE);
+
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+@@ -768,7 +772,7 @@
+ os_aio_simulated_wake_handler_threads();
+
+ /* Flush pages from the end of the LRU list if necessary */
+- buf_flush_free_margin();
++ buf_flush_free_margin(FALSE);
+
+ #ifdef UNIV_DEBUG
+ if (buf_debug_prints) {
+diff -ruN a/innobase/include/buf0buf.h b/innobase/include/buf0buf.h
+--- a/innobase/include/buf0buf.h 2009-08-28 11:08:16.000000000 +0900
++++ b/innobase/include/buf0buf.h 2009-08-28 11:06:30.000000000 +0900
+@@ -946,6 +946,7 @@
+ mem_heap_t* io_counter_heap;
+ ulint io_counters;
+ hash_table_t* page_hash; /* hash table of the file pages */
++ rw_lock_t hash_latch;
+
+ ulint n_pend_reads; /* number of pending read operations */
+
+@@ -978,6 +979,7 @@
+ UT_LIST_BASE_NODE_T(buf_block_t) flush_list;
+ /* base node of the modified block
+ list */
++ mutex_t flush_list_mutex;
+ ibool init_flush[BUF_FLUSH_LIST + 1];
+ /* this is TRUE when a flush of the
+ given type is being initialized */
+@@ -1011,8 +1013,10 @@
+ in the case of AWE, at the start are
+ always free blocks for which the
+ physical memory is mapped to a frame */
++ mutex_t free_mutex;
+ UT_LIST_BASE_NODE_T(buf_block_t) LRU;
+ /* base node of the LRU list */
++ mutex_t LRU_mutex;
+ buf_block_t* LRU_old; /* pointer to the about 3/8 oldest
+ blocks in the LRU list; NULL if LRU
+ length less than BUF_LRU_OLD_MIN_LEN */
+diff -ruN a/innobase/include/buf0buf.ic b/innobase/include/buf0buf.ic
+--- a/innobase/include/buf0buf.ic 2009-07-07 21:54:00.000000000 +0900
++++ b/innobase/include/buf0buf.ic 2009-08-28 11:06:30.000000000 +0900
+@@ -112,7 +112,8 @@
+ buf_block_t* block;
+ dulint lsn;
+
+- mutex_enter(&(buf_pool->mutex));
++try_again:
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ block = UT_LIST_GET_LAST(buf_pool->flush_list);
+
+@@ -120,9 +121,13 @@
+ lsn = ut_dulint_zero;
+ } else {
+ lsn = block->oldest_modification;
++ if (ut_dulint_cmp(lsn, ut_dulint_zero) == 0) {
++ mutex_exit(&(buf_pool->flush_list_mutex));
++ goto try_again;
++ }
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(buf_pool->flush_list_mutex));
+
+ return(lsn);
+ }
+@@ -137,7 +142,7 @@
+ /* out: new clock value */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(mutex_own(&(buf_pool->LRU_mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ buf_pool->ulint_clock++;
+@@ -392,18 +397,18 @@
+ /* out: TRUE if io going on */
+ buf_block_t* block) /* in: buf_pool block, must be bufferfixed */
+ {
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&block->mutex);
+
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+
+ if (block->io_fix != 0) {
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&block->mutex);
+
+ return(TRUE);
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&block->mutex);
+
+ return(FALSE);
+ }
+@@ -425,7 +430,7 @@
+
+ block = buf_block_align(frame);
+
+- mutex_enter(&(buf_pool->mutex));
++ mutex_enter(&block->mutex);
+
+ if (block->state == BUF_BLOCK_FILE_PAGE) {
+ lsn = block->newest_modification;
+@@ -433,7 +438,7 @@
+ lsn = ut_dulint_zero;
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&block->mutex);
+
+ return(lsn);
+ }
+@@ -456,7 +461,7 @@
+ block = buf_block_align(frame);
+
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
++ ut_ad((mutex_own(&(buf_pool->LRU_mutex)) && (block->buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+ #endif /*UNIV_SYNC_DEBUG */
+
+@@ -477,7 +482,7 @@
+ buf_block_t* block) /* in: block */
+ {
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad((mutex_own(&(buf_pool->mutex)) && (block->buf_fix_count == 0))
++ ut_ad((mutex_own(&(buf_pool->LRU_mutex)) && (block->buf_fix_count == 0))
+ || rw_lock_own(&(block->lock), RW_LOCK_EXCLUSIVE));
+ #endif /* UNIV_SYNC_DEBUG */
+
+@@ -555,7 +560,8 @@
+
+ ut_ad(buf_pool);
+ #ifdef UNIV_SYNC_DEBUG
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ ut_ad(rw_lock_own(&(buf_pool->hash_latch), RW_LOCK_EX)
++ || rw_lock_own(&(buf_pool->hash_latch), RW_LOCK_SHARED));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ /* Look for the page in the hash table */
+@@ -631,11 +637,14 @@
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
++ /* buf_flush_note_modification() should be called before this function. */
++/*
+ if (rw_latch == RW_X_LATCH && mtr->modifications) {
+ mutex_enter(&buf_pool->mutex);
+ buf_flush_note_modification(block, mtr);
+ mutex_exit(&buf_pool->mutex);
+ }
++*/
+
+ mutex_enter(&block->mutex);
+
+diff -ruN a/innobase/include/buf0flu.h b/innobase/include/buf0flu.h
+--- a/innobase/include/buf0flu.h 2009-07-07 21:54:00.000000000 +0900
++++ b/innobase/include/buf0flu.h 2009-08-28 11:06:30.000000000 +0900
+@@ -26,8 +26,9 @@
+ a margin of replaceable pages there. */
+
+ void
+-buf_flush_free_margin(void);
++buf_flush_free_margin(
+ /*=======================*/
++ ibool wait);
+ /************************************************************************
+ Initializes a page for writing to the tablespace. */
+
+diff -ruN a/innobase/include/buf0flu.ic b/innobase/include/buf0flu.ic
+--- a/innobase/include/buf0flu.ic 2009-07-07 21:54:00.000000000 +0900
++++ b/innobase/include/buf0flu.ic 2009-08-28 11:06:30.000000000 +0900
+@@ -38,11 +38,14 @@
+ mtr_t* mtr) /* in: mtr */
+ {
+ ut_ad(block);
++
++ mutex_enter(&block->mutex);
++
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+- ut_ad(mutex_own(&(buf_pool->mutex)));
++ //ut_ad(mutex_own(&(buf_pool->mutex)));
+ #endif /* UNIV_SYNC_DEBUG */
+
+ ut_ad(ut_dulint_cmp(mtr->start_lsn, ut_dulint_zero) != 0);
+@@ -52,16 +55,20 @@
+ block->newest_modification = mtr->end_lsn;
+
+ if (ut_dulint_is_zero(block->oldest_modification)) {
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ block->oldest_modification = mtr->start_lsn;
+ ut_ad(!ut_dulint_is_zero(block->oldest_modification));
+
+ buf_flush_insert_into_flush_list(block);
++ mutex_exit(&(buf_pool->flush_list_mutex));
+ } else {
+ ut_ad(ut_dulint_cmp(block->oldest_modification,
+ mtr->start_lsn) <= 0);
+ }
+
++ mutex_exit(&block->mutex);
++
+ ++srv_buf_pool_write_requests;
+ }
+
+@@ -78,29 +85,32 @@
+ set of mtr's */
+ {
+ ut_ad(block);
++
++ mutex_enter(&(block->mutex));
++
+ ut_ad(block->state == BUF_BLOCK_FILE_PAGE);
+ ut_ad(block->buf_fix_count > 0);
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(rw_lock_own(&(block->lock), RW_LOCK_EX));
+ #endif /* UNIV_SYNC_DEBUG */
+
+- mutex_enter(&(buf_pool->mutex));
+-
+ ut_ad(ut_dulint_cmp(block->newest_modification, end_lsn) <= 0);
+
+ block->newest_modification = end_lsn;
+
+ if (ut_dulint_is_zero(block->oldest_modification)) {
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ block->oldest_modification = start_lsn;
+
+ ut_ad(!ut_dulint_is_zero(block->oldest_modification));
+
+ buf_flush_insert_sorted_into_flush_list(block);
++ mutex_exit(&(buf_pool->flush_list_mutex));
+ } else {
+ ut_ad(ut_dulint_cmp(block->oldest_modification,
+ start_lsn) <= 0);
+ }
+
+- mutex_exit(&(buf_pool->mutex));
++ mutex_exit(&(block->mutex));
+ }
+diff -ruN a/innobase/include/sync0sync.h b/innobase/include/sync0sync.h
+--- a/innobase/include/sync0sync.h 2009-07-07 21:54:06.000000000 +0900
++++ b/innobase/include/sync0sync.h 2009-08-28 11:06:30.000000000 +0900
+@@ -438,8 +438,12 @@
+ SYNC_SEARCH_SYS, as memory allocation
+ can call routines there! Otherwise
+ the level is SYNC_MEM_HASH. */
++#define SYNC_BUF_LRU_LIST 157
++#define SYNC_BUF_PAGE_HASH 156
++#define SYNC_BUF_BLOCK 155
++#define SYNC_BUF_FREE_LIST 153
+ #define SYNC_BUF_POOL 150
+-#define SYNC_BUF_BLOCK 149
++#define SYNC_BUF_FLUSH_LIST 149
+ #define SYNC_DOUBLEWRITE 140
+ #define SYNC_ANY_LATCH 135
+ #define SYNC_THR_LOCAL 133
+diff -ruN a/innobase/log/log0recv.c b/innobase/log/log0recv.c
+--- a/innobase/log/log0recv.c 2009-08-28 11:08:17.000000000 +0900
++++ b/innobase/log/log0recv.c 2009-08-28 11:06:30.000000000 +0900
+@@ -1695,11 +1695,11 @@
+
+ mtr_start(&mtr);
+
+- mutex_enter(&(buf_pool->mutex));
++ rw_lock_s_lock(&(buf_pool->hash_latch));
+
+ page = buf_page_hash_get(space, page_no)->frame;
+
+- mutex_exit(&(buf_pool->mutex));
++ rw_lock_s_unlock(&(buf_pool->hash_latch));
+
+ replica = buf_page_get(space + RECV_REPLICA_SPACE_ADD, page_no,
+ RW_X_LATCH, &mtr);
+diff -ruN a/innobase/mtr/mtr0mtr.c b/innobase/mtr/mtr0mtr.c
+--- a/innobase/mtr/mtr0mtr.c 2009-07-07 21:54:08.000000000 +0900
++++ b/innobase/mtr/mtr0mtr.c 2009-08-28 11:06:30.000000000 +0900
+@@ -103,6 +103,38 @@
+ }
+ }
+
++UNIV_INLINE
++void
++mtr_memo_note_modification_all(
++/*===========================*/
++ mtr_t* mtr) /* in: mtr */
++{
++ /* Calls buf_flush_note_modification() for every non-NULL memo slot of
++ type MTR_MEMO_PAGE_X_FIX, walking the memo from its last slot to its first. */
++ dyn_array_t* memo;
++ ulint pos;
++
++ ut_ad(mtr);
++ ut_ad(mtr->magic_n == MTR_MAGIC_N);
++ ut_ad(mtr->state == MTR_COMMITTING); /* Currently only used in
++ commit */
++ ut_ad(mtr->modifications);
++
++ memo = &(mtr->memo);
++
++ for (pos = dyn_array_get_data_size(memo); pos > 0; ) {
++ mtr_memo_slot_t* slot;
++ /* pos is an offset into the memo data; step back one slot. */
++ pos -= sizeof(mtr_memo_slot_t);
++ slot = dyn_array_get_element(memo, pos);
++
++ if (slot->type == MTR_MEMO_PAGE_X_FIX
++ && UNIV_LIKELY(slot->object != NULL)) {
++ buf_flush_note_modification((buf_block_t*)slot->object, mtr);
++ }
++ }
++}
++
+ /****************************************************************
+ Writes the contents of a mini-transaction log, if any, to the database log. */
+ static
+@@ -177,6 +209,8 @@
+ #endif
+ if (mtr->modifications) {
+ mtr_log_reserve_and_write(mtr);
++
++ mtr_memo_note_modification_all(mtr);
+ }
+
+ /* We first update the modification info to buffer pages, and only
+@@ -187,12 +221,13 @@
+ required when we insert modified buffer pages in to the flush list
+ which must be sorted on oldest_modification. */
+
+- mtr_memo_pop_all(mtr);
+-
+ if (mtr->modifications) {
+ log_release();
+ }
+
++ /* All unlocking has been moved here, after log_sys mutex release. */
++ mtr_memo_pop_all(mtr);
++
+ #ifdef UNIV_DEBUG
+ mtr->state = MTR_COMMITTED;
+ #endif
+@@ -262,6 +297,12 @@
+ slot = dyn_array_get_element(memo, offset);
+
+ if ((object == slot->object) && (type == slot->type)) {
++ if (mtr->modifications &&
++ UNIV_LIKELY(slot->object != NULL) &&
++ slot->type == MTR_MEMO_PAGE_X_FIX) {
++ buf_flush_note_modification(
++ (buf_block_t*)slot->object, mtr);
++ }
+
+ mtr_memo_slot_release(mtr, slot);
+
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-08-28 11:08:17.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-08-28 11:06:30.000000000 +0900
+@@ -370,6 +370,7 @@
+ ulong srv_n_free_tickets_to_enter = 500;
+ ulong srv_thread_sleep_delay = 10000;
+ ulint srv_spin_wait_delay = 5;
++ulint srv_spins_microsec = 50;
+ ibool srv_priority_boost = TRUE;
+
+ ibool srv_print_thread_releases = FALSE;
+@@ -676,6 +677,47 @@
+ ulint srv_n_threads_active[SRV_MASTER + 1];
+ ulint srv_n_threads[SRV_MASTER + 1];
+
++static
++void
++srv_align_spins_microsec(void)
++{
++ /* Calibrates srv_spins_microsec, the per-microsecond loop multiplier of ut_delay(), by timing
++ ut_delay(100000) with the multiplier forced to 1. Result is 1..50; 50 if timing is unusable. */
++ ulint start_sec, end_sec, start_usec, end_usec;
++ ib_longlong usecs;
++
++ srv_spins_microsec = 1; /* temporary value used while measuring */
++ if (ut_usectime(&start_sec, &start_usec)) {
++ srv_spins_microsec = 50;
++ goto end;
++ }
++
++ ut_delay(100000);
++
++ if (ut_usectime(&end_sec, &end_usec)) {
++ srv_spins_microsec = 50;
++ goto end;
++ }
++
++ /* Subtract as signed 64-bit values: with a 32-bit ulint the unsigned
++ end_usec - start_usec wraps whenever the second rolled over in between. */
++ usecs = ((ib_longlong) end_sec - (ib_longlong) start_sec) * 1000000LL
++ + ((ib_longlong) end_usec - (ib_longlong) start_usec);
++
++ if (usecs <= 0 || 100000 / usecs > 50) {
++ srv_spins_microsec = 50;
++ } else if (usecs > 100000) {
++ srv_spins_microsec = 1;
++ } else {
++ srv_spins_microsec = (ulint) (100000 / usecs);
++ }
++end:
++ if (srv_spins_microsec != 50)
++ fprintf(stderr,
++ "InnoDB: unit of spin count at ut_delay() is aligned to %lu\n",
++ (ulong) srv_spins_microsec);
++}
++
+ /*************************************************************************
+ Sets the info describing an i/o thread current state. */
+
+@@ -909,6 +951,8 @@
+ dict_table_t* table;
+ ulint i;
+
++ srv_align_spins_microsec();
++
+ srv_sys = mem_alloc(sizeof(srv_sys_t));
+
+ kernel_mutex_temp = mem_alloc(sizeof(mutex_t));
+@@ -2665,7 +2709,7 @@
+ ib_longlong level, bpl;
+ buf_block_t* bpage;
+
+- mutex_enter(&buf_pool->mutex);
++ mutex_enter(&(buf_pool->flush_list_mutex));
+
+ level = 0;
+ bpage = UT_LIST_GET_FIRST(buf_pool->flush_list);
+@@ -2687,7 +2731,7 @@
+ bpl = 0;
+ }
+
+- mutex_exit(&buf_pool->mutex);
++ mutex_exit(&(buf_pool->flush_list_mutex));
+
+ if (!srv_use_doublewrite_buf) {
+ /* flush is faster than when doublewrite */
+diff -ruN a/innobase/sync/sync0sync.c b/innobase/sync/sync0sync.c
+--- a/innobase/sync/sync0sync.c 2009-07-07 21:54:10.000000000 +0900
++++ b/innobase/sync/sync0sync.c 2009-08-28 11:06:30.000000000 +0900
+@@ -1105,11 +1105,19 @@
+ } else if (level == SYNC_DOUBLEWRITE) {
+ ut_a(sync_thread_levels_g(array, SYNC_DOUBLEWRITE));
+ } else if (level == SYNC_BUF_BLOCK) {
+- ut_a((sync_thread_levels_contain(array, SYNC_BUF_POOL)
++ ut_a((sync_thread_levels_contain(array, SYNC_BUF_LRU_LIST)
+ && sync_thread_levels_g(array, SYNC_BUF_BLOCK - 1))
+ || sync_thread_levels_g(array, SYNC_BUF_BLOCK));
+ } else if (level == SYNC_BUF_POOL) {
+ ut_a(sync_thread_levels_g(array, SYNC_BUF_POOL));
++ } else if (level == SYNC_BUF_FLUSH_LIST) {
++ ut_a(sync_thread_levels_g(array, SYNC_BUF_FLUSH_LIST));
++ } else if (level == SYNC_BUF_FREE_LIST) {
++ ut_a(sync_thread_levels_g(array, SYNC_BUF_FREE_LIST));
++ } else if (level == SYNC_BUF_PAGE_HASH) {
++ ut_a(sync_thread_levels_g(array, SYNC_BUF_PAGE_HASH));
++ } else if (level == SYNC_BUF_LRU_LIST) {
++ ut_a(sync_thread_levels_g(array, SYNC_BUF_LRU_LIST));
+ } else if (level == SYNC_SEARCH_SYS) {
+ ut_a(sync_thread_levels_g(array, SYNC_SEARCH_SYS));
+ } else if (level == SYNC_TRX_LOCK_HEAP) {
+diff -ruN a/innobase/ut/ut0ut.c b/innobase/ut/ut0ut.c
+--- a/innobase/ut/ut0ut.c 2009-07-07 21:54:12.000000000 +0900
++++ b/innobase/ut/ut0ut.c 2009-08-28 11:06:30.000000000 +0900
+@@ -347,6 +347,7 @@
+ /*****************************************************************
+ Runs an idle loop on CPU. The argument gives the desired delay
+ in microseconds on 100 MHz Pentium + Visual C++. */
++extern ulint srv_spins_microsec;
+
+ ulint
+ ut_delay(
+@@ -358,7 +359,11 @@
+
+ j = 0;
+
+- for (i = 0; i < delay * 50; i++) {
++ for (i = 0; i < delay * srv_spins_microsec; i++) {
++#if (defined (__i386__) || defined (__x86_64__)) && defined (__GNUC__)
++ /* it is equal to the instruction 'pause' */
++ __asm__ __volatile__ ("rep; nop");
++#endif
+ j += i;
+ }
+
+diff -ruN a/patch_info/innodb_split_buf_pool_mutex.info b/patch_info/innodb_split_buf_pool_mutex.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_split_buf_pool_mutex.info 2009-08-28 11:06:30.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_split_buf_pool_mutex.patch
++Name=InnoDB patch to fix buffer pool scalability
++Version=1.0
++Author=Yasufumi Kinoshita
++License=BSD
++Comment=Backport from XtraDB
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-08-28 11:08:17.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-08-28 11:06:30.000000000 +0900
+@@ -1507,6 +1507,13 @@
+ /* We set srv_pool_size here in units of 1 kB. InnoDB internally
+ changes the value so that it becomes the number of database pages. */
+
++ if (innobase_buffer_pool_awe_mem_mb) {
++ /* split_buf_pool_mutex.patch don't support AWE */
++ fputs("InnoDB: Warning: split_buf_pool_mutex.patch don't support AWE. Disabled.\n",
++ stderr);
++ innobase_buffer_pool_awe_mem_mb = 0;
++ }
++
+ if (innobase_buffer_pool_awe_mem_mb == 0) {
+ /* Careful here: we first convert the signed long int to ulint
+ and only after that divide */
diff --git a/percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch b/percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch
new file mode 100644
index 0000000..3b8f659
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_thread_concurrency_timer_based.patch
@@ -0,0 +1,389 @@
+diff -ruN a/innobase/configure b/innobase/configure
+--- a/innobase/configure 2009-01-30 06:56:31.000000000 +0900
++++ b/innobase/configure 2009-05-06 15:40:47.000000000 +0900
+@@ -21306,6 +21306,88 @@
+ fi
+ done
+
++
++# as http://lists.mysql.com/commits/40686 does
++{ echo "$as_me:$LINENO: checking whether the compiler provides atomic builtins" >&5
++echo $ECHO_N "checking whether the compiler provides atomic builtins... $ECHO_C" >&6; }
++if test "${mysql_cv_atomic_builtins+set}" = set; then
++ echo $ECHO_N "(cached) $ECHO_C" >&6
++else
++ if test "$cross_compiling" = yes; then
++ { { echo "$as_me:$LINENO: error: cannot run test program while cross compiling
++See \`config.log' for more details." >&5
++echo "$as_me: error: cannot run test program while cross compiling
++See \`config.log' for more details." >&2;}
++ { (exit 1); exit 1; }; }
++else
++ cat >conftest.$ac_ext <<_ACEOF
++/* confdefs.h. */
++_ACEOF
++cat confdefs.h >>conftest.$ac_ext
++cat >>conftest.$ac_ext <<_ACEOF
++/* end confdefs.h. */
++
++ int main()
++ {
++ int foo= -10; int bar= 10;
++ __sync_fetch_and_add(&foo, bar);
++ if (foo)
++ return -1;
++ bar= __sync_lock_test_and_set(&foo, bar);
++ if (bar || foo != 10)
++ return -1;
++ bar= __sync_val_compare_and_swap(&bar, foo, 15);
++ if (bar)
++ return -1;
++ return 0;
++ }
++
++_ACEOF
++rm -f conftest$ac_exeext
++if { (ac_try="$ac_link"
++case "(($ac_try" in
++ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
++ *) ac_try_echo=$ac_try;;
++esac
++eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
++ (eval "$ac_link") 2>&5
++ ac_status=$?
++ echo "$as_me:$LINENO: \$? = $ac_status" >&5
++ (exit $ac_status); } && { ac_try='./conftest$ac_exeext'
++ { (case "(($ac_try" in
++ *\"* | *\`* | *\\*) ac_try_echo=\$ac_try;;
++ *) ac_try_echo=$ac_try;;
++esac
++eval "echo \"\$as_me:$LINENO: $ac_try_echo\"") >&5
++ (eval "$ac_try") 2>&5
++ ac_status=$?
++ echo "$as_me:$LINENO: \$? = $ac_status" >&5
++ (exit $ac_status); }; }; then
++ mysql_cv_atomic_builtins=yes
++else
++ echo "$as_me: program exited with status $ac_status" >&5
++echo "$as_me: failed program was:" >&5
++sed 's/^/| /' conftest.$ac_ext >&5
++
++( exit $ac_status )
++mysql_cv_atomic_builtins=no
++fi
++rm -f core *.core core.conftest.* gmon.out bb.out conftest$ac_exeext conftest.$ac_objext conftest.$ac_ext
++fi
++
++
++fi
++{ echo "$as_me:$LINENO: result: $mysql_cv_atomic_builtins" >&5
++echo "${ECHO_T}$mysql_cv_atomic_builtins" >&6; }
++
++if test "x$mysql_cv_atomic_builtins" = xyes; then
++
++cat >>confdefs.h <<\_ACEOF
++#define HAVE_ATOMIC_BUILTINS 1
++_ACEOF
++
++fi
++
+ #AC_CHECK_FUNCS(readdir_r) MySQL checks that it has also the right args.
+ # Some versions of Unix only take 2 arguments.
+ #AC_C_INLINE Already checked in MySQL
+diff -ruN a/innobase/configure.in b/innobase/configure.in
+--- a/innobase/configure.in 2009-01-30 06:42:15.000000000 +0900
++++ b/innobase/configure.in 2009-05-06 15:40:47.000000000 +0900
+@@ -42,6 +42,31 @@
+ AC_CHECK_FUNCS(sched_yield)
+ AC_CHECK_FUNCS(fdatasync)
+ AC_CHECK_FUNCS(localtime_r)
++
++# as http://lists.mysql.com/commits/40686 does
++AC_CACHE_CHECK([whether the compiler provides atomic builtins],
++ [mysql_cv_atomic_builtins], [AC_TRY_RUN([
++ int main()
++ {
++ int foo= -10; int bar= 10;
++ __sync_fetch_and_add(&foo, bar);
++ if (foo)
++ return -1;
++ bar= __sync_lock_test_and_set(&foo, bar);
++ if (bar || foo != 10)
++ return -1;
++ bar= __sync_val_compare_and_swap(&bar, foo, 15);
++ if (bar)
++ return -1;
++ return 0;
++ }
++], [mysql_cv_atomic_builtins=yes], [mysql_cv_atomic_builtins=no])])
++
++if test "x$mysql_cv_atomic_builtins" = xyes; then
++ AC_DEFINE(HAVE_ATOMIC_BUILTINS, 1,
++ [Define to 1 if compiler provides atomic builtins.])
++fi
++
+ #AC_CHECK_FUNCS(readdir_r) MySQL checks that it has also the right args.
+ # Some versions of Unix only take 2 arguments.
+ #AC_C_INLINE Already checked in MySQL
+diff -ruN a/innobase/ib_config.h b/innobase/ib_config.h
+--- a/innobase/ib_config.h 2009-01-30 07:05:03.000000000 +0900
++++ b/innobase/ib_config.h 2009-05-06 15:40:47.000000000 +0900
+@@ -7,6 +7,9 @@
+ /* Define to 1 if you have the <aio.h> header file. */
+ #define HAVE_AIO_H 1
+
++/* Define to 1 if compiler provides atomic builtins. */
++#define HAVE_ATOMIC_BUILTINS 1
++
+ /* Define to 1 if you have the <dlfcn.h> header file. */
+ #define HAVE_DLFCN_H 1
+
+diff -ruN a/innobase/ib_config.h.in b/innobase/ib_config.h.in
+--- a/innobase/ib_config.h.in 2009-01-30 06:56:11.000000000 +0900
++++ b/innobase/ib_config.h.in 2009-05-06 15:40:47.000000000 +0900
+@@ -6,6 +6,9 @@
+ /* Define to 1 if you have the <aio.h> header file. */
+ #undef HAVE_AIO_H
+
++/* Define to 1 if compiler provides atomic builtins. */
++#undef HAVE_ATOMIC_BUILTINS
++
+ /* Define to 1 if you have the <dlfcn.h> header file. */
+ #undef HAVE_DLFCN_H
+
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h 2009-05-06 15:38:01.000000000 +0900
++++ b/innobase/include/srv0srv.h 2009-05-06 16:04:36.000000000 +0900
+@@ -90,6 +90,8 @@
+ extern ulint srv_mem_pool_size;
+ extern ulint srv_lock_table_size;
+
++extern ibool srv_thread_concurrency_timer_based;
++
+ extern ulint srv_n_file_io_threads;
+ extern ulint srv_n_read_io_threads;
+ extern ulint srv_n_write_io_threads;
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-05-06 15:38:01.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-05-06 17:12:54.000000000 +0900
+@@ -267,6 +267,7 @@
+ computer. Bigger computers need bigger values. Value 0 will disable the
+ concurrency check. */
+
++ibool srv_thread_concurrency_timer_based = TRUE;
+ ulong srv_thread_concurrency = 0;
+ ulong srv_commit_concurrency = 0;
+
+@@ -1020,6 +1021,74 @@
+ Puts an OS thread to wait if there are too many concurrent threads
+ (>= srv_thread_concurrency) inside InnoDB. The threads wait in a FIFO queue. */
+
++#ifdef HAVE_ATOMIC_BUILTINS
++static void
++enter_innodb_with_tickets(trx_t* trx)
++{
++ /* Flag trx as inside InnoDB and give it SRV_FREE_TICKETS_TO_ENTER tickets. */
++ trx->n_tickets_to_enter_innodb = SRV_FREE_TICKETS_TO_ENTER;
++ trx->declared_to_be_inside_innodb = TRUE;
++}
++
++static void
++srv_conc_enter_innodb_timer_based(trx_t* trx)
++{
++ /* Admits trx into InnoDB using atomic updates of srv_conc_n_threads:
++ takes a slot if the count stays within srv_thread_concurrency, otherwise
++ yields once and retries, and finally enters regardless of the limit. */
++ lint conc_n_threads;
++ ibool has_yielded = FALSE;
++
++ if (trx->declared_to_be_inside_innodb) {
++ ut_print_timestamp(stderr);
++ fputs(
++" InnoDB: Error: trying to declare trx to enter InnoDB, but\n"
++"InnoDB: it already is declared.\n", stderr);
++ trx_print(stderr, trx, 0);
++ putc('\n', stderr);
++ /* Do not count this trx in srv_conc_n_threads a second time:
++ srv_conc_exit_innodb_timer_based() decrements only once. */
++ return;
++ }
++retry:
++ if (srv_conc_n_threads < (lint) srv_thread_concurrency) {
++ conc_n_threads = __sync_add_and_fetch(&srv_conc_n_threads, 1);
++ if (conc_n_threads <= (lint) srv_thread_concurrency) {
++ enter_innodb_with_tickets(trx);
++ return;
++ }
++ /* The count went over the limit after all: undo our increment. */
++ __sync_add_and_fetch(&srv_conc_n_threads, -1);
++ }
++
++ if (!has_yielded) {
++ has_yielded = TRUE;
++ os_thread_yield();
++ goto retry;
++ }
++
++ /* A trx holding the search latch or any lock enters without sleeping. */
++ if (!trx->has_search_latch
++ && NULL == UT_LIST_GET_FIRST(trx->trx_locks)) {
++ trx->op_info = "sleeping before entering InnoDB";
++ os_thread_sleep(10000);
++ trx->op_info = "";
++ }
++
++ __sync_add_and_fetch(&srv_conc_n_threads, 1);
++ enter_innodb_with_tickets(trx);
++}
++
++static void
++srv_conc_exit_innodb_timer_based(trx_t* trx)
++{
++ /* Atomically give the slot back, then clear the per-trx entry state. */
++ __sync_add_and_fetch(&srv_conc_n_threads, -1);
++ trx->n_tickets_to_enter_innodb = 0;
++ trx->declared_to_be_inside_innodb = FALSE;
++}
++#endif
++
+ void
+ srv_conc_enter_innodb(
+ /*==================*/
+@@ -1043,6 +1112,13 @@
+ return;
+ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (srv_thread_concurrency_timer_based) {
++ srv_conc_enter_innodb_timer_based(trx);
++ return;
++ }
++#endif
++
+ os_fast_mutex_lock(&srv_conc_mutex);
+ retry:
+ if (trx->declared_to_be_inside_innodb) {
+@@ -1196,6 +1272,15 @@
+ return;
+ }
+
++ ut_ad(srv_conc_n_threads >= 0);
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (srv_thread_concurrency_timer_based) {
++ __sync_add_and_fetch(&srv_conc_n_threads, 1);
++ trx->declared_to_be_inside_innodb = TRUE;
++ trx->n_tickets_to_enter_innodb = 1;
++ return;
++ }
++#endif
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_threads++;
+@@ -1227,8 +1312,16 @@
+ return;
+ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ if (srv_thread_concurrency_timer_based) {
++ srv_conc_exit_innodb_timer_based(trx);
++ return;
++ }
++#endif
++
+ os_fast_mutex_lock(&srv_conc_mutex);
+
++ ut_ad(srv_conc_n_threads > 0);
+ srv_conc_n_threads--;
+ trx->declared_to_be_inside_innodb = FALSE;
+ trx->n_tickets_to_enter_innodb = 0;
+diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c 2009-05-06 15:38:01.000000000 +0900
++++ b/innobase/srv/srv0start.c 2009-05-06 17:22:26.000000000 +0900
+@@ -1040,6 +1040,11 @@
+ return(DB_ERROR);
+ }
+
++#ifdef HAVE_ATOMIC_BUILTINS
++ fprintf(stderr,
++ "InnoDB: use atomic builtins.\n");
++#endif
++
+ /* Since InnoDB does not currently clean up all its internal data
+ structures in MySQL Embedded Server Library server_end(), we
+ print an error message if someone tries to start up InnoDB a
+diff -ruN a/patch_info/innodb_thread_concurrency_timer_based.info b/patch_info/innodb_thread_concurrency_timer_based.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_thread_concurrency_timer_based.info 2009-05-06 17:17:12.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_thread_concurrency_timer_based.patch
++Name=Use InnoDB timer based concurrency throttling (backport from MySQL 5.4.0)
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-05-06 15:38:01.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-05-06 15:54:08.000000000 +0900
+@@ -152,6 +152,7 @@
+ innobase_open_files;
+
+ long innobase_read_io_threads, innobase_write_io_threads;
++my_bool innobase_thread_concurrency_timer_based;
+ long innobase_extra_rsegments;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+
+@@ -1477,6 +1478,9 @@
+ srv_n_log_files = (ulint) innobase_log_files_in_group;
+ srv_log_file_size = (ulint) innobase_log_file_size;
+
++ srv_thread_concurrency_timer_based =
++ (ibool) innobase_thread_concurrency_timer_based;
++
+ #ifdef UNIV_LOG_ARCHIVE
+ srv_log_archive_on = (ulint) innobase_log_archive;
+ #endif /* UNIV_LOG_ARCHIVE */
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h 2009-05-06 15:38:01.000000000 +0900
++++ b/sql/ha_innodb.h 2009-05-06 15:55:50.000000000 +0900
+@@ -205,6 +205,7 @@
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
+ extern long innobase_read_io_threads, innobase_write_io_threads;
++extern my_bool innobase_thread_concurrency_timer_based;
+ extern long innobase_extra_rsegments;
+ extern long innobase_force_recovery;
+ extern long innobase_open_files;
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc 2009-05-06 15:38:01.000000000 +0900
++++ b/sql/mysqld.cc 2009-05-06 16:22:06.000000000 +0900
+@@ -5096,6 +5096,7 @@
+ OPT_INNODB_ADAPTIVE_CHECKPOINT,
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
++ OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED,
+ OPT_INNODB_EXTRA_RSEGMENTS,
+ OPT_INNODB_DICT_SIZE_LIMIT,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
+@@ -5455,6 +5456,11 @@
+ "Number of background write I/O threads in InnoDB.",
+ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_thread_concurrency_timer_based", OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED,
++ "Use InnoDB timer based concurrency throttling. ",
++ (gptr*) &innobase_thread_concurrency_timer_based,
++ (gptr*) &innobase_thread_concurrency_timer_based,
++ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"innodb_extra_rsegments", OPT_INNODB_EXTRA_RSEGMENTS,
+ "Number of extra user rollback segments when create new database.",
+ (gptr*) &innobase_extra_rsegments, (gptr*) &innobase_extra_rsegments,
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc 2009-05-06 15:38:01.000000000 +0900
++++ b/sql/set_var.cc 2009-05-06 16:02:27.000000000 +0900
+@@ -1063,6 +1063,7 @@
+ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
++ {"innodb_thread_concurrency_timer_based", (char*) &innobase_thread_concurrency_timer_based, SHOW_MY_BOOL},
+ {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG},
+ {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS},
+ {sys_innodb_io_pattern_trace.name, (char*) &sys_innodb_io_pattern_trace, SHOW_SYS},
diff --git a/percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch b/percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch
new file mode 100644
index 0000000..9637315
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/innodb_use_sys_malloc.patch
@@ -0,0 +1,265 @@
+diff -ruN a/innobase/include/srv0srv.h b/innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h 2009-07-06 15:59:52.000000000 +0900
++++ b/innobase/include/srv0srv.h 2009-07-06 16:06:51.000000000 +0900
+@@ -90,6 +90,7 @@
+ extern ulint srv_mem_pool_size;
+ extern ulint srv_lock_table_size;
+
++extern ibool srv_use_sys_malloc;
+ extern ibool srv_thread_concurrency_timer_based;
+
+ extern ulint srv_n_file_io_threads;
+diff -ruN a/innobase/include/ut0mem.h b/innobase/include/ut0mem.h
+--- a/innobase/include/ut0mem.h 2009-07-07 21:54:07.000000000 +0900
++++ b/innobase/include/ut0mem.h 2009-08-03 14:42:17.000000000 +0900
+@@ -30,6 +30,13 @@
+
+
+ /**************************************************************************
++Initializes the mem block list at database startup. */
++
++void
++ut_mem_block_list_init(void);
++/*========================*/
++
++/**************************************************************************
+ Allocates memory. Sets it also to zero if UNIV_SET_MEM_TO_ZERO is
+ defined and set_to_zero is TRUE. */
+
+diff -ruN a/innobase/mem/mem0dbg.c b/innobase/mem/mem0dbg.c
+--- a/innobase/mem/mem0dbg.c 2009-05-08 06:12:10.000000000 +0900
++++ b/innobase/mem/mem0dbg.c 2009-07-06 16:48:17.000000000 +0900
+@@ -134,6 +134,14 @@
+ mem_hash_initialized = TRUE;
+ #endif
+
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ /* When innodb_use_sys_malloc is set, the
++ mem_comm_pool won't be used for any allocations. We
++ create a dummy mem_comm_pool, because some statistics
++ and debugging code relies on it being initialized. */
++ size = 1;
++ }
++
+ mem_comm_pool = mem_pool_create(size);
+ }
+
+diff -ruN a/innobase/mem/mem0pool.c b/innobase/mem/mem0pool.c
+--- a/innobase/mem/mem0pool.c 2009-05-08 06:12:10.000000000 +0900
++++ b/innobase/mem/mem0pool.c 2009-07-06 17:22:09.000000000 +0900
+@@ -11,6 +11,7 @@
+ #include "mem0pool.ic"
+ #endif
+
++#include "srv0srv.h"
+ #include "sync0sync.h"
+ #include "ut0mem.h"
+ #include "ut0lst.h"
+@@ -191,8 +192,6 @@
+ ulint i;
+ ulint used;
+
+- ut_a(size > 10000);
+-
+ pool = ut_malloc(sizeof(mem_pool_t));
+
+ /* We do not set the memory to zero (FALSE) in the pool,
+@@ -330,6 +329,10 @@
+ ulint n;
+ ibool ret;
+
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ return(malloc(size));
++ }
++
+ n = ut_2_log(ut_max(size + MEM_AREA_EXTRA_SIZE, MEM_AREA_MIN_SIZE));
+
+ mutex_enter(&(pool->mutex));
+@@ -457,6 +460,11 @@
+ ulint size;
+ ulint n;
+
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ free(ptr);
++ return;
++ }
++
+ /* It may be that the area was really allocated from the OS with
+ regular malloc: check if ptr points within our memory pool */
+
+diff -ruN a/innobase/srv/srv0srv.c b/innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c 2009-07-06 15:59:52.000000000 +0900
++++ b/innobase/srv/srv0srv.c 2009-07-06 16:08:06.000000000 +0900
+@@ -273,6 +273,7 @@
+ computer. Bigger computers need bigger values. Value 0 will disable the
+ concurrency check. */
+
++ibool srv_use_sys_malloc = TRUE;
+ ibool srv_thread_concurrency_timer_based = TRUE;
+ ulong srv_thread_concurrency = 0;
+ ulong srv_commit_concurrency = 0;
+@@ -1012,6 +1013,7 @@
+ srv_general_init(void)
+ /*==================*/
+ {
++ ut_mem_block_list_init();
+ os_sync_init();
+ sync_init();
+ mem_init(srv_mem_pool_size);
+diff -ruN a/innobase/srv/srv0start.c b/innobase/srv/srv0start.c
+--- a/innobase/srv/srv0start.c 2009-07-06 15:59:52.000000000 +0900
++++ b/innobase/srv/srv0start.c 2009-07-06 16:23:38.000000000 +0900
+@@ -1040,6 +1040,11 @@
+ return(DB_ERROR);
+ }
+
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ fprintf(stderr,
++ "InnoDB: The InnoDB memory heap is disabled\n");
++ }
++
+ #ifdef HAVE_ATOMIC_BUILTINS
+ fprintf(stderr,
+ "InnoDB: use atomic builtins.\n");
+diff -ruN a/innobase/ut/ut0mem.c b/innobase/ut/ut0mem.c
+--- a/innobase/ut/ut0mem.c 2009-05-08 06:12:13.000000000 +0900
++++ b/innobase/ut/ut0mem.c 2009-07-06 16:42:26.000000000 +0900
+@@ -15,6 +15,7 @@
+ #include "mem0mem.h"
+ #include "os0sync.h"
+ #include "os0thread.h"
++#include "srv0srv.h"
+
+ /* This struct is placed first in every allocated memory block */
+ typedef struct ut_mem_block_struct ut_mem_block_t;
+@@ -43,7 +44,7 @@
+
+ /**************************************************************************
+ Initializes the mem block list at database startup. */
+-static
++
+ void
+ ut_mem_block_list_init(void)
+ /*========================*/
+@@ -70,11 +71,21 @@
+ ulint retry_count = 0;
+ void* ret;
+
+- ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ ret = malloc(n);
++ ut_a(ret || !assert_on_error);
+
+- if (!ut_mem_block_list_inited) {
+- ut_mem_block_list_init();
++#ifdef UNIV_SET_MEM_TO_ZERO
++ if (set_to_zero) {
++ memset(ret, '\0', n);
++ }
++#endif
++ return(ret);
+ }
++
++ ut_ad((sizeof(ut_mem_block_t) % 8) == 0); /* check alignment ok */
++
++ ut_a(ut_mem_block_list_inited);
+ retry:
+ os_fast_mutex_lock(&ut_list_mutex);
+
+@@ -223,6 +236,11 @@
+ {
+ ut_mem_block_t* block;
+
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ free(ptr);
++ return;
++ }
++
+ block = (ut_mem_block_t*)((byte*)ptr - sizeof(ut_mem_block_t));
+
+ os_fast_mutex_lock(&ut_list_mutex);
+@@ -275,6 +293,10 @@
+ ulint min_size;
+ void* new_ptr;
+
++ if (UNIV_LIKELY(srv_use_sys_malloc)) {
++ return(realloc(ptr, size));
++ }
++
+ if (ptr == NULL) {
+
+ return(ut_malloc(size));
+diff -ruN a/patch_info/innodb_use_sys_malloc.info b/patch_info/innodb_use_sys_malloc.info
+--- /dev/null 1970-01-01 09:00:00.000000000 +0900
++++ b/patch_info/innodb_use_sys_malloc.info 2009-07-06 16:04:24.000000000 +0900
+@@ -0,0 +1,6 @@
++File=innodb_use_sys_malloc.patch
++Name=InnoDB uses malloc directly (backport from InnoDB-Plugin)
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment
+diff -ruN a/sql/ha_innodb.cc b/sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc 2009-07-06 15:59:52.000000000 +0900
++++ b/sql/ha_innodb.cc 2009-07-06 16:10:15.000000000 +0900
+@@ -152,6 +152,7 @@
+ innobase_open_files;
+
+ long innobase_read_io_threads, innobase_write_io_threads;
++my_bool innobase_use_sys_malloc;
+ my_bool innobase_thread_concurrency_timer_based;
+ long innobase_extra_rsegments;
+ longlong innobase_buffer_pool_size, innobase_log_file_size;
+@@ -1492,6 +1493,8 @@
+ srv_n_log_files = (ulint) innobase_log_files_in_group;
+ srv_log_file_size = (ulint) innobase_log_file_size;
+
++ srv_use_sys_malloc = (ibool) innobase_use_sys_malloc;
++
+ srv_thread_concurrency_timer_based =
+ (ibool) innobase_thread_concurrency_timer_based;
+
+diff -ruN a/sql/ha_innodb.h b/sql/ha_innodb.h
+--- a/sql/ha_innodb.h 2009-07-06 15:59:52.000000000 +0900
++++ b/sql/ha_innodb.h 2009-07-06 16:10:42.000000000 +0900
+@@ -205,6 +205,7 @@
+ extern long innobase_buffer_pool_awe_mem_mb;
+ extern long innobase_file_io_threads, innobase_lock_wait_timeout;
+ extern long innobase_read_io_threads, innobase_write_io_threads;
++extern my_bool innobase_use_sys_malloc;
+ extern my_bool innobase_thread_concurrency_timer_based;
+ extern long innobase_extra_rsegments;
+ extern long innobase_force_recovery;
+diff -ruN a/sql/mysqld.cc b/sql/mysqld.cc
+--- a/sql/mysqld.cc 2009-07-06 15:59:52.000000000 +0900
++++ b/sql/mysqld.cc 2009-07-06 16:16:56.000000000 +0900
+@@ -5102,6 +5102,7 @@
+ OPT_INNODB_ADAPTIVE_CHECKPOINT,
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
++ OPT_INNODB_USE_SYS_MALLOC,
+ OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED,
+ OPT_INNODB_EXTRA_RSEGMENTS,
+ OPT_INNODB_DICT_SIZE_LIMIT,
+@@ -5470,6 +5471,10 @@
+ "Number of background write I/O threads in InnoDB.",
+ (gptr*) &innobase_write_io_threads, (gptr*) &innobase_write_io_threads,
+ 0, GET_LONG, REQUIRED_ARG, 8, 1, 64, 0, 0, 0},
++ {"innodb_use_sys_malloc", OPT_INNODB_USE_SYS_MALLOC,
++ "Use OS memory allocator instead of InnoDB's internal memory allocator",
++ (gptr*) &innobase_use_sys_malloc, (gptr*) &innobase_use_sys_malloc,
++ 0, GET_BOOL, NO_ARG, 1, 0, 0, 0, 0, 0},
+ {"innodb_thread_concurrency_timer_based", OPT_INNODB_THREAD_CONCURRENCY_TIMER_BASED,
+ "Use InnoDB timer based concurrency throttling. ",
+ (gptr*) &innobase_thread_concurrency_timer_based,
+diff -ruN a/sql/set_var.cc b/sql/set_var.cc
+--- a/sql/set_var.cc 2009-07-06 15:59:52.000000000 +0900
++++ b/sql/set_var.cc 2009-07-06 16:22:05.000000000 +0900
+@@ -1093,6 +1093,7 @@
+ {sys_innodb_adaptive_checkpoint.name, (char*) &sys_innodb_adaptive_checkpoint, SHOW_SYS},
+ {"innodb_read_io_threads", (char*) &innobase_read_io_threads, SHOW_LONG},
+ {"innodb_write_io_threads", (char*) &innobase_write_io_threads, SHOW_LONG},
++ {"innodb_use_sys_malloc", (char*) &innobase_use_sys_malloc, SHOW_MY_BOOL},
+ {"innodb_thread_concurrency_timer_based", (char*) &innobase_thread_concurrency_timer_based, SHOW_MY_BOOL},
+ {"innodb_extra_rsegments", (char*) &innobase_extra_rsegments, SHOW_LONG},
+ {sys_innodb_dict_size_limit.name, (char*) &sys_innodb_dict_size_limit, SHOW_SYS},
diff --git a/percona/5.0.91-b22-20100522/microsec_process.patch b/percona/5.0.91-b22-20100522/microsec_process.patch
new file mode 100644
index 0000000..2e68888
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/microsec_process.patch
@@ -0,0 +1,282 @@
+diff -r e3b747e556c8 mysql-test/r/information_schema.result
+--- a/mysql-test/r/information_schema.result Mon May 18 18:44:04 2009 -0700
++++ b/mysql-test/r/information_schema.result Mon May 18 18:48:11 2009 -0700
+@@ -44,6 +44,7 @@
+ COLUMN_PRIVILEGES
+ INDEX_STATISTICS
+ KEY_COLUMN_USAGE
++PROCESSLIST
+ PROFILING
+ ROUTINES
+ SCHEMATA
+@@ -740,7 +741,7 @@
+ CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1;
+ CREATE VIEW a2 AS SELECT t_CRASHME FROM a1;
+ count(*)
+-106
++107
+ drop view a2, a1;
+ drop table t_crashme;
+ select table_schema,table_name, column_name from
+@@ -749,6 +750,7 @@
+ table_schema table_name column_name
+ information_schema COLUMNS COLUMN_DEFAULT
+ information_schema COLUMNS COLUMN_TYPE
++information_schema PROCESSLIST INFO
+ information_schema ROUTINES ROUTINE_DEFINITION
+ information_schema ROUTINES SQL_MODE
+ information_schema TRIGGERS ACTION_CONDITION
+@@ -813,7 +815,7 @@
+ flush privileges;
+ SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA;
+ table_schema count(*)
+-information_schema 21
++information_schema 22
+ mysql 17
+ create table t1 (i int, j int);
+ create trigger trg1 before insert on t1 for each row
+@@ -1206,6 +1208,7 @@
+ COLUMN_PRIVILEGES TABLE_SCHEMA
+ INDEX_STATISTICS TABLE_SCHEMA
+ KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
++PROCESSLIST ID
+ PROFILING QUERY_ID
+ ROUTINES ROUTINE_SCHEMA
+ SCHEMATA SCHEMA_NAME
+@@ -1242,6 +1245,7 @@
+ COLUMN_PRIVILEGES TABLE_SCHEMA
+ INDEX_STATISTICS TABLE_SCHEMA
+ KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
++PROCESSLIST ID
+ PROFILING QUERY_ID
+ ROUTINES ROUTINE_SCHEMA
+ SCHEMATA SCHEMA_NAME
+@@ -1329,6 +1333,7 @@
+ COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1
+ INDEX_STATISTICS information_schema.INDEX_STATISTICS 1
+ KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
++PROCESSLIST information_schema.PROCESSLIST 1
+ PROFILING information_schema.PROFILING 1
+ ROUTINES information_schema.ROUTINES 1
+ SCHEMATA information_schema.SCHEMATA 1
+diff -r e3b747e556c8 mysql-test/r/information_schema_db.result
+--- a/mysql-test/r/information_schema_db.result Mon May 18 18:44:04 2009 -0700
++++ b/mysql-test/r/information_schema_db.result Mon May 18 18:48:11 2009 -0700
+@@ -13,6 +13,7 @@
+ COLUMN_PRIVILEGES
+ INDEX_STATISTICS
+ KEY_COLUMN_USAGE
++PROCESSLIST
+ PROFILING
+ ROUTINES
+ SCHEMATA
+diff -r e3b747e556c8 mysql-test/r/mysqlshow.result
+--- a/mysql-test/r/mysqlshow.result Mon May 18 18:44:04 2009 -0700
++++ b/mysql-test/r/mysqlshow.result Mon May 18 18:48:11 2009 -0700
+@@ -87,6 +87,7 @@
+ | COLUMN_PRIVILEGES |
+ | INDEX_STATISTICS |
+ | KEY_COLUMN_USAGE |
++| PROCESSLIST |
+ | PROFILING |
+ | ROUTINES |
+ | SCHEMATA |
+@@ -113,6 +114,7 @@
+ | COLUMN_PRIVILEGES |
+ | INDEX_STATISTICS |
+ | KEY_COLUMN_USAGE |
++| PROCESSLIST |
+ | PROFILING |
+ | ROUTINES |
+ | SCHEMATA |
+diff -r e3b747e556c8 patch_info/microsec_process.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/microsec_process.info Mon May 18 18:48:11 2009 -0700
+@@ -0,0 +1,6 @@
++File=microsec_process.patch
++Name=Adds INFORMATION_SCHEMA.PROCESSLIST with TIME_MS column
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
+diff -r e3b747e556c8 sql/mysql_priv.h
+--- a/sql/mysql_priv.h Mon May 18 18:44:04 2009 -0700
++++ b/sql/mysql_priv.h Mon May 18 18:48:11 2009 -0700
+@@ -249,6 +249,8 @@
+
+ /* Characters shown for the command in 'show processlist' */
+ #define PROCESS_LIST_WIDTH 100
++/* Maximum length of the query text stored in the INFO column of 'information_schema.processlist' */
++#define PROCESS_LIST_INFO_WIDTH 65535
+
+ #define PRECISION_FOR_DOUBLE 53
+ #define PRECISION_FOR_FLOAT 24
+diff -r e3b747e556c8 sql/sql_show.cc
+--- a/sql/sql_show.cc Mon May 18 18:44:04 2009 -0700
++++ b/sql/sql_show.cc Mon May 18 18:48:11 2009 -0700
+@@ -1480,6 +1480,122 @@
+ DBUG_VOID_RETURN;
+ }
+
++int fill_schema_processlist(THD* thd, TABLE_LIST* tables, COND* cond)
++{
++ TABLE *table= tables->table;
++ CHARSET_INFO *cs= system_charset_info;
++ char *user;
++ ulonglong current_timer= my_timer(&current_timer, frequency);
++ DBUG_ENTER("fill_schema_processlist");
++ /* Fills INFORMATION_SCHEMA.PROCESSLIST with one row per listed thread; returns 0 on success, 1 if a row cannot be stored. */
++ user= thd->security_ctx->master_access & PROCESS_ACL ?
++ NullS : thd->security_ctx->priv_user;
++ /* user stays NullS for callers holding PROCESS_ACL, which switches off the per-user filter in the loop below */
++ VOID(pthread_mutex_lock(&LOCK_thread_count));
++
++ if (!thd->killed)
++ {
++ I_List_iterator<THD> it(threads);
++ THD* tmp;
++
++ while ((tmp= it++))
++ {
++ Security_context *tmp_sctx= tmp->security_ctx;
++ struct st_my_thread_var *mysys_var;
++ const char *val;
++ /* skip non-system threads that fail vio_ok(), and (when filtering) threads whose user differs from the caller's */
++ if ((!tmp->vio_ok() && !tmp->system_thread) ||
++ (user && (!tmp_sctx->user || strcmp(tmp_sctx->user, user))))
++ continue;
++
++ restore_record(table, s->default_values);
++ /* ID */
++ table->field[0]->store((longlong) tmp->thread_id, TRUE);
++ /* USER */
++ val= tmp_sctx->user ? tmp_sctx->user :
++ (tmp->system_thread ? "system user" : "unauthenticated user");
++ table->field[1]->store(val, strlen(val), cs);
++ /* HOST */
++ if (tmp->peer_port && (tmp_sctx->host || tmp_sctx->ip) &&
++ thd->security_ctx->host_or_ip[0])
++ {
++ char host[LIST_PROCESS_HOST_LEN + 1];
++ my_snprintf(host, LIST_PROCESS_HOST_LEN, "%s:%u",
++ tmp_sctx->host_or_ip, tmp->peer_port);
++ table->field[2]->store(host, strlen(host), cs);
++ }
++ else
++ table->field[2]->store(tmp_sctx->host_or_ip,
++ strlen(tmp_sctx->host_or_ip), cs);
++ /* DB */
++ if (tmp->db)
++ {
++ table->field[3]->store(tmp->db, strlen(tmp->db), cs);
++ table->field[3]->set_notnull();
++ }
++ /* the thread's mysys_var mutex (if any) is held while COMMAND, TIME and STATE are read */
++ if ((mysys_var= tmp->mysys_var))
++ pthread_mutex_lock(&mysys_var->mutex);
++ /* COMMAND */
++ if ((val= (char *) (tmp->killed == THD::KILL_CONNECTION? "Killed" : 0)))
++ table->field[4]->store(val, strlen(val), cs);
++ else
++ table->field[4]->store(command_name[tmp->command],
++ strlen(command_name[tmp->command]), cs);
++ /* TIME: utime is in microseconds (divided by 1000000 here, by 1000.0 for TIME_MS below) */
++ ulonglong utime= (tmp->start_timer && current_timer) ? current_timer - tmp->start_timer : 0;
++ /* correction for negative time: an unsigned wrap shows up as an absurd span; cut off at 2629743 s (~1 month) expressed in microseconds */
++ if (utime > (ulonglong) 2629743 * 1000000) utime= 0;
++ table->field[5]->store(utime / 1000000, TRUE);
++ /* STATE */
++#ifndef EMBEDDED_LIBRARY
++ val= (char*) (tmp->locked ? "Locked" :
++ tmp->net.reading_or_writing ?
++ (tmp->net.reading_or_writing == 2 ?
++ "Writing to net" :
++ tmp->command == COM_SLEEP ? "" :
++ "Reading from net") :
++ tmp->proc_info ? tmp->proc_info :
++ tmp->mysys_var &&
++ tmp->mysys_var->current_cond ?
++ "Waiting on cond" : NullS);
++#else
++ val= (char *) "Writing to net";
++#endif
++ if (val)
++ {
++ table->field[6]->store(val, strlen(val), cs);
++ table->field[6]->set_notnull();
++ }
++
++ if (mysys_var)
++ pthread_mutex_unlock(&mysys_var->mutex);
++
++ /* INFO */
++ if (tmp->query)
++ {
++ table->field[7]->store(tmp->query,
++ min(PROCESS_LIST_INFO_WIDTH,
++ tmp->query_length), cs);
++ table->field[7]->set_notnull();
++ }
++
++ /* TIME_MS */
++ table->field[8]->store((double)(utime / 1000.0));
++
++ if (schema_table_store_record(thd, table))
++ {
++ VOID(pthread_mutex_unlock(&LOCK_thread_count));
++ DBUG_RETURN(1);
++ }
++
++ }
++ }
++
++ VOID(pthread_mutex_unlock(&LOCK_thread_count));
++ DBUG_RETURN(0);
++}
++
+ /*****************************************************************************
+ Status functions
+ *****************************************************************************/
+@@ -4849,6 +4965,22 @@
+ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
+ };
+
++ST_FIELD_INFO processlist_fields_info[]= /* columns of the PROCESSLIST schema table; order must match the field[] indexes used in fill_schema_processlist() */
++{
++ {"ID", 4, MYSQL_TYPE_LONG, 0, 0, "Id"},
++ {"USER", 16, MYSQL_TYPE_STRING, 0, 0, "User"},
++ {"HOST", LIST_PROCESS_HOST_LEN, MYSQL_TYPE_STRING, 0, 0, "Host"},
++ {"DB", NAME_LEN, MYSQL_TYPE_STRING, 0, 1, "Db"},
++ {"COMMAND", 16, MYSQL_TYPE_STRING, 0, 0, "Command"},
++ {"TIME", 7, MYSQL_TYPE_LONG, 0, 0, "Time"},
++ {"STATE", 64, MYSQL_TYPE_STRING, 0, 1, "State"},
++ {"INFO", PROCESS_LIST_INFO_WIDTH, MYSQL_TYPE_STRING, 0, 1, "Info"},
++ {"TIME_MS", 100 * (MY_INT64_NUM_DECIMAL_DIGITS + 1) + 3, MYSQL_TYPE_DECIMAL, /* NOTE(review): length looks like an encoded DECIMAL precision/scale -- confirm against create_schema_table() */
++ 0, 0, "Time_ms"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0} /* all-zero entry; presumably the end-of-list marker, as in the preceding table */
++};
++
++
+ /*
+ Description of ST_FIELD_INFO in table.h
+ */
+@@ -4873,6 +5005,8 @@
+ get_all_tables, 0, get_schema_key_column_usage_record, 4, 5, 0},
+ {"OPEN_TABLES", open_tables_fields_info, create_schema_table,
+ fill_open_tables, make_old_format, 0, -1, -1, 1},
++ {"PROCESSLIST", processlist_fields_info, create_schema_table,
++ fill_schema_processlist, make_old_format, 0, -1, -1, 0},
+ {"PROFILING", query_profile_statistics_info, create_schema_table,
+ fill_query_profile_statistics_info, make_profile_table_for_show,
+ NULL, -1, -1, false},
+diff -r e3b747e556c8 sql/table.h
+--- a/sql/table.h Mon May 18 18:44:04 2009 -0700
++++ b/sql/table.h Mon May 18 18:48:11 2009 -0700
+@@ -379,6 +379,7 @@
+ SCH_INDEX_STATS,
+ SCH_KEY_COLUMN_USAGE,
+ SCH_OPEN_TABLES,
++ SCH_PROCESSLIST,
+ SCH_PROFILES,
+ SCH_PROCEDURES,
+ SCH_SCHEMATA,
diff --git a/percona/5.0.91-b22-20100522/microslow_innodb.patch b/percona/5.0.91-b22-20100522/microslow_innodb.patch
new file mode 100644
index 0000000..11a186c
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/microslow_innodb.patch
@@ -0,0 +1,2492 @@
+diff -r 1242d4575291 include/my_getopt.h
+--- a/include/my_getopt.h Tue Jul 28 23:39:12 2009 -0700
++++ b/include/my_getopt.h Tue Jul 28 23:42:44 2009 -0700
+@@ -28,7 +28,8 @@
+ #define GET_ULL 8
+ #define GET_STR 9
+ #define GET_STR_ALLOC 10
+-#define GET_DISABLED 11
++#define GET_MICROTIME 11
++#define GET_DISABLED 12
+
+ #define GET_ASK_ADDR 128
+ #define GET_TYPE_MASK 127
+diff -r 1242d4575291 include/my_time.h
+--- a/include/my_time.h Tue Jul 28 23:39:12 2009 -0700
++++ b/include/my_time.h Tue Jul 28 23:42:44 2009 -0700
+@@ -140,7 +140,7 @@
+ int my_date_to_str(const MYSQL_TIME *l_time, char *to);
+ int my_datetime_to_str(const MYSQL_TIME *l_time, char *to);
+ int my_TIME_to_str(const MYSQL_TIME *l_time, char *to);
+-
++ulonglong my_timer(ulonglong *ltime, ulonglong frequency);
+ C_MODE_END
+
+ #endif /* _my_time_h_ */
+diff -r 1242d4575291 innobase/buf/buf0buf.c
+--- a/innobase/buf/buf0buf.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/buf/buf0buf.c Tue Jul 28 23:42:44 2009 -0700
+@@ -37,6 +37,10 @@
+ #include "log0log.h"
+ #include "trx0undo.h"
+ #include "srv0srv.h"
++#include "trx0trx.h"
++
++/* prototypes for new functions added to ha_innodb.cc */
++trx_t* innobase_get_trx();
+
+ /*
+ IMPLEMENTATION OF THE BUFFER POOL
+@@ -1086,6 +1090,36 @@
+ return(block);
+ }
+
++inline void _increment_page_get_statistics(buf_block_t* block, trx_t* trx)
++{
++ ulint block_hash;
++ ulint block_hash_byte;
++ byte block_hash_offset;
++ /* Sets this page's bit in trx's distinct-page bitmap and bumps trx->distinct_page_access the first time the bit is set; no-op unless stats are enabled for trx. */
++ ut_ad(block);
++
++ if (!srv_slow_log || !trx || !trx->take_stats)
++ return;
++
++ if (!trx->distinct_page_access_hash) {
++ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
++ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
++ }
++
++ block_hash = ut_hash_ulint((block->space << 20) + block->space +
++ block->offset, DPAH_SIZE << 3);
++ block_hash_byte = block_hash >> 3; /* byte index into the bitmap */
++ block_hash_offset = (byte) block_hash & 0x07; /* bit index inside that byte, always 0..7 */
++ if (block_hash_byte >= DPAH_SIZE) {
++ fprintf(stderr, "!!! block_hash_byte = %lu block_hash_offset = %lu !!!\n", (unsigned long) block_hash_byte, (unsigned long) block_hash_offset);
++ return; /* never read or write outside the DPAH_SIZE-byte bitmap */
++ }
++ if ((trx->distinct_page_access_hash[block_hash_byte] & ((byte) 0x01 << block_hash_offset)) == 0)
++ trx->distinct_page_access++;
++ trx->distinct_page_access_hash[block_hash_byte] |= (byte) 0x01 << block_hash_offset;
++ return;
++}
++
+ /************************************************************************
+ This is the general function used to get access to a database page. */
+
+@@ -1108,6 +1142,11 @@
+ ulint fix_type;
+ ibool success;
+ ibool must_read;
++ trx_t* trx = NULL;
++ ulint sec;
++ ulint ms;
++ ib_longlong start_time;
++ ib_longlong finish_time;
+
+ ut_ad(mtr);
+ ut_ad((rw_latch == RW_S_LATCH)
+@@ -1119,6 +1158,9 @@
+ #ifndef UNIV_LOG_DEBUG
+ ut_ad(!ibuf_inside() || ibuf_page(space, offset));
+ #endif
++ if (srv_slow_log) {
++ trx = innobase_get_trx();
++ }
+ buf_pool->n_page_gets++;
+ loop:
+ block = NULL;
+@@ -1148,7 +1190,7 @@
+ return(NULL);
+ }
+
+- buf_read_page(space, offset);
++ buf_read_page(space, offset, trx);
+
+ #ifdef UNIV_DEBUG
+ buf_dbg_counter++;
+@@ -1261,6 +1303,11 @@
+ /* Let us wait until the read operation
+ completes */
+
++ if (srv_slow_log && trx && trx->take_stats)
++ {
++ ut_usectime(&sec, &ms);
++ start_time = (ib_longlong)sec * 1000000 + ms;
++ }
+ for (;;) {
+ mutex_enter(&block->mutex);
+
+@@ -1276,6 +1323,12 @@
+ break;
+ }
+ }
++ if (srv_slow_log && trx && trx->take_stats && start_time)
++ {
++ ut_usectime(&sec, &ms);
++ finish_time = (ib_longlong)sec * 1000000 + ms;
++ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
++ }
+ }
+
+ fix_type = MTR_MEMO_BUF_FIX;
+@@ -1296,12 +1349,17 @@
+ /* In the case of a first access, try to apply linear
+ read-ahead */
+
+- buf_read_ahead_linear(space, offset);
++ buf_read_ahead_linear(space, offset, trx);
+ }
+
+ #ifdef UNIV_IBUF_DEBUG
+ ut_a(ibuf_count_get(block->space, block->offset) == 0);
+ #endif
++
++ if (srv_slow_log) {
++ _increment_page_get_statistics(block, trx);
++ }
++
+ return(block->frame);
+ }
+
+@@ -1326,6 +1384,7 @@
+ ibool accessed;
+ ibool success;
+ ulint fix_type;
++ trx_t* trx = NULL;
+
+ ut_ad(mtr && block);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+@@ -1440,7 +1499,7 @@
+ read-ahead */
+
+ buf_read_ahead_linear(buf_frame_get_space_id(guess),
+- buf_frame_get_page_no(guess));
++ buf_frame_get_page_no(guess), trx);
+ }
+
+ #ifdef UNIV_IBUF_DEBUG
+@@ -1448,6 +1507,11 @@
+ #endif
+ buf_pool->n_page_gets++;
+
++ if (srv_slow_log) {
++ trx = innobase_get_trx();
++ _increment_page_get_statistics(block, trx);
++ }
++
+ return(TRUE);
+ }
+
+@@ -1470,6 +1534,7 @@
+ buf_block_t* block;
+ ibool success;
+ ulint fix_type;
++ trx_t* trx = NULL;
+
+ ut_ad(mtr);
+ ut_ad((rw_latch == RW_S_LATCH) || (rw_latch == RW_X_LATCH));
+@@ -1559,6 +1624,11 @@
+ #endif
+ buf_pool->n_page_gets++;
+
++ if (srv_slow_log) {
++ trx = innobase_get_trx();
++ _increment_page_get_statistics(block, trx);
++ }
++
+ return(TRUE);
+ }
+
+diff -r 1242d4575291 innobase/buf/buf0rea.c
+--- a/innobase/buf/buf0rea.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/buf/buf0rea.c Tue Jul 28 23:42:44 2009 -0700
+@@ -70,7 +70,8 @@
+ treat the tablespace as dropped; this is a timestamp we
+ use to stop dangling page reads from a tablespace
+ which we have DISCARDed + IMPORTed back */
+- ulint offset) /* in: page number */
++ ulint offset, /* in: page number */
++ trx_t* trx)
+ {
+ buf_block_t* block;
+ ulint wake_later;
+@@ -140,10 +141,10 @@
+
+ ut_a(block->state == BUF_BLOCK_FILE_PAGE);
+
+- *err = fil_io(OS_FILE_READ | wake_later,
++ *err = _fil_io(OS_FILE_READ | wake_later,
+ sync, space,
+ offset, 0, UNIV_PAGE_SIZE,
+- (void*)block->frame, (void*)block);
++ (void*)block->frame, (void*)block, trx);
+ ut_a(*err == DB_SUCCESS);
+
+ if (sync) {
+@@ -174,8 +175,9 @@
+ the page at the given page number does not get
+ read even if we return a value > 0! */
+ ulint space, /* in: space id */
+- ulint offset) /* in: page number of a page which the current thread
++ ulint offset, /* in: page number of a page which the current thread
+ wants to access */
++ trx_t* trx)
+ {
+ ib_longlong tablespace_version;
+ buf_block_t* block;
+@@ -270,7 +272,7 @@
+ if (!ibuf_bitmap_page(i)) {
+ count += buf_read_page_low(&err, FALSE, ibuf_mode
+ | OS_AIO_SIMULATED_WAKE_LATER,
+- space, tablespace_version, i);
++ space, tablespace_version, i, trx);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+@@ -314,7 +316,8 @@
+ /* out: number of page read requests issued: this can
+ be > 1 if read-ahead occurred */
+ ulint space, /* in: space id */
+- ulint offset) /* in: page number */
++ ulint offset, /* in: page number */
++ trx_t* trx)
+ {
+ ib_longlong tablespace_version;
+ ulint count;
+@@ -323,13 +326,13 @@
+
+ tablespace_version = fil_space_get_version(space);
+
+- count = buf_read_ahead_random(space, offset);
++ count = buf_read_ahead_random(space, offset, trx);
+
+ /* We do the i/o in the synchronous aio mode to save thread
+ switches: hence TRUE */
+
+ count2 = buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+- tablespace_version, offset);
++ tablespace_version, offset, trx);
+ srv_buf_pool_reads+= count2;
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+@@ -374,8 +377,9 @@
+ /*==================*/
+ /* out: number of page read requests issued */
+ ulint space, /* in: space id */
+- ulint offset) /* in: page number of a page; NOTE: the current thread
++ ulint offset, /* in: page number of a page; NOTE: the current thread
+ must want access to this page (see NOTE 3 above) */
++ trx_t* trx)
+ {
+ ib_longlong tablespace_version;
+ buf_block_t* block;
+@@ -556,7 +560,7 @@
+ if (!ibuf_bitmap_page(i)) {
+ count += buf_read_page_low(&err, FALSE, ibuf_mode
+ | OS_AIO_SIMULATED_WAKE_LATER,
+- space, tablespace_version, i);
++ space, tablespace_version, i, trx);
+ if (err == DB_TABLESPACE_DELETED) {
+ ut_print_timestamp(stderr);
+ fprintf(stderr,
+@@ -625,10 +629,10 @@
+ for (i = 0; i < n_stored; i++) {
+ if ((i + 1 == n_stored) && sync) {
+ buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE,
+- space_ids[i], space_versions[i], page_nos[i]);
++ space_ids[i], space_versions[i], page_nos[i], NULL);
+ } else {
+ buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE,
+- space_ids[i], space_versions[i], page_nos[i]);
++ space_ids[i], space_versions[i], page_nos[i], NULL);
+ }
+
+ if (err == DB_TABLESPACE_DELETED) {
+@@ -704,11 +708,11 @@
+
+ if ((i + 1 == n_stored) && sync) {
+ buf_read_page_low(&err, TRUE, BUF_READ_ANY_PAGE, space,
+- tablespace_version, page_nos[i]);
++ tablespace_version, page_nos[i], NULL);
+ } else {
+ buf_read_page_low(&err, FALSE, BUF_READ_ANY_PAGE
+ | OS_AIO_SIMULATED_WAKE_LATER,
+- space, tablespace_version, page_nos[i]);
++ space, tablespace_version, page_nos[i], NULL);
+ }
+ }
+
+diff -r 1242d4575291 innobase/fil/fil0fil.c
+--- a/innobase/fil/fil0fil.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/fil/fil0fil.c Tue Jul 28 23:42:44 2009 -0700
+@@ -3527,7 +3527,7 @@
+ node->name, node->handle, buf,
+ offset_low, offset_high,
+ UNIV_PAGE_SIZE * n_pages,
+- NULL, NULL);
++ NULL, NULL, NULL);
+ #endif
+ if (success) {
+ node->size += n_pages;
+@@ -3851,7 +3851,7 @@
+ Reads or writes data. This operation is asynchronous (aio). */
+
+ ulint
+-fil_io(
++_fil_io(
+ /*===*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+@@ -3877,8 +3877,9 @@
+ void* buf, /* in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+- void* message) /* in: message for aio handler if non-sync
++ void* message, /* in: message for aio handler if non-sync
+ aio used, else ignored */
++ trx_t* trx)
+ {
+ fil_system_t* system = fil_system;
+ ulint mode;
+@@ -4018,7 +4019,7 @@
+ #else
+ /* Queue the aio request */
+ ret = os_aio(type, mode | wake_later, node->name, node->handle, buf,
+- offset_low, offset_high, len, node, message);
++ offset_low, offset_high, len, node, message, trx);
+ #endif
+ ut_a(ret);
+
+diff -r 1242d4575291 innobase/include/buf0rea.h
+--- a/innobase/include/buf0rea.h Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/include/buf0rea.h Tue Jul 28 23:42:44 2009 -0700
+@@ -10,6 +10,7 @@
+ #define buf0rea_h
+
+ #include "univ.i"
++#include "trx0types.h"
+ #include "buf0types.h"
+
+ /************************************************************************
+@@ -25,7 +26,8 @@
+ /* out: number of page read requests issued: this can
+ be > 1 if read-ahead occurred */
+ ulint space, /* in: space id */
+- ulint offset);/* in: page number */
++ ulint offset, /* in: page number */
++ trx_t* trx);
+ /************************************************************************
+ Applies linear read-ahead if in the buf_pool the page is a border page of
+ a linear read-ahead area and all the pages in the area have been accessed.
+@@ -55,8 +57,9 @@
+ /*==================*/
+ /* out: number of page read requests issued */
+ ulint space, /* in: space id */
+- ulint offset);/* in: page number of a page; NOTE: the current thread
++ ulint offset, /* in: page number of a page; NOTE: the current thread
+ must want access to this page (see NOTE 3 above) */
++ trx_t* trx);
+ /************************************************************************
+ Issues read requests for pages which the ibuf module wants to read in, in
+ order to contract the insert buffer tree. Technically, this function is like
+diff -r 1242d4575291 innobase/include/fil0fil.h
+--- a/innobase/include/fil0fil.h Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/include/fil0fil.h Tue Jul 28 23:42:44 2009 -0700
+@@ -534,8 +534,11 @@
+ /************************************************************************
+ Reads or writes data. This operation is asynchronous (aio). */
+
++#define fil_io(type, sync, space_id, block_offset, byte_offset, len, buf, message) \
++ _fil_io(type, sync, space_id, block_offset, byte_offset, len, buf, message, NULL) /* keeps the 8-argument fil_io() call form; forwards to _fil_io() with trx = NULL */
++
+ ulint
+-fil_io(
++_fil_io(
+ /*===*/
+ /* out: DB_SUCCESS, or DB_TABLESPACE_DELETED
+ if we are trying to do i/o on a tablespace
+@@ -561,8 +564,9 @@
+ void* buf, /* in/out: buffer where to store read data
+ or from where to write; in aio this must be
+ appropriately aligned */
+- void* message); /* in: message for aio handler if non-sync
++ void* message, /* in: message for aio handler if non-sync
+ aio used, else ignored */
++ trx_t* trx);
+ /************************************************************************
+ Reads data from a space to a buffer. Remember that the possible incomplete
+ blocks at the end of file are ignored: they are not taken into account when
+diff -r 1242d4575291 innobase/include/os0file.h
+--- a/innobase/include/os0file.h Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/include/os0file.h Tue Jul 28 23:42:44 2009 -0700
+@@ -11,6 +11,8 @@
+
+ #include "univ.i"
+
++#include "trx0types.h"
++
+ #ifndef __WIN__
+ #include <dirent.h>
+ #include <sys/stat.h>
+@@ -421,8 +423,11 @@
+ /***********************************************************************
+ Requests a synchronous read operation. */
+
++#define os_file_read(file, buf, offset, offset_high, n) \
++ _os_file_read(file, buf, offset, offset_high, n, NULL) /* keeps the 5-argument os_file_read() call form; forwards to _os_file_read() with trx = NULL */
++
+ ibool
+-os_file_read(
++_os_file_read(
+ /*=========*/
+ /* out: TRUE if request was
+ successful, FALSE if fail */
+@@ -432,7 +437,8 @@
+ offset where to read */
+ ulint offset_high,/* in: most significant 32 bits of
+ offset */
+- ulint n); /* in: number of bytes to read */
++ ulint n, /* in: number of bytes to read */
++ trx_t* trx);
+ /***********************************************************************
+ Rewind file to its start, read at most size - 1 bytes from it to str, and
+ NUL-terminate str. All errors are silently ignored. This function is
+@@ -584,7 +590,8 @@
+ can be used to identify a completed aio
+ operation); if mode is OS_AIO_SYNC, these
+ are ignored */
+- void* message2);
++ void* message2,
++ trx_t* trx);
+ /****************************************************************************
+ Wakes up all async i/o threads so that they know to exit themselves in
+ shutdown. */
+diff -r 1242d4575291 innobase/include/srv0srv.h
+--- a/innobase/include/srv0srv.h Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/include/srv0srv.h Tue Jul 28 23:42:44 2009 -0700
+@@ -27,6 +27,8 @@
+ #define SRV_AUTO_EXTEND_INCREMENT \
+ (srv_auto_extend_increment * ((1024 * 1024) / UNIV_PAGE_SIZE))
+
++extern ibool srv_slow_log;
++
+ /* This is set to TRUE if the MySQL user has set it in MySQL */
+ extern ibool srv_lower_case_table_names;
+
+diff -r 1242d4575291 innobase/include/trx0trx.h
+--- a/innobase/include/trx0trx.h Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/include/trx0trx.h Tue Jul 28 23:42:44 2009 -0700
+@@ -668,6 +668,17 @@
+ /*------------------------------*/
+ char detailed_error[256]; /* detailed error message for last
+ error, or empty. */
++ /*------------------------------*/
++ ulint io_reads;
++ ib_longlong io_read;
++ ulint io_reads_wait_timer;
++ ib_longlong lock_que_wait_ustarted;
++ ulint lock_que_wait_timer;
++ ulint innodb_que_wait_timer;
++ ulint distinct_page_access;
++#define DPAH_SIZE 8192
++ byte* distinct_page_access_hash;
++ ibool take_stats;
+ };
+
+ #define TRX_MAX_N_THREADS 32 /* maximum number of concurrent
+diff -r 1242d4575291 innobase/lock/lock0lock.c
+--- a/innobase/lock/lock0lock.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/lock/lock0lock.c Tue Jul 28 23:42:44 2009 -0700
+@@ -1806,6 +1806,8 @@
+ {
+ lock_t* lock;
+ trx_t* trx;
++ ulint sec;
++ ulint ms;
+
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+@@ -1861,6 +1863,10 @@
+ trx->que_state = TRX_QUE_LOCK_WAIT;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ trx->wait_started = time(NULL);
++ if (srv_slow_log && trx->take_stats) {
++ ut_usectime(&sec, &ms);
++ trx->lock_que_wait_ustarted = (ib_longlong)sec * 1000000 + ms;
++ }
+
+ ut_a(que_thr_stop(thr));
+
+@@ -3514,7 +3520,9 @@
+ {
+ lock_t* lock;
+ trx_t* trx;
+-
++ ulint sec;
++ ulint ms;
++
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+ #endif /* UNIV_SYNC_DEBUG */
+@@ -3564,6 +3572,10 @@
+ return(DB_SUCCESS);
+ }
+
++ if (srv_slow_log && trx->take_stats) {
++ ut_usectime(&sec, &ms);
++ trx->lock_que_wait_ustarted = (ib_longlong)sec * 1000000 + ms;
++ }
+ trx->que_state = TRX_QUE_LOCK_WAIT;
+ trx->was_chosen_as_deadlock_victim = FALSE;
+ trx->wait_started = time(NULL);
+diff -r 1242d4575291 innobase/os/os0file.c
+--- a/innobase/os/os0file.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/os/os0file.c Tue Jul 28 23:42:44 2009 -0700
+@@ -14,6 +14,8 @@
+ #include "srv0start.h"
+ #include "fil0fil.h"
+ #include "buf0buf.h"
++#include "trx0sys.h"
++#include "trx0trx.h"
+
+ #if defined(UNIV_HOTBACKUP) && defined(__WIN__)
+ /* Add includes for the _stat() call to compile on Windows */
+@@ -1903,9 +1905,13 @@
+ #ifndef __WIN__
+ /***********************************************************************
+ Does a synchronous read operation in Posix. */
++
++#define os_file_pread(file, buf, n, offset, offset_high) \
++ _os_file_pread(file, buf, n, offset, offset_high, NULL) /* trx = NULL: the read is not charged to any transaction; no trailing ';' so the macro is usable as an expression */
++
+ static
+ ssize_t
+-os_file_pread(
++_os_file_pread(
+ /*==========*/
+ /* out: number of bytes read, -1 if error */
+ os_file_t file, /* in: handle to a file */
+@@ -1913,12 +1919,17 @@
+ ulint n, /* in: number of bytes to read */
+ ulint offset, /* in: least significant 32 bits of file
+ offset from where to read */
+- ulint offset_high) /* in: most significant 32 bits of
+- offset */
++ ulint offset_high, /* in: most significant 32 bits of
++ offset */
++ trx_t* trx)
+ {
+ off_t offs;
+ ssize_t n_bytes;
+-
++ ulint sec;
++ ulint ms;
++ ib_longlong start_time;
++ ib_longlong finish_time;
++
+ ut_a((offset & 0xFFFFFFFFUL) == offset);
+
+ /* If off_t is > 4 bytes in size, then we assume we can pass a
+@@ -1937,7 +1948,13 @@
+ }
+
+ os_n_file_reads++;
+-
++ if (srv_slow_log && trx && trx->take_stats)
++ {
++ trx->io_reads++;
++ trx->io_read += n;
++ ut_usectime(&sec, &ms);
++ start_time = (ib_longlong)sec * 1000000 + ms;
++ }
+ #if defined(HAVE_PREAD) && !defined(HAVE_BROKEN_PREAD)
+ os_mutex_enter(os_file_count_mutex);
+ os_file_n_pending_preads++;
+@@ -1951,6 +1968,13 @@
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
++ if (srv_slow_log && trx && trx->take_stats && start_time)
++ {
++ ut_usectime(&sec, &ms);
++ finish_time = (ib_longlong)sec * 1000000 + ms;
++ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
++ }
++
+ return(n_bytes);
+ #else
+ {
+@@ -1981,6 +2005,13 @@
+ os_n_pending_reads--;
+ os_mutex_exit(os_file_count_mutex);
+
++ if (srv_slow_log && trx && trx->take_stats && start_time)
++ {
++ ut_usectime(&sec, &ms);
++ finish_time = (ib_longlong)sec * 1000000 + ms;
++ trx->io_reads_wait_timer += (ulint)(finish_time - start_time);
++ }
++
+ return(ret);
+ }
+ #endif
+@@ -2103,7 +2134,7 @@
+ Requests a synchronous positioned read operation. */
+
+ ibool
+-os_file_read(
++_os_file_read(
+ /*=========*/
+ /* out: TRUE if request was
+ successful, FALSE if fail */
+@@ -2113,7 +2144,8 @@
+ offset where to read */
+ ulint offset_high, /* in: most significant 32 bits of
+ offset */
+- ulint n) /* in: number of bytes to read */
++ ulint n, /* in: number of bytes to read */
++ trx_t* trx)
+ {
+ #ifdef __WIN__
+ BOOL ret;
+@@ -2177,7 +2209,7 @@
+ os_bytes_read_since_printout += n;
+
+ try_again:
+- ret = os_file_pread(file, buf, n, offset, offset_high);
++ ret = _os_file_pread(file, buf, n, offset, offset_high, trx);
+
+ if ((ulint)ret == n) {
+
+@@ -3137,7 +3169,8 @@
+ offset */
+ ulint offset_high, /* in: most significant 32 bits of
+ offset */
+- ulint len) /* in: length of the block to read or write */
++ ulint len, /* in: length of the block to read or write */
++ trx_t* trx)
+ {
+ os_aio_slot_t* slot;
+ #ifdef WIN_ASYNC_IO
+@@ -3390,7 +3423,8 @@
+ can be used to identify a completed aio
+ operation); if mode is OS_AIO_SYNC, these
+ are ignored */
+- void* message2)
++ void* message2,
++ trx_t* trx)
+ {
+ os_aio_array_t* array;
+ os_aio_slot_t* slot;
+@@ -3429,8 +3463,8 @@
+ wait in the Windows case. */
+
+ if (type == OS_FILE_READ) {
+- return(os_file_read(file, buf, offset,
+- offset_high, n));
++ return(_os_file_read(file, buf, offset,
++ offset_high, n, trx));
+ }
+
+ ut_a(type == OS_FILE_WRITE);
+@@ -3463,8 +3497,13 @@
+ ut_error;
+ }
+
++ if (trx && type == OS_FILE_READ)
++ {
++ trx->io_reads++;
++ trx->io_read += n;
++ }
+ slot = os_aio_array_reserve_slot(type, array, message1, message2, file,
+- name, buf, offset, offset_high, n);
++ name, buf, offset, offset_high, n, trx);
+ if (type == OS_FILE_READ) {
+ if (os_aio_use_native_aio) {
+ #ifdef WIN_ASYNC_IO
+diff -r 1242d4575291 innobase/srv/srv0srv.c
+--- a/innobase/srv/srv0srv.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/srv/srv0srv.c Tue Jul 28 23:42:44 2009 -0700
+@@ -48,6 +48,8 @@
+ #include "srv0start.h"
+ #include "row0mysql.h"
+
++ibool srv_slow_log = FALSE; /* when TRUE, per-transaction statistics (distinct page gets, read I/O, lock and queue waits) are gathered for trx objects with take_stats set */
++
+ /* This is set to TRUE if the MySQL user has set it in MySQL; currently
+ affects only FOREIGN KEY definition parsing */
+ ibool srv_lower_case_table_names = FALSE;
+@@ -1002,6 +1004,10 @@
+ ibool has_slept = FALSE;
+ srv_conc_slot_t* slot = NULL;
+ ulint i;
++ ib_longlong start_time = 0L;
++ ib_longlong finish_time = 0L;
++ ulint sec;
++ ulint ms;
+
+ /* If trx has 'free tickets' to enter the engine left, then use one
+ such ticket */
+@@ -1060,6 +1066,7 @@
+ if (SRV_THREAD_SLEEP_DELAY > 0)
+ {
+ os_thread_sleep(SRV_THREAD_SLEEP_DELAY);
++ trx->innodb_que_wait_timer += SRV_THREAD_SLEEP_DELAY;
+ }
+
+ trx->op_info = "";
+@@ -1115,12 +1122,23 @@
+ /* Go to wait for the event; when a thread leaves InnoDB it will
+ release this thread */
+
++ if (srv_slow_log && trx->take_stats) {
++ ut_usectime(&sec, &ms);
++ start_time = (ib_longlong)sec * 1000000 + ms;
++ }
++
+ trx->op_info = "waiting in InnoDB queue";
+
+ os_event_wait(slot->event);
+
+ trx->op_info = "";
+
++ if (srv_slow_log && trx->take_stats && start_time) {
++ ut_usectime(&sec, &ms);
++ finish_time = (ib_longlong)sec * 1000000 + ms;
++ trx->innodb_que_wait_timer += (ulint)(finish_time - start_time);
++ }
++
+ os_fast_mutex_lock(&srv_conc_mutex);
+
+ srv_conc_n_waiting_threads--;
+diff -r 1242d4575291 innobase/trx/trx0trx.c
+--- a/innobase/trx/trx0trx.c Tue Jul 28 23:39:12 2009 -0700
++++ b/innobase/trx/trx0trx.c Tue Jul 28 23:42:44 2009 -0700
+@@ -190,6 +190,15 @@
+ trx->global_read_view_heap = mem_heap_create(256);
+ trx->global_read_view = NULL;
+ trx->read_view = NULL;
++
++ trx->io_reads = 0;
++ trx->io_read = 0;
++ trx->io_reads_wait_timer = 0;
++ trx->lock_que_wait_timer = 0;
++ trx->innodb_que_wait_timer = 0;
++ trx->distinct_page_access = 0;
++ trx->distinct_page_access_hash = NULL;
++ trx->take_stats = FALSE;
+
+ /* Set X/Open XA transaction identification to NULL */
+ memset(&trx->xid, 0, sizeof(trx->xid));
+@@ -230,6 +239,11 @@
+
+ trx->mysql_process_no = os_proc_get_number();
+
++ if (srv_slow_log && trx->take_stats) {
++ trx->distinct_page_access_hash = mem_alloc(DPAH_SIZE);
++ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
++ }
++
+ return(trx);
+ }
+
+@@ -366,6 +380,12 @@
+ /*===============*/
+ trx_t* trx) /* in, own: trx object */
+ {
++ if (trx->distinct_page_access_hash)
++ {
++ mem_free(trx->distinct_page_access_hash);
++ trx->distinct_page_access_hash= NULL;
++ }
++
+ thr_local_free(trx->mysql_thread_id);
+
+ mutex_enter(&kernel_mutex);
+@@ -389,6 +409,12 @@
+ /*====================*/
+ trx_t* trx) /* in, own: trx object */
+ {
++ if (trx->distinct_page_access_hash)
++ {
++ mem_free(trx->distinct_page_access_hash);
++ trx->distinct_page_access_hash= NULL;
++ }
++
+ mutex_enter(&kernel_mutex);
+
+ trx_free(trx);
+@@ -1064,7 +1090,10 @@
+ trx_t* trx) /* in: transaction */
+ {
+ que_thr_t* thr;
+-
++ ulint sec;
++ ulint ms;
++ ib_longlong now;
++
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+ #endif /* UNIV_SYNC_DEBUG */
+@@ -1080,6 +1109,11 @@
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+ }
+
++ if (srv_slow_log && trx->take_stats) {
++ ut_usectime(&sec, &ms);
++ now = (ib_longlong)sec * 1000000 + ms;
++ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
++ }
+ trx->que_state = TRX_QUE_RUNNING;
+ }
+
+@@ -1093,6 +1127,9 @@
+ trx_t* trx) /* in: transaction in the TRX_QUE_LOCK_WAIT state */
+ {
+ que_thr_t* thr;
++ ulint sec;
++ ulint ms;
++ ib_longlong now;
+
+ #ifdef UNIV_SYNC_DEBUG
+ ut_ad(mutex_own(&kernel_mutex));
+@@ -1109,6 +1146,11 @@
+ thr = UT_LIST_GET_FIRST(trx->wait_thrs);
+ }
+
++ if (srv_slow_log && trx->take_stats) {
++ ut_usectime(&sec, &ms);
++ now = (ib_longlong)sec * 1000000 + ms;
++ trx->lock_que_wait_timer += (ulint)(now - trx->lock_que_wait_ustarted);
++ }
+ trx->que_state = TRX_QUE_RUNNING;
+ }
+
+diff -r 1242d4575291 mysys/my_getopt.c
+--- a/mysys/my_getopt.c Tue Jul 28 23:39:12 2009 -0700
++++ b/mysys/my_getopt.c Tue Jul 28 23:42:44 2009 -0700
+@@ -827,7 +827,8 @@
+ #endif
+ break;
+ default:
+- DBUG_ASSERT((optp->var_type & GET_TYPE_MASK) == GET_ULL);
++ DBUG_ASSERT((optp->var_type & GET_TYPE_MASK) == GET_ULL
++ || (optp->var_type & GET_TYPE_MASK) == GET_MICROTIME);
+ break;
+ }
+
+@@ -1061,6 +1062,9 @@
+ case GET_ULONG:
+ printf("%lu\n", *((ulong*) value));
+ break;
++ case GET_MICROTIME:
++ printf("%6f\n", ((double)(*((longlong*) value))) / 1000000.0);
++ break;
+ case GET_LL:
+ printf("%s\n", llstr(*((longlong*) value), buff));
+ break;
+diff -r 1242d4575291 patch_info/microslow_innodb.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/microslow_innodb.info Tue Jul 28 23:42:44 2009 -0700
+@@ -0,0 +1,15 @@
++File=microslow_innodb.patch
++Name=Extended statistics in slow.log
++Version=1.2
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
++Changelog
++2008-11-26
++YK: Fix inefficient determination of trx. Avoid calling gettimeofday needlessly when the slow log is not in use. Make log_slow_queries dynamic (bool).
++
++2008-11-07
++VT: Moved log_slow_rate_limit in SHOW VARIABLES into the right place
++
++2008-11
++Arjen Lentz: Fixups (backward compatibility) by Arjen Lentz <arjen@openquery.com.au>
+diff -r 1242d4575291 scripts/mysqldumpslow.sh
+--- a/scripts/mysqldumpslow.sh Tue Jul 28 23:39:12 2009 -0700
++++ b/scripts/mysqldumpslow.sh Tue Jul 28 23:42:44 2009 -0700
+@@ -83,8 +83,8 @@
+ s/^#? Time: \d{6}\s+\d+:\d+:\d+.*\n//;
+ my ($user,$host) = s/^#? User\@Host:\s+(\S+)\s+\@\s+(\S+).*\n// ? ($1,$2) : ('','');
+
+- s/^# Query_time: (\d+) Lock_time: (\d+) Rows_sent: (\d+).*\n//;
+- my ($t, $l, $r) = ($1, $2, $3);
++ s/^# Query_time: (\d+(\.\d+)?) Lock_time: (\d+(\.\d+)?) Rows_sent: (\d+(\.\d+)?).*\n//;
++ my ($t, $l, $r) = ($1, $3, $5);
+ $t -= $l unless $opt{l};
+
+ # remove fluff that mysqld writes to log when it (re)starts:
+diff -r 1242d4575291 sql-common/my_time.c
+--- a/sql-common/my_time.c Tue Jul 28 23:39:12 2009 -0700
++++ b/sql-common/my_time.c Tue Jul 28 23:42:44 2009 -0700
+@@ -1253,3 +1253,37 @@
+ return 0;
+ }
+
++/*
++  ulonglong my_timer(ulonglong *ltime, ulonglong frequency)
++
++  For performance measurement this function returns the number
++  of microseconds since the epoch (SVr4, BSD 4.3, POSIX 1003.1-2001)
++  or system start (Windows platforms).
++
++  On Windows, frequency is the QueryPerformanceFrequency() value in
++  ticks per second (the global is set in mysqld.cc); pass 0 when the
++  performance counter is unavailable to fall back to GetTickCount(),
++  which only has millisecond resolution. Ticks are scaled to
++  microseconds in two steps so the multiplication cannot overflow.
++  The result is also stored in *ltime if ltime is not NULL; 0 on failure.
++*/
++
++ulonglong my_timer(ulonglong *ltime, ulonglong frequency)
++{
++  ulonglong newtime= 0;
++#ifdef __WIN__
++  if (frequency)
++  {
++    QueryPerformanceCounter((LARGE_INTEGER *)&newtime);
++    newtime= (newtime / frequency) * 1000000 + (newtime % frequency) * 1000000 / frequency;
++  } else
++    newtime= (ulonglong) GetTickCount() * 1000; /* GetTickCount only returns milliseconds */
++#else
++  struct timeval t;
++  if (gettimeofday(&t, NULL) != -1)
++    newtime= (ulonglong)t.tv_sec * 1000000 + t.tv_usec;
++#endif
++  if (ltime)
++    *ltime= newtime;
++  return newtime;
++}
+diff -r 1242d4575291 sql/filesort.cc
+--- a/sql/filesort.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/filesort.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -180,6 +180,7 @@
+ {
+ statistic_increment(thd->status_var.filesort_scan_count, &LOCK_status);
+ }
++ thd->query_plan_flags|= QPLAN_FILESORT;
+ #ifdef CAN_TRUST_RANGE
+ if (select && select->quick && select->quick->records > 0L)
+ {
+@@ -245,6 +246,7 @@
+ }
+ else
+ {
++ thd->query_plan_flags|= QPLAN_FILESORT_DISK;
+ if (table_sort.buffpek && table_sort.buffpek_len < maxbuffer)
+ {
+ x_free(table_sort.buffpek);
+@@ -1116,6 +1118,7 @@
+
+ statistic_increment(current_thd->status_var.filesort_merge_passes,
+ &LOCK_status);
++ current_thd->query_plan_fsort_passes++;
+ if (param->not_killable)
+ {
+ killed= &not_killable;
+diff -r 1242d4575291 sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/ha_innodb.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -1,3 +1,4 @@
++
+ /* Copyright (C) 2000-2005 MySQL AB & Innobase Oy
+
+ This program is free software; you can redistribute it and/or modify
+@@ -819,9 +820,34 @@
+ trx->check_unique_secondary = TRUE;
+ }
+
++ if (thd->variables.log_slow_verbosity & SLOG_V_INNODB) {
++ trx->take_stats = TRUE;
++ } else {
++ trx->take_stats = FALSE;
++ }
++
+ return(trx);
+ }
+
++/*************************************************************************
++Returns the trx object kept in the current thread's InnoDB ha_data slot. */
++extern "C"
++trx_t*
++innobase_get_trx()
++{
++	THD*	cur_thd = current_thd;
++
++	if (!likely(cur_thd != 0)) {
++		return(NULL);	/* no THD is attached to this thread */
++	}
++	return((trx_t*) cur_thd->ha_data[innobase_hton.slot]);
++}
++
++void
++innobase_update_var_slow_log()
++{
++ srv_slow_log = (ibool) opt_slow_log; /* copy the server's slow-log switch into InnoDB's own flag */
++}
+
+ /*************************************************************************
+ Construct ha_innobase handler. */
+@@ -1324,6 +1350,8 @@
+
+ /* -------------- Log files ---------------------------*/
+
++ srv_slow_log = (ibool) opt_slow_log;
++
+ /* The default dir for log files is the datadir of MySQL */
+
+ if (!innobase_log_group_home_dir) {
+@@ -4697,6 +4725,12 @@
+ trx->check_unique_secondary = FALSE;
+ }
+
++ if (thd->variables.log_slow_verbosity & SLOG_V_INNODB) {
++ trx->take_stats = TRUE;
++ } else {
++ trx->take_stats = FALSE;
++ }
++
+ if (lower_case_table_names) {
+ srv_lower_case_table_names = TRUE;
+ } else {
+@@ -4962,6 +4996,12 @@
+ trx->check_unique_secondary = FALSE;
+ }
+
++ if (thd->variables.log_slow_verbosity & SLOG_V_INNODB) {
++ trx->take_stats = TRUE;
++ } else {
++ trx->take_stats = FALSE;
++ }
++
+ name_len = strlen(name);
+
+ assert(name_len < 1000);
+@@ -5049,6 +5089,12 @@
+ trx->check_foreigns = FALSE;
+ }
+
++ if (current_thd->variables.log_slow_verbosity & SLOG_V_INNODB) {
++ trx->take_stats = TRUE;
++ } else {
++ trx->take_stats = FALSE;
++ }
++
+ error = row_drop_database_for_mysql(namebuf, trx);
+ my_free(namebuf, MYF(0));
+
+@@ -5115,6 +5161,12 @@
+ trx->check_foreigns = FALSE;
+ }
+
++ if (current_thd->variables.log_slow_verbosity & SLOG_V_INNODB) {
++ trx->take_stats = TRUE;
++ } else {
++ trx->take_stats = FALSE;
++ }
++
+ name_len1 = strlen(from);
+ name_len2 = strlen(to);
+
+@@ -6122,6 +6174,7 @@
+ {
+ row_prebuilt_t* prebuilt = (row_prebuilt_t*) innobase_prebuilt;
+ trx_t* trx;
++ int i;
+
+ DBUG_ENTER("ha_innobase::external_lock");
+ DBUG_PRINT("enter",("lock_type: %d", lock_type));
+@@ -6245,7 +6298,24 @@
+
+ if (trx->n_mysql_tables_in_use == 0) {
+
+- trx->mysql_n_tables_locked = 0;
++ current_thd->innodb_was_used = TRUE;
++ current_thd->innodb_io_reads += trx->io_reads;
++ current_thd->innodb_io_read += trx->io_read;
++ current_thd->innodb_io_reads_wait_timer += trx->io_reads_wait_timer;
++ current_thd->innodb_lock_que_wait_timer += trx->lock_que_wait_timer;
++ current_thd->innodb_innodb_que_wait_timer += trx->innodb_que_wait_timer;
++ current_thd->innodb_page_access += trx->distinct_page_access;
++
++ trx->io_reads = 0;
++ trx->io_read = 0;
++ trx->io_reads_wait_timer = 0;
++ trx->lock_que_wait_timer = 0;
++ trx->innodb_que_wait_timer = 0;
++ trx->distinct_page_access = 0;
++ if (trx->distinct_page_access_hash)
++ memset(trx->distinct_page_access_hash, 0, DPAH_SIZE);
++
++ trx->mysql_n_tables_locked = 0;
+ prebuilt->used_in_HANDLER = FALSE;
+
+ if (!(thd->options & (OPTION_NOT_AUTOCOMMIT | OPTION_BEGIN))) {
+diff -r 1242d4575291 sql/ha_innodb.h
+--- a/sql/ha_innodb.h Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/ha_innodb.h Tue Jul 28 23:42:44 2009 -0700
+@@ -271,6 +271,8 @@
+
+ int innobase_start_trx_and_assign_read_view(THD* thd);
+
++void innobase_update_var_slow_log();
++
+ /***********************************************************************
+ This function is used to prepare X/Open XA distributed transaction */
+
+diff -r 1242d4575291 sql/log.cc
+--- a/sql/log.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/log.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -2289,11 +2289,12 @@
+ */
+
+ bool MYSQL_LOG::write(THD *thd,const char *query, uint query_length,
+- time_t query_start_arg)
++ time_t query_start_arg, ulonglong query_start_timer)
+ {
+ bool error=0;
+ time_t current_time;
+- if (!is_open())
++ ulonglong current_timer;
++ if (!opt_slow_log || !is_open())
+ return 0;
+ DBUG_ENTER("MYSQL_LOG::write");
+
+@@ -2303,7 +2304,8 @@
+ int tmp_errno=0;
+ char buff[80],*end;
+ end=buff;
+- if (!(thd->options & OPTION_UPDATE_LOG))
++ if (!(thd->options & OPTION_UPDATE_LOG) &&
++ !(thd->slave_thread && opt_log_slow_slave_statements))
+ {
+ VOID(pthread_mutex_unlock(&LOCK_log));
+ DBUG_RETURN(0);
+@@ -2333,22 +2335,72 @@
+ if (my_b_printf(&log_file, "# User@Host: %s[%s] @ %s [%s]\n",
+ sctx->priv_user ?
+ sctx->priv_user : "",
+- sctx->user ? sctx->user : "",
++ sctx->user ? sctx->user : (thd->slave_thread ? "SQL_SLAVE" : ""),
+ sctx->host ? sctx->host : "",
+ sctx->ip ? sctx->ip : "") ==
+ (uint) -1)
+ tmp_errno=errno;
+ }
+- if (query_start_arg)
++ if (query_start_timer)
+ {
++ char buf[5][20];
++ ulonglong current_timer= my_timer(&current_timer, frequency);
++ snprintf(buf[0], 20, "%.6f", (current_timer ? (current_timer - query_start_timer):0) / 1000000.0);
++ snprintf(buf[1], 20, "%.6f", (thd->timer_after_lock - query_start_timer) / 1000000.0);
++ if (!query_length)
++ {
++ thd->sent_row_count= thd->examined_row_count= 0;
++ thd->row_count= 0;
++ thd->innodb_was_used= FALSE;
++ thd->query_plan_flags= QPLAN_NONE;
++ thd->query_plan_fsort_passes= 0;
++ }
++
+ /* For slow query log */
+ if (my_b_printf(&log_file,
+- "# Query_time: %lu Lock_time: %lu Rows_sent: %lu Rows_examined: %lu\n",
+- (ulong) (current_time - query_start_arg),
+- (ulong) (thd->time_after_lock - query_start_arg),
++ "# Thread_id: %lu Schema: %s\n" \
++ "# Query_time: %s Lock_time: %s Rows_sent: %lu Rows_examined: %lu Rows_affected: %lu Rows_read: %lu\n",
++ (ulong) thd->thread_id, (thd->db ? thd->db : ""),
++ buf[0], buf[1],
+ (ulong) thd->sent_row_count,
+- (ulong) thd->examined_row_count) == (uint) -1)
++ (ulong) thd->examined_row_count,
++ ((long) thd->row_count_func > 0 ) ? (ulong) thd->row_count_func : 0,
++ (ulong) thd->row_count) == (uint) -1)
+ tmp_errno=errno;
++ if ((thd->variables.log_slow_verbosity & SLOG_V_QUERY_PLAN) &&
++ my_b_printf(&log_file,
++ "# QC_Hit: %s Full_scan: %s Full_join: %s Tmp_table: %s Tmp_table_on_disk: %s\n" \
++ "# Filesort: %s Filesort_on_disk: %s Merge_passes: %lu\n",
++ ((thd->query_plan_flags & QPLAN_QC) ? "Yes" : "No"),
++ ((thd->query_plan_flags & QPLAN_FULL_SCAN) ? "Yes" : "No"),
++ ((thd->query_plan_flags & QPLAN_FULL_JOIN) ? "Yes" : "No"),
++ ((thd->query_plan_flags & QPLAN_TMP_TABLE) ? "Yes" : "No"),
++ ((thd->query_plan_flags & QPLAN_TMP_DISK) ? "Yes" : "No"),
++ ((thd->query_plan_flags & QPLAN_FILESORT) ? "Yes" : "No"),
++ ((thd->query_plan_flags & QPLAN_FILESORT_DISK) ? "Yes" : "No"),
++ thd->query_plan_fsort_passes) == (uint) -1)
++ tmp_errno=errno;
++ if ((thd->variables.log_slow_verbosity & SLOG_V_INNODB) && thd->innodb_was_used)
++ {
++ snprintf(buf[2], 20, "%.6f", thd->innodb_io_reads_wait_timer / 1000000.0);
++ snprintf(buf[3], 20, "%.6f", thd->innodb_lock_que_wait_timer / 1000000.0);
++ snprintf(buf[4], 20, "%.6f", thd->innodb_innodb_que_wait_timer / 1000000.0);
++ if (my_b_printf(&log_file,
++ "# InnoDB_IO_r_ops: %lu InnoDB_IO_r_bytes: %lu InnoDB_IO_r_wait: %s\n" \
++ "# InnoDB_rec_lock_wait: %s InnoDB_queue_wait: %s\n" \
++ "# InnoDB_pages_distinct: %lu\n",
++ (ulong) thd->innodb_io_reads,
++ (ulong) thd->innodb_io_read,
++ buf[2], buf[3], buf[4],
++ (ulong) thd->innodb_page_access) == (uint) -1)
++ tmp_errno=errno;
++ }
++ else
++ {
++ if ((thd->variables.log_slow_verbosity & SLOG_V_INNODB) &&
++ my_b_printf(&log_file,"# No InnoDB statistics available for this query\n") == (uint) -1)
++ tmp_errno=errno;
++ }
+ }
+ if (thd->db && strcmp(thd->db,db))
+ { // Database changed
+diff -r 1242d4575291 sql/log_event.cc
+--- a/sql/log_event.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/log_event.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -2061,6 +2061,7 @@
+ /* Execute the query (note that we bypass dispatch_command()) */
+ const char* found_semicolon= NULL;
+ mysql_parse(thd, thd->query, thd->query_length, &found_semicolon);
++ log_slow_statement(thd);
+
+ }
+ else
+diff -r 1242d4575291 sql/mysql_priv.h
+--- a/sql/mysql_priv.h Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/mysql_priv.h Tue Jul 28 23:42:44 2009 -0700
+@@ -507,6 +507,78 @@
+
+ #define STRING_BUFFER_USUAL_SIZE 80
+
++/* Slow log: option-name tables and flag bits for the extended slow query log */
++
++struct msl_opts
++{
++  ulong val;            /* flag bit(s) selected by this name */
++  const char *name;     /* option name; a NULL name ends a table */
++};
++/* Bits tested against log_slow_verbosity; parenthesized so they expand safely in any expression. */
++#define SLOG_V_MICROTIME  (1 << 0)
++#define SLOG_V_QUERY_PLAN (1 << 1)
++#define SLOG_V_INNODB     (1 << 2)
++/* ... */
++#define SLOG_V_INVALID    (1 << 31)
++#define SLOG_V_NONE       SLOG_V_MICROTIME
++
++static const struct msl_opts slog_verb[]=
++{
++  /* Basic flags */
++
++  { SLOG_V_MICROTIME, "microtime" },
++  { SLOG_V_QUERY_PLAN, "query_plan" },
++  { SLOG_V_INNODB, "innodb" },
++
++  /* End of basic flags */
++
++  { 0, "" },    /* empty name: contributes no flag bits */
++
++  /* Complex flags */
++
++  { SLOG_V_MICROTIME, "minimal" },
++  { SLOG_V_MICROTIME|SLOG_V_QUERY_PLAN, "standard" },
++  { SLOG_V_MICROTIME|SLOG_V_QUERY_PLAN|SLOG_V_INNODB, "full" },
++
++  /* End of complex flags */
++
++  { SLOG_V_INVALID, (char *)0 }
++};
++/* Query plan bits accumulated in thd->query_plan_flags. */
++#define QPLAN_NONE          0
++#define QPLAN_QC            (1 << 0)
++#define QPLAN_QC_NO         (1 << 1)
++#define QPLAN_FULL_SCAN     (1 << 2)
++#define QPLAN_FULL_JOIN     (1 << 3)
++#define QPLAN_TMP_TABLE     (1 << 4)
++#define QPLAN_TMP_DISK      (1 << 5)
++#define QPLAN_FILESORT      (1 << 6)
++#define QPLAN_FILESORT_DISK (1 << 7)
++/* ... */
++#define QPLAN_MAX           (1 << 31)
++/* log_slow_filter bits, expressed as aliases of the QPLAN_* flags. */
++#define SLOG_F_QC_NO         QPLAN_QC_NO
++#define SLOG_F_FULL_SCAN     QPLAN_FULL_SCAN
++#define SLOG_F_FULL_JOIN     QPLAN_FULL_JOIN
++#define SLOG_F_TMP_TABLE     QPLAN_TMP_TABLE
++#define SLOG_F_TMP_DISK      QPLAN_TMP_DISK
++#define SLOG_F_FILESORT      QPLAN_FILESORT
++#define SLOG_F_FILESORT_DISK QPLAN_FILESORT_DISK
++#define SLOG_F_INVALID       (1 << 31)
++#define SLOG_F_NONE          0
++
++static const struct msl_opts slog_filter[]=
++{
++  { SLOG_F_QC_NO, "qc_miss" },
++  { SLOG_F_FULL_SCAN, "full_scan" },
++  { SLOG_F_FULL_JOIN, "full_join" },
++  { SLOG_F_TMP_TABLE, "tmp_table" },
++  { SLOG_F_TMP_DISK, "tmp_table_on_disk" },
++  { SLOG_F_FILESORT, "filesort" },
++  { SLOG_F_FILESORT_DISK, "filesort_on_disk" },
++  { SLOG_F_INVALID, (char *)0 }
++};
++
+ enum enum_parsing_place
+ {
+ NO_MATTER,
+@@ -1365,6 +1437,7 @@
+ extern bool using_update_log, opt_large_files, server_id_supplied;
+ extern bool opt_update_log, opt_bin_log, opt_error_log;
+ extern my_bool opt_log, opt_slow_log, opt_log_queries_not_using_indexes;
++extern char *opt_slow_logname;
+ extern bool opt_disable_networking, opt_skip_show_db;
+ extern my_bool opt_character_set_client_handshake;
+ extern bool volatile abort_loop, shutdown_in_progress, grant_option;
+@@ -1376,7 +1449,8 @@
+ extern my_bool opt_enable_named_pipe, opt_sync_frm, opt_allow_suspicious_udfs;
+ extern my_bool opt_secure_auth;
+ extern char* opt_secure_file_priv;
+-extern my_bool opt_log_slow_admin_statements;
++extern my_bool opt_log_slow_admin_statements, opt_log_slow_slave_statements;
++extern my_bool opt_use_global_long_query_time;
+ extern my_bool sp_automatic_privileges, opt_noacl;
+ extern my_bool opt_old_style_user_limits, trust_function_creators;
+ extern uint opt_crash_binlog_innodb;
+diff -r 1242d4575291 sql/mysqld.cc
+--- a/sql/mysqld.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/mysqld.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -176,7 +176,6 @@
+ static void getvolumeID(BYTE *volumeName);
+ #endif /* __NETWARE__ */
+
+-
+ #ifdef _AIX41
+ int initgroups(const char *,unsigned int);
+ #endif
+@@ -411,10 +410,13 @@
+ my_bool opt_secure_auth= 0;
+ char* opt_secure_file_priv= 0;
+ my_bool opt_log_slow_admin_statements= 0;
++my_bool opt_log_slow_slave_statements= 0;
++my_bool opt_use_global_long_query_time= 0;
+ my_bool lower_case_file_system= 0;
+ my_bool opt_large_pages= 0;
+ uint opt_large_page_size= 0;
+ my_bool opt_old_style_user_limits= 0, trust_function_creators= 0;
++char* opt_slow_logname= 0;
+ /*
+ True if there is at least one per-hour limit for some user, so we should
+ check them before each query (and possibly reset counters when hour is
+@@ -509,6 +511,7 @@
+ Ge_creator ge_creator;
+ Le_creator le_creator;
+
++ulonglong frequency= 0;
+
+ FILE *bootstrap_file;
+ int bootstrap_error;
+@@ -588,7 +591,7 @@
+ static int cleanup_done;
+ static ulong opt_specialflag, opt_myisam_block_size;
+ static char *opt_logname, *opt_update_logname, *opt_binlog_index_name;
+-static char *opt_slow_logname, *opt_tc_heuristic_recover;
++static char *opt_tc_heuristic_recover;
+ static char *mysql_home_ptr, *pidfile_name_ptr;
+ static char **defaults_argv;
+ static char *opt_bin_logname;
+@@ -3697,6 +3700,8 @@
+ unireg_abort(1);
+ }
+ }
++ if (!QueryPerformanceFrequency((LARGE_INTEGER *)&frequency))
++ frequency= 0;
+ #endif /* __WIN__ */
+
+ if (init_common_variables(MYSQL_CONFIG_NAME,
+@@ -4947,7 +4952,7 @@
+ OPT_INTERACTIVE_TIMEOUT, OPT_JOIN_BUFF_SIZE,
+ OPT_KEY_BUFFER_SIZE, OPT_KEY_CACHE_BLOCK_SIZE,
+ OPT_KEY_CACHE_DIVISION_LIMIT, OPT_KEY_CACHE_AGE_THRESHOLD,
+- OPT_LONG_QUERY_TIME,
++ OPT_LONG_QUERY_TIME, OPT_MIN_EXAMINED_ROW_LIMIT,
+ OPT_LOWER_CASE_TABLE_NAMES, OPT_MAX_ALLOWED_PACKET,
+ OPT_MAX_BINLOG_CACHE_SIZE, OPT_MAX_BINLOG_SIZE,
+ OPT_MAX_CONNECTIONS, OPT_MAX_CONNECT_ERRORS,
+@@ -5038,11 +5043,18 @@
+ OPT_TIMED_MUTEXES,
+ OPT_OLD_STYLE_USER_LIMITS,
+ OPT_LOG_SLOW_ADMIN_STATEMENTS,
++ OPT_LOG_SLOW_SLAVE_STATEMENTS,
++ OPT_LOG_SLOW_RATE_LIMIT,
++ OPT_LOG_SLOW_VERBOSITY,
++ OPT_LOG_SLOW_FILTER,
+ OPT_TABLE_LOCK_WAIT_TIMEOUT,
+ OPT_PLUGIN_DIR,
+ OPT_PORT_OPEN_TIMEOUT,
+ OPT_MERGE,
+ OPT_PROFILING,
++ OPT_SLOW_LOG,
++ OPT_SLOW_QUERY_LOG_FILE,
++ OPT_USE_GLOBAL_LONG_QUERY_TIME,
+ OPT_INNODB_ROLLBACK_ON_TIMEOUT,
+ OPT_SECURE_FILE_PRIV,
+ OPT_KEEP_FILES_ON_CREATE,
+@@ -5441,10 +5453,19 @@
+ (gptr*) &opt_log_slow_admin_statements,
+ (gptr*) &opt_log_slow_admin_statements,
+ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
++ {"log-slow-slave-statements", OPT_LOG_SLOW_SLAVE_STATEMENTS,
++ "Log slow replicated statements to the slow log if it is open.",
++ (gptr*) &opt_log_slow_slave_statements,
++ (gptr*) &opt_log_slow_slave_statements,
++ 0, GET_BOOL, NO_ARG, 0, 0, 0, 0, 0, 0},
+ {"log-slow-queries", OPT_SLOW_QUERY_LOG,
+ "Log slow queries to this log file. Defaults logging to hostname-slow.log file. Must be enabled to activate other slow log options.",
+ (gptr*) &opt_slow_logname, (gptr*) &opt_slow_logname, 0, GET_STR, OPT_ARG,
+ 0, 0, 0, 0, 0, 0},
++ {"slow_query_log_file", OPT_SLOW_QUERY_LOG_FILE,
++ "Log slow queries to given log file. Defaults logging to hostname-slow.log. Must be enabled to activate other slow log options.",
++ (gptr*) &opt_slow_logname, (gptr*) &opt_slow_logname, 0, GET_STR, OPT_ARG,
++ 0, 0, 0, 0, 0, 0},
+ {"log-tc", OPT_LOG_TC,
+ "Path to transaction coordinator log (used for transactions that affect "
+ "more than one storage engine, when binary log is disabled)",
+@@ -5808,6 +5829,9 @@
+ "Tells the slave thread to continue replication when a query returns an error from the provided list.",
+ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+ #endif
++ {"slow-query-log", OPT_SLOW_LOG,
++ "Enable|disable slow query log", (gptr*) &opt_slow_log,
++ (gptr*) &opt_slow_log, 0, GET_BOOL, OPT_ARG, 0, 0, 0, 0, 0, 0},
+ {"socket", OPT_SOCKET, "Socket file to use for connection.",
+ (gptr*) &mysqld_unix_port, (gptr*) &mysqld_unix_port, 0, GET_STR,
+ REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
+@@ -6110,11 +6134,31 @@
+ (gptr*) 0,
+ 0, (GET_ULONG | GET_ASK_ADDR) , REQUIRED_ARG, 100,
+ 1, 100, 0, 1, 0},
+- {"long_query_time", OPT_LONG_QUERY_TIME,
+- "Log all queries that have taken more than long_query_time seconds to execute to file.",
+- (gptr*) &global_system_variables.long_query_time,
+- (gptr*) &max_system_variables.long_query_time, 0, GET_ULONG,
+- REQUIRED_ARG, 10, 1, LONG_TIMEOUT, 0, 1, 0},
++ {"log_slow_filter", OPT_LOG_SLOW_FILTER,
++ "Log only the queries that followed certain execution plan. Multiple flags allowed in a comma-separated string. [qc_miss, full_scan, full_join, tmp_table, tmp_table_on_disk, filesort, filesort_on_disk]",
++ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, SLOG_F_NONE, 0, 0},
++ {"log_slow_rate_limit", OPT_LOG_SLOW_RATE_LIMIT,
++ "Rate limit statement writes to slow log to only those from every (1/log_slow_rate_limit) session.",
++ (gptr*) &global_system_variables.log_slow_rate_limit,
++ (gptr*) &max_system_variables.log_slow_rate_limit, 0, GET_ULONG,
++ REQUIRED_ARG, 1, 1, LONG_MAX, 0, 1L, 0},
++ {"log_slow_verbosity", OPT_LOG_SLOW_VERBOSITY,
++ "Choose how verbose the messages to your slow log will be. Multiple flags allowed in a comma-separated string. [microtime, query_plan, innodb]",
++ 0, 0, 0, GET_STR, REQUIRED_ARG, 0, 0, 0, SLOG_V_MICROTIME, 0, 0},
++ {"long_query_time", OPT_LONG_QUERY_TIME,
++ "Log all queries that have taken more than long_query_time seconds to execute to file.",
++ (gptr*) &global_system_variables.long_query_time,
++ (gptr*) &max_system_variables.long_query_time, 0, GET_MICROTIME,
++ REQUIRED_ARG, 10000000, 0, LONG_TIMEOUT * 1000000, 0, 1, 0},
++ {"min_examined_row_limit", OPT_MIN_EXAMINED_ROW_LIMIT,
++ "Don't log queries which examine less than min_examined_row_limit rows to file.",
++ (gptr*) &global_system_variables.min_examined_row_limit,
++ (gptr*) &max_system_variables.min_examined_row_limit, 0, GET_ULONG,
++ REQUIRED_ARG, 0, 0, LONG_MAX, 0, 1L, 0},
++ {"use_global_long_query_time", OPT_USE_GLOBAL_LONG_QUERY_TIME,
++ "Control always use global long_query_time or local long_query_time.",
++ (gptr*) &opt_use_global_long_query_time, (gptr*) &opt_use_global_long_query_time,
++ 0, GET_BOOL, NO_ARG, 0, 0, 1, 0, 1, 0},
+ {"lower_case_table_names", OPT_LOWER_CASE_TABLE_NAMES,
+ "If set to 1 table names are stored in lowercase on disk and table names will be case-insensitive. Should be set to 2 if you are using a case insensitive file system",
+ (gptr*) &lower_case_table_names,
+@@ -6893,7 +6937,11 @@
+ global_system_variables.max_join_size= (ulonglong) HA_POS_ERROR;
+ max_system_variables.max_join_size= (ulonglong) HA_POS_ERROR;
+ global_system_variables.old_passwords= 0;
+-
++ global_system_variables.long_query_time = 10000000;
++ max_system_variables.long_query_time = LONG_TIMEOUT * 1000000;
++ global_system_variables.log_slow_verbosity= SLOG_V_MICROTIME;
++ global_system_variables.log_slow_filter= SLOG_F_NONE;
++
+ /*
+ Default behavior for 4.1 and 5.0 is to treat NULL values as unequal
+ when collecting index statistics for MyISAM tables.
+@@ -7364,6 +7412,35 @@
+ case OPT_BOOTSTRAP:
+ opt_noacl=opt_bootstrap=1;
+ break;
++ case OPT_LOG_SLOW_FILTER:
++ if ((global_system_variables.log_slow_filter=
++ msl_flag_resolve_by_name(slog_filter, argument,
++ SLOG_F_NONE, SLOG_F_INVALID)) == SLOG_F_INVALID)
++ {
++ fprintf(stderr,"Invalid argument in log_slow_filter: %s\n", argument);
++ exit(1);
++ }
++ break;
++ case OPT_LOG_SLOW_VERBOSITY:
++ if ((global_system_variables.log_slow_verbosity=
++ msl_flag_resolve_by_name(slog_verb, argument,
++ SLOG_V_NONE, SLOG_V_INVALID)) == SLOG_V_INVALID)
++ {
++ fprintf(stderr,"Invalid argument in log_slow_verbosity: %s\n", argument);
++ exit(1);
++ }
++ break;
++ case OPT_LONG_QUERY_TIME:
++ {
++ double doubleslow = strtod(argument,NULL);
++ if (doubleslow < 0 || doubleslow > (LONG_TIMEOUT))
++ {
++ fprintf(stderr,"Out of range long_query_time value: %s\n", argument);
++ exit(1);
++ }
++ global_system_variables.long_query_time = (ulonglong) (doubleslow * 1000000);
++ break;
++ }
+ case OPT_STORAGE_ENGINE:
+ {
+ if ((enum db_type)((global_system_variables.table_type=
+@@ -7696,10 +7773,14 @@
+ if (opt_bdb)
+ sql_print_warning("this binary does not contain BDB storage engine");
+ #endif
+- if ((opt_log_slow_admin_statements || opt_log_queries_not_using_indexes) &&
++ if ((opt_log_slow_admin_statements || opt_log_queries_not_using_indexes ||
++ opt_log_slow_slave_statements) &&
+ !opt_slow_log)
+- sql_print_warning("options --log-slow-admin-statements and --log-queries-not-using-indexes have no effect if --log-slow-queries is not set");
+-
++ {
++ sql_print_warning("options --log-slow-admin-statements, --log-slow-slave-statements and --log-queries-not-using-indexes have no effect if --log-slow-queries is not set");
++ opt_log_slow_slave_statements= FALSE;
++ }
++
+ if (argc > 0)
+ {
+ fprintf(stderr, "%s: Too many arguments (first extra is '%s').\nUse --help to get a list of available options\n", my_progname, *argv);
+diff -r 1242d4575291 sql/set_var.cc
+--- a/sql/set_var.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/set_var.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -217,9 +217,13 @@
+ sys_log_queries_not_using_indexes("log_queries_not_using_indexes",
+ &opt_log_queries_not_using_indexes);
+ sys_var_thd_ulong sys_log_warnings("log_warnings", &SV::log_warnings);
+-sys_var_thd_ulong sys_long_query_time("long_query_time",
++sys_var_thd_microtime sys_long_query_time("long_query_time",
+ &SV::long_query_time);
++sys_var_bool_ptr sys_use_global_long_query_time("use_global_long_query_time",
++ &opt_use_global_long_query_time);
+ sys_var_bool_const_ptr sys_log_slow("log_slow_queries", &opt_slow_log);
++sys_var_log_slow sys_slow_query_log("slow_query_log", &opt_slow_log);
++sys_var_const_str_ptr sys_slow_query_log_file("slow_query_log_file", &opt_slow_logname);
+ sys_var_thd_bool sys_low_priority_updates("low_priority_updates",
+ &SV::low_priority_updates,
+ fix_low_priority_updates);
+@@ -283,6 +287,8 @@
+ &SV::max_tmp_tables);
+ sys_var_long_ptr sys_max_write_lock_count("max_write_lock_count",
+ &max_write_lock_count);
++sys_var_thd_ulong sys_min_examined_row_limit("min_examined_row_limit",
++ &SV::min_examined_row_limit);
+ sys_var_thd_ulong sys_multi_range_count("multi_range_count",
+ &SV::multi_range_count);
+ sys_var_long_ptr sys_myisam_data_pointer_size("myisam_data_pointer_size",
+@@ -327,6 +333,20 @@
+ sys_var_bool_ptr sys_relay_log_purge("relay_log_purge",
+ &relay_log_purge);
+ #endif
++sys_var_thd_ulong sys_log_slow_rate_limit("log_slow_rate_limit",
++ &SV::log_slow_rate_limit);
++sys_var_thd_msl_flag sys_log_slow_filter("log_slow_filter",
++ &SV::log_slow_filter,
++ SLOG_F_NONE,
++ SLOG_F_NONE,
++ SLOG_F_INVALID,
++ slog_filter);
++sys_var_thd_msl_flag sys_log_slow_verbosity("log_slow_verbosity",
++ &SV::log_slow_verbosity,
++ SLOG_V_NONE,
++ SLOG_V_MICROTIME,
++ SLOG_V_INVALID,
++ slog_verb);
+ sys_var_long_ptr sys_rpl_recovery_rank("rpl_recovery_rank",
+ &rpl_recovery_rank);
+ sys_var_long_ptr sys_query_cache_size("query_cache_size",
+@@ -697,6 +717,10 @@
+ &sys_log_off,
+ &sys_log_queries_not_using_indexes,
+ &sys_log_slow,
++ &sys_log_slow_filter,
++ &sys_log_slow_rate_limit,
++ &sys_log_slow_verbosity,
++ &sys_use_global_long_query_time,
+ &sys_log_update,
+ &sys_log_warnings,
+ &sys_long_query_time,
+@@ -720,6 +744,7 @@
+ &sys_max_tmp_tables,
+ &sys_max_user_connections,
+ &sys_max_write_lock_count,
++ &sys_min_examined_row_limit,
+ &sys_multi_range_count,
+ &sys_myisam_data_pointer_size,
+ &sys_myisam_max_sort_file_size,
+@@ -773,6 +798,8 @@
+ &sys_slave_skip_counter,
+ #endif
+ &sys_slow_launch_time,
++ &sys_slow_query_log,
++ &sys_slow_query_log_file,
+ &sys_sort_buffer,
+ &sys_sql_big_tables,
+ &sys_sql_low_priority_updates,
+@@ -994,8 +1021,11 @@
+ {"log_slave_updates", (char*) &opt_log_slave_updates, SHOW_MY_BOOL},
+ #endif
+ {sys_log_slow.name, (char*) &sys_log_slow, SHOW_SYS},
++ {sys_log_slow_filter.name, (char*) &sys_log_slow_filter, SHOW_SYS},
++ {sys_log_slow_rate_limit.name, (char*) &sys_log_slow_rate_limit, SHOW_SYS},
++ {sys_log_slow_verbosity.name, (char*) &sys_log_slow_verbosity, SHOW_SYS},
+ {sys_log_warnings.name, (char*) &sys_log_warnings, SHOW_SYS},
+- {sys_long_query_time.name, (char*) &sys_long_query_time, SHOW_SYS},
++ {sys_long_query_time.name, (char*) &sys_long_query_time, SHOW_MICROTIME},
+ {sys_low_priority_updates.name, (char*) &sys_low_priority_updates, SHOW_SYS},
+ {"lower_case_file_system", (char*) &lower_case_file_system, SHOW_MY_BOOL},
+ {"lower_case_table_names", (char*) &lower_case_table_names, SHOW_INT},
+@@ -1022,6 +1052,7 @@
+ {sys_max_tmp_tables.name, (char*) &sys_max_tmp_tables, SHOW_SYS},
+ {sys_max_user_connections.name,(char*) &sys_max_user_connections, SHOW_SYS},
+ {sys_max_write_lock_count.name, (char*) &sys_max_write_lock_count,SHOW_SYS},
++ {sys_min_examined_row_limit.name, (char*) &sys_min_examined_row_limit, SHOW_SYS},
+ {sys_multi_range_count.name, (char*) &sys_multi_range_count, SHOW_SYS},
+ {sys_myisam_data_pointer_size.name, (char*) &sys_myisam_data_pointer_size, SHOW_SYS},
+ {sys_myisam_max_sort_file_size.name, (char*) &sys_myisam_max_sort_file_size,
+@@ -1109,6 +1140,8 @@
+ {sys_slave_trans_retries.name,(char*) &sys_slave_trans_retries, SHOW_SYS},
+ #endif
+ {sys_slow_launch_time.name, (char*) &sys_slow_launch_time, SHOW_SYS},
++ {sys_slow_query_log.name, (char*) &sys_slow_query_log, SHOW_SYS},
++ {sys_slow_query_log_file.name,(char*) &sys_slow_query_log_file, SHOW_SYS},
+ #ifdef HAVE_SYS_UN_H
+ {"socket", (char*) &mysqld_unix_port, SHOW_CHAR_PTR},
+ #endif
+@@ -1149,6 +1182,7 @@
+ {sys_tx_isolation.name, (char*) &sys_tx_isolation, SHOW_SYS},
+ {sys_updatable_views_with_limit.name,
+ (char*) &sys_updatable_views_with_limit,SHOW_SYS},
++ {sys_use_global_long_query_time.name, (char*) &sys_use_global_long_query_time, SHOW_SYS},
+ {sys_version.name, (char*) &sys_version, SHOW_SYS},
+ #ifdef HAVE_BERKELEY_DB
+ {sys_version_bdb.name, (char*) &sys_version_bdb, SHOW_SYS},
+@@ -1777,6 +1811,17 @@
+ }
+
+
++bool sys_var_thd_microtime::check(THD *thd, set_var *var)
++{
++  Item *new_value= var->value;
++  /* Scale by 1000000; a decimal value is read as a real, anything else as an integer. */
++  var->save_result.ulonglong_value= (new_value->result_type() == DECIMAL_RESULT)
++    ? (ulonglong)(new_value->val_real() * 1000000)
++    : (ulonglong)(new_value->val_int() * 1000000);
++  return 0;
++}
++
++
+ bool sys_var_thd_bool::update(THD *thd, set_var *var)
+ {
+ if (var->type == OPT_GLOBAL)
+@@ -1933,6 +1978,19 @@
+ pthread_mutex_unlock(&LOCK_global_system_variables);
+ return new Item_int(value);
+ }
++ case SHOW_MICROTIME:
++ {
++ longlong value;
++ char buff[80];
++ int len;
++
++ pthread_mutex_lock(&LOCK_global_system_variables);
++ value= *(longlong*) value_ptr(thd, var_type, base);
++ pthread_mutex_unlock(&LOCK_global_system_variables);
++
++ len = snprintf(buff, 80, "%f", ((double) value) / 1000000.0);
++ return new Item_float(buff,len);
++ }
+ case SHOW_HA_ROWS:
+ {
+ ha_rows value;
+@@ -2765,6 +2823,30 @@
+ }
+
+
++bool sys_var_log_slow::update(THD *thd, set_var *var)
++{
++ bool ret;
++ /* When the new value is non-zero, open the slow log first if it is not open yet (under LOCK_global_system_variables). */
++ pthread_mutex_lock(&LOCK_global_system_variables);
++ if (var->save_result.ulong_value)
++ {
++ if(!mysql_slow_log.is_open())
++ {
++ mysql_slow_log.open_slow_log(opt_slow_logname);
++ }
++ }
++ pthread_mutex_unlock(&LOCK_global_system_variables);
++ /* NOTE(review): the result of open_slow_log() is not checked and nothing here closes the log when the value is 0 -- confirm intended. */
++ ret = sys_var_bool_ptr::update(thd, var);
++ /* InnoDB, when compiled in, is notified only after the base-class update() has run. */
++#ifdef HAVE_INNOBASE_DB
++ innobase_update_var_slow_log();
++#endif
++
++ return(ret);
++}
++
++
+ #ifdef HAVE_REPLICATION
+ bool sys_var_slave_skip_counter::check(THD *thd, set_var *var)
+ {
+@@ -3549,6 +3631,191 @@
+ #endif
+ }
+
++/* Slow log stuff */
++
++ulong msl_option_resolve_by_name(const struct msl_opts *opts, const char *name, ulong len)
++{
++  /* Return the val of the entry whose name collates equal to name[0..len); if none does, the val of the NULL-named terminator. */
++  const struct msl_opts *entry;
++  for (entry= opts; entry->name; entry++)
++  {
++    if (my_strnncoll(&my_charset_latin1,
++                     (const uchar *) name, len,
++                     (const uchar *) entry->name, strlen(entry->name)) == 0)
++      break;
++  }
++  return entry->val;
++}
++
++ulong msl_flag_resolve_by_name(const struct msl_opts *opts, const char *names_list,
++ const ulong none_val, const ulong invalid_val)
++{
++ const char *p, *e;
++ ulong val= none_val;
++ /* ORs together the val of every comma-separated name in names_list, starting from none_val; an empty list yields none_val. */
++ if (!*names_list)
++ return val;
++ /* p is the start of the current token, e scans forward to the next ',' or NUL; names_list must therefore be NUL-terminated. */
++ for (p= e= names_list; ; e++)
++ {
++ ulong i;
++ /* Keep scanning until e is on a token boundary, then look the token [p, e) up in opts. */
++ if (*e != ',' && *e)
++ continue;
++ for (i=0; opts[i].name; i++)
++ {
++ if (!my_strnncoll(&my_charset_latin1,
++ (const uchar *)p, e - p,
++ (const uchar *)opts[i].name, strlen(opts[i].name)))
++ {
++ val= val | opts[i].val;
++ break;
++ }
++ }
++ if (opts[i].val == invalid_val) /* i is the matched entry, or the NULL-named terminator when nothing matched */
++ return invalid_val;
++ if (!*e)
++ break;
++ p= e + 1;
++ }
++ return val;
++}
++
++const char *msl_option_get_name(const struct msl_opts *opts, ulong val)
++{
++  /* Scan up to the first NULL or empty name; "*INVALID*" is returned when no entry before it carries val. */
++  const struct msl_opts *entry= opts;
++  while (entry->name && entry->name[0] && entry->val != val)
++    entry++;
++  if (entry->name && entry->name[0]) return entry->name;
++  return "*INVALID*";
++}
++
++char *msl_flag_get_name(const struct msl_opts *opts, char *buf, ulong val)
++{
++  /* Write into buf the ','-joined names of every entry whose val shares a bit with val; returns buf. Never writes past STRING_BUFFER_USUAL_SIZE - 1 bytes. */
++  uint offset= 0, limit= STRING_BUFFER_USUAL_SIZE - 1;
++  *buf= '\0';
++  for (ulong i=0; opts[i].name && opts[i].name[0] && offset < limit; i++)
++  {
++    if (!(opts[i].val & val)) continue;
++    int n= snprintf(buf+offset, limit - offset, "%s%s", (offset ? "," : ""), opts[i].name);
++    offset= (n < 0 || (uint) n >= limit - offset) ? limit : offset + n; /* snprintf returns the untruncated length: clamp so the next size cannot underflow */
++  }
++  return buf;
++}
++
++/****************************************************************************
++ Functions to handle log_slow_verbosity
++****************************************************************************/
++
++/* Based upon sys_var::check_enum() */
++
++bool sys_var_thd_msl_option::check(THD *thd, set_var *var)
++{
++  /* Resolve a string value to its option code in save_result; a non-string, NULL or unresolved value raises ER_WRONG_ARGUMENTS. */
++  char buff[STRING_BUFFER_USUAL_SIZE];
++  String str(buff, sizeof(buff), &my_charset_latin1);
++  String *res= NULL;
++  if (var->value->result_type() == STRING_RESULT)
++    res= var->value->val_str(&str);
++  if (res)
++  {
++    const ulong code= (ulong) msl_option_resolve_by_name(this->opts, res->ptr(), res->length());
++    var->save_result.ulong_value= code;
++    if (code != this->invalid_val)
++      return 0;
++  }
++
++  my_error(ER_WRONG_ARGUMENTS, MYF(0), var->var->name);
++  return 1;
++}
++
++byte *sys_var_thd_msl_option::value_ptr(THD *thd, enum_var_type type,
++                                       LEX_STRING *base)
++{
++  /* Return the name that msl_option_get_name() gives for the global or the session value. */
++  const SV *vars= (type == OPT_GLOBAL) ? &global_system_variables
++                                       : &thd->variables;
++  const ulong code= vars->*offset;
++  return (byte *) msl_option_get_name(this->opts, code);
++}
++
++
++void sys_var_thd_msl_option::set_default(THD *thd, enum_var_type type)
++{
++  /* GLOBAL is reset to default_val; SESSION is reset to the current global value. */
++  const bool is_global= (type == OPT_GLOBAL);
++  ulong &slot= is_global ? global_system_variables.*offset : thd->variables.*offset;
++  slot= is_global ? (ulong) this->default_val : (ulong) (global_system_variables.*offset);
++}
++
++
++bool sys_var_thd_msl_option::update(THD *thd, set_var *var)
++{
++  /* Copy save_result.ulong_value into the global or the session slot; always returns 0. */
++  SV *vars= (var->type == OPT_GLOBAL) ? &global_system_variables
++                                      : &thd->variables;
++  vars->*offset= var->save_result.ulong_value;
++  return 0;
++}
++
++/****************************************************************************
++ Functions to handle log_slow_filter
++****************************************************************************/
++
++/* Based upon sys_var::check_enum() */
++
++bool sys_var_thd_msl_flag::check(THD *thd, set_var *var)
++{
++  /* Turn a comma-separated flag list into a bitmask in save_result; a non-string, NULL or resolver-rejected value raises ER_WRONG_ARGUMENTS. */
++  char buff[2 * STRING_BUFFER_USUAL_SIZE];
++  String str(buff, sizeof(buff), &my_charset_latin1), *res;
++
++  if (var->value->result_type() == STRING_RESULT &&
++      (res= var->value->val_str(&str)))
++  {
++    /* The resolver scans up to a NUL byte, which ptr() does not guarantee; c_ptr_safe() makes sure one is there. */
++    const ulong mask= msl_flag_resolve_by_name(this->flags, res->c_ptr_safe(),
++                                               this->none_val, this->invalid_val);
++    var->save_result.ulong_value= mask;
++    if (mask != this->invalid_val)
++      return 0;
++  }
++
++  my_error(ER_WRONG_ARGUMENTS, MYF(0), var->var->name);
++  return 1;
++}
++
++byte *sys_var_thd_msl_flag::value_ptr(THD *thd, enum_var_type type,
++                                     LEX_STRING *base)
++{
++  /* Format the global or the session bitmask into flags_string and return that buffer. */
++  const SV *vars= (type == OPT_GLOBAL) ? &global_system_variables
++                                       : &thd->variables;
++  char *text= msl_flag_get_name(this->flags, this->flags_string, vars->*offset);
++  return (byte *) text;
++}
++
++
++void sys_var_thd_msl_flag::set_default(THD *thd, enum_var_type type)
++{
++  /* SESSION copies the current global mask; GLOBAL takes default_val. */
++  const bool global= (type == OPT_GLOBAL);
++  (global ? global_system_variables : thd->variables).*offset=
++    global ? (ulong) this->default_val : (ulong) (global_system_variables.*offset);
++}
++
++
++bool sys_var_thd_msl_flag::update(THD *thd, set_var *var)
++{
++  /* Store save_result.ulong_value in the global or the session slot; always returns 0. */
++  ulong &slot= (var->type == OPT_GLOBAL) ? global_system_variables.*offset
++                                         : thd->variables.*offset;
++  slot= var->save_result.ulong_value;
++  return 0;
++}
++
+ /****************************************************************************
+ Functions to handle table_type
+ ****************************************************************************/
+diff -r 1242d4575291 sql/set_var.h
+--- a/sql/set_var.h Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/set_var.h Tue Jul 28 23:42:44 2009 -0700
+@@ -132,6 +132,7 @@
+ };
+
+
++
+ class sys_var_ulonglong_ptr :public sys_var
+ {
+ public:
+@@ -168,6 +169,13 @@
+ bool check_update_type(Item_result type) { return 0; }
+ };
+
++class sys_var_log_slow :public sys_var_bool_ptr /* bool-pointer variable that overrides update() */
++{
++public:
++ sys_var_log_slow(const char *name_arg, my_bool *value_arg)
++ :sys_var_bool_ptr(name_arg, value_arg) {} /* forwards both arguments to the base class */
++ bool update(THD *thd, set_var *var); /* declared only; the body is not in this header */
++};
+
+ class sys_var_bool_const_ptr : public sys_var
+ {
+@@ -340,7 +348,6 @@
+ }
+ };
+
+-
+ class sys_var_thd_ulong :public sys_var_thd
+ {
+ sys_check_func check_func;
+@@ -360,7 +367,6 @@
+ byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base);
+ };
+
+-
+ class sys_var_thd_ha_rows :public sys_var_thd
+ {
+ public:
+@@ -378,7 +384,6 @@
+ byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base);
+ };
+
+-
+ class sys_var_thd_ulonglong :public sys_var_thd
+ {
+ public:
+@@ -407,6 +412,19 @@
+ }
+ };
+
++class sys_var_thd_microtime :public sys_var_thd_ulonglong /* ulonglong per-thread variable reported as SHOW_MICROTIME */
++{
++public:
++ sys_var_thd_microtime(const char *name_arg, ulonglong SV::*offset_arg)
++ :sys_var_thd_ulonglong(name_arg, offset_arg)
++ {}
++ SHOW_TYPE show_type() { return SHOW_MICROTIME; }
++ bool check(THD *thd, set_var *var); /* declared only; the body is not in this header */
++ bool check_update_type(Item_result type)
++ {
++ return type != INT_RESULT && type != DECIMAL_RESULT; /* true for every type except INT_RESULT and DECIMAL_RESULT */
++ }
++};
+
+ class sys_var_thd_bool :public sys_var_thd
+ {
+@@ -478,6 +496,66 @@
+ };
+
+
++class sys_var_thd_msl_option :public sys_var_thd /* per-thread variable set from a string and stored as a ulong */
++{
++protected:
++ ulong SV::*offset; /* member of SV that holds the value */
++ const ulong none_val; /* stored by the constructor; no use is visible in this header */
++ const ulong default_val;
++ const ulong invalid_val;
++ const struct msl_opts *opts; /* NOTE(review): presumably the name/val lookup table -- confirm against set_var.cc */
++public:
++ sys_var_thd_msl_option(const char *name_arg, ulong SV::*offset_arg,
++ const ulong none_val_arg,
++ const ulong default_val_arg,
++ const ulong invalid_val_arg,
++ const struct msl_opts *opts_arg)
++ :sys_var_thd(name_arg), offset(offset_arg), none_val(none_val_arg),
++ default_val(default_val_arg), invalid_val(invalid_val_arg),
++ opts(opts_arg)
++ {}
++ bool check(THD *thd, set_var *var);
++ SHOW_TYPE show_type() { return SHOW_CHAR; }
++ bool check_update_type(Item_result type)
++ {
++ return type != STRING_RESULT; /* Only accept strings */
++ }
++ void set_default(THD *thd, enum_var_type type);
++ bool update(THD *thd, set_var *var);
++ byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base);
++};
++
++
++class sys_var_thd_msl_flag :public sys_var_thd /* per-thread variable set from a string and stored as a ulong */
++{
++protected:
++ char flags_string[2 * STRING_BUFFER_USUAL_SIZE]; /* NOTE(review): one buffer per variable object; if value_ptr() hands it out, concurrent readers would share it -- confirm */
++ ulong SV::*offset; /* member of SV that holds the value */
++ const ulong none_val;
++ const ulong default_val;
++ const ulong invalid_val;
++ const struct msl_opts *flags; /* NOTE(review): presumably the name/val lookup table -- confirm against set_var.cc */
++public:
++ sys_var_thd_msl_flag(const char *name_arg, ulong SV::*offset_arg,
++ const ulong none_val_arg,
++ const ulong default_val_arg,
++ const ulong invalid_val_arg,
++ const struct msl_opts *flags_arg)
++ :sys_var_thd(name_arg), offset(offset_arg), none_val(none_val_arg),
++ default_val(default_val_arg), invalid_val(invalid_val_arg),
++ flags(flags_arg)
++ {}
++ bool check(THD *thd, set_var *var);
++ SHOW_TYPE show_type() { return SHOW_CHAR; }
++ bool check_update_type(Item_result type)
++ {
++ return type != STRING_RESULT; /* Only accept strings */
++ }
++ void set_default(THD *thd, enum_var_type type);
++ bool update(THD *thd, set_var *var);
++ byte *value_ptr(THD *thd, enum_var_type type, LEX_STRING *base);
++};
++
+ class sys_var_thd_storage_engine :public sys_var_thd
+ {
+ protected:
+@@ -1109,3 +1187,11 @@
+ bool process_key_caches(int (* func) (const char *name, KEY_CACHE *));
+ void delete_elements(I_List<NAMED_LIST> *list,
+ void (*free_element)(const char*, gptr));
++
++/* Slow log functions */
++
++ulong msl_option_resolve_by_name(const struct msl_opts *opts, const char *name, ulong len);
++ulong msl_flag_resolve_by_name(const struct msl_opts *opts, const char *names_list,
++ const ulong none_val, const ulong invalid_val);
++const char *msl_option_get_name(const struct msl_opts *opts, ulong val);
++char *msl_flag_get_name(const struct msl_opts *opts, char *buf, ulong val);
+diff -r 1242d4575291 sql/slave.cc
+--- a/sql/slave.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/slave.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -2983,6 +2983,12 @@
+ + MAX_LOG_EVENT_HEADER; /* note, incr over the global not session var */
+ thd->slave_thread = 1;
+ set_slave_thread_options(thd);
++ if (opt_log_slow_slave_statements)
++ {
++ thd->enable_slow_log= TRUE;
++ /* Slave thread is excluded from rate limiting the slow log writes. */
++ thd->write_to_slow_log= TRUE;
++ }
+ thd->client_capabilities = CLIENT_LOCAL_FILES;
+ thd->real_id=pthread_self();
+ pthread_mutex_lock(&LOCK_thread_count);
+diff -r 1242d4575291 sql/sql_cache.cc
+--- a/sql/sql_cache.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/sql_cache.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -1402,6 +1402,7 @@
+
+ thd->limit_found_rows = query->found_rows();
+ thd->status_var.last_query_cost= 0.0;
++ thd->query_plan_flags|= QPLAN_QC;
+
+ BLOCK_UNLOCK_RD(query_block);
+ DBUG_RETURN(1); // Result sent to client
+@@ -1409,6 +1410,7 @@
+ err_unlock:
+ STRUCT_UNLOCK(&structure_guard_mutex);
+ err:
++ thd->query_plan_flags|= QPLAN_QC_NO;
+ DBUG_RETURN(0); // Query was not cached
+ }
+
+diff -r 1242d4575291 sql/sql_class.cc
+--- a/sql/sql_class.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/sql_class.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -190,7 +190,7 @@
+ table_map_for_update(0),
+ global_read_lock(0), is_fatal_error(0),
+ transaction_rollback_request(0), is_fatal_sub_stmt_error(0),
+- rand_used(0), time_zone_used(0),
++ rand_used(0), time_zone_used(0), user_timer(0),
+ last_insert_id_used(0), last_insert_id_used_bin_log(0), insert_id_used(0),
+ clear_next_insert_id(0), in_lock_tables(0), bootstrap(0),
+ derived_tables_processing(FALSE), spcont(NULL),
+@@ -2251,6 +2251,12 @@
+ backup->cuted_fields= cuted_fields;
+ backup->client_capabilities= client_capabilities;
+ backup->savepoints= transaction.savepoints;
++ backup->innodb_io_reads= innodb_io_reads;
++ backup->innodb_io_read= innodb_io_read;
++ backup->innodb_io_reads_wait_timer= innodb_io_reads_wait_timer;
++ backup->innodb_lock_que_wait_timer= innodb_lock_que_wait_timer;
++ backup->innodb_innodb_que_wait_timer= innodb_innodb_que_wait_timer;
++ backup->innodb_page_access= innodb_page_access;
+
+ if (!lex->requires_prelocking() || is_update_query(lex->sql_command))
+ options&= ~OPTION_BIN_LOG;
+@@ -2267,7 +2273,13 @@
+ sent_row_count= 0;
+ cuted_fields= 0;
+ transaction.savepoints= 0;
+-
++ innodb_io_reads= 0;
++ innodb_io_read= 0;
++ innodb_io_reads_wait_timer= 0;
++ innodb_lock_que_wait_timer= 0;
++ innodb_innodb_que_wait_timer= 0;
++ innodb_page_access= 0;
++
+ /* Surpress OK packets in case if we will execute statements */
+ net.no_send_ok= TRUE;
+ }
+@@ -2320,6 +2332,12 @@
+ */
+ examined_row_count+= backup->examined_row_count;
+ cuted_fields+= backup->cuted_fields;
++ innodb_io_reads+= backup->innodb_io_reads;
++ innodb_io_read+= backup->innodb_io_read;
++ innodb_io_reads_wait_timer+= backup->innodb_io_reads_wait_timer;
++ innodb_lock_que_wait_timer+= backup->innodb_lock_que_wait_timer;
++ innodb_innodb_que_wait_timer+= backup->innodb_innodb_que_wait_timer;
++ innodb_page_access+= backup->innodb_page_access;
+ }
+
+
+diff -r 1242d4575291 sql/sql_class.h
+--- a/sql/sql_class.h Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/sql_class.h Tue Jul 28 23:42:44 2009 -0700
+@@ -43,6 +43,7 @@
+ extern char internal_table_name[2];
+ extern char empty_c_string[1];
+ extern const char **errmesg;
++extern ulonglong frequency;
+
+ #define TC_LOG_PAGE_SIZE 8192
+ #define TC_LOG_MIN_SIZE (3*TC_LOG_PAGE_SIZE)
+@@ -321,7 +322,7 @@
+ bool write(THD *thd, enum enum_server_command command,
+ const char *format, ...) ATTRIBUTE_FORMAT(printf, 4, 5);
+ bool write(THD *thd, const char *query, uint query_length,
+- time_t query_start=0);
++ time_t query_start=0, ulonglong query_start_timer=0);
+ bool write(Log_event* event_info); // binary log write
+ bool write(THD *thd, IO_CACHE *cache, Log_event *commit_event);
+
+@@ -527,13 +528,14 @@
+ ulong auto_increment_increment, auto_increment_offset;
+ ulong bulk_insert_buff_size;
+ ulong join_buff_size;
+- ulong long_query_time;
++ ulonglong long_query_time;
+ ulong max_allowed_packet;
+ ulong max_error_count;
+ ulong max_length_for_sort_data;
+ ulong max_sort_length;
+ ulong max_tmp_tables;
+ ulong max_insert_delayed_threads;
++ ulong min_examined_row_limit;
+ ulong multi_range_count;
+ ulong myisam_repair_threads;
+ ulong myisam_sort_buff_size;
+@@ -549,10 +551,13 @@
+ ulong preload_buff_size;
+ ulong profiling_history_size;
+ ulong query_cache_type;
++ ulong log_slow_rate_limit;
+ ulong read_buff_size;
+ ulong read_rnd_buff_size;
+ ulong div_precincrement;
+ ulong sortbuff_size;
++ ulong log_slow_filter;
++ ulong log_slow_verbosity;
+ ulong table_type;
+ ulong tx_isolation;
+ ulong completion_type;
+@@ -1129,6 +1134,12 @@
+ uint in_sub_stmt;
+ bool enable_slow_log, insert_id_used, clear_next_insert_id;
+ bool last_insert_id_used;
++ ulong innodb_io_reads;
++ ulonglong innodb_io_read;
++ ulong innodb_io_reads_wait_timer;
++ ulong innodb_lock_que_wait_timer;
++ ulong innodb_innodb_que_wait_timer;
++ ulong innodb_page_access;
+ my_bool no_send_ok;
+ SAVEPOINT *savepoints;
+ };
+@@ -1185,6 +1196,11 @@
+ class THD :public Statement,
+ public Open_tables_state
+ {
++private:
++ inline ulonglong query_start_timer() { return start_timer; }
++ inline void set_timer() { if (user_timer) start_timer=timer_after_lock=user_timer; else timer_after_lock=my_timer(&start_timer, frequency); }
++ inline void end_timer() { my_timer(&start_timer, frequency); }
++ inline void lock_timer() { my_timer(&timer_after_lock, frequency); }
+ public:
+ /*
+ Constant for THD::where initialization in the beginning of every query.
+@@ -1293,10 +1309,24 @@
+ */
+ const char *where;
+ time_t start_time,time_after_lock,user_time;
++ ulonglong start_timer,timer_after_lock, user_timer;
+ time_t connect_time,thr_create_time; // track down slow pthread_create
+ thr_lock_type update_lock_default;
+ Delayed_insert *di;
+
++ bool write_to_slow_log;
++
++ bool innodb_was_used;
++ ulong innodb_io_reads;
++ ulonglong innodb_io_read;
++ ulong innodb_io_reads_wait_timer;
++ ulong innodb_lock_que_wait_timer;
++ ulong innodb_innodb_que_wait_timer;
++ ulong innodb_page_access;
++
++ ulong query_plan_flags;
++ ulong query_plan_fsort_passes;
++
+ /* <> 0 if we are inside of trigger or stored function. */
+ uint in_sub_stmt;
+
+@@ -1696,11 +1726,11 @@
+ sql_print_information("time() failed with %d", errno);
+ }
+
+- inline time_t query_start() { query_start_used=1; return start_time; }
+- inline void set_time() { if (user_time) start_time=time_after_lock=user_time; else { safe_time(&start_time); time_after_lock= start_time; }}
+- inline void end_time() { safe_time(&start_time); }
+- inline void set_time(time_t t) { time_after_lock=start_time=user_time=t; }
+- inline void lock_time() { safe_time(&time_after_lock); }
++ inline time_t query_start() { query_start_timer(); query_start_used=1; return start_time; }
++ inline void set_time() { set_timer(); if (user_time) start_time=time_after_lock=user_time; else { safe_time(&start_time); time_after_lock= start_time; }}
++ inline void end_time() { end_timer(); safe_time(&start_time); }
++ inline void set_time(time_t t) { set_timer(); time_after_lock=start_time=user_time=t; }
++ inline void lock_time() { lock_timer(); safe_time(&time_after_lock); }
+ inline void insert_id(ulonglong id_arg)
+ {
+ last_insert_id= id_arg;
+diff -r 1242d4575291 sql/sql_parse.cc
+--- a/sql/sql_parse.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/sql_parse.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -20,6 +20,7 @@
+ #include <m_ctype.h>
+ #include <myisam.h>
+ #include <my_dir.h>
++#include <my_time.h>
+
+ #ifdef HAVE_INNOBASE_DB
+ #include "ha_innodb.h"
+@@ -1227,6 +1228,15 @@
+ my_net_set_read_timeout(net, thd->variables.net_read_timeout);
+ my_net_set_write_timeout(net, thd->variables.net_write_timeout);
+
++ /*
++ If rate limiting of slow log writes is enabled, decide whether to log this
++ new thread's queries or not. Uses extremely simple algorithm. :)
++ */
++ thd->write_to_slow_log= FALSE;
++ if (thd->variables.log_slow_rate_limit <= 1 ||
++ (thd->thread_id % thd->variables.log_slow_rate_limit) == 0)
++ thd->write_to_slow_log= TRUE;
++
+ while (!net->error && net->vio != 0 &&
+ !(thd->killed == THD::KILL_CONNECTION))
+ {
+@@ -2353,28 +2363,57 @@
+ return; // Don't set time for sub stmt
+
+ start_of_query= thd->start_time;
+- thd->end_time(); // Set start time
++ ulonglong start_of_query_timer= thd->start_timer;
++ thd->end_time(); // Set start time
++
++
++ /* Follow the slow log filter configuration. */
++ if (thd->variables.log_slow_filter != SLOG_F_NONE &&
++ (!(thd->variables.log_slow_filter & thd->query_plan_flags) ||
++ ((thd->variables.log_slow_filter & SLOG_F_QC_NO) &&
++ (thd->query_plan_flags & QPLAN_QC))))
++ return;
++
++ /*
++ Low long_query_time value most likely means user is debugging stuff and even
++ though some thread's queries are not supposed to be logged b/c of the rate
++ limit, if one of them takes long enough (>= 1 second) it will be sensible
++ to make an exception and write to slow log anyway.
++ */
++
++ if (opt_use_global_long_query_time)
++ thd->variables.long_query_time = global_system_variables.long_query_time;
++
++ /* Do not log this thread's queries due to rate limiting. */
++ if (thd->write_to_slow_log != TRUE
++ && (thd->variables.long_query_time >= 1000000
++ || (ulong) (thd->start_timer - thd->timer_after_lock) < 1000000))
++ return;
++
+
+ /*
+ Do not log administrative statements unless the appropriate option is
+ set; do not log into slow log if reading from backup.
+ */
+- if (thd->enable_slow_log && !thd->user_time)
++ if (thd->enable_slow_log &&
++ (!thd->user_time || (thd->slave_thread && opt_log_slow_slave_statements))
++ )
++
+ {
+ thd_proc_info(thd, "logging slow query");
+
+- if ((thd->start_time > thd->time_after_lock &&
+- (ulong) (thd->start_time - thd->time_after_lock) >
+- thd->variables.long_query_time) ||
+- ((thd->server_status &
+- (SERVER_QUERY_NO_INDEX_USED | SERVER_QUERY_NO_GOOD_INDEX_USED)) &&
+- opt_log_queries_not_using_indexes &&
+- /* == SQLCOM_END unless this is a SHOW command */
+- thd->lex->orig_sql_command == SQLCOM_END))
++ if (((ulong) (thd->start_timer - thd->timer_after_lock) >=
++ thd->variables.long_query_time ||
++ (thd->server_status &
++ (SERVER_QUERY_NO_INDEX_USED | SERVER_QUERY_NO_GOOD_INDEX_USED)) &&
++ opt_log_queries_not_using_indexes &&
++ /* == SQLCOM_END unless this is a SHOW command */
++ thd->lex->orig_sql_command == SQLCOM_END) &&
++ thd->examined_row_count >= thd->variables.min_examined_row_limit)
+ {
+ thd_proc_info(thd, "logging slow query");
+ thd->status_var.long_query_count++;
+- mysql_slow_log.write(thd, thd->query, thd->query_length, start_of_query);
++ mysql_slow_log.write(thd, thd->query, thd->query_length, start_of_query, start_of_query_timer);
+ }
+ }
+ }
+@@ -2669,6 +2708,8 @@
+ context.resolve_in_table_list_only((TABLE_LIST*)select_lex->
+ table_list.first);
+
++ /* Reset the counter at all cases for the extended slow query log */
++ thd->row_count= 1;
+ /*
+ Reset warning count for each query that uses tables
+ A better approach would be to reset this for any commands
+@@ -6203,6 +6244,15 @@
+ thd->total_warn_count=0; // Warnings for this query
+ thd->rand_used= 0;
+ thd->sent_row_count= thd->examined_row_count= 0;
++ thd->innodb_was_used= FALSE;
++ thd->innodb_io_reads= 0;
++ thd->innodb_io_read= 0;
++ thd->innodb_io_reads_wait_timer= 0;
++ thd->innodb_lock_que_wait_timer= 0;
++ thd->innodb_innodb_que_wait_timer= 0;
++ thd->innodb_page_access= 0;
++ thd->query_plan_flags= QPLAN_NONE;
++ thd->query_plan_fsort_passes= 0;
+ }
+ DBUG_VOID_RETURN;
+ }
+diff -r 1242d4575291 sql/sql_select.cc
+--- a/sql/sql_select.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/sql_select.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -6272,8 +6272,11 @@
+ {
+ join->thd->server_status|=SERVER_QUERY_NO_INDEX_USED;
+ if (statistics)
++ {
+ statistic_increment(join->thd->status_var.select_scan_count,
+ &LOCK_status);
++ join->thd->query_plan_flags|= QPLAN_FULL_SCAN;
++ }
+ }
+ }
+ else
+@@ -6288,8 +6291,11 @@
+ {
+ join->thd->server_status|=SERVER_QUERY_NO_INDEX_USED;
+ if (statistics)
++ {
+ statistic_increment(join->thd->status_var.select_full_join_count,
+ &LOCK_status);
++ join->thd->query_plan_flags|= QPLAN_FULL_JOIN;
++ }
+ }
+ }
+ if (!table->no_keyread)
+@@ -9350,6 +9356,7 @@
+ (ulong) rows_limit,test(group)));
+
+ statistic_increment(thd->status_var.created_tmp_tables, &LOCK_status);
++ thd->query_plan_flags|= QPLAN_TMP_TABLE;
+
+ if (use_temp_pool && !(test_flags & TEST_KEEP_TMP_TABLES))
+ temp_pool_slot = bitmap_set_next(&temp_pool);
+@@ -10210,6 +10217,7 @@
+ }
+ statistic_increment(table->in_use->status_var.created_tmp_disk_tables,
+ &LOCK_status);
++ table->in_use->query_plan_flags|= QPLAN_TMP_DISK;
+ table->s->db_record_offset= 1;
+ DBUG_RETURN(0);
+ err:
+diff -r 1242d4575291 sql/sql_show.cc
+--- a/sql/sql_show.cc Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/sql_show.cc Tue Jul 28 23:42:44 2009 -0700
+@@ -1560,6 +1560,12 @@
+ case SHOW_LONGLONG:
+ end= longlong10_to_str(*(longlong*) value, buff, 10);
+ break;
++ case SHOW_MICROTIME:
++ show_type= ((sys_var*) value)->show_type();
++ value= (char*) ((sys_var*) value)->value_ptr(thd, value_type,
++ &null_lex_str);
++ end= buff + sprintf(buff, "%f", (((double) (*(ulonglong*)value))) / 1000000.0);
++ break;
+ case SHOW_HA_ROWS:
+ end= longlong10_to_str((longlong) *(ha_rows*) value, buff, 10);
+ break;
+diff -r 1242d4575291 sql/structs.h
+--- a/sql/structs.h Tue Jul 28 23:39:12 2009 -0700
++++ b/sql/structs.h Tue Jul 28 23:42:44 2009 -0700
+@@ -168,8 +168,8 @@
+ enum SHOW_TYPE
+ {
+ SHOW_UNDEF,
+- SHOW_LONG, SHOW_LONGLONG, SHOW_INT, SHOW_CHAR, SHOW_CHAR_PTR,
+- SHOW_DOUBLE_STATUS,
++ SHOW_LONG, SHOW_LONGLONG, SHOW_MICROTIME, SHOW_INT, SHOW_CHAR, SHOW_CHAR_PTR,
++ SHOW_DOUBLE_STATUS,
+ SHOW_BOOL, SHOW_MY_BOOL, SHOW_OPENTABLES, SHOW_STARTTIME, SHOW_QUERIES,
+ SHOW_LONG_CONST, SHOW_INT_CONST, SHOW_HAVE, SHOW_SYS, SHOW_HA_ROWS,
+ SHOW_VARS,
diff --git a/percona/5.0.91-b22-20100522/mirror_binlog.patch b/percona/5.0.91-b22-20100522/mirror_binlog.patch
new file mode 100644
index 0000000..d52e806
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/mirror_binlog.patch
@@ -0,0 +1,2694 @@
+diff -r 66cc9e0a6768 mysql-test/lib/mtr_cases.pl
+--- a/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:37:12 2008 -0800
++++ b/mysql-test/lib/mtr_cases.pl Thu Dec 04 21:46:15 2008 -0800
+@@ -334,6 +334,10 @@
+
+ $tinfo->{'slave_num'}= 1; # Default for rpl* tests, use one slave
+
++ if ( $tname eq 'rpl_mirror_binlog' )
++ {
++ $tinfo->{'slave_num'}= 3;
++ }
+ }
+
+ if ( defined mtr_match_prefix($tname,"federated") )
+@@ -344,15 +348,20 @@
+
+ my $master_opt_file= "$testdir/$tname-master.opt";
+ my $slave_opt_file= "$testdir/$tname-slave.opt";
+- my $slave_mi_file= "$testdir/$tname.slave-mi";
++ my $slave_mi_files= ["$testdir/$tname.slave-mi",
++ "$testdir/$tname.1.slave-mi",
++ "$testdir/$tname.2.slave-mi"];
+ my $master_sh= "$testdir/$tname-master.sh";
+ my $slave_sh= "$testdir/$tname-slave.sh";
+ my $disabled_file= "$testdir/$tname.disabled";
+ my $im_opt_file= "$testdir/$tname-im.opt";
+
+- $tinfo->{'master_opt'}= [];
+- $tinfo->{'slave_opt'}= [];
+- $tinfo->{'slave_mi'}= [];
++ $tinfo->{'master_opt'}= [];
++ $tinfo->{'slave_opt'}= [];
++ $tinfo->{'slave_mi'}= {};
++ $tinfo->{'slave_mi'}{0}= [];
++ $tinfo->{'slave_mi'}{1}= [];
++ $tinfo->{'slave_mi'}{2}= [];
+
+ if ( -f $master_opt_file )
+ {
+@@ -427,9 +436,14 @@
+ push(@{$tinfo->{'slave_opt'}}, @$slave_opt);
+ }
+
+- if ( -f $slave_mi_file )
++ my $mi_idx= 0;
++ foreach my $slave_mi_file ( @$slave_mi_files )
+ {
+- $tinfo->{'slave_mi'}= mtr_get_opts_from_file($slave_mi_file);
++ if ( -f $slave_mi_file )
++ {
++ $tinfo->{'slave_mi'}{$mi_idx}= mtr_get_opts_from_file($slave_mi_file);
++ }
++ $mi_idx+= 1;
+ }
+
+ if ( -f $master_sh )
+diff -r 66cc9e0a6768 mysql-test/mysql-test-run.pl
+--- a/mysql-test/mysql-test-run.pl Thu Dec 04 21:37:12 2008 -0800
++++ b/mysql-test/mysql-test-run.pl Thu Dec 04 21:46:15 2008 -0800
+@@ -275,6 +275,7 @@
+ our $opt_stress_test_file= "";
+
+ our $opt_warnings;
++our $opt_slave_innodb= 0;
+
+ our $opt_skip_ndbcluster= 0;
+ our $opt_skip_ndbcluster_slave= 0;
+@@ -299,6 +300,8 @@
+ our $used_binlog_format;
+ our $used_default_engine;
+ our $debug_compiled_binaries;
++
++our $current_testname= "";
+
+ our %mysqld_variables;
+
+@@ -645,6 +648,7 @@
+ 'testcase-timeout=i' => \$opt_testcase_timeout,
+ 'suite-timeout=i' => \$opt_suite_timeout,
+ 'warnings|log-warnings' => \$opt_warnings,
++ 'slave-innodb' => \$opt_slave_innodb,
+
+ # Options which are no longer used
+ (map { $_ => \&warn_about_removed_option } @removed_options),
+@@ -1001,6 +1005,14 @@
+ {
+ $ENV{'BIG_TEST'}= 1;
+ }
++
++ # --------------------------------------------------------------------------
++ # Big test flags
++ # --------------------------------------------------------------------------
++ if ( $opt_big_test )
++ {
++ $ENV{'BIG_TEST'}= 1;
++ }
+
+ # --------------------------------------------------------------------------
+ # Gcov flag
+@@ -1885,7 +1897,9 @@
+ $ENV{'SLAVE_MYSOCK'}= $slave->[0]->{'path_sock'};
+ $ENV{'SLAVE_MYPORT'}= $slave->[0]->{'port'};
+ $ENV{'SLAVE_MYPORT1'}= $slave->[1]->{'port'};
++ $ENV{'SLAVE_MYSOCK1'}= $slave->[1]->{'path_sock'};
+ $ENV{'SLAVE_MYPORT2'}= $slave->[2]->{'port'};
++ $ENV{'SLAVE_MYSOCK2'}= $slave->[2]->{'path_sock'};
+ $ENV{'MYSQL_TCP_PORT'}= $mysqld_variables{'port'};
+ $ENV{'DEFAULT_MASTER_PORT'}= $mysqld_variables{'master-port'};
+
+@@ -2375,6 +2389,8 @@
+ if ( ! $glob_win32 )
+ {
+ symlink("$glob_mysql_test_dir/std_data", "$opt_vardir/std_data_ln");
++ my @a = ("chmod", "-R", "o+r", "$glob_mysql_test_dir/std_data"); # make std_data readable by others
++ system(@a) == 0 or die "system @a failed: $?"
+ }
+ else
+ {
+@@ -3466,6 +3482,8 @@
+ $ENV{'TZ'}= $tinfo->{'timezone'};
+ mtr_verbose("Setting timezone: $tinfo->{'timezone'}");
+
++ $current_testname= $tinfo->{'name'};
++
+ my $master_restart= run_testcase_need_master_restart($tinfo);
+ my $slave_restart= run_testcase_need_slave_restart($tinfo);
+
+@@ -3881,7 +3899,8 @@
+ unless $mysqld->{'type'} eq 'slave';
+
+ mtr_add_arg($args, "%s--init-rpl-role=slave", $prefix);
+- if (! ( $opt_skip_slave_binlog || $skip_binlog ))
++
++ if (! ($opt_skip_slave_binlog or ($current_testname eq 'rpl_mirror_binlog')) )
+ {
+ mtr_add_arg($args, "%s--log-bin=%s/log/slave%s-bin", $prefix,
+ $opt_vardir, $sidx); # FIXME use own dir for binlogs
+@@ -4568,7 +4587,7 @@
+ if ( ! $slave->[$idx]->{'pid'} )
+ {
+ mysqld_start($slave->[$idx],$tinfo->{'slave_opt'},
+- $tinfo->{'slave_mi'});
++ $tinfo->{'slave_mi'}{$idx});
+
+ }
+ }
+@@ -4580,7 +4599,6 @@
+ # Wait for clusters to start
+ foreach my $cluster (@{$clusters})
+ {
+-
+ next if !$cluster->{'pid'};
+
+ if (ndbcluster_wait_started($cluster, ""))
+@@ -5179,6 +5197,7 @@
+ skip-im Don't start IM, and skip the IM test cases
+ big-test Set the environment variable BIG_TEST, which can be
+ checked from test cases.
++
+
+ Options that specify ports
+
+diff -r 66cc9e0a6768 mysql-test/r/rpl_mirror_binlog.result
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/r/rpl_mirror_binlog.result Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,441 @@
++stop slave;
++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
++reset master;
++reset slave;
++drop table if exists t1,t2,t3,t4,t5,t6,t7,t8,t9;
++start slave;
++drop table if exists t1;
++create table t1(n int) engine = InnoDB;
++insert into t1 values (300);
++insert into t1 values (299);
++insert into t1 values (298);
++insert into t1 values (297);
++insert into t1 values (296);
++insert into t1 values (295);
++insert into t1 values (294);
++insert into t1 values (293);
++insert into t1 values (292);
++insert into t1 values (291);
++insert into t1 values (290);
++insert into t1 values (289);
++insert into t1 values (288);
++insert into t1 values (287);
++insert into t1 values (286);
++insert into t1 values (285);
++insert into t1 values (284);
++insert into t1 values (283);
++insert into t1 values (282);
++insert into t1 values (281);
++insert into t1 values (280);
++insert into t1 values (279);
++insert into t1 values (278);
++insert into t1 values (277);
++insert into t1 values (276);
++insert into t1 values (275);
++insert into t1 values (274);
++insert into t1 values (273);
++insert into t1 values (272);
++insert into t1 values (271);
++insert into t1 values (270);
++insert into t1 values (269);
++insert into t1 values (268);
++insert into t1 values (267);
++insert into t1 values (266);
++insert into t1 values (265);
++insert into t1 values (264);
++insert into t1 values (263);
++insert into t1 values (262);
++insert into t1 values (261);
++insert into t1 values (260);
++insert into t1 values (259);
++insert into t1 values (258);
++insert into t1 values (257);
++insert into t1 values (256);
++insert into t1 values (255);
++insert into t1 values (254);
++insert into t1 values (253);
++insert into t1 values (252);
++insert into t1 values (251);
++insert into t1 values (250);
++insert into t1 values (249);
++insert into t1 values (248);
++insert into t1 values (247);
++insert into t1 values (246);
++insert into t1 values (245);
++insert into t1 values (244);
++insert into t1 values (243);
++insert into t1 values (242);
++insert into t1 values (241);
++insert into t1 values (240);
++insert into t1 values (239);
++insert into t1 values (238);
++insert into t1 values (237);
++insert into t1 values (236);
++insert into t1 values (235);
++insert into t1 values (234);
++insert into t1 values (233);
++insert into t1 values (232);
++insert into t1 values (231);
++insert into t1 values (230);
++insert into t1 values (229);
++insert into t1 values (228);
++insert into t1 values (227);
++insert into t1 values (226);
++insert into t1 values (225);
++insert into t1 values (224);
++insert into t1 values (223);
++insert into t1 values (222);
++insert into t1 values (221);
++insert into t1 values (220);
++insert into t1 values (219);
++insert into t1 values (218);
++insert into t1 values (217);
++insert into t1 values (216);
++insert into t1 values (215);
++insert into t1 values (214);
++insert into t1 values (213);
++insert into t1 values (212);
++insert into t1 values (211);
++insert into t1 values (210);
++insert into t1 values (209);
++insert into t1 values (208);
++insert into t1 values (207);
++insert into t1 values (206);
++insert into t1 values (205);
++insert into t1 values (204);
++insert into t1 values (203);
++insert into t1 values (202);
++insert into t1 values (201);
++insert into t1 values (200);
++insert into t1 values (199);
++insert into t1 values (198);
++insert into t1 values (197);
++insert into t1 values (196);
++insert into t1 values (195);
++insert into t1 values (194);
++insert into t1 values (193);
++insert into t1 values (192);
++insert into t1 values (191);
++insert into t1 values (190);
++insert into t1 values (189);
++insert into t1 values (188);
++insert into t1 values (187);
++insert into t1 values (186);
++insert into t1 values (185);
++insert into t1 values (184);
++insert into t1 values (183);
++insert into t1 values (182);
++insert into t1 values (181);
++insert into t1 values (180);
++insert into t1 values (179);
++insert into t1 values (178);
++insert into t1 values (177);
++insert into t1 values (176);
++insert into t1 values (175);
++insert into t1 values (174);
++insert into t1 values (173);
++insert into t1 values (172);
++insert into t1 values (171);
++insert into t1 values (170);
++insert into t1 values (169);
++insert into t1 values (168);
++insert into t1 values (167);
++insert into t1 values (166);
++insert into t1 values (165);
++insert into t1 values (164);
++insert into t1 values (163);
++insert into t1 values (162);
++insert into t1 values (161);
++insert into t1 values (160);
++insert into t1 values (159);
++insert into t1 values (158);
++insert into t1 values (157);
++insert into t1 values (156);
++insert into t1 values (155);
++insert into t1 values (154);
++insert into t1 values (153);
++insert into t1 values (152);
++insert into t1 values (151);
++insert into t1 values (150);
++insert into t1 values (149);
++insert into t1 values (148);
++insert into t1 values (147);
++insert into t1 values (146);
++insert into t1 values (145);
++insert into t1 values (144);
++insert into t1 values (143);
++insert into t1 values (142);
++insert into t1 values (141);
++insert into t1 values (140);
++insert into t1 values (139);
++insert into t1 values (138);
++insert into t1 values (137);
++insert into t1 values (136);
++insert into t1 values (135);
++insert into t1 values (134);
++insert into t1 values (133);
++insert into t1 values (132);
++insert into t1 values (131);
++insert into t1 values (130);
++insert into t1 values (129);
++insert into t1 values (128);
++insert into t1 values (127);
++insert into t1 values (126);
++insert into t1 values (125);
++insert into t1 values (124);
++insert into t1 values (123);
++insert into t1 values (122);
++insert into t1 values (121);
++insert into t1 values (120);
++insert into t1 values (119);
++insert into t1 values (118);
++insert into t1 values (117);
++insert into t1 values (116);
++insert into t1 values (115);
++insert into t1 values (114);
++insert into t1 values (113);
++insert into t1 values (112);
++insert into t1 values (111);
++insert into t1 values (110);
++insert into t1 values (109);
++insert into t1 values (108);
++insert into t1 values (107);
++insert into t1 values (106);
++insert into t1 values (105);
++insert into t1 values (104);
++insert into t1 values (103);
++insert into t1 values (102);
++insert into t1 values (101);
++insert into t1 values (100);
++insert into t1 values (99);
++insert into t1 values (98);
++insert into t1 values (97);
++insert into t1 values (96);
++insert into t1 values (95);
++insert into t1 values (94);
++insert into t1 values (93);
++insert into t1 values (92);
++insert into t1 values (91);
++insert into t1 values (90);
++insert into t1 values (89);
++insert into t1 values (88);
++insert into t1 values (87);
++insert into t1 values (86);
++insert into t1 values (85);
++insert into t1 values (84);
++insert into t1 values (83);
++insert into t1 values (82);
++insert into t1 values (81);
++insert into t1 values (80);
++insert into t1 values (79);
++insert into t1 values (78);
++insert into t1 values (77);
++insert into t1 values (76);
++insert into t1 values (75);
++insert into t1 values (74);
++insert into t1 values (73);
++insert into t1 values (72);
++insert into t1 values (71);
++insert into t1 values (70);
++insert into t1 values (69);
++insert into t1 values (68);
++insert into t1 values (67);
++insert into t1 values (66);
++insert into t1 values (65);
++insert into t1 values (64);
++insert into t1 values (63);
++insert into t1 values (62);
++insert into t1 values (61);
++insert into t1 values (60);
++insert into t1 values (59);
++insert into t1 values (58);
++insert into t1 values (57);
++insert into t1 values (56);
++insert into t1 values (55);
++insert into t1 values (54);
++insert into t1 values (53);
++insert into t1 values (52);
++insert into t1 values (51);
++insert into t1 values (50);
++insert into t1 values (49);
++insert into t1 values (48);
++insert into t1 values (47);
++insert into t1 values (46);
++insert into t1 values (45);
++insert into t1 values (44);
++insert into t1 values (43);
++insert into t1 values (42);
++insert into t1 values (41);
++insert into t1 values (40);
++insert into t1 values (39);
++insert into t1 values (38);
++insert into t1 values (37);
++insert into t1 values (36);
++insert into t1 values (35);
++insert into t1 values (34);
++insert into t1 values (33);
++insert into t1 values (32);
++insert into t1 values (31);
++insert into t1 values (30);
++insert into t1 values (29);
++insert into t1 values (28);
++insert into t1 values (27);
++insert into t1 values (26);
++insert into t1 values (25);
++insert into t1 values (24);
++insert into t1 values (23);
++insert into t1 values (22);
++insert into t1 values (21);
++insert into t1 values (20);
++insert into t1 values (19);
++insert into t1 values (18);
++insert into t1 values (17);
++insert into t1 values (16);
++insert into t1 values (15);
++insert into t1 values (14);
++insert into t1 values (13);
++insert into t1 values (12);
++insert into t1 values (11);
++insert into t1 values (10);
++insert into t1 values (9);
++insert into t1 values (8);
++insert into t1 values (7);
++insert into t1 values (6);
++insert into t1 values (5);
++insert into t1 values (4);
++insert into t1 values (3);
++insert into t1 values (2);
++insert into t1 values (1);
++"The following are SLAVE."
++select count(distinct n) from t1;
++count(distinct n)
++300
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++300
++show slave status;
++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master
++Waiting for master to send event 127.0.0.1 root 9306 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No #
++show master status;
++File Position Binlog_Do_DB Binlog_Ignore_DB
++master-bin.000014 2849
++"The following are SLAVE1."
++start slave;
++select count(distinct n) from t1;
++count(distinct n)
++300
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++300
++show slave status;
++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master
++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000014 2849 # # master-bin.000014 Yes Yes # 0 0 2849 # None 0 No #
++"The following are SLAVE."
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++MASTER_SERVER_ID=2,
++INDEX='replication-log';
++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log
++stop slave;
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++MASTER_SERVER_ID=2,
++INDEX='replication_log';
++ERROR HY000: Could not initialize master info structure; more error messages can be found in the MySQL error log
++MAKE MASTER REVOKE SESSION WITH KILL;
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++MASTER_SERVER_ID=2,
++INDEX='replication_log'
++ WITH BINLOG;
++MAKE MASTER GRANT SESSION;
++delete from t1 where n > 250;
++select count(distinct n) from t1;
++count(distinct n)
++250
++"The following are SLAVE1."
++select count(distinct n) from t1;
++count(distinct n)
++250
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++250
++"The following are SLAVE2."
++start slave;
++select count(distinct n) from t1;
++count(distinct n)
++250
++select min(n) from t1;
++min(n)
++1
++select max(n) from t1;
++max(n)
++250
++show slave status;
++Slave_IO_State Master_Host Master_User Master_Port Connect_Retry Master_Log_File Read_Master_Log_Pos Relay_Log_File Relay_Log_Pos Relay_Master_Log_File Slave_IO_Running Slave_SQL_Running Replicate_Do_DB Replicate_Ignore_DB Replicate_Do_Table Replicate_Ignore_Table Replicate_Wild_Do_Table Replicate_Wild_Ignore_Table Last_Errno Last_Error Skip_Counter Exec_Master_Log_Pos Relay_Log_Space Until_Condition Until_Log_File Until_Log_Pos Master_SSL_Allowed Master_SSL_CA_File Master_SSL_CA_Path Master_SSL_Cert Master_SSL_Cipher Master_SSL_Key Seconds_Behind_Master
++Waiting for master to send event 127.0.0.1 root 9308 1 master-bin.000015 189 # # master-bin.000015 Yes Yes # 0 0 189 # None 0 No #
++drop table t1;
++drop table t1;
++"The following are SLAVE."
++show master logs;
++Log_name File_size
++master-bin.000001 4214
++master-bin.000002 4212
++master-bin.000003 4212
++master-bin.000004 4212
++master-bin.000005 4212
++master-bin.000006 4212
++master-bin.000007 4212
++master-bin.000008 4212
++master-bin.000009 4212
++master-bin.000010 4194
++master-bin.000011 4190
++master-bin.000012 4190
++master-bin.000013 4190
++master-bin.000014 2849
++master-bin.000015 265
++show master status;
++File Position Binlog_Do_DB Binlog_Ignore_DB
++master-bin.000015 265
++"The following are SLAVE2."
++show master logs;
++Log_name File_size
++master-bin.000001 4214
++master-bin.000002 4212
++master-bin.000003 4212
++master-bin.000004 4212
++master-bin.000005 4212
++master-bin.000006 4212
++master-bin.000007 4212
++master-bin.000008 4212
++master-bin.000009 4212
++master-bin.000010 4194
++master-bin.000011 4190
++master-bin.000012 4190
++master-bin.000013 4190
++master-bin.000014 2849
++master-bin.000015 265
++show master status;
++File Position Binlog_Do_DB Binlog_Ignore_DB
++master-bin.000015 265
++purge master logs to 'master-bin.000006';
++show master logs;
++Log_name File_size
++master-bin.000006 4212
++master-bin.000007 4212
++master-bin.000008 4212
++master-bin.000009 4212
++master-bin.000010 4194
++master-bin.000011 4190
++master-bin.000012 4190
++master-bin.000013 4190
++master-bin.000014 2849
++master-bin.000015 265
++reset master;
++ERROR HY000: Binlog closed, cannot RESET MASTER
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-master.opt
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog-master.opt Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++-O max_binlog_size=4096
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog-slave.opt
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog-slave.opt Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++--rpl_mirror_binlog_enabled=1 --log-bin-index=replication_log
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.1.slave-mi
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog.1.slave-mi Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=3
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.2.slave-mi
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog.2.slave-mi Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,1 @@
++--master-user=root --master-connect-retry=1 --master-host=127.0.0.1 --master-password="" --master-port=9308 --server-id=4
+diff -r 66cc9e0a6768 mysql-test/t/rpl_mirror_binlog.test
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/mysql-test/t/rpl_mirror_binlog.test Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,119 @@
++-- source include/master-slave.inc
++-- source include/have_innodb.inc
++connect (slave_sec,localhost,root,,test,$SLAVE_MYPORT1,$SLAVE_MYSOCK1);
++connect (slave_ter,localhost,root,,test,$SLAVE_MYPORT2,$SLAVE_MYSOCK2);
++
++connection master;
++--disable_warnings
++drop table if exists t1;
++--enable_warnings
++create table t1(n int) engine = InnoDB;
++
++let $i=300;
++while ($i)
++{
++ eval insert into t1 values ($i);
++ dec $i;
++}
++
++save_master_pos;
++
++connection slave;
++sync_with_master;
++
++echo "The following are SLAVE.";
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++--replace_column 8 # 9 # 18 # 23 # 33 #
++show slave status;
++show master status;
++
++connection slave_sec;
++echo "The following are SLAVE1.";
++start slave;
++sync_with_master;
++
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++--replace_column 8 # 9 # 18 # 23 # 33 #
++show slave status;
++
++# make the slave the new master
++connection slave;
++echo "The following are SLAVE.";
++
++# The first 1201 error occurs because the slave is still running.
++--error 1201
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++ MASTER_SERVER_ID=2,
++ INDEX='replication-log';
++stop slave;
++
++# The second 1201 error is caused by failover mode.
++--error 1201
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++ MASTER_SERVER_ID=2,
++ INDEX='replication_log';
++
++MAKE MASTER REVOKE SESSION WITH KILL;
++MAKE MASTER MASTER_LOG_FILE='master-bin',
++ MASTER_SERVER_ID=2,
++ INDEX='replication_log'
++ WITH BINLOG;
++
++MAKE MASTER GRANT SESSION;
++
++delete from t1 where n > 250;
++save_master_pos;
++
++select count(distinct n) from t1;
++
++connection slave_sec;
++echo "The following are SLAVE1.";
++
++sync_with_master;
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++
++connection slave_ter;
++echo "The following are SLAVE2.";
++start slave;
++sync_with_master;
++
++select count(distinct n) from t1;
++select min(n) from t1;
++select max(n) from t1;
++
++--replace_column 8 # 9 # 18 # 23 # 33 #
++show slave status;
++
++connection master;
++drop table t1;
++
++connection slave;
++drop table t1;
++save_master_pos;
++
++connection slave_sec;
++sync_with_master;
++
++connection slave;
++echo "The following are SLAVE.";
++
++show master logs;
++show master status;
++
++
++connection slave_ter;
++echo "The following are SLAVE2.";
++sync_with_master;
++
++show master logs;
++show master status;
++purge master logs to 'master-bin.000006';
++show master logs;
++--error 1186
++reset master;
+diff -r 66cc9e0a6768 patch_info/mirror_binlog.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/mirror_binlog.info Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,6 @@
++File=mirror_binlog.patch
++Name=Mirroring binary logs on slave
++Version=V1
++Author=Google
++License=GPL
++Comment=contains FastMaster promotion patch
+diff -r 66cc9e0a6768 sql/Makefile.am
+--- a/sql/Makefile.am Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/Makefile.am Thu Dec 04 21:46:15 2008 -0800
+@@ -68,7 +68,7 @@
+ sql_array.h sql_cursor.h \
+ examples/ha_example.h ha_archive.h \
+ examples/ha_tina.h ha_blackhole.h \
+- ha_federated.h
++ ha_federated.h repl_mule.h
+ mysqld_SOURCES = sql_lex.cc sql_handler.cc \
+ item.cc item_sum.cc item_buff.cc item_func.cc \
+ item_cmpfunc.cc item_strfunc.cc item_timefunc.cc \
+@@ -105,7 +105,7 @@
+ sp_cache.cc parse_file.cc sql_trigger.cc \
+ examples/ha_example.cc ha_archive.cc \
+ examples/ha_tina.cc ha_blackhole.cc \
+- ha_federated.cc
++ ha_federated.cc repl_mule.cc
+
+ gen_lex_hash_SOURCES = gen_lex_hash.cc
+ gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS)
+diff -r 66cc9e0a6768 sql/Makefile.in
+--- a/sql/Makefile.in Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/Makefile.in Thu Dec 04 21:46:15 2008 -0800
+@@ -152,7 +152,7 @@
+ sp_rcontext.$(OBJEXT) sp.$(OBJEXT) sp_cache.$(OBJEXT) \
+ parse_file.$(OBJEXT) sql_trigger.$(OBJEXT) \
+ ha_example.$(OBJEXT) ha_archive.$(OBJEXT) ha_tina.$(OBJEXT) \
+- ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT)
++ ha_blackhole.$(OBJEXT) ha_federated.$(OBJEXT) repl_mule.$(OBJEXT)
+ mysqld_OBJECTS = $(am_mysqld_OBJECTS)
+ mysqld_DEPENDENCIES = $(am__DEPENDENCIES_1) $(am__DEPENDENCIES_2) \
+ $(am__DEPENDENCIES_2) $(am__DEPENDENCIES_2) \
+@@ -516,7 +516,7 @@
+ sql_array.h sql_cursor.h \
+ examples/ha_example.h ha_archive.h \
+ examples/ha_tina.h ha_blackhole.h \
+- ha_federated.h
++ ha_federated.h repl_mule.h
+
+ mysqld_SOURCES = sql_lex.cc sql_handler.cc \
+ item.cc item_sum.cc item_buff.cc item_func.cc \
+@@ -554,7 +554,7 @@
+ sp_cache.cc parse_file.cc sql_trigger.cc \
+ examples/ha_example.cc ha_archive.cc \
+ examples/ha_tina.cc ha_blackhole.cc \
+- ha_federated.cc
++ ha_federated.cc repl_mule.cc
+
+ gen_lex_hash_SOURCES = gen_lex_hash.cc
+ gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS)
+@@ -748,6 +748,7 @@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/protocol.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/records.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_failsafe.Po@am__quote@
++@AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/repl_mule.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/set_var.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/slave.Po@am__quote@
+ @AMDEP_TRUE@@am__include@ @am__quote@./$(DEPDIR)/sp.Po@am__quote@
+diff -r 66cc9e0a6768 sql/lex.h
+--- a/sql/lex.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/lex.h Thu Dec 04 21:46:15 2008 -0800
+@@ -292,6 +292,7 @@
+ { "LONGTEXT", SYM(LONGTEXT)},
+ { "LOOP", SYM(LOOP_SYM)},
+ { "LOW_PRIORITY", SYM(LOW_PRIORITY)},
++ { "MAKE", SYM(MAKE_SYM)},
+ { "MASTER", SYM(MASTER_SYM)},
+ { "MASTER_CONNECT_RETRY", SYM(MASTER_CONNECT_RETRY_SYM)},
+ { "MASTER_HOST", SYM(MASTER_HOST_SYM)},
+diff -r 66cc9e0a6768 sql/log.cc
+--- a/sql/log.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/log.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -79,7 +79,9 @@
+
+ bool binlog_init()
+ {
+- return !opt_bin_log;
++ if (!opt_bin_log)
++ binlog_hton.prepare = NULL;
++ return 0; /* return !opt_bin_log; */
+ }
+
+ static int binlog_close_connection(THD *thd)
+@@ -406,6 +408,7 @@
+ :bytes_written(0), last_time(0), query_start(0), name(0),
+ prepared_xids(0), log_type(LOG_CLOSED), file_id(1), open_count(1),
+ write_error(FALSE), inited(FALSE), need_start_event(TRUE),
++ mule_binlog_(0),
+ description_event_for_exec(0), description_event_for_queue(0)
+ {
+ /*
+@@ -506,7 +509,10 @@
+ const char *log_name)
+ {
+ File index_file_nr= -1;
+- DBUG_ASSERT(!my_b_inited(&index_file));
++
++ /* If the index is already opened, do not open it again. */
++ if (my_b_inited(&index_file))
++ return FALSE;
+
+ /*
+ First open of this class instance
+@@ -750,7 +756,7 @@
+ if (file >= 0)
+ my_close(file,MYF(0));
+ end_io_cache(&log_file);
+- end_io_cache(&index_file);
++ close_index_file();
+ safeFree(name);
+ log_type= LOG_CLOSED;
+ DBUG_RETURN(1);
+@@ -768,7 +774,10 @@
+ int MYSQL_LOG::raw_get_current_log(LOG_INFO* linfo)
+ {
+ strmake(linfo->log_file_name, log_file_name, sizeof(linfo->log_file_name)-1);
+- linfo->pos = my_b_tell(&log_file);
++ if (!mule_binlog_)
++ linfo->pos = my_b_tell(&log_file);
++ else
++ linfo->pos = my_b_filelength(&log_file);
+ return 0;
+ }
+
+@@ -935,6 +944,11 @@
+ if (need_lock)
+ pthread_mutex_lock(&LOCK_index);
+ safe_mutex_assert_owner(&LOCK_index);
++
++ if (open_index_file(index_file_name, NULL) != 0) {
++ error = -1;
++ goto err;
++ }
+
+ /* As the file is flushed, we can't get an error here */
+ (void) reinit_io_cache(&index_file, READ_CACHE, linfo->index_file_offset, 0,
+@@ -1446,18 +1460,19 @@
+ SYNOPSIS
+ new_file()
+ need_lock Set to 1 if caller has not locked LOCK_log
++ log_filename if non-NULL, use this name for the new log file
+
+ NOTE
+ The new file name is stored last in the index file
+ */
+
+-void MYSQL_LOG::new_file(bool need_lock)
++void MYSQL_LOG::new_file(bool need_lock, const char* log_filename)
+ {
+ char new_name[FN_REFLEN], *new_name_ptr, *old_name;
+ enum_log_type save_log_type;
+
+ DBUG_ENTER("MYSQL_LOG::new_file");
+- if (!is_open())
++ if (!is_log_open())
+ {
+ DBUG_PRINT("info",("log is closed"));
+ DBUG_VOID_RETURN;
+@@ -1496,7 +1511,9 @@
+ We have to do this here and not in open as we want to store the
+ new file name in the current binary log file.
+ */
+- if (generate_new_name(new_name, name))
++ if (log_filename) {
++ fn_format(new_name,log_filename,mysql_data_home,"",4);
++ } else if (generate_new_name(new_name, name))
+ goto end;
+ new_name_ptr=new_name;
+
+@@ -1571,7 +1588,7 @@
+ bytes_written+= ev->data_written;
+ DBUG_PRINT("info",("max_size: %lu",max_size));
+ if ((uint) my_b_append_tell(&log_file) > max_size)
+- new_file(0);
++ new_file(0);
+
+ err:
+ pthread_mutex_unlock(&LOCK_log);
+@@ -1600,8 +1617,14 @@
+ bytes_written += len;
+ } while ((buf=va_arg(args,const char*)) && (len=va_arg(args,uint)));
+ DBUG_PRINT("info",("max_size: %lu",max_size));
+- if ((uint) my_b_append_tell(&log_file) > max_size)
+- new_file(0);
++
++ /* If max_size is BINLOG_NOSWITCH_SIZE, the binlog is not rotated here
++ * on account of the file size limit.
++ */
++ if (max_size != BINLOG_NOSWITCH_SIZE &&
++ (uint) my_b_append_tell(&log_file) > max_size) {
++ new_file(0);
++ }
+
+ err:
+ if (!error)
+@@ -2492,6 +2515,17 @@
+ DBUG_VOID_RETURN;
+ }
+
++int MYSQL_LOG::flush_log_file() { /* Flush log_file's IO cache; returns flush_io_cache()'s result. */
++ return flush_io_cache(&log_file);
++}
++
++int MYSQL_LOG::close_index_file() { /* Tear down the index file's IO cache and close its file; always returns 0. */
++ if (my_b_inited(&index_file)) { /* nothing to do when the index IO cache is not initialised */
++ end_io_cache(&index_file);
++ my_close(index_file.file, MYF(0)); /* my_close()'s result is ignored */
++ }
++ return 0;
++}
+
+ /*
+ Check if a string is a valid number
+diff -r 66cc9e0a6768 sql/log_event.h
+--- a/sql/log_event.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/log_event.h Thu Dec 04 21:46:15 2008 -0800
+@@ -94,6 +94,14 @@
+ #define LINE_TERM_EMPTY 0x4
+ #define LINE_START_EMPTY 0x8
+ #define ESCAPED_EMPTY 0x10
++
++/* This server-id value is used to indicate a special master-info event
++ * in the relay log.
++ * We will enforce in the database that replication cannot set this value
++ * as the server-id.
++ */
++#define MASTER_INFO_SERVER_ID 0xffffffff
++
+
+ /*****************************************************************************
+
+diff -r 66cc9e0a6768 sql/mysql_priv.h
+--- a/sql/mysql_priv.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/mysql_priv.h Thu Dec 04 21:46:15 2008 -0800
+@@ -462,6 +462,7 @@
+ /* BINLOG_DUMP options */
+
+ #define BINLOG_DUMP_NON_BLOCK 1
++#define BINLOG_MIRROR_CLIENT 0x0004
+
+ /* sql_show.cc:show_log_files() */
+ #define SHOW_LOG_STATUS_FREE "FREE"
+@@ -1374,6 +1375,7 @@
+ extern const char **errmesg; /* Error messages */
+ extern const char *myisam_recover_options_str;
+ extern const char *in_left_expr_name, *in_additional_cond, *in_having_cond;
++extern char *opt_binlog_index_name;
+ extern const char * const triggers_file_ext;
+ extern const char * const trigname_file_ext;
+ extern Eq_creator eq_creator;
+@@ -1875,6 +1877,10 @@
+ extern "C" void unireg_abort(int exit_code);
+ void kill_delayed_threads(void);
+ bool check_stack_overrun(THD *thd, long margin, char *dummy);
++extern my_bool rpl_mirror_binlog_enabled;
++extern ulong sync_mirror_binlog_period;
++extern my_bool rpl_mirror_binlog_no_replicate;
++extern ulong rpl_mirror_binlog_clients, rpl_mirror_binlog_status;
+ #else
+ #define unireg_abort(exit_code) DBUG_RETURN(exit_code)
+ inline void kill_delayed_threads(void) {}
+diff -r 66cc9e0a6768 sql/mysqld.cc
+--- a/sql/mysqld.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/mysqld.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -555,6 +555,7 @@
+ pthread_mutex_t LOCK_global_user_client_stats;
+ pthread_mutex_t LOCK_global_table_stats;
+ pthread_mutex_t LOCK_global_index_stats;
++pthread_mutex_t LOCK_failover_master;
+ /*
+ The below lock protects access to two global server variables:
+ max_prepared_stmt_count and prepared_stmt_count. These variables
+@@ -584,13 +585,15 @@
+ char *master_ssl_key, *master_ssl_cert;
+ char *master_ssl_ca, *master_ssl_capath, *master_ssl_cipher;
+
++char *opt_binlog_index_name;
++
+ /* Static variables */
+
+ static bool kill_in_progress, segfaulted;
+ static my_bool opt_do_pstack, opt_bootstrap, opt_myisam_log;
+ static int cleanup_done;
+ static ulong opt_specialflag, opt_myisam_block_size;
+-static char *opt_logname, *opt_update_logname, *opt_binlog_index_name;
++static char *opt_logname, *opt_update_logname;
+ static char *opt_tc_heuristic_recover;
+ static char *mysql_home_ptr, *pidfile_name_ptr;
+ static char **defaults_argv;
+@@ -598,6 +601,32 @@
+
+ static my_socket unix_sock,ip_sock;
+ struct rand_struct sql_rand; // used by sql_class.cc:THD::THD()
++
++/* When set, we are inside a failover slave and deny all non-super access */
++bool failover_deny_access= 0;
++
++/* When set, binlog will be mirrored on the replica. */
++my_bool rpl_mirror_binlog_enabled;
++
++/* Sync the mirrored binlog to disk after every #th event. */
++ulong sync_mirror_binlog_period;
++
++/* The fixed size for replication event buffer. Replication event can exceed
++ * the size.
++ */
++//ulong rpl_event_buffer_size;
++
++/* This is a mirror binlog status variable on the primary that indicates how
++ * many mirror binlog servers are currently connected.
++ */
++ulong rpl_mirror_binlog_clients = 0;
++
++/* This indicates whether mirror binlog is working on a replica database. It
++ * requires:
++ * . rpl_mirror_binlog_enabled = 1
++ * . the slave I/O thread is running and the mirror binlog is being dumped
++ */
++ulong rpl_mirror_binlog_status = 0;
+
+ /* OS specific variables */
+
+@@ -1315,6 +1344,7 @@
+ (void) pthread_cond_destroy(&COND_flush_thread_cache);
+ (void) pthread_cond_destroy(&COND_manager);
+ (void) pthread_mutex_destroy(&LOCK_stats);
++ (void) pthread_mutex_destroy(&LOCK_failover_master);
+ (void) pthread_mutex_destroy(&LOCK_global_user_client_stats);
+ (void) pthread_mutex_destroy(&LOCK_global_table_stats);
+ (void) pthread_mutex_destroy(&LOCK_global_index_stats);
+@@ -3164,6 +3194,7 @@
+ (void) pthread_cond_init(&COND_rpl_status, NULL);
+ #endif
+ (void) pthread_mutex_init(&LOCK_stats, MY_MUTEX_INIT_FAST);
++ (void) pthread_mutex_init(&LOCK_failover_master, MY_MUTEX_INIT_FAST);
+ (void) pthread_mutex_init(&LOCK_global_user_client_stats, MY_MUTEX_INIT_FAST);
+ (void) pthread_mutex_init(&LOCK_global_table_stats, MY_MUTEX_INIT_FAST);
+ (void) pthread_mutex_init(&LOCK_global_index_stats, MY_MUTEX_INIT_FAST);
+@@ -3398,39 +3429,8 @@
+
+ if (opt_bin_log)
+ {
+- char buf[FN_REFLEN];
+- const char *ln;
+- ln= mysql_bin_log.generate_name(opt_bin_logname, "-bin", 1, buf);
+- if (!opt_bin_logname && !opt_binlog_index_name)
+- {
+- /*
+- User didn't give us info to name the binlog index file.
+- Picking `hostname`-bin.index like did in 4.x, causes replication to
+- fail if the hostname is changed later. So, we would like to instead
+- require a name. But as we don't want to break many existing setups, we
+- only give warning, not error.
+- */
+- sql_print_warning("No argument was provided to --log-bin, and "
+- "--log-bin-index was not used; so replication "
+- "may break when this MySQL server acts as a "
+- "master and has his hostname changed!! Please "
+- "use '--log-bin=%s' to avoid this problem.", ln);
+- }
+- if (ln == buf)
+- {
+- my_free(opt_bin_logname, MYF(MY_ALLOW_ZERO_PTR));
+- opt_bin_logname=my_strdup(buf, MYF(0));
+- }
+- if (mysql_bin_log.open_index_file(opt_binlog_index_name, ln))
+- {
+- unireg_abort(1);
+- }
+-
+- /*
+- Used to specify which type of lock we need to use for queries of type
+- INSERT ... SELECT. This will change when we have row level logging.
+- */
+- using_update_log=1;
++ if (make_master_open_index(&opt_bin_logname, opt_binlog_index_name) != 0)
++ unireg_abort(1);
+ }
+
+ if (xid_cache_init())
+@@ -3480,9 +3480,10 @@
+ unireg_abort(1);
+ }
+
+- if (opt_bin_log && mysql_bin_log.open(opt_bin_logname, LOG_BIN, 0,
+- WRITE_CACHE, 0, max_binlog_size, 0))
+- unireg_abort(1);
++ if (opt_bin_log &&
++ make_master(NULL, opt_bin_logname, opt_binlog_index_name, NULL) != 0) {
++ unireg_abort(1);
++ }
+
+ #ifdef HAVE_REPLICATION
+ if (opt_bin_log && expire_logs_days)
+@@ -5098,6 +5098,8 @@
+ OPT_INNODB_READ_IO_THREADS,
+ OPT_INNODB_WRITE_IO_THREADS,
+ OPT_INNODB_ADAPTIVE_HASH_INDEX,
++ OPT_RPL_MIRROR_BINLOG,
++ OPT_SYNC_MIRROR_BINLOG,
+ OPT_FEDERATED,
+ OPT_INNODB_USE_LEGACY_CARDINALITY_ALGORITHM
+ };
+@@ -5725,6 +5728,11 @@
+ {"rpl-recovery-rank", OPT_RPL_RECOVERY_RANK, "Undocumented.",
+ (gptr*) &rpl_recovery_rank, (gptr*) &rpl_recovery_rank, 0, GET_ULONG,
+ REQUIRED_ARG, 0, 0, 0, 0, 0, 0},
++ {"rpl_mirror_binlog_enabled", OPT_RPL_MIRROR_BINLOG,
++ "1 = support mirroring binlogs. 0 = disable mirroring binlogs",
++ (gptr*) &rpl_mirror_binlog_enabled,
++ (gptr*) &rpl_mirror_binlog_enabled, 0, GET_BOOL, NO_ARG,
++ 0, 0, 1, 0, 1, 0},
+ {"safe-mode", OPT_SAFE, "Skip some optimize stages (for testing).",
+ 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0},
+ #ifndef TO_BE_DELETED
+@@ -5849,6 +5857,11 @@
+ {"symbolic-links", 's', "Enable symbolic link support.",
+ (gptr*) &my_use_symdir, (gptr*) &my_use_symdir, 0, GET_BOOL, NO_ARG,
+ IF_PURIFY(0,1), 0, 0, 0, 0, 0},
++ {"sync-mirror-binlog", OPT_SYNC_MIRROR_BINLOG,
++ "Sync the mirrored binlog to disk after every #th event. "
++ "#=0 (the default) does no sync. Syncing slows MySQL down",
++ (gptr*) &sync_mirror_binlog_period,
++ (gptr*) &sync_mirror_binlog_period, 0, GET_ULONG, REQUIRED_ARG, 0, 0, ~0L, 0, 1, 0},
+ {"sysdate-is-now", OPT_SYSDATE_IS_NOW,
+ "Non-default option to alias SYSDATE() to NOW() to make it safe-replicable. Since 5.0, SYSDATE() returns a `dynamic' value different for different invocations, even within the same statement.",
+ (gptr*) &global_system_variables.sysdate_is_now,
+@@ -6625,6 +6638,7 @@
+ {"Delayed_errors", (char*) &delayed_insert_errors, SHOW_LONG},
+ {"Delayed_insert_threads", (char*) &delayed_insert_threads, SHOW_LONG_CONST},
+ {"Delayed_writes", (char*) &delayed_insert_writes, SHOW_LONG},
++ {"Failover_deny_access", (char*) &failover_deny_access, SHOW_LONG},
+ {"Flush_commands", (char*) &refresh_version, SHOW_LONG_CONST},
+ {"Handler_commit", (char*) offsetof(STATUS_VAR, ha_commit_count), SHOW_LONG_STATUS},
+ {"Handler_delete", (char*) offsetof(STATUS_VAR, ha_delete_count), SHOW_LONG_STATUS},
+diff -r 66cc9e0a6768 sql/repl_mule.cc
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/sql/repl_mule.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,466 @@
++/*
++ Copyright (C) 2007 Google Inc.
++
++This program is free software; you can redistribute it and/or
++modify it under the terms of the GNU General Public License
++as published by the Free Software Foundation; either version 2
++of the License, or (at your option) any later version.
++
++This program is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with this program; if not, write to the Free Software
++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
++*/
++
++#include "mysql_priv.h"
++#include <my_dir.h>
++#include "slave.h"
++#include "repl_mule.h"
++
++/* Max log size: BINLOG_NOSWITCH_SIZE, i.e. never rotate on the size limit. */
++#define MAX_LOG_SIZE BINLOG_NOSWITCH_SIZE
++
++ReplMule::ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status,
++ my_off_t file_size, const char *binlog_indexname,
++ MYSQL_LOG *binlog, ulong sync_period)
++ : desc_event_(new Format_description_log_event(BINLOG_VERSION)),
++ io_thd_(thd), mi_(mi), status_(status), dump_position_(0L),
++ file_size_(file_size), mule_log_(binlog),
++ mule_log_sync_period_(sync_period), mule_log_event_counter_(0) {
++ char llbuf1[22], llbuf2[22];
++ /* Set up the mule on 'binlog': open the log if needed, then fix positions per 'status'. */
++ DBUG_ENTER("ReplMule::ReplMule");
++
++ /* Indicate that we are in replication mule mode. */
++ mule_log_->set_mule_mode();
++
++ strmake(curr_log_filename_, mi->master_log_name,
++ sizeof(curr_log_filename_)-1);
++ strmake(mule_indexname_, binlog_indexname, sizeof(mule_indexname_)-1);
++
++ /* Open the mule log file unless it is already open; a failed open sets MULE_ERROR. */
++ if (!mule_log_->is_log_open()) {
++ /* Do not open binlog file when master_log_name is not specified. We
++ * are at the I/O thread initialization time and we do not know what
++ * filename we are going to dump.
++ * We wait for the next rotation event to indicate the filename.
++ */
++ if (strlen(curr_log_filename_) > 0 &&
++ mule_log_->open(curr_log_filename_, LOG_BIN, NULL,
++ SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) {
++ sql_print_error("ReplMule: open binlog failed: %s",
++ curr_log_filename_);
++ status_ = MULE_ERROR;
++ DBUG_VOID_RETURN;
++ }
++ }
++
++ switch (status_) {
++ case MULE_BEHIND: /* save the requested position; move master_log_pos to file_size_ */
++ dump_position_ = mi->master_log_pos;
++ mi->master_log_pos = file_size_;
++ sql_print_information("ReplicationMule: MULE_BEHIND - new(%s), old(%s)",
++ llstr(mi->master_log_pos, llbuf1),
++ llstr(dump_position_, llbuf2));
++ break;
++ case RELAY_MATCH_MULE:
++ case RELAY_MATCH_MULE_RUN: /* master_log_pos is left alone; only record it */
++ dump_position_ = mi->master_log_pos;
++ sql_print_information("ReplicationMule: RELAY_MATCH_MULE.");
++ break;
++ case MULE_VERIFY:
++ case MULE_VERIFY_RELAY_BEHIND: /* save the requested position; reset master_log_pos to BIN_LOG_HEADER_SIZE */
++ dump_position_ = mi->master_log_pos;
++ mi->master_log_pos = BIN_LOG_HEADER_SIZE;
++ sql_print_information(
++ "ReplicationMule: MULE_VERIFY - old(%s), file_size(%s)",
++ llstr(dump_position_, llbuf1), llstr(file_size_, llbuf2));
++
++ /* seek to the beginning of the file for verification */
++ seekToPosition(BIN_LOG_HEADER_SIZE);
++ break;
++ }
++
++ DBUG_VOID_RETURN;
++}
++
++ReplMule::~ReplMule() {
++  /* Tear down the mule: close the log, leave mule mode, fix up the master
++   * position and release the format description event. */
++  DBUG_ENTER("ReplMule::~ReplMule");
++
++  /* Close the mule log together with its index if it is still open. */
++  if (mule_log_->is_log_open()) {
++    mule_log_->close(LOG_CLOSE_INDEX);
++  }
++  mule_log_->clear_mule_mode();
++
++  /* Exiting the I/O thread while still in MULE_BEHIND or MULE_VERIFY means
++   * an error occurred.  master_log_pos was changed for event dumping or
++   * verification and may be used by a later slave start, so put back the
++   * original position if we have not reached it yet.
++   */
++  const bool interrupted = (status_ == MULE_BEHIND || status_ == MULE_VERIFY);
++  if (interrupted && mi_->master_log_pos < dump_position_) {
++    mi_->master_log_pos = dump_position_;
++  }
++
++  delete desc_event_;
++
++  DBUG_VOID_RETURN;
++}
++
++ReplMule::WriteStatus ReplMule::writeEvent(const char* buf, ulong event_len) {
++  WriteStatus dump_status = WRITE_RELAY;
++  char llbuf1[22], llbuf2[22], llbuf3[22];
++  char *verify_event;
++  bool verified = false, skip_event = false;
++  /* Handle one master event according to status_: verify it against the
++   * mule log and/or append it; the result tells the caller what to relay. */
++  DBUG_ENTER("ReplMule::writeEvent");
++  switch (status_) {
++  case MULE_VERIFY:
++  case MULE_VERIFY_RELAY_BEHIND:
++    if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT &&
++        IsFakeRotation(buf, event_len)) {
++      /* Do not verify the faked rotate event */
++      if (status_ == MULE_VERIFY)
++        dump_status = SKIP_RELAY;
++      break;
++    }
++    verify_event = new char[event_len];
++    if (verify_event == NULL) {
++      sql_print_error(
++          "ReplMule::dumpEvent - insufficient memory in verification, "
++          "position(%s), event_len(%lu).",
++          llstr(mi_->master_log_pos, llbuf1), event_len);
++      dump_status = WRITE_ERROR;
++      break;
++    }
++    if (my_b_read(mule_log_->get_log_file(), (byte*) verify_event,
++                  event_len) != 0) {
++      sql_print_error(
++          "ReplMule::dumpEvent - read log error in verification, "
++          "position(%s), event_len(%lu).",
++          llstr(mi_->master_log_pos, llbuf1), event_len);
++      dump_status = WRITE_ERROR;
++      delete[] verify_event;
++      break;
++    }
++    verified = (memcmp(buf, verify_event, event_len) == 0);
++    delete[] verify_event;  /* allocated with new[], so release with delete[] */
++    if (!verified) {
++      sql_print_error(
++          "ReplMule::dumpEvent - event does not match at position(%s)",
++          llstr(mi_->master_log_pos, llbuf1));
++      dump_status = WRITE_ERROR;
++      break;
++    }
++    /* fall through */
++  case MULE_BEHIND:
++    dump_status = SKIP_RELAY;
++    if (status_ == MULE_BEHIND &&
++        queueEvent(buf, event_len, &skip_event) != 0) {
++      dump_status = WRITE_ERROR;
++      break;
++    }
++
++    /* Skip faked rotation event */
++    if (!skip_event)
++      mi_->master_log_pos += event_len;
++    /* Switch state once the requested position or the old file end is hit. */
++    if (mi_->master_log_pos == dump_position_) {
++      if (dump_position_ < file_size_) {
++        status_ = MULE_VERIFY_RELAY_BEHIND;
++      } else {
++        status_ = RELAY_MATCH_MULE;
++      }
++      sql_print_information(
++          "ReplMule::dumpEvent - new status(%d) "
++          "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_,
++          llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2),
++          llstr(file_size_, llbuf3));
++    } else if (mi_->master_log_pos == file_size_) {
++      if (dump_position_ > file_size_) {
++        status_ = MULE_BEHIND;
++      } else {
++        status_ = RELAY_MATCH_MULE;
++      }
++      sql_print_information(
++          "ReplMule::dumpEvent - new status(%d) "
++          "master_log_pos(%s), dump_pos(%s), file_size(%s)", status_,
++          llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2),
++          llstr(file_size_, llbuf3));
++    } else if (status_ != MULE_VERIFY_RELAY_BEHIND &&
++               mi_->master_log_pos > dump_position_) {
++      sql_print_error(
++          "ReplMule::dumpEvent - mule position(%s) does not match "
++          "relay-log position(%s).",
++          llstr(mi_->master_log_pos, llbuf1), llstr(dump_position_, llbuf2));
++      dump_status = WRITE_ERROR;
++    }
++    break;
++  case RELAY_MATCH_MULE_RUN:
++    if (buf[EVENT_TYPE_OFFSET] == FORMAT_DESCRIPTION_EVENT) {
++      sql_print_information(" RELAY_MATCH_MULE event %d", buf[EVENT_TYPE_OFFSET] );
++      /* Do not write format description record if size is the same */
++      break;
++    }  /* fall through: every other event is queued like RELAY_MATCH_MULE */
++  case RELAY_MATCH_MULE:
++    if (queueEvent(buf, event_len, &skip_event) != 0)
++      dump_status = WRITE_ERROR;
++    break;
++  }
++
++  DBUG_RETURN(dump_status);
++}
++
++int ReplMule::appendEvent(const char* buf, ulong event_len) {
++  /* Append, flush and (every mule_log_sync_period_ events) sync the log. */
++  char pos_buf[22];
++  int rc;
++
++  DBUG_ENTER("ReplMule::appendEvent");
++
++  rc = mule_log_->appendv(buf, event_len, 0);
++  if (rc != 0) {
++    sql_print_error("ReplMule::appendEvent - append error at %s(%s)",
++                    mi_->master_log_name, llstr(mi_->master_log_pos, pos_buf));
++    DBUG_RETURN(rc);
++  }
++  if (mule_log_->flush_log_file() != 0) {
++    sql_print_error("ReplMule::appendEvent - flush error at %s(%s)",
++                    mi_->master_log_name, llstr(mi_->master_log_pos, pos_buf));
++    DBUG_RETURN(-1);
++  }
++  /* A sync period of 0 disables syncing; the counter is then left alone. */
++  if (mule_log_sync_period_ == 0 ||
++      ++mule_log_event_counter_ < mule_log_sync_period_)
++    DBUG_RETURN(0);
++  /* Sync period reached: restart the count and force the file to disk. */
++  mule_log_event_counter_ = 0;
++  rc = my_sync(mule_log_->get_log_file()->file, MYF(MY_WME));
++  if (rc != 0)
++    sql_print_error("ReplMule::appendEvent - sync error at %s(%s)",
++                    mi_->master_log_name, llstr(mi_->master_log_pos, pos_buf));
++  DBUG_RETURN(rc);
++}
++
++int ReplMule::queueEvent(const char* buf, ulong event_len, bool *skip_event) {
++  int error = 0;
++  /* Returns 0 on success, non-zero when the append or the log open fails. */
++  DBUG_ENTER("ReplMule::queueEvent");
++  /* Append 'buf' under the log lock; a real rotate event also switches files. */
++  *skip_event = false;
++
++  mule_log_->lock_log();
++  if (buf[EVENT_TYPE_OFFSET] == ROTATE_EVENT) {
++    Rotate_log_event rev(buf, event_len, desc_event_);
++
++    /* If this is a faked rotate event and the specified filename is
++     * the same as the current binlog filename, ignore the event.
++     */
++    if (IsFakeRotation(rev)) {
++      *skip_event = true;
++      DBUG_PRINT("info",("skipped faked rotation event"));
++    } else {
++      /* Only append real events. */
++      if (rev.when != 0)
++        error = appendEvent(buf, event_len);
++
++      /* Only rotate file when append succeeds. */
++      if (error == 0) {
++        /* Create a new file: lock both index and log. */
++        if (strlen(curr_log_filename_) == 0) {
++          /* If curr_log_filename_ is not specified, then this is the first
++           * valid rotation event to indicate the filename.
++           */
++          error = mule_log_->open(rev.new_log_ident, LOG_BIN, NULL,
++                                  SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0);
++        } else {
++          mule_log_->new_file(0, rev.new_log_ident);
++        }
++        /* Bound the copy by the destination buffer, not the source length. */
++        strmake(curr_log_filename_, rev.new_log_ident,
++                sizeof(curr_log_filename_)-1);
++
++        DBUG_PRINT("info",("rotate file: %s", rev.new_log_ident));
++      }
++    }
++  } else {
++    error = appendEvent(buf, event_len);
++  }
++  mule_log_->unlock_log();
++
++  DBUG_RETURN(error);
++}
++
++void ReplMule::seekToPosition(my_off_t pos) {
++ DBUG_ENTER("ReplMule::seekToPosition");
++ DBUG_PRINT("enter",("seek_pos: %ld", (ulong) pos));
++ /* Move the mule log's IO_CACHE to 'pos' via my_b_seek(). */
++ my_b_seek(mule_log_->get_log_file(), pos);
++ DBUG_VOID_RETURN;
++}
++
++bool ReplMule::IsFakeRotation(const char* buf, ulong event_len) {
++ DBUG_ENTER("ReplMule::IsFakeRotation");
++ /* Decode the raw buffer as a rotate event and defer to the Rotate_log_event overload. */
++ Rotate_log_event rev(buf, event_len, desc_event_);
++ DBUG_RETURN(IsFakeRotation(rev));
++}
++
++bool ReplMule::IsFakeRotation(const Rotate_log_event& rev) { /* true for a rotate event that is not real and names the current file */
++ DBUG_ENTER("ReplMule::IsFakeRotation");
++ DBUG_RETURN(rev.when == 0 && /* queueEvent() only appends rotate events whose 'when' is non-zero */
++ rev.ident_len == strlen(curr_log_filename_) && /* cheap length check before the compare */
++ strcmp(rev.new_log_ident, curr_log_filename_) == 0);
++}
++
++/* createReplicationMule:
++ * Create a mule that relays master's replication binlog and
++ * generate an exact same copy on the local filesystem.
++ *
++ * Code flow:
++ * last_mulelog = scan the existing mule log index to find it
++ * if (mulelog index is not created or there is no mule log inside it)
++ * old_mule_log <- requested dumping position
++ * requested dumping position <- 0 in the file
++ * else
++ * check whether the mule log matches the requested dump
++ * (whether the last mule log name/size matches)
++ * if the mule log name does not match
++ * exit with an error
++ * if (the mule log size does not match the requested dump position)
++ * request the dump from position 0 and read all events
++ * verify all events with the corresponding events in mule log
++ * if (the verification succeeds)
++ * continue the dump
++ * else
++ * exit with an error
++ */
++ReplMule* ReplMule::createReplicationMule(
++    THD* thd, MASTER_INFO *mi, const char *binlog_indexname,
++    MYSQL_LOG *binlog) {
++  ReplMule *mule = NULL;
++  LOG_INFO linfo;
++  bool index_opened = false;
++  /* Returns the new mule, or NULL after logging the reason for the failure. */
++  DBUG_ENTER("ReplMule::createReplicationMule");
++
++  /* binlog_indexname must be set to some real value. */
++  DBUG_ASSERT(binlog_indexname);
++
++  /* Lock binlog index for all binlog operations */
++  binlog->lock_index();
++  index_opened = binlog->open_index_file(binlog_indexname, NULL);
++  DBUG_PRINT("info",("open index file succeed: %d", index_opened));
++  sql_print_information("createReplicationMule");
++
++  /* Scan the existing binlog index to find the last relayed binlog */
++  if (index_opened || binlog->find_log_pos(&linfo, NullS, false) != 0) {
++    /* binlog index is not created or has no log file inside: the requested
++     * dumping position becomes the old one and the dump starts at 0. */
++    if (mi->master_log_pos == BIN_LOG_HEADER_SIZE) {
++      mule = new ReplMule(thd, mi, RELAY_MATCH_MULE, BIN_LOG_HEADER_SIZE,
++                          binlog_indexname, binlog, sync_mirror_binlog_period);
++    } else {
++      mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE,
++                          binlog_indexname, binlog, sync_mirror_binlog_period);
++    }
++    if (mule == NULL) {
++      sql_print_error("Mule malloc operation failed.");
++    } else if (mule->status_ == MULE_ERROR) {
++      /* The constructor could not open the log: report failure as NULL. */
++      delete mule;
++      mule = NULL;
++    }
++  } else {
++    IO_CACHE* log_file;
++    MY_STAT stat;
++    char last_binlog_name[FN_REFLEN];
++
++    /* Find the last log file from the binlog index.
++     * Check whether the last binlog matches the requested dump for both
++     * binlog name and binlog size.
++     */
++    for (;;) {
++      strmake(last_binlog_name, linfo.log_file_name, FN_REFLEN - 1);
++      last_binlog_name[FN_REFLEN - 1] = '\0';
++      if (binlog->find_next_log(&linfo, false))
++        break;
++    }
++    DBUG_PRINT("info",("the last binlog: %s", last_binlog_name));
++
++    /* if the binlog name does not match, exit with an error. */
++    if (strcmp(last_binlog_name+dirname_length(last_binlog_name),
++               mi->master_log_name) != 0) {
++      sql_print_error("Mule binlog(%s) does not match new relay-binlog(%s)",
++                      last_binlog_name, mi->master_log_name);
++    } /* Open the last binlog. */
++    else if (binlog->open(last_binlog_name, LOG_BIN, NULL,
++                          SEQ_READ_APPEND, true, MAX_LOG_SIZE, 0) != 0) {
++      sql_print_error("Mule open last binlog failed: %s", last_binlog_name);
++    } else {
++      bool valid_file_size = true;
++
++      /* Get the binlog size. */
++      log_file = binlog->get_log_file();
++      if (my_fstat(log_file->file, &stat, MYF(0)) == 0) {
++        /* If the binlog size does not match the requested dump position, then
++         * request the dump from position 0 and verify all events, we need to
++         * verify events because the mule log might be used for serving during
++         * anytime. We must be sure that they are correct.
++         */
++        sql_print_information("Binlog size %lu", (ulong) stat.st_size);
++        if (stat.st_size == mi->master_log_pos) {
++          mule = new ReplMule(thd, mi, RELAY_MATCH_MULE_RUN, stat.st_size,
++                              binlog_indexname, binlog,
++                              sync_mirror_binlog_period);
++        } else if (stat.st_size > BIN_LOG_HEADER_SIZE) {
++          mule = new ReplMule(thd, mi, MULE_VERIFY, stat.st_size,
++                              binlog_indexname, binlog,
++                              sync_mirror_binlog_period);
++        } else if (stat.st_size == BIN_LOG_HEADER_SIZE) {
++          mule = new ReplMule(thd, mi, MULE_BEHIND, BIN_LOG_HEADER_SIZE,
++                              binlog_indexname, binlog,
++                              sync_mirror_binlog_period);
++        } else {
++          char llbuf[22];
++          valid_file_size = false;
++          sql_print_error("Mule binlog file(%s) invalid size: %s",
++                          last_binlog_name, llstr(stat.st_size, llbuf));
++        }
++      } else {
++        valid_file_size = false;
++        sql_print_error("Mule binlog file(%s): fstat failed.",
++                        last_binlog_name);
++      }
++
++      if (valid_file_size) {
++        if (mule == NULL) {
++          sql_print_error("Mule malloc operation failed.");
++        } else if (mule->status_ == MULE_ERROR) {
++          /* If mule creation fails, indicate the error. */
++          delete mule;
++          mule = NULL;
++        }
++      }
++    }
++  }
++
++  /* Clear the mule binlog mode if there are errors. */
++  if (mule == NULL) {
++    binlog->clear_mule_mode();
++    binlog->close_index_file();
++  }
++
++  /* Unlock binlog index */
++  binlog->unlock_index();
++
++  DBUG_RETURN(mule);
++}
+diff -r 66cc9e0a6768 sql/repl_mule.h
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/sql/repl_mule.h Thu Dec 04 21:46:15 2008 -0800
+@@ -0,0 +1,166 @@
++/*
++ Copyright (C) 2007 Google Inc.
++
++This program is free software; you can redistribute it and/or
++modify it under the terms of the GNU General Public License
++as published by the Free Software Foundation; either version 2
++of the License, or (at your option) any later version.
++
++This program is distributed in the hope that it will be useful,
++but WITHOUT ANY WARRANTY; without even the implied warranty of
++MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++GNU General Public License for more details.
++
++You should have received a copy of the GNU General Public License
++along with this program; if not, write to the Free Software
++Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA.
++*/
++
++#ifndef SQL_REPL_MULE_H__
++#define SQL_REPL_MULE_H__
++
++/* Replication Mule is the class that is responsible for generating
++ * an exact copy of the binlog from a master database. We call this feature
++ * mirror binlog and it can be enabled by setting rpl_mirror_binlog_enabled.
++ * We need to keep the same copy for the following purposes:
++ * . The replica can serve the binlog transparently as if it were the
++ * master database. This can relieve master connection overhead.
++ * . During failover, the replica can become the new master and serve
++ * old binlogs transparently.
++ * (The Mule name comes from the popular P2P software eMule.)
++ *
++ * Internally, we call the mirrored binlog the mule log.
++ */
++
++class THD;
++class Rotate_log_event;
++class Format_description_log_event;
++typedef struct st_master_info MASTER_INFO;
++
++class ReplMule {
++ public:
++ /* Because I/O thread also creates relay-binlog, instead of an exact
++ * copy of the original master's binlog, we have two resources that
++ * might get out of sync.
++ * This enum indicates the status:
++ * MULE_BEHIND - the mule's header is behind:
++ * (mule is activated for the first time)
++ * RELAY_MATCH_MULE - mule matches relay-log
++ * RELAY_MATCH_MULE_RUN - mule matches relay-log and it was not empty binlog
++ * MULE_VERIFY - mule has more events than the relay-log and needs
++ * verification; we can not verify based on relay-log
++ * events because events might get changed a little;
++ * verification starts with downloading all events in
++ * the last binlog from the master and comparing them with
++ * all events in the mule log;
++ * MULE_VERIFY_RELAY_BEHIND - mule has more events than the relay-log
++ * and relay-log needs to write events
++ * MULE_ERROR - mule detects errors in event duplication
++ *
++ * When the mule mirrors binlogs, it writes an event into the mule log
++ * first. Then, I/O thread writes the event into the relay log.
++ */
++ enum RelayStatus {
++ MULE_BEHIND = 1,
++ RELAY_MATCH_MULE = 2,
++ RELAY_MATCH_MULE_RUN = 7,
++ MULE_VERIFY = 3,
++ MULE_VERIFY_RELAY_BEHIND = 4,
++ MULE_ERROR = 5,
++ };
++
++ enum WriteStatus {
++ WRITE_RELAY = 1, /* the relay log needs to write the event */
++ WRITE_ERROR = 2, /* the writing encountered errors */
++ SKIP_RELAY = 3, /* the relay log should skip the event */
++ };
++
++ private:
++ const Format_description_log_event *desc_event_; /* created in the constructor, deleted in the destructor */
++ THD *io_thd_; /* replication I/O thread */
++ MASTER_INFO *mi_; /* master info struct for I/O thread's progress */
++
++ /*
++ * I/O thread will write both mule log for mirror binlog and relay log
++ * for SQL thread.
++ * The variable indicates whether the two are in sync.
++ */
++ RelayStatus status_;
++
++ /* The starting event writing position. */
++ my_off_t dump_position_;
++
++ /* During the initial setup, the last mule log's file size. */
++ my_off_t file_size_;
++
++ /* Internally, we call the mirrored binlog mule log. */
++ MYSQL_LOG *mule_log_;
++
++ /* Sync the mule log to disk for every #N events. */
++ ulong mule_log_sync_period_;
++ ulong mule_log_event_counter_;
++
++ /* mule log's index filename */
++ char mule_indexname_[FN_REFLEN];
++
++ /* the current mule log's filename */
++ char curr_log_filename_[FN_REFLEN];
++
++ ReplMule(THD* thd, MASTER_INFO *mi, RelayStatus status,
++ my_off_t file_size, const char *binlog_indexname,
++ MYSQL_LOG *binlog, ulong sync_period);
++
++ /*
++ * Queue the event into the current mule log. If it is a rotation
++ * event, generate a new mule log file.
++ * Indicate whether the event is skipped because it is a fake event.
++ * A fake event is generated by the master to indicate the current
++ * reading position.
++ */
++ int queueEvent(const char* buf, ulong event_len, bool *skip_event);
++
++ /* Append the event to the current mule log. */
++ int appendEvent(const char* buf, ulong event_len);
++
++ bool IsFakeRotation(const char* buf, ulong event_len);
++ bool IsFakeRotation(const Rotate_log_event& rev);
++
++ /* Seek to the specified position in the current open mule log. */
++ void seekToPosition(my_off_t pos);
++
++ public:
++
++ ~ReplMule();
++
++ /* Dump the event into mule binlog.
++ * Input:
++ * buf (IN) - replication event buffer
++ * event_len (IN) - the event length
++ *
++ * Return:
++ * . WRITE_RELAY: the relay log needs to write the event
++ * . WRITE_ERROR: the writing encountered errors
++ * . SKIP_RELAY: the relay log should skip the event
++ */
++ WriteStatus writeEvent(const char* buf, ulong event_len);
++
++ /* createReplicationMule:
++ * Create a mule that relays master's replication binlog and
++ * generate an exact same copy on the local filesystem.
++ *
++ * Input:
++ * thd (IN) - replication I/O thread
++ * mi (IN) - master info struct for I/O thread's progress
++ * binlog_indexname (IN) - filename for binlog's index
++ * binlog (IN) - replication binlog
++ *
++ * Return:
++ * . a replication mule if success
++ * . NULL if there are any errors
++ */
++ static ReplMule *createReplicationMule(THD* thd, MASTER_INFO *mi,
++ const char *binlog_indexname,
++ MYSQL_LOG *binlog);
++};
++
++#endif /* SQL_REPL_MULE_H__ */
+diff -r 66cc9e0a6768 sql/set_var.cc
+--- a/sql/set_var.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/set_var.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -345,6 +345,8 @@
+ slog_verb);
+ sys_var_long_ptr sys_rpl_recovery_rank("rpl_recovery_rank",
+ &rpl_recovery_rank);
++sys_var_bool_ptr sys_rpl_mirror_binlog_enabled("rpl_mirror_binlog_enabled",
++ &rpl_mirror_binlog_enabled);
+ sys_var_long_ptr sys_query_cache_size("query_cache_size",
+ &query_cache_size,
+ fix_query_cache_size);
+@@ -364,6 +366,9 @@
+ sys_var_thd_ulong sys_trans_prealloc_size("transaction_prealloc_size",
+ &SV::trans_prealloc_size,
+ 0, fix_trans_mem_root);
++sys_var_long_ptr sys_sync_mirror_binlog_period(
++ "sync_mirror_binlog_period",
++ &sync_mirror_binlog_period);
+
+ #ifdef HAVE_QUERY_CACHE
+ sys_var_long_ptr sys_query_cache_limit("query_cache_limit",
+@@ -774,6 +779,7 @@
+ &sys_relay_log_purge,
+ #endif
+ &sys_rpl_recovery_rank,
++ &sys_rpl_mirror_binlog_enabled,
+ &sys_safe_updates,
+ &sys_secure_auth,
+ &sys_secure_file_priv,
+@@ -1113,6 +1119,8 @@
+ {"relay_log_space_limit", (char*) &relay_log_space_limit, SHOW_LONGLONG},
+ #endif
+ {sys_rpl_recovery_rank.name,(char*) &sys_rpl_recovery_rank, SHOW_SYS},
++ {sys_rpl_mirror_binlog_enabled.name,
++ (char *) &sys_rpl_mirror_binlog_enabled, SHOW_SYS},
+ {"secure_auth", (char*) &sys_secure_auth, SHOW_SYS},
+ {"secure_file_priv", (char*) &sys_secure_file_priv, SHOW_SYS},
+ #ifdef HAVE_SMEM
+diff -r 66cc9e0a6768 sql/slave.cc
+--- a/sql/slave.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/slave.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -25,6 +25,7 @@
+ #include <thr_alarm.h>
+ #include <my_dir.h>
+ #include <sql_common.h>
++#include "repl_mule.h"
+ #include <errmsg.h>
+ #include <mysys_err.h>
+
+@@ -3527,6 +3528,7 @@
+ RELAY_LOG_INFO *rli= &mi->rli;
+ char llbuff[22];
+ uint retry_count;
++ ReplMule *mule = NULL;
+
+ // needs to call my_thread_init(), otherwise we get a coredump in DBUG_ stuff
+ my_thread_init();
+@@ -3609,6 +3611,23 @@
+ if (get_master_version_and_clock(mysql, mi))
+ goto err;
+
++ if (rpl_mirror_binlog_enabled && !mule) {
++ if (opt_binlog_index_name == NULL) {
++ sql_print_error("\"log-bin-index\" must be set in mirror binlog.");
++ goto err;
++ }
++
++ /* Create the mule to generate the exact copy of the binlog */
++ mule = ReplMule::createReplicationMule(
++ thd, mi, opt_binlog_index_name, &mysql_bin_log);
++
++ /* If we could not create the mule, we stop the I/O thread and report
++ * an error.
++ */
++ if (mule == NULL)
++ goto err;
++ }
++
+ if (mi->rli.relay_log.description_event_for_queue->binlog_version > 1)
+ {
+ /*
+@@ -3624,6 +3643,7 @@
+ DBUG_PRINT("info",("Starting reading binary log from master"));
+ while (!io_slave_killed(thd,mi))
+ {
++ const char* event_buf;
+ bool suppress_warnings= 0;
+ thd_proc_info(thd, "Requesting binlog dump");
+ if (request_dump(mysql, mi, &suppress_warnings))
+@@ -3754,10 +3774,25 @@
+ goto connected;
+ } // if (event_len == packet_error)
+
++ event_buf = (const char*)mysql->net.read_pos + 1;
++
++ if (mule) {
++ ReplMule::WriteStatus d_status =
++ mule->writeEvent(event_buf, event_len);
++ switch (d_status) {
++ case ReplMule::WRITE_RELAY:
++ break;
++ case ReplMule::SKIP_RELAY:
++ /* Skip writing relay event; go back to read the next event */
++ continue;
++ case ReplMule::WRITE_ERROR:
++ goto err;
++ }
++ }
++
+ retry_count=0; // ok event, reset retry counter
+ thd_proc_info(thd, "Queueing master event to the relay log");
+- if (queue_event(mi,(const char*)mysql->net.read_pos + 1,
+- event_len))
++ if (queue_event(mi, event_buf, event_len))
+ {
+ sql_print_error("Slave I/O thread could not queue event from master");
+ goto err;
+@@ -3847,6 +3882,7 @@
+ change_rpl_status(RPL_ACTIVE_SLAVE,RPL_IDLE_SLAVE);
+ DBUG_ASSERT(thd->net.buff != 0);
+ net_end(&thd->net); // destructor will not free it, because net.vio is 0
++ delete mule;
+ close_thread_tables(thd, 0);
+ pthread_mutex_lock(&LOCK_thread_count);
+ THD_CHECK_SENTRY(thd);
+diff -r 66cc9e0a6768 sql/sql_class.h
+--- a/sql/sql_class.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_class.h Thu Dec 04 21:46:15 2008 -0800
+@@ -152,6 +152,12 @@
+ #define LOG_INFO_FATAL -7
+ #define LOG_INFO_IN_USE -8
+
++/* If the maximum size is equal to this value, binlog would not rotate on
++ * size limit.
++ */
++#define BINLOG_NOSWITCH_SIZE ((ulong) -1)
++
++
+ /* bitmap to SQL_LOG::close() */
+ #define LOG_CLOSE_INDEX 1
+ #define LOG_CLOSE_TO_BE_OPENED 2
+@@ -245,6 +251,9 @@
+ bool no_auto_events;
+ friend class Log_event;
+
++ /* mule replication mode */
++ bool mule_binlog_;
++
+ public:
+ /*
+ These describe the log's format. This is used only for relay logs.
+@@ -317,7 +326,8 @@
+ }
+ bool open_index_file(const char *index_file_name_arg,
+ const char *log_name);
+- void new_file(bool need_lock);
++ int close_index_file();
++ void new_file(bool need_lock= 1, const char* log_filename= NULL);
+ bool write(THD *thd, enum enum_server_command command,
+ const char *format, ...) ATTRIBUTE_FORMAT(printf, 4, 5);
+ bool write(THD *thd, const char *query, uint query_length,
+@@ -357,7 +367,27 @@
+ int get_current_log(LOG_INFO* linfo);
+ int raw_get_current_log(LOG_INFO* linfo);
+ uint next_file_id();
+- inline bool is_open() { return log_type != LOG_CLOSED; }
++
++ /* Because MySQL uses is_open() to check whether replication is on,
++ * we let that check fail during binlog mule mode. Mule replication
++ * and normal master replication cannot be on at the same time.
++ *
++ * is_log_open(): the binlog file is open for either purpose
++ *
++ * is_open(): the binlog is open for master replication.
++ * is_mule_open(): the binlog is open for mirror binlog or for
++ * replication mule; refer to repl_mule.h for details
++ */
++ bool is_log_open() {
++ return log_type != LOG_CLOSED;
++ }
++ bool is_open() {
++ return (!mule_binlog_) && is_log_open();
++ }
++ bool is_mule_open() {
++ return (mule_binlog_) && is_log_open();
++ }
++
+ inline char* get_index_fname() { return index_file_name;}
+ inline char* get_log_fname() { return log_file_name; }
+ inline char* get_name() { return name; }
+@@ -366,8 +396,18 @@
+
+ inline void lock_index() { pthread_mutex_lock(&LOCK_index);}
+ inline void unlock_index() { pthread_mutex_unlock(&LOCK_index);}
++ inline void lock_log() { pthread_mutex_lock(&LOCK_log);} /* acquire LOCK_log; ReplMule::queueEvent() holds it while appending */
++ inline void unlock_log() { pthread_mutex_unlock(&LOCK_log);} /* release LOCK_log */
+ inline IO_CACHE *get_index_file() { return &index_file;}
+ inline uint32 get_open_count() { return open_count; }
++ /* Mule mode switches; the mule itself is described in repl_mule.h.
++  * While the flag is set, is_open() returns false and is_mule_open()
++  * reports whether the log is open.
++  */
++ void set_mule_mode() { mule_binlog_ = true; }
++
++ void clear_mule_mode() { mule_binlog_ = false; }
++ int flush_log_file();
+ };
+
+ /*
+diff -r 66cc9e0a6768 sql/sql_lex.h
+--- a/sql/sql_lex.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_lex.h Thu Dec 04 21:46:15 2008 -0800
+@@ -104,6 +104,7 @@
+ // TODO(mcallaghan): update status_vars in mysqld to export these
+ SQLCOM_SHOW_USER_STATS, SQLCOM_SHOW_TABLE_STATS, SQLCOM_SHOW_INDEX_STATS,
+ SQLCOM_SHOW_CLIENT_STATS,
++ SQLCOM_MAKE_MASTER,
+ /* This should be the last !!! */
+ SQLCOM_END
+ };
+@@ -171,6 +172,12 @@
+ char *ssl_key, *ssl_cert, *ssl_ca, *ssl_capath, *ssl_cipher;
+ char *relay_log_name;
+ ulong relay_log_pos;
++
++ /* the following fields are used for make master command */
++ char *log_index_name;
++ bool in_failover;
++ bool kill_session;
++ bool with_old_binlog;
+ } LEX_MASTER_INFO;
+
+
+diff -r 66cc9e0a6768 sql/sql_parse.cc
+--- a/sql/sql_parse.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_parse.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -402,6 +402,15 @@
+ passwd_len ? "yes": "no",
+ thd->main_security_ctx.master_access,
+ (thd->db ? thd->db : "*none*")));
++
++ /* If we are in failover mode, reject all non-super user connections. */
++ if (is_in_failover() &&
++ !(thd->main_security_ctx.master_access & SUPER_ACL)) {
++ net_send_error(thd, ER_SPECIFIC_ACCESS_DENIED_ERROR,
++ "super-user only during failover");
++ DBUG_RETURN(-1);
++ }
++
+
+ if (check_count)
+ {
+@@ -3470,6 +3479,22 @@
+ else
+ res = load_master_data(thd);
+ break;
++
++ case SQLCOM_MAKE_MASTER:
++ {
++ thd_proc_info(thd, "Making master");
++
++ if (check_global_access(thd, SUPER_ACL))
++ goto error;
++ res = make_master(thd, NULL, NULL, &lex->mi);
++ if (res == 0) {
++ // TODO -- wei is this OK, setting it to NULL?
++ thd_proc_info(thd, 0);
++ send_ok(thd);
++ }
++ break;
++ }
++
+ #endif /* HAVE_REPLICATION */
+ #ifdef HAVE_NDBCLUSTER_DB
+ case SQLCOM_SHOW_NDBCLUSTER_STATUS:
+diff -r 66cc9e0a6768 sql/sql_repl.cc
+--- a/sql/sql_repl.cc Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_repl.cc Thu Dec 04 21:46:15 2008 -0800
+@@ -20,11 +20,19 @@
+ #include "log_event.h"
+ #include <my_dir.h>
+
++extern pthread_mutex_t LOCK_failover_master;
++extern bool failover_deny_access;
++
+ int max_binlog_dump_events = 0; // unlimited
+ my_bool opt_sporadic_binlog_dump_fail = 0;
+ #ifndef DBUG_OFF
+ static int binlog_dump_count = 0;
+ #endif
++
++static int make_master_open_log(MYSQL_LOG *log, const char *opt_name,
++ bool no_auto_events, ulong max_size);
++static int set_in_failover(bool kill_session);
++static void clear_in_failover(void);
+
+ /*
+ fake_rotate_event() builds a fake (=which does not exist physically in any
+@@ -255,7 +263,7 @@
+ bool purge_master_logs(THD* thd, const char* to_log)
+ {
+ char search_file_name[FN_REFLEN];
+- if (!mysql_bin_log.is_open())
++ if (!mysql_bin_log.is_log_open())
+ {
+ send_ok(thd);
+ return FALSE;
+@@ -308,6 +316,44 @@
+ return error;
+ }
+
++/* Publish a thread's binlog dump state so "show processlist" can show it.
++ *
++ * Input:
++ *   output_info   - (OUT) buffer receiving "input_msg :file:pos:"
++ *   output_len    - (IN)  size of the output_info buffer
++ *   thd           - (IN)  the thread
++ *   input_msg     - (IN)  the input proc_info
++ *   log_file_name - (IN)  binlog file name (directory part is dropped)
++ *   log_pos       - (IN)  binlog position
++ */
++static void processlist_show_binlog_state(char *output_info,
++                                          int output_len,
++                                          THD *thd,
++                                          const char *input_msg,
++                                          const char *log_file_name,
++                                          my_off_t log_pos) {
++  DBUG_ENTER("processlist_show_binlog_state");
++
++  /* Point to input_msg first in case "show processlist" reads proc_info
++   * before the formatted copy below is finished.
++   */
++  thd_proc_info(thd, input_msg);
++
++  if (snprintf(output_info, output_len, "%s :%s:%lld:", input_msg,
++               log_file_name + dirname_length(log_file_name),
++               (long long) log_pos) > 0) {  /* cast so the argument matches %lld */
++    thd_proc_info(thd, output_info);
++  }
++
++  DBUG_VOID_RETURN;
++}
++
++static void repl_cleanup(ushort flags) {
++  /* Only mirror binlog clients are counted, so others have nothing to undo. */
++  if (!(flags & BINLOG_MIRROR_CLIENT))
++    return;
++  thread_safe_sub(rpl_mirror_binlog_clients, 1, &LOCK_stats);
++}
+
+ /*
+ TODO: Clean up loop to only have one call to send_file()
+@@ -319,6 +365,11 @@
+ LOG_INFO linfo;
+ char *log_file_name = linfo.log_file_name;
+ char search_file_name[FN_REFLEN], *name;
++
++ /* This buffer should be enough for "comments + :file_name:file_pos:". */
++ char binlog_state_msg[FN_REFLEN + 100];
++ int binlog_state_msg_len = FN_REFLEN + 100;
++
+ IO_CACHE log;
+ File file = -1;
+ String* packet = &thd->packet;
+@@ -335,6 +386,15 @@
+
+ bzero((char*) &log,sizeof(log));
+
++ sql_print_information("Start %s binlog_dump to slave_server(%d), pos(%s, %lu)",
++ "asynchronous",
++ thd->server_id, log_ident, (ulong)pos);
++
++ if (flags & BINLOG_MIRROR_CLIENT) {
++ /* One more mirror binlog client. */
++ thread_safe_increment(rpl_mirror_binlog_clients, &LOCK_stats);
++ }
++
+ #ifndef DBUG_OFF
+ if (opt_sporadic_binlog_dump_fail && (binlog_dump_count++ % 2))
+ {
+@@ -344,7 +404,7 @@
+ }
+ #endif
+
+- if (!mysql_bin_log.is_open())
++ if (!mysql_bin_log.is_log_open())
+ {
+ errmsg = "Binary log is not open";
+ my_errno= ER_MASTER_FATAL_ERROR_READING_BINLOG;
+@@ -529,6 +589,12 @@
+ }
+ #endif
+
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(
++ binlog_state_msg, binlog_state_msg_len, thd,
++ "Send binlog events to slave",
++ log_file_name, pos);
++
+ if ((*packet)[EVENT_TYPE_OFFSET+1] == FORMAT_DESCRIPTION_EVENT)
+ {
+ binlog_can_be_corrupted= test((*packet)[FLAGS_OFFSET+1] &
+@@ -634,6 +700,13 @@
+ }
+ if (!thd->killed)
+ {
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(
++ binlog_state_msg, binlog_state_msg_len, thd,
++ "Has sent all binlog to slave; "
++ "waiting for binlog to be updated",
++ log_file_name, pos);
++
+ /* Note that the following call unlocks lock_log */
+ mysql_bin_log.wait_for_update(thd, 0);
+ }
+@@ -650,7 +723,12 @@
+
+ if (read_packet)
+ {
+- thd_proc_info(thd, "Sending binlog event to slave");
++ // thd_proc_info(thd, "Sending binlog event to slave");
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(binlog_state_msg,
++ binlog_state_msg_len, thd,
++ "Sending binlog event to slave",
++ log_file_name, pos);
+ if (my_net_write(net, (char*)packet->ptr(), packet->length()) )
+ {
+ errmsg = "Failed on my_net_write()";
+@@ -685,10 +763,21 @@
+ }
+ else
+ {
++ char old_log_file_name[FN_REFLEN];
+ bool loop_breaker = 0;
+ /* need this to break out of the for loop from switch */
+
+- thd_proc_info(thd, "Finished reading one binlog; switching to next binlog");
++ // thd_proc_info(thd, "Finished reading one binlog; switching to next binlog");
++ /* Update the binlog sending state. */
++ processlist_show_binlog_state(
++ binlog_state_msg, binlog_state_msg_len, thd,
++ "Finished reading one binlog; switching to next binlog",
++ log_file_name, pos);
++
++ /* Keep the old filename. */
++ strmake(old_log_file_name, log_file_name,
++ sizeof(old_log_file_name) - 1);
++
+ switch (mysql_bin_log.find_next_log(&linfo, 1)) {
+ case LOG_INFO_EOF:
+ loop_breaker = (flags & BINLOG_DUMP_NON_BLOCK);
+@@ -706,6 +795,16 @@
+
+ end_io_cache(&log);
+ (void) my_close(file, MYF(MY_WME));
++
++ /* A sanity check that we can not serve the same binlog twice because
++ * the filenames are stored in a .index file.
++ */
++ if (strcmp(old_log_file_name, log_file_name) >= 0) {
++ errmsg = "Re-serving an already served binlog file.";
++ my_errno = ER_MASTER_FATAL_ERROR_READING_BINLOG;
++ goto err;
++ }
++
+
+ /*
+ Call fake_rotate_event() in case the previous log (the one which
+@@ -733,6 +832,8 @@
+ end_io_cache(&log);
+ (void)my_close(file, MYF(MY_WME));
+
++ repl_cleanup(flags);
++
+ send_eof(thd);
+ thd_proc_info(thd, "Waiting to finalize termination");
+ pthread_mutex_lock(&LOCK_thread_count);
+@@ -743,6 +844,7 @@
+ err:
+ thd_proc_info(thd, "Waiting to finalize termination");
+ end_io_cache(&log);
++ repl_cleanup(flags);
+ /*
+ Exclude iteration through thread list
+ this is needed for purge_logs() - it will iterate through
+@@ -1316,7 +1418,7 @@
+ Format_description_log_event *description_event= new
+ Format_description_log_event(3); /* MySQL 4.0 by default */
+
+- if (mysql_bin_log.is_open())
++ if (mysql_bin_log.is_log_open())
+ {
+ LEX_MASTER_INFO *lex_mi= &thd->lex->mi;
+ SELECT_LEX_UNIT *unit= &thd->lex->unit;
+@@ -1456,7 +1558,7 @@
+ DBUG_RETURN(TRUE);
+ protocol->prepare_for_resend();
+
+- if (mysql_bin_log.is_open())
++ if (mysql_bin_log.is_log_open())
+ {
+ LOG_INFO li;
+ mysql_bin_log.get_current_log(&li);
+@@ -1497,7 +1599,7 @@
+ Protocol *protocol= thd->protocol;
+ DBUG_ENTER("show_binlogs");
+
+- if (!mysql_bin_log.is_open())
++ if (!mysql_bin_log.is_log_open())
+ {
+ my_message(ER_NO_BINARY_LOGGING, ER(ER_NO_BINARY_LOGGING), MYF(0));
+ return 1;
+@@ -1606,6 +1708,235 @@
+ DBUG_RETURN(0);
+ }
+
++
++/* make_master: Make the current database a primary and start the
++ * binlog logging for all updates.
++ *
++ * The function handles the following sql commands:
++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1,
++ * [WITH BINLOG];
++ * . MAKE MASTER MASTER_LOG_FILE='replication_log', MASTER_SERVER_ID=1,
++ * INDEX='replication_log.index' [WITH BINLOG];
++ * . MAKE MASTER REVOKE SESSION;
++ * . MAKE MASTER REVOKE SESSION WITH KILL;
++ * . MAKE MASTER GRANT SESSION;
++ *
++ * Args:
++ * thd - the current thread
++ * binlog_name - binlog's filename
++ * binlog_indexname - binlog index's filename
++ * mi - master info struct containing binlog name
++ * (set when we enable master during runtime)
++ *
++ * Return:
++ * 0 : success
++ * -1 : failure
++ */
++int make_master(THD* thd,
++ const char *binlog_name,
++ const char *binlog_indexname,
++ const LEX_MASTER_INFO* mi) {
++ int error = 0;
++
++ DBUG_ENTER("make_master");
++ /* The binlog is enabled in two cases:
++ * . !mi - LEX is not provided; this is called from startup time
++ * . mi->log_file_name - binlog is specified in the command
++ */
++ if (!mi || mi->log_file_name) {
++ /* Get the mutex */
++ VOID(pthread_mutex_lock(&LOCK_failover_master));
++
++ /* If the binlog is already opened, we issue an error. We reuse one
++ * existing error, which might not be fully accurate.
++ */
++ if (mysql_bin_log.is_log_open()) {
++ my_error(ER_MASTER_INFO, MYF(0));
++ sql_print_error("Replication master log is already open: cannot "
++ "make another master!");
++ error = -1;
++ } else {
++ if (!mi) {
++ /* This opening happens at mysql startup time. */
++ if (make_master_open_log(&mysql_bin_log, binlog_name,
++ 0, max_binlog_size) != 0) {
++ error = -1;
++ }
++ } else {
++ /* This opening happens during mysql runtime, which is mostly
++ * requested to do failover.
++ */
++
++ error = -1;
++ if (!is_in_failover()) {
++ sql_print_error(
++ "\"make master\" runs only in failover mode. "
++ "Please run \"make master revoke session (with kill)\"");
++ } else if (strlen(mi->log_file_name) == 0) {
++ sql_print_error("Master log filename is not specified correctly.");
++ } else if (!mi->server_id || mi->server_id == MASTER_INFO_SERVER_ID) {
++ sql_print_error("\"make master\": invalid server_id(%d)",
++ mi->server_id);
++ } else {
++ /* Open the new log files and delete all existing ones to avoid
++ * conflicts.
++ */
++ uint32 old_server_id = server_id;
++ char *binlog_name = NULL;
++ /* NOTE(review): shadows the parameter; the my_strdup() copy made below is never freed in this function - confirm ownership. */
++ /* Set the global master server id.
++ * We would not change server id for all connection threads.
++ * All non-super sessions should be blocked by revoke sessions.
++ * Super-user sessions are responsible for their own operations.
++ */
++ server_id = mi->server_id;
++ thd->server_id = mi->server_id;
++
++ if (!(binlog_name = my_strdup(mi->log_file_name, MYF(0))) ||
++ make_master_open_index(&binlog_name, mi->log_index_name) != 0 ||
++ make_master_open_log(&mysql_bin_log, binlog_name,
++ 0, max_binlog_size) != 0) {
++ sql_print_error("Open master logfile failed.");
++ thd->server_id = old_server_id;
++ server_id = old_server_id;
++ } else if (!mi->with_old_binlog &&
++ mysql_bin_log.reset_logs(thd) != 0) {
++ sql_print_error("Cleanup existing master logfiles failed.");
++ thd->server_id = old_server_id;
++ server_id = old_server_id;
++ } else {
++ error = 0;
++ }
++ }
++ if (error == -1)
++ my_error(ER_MASTER_INFO, MYF(0));
++ }
++ }
++
++ if (error == 0) {
++ /* indicates that binlog is enabled now */
++ using_update_log = 1;
++ } else if (mysql_bin_log.is_open()) { /* NOTE(review): also reached after the "already open" error above - confirm closing is intended there */
++ mysql_bin_log.close(LOG_CLOSE_INDEX);
++ }
++
++ /* Release the mutex */
++ VOID(pthread_mutex_unlock(&LOCK_failover_master));
++ } else {
++ /* The following actions are related to session management during
++ * failover operation. We do not want some sessions to come in
++ * during failover and make updates.
++ * This is invoked for command: MAKE MASTER GRANT/REVOKE SESSION;
++ */
++ if (mi->in_failover) {
++ set_in_failover(mi->kill_session);
++ } else {
++ clear_in_failover();
++ }
++ }
++
++ DBUG_RETURN(error);
++}
++
++static int make_master_open_log(MYSQL_LOG *log,
++ const char *opt_name,
++ bool no_auto_events,
++ ulong max_size) {
++ char tmp[FN_REFLEN];
++ /* Open 'log' as a LOG_BIN log named after opt_name without its extension. */
++ // get rid of extension; strmake() adds a NUL at tmp[len], so cap at size-1
++ char *p = fn_ext(opt_name);
++ uint length=(uint) (p-opt_name);
++ strmake(tmp,opt_name,min(length,FN_REFLEN - 1));
++ opt_name=tmp;
++ /* NOTE(review): the no_auto_events parameter is unused in this function. */
++ return log->open(opt_name, LOG_BIN, NULL, WRITE_CACHE, 0,
++ max_size, 0);
++}
++
++int make_master_open_index(char **binlog_name, /* in/out: replaced when a name is generated */
++ const char *binlog_indexname) {
++ char buf[FN_REFLEN];
++ const char *ln;
++ DBUG_ENTER("make_master_open_index");
++
++ ln= mysql_bin_log.generate_name(*binlog_name, "-bin", 1, buf);
++ if (!(*binlog_name) && !binlog_indexname) {
++ /*
++ User didn't give us info to name the binlog index file.
++ Picking `hostname`-bin.index like did in 4.x, causes replication to
++ fail if the hostname is changed later. So, we would like to instead
++ require a name. But as we don't want to break many existing setups, we
++ only give warning, not error.
++ */
++ sql_print_warning("No argument was provided to --log-bin, and "
++ "--log-bin-index was not used; so replication "
++ "may break when this MySQL server acts as a "
++ "master and has his hostname changed!! Please "
++ "use '--log-bin=%s' to avoid this problem.", ln);
++ }
++ if (ln == buf) {
++ my_free(*binlog_name, MYF(MY_ALLOW_ZERO_PTR));
++ if (!(*binlog_name = my_strdup(buf, MYF(0)))) { DBUG_RETURN(-1); } /* out of memory: fail instead of handing back a NULL name */
++ }
++ if (mysql_bin_log.open_index_file(binlog_indexname, ln) != 0) {
++ DBUG_RETURN(-1);
++ }
++
++ /*
++ Used to specify which type of lock we need to use for queries of type
++ INSERT ... SELECT. This will change when we have row level logging.
++ */
++ using_update_log=1;
++
++ DBUG_RETURN(0);
++}
++
++/* Set the status indicating that we are in failover and deny all non-super
++ * user access.
++ *
++ * Args:
++ * kill_session - kill all non-super sessions if specified
++ *
++ * Return:
++ * 0 - always; the outcome of the kill requests is not checked, so no
++ * failure is reported
++ */
++static int set_in_failover(bool kill_session) {
++ failover_deny_access = 1;
++
++ if (kill_session) {
++ /* If kill session option is specified, we need to kill all non-super
++ * user sessions.
++ */
++ THD *kill_thd;
++
++ /* LOCK_thread_count is held for the whole walk over the thread list. */
++ pthread_mutex_lock(&LOCK_thread_count); // For unlink from list
++ I_List_iterator<THD> it(threads);
++ while ((kill_thd=it++)) {
++ if (!(kill_thd->main_security_ctx.master_access & SUPER_ACL)) {
++ pthread_mutex_lock(&kill_thd->LOCK_delete); // Lock from delete
++
++ /* ask the thread to die */
++ kill_thd->awake(THD::KILL_CONNECTION);
++ pthread_mutex_unlock(&kill_thd->LOCK_delete);
++ }
++ }
++ pthread_mutex_unlock(&LOCK_thread_count);
++ }
++ return 0;
++}
++
++static void clear_in_failover(void) { /* resets the failover_deny_access flag to 0 */
++ failover_deny_access = 0;
++}
++
++bool is_in_failover(void) { /* reports the current value of failover_deny_access */
++ return failover_deny_access;
++}
++
++
+ #endif /* HAVE_REPLICATION */
+
+
+diff -r 66cc9e0a6768 sql/sql_repl.h
+--- a/sql/sql_repl.h Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_repl.h Thu Dec 04 21:46:15 2008 -0800
+@@ -38,6 +38,10 @@
+ int start_slave(THD* thd, MASTER_INFO* mi, bool net_report);
+ int stop_slave(THD* thd, MASTER_INFO* mi, bool net_report);
+ bool change_master(THD* thd, MASTER_INFO* mi);
++int make_master(THD* thd, const char *binlog_name,
++ const char *binlog_indexname, const LEX_MASTER_INFO* mi);
++int make_master_open_index(char **binlog_name, const char *binlog_indexname);
++bool is_in_failover(void);
+ bool mysql_show_binlog_events(THD* thd);
+ int cmp_master_pos(const char* log_file_name1, ulonglong log_pos1,
+ const char* log_file_name2, ulonglong log_pos2);
+diff -r 66cc9e0a6768 sql/sql_yacc.yy
+--- a/sql/sql_yacc.yy Thu Dec 04 21:37:12 2008 -0800
++++ b/sql/sql_yacc.yy Thu Dec 04 21:46:15 2008 -0800
+@@ -735,6 +735,7 @@
+ %token LOOP_SYM
+ %token LOW_PRIORITY
+ %token LT
++%token MAKE_SYM
+ %token MAKE_SET_SYM
+ %token MASTER_CONNECT_RETRY_SYM
+ %token MASTER_HOST_SYM
+@@ -1167,7 +1168,7 @@
+ query verb_clause create change select do drop insert replace insert2
+ insert_values update delete truncate rename
+ show describe load alter optimize keycache preload flush
+- reset purge begin commit rollback savepoint release
++ make reset purge begin commit rollback savepoint release
+ slave master_def master_defs master_file_def slave_until_opts
+ repair restore backup analyze check start checksum
+ field_list field_list_item field_spec kill column_def key_def
+@@ -1301,6 +1302,7 @@
+ | kill
+ | load
+ | lock
++ | make
+ | optimize
+ | keycache
+ | preload
+@@ -1428,6 +1430,56 @@
+ master_defs
+ {}
+ ;
++
++/* make master */
++make:
++ MAKE_SYM MASTER_SYM
++ {
++ LEX *lex = Lex;
++ lex->sql_command = SQLCOM_MAKE_MASTER;
++ bzero((char*) &lex->mi, sizeof(lex->mi));
++ }
++ make_master_defs
++ {
++ }
++ ;
++
++make_master_defs:
++ MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num
++ {
++ Lex->mi.log_file_name = $3.str;
++ Lex->mi.server_id = $7;
++ }
++ make_master_with_defs {}
++ | MASTER_LOG_FILE_SYM EQ TEXT_STRING ',' MASTER_SERVER_ID_SYM EQ ulong_num ',' INDEX_SYM EQ TEXT_STRING
++ {
++ Lex->mi.log_file_name = $3.str;
++ Lex->mi.server_id = $7;
++ Lex->mi.log_index_name = $11.str;
++ }
++ make_master_with_defs {}
++ | GRANT SESSION_SYM
++ {
++ Lex->mi.in_failover = 0;
++ }
++ | REVOKE SESSION_SYM
++ {
++ Lex->mi.in_failover = 1;
++ }
++ | REVOKE SESSION_SYM WITH KILL_SYM
++ {
++ Lex->mi.in_failover = 1;
++ Lex->mi.kill_session = 1;
++ }
++ ;
++
++make_master_with_defs:
++ /* empty */ {}
++ | WITH BINLOG_SYM
++ {
++ /* All old binlogs will be kept after "make master" command. */
++ Lex->mi.with_old_binlog = 1;
++ }
+
+ master_defs:
+ master_def
+@@ -8396,6 +8448,7 @@
+ | HANDLER_SYM {}
+ | HELP_SYM {}
+ | LANGUAGE_SYM {}
++ | MAKE_SYM {}
+ | NO_SYM {}
+ | OPEN_SYM {}
+ | PREPARE_SYM {}
diff --git a/percona/5.0.91-b22-20100522/mysql-test.patch b/percona/5.0.91-b22-20100522/mysql-test.patch
new file mode 100644
index 0000000..00e0eb9
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/mysql-test.patch
@@ -0,0 +1,140 @@
+--- a/mysql-test/r/information_schema.result 2009-05-07 19:31:26.000000000 +0000
++++ b/mysql-test/r/information_schema.result 2009-05-07 19:32:59.000000000 +0000
+@@ -60,6 +60,7 @@
+ USER_STATISTICS
+ VIEWS
+ INNODB_IO_PATTERN
++INNODB_RSEG
+ columns_priv
+ db
+ func
+@@ -743,7 +744,7 @@
+ CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1;
+ CREATE VIEW a2 AS SELECT t_CRASHME FROM a1;
+ count(*)
+-109
++110
+ drop view a2, a1;
+ drop table t_crashme;
+ select table_schema,table_name, column_name from
+@@ -819,7 +820,7 @@
+ flush privileges;
+ SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA;
+ table_schema count(*)
+-information_schema 24
++information_schema 25
+ mysql 17
+ create table t1 (i int, j int);
+ create trigger trg1 before insert on t1 for each row
+@@ -1228,6 +1229,7 @@
+ USER_STATISTICS USER
+ VIEWS TABLE_SCHEMA
+ INNODB_IO_PATTERN SPACE
++INNODB_RSEG RSEG_ID
+ SELECT t.table_name, c1.column_name
+ FROM information_schema.tables t
+ INNER JOIN
+@@ -1267,6 +1269,7 @@
+ USER_STATISTICS USER
+ VIEWS TABLE_SCHEMA
+ INNODB_IO_PATTERN SPACE
++INNODB_RSEG RSEG_ID
+ SELECT MAX(table_name) FROM information_schema.tables;
+ MAX(table_name)
+ VIEWS
+@@ -1342,6 +1345,7 @@
+ INDEX_STATISTICS information_schema.INDEX_STATISTICS 1
+ INNODB_BUFFER_POOL_CONTENT information_schema.INNODB_BUFFER_POOL_CONTENT 1
+ INNODB_IO_PATTERN information_schema.INNODB_IO_PATTERN 1
++INNODB_RSEG information_schema.INNODB_RSEG 1
+ KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
+ PROCESSLIST information_schema.PROCESSLIST 1
+ PROFILING information_schema.PROFILING 1
+--- a/mysql-test/r/information_schema_db.result 2009-05-07 19:31:27.000000000 +0000
++++ b/mysql-test/r/information_schema_db.result 2009-05-07 19:35:01.000000000 +0000
+@@ -29,6 +29,7 @@
+ USER_STATISTICS
+ VIEWS
+ INNODB_IO_PATTERN
++INNODB_RSEG
+ show tables from INFORMATION_SCHEMA like 'T%';
+ Tables_in_information_schema (T%)
+ TABLES
+--- a/mysql-test/r/mysqlshow.result 2009-05-07 19:31:26.000000000 +0000
++++ b/mysql-test/r/mysqlshow.result 2009-05-07 19:36:32.000000000 +0000
+@@ -103,6 +103,7 @@
+ | USER_STATISTICS |
+ | VIEWS |
+ | INNODB_IO_PATTERN |
++| INNODB_RSEG |
+ +---------------------------------------+
+ Database: INFORMATION_SCHEMA
+ +---------------------------------------+
+@@ -132,6 +133,7 @@
+ | USER_STATISTICS |
+ | VIEWS |
+ | INNODB_IO_PATTERN |
++| INNODB_RSEG |
+ +---------------------------------------+
+ Wildcard: inf_rmation_schema
+ +--------------------+
+--- a/mysql-test/r/profiling.result 2009-05-28 19:39:42.000000000 +0000
++++ b/mysql-test/r/profiling.result 2009-05-28 19:40:14.000000000 +0000
+@@ -6,6 +6,8 @@
+ Variable_name Value
+ profiling OFF
+ profiling_history_size 15
++profiling_server OFF
++profiling_use_getrusage OFF
+ select @@profiling;
+ @@profiling
+ 0
+@@ -16,12 +18,16 @@
+ Variable_name Value
+ profiling OFF
+ profiling_history_size 100
++profiling_server OFF
++profiling_use_getrusage OFF
+ set session profiling = ON;
+ set session profiling_history_size=30;
+ show session variables like 'profil%';
+ Variable_name Value
+ profiling ON
+ profiling_history_size 30
++profiling_server OFF
++profiling_use_getrusage OFF
+ select @@profiling;
+ @@profiling
+ 1
+--- a/mysql-test/r/mysql.result 2010-02-19 23:59:36.000000000 -0500
++++ b/mysql-test/r/mysql.result 2010-02-19 23:58:50.000000000 -0500
+@@ -162,8 +162,8 @@
+ ERROR 1049 (42000) at line 1: Unknown database 'invalid'
+ Test connect with dbname + hostname
+ Test connect with dbname + _invalid_ hostname
+-ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'invalid_hostname' (errno)
+-ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'invalid_hostname' (errno)
++ERROR 2003 (HY000) at line 1: Can't connect to MySQL server on 'invalid_hostname' (errno)
++ERROR 2003 (HY000) at line 1: Can't connect to MySQL server on 'invalid_hostname' (errno)
+ The commands reported in the bug report
+ ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'cyril has found a bug :)XXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXXX' (errno)
+ Too long dbname
+@@ -198,6 +198,6 @@
+ 1
+ COUNT (*)
+ 1
+-ERROR 2005 (HY000) at line 1: Unknown MySQL server host 'invalid_hostname' (errno)
++ERROR 2003 (HY000) at line 1: Can't connect to MySQL server on 'invalid_hostname' (errno)
+ <TABLE BORDER=1><TR><TH>&lt;</TH></TR><TR><TD>&lt; &amp; &gt;</TD></TR></TABLE>
+ End of 5.0 tests
+--- a/mysql-test/r/mysql_upgrade.result 2010-02-19 23:58:16.000000000 -0500
++++ b/mysql-test/r/mysql_upgrade.result 2010-02-20 00:01:34.000000000 -0500
+@@ -58,7 +58,7 @@
+ mysql.user OK
+ DROP USER mysqltest1@'%';
+ Run mysql_upgrade with a non existing server socket
+-mysqlcheck: Got error: 2005: Unknown MySQL server host 'not_existing_host' (errno) when trying to connect
++mysqlcheck: Got error: 2003: Can't connect to MySQL server on 'not_existing_host' (errno) when trying to connect
+ FATAL ERROR: Upgrade failed
+ set GLOBAL sql_mode='STRICT_ALL_TABLES,ANSI_QUOTES,NO_ZERO_DATE';
+ mysql.columns_priv OK
diff --git a/percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch b/percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch
new file mode 100644
index 0000000..a493a29
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/mysqld_safe_syslog.patch
@@ -0,0 +1,127 @@
+diff -r d91edeb58b50 patch_info/mysqld_safe_syslog.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/mysqld_safe_syslog.info Mon Sep 01 21:58:00 2008 -0700
+@@ -0,0 +1,6 @@
++File=mysqld_safe_syslog.patch
++Name=Patch allows redirect output of error.log to syslog-ng
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=Ported from Debian
+diff -r d91edeb58b50 scripts/mysqld_safe.sh
+--- a/scripts/mysqld_safe.sh Mon Sep 01 21:57:21 2008 -0700
++++ b/scripts/mysqld_safe.sh Mon Sep 01 21:58:00 2008 -0700
+@@ -10,12 +10,16 @@
+ # mysql.server works by first doing a cd to the base directory and from there
+ # executing mysqld_safe
+
+-KILL_MYSQLD=1;
+ MYSQLD=
+
+ trap '' 1 2 3 15 # we shouldn't let anyone kill us
+
+ umask 007
++
++KILL_MYSQLD=1;
++
++# This command can be used as pipe to syslog. With "-s" it also logs to stderr.
++ERR_LOGGER="logger -p daemon.err -t mysqld_safe -i"
+
+ defaults=
+ case "$1" in
+@@ -177,7 +181,6 @@
+
+ # these rely on $DATADIR by default, so we'll set them later on
+ pid_file=
+-err_log=
+
+ # Get first arguments from the my.cnf file, groups [mysqld] and [mysqld_safe]
+ # and then merge with the command line arguments
+@@ -245,7 +248,6 @@
+ * ) pid_file="$DATADIR/$pid_file" ;;
+ esac
+ fi
+-test -z "$err_log" && err_log=$DATADIR/`@HOSTNAME@`.err
+
+ if test -n "$mysql_unix_port"
+ then
+@@ -315,8 +317,6 @@
+ then
+ USER_OPTION="--user=$user"
+ fi
+- # If we are root, change the err log to the right user.
+- touch $err_log; chown $user $err_log
+ if test -n "$open_files"
+ then
+ ulimit -n $open_files
+@@ -341,18 +341,16 @@
+ then
+ if @FIND_PROC@
+ then # The pid contains a mysqld process
+- echo "A mysqld process already exists"
+- echo "A mysqld process already exists at " `date` >> $err_log
++ echo "A mysqld process already exists" | $ERR_LOGGER -s
+ exit 1
+ fi
+ fi
+ rm -f $pid_file
+ if test -f $pid_file
+ then
+- echo "Fatal error: Can't remove the pid file: $pid_file"
+- echo "Fatal error: Can't remove the pid file: $pid_file at " `date` >> $err_log
+- echo "Please remove it manually and start $0 again"
+- echo "mysqld daemon not started"
++ echo "Fatal error: Can't remove the pid file: $pid_file" | $ERR_LOGGER -s
++ echo "Please remove it manually and start $0 again" | $ERR_LOGGER -s
++ echo "mysqld daemon not started" | $ERR_LOGGER -s
+ exit 1
+ fi
+ fi
+@@ -377,15 +375,15 @@
+ # ulimit -n 256 > /dev/null 2>&1 # Fix for BSD and FreeBSD systems
+ #fi
+
+-echo "`date +'%y%m%d %H:%M:%S mysqld started'`" >> $err_log
++echo "started" | $ERR_LOGGER -s
+ while true
+ do
+ rm -f $safe_mysql_unix_port $pid_file # Some extra safety
+ if test -z "$args"
+ then
+- $NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ >> $err_log 2>&1
++ $NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ 2>&1 | $ERR_LOGGER -t mysqld
+ else
+- eval "$NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ $args >> $err_log 2>&1"
++ eval "$NOHUP_NICENESS $ledir/$MYSQLD $defaults --basedir=$MY_BASEDIR_VERSION --datadir=$DATADIR $USER_OPTION --pid-file=$pid_file @MYSQLD_DEFAULT_SWITCHES@ $args 2>&1 | $ERR_LOGGER -t mysqld"
+ fi
+ if test ! -f $pid_file # This is removed if normal shutdown
+ then
+@@ -402,7 +400,7 @@
+ # kill -9 is used or the process won't react on the kill.
+ numofproces=`ps xaww | grep -v "grep" | grep "$ledir/$MYSQLD\>" | grep -c "pid-file=$pid_file"`
+
+- echo -e "\nNumber of processes running now: $numofproces" | tee -a $err_log
++ echo -e "\nNumber of processes running now: $numofproces" | $ERR_LOGGER -s
+ I=1
+ while test "$I" -le "$numofproces"
+ do
+@@ -415,16 +413,14 @@
+ # echo "TEST $I - $T **"
+ if kill -9 $T
+ then
+- echo "$MYSQLD process hanging, pid $T - killed" | tee -a $err_log
++ echo "$MYSQLD process hanging, pid $T - killed" | $ERR_LOGGER -s
+ else
+ break
+ fi
+ I=`expr $I + 1`
+ done
+ fi
+- echo "`date +'%y%m%d %H:%M:%S'` mysqld restarted" | tee -a $err_log
++ echo "restarted" | $ERR_LOGGER -s
+ done
+
+-echo "`date +'%y%m%d %H:%M:%S'` mysqld ended" | tee -a $err_log
+-echo "" | tee -a $err_log
+-
++echo "ended" | $ERR_LOGGER -s
diff --git a/percona/5.0.91-b22-20100522/profiling_slow.patch b/percona/5.0.91-b22-20100522/profiling_slow.patch
new file mode 100644
index 0000000..78d35a0
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/profiling_slow.patch
@@ -0,0 +1,271 @@
+diff -r 4636d2e0b0d0 patch_info/profiling_slow.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/profiling_slow.info Fri Jul 03 15:40:29 2009 -0700
+@@ -0,0 +1,9 @@
++File=profiling_slow.info
++Name=profiling from SHOW PROFILE to slow.log
++Version=1.0
++Author=Percona <info@percona.com>
++License=GPL
++Comment=
++Changelog
++2009-05-18
++Initial implementation
+diff -r 4636d2e0b0d0 sql/log.cc
+--- a/sql/log.cc Fri Jul 03 15:40:20 2009 -0700
++++ b/sql/log.cc Fri Jul 03 15:40:29 2009 -0700
+@@ -2402,6 +2402,11 @@
+ tmp_errno=errno;
+ }
+ }
++
++#if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER)
++ thd->profiling.print_current(&log_file);
++#endif
++
+ if (thd->db && strcmp(thd->db,db))
+ { // Database changed
+ if (my_b_printf(&log_file,"use %s;\n",thd->db) == (uint) -1)
+diff -r 4636d2e0b0d0 sql/mysqld.cc
+--- a/sql/mysqld.cc Fri Jul 03 15:40:20 2009 -0700
++++ b/sql/mysqld.cc Fri Jul 03 15:40:29 2009 -0700
+@@ -5052,6 +5052,8 @@
+ OPT_PORT_OPEN_TIMEOUT,
+ OPT_MERGE,
+ OPT_PROFILING,
++ OPT_PROFILING_SERVER,
++ OPT_PROFILING_USE_GETRUSAGE,
+ OPT_SLOW_LOG,
+ OPT_SLOW_QUERY_LOG_FILE,
+ OPT_USE_GLOBAL_LONG_QUERY_TIME,
+@@ -5675,6 +5677,16 @@
+ (gptr*) &global_system_variables.profiling_history_size,
+ (gptr*) &max_system_variables.profiling_history_size,
+ 0, GET_ULONG, REQUIRED_ARG, 15, 0, 100, 0, 0, 0},
++ {"profiling_server", OPT_PROFILING_SERVER,
++ "Enable profiling of all threads",
++ (gptr*) &global_system_variables.profiling_server,
++ (gptr*) &max_system_variables.profiling_server, 0, GET_BOOL,
++ OPT_ARG, 0, 0, 0, 0, 0, 0 },
++ {"profiling_use_getrusage", OPT_PROFILING_USE_GETRUSAGE,
++ "Enable getrusage function call for profiling",
++ (gptr*) &global_system_variables.profiling_use_getrusage,
++ (gptr*) &max_system_variables.profiling_use_getrusage, 0, GET_BOOL,
++ OPT_ARG, 0, 0, 0, 0, 0, 0 },
+ #endif
+ {"relay-log", OPT_RELAY_LOG,
+ "The location and name to use for relay logs.",
+diff -r 4636d2e0b0d0 sql/set_var.cc
+--- a/sql/set_var.cc Fri Jul 03 15:40:20 2009 -0700
++++ b/sql/set_var.cc Fri Jul 03 15:40:29 2009 -0700
+@@ -592,6 +592,10 @@
+ ulonglong(OPTION_PROFILING));
+ static sys_var_thd_ulong sys_profiling_history_size("profiling_history_size",
+ &SV::profiling_history_size);
++static sys_var_thd_bool sys_profiling_server("profiling_server",
++ &SV::profiling_server);
++static sys_var_thd_bool sys_profiling_use_getrusage("profiling_use_getrusage",
++ &SV::profiling_use_getrusage);
+ #endif
+
+ /* Local state variables */
+@@ -764,6 +768,8 @@
+ #if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER)
+ &sys_profiling,
+ &sys_profiling_history_size,
++ &sys_profiling_server,
++ &sys_profiling_use_getrusage,
+ #endif
+ &sys_pseudo_thread_id,
+ &sys_query_alloc_block_size,
+@@ -1094,6 +1100,8 @@
+ #if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER)
+ {sys_profiling.name, (char*) &sys_profiling, SHOW_SYS},
+ {sys_profiling_history_size.name, (char*) &sys_profiling_history_size, SHOW_SYS},
++ {sys_profiling_server.name, (char*) &sys_profiling_server, SHOW_SYS},
++ {sys_profiling_use_getrusage.name, (char*) &sys_profiling_use_getrusage, SHOW_SYS},
+ #endif
+ {"protocol_version", (char*) &protocol_version, SHOW_INT},
+ {sys_query_alloc_block_size.name, (char*) &sys_query_alloc_block_size,
+diff -r 4636d2e0b0d0 sql/sql_class.h
+--- a/sql/sql_class.h Fri Jul 03 15:40:20 2009 -0700
++++ b/sql/sql_class.h Fri Jul 03 15:40:29 2009 -0700
+@@ -550,6 +550,8 @@
+ ulong optimizer_search_depth;
+ ulong preload_buff_size;
+ ulong profiling_history_size;
++ my_bool profiling_server;
++ my_bool profiling_use_getrusage;
+ ulong query_cache_type;
+ ulong log_slow_rate_limit;
+ ulong read_buff_size;
+diff -r 4636d2e0b0d0 sql/sql_profile.cc
+--- a/sql/sql_profile.cc Fri Jul 03 15:40:20 2009 -0700
++++ b/sql/sql_profile.cc Fri Jul 03 15:40:29 2009 -0700
+@@ -221,9 +221,22 @@
+ */
+ void PROF_MEASUREMENT::collect()
+ {
++ struct timespec tp;
+ time_usecs= (double) my_getsystime() / 10.0; /* 1 sec was 1e7, now is 1e6 */
+ #ifdef HAVE_GETRUSAGE
+- getrusage(RUSAGE_SELF, &rusage);
++ if ((profile->get_profiling())->enabled_getrusage())
++ getrusage(RUSAGE_SELF, &rusage);
++#endif
++
++#ifdef HAVE_CLOCK_GETTIME
++ if (!(clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ {
++ cpu_time_usecs= tp.tv_sec*1000000000.0 + tp.tv_nsec;
++ }
++ else
++ {
++ cpu_time_usecs= 0;
++ }
+ #endif
+ }
+
+@@ -341,7 +354,7 @@
+ finish_current_query();
+ }
+
+- enabled= (((thd)->options & OPTION_PROFILING) != 0);
++ enabled= (((thd)->options & OPTION_PROFILING) != 0) || ( thd->variables.profiling_server );
+
+ if (! enabled) DBUG_VOID_RETURN;
+
+@@ -379,7 +392,8 @@
+ status_change("ending", NULL, NULL, 0);
+
+ if ((enabled) && /* ON at start? */
+- ((thd->options & OPTION_PROFILING) != 0) && /* and ON at end? */
++ (((thd->options & OPTION_PROFILING) != 0) ||
++ (thd->variables.profiling_server)) && /* and ON at end? */
+ (current->query_source != NULL) &&
+ (! current->entries.is_empty()))
+ {
+@@ -480,6 +494,88 @@
+ DBUG_VOID_RETURN;
+ }
+
++bool PROFILING::enabled_getrusage()
++{ /* value of the owning thread's profiling_use_getrusage variable */
++ return thd->variables.profiling_use_getrusage;
++}
++
++/**
++ Print per-state elapsed and CPU durations of the current query to log_file.
++ Always returns 0; the results of the my_b_printf() calls are not checked.
++*/
++int PROFILING::print_current(IO_CACHE *log_file)
++{
++ DBUG_ENTER("PROFILING::print_current");
++ ulonglong row_number= 0;
++ char query_time_buff[22+7];
++ char query_cpu_time_buff[22+7];
++
++ QUERY_PROFILE *query;
++ /* Get current query */
++ if (current == NULL)
++ {
++ DBUG_RETURN(0);
++ }
++
++ query= current;
++
++ my_b_printf(log_file, "# PROFILE_VALUES ");
++
++ void *entry_iterator;
++ PROF_MEASUREMENT *entry= NULL, *previous= NULL, *first= NULL;
++ /* Go through all state-change steps; entry stays NULL if there are none. */
++ for (entry_iterator= query->entries.new_iterator();
++ entry_iterator != NULL;
++ entry_iterator= query->entries.iterator_next(entry_iterator),
++ previous=entry, row_number++)
++ {
++ entry= query->entries.iterator_value(entry_iterator);
++
++ /* Skip the first. We count spans of fence, not fence-posts. */
++ if (previous == NULL) {first= entry; continue;}
++
++ if (thd->lex->orig_sql_command == SQLCOM_SHOW_PROFILE)
++ {
++ /*
++ We got here via a SHOW command. That means that we stored
++ information about the query we wish to show and that isn't
++ in a WHERE clause at a higher level to filter out rows we
++ wish to exclude.
++
++ Because that functionality isn't available in the server yet,
++ we must filter here, at the wrong level. Once one can con-
++ struct where and having conditions at the SQL layer, then this
++ condition should be ripped out.
++ */
++ if (thd->lex->profile_query_id == 0) /* 0 == show final query */
++ {
++ if (query != last)
++ continue;
++ }
++ else
++ {
++ if (thd->lex->profile_query_id != query->profiling_query_id)
++ continue;
++ }
++ }
++
++ snprintf(query_time_buff, sizeof(query_time_buff), "%.6f", (entry->time_usecs-previous->time_usecs)/(1000.0*1000));
++ snprintf(query_cpu_time_buff, sizeof(query_cpu_time_buff), "%.6f", (entry->cpu_time_usecs-previous->cpu_time_usecs)/(1000.0*1000*1000));
++ my_b_printf(log_file, "%s: %s (cpu: %s), ", previous->status, query_time_buff, query_cpu_time_buff);
++
++ }
++
++ my_b_printf(log_file, "\n");
++ if ((entry != NULL) && (first != NULL))
++ {
++ snprintf(query_time_buff, sizeof(query_time_buff), "%.6f", (entry->time_usecs-first->time_usecs)/(1000.0*1000));
++ snprintf(query_cpu_time_buff, sizeof(query_cpu_time_buff), "%.6f", (entry->cpu_time_usecs-first->cpu_time_usecs)/(1000.0*1000*1000));
++ my_b_printf(log_file, "# PROFILE_TOTALS Total: %s (cpu: %s)\n", query_time_buff, query_cpu_time_buff);
++ }
++
++ DBUG_RETURN(0);
++}
++
+ /**
+ Fill the information schema table, "query_profile", as defined in show.cc .
+ There are two ways to get to this function: Selecting from the information
+diff -r 4636d2e0b0d0 sql/sql_profile.h
+--- a/sql/sql_profile.h Fri Jul 03 15:40:20 2009 -0700
++++ b/sql/sql_profile.h Fri Jul 03 15:40:29 2009 -0700
+@@ -193,6 +193,7 @@
+ unsigned int line;
+
+ double time_usecs;
++ double cpu_time_usecs;
+ char *allocated_status_memory;
+
+ void set_label(const char *status_arg, const char *function_arg,
+@@ -243,6 +244,11 @@
+
+ /* Show this profile. This is called by PROFILING. */
+ bool show(uint options);
++
++public:
++
++ inline PROFILING * get_profiling() { return profiling; };
++
+ };
+
+
+@@ -288,9 +294,11 @@
+
+ /* SHOW PROFILES */
+ bool show_profiles();
++ bool enabled_getrusage();
+
+ /* ... from INFORMATION_SCHEMA.PROFILING ... */
+ int fill_statistics_info(THD *thd, TABLE_LIST *tables, Item *cond);
++ int print_current(IO_CACHE *log_file);
+ };
+
+ # endif /* HAVE_PROFILING */
diff --git a/percona/5.0.91-b22-20100522/series b/percona/5.0.91-b22-20100522/series
new file mode 100644
index 0000000..0dcc631
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/series
@@ -0,0 +1,22 @@
+show_patches.patch
+microslow_innodb.patch
+profiling_slow.patch
+userstatv2.patch
+microsec_process.patch
+innodb_io_patches.patch
+mysqld_safe_syslog.patch
+innodb_locks_held.patch
+innodb_show_bp.patch
+innodb_check_fragmentation.patch
+innodb_io_pattern.patch
+innodb_fsync_source.patch
+innodb_show_hashed_memory.patch
+innodb_dict_size_limit.patch
+innodb_extra_rseg.patch
+innodb_thread_concurrency_timer_based.patch
+innodb_use_sys_malloc.patch
+innodb_recovery_patches.patch
+innodb_misc_patch.patch
+innodb_split_buf_pool_mutex.patch
+innodb_rw_lock.patch
+mysql-test.patch
diff --git a/percona/5.0.91-b22-20100522/show_patches.patch b/percona/5.0.91-b22-20100522/show_patches.patch
new file mode 100644
index 0000000..7f1d431
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/show_patches.patch
@@ -0,0 +1,288 @@
+diff -r c3e57b0c22c4 patch_info/show_patches.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/show_patches.info Mon Dec 22 00:25:06 2008 -0800
+@@ -0,0 +1,6 @@
++File=show_patches.patch
++Name=SHOW PATCHES
++Version=1.0
++Author=Jeremy Cole
++License=N/A
++Comment
+diff -r c3e57b0c22c4 sql/Makefile.am
+--- a/sql/Makefile.am Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/Makefile.am Mon Dec 22 00:25:06 2008 -0800
+@@ -118,7 +118,7 @@
+ -DSHAREDIR="\"$(MYSQLSHAREdir)\"" \
+ @DEFS@
+
+-BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h
++BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h patch_info.h
+ EXTRA_DIST = $(BUILT_SOURCES) nt_servc.cc nt_servc.h \
+ message.mc message.h message.rc MSG00001.bin \
+ examples/CMakeLists.txt CMakeLists.txt \
+@@ -175,6 +175,8 @@
+ udf_example_la_SOURCES= udf_example.c
+ udf_example_la_LDFLAGS= -module -rpath $(pkglibdir)
+
++patch_info.h: patch_info.h.pl
++ $(PERL) $< > $@
+
+ # Don't update the files from bitkeeper
+ %::SCCS/s.%
+diff -r c3e57b0c22c4 sql/Makefile.in
+--- a/sql/Makefile.in Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/Makefile.in Mon Dec 22 00:25:06 2008 -0800
+@@ -561,7 +561,7 @@
+ gen_lex_hash_LDADD = $(LDADD) $(CXXLDFLAGS)
+ mysql_tzinfo_to_sql_SOURCES = mysql_tzinfo_to_sql.cc
+ mysql_tzinfo_to_sql_LDADD = @MYSQLD_EXTRA_LDFLAGS@ $(LDADD) $(CXXLDFLAGS)
+-BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h
++BUILT_SOURCES = sql_yacc.cc sql_yacc.h lex_hash.h patch_info.h
+ EXTRA_DIST = $(BUILT_SOURCES) nt_servc.cc nt_servc.h \
+ message.mc message.h message.rc MSG00001.bin \
+ examples/CMakeLists.txt CMakeLists.txt \
+@@ -1237,6 +1237,9 @@
+ ./gen_lex_hash$(EXEEXT) > $@-t
+ $(MV) $@-t $@
+
++patch_info.h: patch_info.h.pl
++ $(PERL) $< > $@
++
+ # Don't update the files from bitkeeper
+ %::SCCS/s.%
+ # Tell versions [3.59,3.63) of GNU make to not export all variables.
+diff -r c3e57b0c22c4 sql/lex.h
+--- a/sql/lex.h Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/lex.h Mon Dec 22 00:25:06 2008 -0800
+@@ -367,6 +367,7 @@
+ { "PACK_KEYS", SYM(PACK_KEYS_SYM)},
+ { "PARTIAL", SYM(PARTIAL)},
+ { "PASSWORD", SYM(PASSWORD)},
++ { "PATCHES", SYM(PATCHES)},
+ { "PHASE", SYM(PHASE_SYM)},
+ { "POINT", SYM(POINT_SYM)},
+ { "POLYGON", SYM(POLYGON)},
+diff -r c3e57b0c22c4 sql/mysql_priv.h
+--- a/sql/mysql_priv.h Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/mysql_priv.h Mon Dec 22 00:25:06 2008 -0800
+@@ -968,6 +968,7 @@
+ int mysqld_show_status(THD *thd);
+ int mysqld_show_variables(THD *thd,const char *wild);
+ bool mysqld_show_storage_engines(THD *thd);
++bool mysqld_show_patches(THD *thd);
+ bool mysqld_show_privileges(THD *thd);
+ bool mysqld_show_column_types(THD *thd);
+ bool mysqld_help (THD *thd, const char *text);
+diff -r c3e57b0c22c4 sql/patch_info.h.pl
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/sql/patch_info.h.pl Mon Dec 22 00:25:06 2008 -0800
+@@ -0,0 +1,65 @@
++# Generates patch_info.h on stdout: one patches[] row per file in $patch_info_path.
++use strict;
++
++my $patch_info_path = '../patch_info';
++my $output = '';
++
++if (opendir(PATCH_DIR, $patch_info_path))
++{
++ # Sorted so the generated table is identical from build to build.
++ foreach my $file (sort { $a cmp $b } readdir(PATCH_DIR))
++ {
++ # Skip ".", ".." and hidden entries before trying to open them.
++ next if ($file =~ /^\./);
++
++ open(PATCH_FILE, "<$patch_info_path/$file") || die("Unable to open $patch_info_path/$file ($!)");
++ my %fields;
++
++ while (<PATCH_FILE>)
++ {
++ chomp;
++ # Limit of 2 keeps any '=' that appears inside the value.
++ my ($key, $value) = split(/\s*=\s*/, $_, 2);
++ $fields{lc($key)} = $value;
++ }
++ close(PATCH_FILE);
++
++ $output .= "{\"$fields{'file'}\", \"$fields{'name'}\", \"$fields{'version'}\", \"$fields{'author'}\", \"$fields{'license'}\",\"$fields{'comment'}\"},\n"
++ }
++ closedir(PATCH_DIR);
++}
++
++print <<HEADER;
++
++/* Copyright (C) 2002-2006 MySQL AB
++
++ This program is free software; you can redistribute it and/or modify
++ it under the terms of the GNU General Public License as published by
++ the Free Software Foundation; version 2 of the License.
++
++ This program is distributed in the hope that it will be useful,
++ but WITHOUT ANY WARRANTY; without even the implied warranty of
++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
++ GNU General Public License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with this program; if not, write to the Free Software
++ Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA */
++
++#ifdef USE_PRAGMA_INTERFACE
++#pragma interface /* gcc class implementation */
++#endif
++
++struct patch {
++ const char *file;
++ const char *name;
++ const char *version;
++ const char *author;
++ const char *license;
++ const char *comment;
++}patches[] = {
++$output
++{NULL, NULL, NULL, NULL}
++};
++
++HEADER
+diff -r c3e57b0c22c4 sql/sp_head.cc
+--- a/sql/sp_head.cc Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/sp_head.cc Mon Dec 22 00:25:06 2008 -0800
+@@ -191,6 +191,7 @@
+ case SQLCOM_SHOW_MUTEX_STATUS:
+ case SQLCOM_SHOW_NEW_MASTER:
+ case SQLCOM_SHOW_OPEN_TABLES:
++ case SQLCOM_SHOW_PATCHES:
+ case SQLCOM_SHOW_PRIVILEGES:
+ case SQLCOM_SHOW_PROCESSLIST:
+ case SQLCOM_SHOW_SLAVE_HOSTS:
+diff -r c3e57b0c22c4 sql/sql_lex.h
+--- a/sql/sql_lex.h Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/sql_lex.h Mon Dec 22 00:25:06 2008 -0800
+@@ -95,6 +95,7 @@
+ SQLCOM_XA_COMMIT, SQLCOM_XA_ROLLBACK, SQLCOM_XA_RECOVER,
+ SQLCOM_SHOW_PROC_CODE, SQLCOM_SHOW_FUNC_CODE,
+ SQLCOM_SHOW_PROFILE, SQLCOM_SHOW_PROFILES,
++ SQLCOM_SHOW_PATCHES,
+
+ /*
+ When a command is added here, be sure it's also added in mysqld.cc
+diff -r c3e57b0c22c4 sql/sql_parse.cc
+--- a/sql/sql_parse.cc Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/sql_parse.cc Mon Dec 22 00:25:06 2008 -0800
+@@ -3947,6 +3947,9 @@
+ break;
+ case SQLCOM_SHOW_STORAGE_ENGINES:
+ res= mysqld_show_storage_engines(thd);
++ break;
++ case SQLCOM_SHOW_PATCHES:
++ res= mysqld_show_patches(thd);
+ break;
+ case SQLCOM_SHOW_PRIVILEGES:
+ res= mysqld_show_privileges(thd);
+diff -r c3e57b0c22c4 sql/sql_prepare.cc
+--- a/sql/sql_prepare.cc Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/sql_prepare.cc Mon Dec 22 00:25:06 2008 -0800
+@@ -1790,6 +1790,7 @@
+ case SQLCOM_SHOW_DATABASES:
+ case SQLCOM_SHOW_PROCESSLIST:
+ case SQLCOM_SHOW_STORAGE_ENGINES:
++ case SQLCOM_SHOW_PATCHES:
+ case SQLCOM_SHOW_PRIVILEGES:
+ case SQLCOM_SHOW_COLUMN_TYPES:
+ case SQLCOM_SHOW_STATUS:
+diff -r c3e57b0c22c4 sql/sql_show.cc
+--- a/sql/sql_show.cc Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/sql_show.cc Mon Dec 22 00:25:06 2008 -0800
+@@ -22,6 +22,7 @@
+ #include "sp.h"
+ #include "sp_head.h"
+ #include "sql_trigger.h"
++#include "patch_info.h"
+ #include <my_dir.h>
+
+ #ifdef HAVE_BERKELEY_DB
+@@ -45,6 +46,47 @@
+ static int
+ view_store_create_info(THD *thd, TABLE_LIST *table, String *buff);
+ bool schema_table_store_record(THD *thd, TABLE *table);
++
++/***************************************************************************
++** SHOW PATCHES: send one row per patches[] entry; returns TRUE on send error
++***************************************************************************/
++
++bool mysqld_show_patches(THD *thd)
++{
++ List<Item> field_list;
++ int i = 0;
++ Protocol *protocol= thd->protocol;
++ DBUG_ENTER("mysqld_show_patches");
++ /* Result set columns, in the same order as the store() calls below. */
++ field_list.push_back(new Item_empty_string("File", 255));
++ field_list.push_back(new Item_empty_string("Name", 50));
++ field_list.push_back(new Item_empty_string("Version", 10));
++ field_list.push_back(new Item_empty_string("Author", 50));
++ field_list.push_back(new Item_empty_string("License", 50));
++ field_list.push_back(new Item_empty_string("Comment", 32));
++
++ if (protocol->send_fields(&field_list, Protocol::SEND_NUM_ROWS | Protocol::SEND_EOF))
++ DBUG_RETURN(TRUE);
++ /* patches[] comes from patch_info.h; stop at the entry whose file is NULL. */
++ for (i = 0; patches[i].file; i++)
++ {
++ protocol->prepare_for_resend();
++ protocol->store(patches[i].file, system_charset_info);
++ protocol->store(patches[i].name, system_charset_info);
++ protocol->store(patches[i].version, system_charset_info);
++ protocol->store(patches[i].author, system_charset_info);
++ protocol->store(patches[i].license, system_charset_info);
++ protocol->store(patches[i].comment, system_charset_info);
++
++ if (protocol->write())
++ DBUG_RETURN(TRUE);
++ }
++
++ /* Every row was written; send EOF and report success. */
++ send_eof(thd);
++ DBUG_RETURN(FALSE);
++
++}
+
+
+ /***************************************************************************
+diff -r c3e57b0c22c4 sql/sql_yacc.yy
+--- a/sql/sql_yacc.yy Mon Dec 22 00:20:06 2008 -0800
++++ b/sql/sql_yacc.yy Mon Dec 22 00:25:06 2008 -0800
+@@ -824,6 +824,7 @@
+ %token PAGE_SYM
+ %token PARTIAL
+ %token PASSWORD
++%token PATCHES
+ %token PARAM_MARKER
+ %token PHASE_SYM
+ %token POINTFROMTEXT
+@@ -8019,7 +8020,7 @@
+ ;
+
+ show_param:
+- DATABASES wild_and_where
++ DATABASES wild_and_where
+ {
+ LEX *lex= Lex;
+ lex->sql_command= SQLCOM_SELECT;
+@@ -8119,6 +8120,10 @@
+ LEX *lex=Lex;
+ lex->sql_command= SQLCOM_SHOW_STORAGE_ENGINES;
+ WARN_DEPRECATED("SHOW TABLE TYPES", "SHOW [STORAGE] ENGINES");
++ }
++ | PATCHES
++ {
++ Lex->sql_command= SQLCOM_SHOW_PATCHES;
+ }
+ | opt_storage ENGINES_SYM
+ {
+@@ -9554,6 +9559,7 @@
+ | PAGE_SYM {}
+ | PARTIAL {}
+ | PASSWORD {}
++ | PATCHES {}
+ | PHASE_SYM {}
+ | POINT_SYM {}
+ | POLYGON {}
diff --git a/percona/5.0.91-b22-20100522/userstatv2.patch b/percona/5.0.91-b22-20100522/userstatv2.patch
new file mode 100644
index 0000000..427fef7
--- /dev/null
+++ b/percona/5.0.91-b22-20100522/userstatv2.patch
@@ -0,0 +1,4406 @@
+diff -r 592f6c3641ba BUILD/Makefile.in
+--- a/BUILD/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/BUILD/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -146,6 +146,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba Docs/Makefile.in
+--- a/Docs/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/Docs/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -144,6 +144,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba Makefile.in
+--- a/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -171,6 +171,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba SSL/Makefile.in
+--- a/SSL/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/SSL/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -144,6 +144,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba client/Makefile.in
+--- a/client/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/client/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -247,6 +247,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @CLIENT_LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba cmd-line-utils/Makefile.in
+--- a/cmd-line-utils/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/cmd-line-utils/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -157,6 +157,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba cmd-line-utils/libedit/Makefile.in
+--- a/cmd-line-utils/libedit/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/cmd-line-utils/libedit/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -166,6 +166,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba cmd-line-utils/readline/Makefile.in
+--- a/cmd-line-utils/readline/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/cmd-line-utils/readline/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -173,6 +173,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba configure
+--- a/configure Wed Jul 29 13:33:34 2009 -0700
++++ b/configure Wed Jul 29 13:34:11 2009 -0700
+@@ -35347,7 +35347,91 @@
+ # We also disable for SCO for the time being, the headers for the
+ # thread library we use conflicts with other headers.
+ ;;
+- *)
++*)
++ # most systems require the program be linked with librt library to use
++ # the function clock_gettime
++ my_save_LIBS="$LIBS"
++ LIBS=""
++
++echo "$as_me:$LINENO: checking for clock_gettime in -lrt" >&5
++echo $ECHO_N "checking for clock_gettime in -lrt... $ECHO_C" >&6
++if test "${ac_cv_lib_rt_clock_gettime+set}" = set; then
++ echo $ECHO_N "(cached) $ECHO_C" >&6
++else
++ ac_check_lib_save_LIBS=$LIBS
++LIBS="-lrt $LIBS"
++cat >conftest.$ac_ext <<_ACEOF
++/* confdefs.h. */
++_ACEOF
++cat confdefs.h >>conftest.$ac_ext
++cat >>conftest.$ac_ext <<_ACEOF
++/* end confdefs.h. */
++
++/* Override any gcc2 internal prototype to avoid an error. */
++#ifdef __cplusplus
++extern "C"
++#endif
++/* We use char because int might match the return type of a gcc2
++ builtin and then its argument prototype would still apply. */
++char clock_gettime ();
++int
++main ()
++{
++clock_gettime ();
++ ;
++ return 0;
++}
++_ACEOF
++rm -f conftest.$ac_objext conftest$ac_exeext
++if { (eval echo "$as_me:$LINENO: \"$ac_link\"") >&5
++ (eval $ac_link) 2>conftest.er1
++ ac_status=$?
++ grep -v '^ *+' conftest.er1 >conftest.err
++ rm -f conftest.er1
++ cat conftest.err >&5
++ echo "$as_me:$LINENO: \$? = $ac_status" >&5
++ (exit $ac_status); } &&
++ { ac_try='test -z "$ac_c_werror_flag"
++ || test ! -s conftest.err'
++ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
++ (eval $ac_try) 2>&5
++ ac_status=$?
++ echo "$as_me:$LINENO: \$? = $ac_status" >&5
++ (exit $ac_status); }; } &&
++ { ac_try='test -s conftest$ac_exeext'
++ { (eval echo "$as_me:$LINENO: \"$ac_try\"") >&5
++ (eval $ac_try) 2>&5
++ ac_status=$?
++ echo "$as_me:$LINENO: \$? = $ac_status" >&5
++ (exit $ac_status); }; }; then
++ ac_cv_lib_rt_clock_gettime=yes
++else
++ echo "$as_me: failed program was:" >&5
++sed 's/^/| /' conftest.$ac_ext >&5
++
++ac_cv_lib_rt_clock_gettime=no
++fi
++rm -f conftest.err conftest.$ac_objext \
++ conftest$ac_exeext conftest.$ac_ext
++LIBS=$ac_check_lib_save_LIBS
++fi
++echo "$as_me:$LINENO: result: $ac_cv_lib_rt_clock_gettime" >&5
++echo "${ECHO_T}$ac_cv_lib_rt_clock_gettime" >&6
++if test $ac_cv_lib_rt_clock_gettime = yes; then
++ cat >>confdefs.h <<_ACEOF
++#define HAVE_LIBRT 1
++_ACEOF
++
++ LIBS="-lrt $LIBS"
++
++fi
++
++ LIBRT=$LIBS
++ LIBS="$my_save_LIBS"
++
++
++ LIBS="$LIBS $LIBRT"
++
+ for ac_func in clock_gettime
+ do
+ as_ac_var=`$as_echo "ac_cv_func_$ac_func" | $as_tr_sh`
+@@ -38791,7 +38875,7 @@
+
+ fi
+
+-CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS"
++CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS $LIBRT"
+
+
+
+diff -r 592f6c3641ba configure.in
+--- a/configure.in Wed Jul 29 13:33:34 2009 -0700
++++ b/configure.in Wed Jul 29 13:34:11 2009 -0700
+@@ -2136,7 +2136,18 @@
+ # We also disable for SCO for the time being, the headers for the
+ # thread library we use conflicts with other headers.
+ ;;
+- *) AC_CHECK_FUNCS(clock_gettime)
++*)
++ # most systems require the program be linked with librt library to use
++ # the function clock_gettime
++ my_save_LIBS="$LIBS"
++ LIBS=""
++ AC_CHECK_LIB(rt,clock_gettime)
++ LIBRT=$LIBS
++ LIBS="$my_save_LIBS"
++ AC_SUBST(LIBRT)
++
++ LIBS="$LIBS $LIBRT"
++ AC_CHECK_FUNCS(clock_gettime)
+ ;;
+ esac
+
+@@ -2772,7 +2783,7 @@
+ AC_DEFINE([THREAD_SAFE_CLIENT], [1], [Should be client be thread safe])
+ fi
+
+-CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS"
++CLIENT_LIBS="$NON_THREADED_LIBS $openssl_libs $ZLIB_LIBS $STATIC_NSS_FLAGS $LIBRT"
+
+ AC_SUBST(CLIENT_LIBS)
+ AC_SUBST(NON_THREADED_LIBS)
+diff -r 592f6c3641ba dbug/Makefile.in
+--- a/dbug/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/dbug/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -192,6 +192,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/Makefile.in
+--- a/extra/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -240,6 +240,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/Makefile.in
+--- a/extra/yassl/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -142,6 +142,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/src/Makefile.in
+--- a/extra/yassl/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -151,6 +151,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/taocrypt/Makefile.in
+--- a/extra/yassl/taocrypt/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/taocrypt/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -142,6 +142,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/taocrypt/benchmark/Makefile.in
+--- a/extra/yassl/taocrypt/benchmark/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/taocrypt/benchmark/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -153,6 +153,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/taocrypt/src/Makefile.in
+--- a/extra/yassl/taocrypt/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/taocrypt/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -164,6 +164,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/taocrypt/test/Makefile.in
+--- a/extra/yassl/taocrypt/test/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/taocrypt/test/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -153,6 +153,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba extra/yassl/testsuite/Makefile.in
+--- a/extra/yassl/testsuite/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/extra/yassl/testsuite/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -156,6 +156,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba heap/Makefile.in
+--- a/heap/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/heap/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -202,6 +202,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba include/Makefile.in
+--- a/include/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/include/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -160,6 +160,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba include/mysql_com.h
+--- a/include/mysql_com.h Wed Jul 29 13:33:34 2009 -0700
++++ b/include/mysql_com.h Wed Jul 29 13:34:11 2009 -0700
+@@ -25,6 +25,7 @@
+ #define USERNAME_LENGTH 16
+ #define SERVER_VERSION_LENGTH 60
+ #define SQLSTATE_LENGTH 5
++#define LIST_PROCESS_HOST_LEN 64
+
+ /*
+ USER_HOST_BUFF_SIZE -- length of string buffer, that is enough to contain
+@@ -106,6 +107,11 @@
+ thread */
+ #define REFRESH_MASTER 128 /* Remove all bin logs in the index
+ and truncate the index */
++#define REFRESH_TABLE_STATS 256 /* Refresh table stats hash table */
++#define REFRESH_INDEX_STATS 512 /* Refresh index stats hash table */
++#define REFRESH_USER_STATS 1024 /* Refresh user stats hash table */
++#define REFRESH_SLOW_QUERY_LOG 4096 /* Flush slow query log and rotate*/
++#define REFRESH_CLIENT_STATS 8192 /* Refresh client stats hash table */
+
+ /* The following can't be set with mysql_refresh() */
+ #define REFRESH_READ_LOCK 16384 /* Lock tables for read */
+diff -r 592f6c3641ba libmysql/Makefile.in
+--- a/libmysql/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/libmysql/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -224,6 +224,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @CLIENT_LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba libmysql_r/Makefile.in
+--- a/libmysql_r/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/libmysql_r/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -221,6 +221,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@ @ZLIB_LIBS@ @openssl_libs@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba libmysqld/Makefile.in
+--- a/libmysqld/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/libmysqld/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -246,6 +246,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba libmysqld/examples/Makefile.in
+--- a/libmysqld/examples/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/libmysqld/examples/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -192,6 +192,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@ @WRAPLIBS@ @CLIENT_LIBS@ $(yassl_libs)
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba man/Makefile.in
+--- a/man/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/man/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -151,6 +151,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba myisam/Makefile.in
+--- a/myisam/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/myisam/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -235,6 +235,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba myisammrg/Makefile.in
+--- a/myisammrg/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/myisammrg/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -183,6 +183,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba mysql-test/Makefile.in
+--- a/mysql-test/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/mysql-test/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -161,6 +161,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba mysql-test/ndb/Makefile.in
+--- a/mysql-test/ndb/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/mysql-test/ndb/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -147,6 +147,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba mysql-test/r/information_schema.result
+--- a/mysql-test/r/information_schema.result Wed Jul 29 13:33:34 2009 -0700
++++ b/mysql-test/r/information_schema.result Wed Jul 29 13:34:11 2009 -0700
+@@ -37,10 +37,12 @@
+ select * from v1;
+ c
+ CHARACTER_SETS
++CLIENT_STATISTICS
+ COLLATIONS
+ COLLATION_CHARACTER_SET_APPLICABILITY
+ COLUMNS
+ COLUMN_PRIVILEGES
++INDEX_STATISTICS
+ KEY_COLUMN_USAGE
+ PROFILING
+ ROUTINES
+@@ -50,8 +52,10 @@
+ TABLES
+ TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES
++TABLE_STATISTICS
+ TRIGGERS
+ USER_PRIVILEGES
++USER_STATISTICS
+ VIEWS
+ columns_priv
+ db
+@@ -83,6 +87,7 @@
+ TABLES TABLES
+ TABLE_CONSTRAINTS TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES TABLE_PRIVILEGES
++TABLE_STATISTICS TABLE_STATISTICS
+ TRIGGERS TRIGGERS
+ tables_priv tables_priv
+ time_zone time_zone
+@@ -102,6 +107,7 @@
+ TABLES TABLES
+ TABLE_CONSTRAINTS TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES TABLE_PRIVILEGES
++TABLE_STATISTICS TABLE_STATISTICS
+ TRIGGERS TRIGGERS
+ tables_priv tables_priv
+ time_zone time_zone
+@@ -121,6 +127,7 @@
+ TABLES TABLES
+ TABLE_CONSTRAINTS TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES TABLE_PRIVILEGES
++TABLE_STATISTICS TABLE_STATISTICS
+ TRIGGERS TRIGGERS
+ tables_priv tables_priv
+ time_zone time_zone
+@@ -594,12 +601,13 @@
+ where table_schema='information_schema' limit 2;
+ TABLE_NAME TABLE_TYPE ENGINE
+ CHARACTER_SETS SYSTEM VIEW MEMORY
+-COLLATIONS SYSTEM VIEW MEMORY
++CLIENT_STATISTICS SYSTEM VIEW MEMORY
+ show tables from information_schema like "T%";
+ Tables_in_information_schema (T%)
+ TABLES
+ TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES
++TABLE_STATISTICS
+ TRIGGERS
+ create database information_schema;
+ ERROR 42000: Access denied for user 'root'@'localhost' to database 'information_schema'
+@@ -609,6 +617,7 @@
+ TABLES SYSTEM VIEW
+ TABLE_CONSTRAINTS SYSTEM VIEW
+ TABLE_PRIVILEGES SYSTEM VIEW
++TABLE_STATISTICS SYSTEM VIEW
+ TRIGGERS SYSTEM VIEW
+ create table t1(a int);
+ ERROR 42S02: Unknown table 't1' in information_schema
+@@ -621,6 +630,7 @@
+ TABLES
+ TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES
++TABLE_STATISTICS
+ TRIGGERS
+ select table_name from tables where table_name='user';
+ table_name
+@@ -730,7 +740,7 @@
+ CREATE VIEW a1 (t_CRASHME) AS SELECT f1 FROM t_crashme GROUP BY f1;
+ CREATE VIEW a2 AS SELECT t_CRASHME FROM a1;
+ count(*)
+-102
++106
+ drop view a2, a1;
+ drop table t_crashme;
+ select table_schema,table_name, column_name from
+@@ -790,18 +800,20 @@
+ TABLE_NAME COLUMN_NAME PRIVILEGES
+ COLUMNS TABLE_NAME select
+ COLUMN_PRIVILEGES TABLE_NAME select
++INDEX_STATISTICS TABLE_NAME select
+ KEY_COLUMN_USAGE TABLE_NAME select
+ STATISTICS TABLE_NAME select
+ TABLES TABLE_NAME select
+ TABLE_CONSTRAINTS TABLE_NAME select
+ TABLE_PRIVILEGES TABLE_NAME select
++TABLE_STATISTICS TABLE_NAME select
+ VIEWS TABLE_NAME select
+ delete from mysql.user where user='mysqltest_4';
+ delete from mysql.db where user='mysqltest_4';
+ flush privileges;
+ SELECT table_schema, count(*) FROM information_schema.TABLES GROUP BY TABLE_SCHEMA;
+ table_schema count(*)
+-information_schema 17
++information_schema 21
+ mysql 17
+ create table t1 (i int, j int);
+ create trigger trg1 before insert on t1 for each row
+@@ -1187,10 +1199,12 @@
+ );
+ table_name column_name
+ CHARACTER_SETS CHARACTER_SET_NAME
++CLIENT_STATISTICS CLIENT
+ COLLATIONS COLLATION_NAME
+ COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME
+ COLUMNS TABLE_SCHEMA
+ COLUMN_PRIVILEGES TABLE_SCHEMA
++INDEX_STATISTICS TABLE_SCHEMA
+ KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
+ PROFILING QUERY_ID
+ ROUTINES ROUTINE_SCHEMA
+@@ -1200,8 +1214,10 @@
+ TABLES TABLE_SCHEMA
+ TABLE_CONSTRAINTS CONSTRAINT_SCHEMA
+ TABLE_PRIVILEGES TABLE_SCHEMA
++TABLE_STATISTICS TABLE_SCHEMA
+ TRIGGERS TRIGGER_SCHEMA
+ USER_PRIVILEGES GRANTEE
++USER_STATISTICS USER
+ VIEWS TABLE_SCHEMA
+ SELECT t.table_name, c1.column_name
+ FROM information_schema.tables t
+@@ -1219,10 +1235,12 @@
+ );
+ table_name column_name
+ CHARACTER_SETS CHARACTER_SET_NAME
++CLIENT_STATISTICS CLIENT
+ COLLATIONS COLLATION_NAME
+ COLLATION_CHARACTER_SET_APPLICABILITY COLLATION_NAME
+ COLUMNS TABLE_SCHEMA
+ COLUMN_PRIVILEGES TABLE_SCHEMA
++INDEX_STATISTICS TABLE_SCHEMA
+ KEY_COLUMN_USAGE CONSTRAINT_SCHEMA
+ PROFILING QUERY_ID
+ ROUTINES ROUTINE_SCHEMA
+@@ -1232,8 +1250,10 @@
+ TABLES TABLE_SCHEMA
+ TABLE_CONSTRAINTS CONSTRAINT_SCHEMA
+ TABLE_PRIVILEGES TABLE_SCHEMA
++TABLE_STATISTICS TABLE_SCHEMA
+ TRIGGERS TRIGGER_SCHEMA
+ USER_PRIVILEGES GRANTEE
++USER_STATISTICS USER
+ VIEWS TABLE_SCHEMA
+ SELECT MAX(table_name) FROM information_schema.tables;
+ MAX(table_name)
+@@ -1302,10 +1322,12 @@
+ group by t.table_name order by num1, t.table_name;
+ table_name group_concat(t.table_schema, '.', t.table_name) num1
+ CHARACTER_SETS information_schema.CHARACTER_SETS 1
++CLIENT_STATISTICS information_schema.CLIENT_STATISTICS 1
+ COLLATIONS information_schema.COLLATIONS 1
+ COLLATION_CHARACTER_SET_APPLICABILITY information_schema.COLLATION_CHARACTER_SET_APPLICABILITY 1
+ COLUMNS information_schema.COLUMNS 1
+ COLUMN_PRIVILEGES information_schema.COLUMN_PRIVILEGES 1
++INDEX_STATISTICS information_schema.INDEX_STATISTICS 1
+ KEY_COLUMN_USAGE information_schema.KEY_COLUMN_USAGE 1
+ PROFILING information_schema.PROFILING 1
+ ROUTINES information_schema.ROUTINES 1
+@@ -1315,8 +1337,10 @@
+ TABLES information_schema.TABLES 1
+ TABLE_CONSTRAINTS information_schema.TABLE_CONSTRAINTS 1
+ TABLE_PRIVILEGES information_schema.TABLE_PRIVILEGES 1
++TABLE_STATISTICS information_schema.TABLE_STATISTICS 1
+ TRIGGERS information_schema.TRIGGERS 1
+ USER_PRIVILEGES information_schema.USER_PRIVILEGES 1
++USER_STATISTICS information_schema.USER_STATISTICS 1
+ VIEWS information_schema.VIEWS 1
+ create table t1(f1 int);
+ create view v1 as select f1+1 as a from t1;
+diff -r 592f6c3641ba mysql-test/r/information_schema_db.result
+--- a/mysql-test/r/information_schema_db.result Wed Jul 29 13:33:34 2009 -0700
++++ b/mysql-test/r/information_schema_db.result Wed Jul 29 13:34:11 2009 -0700
+@@ -6,10 +6,12 @@
+ show tables;
+ Tables_in_information_schema
+ CHARACTER_SETS
++CLIENT_STATISTICS
+ COLLATIONS
+ COLLATION_CHARACTER_SET_APPLICABILITY
+ COLUMNS
+ COLUMN_PRIVILEGES
++INDEX_STATISTICS
+ KEY_COLUMN_USAGE
+ PROFILING
+ ROUTINES
+@@ -19,14 +21,17 @@
+ TABLES
+ TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES
++TABLE_STATISTICS
+ TRIGGERS
+ USER_PRIVILEGES
++USER_STATISTICS
+ VIEWS
+ show tables from INFORMATION_SCHEMA like 'T%';
+ Tables_in_information_schema (T%)
+ TABLES
+ TABLE_CONSTRAINTS
+ TABLE_PRIVILEGES
++TABLE_STATISTICS
+ TRIGGERS
+ create database `inf%`;
+ create database mbase;
+diff -r 592f6c3641ba mysql-test/r/mysqlshow.result
+--- a/mysql-test/r/mysqlshow.result Wed Jul 29 13:33:34 2009 -0700
++++ b/mysql-test/r/mysqlshow.result Wed Jul 29 13:34:11 2009 -0700
+@@ -80,10 +80,12 @@
+ | Tables |
+ +---------------------------------------+
+ | CHARACTER_SETS |
++| CLIENT_STATISTICS |
+ | COLLATIONS |
+ | COLLATION_CHARACTER_SET_APPLICABILITY |
+ | COLUMNS |
+ | COLUMN_PRIVILEGES |
++| INDEX_STATISTICS |
+ | KEY_COLUMN_USAGE |
+ | PROFILING |
+ | ROUTINES |
+@@ -93,8 +95,10 @@
+ | TABLES |
+ | TABLE_CONSTRAINTS |
+ | TABLE_PRIVILEGES |
++| TABLE_STATISTICS |
+ | TRIGGERS |
+ | USER_PRIVILEGES |
++| USER_STATISTICS |
+ | VIEWS |
+ +---------------------------------------+
+ Database: INFORMATION_SCHEMA
+@@ -102,10 +106,12 @@
+ | Tables |
+ +---------------------------------------+
+ | CHARACTER_SETS |
++| CLIENT_STATISTICS |
+ | COLLATIONS |
+ | COLLATION_CHARACTER_SET_APPLICABILITY |
+ | COLUMNS |
+ | COLUMN_PRIVILEGES |
++| INDEX_STATISTICS |
+ | KEY_COLUMN_USAGE |
+ | PROFILING |
+ | ROUTINES |
+@@ -115,8 +121,10 @@
+ | TABLES |
+ | TABLE_CONSTRAINTS |
+ | TABLE_PRIVILEGES |
++| TABLE_STATISTICS |
+ | TRIGGERS |
+ | USER_PRIVILEGES |
++| USER_STATISTICS |
+ | VIEWS |
+ +---------------------------------------+
+ Wildcard: inf_rmation_schema
+diff -r 592f6c3641ba mysys/Makefile.in
+--- a/mysys/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/mysys/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -228,6 +228,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/Makefile.in
+--- a/ndb/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -171,6 +171,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/docs/Makefile.in
+--- a/ndb/docs/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/docs/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -149,6 +149,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/include/Makefile.in
+--- a/ndb/include/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/include/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -179,6 +179,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/Makefile.in
+--- a/ndb/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -204,6 +204,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/Makefile.in
+--- a/ndb/src/common/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -174,6 +174,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/debugger/Makefile.in
+--- a/ndb/src/common/debugger/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/debugger/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -206,6 +206,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/debugger/signaldata/Makefile.in
+--- a/ndb/src/common/debugger/signaldata/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/debugger/signaldata/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -211,6 +211,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/logger/Makefile.in
+--- a/ndb/src/common/logger/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/logger/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -197,6 +197,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/mgmcommon/Makefile.in
+--- a/ndb/src/common/mgmcommon/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/mgmcommon/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -211,6 +211,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/portlib/Makefile.in
+--- a/ndb/src/common/portlib/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/portlib/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -222,6 +222,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/transporter/Makefile.in
+--- a/ndb/src/common/transporter/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/transporter/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -197,6 +197,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/common/util/Makefile.in
+--- a/ndb/src/common/util/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/common/util/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -217,6 +217,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/cw/Makefile.in
+--- a/ndb/src/cw/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/cw/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -156,6 +156,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/cw/cpcd/Makefile.in
+--- a/ndb/src/cw/cpcd/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/cw/cpcd/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -207,6 +207,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/Makefile.in
+--- a/ndb/src/kernel/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -227,6 +227,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/Makefile.in
+--- a/ndb/src/kernel/blocks/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -156,6 +156,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/backup/Makefile.in
+--- a/ndb/src/kernel/blocks/backup/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/backup/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/cmvmi/Makefile.in
+--- a/ndb/src/kernel/blocks/cmvmi/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/cmvmi/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbacc/Makefile.in
+--- a/ndb/src/kernel/blocks/dbacc/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbacc/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbdict/Makefile.in
+--- a/ndb/src/kernel/blocks/dbdict/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbdict/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -206,6 +206,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbdih/Makefile.in
+--- a/ndb/src/kernel/blocks/dbdih/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbdih/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -203,6 +203,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dblqh/Makefile.in
+--- a/ndb/src/kernel/blocks/dblqh/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dblqh/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -204,6 +204,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbtc/Makefile.in
+--- a/ndb/src/kernel/blocks/dbtc/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbtc/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbtup/Makefile.in
+--- a/ndb/src/kernel/blocks/dbtup/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbtup/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -204,6 +204,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbtux/Makefile.in
+--- a/ndb/src/kernel/blocks/dbtux/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbtux/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -199,6 +199,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/dbutil/Makefile.in
+--- a/ndb/src/kernel/blocks/dbutil/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/dbutil/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/ndbcntr/Makefile.in
+--- a/ndb/src/kernel/blocks/ndbcntr/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/ndbcntr/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -197,6 +197,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/ndbfs/Makefile.in
+--- a/ndb/src/kernel/blocks/ndbfs/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/ndbfs/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -197,6 +197,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/qmgr/Makefile.in
+--- a/ndb/src/kernel/blocks/qmgr/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/qmgr/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/suma/Makefile.in
+--- a/ndb/src/kernel/blocks/suma/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/suma/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/blocks/trix/Makefile.in
+--- a/ndb/src/kernel/blocks/trix/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/blocks/trix/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/error/Makefile.in
+--- a/ndb/src/kernel/error/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/error/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -206,6 +206,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/kernel/vm/Makefile.in
+--- a/ndb/src/kernel/vm/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/kernel/vm/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -207,6 +207,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/mgmapi/Makefile.in
+--- a/ndb/src/mgmapi/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/mgmapi/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -205,6 +205,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/mgmclient/Makefile.in
+--- a/ndb/src/mgmclient/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/mgmclient/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -216,6 +216,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/mgmsrv/Makefile.in
+--- a/ndb/src/mgmsrv/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/mgmsrv/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -213,6 +213,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/src/ndbapi/Makefile.in
+--- a/ndb/src/ndbapi/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/src/ndbapi/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -215,6 +215,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/test/Makefile.in
+--- a/ndb/test/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/test/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -156,6 +156,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/test/ndbapi/Makefile.in
+--- a/ndb/test/ndbapi/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/test/ndbapi/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -595,6 +595,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/test/ndbapi/bank/Makefile.in
+--- a/ndb/test/ndbapi/bank/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/test/ndbapi/bank/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -282,6 +282,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/test/run-test/Makefile.in
+--- a/ndb/test/run-test/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/test/run-test/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -243,6 +243,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/test/src/Makefile.in
+--- a/ndb/test/src/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/test/src/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -213,6 +213,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/test/tools/Makefile.in
+--- a/ndb/test/tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/test/tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -325,6 +325,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba ndb/tools/Makefile.in
+--- a/ndb/tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/ndb/tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -344,6 +344,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba netware/Makefile.in
+--- a/netware/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/netware/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -199,6 +199,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba os2/Makefile.in
+--- a/os2/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/os2/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -156,6 +156,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba os2/include/Makefile.in
+--- a/os2/include/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/os2/include/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -156,6 +156,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba os2/include/sys/Makefile.in
+--- a/os2/include/sys/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/os2/include/sys/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -144,6 +144,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba patch_info/userstats.info
+--- /dev/null Thu Jan 01 00:00:00 1970 +0000
++++ b/patch_info/userstats.info Wed Jul 29 13:34:11 2009 -0700
+@@ -0,0 +1,14 @@
++File=userstatsv2.patch
++Name=SHOW USER/TABLE/INDEX statistics
++Version=V2
++Author=Google
++License=GPL
++Comment=Added INFORMATION_SCHEMA.*_STATISTICS
++2008-12-01
++YK: fix behavior for prepared statements
++
++2008-11-26
++YK: add switch variable "userstat_running" to control INFORMATION_SCHEMA.*_STATISTICS (default:OFF)
++
++2008-12-09
++YK: fixed "Row_sent: 0" problem at microslow_innodb.patch
+diff -r 592f6c3641ba pstack/Makefile.in
+--- a/pstack/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/pstack/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -196,6 +196,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba pstack/aout/Makefile.in
+--- a/pstack/aout/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/pstack/aout/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -134,6 +134,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba regex/Makefile.in
+--- a/regex/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/regex/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -180,6 +180,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba scripts/Makefile.in
+--- a/scripts/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/scripts/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -176,6 +176,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba server-tools/Makefile.in
+--- a/server-tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/server-tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -155,6 +155,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba server-tools/instance-manager/Makefile.in
+--- a/server-tools/instance-manager/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/server-tools/instance-manager/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -205,6 +205,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba sql/Makefile.in
+--- a/sql/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -274,6 +274,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba sql/ha_innodb.cc
+--- a/sql/ha_innodb.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/ha_innodb.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -3341,6 +3341,8 @@
+
+ error = row_insert_for_mysql((byte*) record, prebuilt);
+
++ if (error == DB_SUCCESS) rows_changed++;
++
+ if (error == DB_SUCCESS && auto_inc_used) {
+
+ /* Fetch the value that was set in the autoincrement field */
+@@ -3613,6 +3615,8 @@
+ }
+ }
+
++ if (error == DB_SUCCESS) rows_changed++;
++
+ innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+ error = convert_error_code_to_mysql(error, user_thd);
+@@ -3661,6 +3665,8 @@
+
+ error = row_update_for_mysql((byte*) record, prebuilt);
+
++ if (error == DB_SUCCESS) rows_changed++;
++
+ innodb_srv_conc_exit_innodb(prebuilt->trx);
+
+ error = convert_error_code_to_mysql(error, user_thd);
+@@ -4092,6 +4098,9 @@
+ if (ret == DB_SUCCESS) {
+ error = 0;
+ table->status = 0;
++ rows_read++;
++ if (active_index >= 0 && active_index < MAX_KEY)
++ index_rows_read[active_index]++;
+
+ } else if (ret == DB_RECORD_NOT_FOUND) {
+ error = HA_ERR_END_OF_FILE;
+diff -r 592f6c3641ba sql/ha_myisam.cc
+--- a/sql/ha_myisam.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/ha_myisam.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -670,7 +670,9 @@
+ if ((error= update_auto_increment()))
+ return error;
+ }
+- return mi_write(file,buf);
++ int error=mi_write(file,buf);
++ if (!error) rows_changed++;
++ return error;
+ }
+
+ int ha_myisam::check(THD* thd, HA_CHECK_OPT* check_opt)
+@@ -1521,13 +1523,17 @@
+ statistic_increment(table->in_use->status_var.ha_update_count,&LOCK_status);
+ if (table->timestamp_field_type & TIMESTAMP_AUTO_SET_ON_UPDATE)
+ table->timestamp_field->set_time();
+- return mi_update(file,old_data,new_data);
++ int error=mi_update(file,old_data,new_data);
++ if (!error) rows_changed++;
++ return error;
+ }
+
+ int ha_myisam::delete_row(const byte * buf)
+ {
+ statistic_increment(table->in_use->status_var.ha_delete_count,&LOCK_status);
+- return mi_delete(file,buf);
++ int error=mi_delete(file,buf);
++ if (!error) rows_changed++;
++ return error;
+ }
+
+ int ha_myisam::index_read(byte * buf, const byte * key,
+@@ -1538,6 +1544,13 @@
+ &LOCK_status);
+ int error=mi_rkey(file,buf,active_index, key, key_len, find_flag);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1548,6 +1561,13 @@
+ &LOCK_status);
+ int error=mi_rkey(file,buf,index, key, key_len, find_flag);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1558,6 +1578,13 @@
+ &LOCK_status);
+ int error=mi_rkey(file,buf,active_index, key, key_len, HA_READ_PREFIX_LAST);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1568,6 +1595,13 @@
+ &LOCK_status);
+ int error=mi_rnext(file,buf,active_index);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1578,6 +1612,13 @@
+ &LOCK_status);
+ int error=mi_rprev(file,buf, active_index);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1588,6 +1629,13 @@
+ &LOCK_status);
+ int error=mi_rfirst(file, buf, active_index);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1598,6 +1646,13 @@
+ &LOCK_status);
+ int error=mi_rlast(file, buf, active_index);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1614,6 +1669,13 @@
+ error= mi_rnext_same(file,buf);
+ } while (error == HA_ERR_RECORD_DELETED);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) {
++ rows_read++;
++
++ int inx = (active_index == -1) ? file->lastinx : active_index;
++ if (inx >= 0 && inx < MAX_KEY)
++ index_rows_read[inx]++;
++ }
+ return error;
+ }
+
+@@ -1631,6 +1693,7 @@
+ &LOCK_status);
+ int error=mi_scan(file, buf);
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) rows_read++;
+ return error;
+ }
+
+@@ -1645,6 +1708,7 @@
+ &LOCK_status);
+ int error=mi_rrnd(file, buf, my_get_ptr(pos,ref_length));
+ table->status=error ? STATUS_NOT_FOUND: 0;
++ if (!error) rows_read++;
+ return error;
+ }
+
+diff -r 592f6c3641ba sql/handler.cc
+--- a/sql/handler.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/handler.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -726,6 +726,8 @@
+ if (cookie)
+ tc_log->unlog(cookie, xid);
+ DBUG_EXECUTE_IF("crash_commit_after", abort(););
++ if (is_real_trans)
++ thd->diff_commit_trans++;
+ end:
+ if (is_real_trans)
+ start_waiting_global_read_lock(thd);
+@@ -783,6 +785,7 @@
+ thd->transaction.cleanup();
+ }
+ }
++ thd->diff_rollback_trans++;
+ #endif /* USING_TRANSACTIONS */
+ DBUG_RETURN(error);
+ }
+@@ -1223,6 +1226,7 @@
+ statistic_increment(thd->status_var.ha_rollback_count,&LOCK_status);
+ *ht=0; // keep it conveniently zero-filled
+ }
++ thd->diff_rollback_trans++;
+ DBUG_RETURN(error);
+ }
+
+@@ -1453,6 +1457,8 @@
+ else
+ dupp_ref=ref+ALIGN_SIZE(ref_length);
+ }
++ rows_read = rows_changed = 0;
++ memset(index_rows_read, 0, sizeof(index_rows_read));
+ DBUG_RETURN(error);
+ }
+
+@@ -2287,6 +2293,111 @@
+ return error;
+ }
+
++// Updates the global table stats with the TABLE this handler represents.
++void handler::update_global_table_stats() {
++ if (!opt_userstat_running) {
++ rows_read = rows_changed = 0;
++ return;
++ }
++
++ if (!rows_read && !rows_changed) return; // Nothing to update.
++ // table_cache_key is db_name + '\0' + table_name + '\0'.
++ if (!table->s || !table->s->table_cache_key || !table->s->table_name) return;
++
++ TABLE_STATS* table_stats;
++ char key[NAME_LEN * 2 + 2];
++ // [db] + '.' + [table]
++ sprintf(key, "%s.%s", table->s->table_cache_key, table->s->table_name);
++
++ pthread_mutex_lock(&LOCK_global_table_stats);
++ // Gets the global table stats, creating one if necessary.
++ if (!(table_stats = (TABLE_STATS*)hash_search(&global_table_stats,
++ (byte*)key,
++ strlen(key)))) {
++ if (!(table_stats = ((TABLE_STATS*)
++ my_malloc(sizeof(TABLE_STATS), MYF(MY_WME | MY_ZEROFILL))))) {
++ // Out of memory.
++ sql_print_error("Allocating table stats failed.");
++ goto end;
++ }
++ strncpy(table_stats->table, key, sizeof(table_stats->table));
++ table_stats->rows_read = 0;
++ table_stats->rows_changed = 0;
++ table_stats->rows_changed_x_indexes = 0;
++ table_stats->engine_type = (int) ht->db_type;
++
++ if (my_hash_insert(&global_table_stats, (byte*)table_stats)) {
++ // Out of memory.
++ sql_print_error("Inserting table stats failed.");
++ my_free((char*)table_stats, 0);
++ goto end;
++ }
++ }
++ // Updates the global table stats.
++ table_stats->rows_read += rows_read;
++ table_stats->rows_changed += rows_changed;
++ table_stats->rows_changed_x_indexes +=
++ rows_changed * (table->s->keys ? table->s->keys : 1);
++ current_thd->diff_total_read_rows += rows_read;
++ rows_read = rows_changed = 0;
++end:
++ pthread_mutex_unlock(&LOCK_global_table_stats);
++}
++
++// Adds this handler's per-index read counters to global_index_stats, zeroing
++// each counter it flushed; with userstat off the counters are only zeroed.
++void handler::update_global_index_stats() {
++  // table_cache_key is db_name + '\0' + table_name + '\0'.
++  if (!table->s || !table->s->table_cache_key || !table->s->table_name) return;
++  if (!opt_userstat_running) {
++    for (int x = 0; x < table->s->keys; x++) {
++      index_rows_read[x] = 0;
++    }
++    return;
++  }
++
++  for (int x = 0; x < table->s->keys; x++) {
++    if (index_rows_read[x]) {
++      // Rows were read using this index.
++      KEY* key_info = &table->key_info[x];
++      // An index without a name is skipped; its counter is left as it is.
++      if (!key_info->name) continue;
++
++      INDEX_STATS* index_stats;
++      char key[NAME_LEN * 3 + 3];
++      // [db] + '.' + [table] + '.' + [index], bounded so key[] cannot overrun.
++      my_snprintf(key, sizeof(key), "%s.%s.%s", table->s->table_cache_key,
++                  table->s->table_name, key_info->name);
++
++      pthread_mutex_lock(&LOCK_global_index_stats);
++      // Gets the global index stats, creating one if necessary.
++      if (!(index_stats = (INDEX_STATS*)hash_search(&global_index_stats,
++                                                    (byte*)key,
++                                                    strlen(key)))) {
++        if (!(index_stats = ((INDEX_STATS*)
++            my_malloc(sizeof(INDEX_STATS), MYF(MY_WME | MY_ZEROFILL))))) {
++          // Out of memory.
++          sql_print_error("Allocating index stats failed.");
++          goto end;
++        }
++        // Copy one byte less than the field: MY_ZEROFILL keeps the last byte NUL.
++        strncpy(index_stats->index, key, sizeof(index_stats->index) - 1);
++        index_stats->rows_read = 0;
++        if (my_hash_insert(&global_index_stats, (byte*)index_stats)) {
++          // Out of memory.
++          sql_print_error("Inserting index stats failed.");
++          my_free((char*)index_stats, 0);
++          goto end;
++        }
++      }
++      // Updates the global index stats.
++      index_stats->rows_read += index_rows_read[x];
++      index_rows_read[x] = 0;
++end:
++      pthread_mutex_unlock(&LOCK_global_index_stats);
++    }
++  }
++}
+
+ /****************************************************************************
+ ** Some general functions that isn't in the handler class
+diff -r 592f6c3641ba sql/handler.h
+--- a/sql/handler.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/handler.h Wed Jul 29 13:34:11 2009 -0700
+@@ -32,6 +32,10 @@
+ #define USING_TRANSACTIONS
+ #endif
+
++#if MAX_KEY > 128
++#error MAX_KEY is too large. Values up to 128 are supported.
++#endif
++
+ // the following is for checking tables
+
+ #define HA_ADMIN_ALREADY_DONE 1
+@@ -604,6 +608,9 @@
+ bool auto_increment_column_changed;
+ bool implicit_emptied; /* Can be !=0 only if HEAP */
+ const COND *pushed_cond;
++ ulonglong rows_read;
++ ulonglong rows_changed;
++ ulonglong index_rows_read[MAX_KEY];
+
+ handler(const handlerton *ht_arg, TABLE *table_arg) :table(table_arg),
+ ht(ht_arg),
+@@ -615,8 +622,10 @@
+ ref_length(sizeof(my_off_t)), block_size(0),
+ raid_type(0), ft_handler(0), inited(NONE),
+ locked(FALSE), implicit_emptied(0),
+- pushed_cond(NULL)
+- {}
++ pushed_cond(NULL), rows_read(0), rows_changed(0)
++ {
++ memset(index_rows_read, 0, sizeof(index_rows_read));
++ }
+ virtual ~handler(void) { DBUG_ASSERT(locked == FALSE); /* TODO: DBUG_ASSERT(inited == NONE); */ }
+ virtual handler *clone(MEM_ROOT *mem_root);
+ int ha_open(const char *name, int mode, int test_if_locked);
+@@ -625,7 +634,11 @@
+ virtual void print_error(int error, myf errflag);
+ virtual bool get_error_message(int error, String *buf);
+ uint get_dup_key(int error);
+- void change_table_ptr(TABLE *table_arg) { table=table_arg; }
++ void change_table_ptr(TABLE *table_arg) {
++ table=table_arg;
++ rows_read = rows_changed = 0;
++ memset(index_rows_read, 0, sizeof(index_rows_read));
++ }
+ virtual double scan_time()
+ { return ulonglong2double(data_file_length) / IO_SIZE + 2; }
+ virtual double read_time(uint index, uint ranges, ha_rows rows)
+@@ -886,6 +899,9 @@
+ virtual bool is_crashed() const { return 0; }
+ virtual bool auto_repair() const { return 0; }
+
++ void update_global_table_stats();
++ void update_global_index_stats();
++
+ /*
+ default rename_table() and delete_table() rename/delete files with a
+ given name and extensions from bas_ext()
+diff -r 592f6c3641ba sql/lex.h
+--- a/sql/lex.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/lex.h Wed Jul 29 13:34:11 2009 -0700
+@@ -109,6 +109,7 @@
+ { "CHECKSUM", SYM(CHECKSUM_SYM)},
+ { "CIPHER", SYM(CIPHER_SYM)},
+ { "CLIENT", SYM(CLIENT_SYM)},
++ { "CLIENT_STATISTICS", SYM(CLIENT_STATS_SYM)},
+ { "CLOSE", SYM(CLOSE_SYM)},
+ { "CODE", SYM(CODE_SYM)},
+ { "COLLATE", SYM(COLLATE_SYM)},
+@@ -238,6 +239,7 @@
+ { "IN", SYM(IN_SYM)},
+ { "INDEX", SYM(INDEX_SYM)},
+ { "INDEXES", SYM(INDEXES)},
++ { "INDEX_STATISTICS", SYM(INDEX_STATS_SYM)},
+ { "INFILE", SYM(INFILE)},
+ { "INNER", SYM(INNER_SYM)},
+ { "INNOBASE", SYM(INNOBASE_SYM)},
+@@ -443,6 +445,7 @@
+ { "SIGNED", SYM(SIGNED_SYM)},
+ { "SIMPLE", SYM(SIMPLE_SYM)},
+ { "SLAVE", SYM(SLAVE)},
++ { "SLOW", SYM(SLOW_SYM)},
+ { "SNAPSHOT", SYM(SNAPSHOT_SYM)},
+ { "SMALLINT", SYM(SMALLINT)},
+ { "SOME", SYM(ANY_SYM)},
+@@ -488,6 +491,7 @@
+ { "TABLE", SYM(TABLE_SYM)},
+ { "TABLES", SYM(TABLES)},
+ { "TABLESPACE", SYM(TABLESPACE)},
++ { "TABLE_STATISTICS", SYM(TABLE_STATS_SYM)},
+ { "TEMPORARY", SYM(TEMPORARY)},
+ { "TEMPTABLE", SYM(TEMPTABLE_SYM)},
+ { "TERMINATED", SYM(TERMINATED)},
+@@ -525,6 +529,7 @@
+ { "USE", SYM(USE_SYM)},
+ { "USER", SYM(USER)},
+ { "USER_RESOURCES", SYM(RESOURCES)},
++ { "USER_STATISTICS", SYM(USER_STATS_SYM)},
+ { "USE_FRM", SYM(USE_FRM)},
+ { "USING", SYM(USING)},
+ { "UTC_DATE", SYM(UTC_DATE_SYM)},
+diff -r 592f6c3641ba sql/log.cc
+--- a/sql/log.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/log.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -1958,18 +1958,24 @@
+ thd->current_insert_id);
+ if (e.write(file))
+ goto err;
++ if (file == &log_file)
++ thd->binlog_bytes_written += e.data_written;
+ }
+ if (thd->insert_id_used)
+ {
+ Intvar_log_event e(thd,(uchar) INSERT_ID_EVENT,thd->last_insert_id);
+ if (e.write(file))
+ goto err;
++ if (file == &log_file)
++ thd->binlog_bytes_written += e.data_written;
+ }
+ if (thd->rand_used)
+ {
+ Rand_log_event e(thd,thd->rand_saved_seed1,thd->rand_saved_seed2);
+ if (e.write(file))
+ goto err;
++ if (file == &log_file)
++ thd->binlog_bytes_written += e.data_written;
+ }
+ if (thd->user_var_events.elements)
+ {
+@@ -1985,6 +1991,8 @@
+ user_var_event->charset_number);
+ if (e.write(file))
+ goto err;
++ if (file == &log_file)
++ thd->binlog_bytes_written += e.data_written;
+ }
+ }
+ }
+@@ -1995,6 +2003,8 @@
+
+ if (event_info->write(file))
+ goto err;
++ if (file == &log_file)
++ thd->binlog_bytes_written += event_info->data_written;
+
+ if (file == &log_file) // we are writing to the real log (disk)
+ {
+@@ -2117,6 +2127,7 @@
+ */
+ if (qinfo.write(&log_file))
+ goto err;
++ thd->binlog_bytes_written += qinfo.data_written;
+
+ /* Read from the file used to cache the queries .*/
+ if (reinit_io_cache(cache, READ_CACHE, 0, 0, 0))
+@@ -2163,6 +2174,7 @@
+ /* write the first half of the split header */
+ if (my_b_write(&log_file, header, carry))
+ goto err;
++ thd->binlog_bytes_written += carry;
+
+ /*
+ copy fixed second half of header to cache so the correct
+@@ -2231,6 +2243,8 @@
+ /* Write data to the binary log file */
+ if (my_b_write(&log_file, cache->read_pos, length))
+ goto err;
++ thd->binlog_bytes_written += length;
++
+ cache->read_pos=cache->read_end; // Mark buffer used up
+ DBUG_EXECUTE_IF("half_binlogged_transaction", goto DBUG_skip_commit;);
+ } while ((length=my_b_fill(cache)));
+@@ -2239,6 +2253,8 @@
+
+ if (commit_event->write(&log_file))
+ goto err;
++ thd->binlog_bytes_written += commit_event->data_written;
++
+ #ifndef DBUG_OFF
+ DBUG_skip_commit:
+ #endif
+diff -r 592f6c3641ba sql/mysql_priv.h
+--- a/sql/mysql_priv.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/mysql_priv.h Wed Jul 29 13:34:11 2009 -0700
+@@ -837,7 +837,15 @@
+ bool multi_delete_set_locks_and_link_aux_tables(LEX *lex);
+ void init_max_user_conn(void);
+ void init_update_queries(void);
++void init_global_user_stats(void);
++void init_global_table_stats(void);
++void init_global_index_stats(void);
++void init_global_client_stats(void);
+ void free_max_user_conn(void);
++void free_global_user_stats(void);
++void free_global_table_stats(void);
++void free_global_index_stats(void);
++void free_global_client_stats(void);
+ pthread_handler_t handle_one_connection(void *arg);
+ pthread_handler_t handle_bootstrap(void *arg);
+ void end_thread(THD *thd,bool put_in_cache);
+@@ -1416,6 +1424,7 @@
+ extern ulong max_connections,max_connect_errors, connect_timeout;
+ extern ulong slave_net_timeout, slave_trans_retries;
+ extern uint max_user_connections;
++extern ulonglong denied_connections;
+ extern ulong what_to_log,flush_time;
+ extern ulong query_buff_size, thread_stack;
+ extern ulong max_prepared_stmt_count, prepared_stmt_count;
+@@ -1446,6 +1455,7 @@
+ extern my_bool opt_safe_show_db, opt_local_infile;
+ extern my_bool opt_slave_compressed_protocol, use_temp_pool;
+ extern my_bool opt_readonly, lower_case_file_system;
++extern my_bool opt_userstat_running;
+ extern my_bool opt_enable_named_pipe, opt_sync_frm, opt_allow_suspicious_udfs;
+ extern my_bool opt_secure_auth;
+ extern char* opt_secure_file_priv;
+@@ -1493,6 +1503,14 @@
+ extern struct system_variables max_system_variables;
+ extern struct system_status_var global_status_var;
+ extern struct rand_struct sql_rand;
++extern HASH global_user_stats;
++extern HASH global_client_stats;
++extern pthread_mutex_t LOCK_global_user_client_stats;
++extern HASH global_table_stats;
++extern pthread_mutex_t LOCK_global_table_stats;
++extern HASH global_index_stats;
++extern pthread_mutex_t LOCK_global_index_stats;
++extern pthread_mutex_t LOCK_stats;
+
+ extern const char *opt_date_time_formats[];
+ extern KNOWN_DATE_TIME_FORMAT known_date_time_formats[];
+diff -r 592f6c3641ba sql/mysqld.cc
+--- a/sql/mysqld.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/mysqld.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -417,6 +417,7 @@
+ uint opt_large_page_size= 0;
+ my_bool opt_old_style_user_limits= 0, trust_function_creators= 0;
+ char* opt_slow_logname= 0;
++my_bool opt_userstat_running= 0;
+ /*
+ True if there is at least one per-hour limit for some user, so we should
+ check them before each query (and possibly reset counters when hour is
+@@ -453,6 +454,7 @@
+ ulong binlog_cache_use= 0, binlog_cache_disk_use= 0;
+ ulong max_connections, max_connect_errors;
+ uint max_user_connections= 0;
++ulonglong denied_connections = 0;
+ /*
+ Limit of the total number of prepared statements in the server.
+ Is necessary to protect the server against out-of-memory attacks.
+@@ -555,6 +557,10 @@
+ LOCK_crypt, LOCK_bytes_sent, LOCK_bytes_received,
+ LOCK_global_system_variables,
+ LOCK_user_conn, LOCK_slave_list, LOCK_active_mi;
++pthread_mutex_t LOCK_stats;
++pthread_mutex_t LOCK_global_user_client_stats;
++pthread_mutex_t LOCK_global_table_stats;
++pthread_mutex_t LOCK_global_index_stats;
+ /*
+ The below lock protects access to two global server variables:
+ max_prepared_stmt_count and prepared_stmt_count. These variables
+@@ -1196,6 +1202,10 @@
+ x_free(opt_secure_file_priv);
+ bitmap_free(&temp_pool);
+ free_max_user_conn();
++ free_global_user_stats();
++ free_global_client_stats();
++ free_global_table_stats();
++ free_global_index_stats();
+ #ifdef HAVE_REPLICATION
+ end_slave_list();
+ free_list(&replicate_do_db);
+@@ -1310,6 +1320,10 @@
+ (void) pthread_cond_destroy(&COND_thread_cache);
+ (void) pthread_cond_destroy(&COND_flush_thread_cache);
+ (void) pthread_cond_destroy(&COND_manager);
++ (void) pthread_mutex_destroy(&LOCK_stats);
++ (void) pthread_mutex_destroy(&LOCK_global_user_client_stats);
++ (void) pthread_mutex_destroy(&LOCK_global_table_stats);
++ (void) pthread_mutex_destroy(&LOCK_global_index_stats);
+ }
+
+ #endif /*EMBEDDED_LIBRARY*/
+@@ -3157,6 +3171,10 @@
+ (void) pthread_mutex_init(&LOCK_rpl_status, MY_MUTEX_INIT_FAST);
+ (void) pthread_cond_init(&COND_rpl_status, NULL);
+ #endif
++ (void) pthread_mutex_init(&LOCK_stats, MY_MUTEX_INIT_FAST);
++ (void) pthread_mutex_init(&LOCK_global_user_client_stats, MY_MUTEX_INIT_FAST);
++ (void) pthread_mutex_init(&LOCK_global_table_stats, MY_MUTEX_INIT_FAST);
++ (void) pthread_mutex_init(&LOCK_global_index_stats, MY_MUTEX_INIT_FAST);
+ sp_cache_init();
+ /* Parameter for threads created for connections */
+ (void) pthread_attr_init(&connection_attrib);
+@@ -3428,6 +3446,10 @@
+ sql_print_error("Out of memory");
+ unireg_abort(1);
+ }
++
++ init_global_table_stats();
++ init_global_index_stats();
++
+ if (ha_init())
+ {
+ sql_print_error("Can't init databases");
+@@ -3510,6 +3532,8 @@
+
+ init_max_user_conn();
+ init_update_queries();
++ init_global_user_stats();
++ init_global_client_stats();
+ DBUG_RETURN(0);
+ }
+
+@@ -4236,6 +4260,7 @@
+ {
+ DBUG_PRINT("error",("Too many connections"));
+ close_connection(thd, ER_CON_COUNT_ERROR, 1);
++ statistic_increment(denied_connections, &LOCK_status);
+ delete thd;
+ DBUG_VOID_RETURN;
+ }
+@@ -5056,6 +5081,7 @@
+ OPT_PROFILING_USE_GETRUSAGE,
+ OPT_SLOW_LOG,
+ OPT_SLOW_QUERY_LOG_FILE,
++ OPT_USERSTAT_RUNNING,
+ OPT_USE_GLOBAL_LONG_QUERY_TIME,
+ OPT_INNODB_ROLLBACK_ON_TIMEOUT,
+ OPT_SECURE_FILE_PRIV,
+@@ -6523,6 +6549,10 @@
+ (gptr*) &max_system_variables.net_wait_timeout, 0, GET_ULONG,
+ REQUIRED_ARG, NET_WAIT_TIMEOUT, 1, IF_WIN(INT_MAX32/1000, LONG_TIMEOUT),
+ 0, 1, 0},
++ {"userstat_running", OPT_USERSTAT_RUNNING,
++ "Control USER_STATISTICS, CLIENT_STATISTICS, INDEX_STATISTICS and TABLE_STATISTICS running",
++ (gptr*) &opt_userstat_running, (gptr*) &opt_userstat_running,
++ 0, GET_BOOL, NO_ARG, 0, 0, 1, 0, 1, 0},
+ {0, 0, 0, 0, 0, 0, GET_NO_ARG, NO_ARG, 0, 0, 0, 0, 0, 0}
+ };
+
+diff -r 592f6c3641ba sql/set_var.cc
+--- a/sql/set_var.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/set_var.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -325,6 +325,7 @@
+ sys_var_thd_ulong sys_read_buff_size("read_buffer_size",
+ &SV::read_buff_size);
+ sys_var_bool_ptr sys_readonly("read_only", &opt_readonly);
++sys_var_bool_ptr sys_userstat_running("userstat_running", &opt_userstat_running);
+ sys_var_thd_ulong sys_read_rnd_buff_size("read_rnd_buffer_size",
+ &SV::read_rnd_buff_size);
+ sys_var_thd_ulong sys_div_precincrement("div_precision_increment",
+@@ -837,6 +838,7 @@
+ &sys_trans_alloc_block_size,
+ &sys_trans_prealloc_size,
+ &sys_tx_isolation,
++ &sys_userstat_running,
+ &sys_version,
+ #ifdef HAVE_BERKELEY_DB
+ &sys_version_bdb,
+@@ -1190,6 +1192,7 @@
+ {sys_tx_isolation.name, (char*) &sys_tx_isolation, SHOW_SYS},
+ {sys_updatable_views_with_limit.name,
+ (char*) &sys_updatable_views_with_limit,SHOW_SYS},
++ {sys_userstat_running.name, (char*) &sys_userstat_running, SHOW_SYS},
+ {sys_use_global_long_query_time.name, (char*) &sys_use_global_long_query_time, SHOW_SYS},
+ {sys_version.name, (char*) &sys_version, SHOW_SYS},
+ #ifdef HAVE_BERKELEY_DB
+diff -r 592f6c3641ba sql/share/Makefile.in
+--- a/sql/share/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/share/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -144,6 +144,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba sql/sql_base.cc
+--- a/sql/sql_base.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_base.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -624,6 +624,12 @@
+ DBUG_ENTER("close_thread_table");
+ DBUG_ASSERT(table->key_read == 0);
+ DBUG_ASSERT(!table->file || table->file->inited == handler::NONE);
++
++ if(table->file)
++ {
++ table->file->update_global_table_stats();
++ table->file->update_global_index_stats();
++ }
+
+ *table_ptr=table->next;
+ if (table->needs_reopen_or_name_lock() ||
+@@ -670,6 +676,9 @@
+ {
+ DBUG_ENTER("close_temporary");
+ char path[FN_REFLEN];
++
++ table->file->update_global_table_stats();
++ table->file->update_global_index_stats();
+ db_type table_type=table->s->db_type;
+ strmov(path,table->s->path);
+ free_io_cache(table);
+diff -r 592f6c3641ba sql/sql_class.cc
+--- a/sql/sql_class.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_class.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -239,6 +239,13 @@
+ bzero(ha_data, sizeof(ha_data));
+ mysys_var=0;
+ binlog_evt_union.do_union= FALSE;
++ busy_time = 0;
++ cpu_time = 0;
++ bytes_received = 0;
++ bytes_sent = 0;
++ binlog_bytes_written = 0;
++ updated_row_count = 0;
++ sent_row_count_2 = 0;
+ #ifndef DBUG_OFF
+ dbug_sentry=THD_SENTRY_MAGIC;
+ #endif
+@@ -378,6 +385,88 @@
+ total_warn_count= 0;
+ update_charset();
+ bzero((char *) &status_var, sizeof(status_var));
++ reset_stats();
++}
++
++// Restarts the stats window of this THD: both timestamps are set to "now".
++void THD::reset_stats(void) {
++  last_global_update_time = current_connect_time = time(NULL);
++  // Also clear the deltas that are used to update the global stats.
++  reset_diff_stats();
++}
++
++// Zeroes every 'diff' counter: the deltas accumulated by this THD that are
++// used to update the global stats.
++void THD::reset_diff_stats(void) {
++  // Timing accumulators.
++  diff_total_busy_time = diff_total_cpu_time = 0;
++  // Byte counters.
++  diff_total_bytes_received = diff_total_bytes_sent = 0;
++  diff_total_binlog_bytes_written = 0;
++  // Row counters.
++  diff_total_sent_rows = diff_total_updated_rows = diff_total_read_rows = 0;
++  // Command counters.
++  diff_select_commands = diff_update_commands = diff_other_commands = 0;
++  // Transaction counters.
++  diff_commit_trans = diff_rollback_trans = 0;
++  // Connection and access error counters.
++  diff_denied_connections = diff_lost_connections = 0;
++  diff_access_denied_errors = 0;
++  // Queries that returned no rows.
++  diff_empty_queries = 0;
++}
++
++// Folds this THD's per-command counters into its 'diff' stats (only while userstat is on), then zeroes the per-command counters.
++void THD::update_stats(bool ran_command) {
++ if (opt_userstat_running) {
++ diff_total_busy_time += busy_time;
++ diff_total_cpu_time += cpu_time;
++ diff_total_bytes_received += bytes_received;
++ diff_total_bytes_sent += bytes_sent;
++ diff_total_binlog_bytes_written += binlog_bytes_written;
++ diff_total_sent_rows += sent_row_count_2;
++ diff_total_updated_rows += updated_row_count;
++ // diff_total_read_rows is updated in handler.cc.
++
++ if (ran_command) { // classify the command that has just run as select/update/other
++ // The replication thread has the COM_CONNECT command.
++ if ((old_command == COM_QUERY || command == COM_CONNECT) &&
++ (lex->sql_command >= 0 && lex->sql_command < SQLCOM_END)) {
++ // A SQL query.
++ if (lex->sql_command == SQLCOM_SELECT) {
++ if (lex->orig_sql_command == SQLCOM_END) { // a SELECT that was not rewritten from 'SHOW '
++ diff_select_commands++;
++ if (!sent_row_count_2) // the SELECT sent no rows
++ diff_empty_queries++;
++ } else {
++ // 'SHOW ' commands become SQLCOM_SELECT.
++ diff_other_commands++;
++ // 'SHOW ' commands shouldn't inflate total sent row count.
++ diff_total_sent_rows -= sent_row_count_2;
++ }
++ } else if (is_update_query(lex->sql_command)) {
++ diff_update_commands++;
++ } else {
++ diff_other_commands++;
++ }
++ }
++ }
++ // diff_commit_trans is updated in handler.cc.
++ // diff_rollback_trans is updated in handler.cc.
++ // diff_denied_connections is updated in sql_parse.cc.
++ // diff_lost_connections is updated in sql_parse.cc.
++ // diff_access_denied_errors is updated in sql_parse.cc.
++
++ /* The per-command counters are zeroed below (also when userstat is off) to
++ avoid double-counting: their values are already stored in diff_total_*. */
++ }
++ busy_time = 0;
++ cpu_time = 0;
++ bytes_received = 0;
++ bytes_sent = 0;
++ binlog_bytes_written = 0;
++ updated_row_count = 0;
++ sent_row_count_2 = 0;
++}
+
+
+@@ -907,6 +996,33 @@
+ }
+ #endif
+
++char *THD::get_client_host_port(THD *client)
++{
++  // Describes the peer of 'client' as "host:port", or as the bare host when no
++  // port applies. The string is allocated from this THD and may be NULL.
++  Security_context *peer_sctx= client->security_ctx;
++  const bool with_port= client->peer_port &&
++                        (peer_sctx->host || peer_sctx->ip) &&
++                        security_ctx->host_or_ip[0];
++  // Without a port, fall back to host_or_ip, then host, then "".
++  if (!with_port)
++    return this->strdup(peer_sctx->host_or_ip[0] ? peer_sctx->host_or_ip :
++                        peer_sctx->host ? peer_sctx->host : "");
++
++  char *host_port= this->alloc(LIST_PROCESS_HOST_LEN+1);
++  if (host_port)
++    my_snprintf((char *) host_port, LIST_PROCESS_HOST_LEN,
++                "%s:%u", peer_sctx->host_or_ip, client->peer_port);
++  return host_port;
++}
++
++const char *get_client_host(THD *client) {
++  Security_context *sctx= client->security_ctx;
++  // Prefer a non-empty host_or_ip, then host, then the empty string.
++  if (sctx->host_or_ip[0]) return sctx->host_or_ip;
++  return sctx->host ? sctx->host : "";
++}
++
+
+ struct Item_change_record: public ilink
+ {
+@@ -1082,6 +1198,7 @@
+ buffer.set(buff, sizeof(buff), &my_charset_bin);
+ }
+ thd->sent_row_count++;
++ thd->sent_row_count_2++;
+ if (!thd->vio_ok())
+ DBUG_RETURN(0);
+ if (!thd->net.report_error)
+@@ -1174,6 +1291,7 @@
+ select_export::~select_export()
+ {
+ thd->sent_row_count=row_count;
++ thd->sent_row_count_2=row_count;
+ }
+
+
+@@ -2108,6 +2226,7 @@
+ if (likely(thd != 0))
+ { /* current_thd==0 when close_connection() calls net_send_error() */
+ thd->status_var.bytes_sent+= length;
++ thd->bytes_sent+= length;
+ }
+ }
+
+@@ -2115,6 +2234,7 @@
+ void thd_increment_bytes_received(ulong length)
+ {
+ current_thd->status_var.bytes_received+= length;
++ current_thd->bytes_received+= length;
+ }
+
+
+diff -r 592f6c3641ba sql/sql_class.h
+--- a/sql/sql_class.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_class.h Wed Jul 29 13:34:11 2009 -0700
+@@ -1302,6 +1302,8 @@
+ first byte of the packet in do_command()
+ */
+ enum enum_server_command command;
++ // Used to save the command, before it is set to COM_SLEEP.
++ enum enum_server_command old_command;
+ uint32 server_id;
+ uint32 file_id; // for LOAD DATA INFILE
+ /*
+@@ -1498,6 +1500,8 @@
+ /* variables.transaction_isolation is reset to this after each commit */
+ enum_tx_isolation session_tx_isolation;
+ enum_check_fields count_cuted_fields;
++ ha_rows updated_row_count;
++ ha_rows sent_row_count_2; /* for userstat */
+
+ DYNAMIC_ARRAY user_var_events; /* For user variables replication */
+ MEM_ROOT *user_var_events_alloc; /* Allocate above array elements here */
+@@ -1607,6 +1611,49 @@
+ */
+ LOG_INFO* current_linfo;
+ NET* slave_net; // network connection from slave -> m.
++
++ /*
++ Used to update global user stats. The global user stats are updated
++ occasionally with the 'diff' variables. After the update, the 'diff'
++ variables are reset to 0.
++ */
++ // Time when the current thread connected to MySQL.
++ time_t current_connect_time;
++ // Last time when THD stats were updated in global_user_stats.
++ time_t last_global_update_time;
++ // Busy (non-idle) time for just one command.
++ double busy_time;
++ // Busy time not updated in global_user_stats yet.
++ double diff_total_busy_time;
++ // Cpu (non-idle) time for just one thread.
++ double cpu_time;
++ // Cpu time not updated in global_user_stats yet.
++ double diff_total_cpu_time;
++ /* bytes counting */
++ ulonglong bytes_received;
++ ulonglong diff_total_bytes_received;
++ ulonglong bytes_sent;
++ ulonglong diff_total_bytes_sent;
++ ulonglong binlog_bytes_written;
++ ulonglong diff_total_binlog_bytes_written;
++
++ // Number of rows not reflected in global_user_stats yet.
++ ha_rows diff_total_sent_rows, diff_total_updated_rows, diff_total_read_rows;
++ // Number of commands not reflected in global_user_stats yet.
++ ulonglong diff_select_commands, diff_update_commands, diff_other_commands;
++ // Number of transactions not reflected in global_user_stats yet.
++ ulonglong diff_commit_trans, diff_rollback_trans;
++ // Number of connection errors not reflected in global_user_stats yet.
++ ulonglong diff_denied_connections, diff_lost_connections;
++ // Number of db access denied, not reflected in global_user_stats yet.
++ ulonglong diff_access_denied_errors;
++ // Number of queries that return 0 rows
++ ulonglong diff_empty_queries;
++
++ // Per account query delay in milliseconds. When not 0, sleep this number of
++ // milliseconds before every SQL command.
++ ulonglong query_delay_millis;
++
+ /* Used by the sys_var class to store temporary values */
+ union
+ {
+@@ -1662,6 +1709,11 @@
+ alloc_root.
+ */
+ void init_for_queries();
++ void reset_stats(void);
++ void reset_diff_stats(void);
++ // ran_command is true when this is called immediately after a
++ // command has been run.
++ void update_stats(bool ran_command);
+ void change_user(void);
+ void cleanup(void);
+ void cleanup_after_query();
+@@ -1891,8 +1943,14 @@
+ if (p_db_length)
+ *p_db_length= db_length;
+ return FALSE;
++
++ // Returns string as 'IP:port' for the client-side of the connection represented
++ // by 'client' as displayed by SHOW PROCESSLIST. Allocates memory from the heap of
++ // this THD and that is not reclaimed immediately, so use sparingly. May return NULL.
+ }
+
++ char *get_client_host_port(THD *client);
++
+ public:
+ /**
+ Add an internal error handler to the thread execution context.
+@@ -1935,6 +1993,10 @@
+ MEM_ROOT main_mem_root;
+ };
+
++// Returns string as 'IP' for the client-side of the connection represented by
++// 'client'. Does not allocate memory. May return "".
++const char *get_client_host(THD *client);
++
+
+ #define tmp_disable_binlog(A) \
+ {ulonglong tmp_disable_binlog__save_options= (A)->options; \
+diff -r 592f6c3641ba sql/sql_delete.cc
+--- a/sql/sql_delete.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_delete.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -358,6 +358,7 @@
+ send_ok(thd,deleted);
+ DBUG_PRINT("info",("%ld records deleted",(long) deleted));
+ }
++ thd->updated_row_count += deleted;
+ DBUG_RETURN(error >= 0 || thd->net.report_error);
+ }
+
+@@ -887,6 +888,7 @@
+ thd->row_count_func= deleted;
+ ::send_ok(thd, deleted);
+ }
++ thd->updated_row_count += deleted;
+ return 0;
+ }
+
+diff -r 592f6c3641ba sql/sql_insert.cc
+--- a/sql/sql_insert.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_insert.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -990,6 +990,7 @@
+ thd->row_count_func= info.copied + info.deleted + updated;
+ ::send_ok(thd, (ulong) thd->row_count_func, id, buff);
+ }
++ thd->updated_row_count += thd->row_count_func;
+ thd->abort_on_warning= 0;
+ DBUG_RETURN(FALSE);
+
+@@ -3094,6 +3095,7 @@
+ autoinc_value_of_first_inserted_row : thd->insert_id_used ?
+ thd->last_insert_id : 0;
+ ::send_ok(thd, (ulong) thd->row_count_func, id, buff);
++ thd->updated_row_count += thd->row_count_func;
+ DBUG_RETURN(0);
+ }
+
+diff -r 592f6c3641ba sql/sql_lex.h
+--- a/sql/sql_lex.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_lex.h Wed Jul 29 13:34:11 2009 -0700
+@@ -101,6 +101,9 @@
+ When a command is added here, be sure it's also added in mysqld.cc
+ in "struct show_var_st status_vars[]= {" ...
+ */
++ // TODO(mcallaghan): update status_vars in mysqld to export these
++ SQLCOM_SHOW_USER_STATS, SQLCOM_SHOW_TABLE_STATS, SQLCOM_SHOW_INDEX_STATS,
++ SQLCOM_SHOW_CLIENT_STATS,
+ /* This should be the last !!! */
+ SQLCOM_END
+ };
+diff -r 592f6c3641ba sql/sql_parse.cc
+--- a/sql/sql_parse.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_parse.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -78,6 +78,12 @@
+ const char *table_name);
+ static bool check_show_create_table_access(THD *thd, TABLE_LIST *table);
+
++// Increments connection count for user.
++static int increment_connection_count(THD* thd, bool use_lock);
++
++// Uses the THD to update the global stats by user name and client IP
++void update_global_user_stats(THD* thd, bool create_user, time_t now);
++
+ const char *any_db="*any*"; // Special symbol for check_access
+
+ const char *command_name[]={
+@@ -146,6 +152,17 @@
+ static bool do_command(THD *thd);
+ #endif // EMBEDDED_LIBRARY
+
++HASH global_user_stats;
++HASH global_client_stats;
++// Protects global_user_stats and global_client_stats
++extern pthread_mutex_t LOCK_global_user_client_stats;
++
++HASH global_table_stats;
++extern pthread_mutex_t LOCK_global_table_stats;
++
++HASH global_index_stats;
++extern pthread_mutex_t LOCK_global_index_stats;
++
+ #ifdef __WIN__
+ extern void win_install_sigabrt_handler(void);
+ #endif
+@@ -504,6 +521,7 @@
+ mysql_log.write(thd,COM_CONNECT,"%s",ER(ER_NOT_SUPPORTED_AUTH_MODE));
+ DBUG_RETURN(-1);
+ }
++ thd->diff_access_denied_errors++;
+ net_printf_error(thd, ER_ACCESS_DENIED_ERROR,
+ thd->main_security_ctx.user,
+ thd->main_security_ctx.host_or_ip,
+@@ -536,12 +554,190 @@
+ void init_max_user_conn(void)
+ {
+ #ifndef NO_EMBEDDED_ACCESS_CHECKS
+- (void) hash_init(&hash_user_connections,system_charset_info,max_connections,
+- 0,0,
+- (hash_get_key) get_key_conn, (hash_free_key) free_user,
+- 0);
+-#endif
+-}
++ if (hash_init(&hash_user_connections,system_charset_info,max_connections,
++ 0,0,
++ (hash_get_key) get_key_conn, (hash_free_key) free_user,
++ 0)) {
++ sql_print_error("Initializing hash_user_connections failed.");
++ exit(1);
++ }
++#endif
++}
++
++byte *get_key_user_stats(USER_STATS *user_stats, uint *length,
++                         my_bool not_used __attribute__((unused))) {
++  byte *key = (byte*)user_stats->user;  // hash key is the entry's 'user' string
++  *length = strlen(user_stats->user);
++  return key;
++}
++
++void free_user_stats(USER_STATS* user_stats) {
++  // Passed as the hash_free_key callback of the user and client stats hashes.
++  my_free((char*)user_stats, MYF(0));
++}
++
++// Fills a USER_STATS entry with the given names and initial counter values.
++// Names longer than their field are truncated; both fields are always
++// NUL-terminated because get_key_user_stats() measures the key with strlen().
++void init_user_stats(USER_STATS *user_stats,
++                     const char *user, const char *priv_user,
++                     uint total_connections, uint concurrent_connections,
++                     time_t connected_time,
++                     double busy_time, double cpu_time,
++                     ulonglong bytes_received,
++                     ulonglong bytes_sent,
++                     ulonglong binlog_bytes_written,
++                     ha_rows rows_fetched, ha_rows rows_updated, ha_rows rows_read,
++                     ulonglong select_commands,
++                     ulonglong update_commands,
++                     ulonglong other_commands,
++                     ulonglong commit_trans,
++                     ulonglong rollback_trans,
++                     ulonglong denied_connections,
++                     ulonglong lost_connections,
++                     ulonglong access_denied_errors,
++                     ulonglong empty_queries)
++{
++  DBUG_ENTER("init_user_stats");
++  DBUG_PRINT("info", ("Add user_stats entry for user %s - priv_user %s",
++                      user, priv_user));
++  // strncpy() leaves no terminator when it truncates, so write one explicitly.
++  strncpy(user_stats->user, user, sizeof(user_stats->user) - 1);
++  user_stats->user[sizeof(user_stats->user) - 1] = '\0';
++  strncpy(user_stats->priv_user, priv_user, sizeof(user_stats->priv_user) - 1);
++  user_stats->priv_user[sizeof(user_stats->priv_user) - 1] = '\0';
++
++  user_stats->total_connections = total_connections;
++  user_stats->concurrent_connections = concurrent_connections;
++  user_stats->connected_time = connected_time;
++  user_stats->busy_time = busy_time;
++  user_stats->cpu_time = cpu_time;
++  user_stats->bytes_received = bytes_received;
++  user_stats->bytes_sent = bytes_sent;
++  user_stats->binlog_bytes_written = binlog_bytes_written;
++  user_stats->rows_fetched = rows_fetched;
++  user_stats->rows_updated = rows_updated;
++  user_stats->rows_read = rows_read;
++  user_stats->select_commands = select_commands;
++  user_stats->update_commands = update_commands;
++  user_stats->other_commands = other_commands;
++  user_stats->commit_trans = commit_trans;
++  user_stats->rollback_trans = rollback_trans;
++  user_stats->denied_connections = denied_connections;
++  user_stats->lost_connections = lost_connections;
++  user_stats->access_denied_errors = access_denied_errors;
++  user_stats->empty_queries = empty_queries;
++  DBUG_VOID_RETURN;
++}
++
++// Adds every given delta to the matching counter of an existing USER_STATS
++// entry; the user and priv_user names are not touched.
++void add_user_stats(USER_STATS *user_stats,
++                    uint total_connections, uint concurrent_connections,
++                    time_t connected_time,
++                    double busy_time, double cpu_time,
++                    ulonglong bytes_received,
++                    ulonglong bytes_sent,
++                    ulonglong binlog_bytes_written,
++                    ha_rows rows_fetched, ha_rows rows_updated, ha_rows rows_read,
++                    ulonglong select_commands,
++                    ulonglong update_commands,
++                    ulonglong other_commands,
++                    ulonglong commit_trans,
++                    ulonglong rollback_trans,
++                    ulonglong denied_connections,
++                    ulonglong lost_connections,
++                    ulonglong access_denied_errors,
++                    ulonglong empty_queries)
++{
++  USER_STATS &acc= *user_stats;
++
++  acc.total_connections += total_connections;
++  acc.concurrent_connections += concurrent_connections;
++  acc.connected_time += connected_time;
++  acc.busy_time += busy_time;
++  acc.cpu_time += cpu_time;
++  acc.bytes_received += bytes_received;
++  acc.bytes_sent += bytes_sent;
++  acc.binlog_bytes_written += binlog_bytes_written;
++  acc.rows_fetched += rows_fetched;
++  acc.rows_updated += rows_updated;
++  acc.rows_read += rows_read;
++  acc.select_commands += select_commands;
++  acc.update_commands += update_commands;
++  acc.other_commands += other_commands;
++  acc.commit_trans += commit_trans;
++  acc.rollback_trans += rollback_trans;
++  acc.denied_connections += denied_connections;
++  acc.lost_connections += lost_connections;
++  acc.access_denied_errors += access_denied_errors;
++  acc.empty_queries += empty_queries;
++}
++
++void init_global_user_stats(void) {
++  // Creates the global_user_stats hash; on failure logs an error and exits.
++  if (!hash_init(&global_user_stats, system_charset_info, max_connections,
++                 0, 0, (hash_get_key)get_key_user_stats,
++                 (hash_free_key)free_user_stats, 0))
++    return;
++  sql_print_error("Initializing global_user_stats failed.");
++  exit(1);
++}
++
++void init_global_client_stats(void) {
++  // Creates the global_client_stats hash; on failure logs an error and exits.
++  if (!hash_init(&global_client_stats, system_charset_info, max_connections,
++                 0, 0, (hash_get_key)get_key_user_stats,
++                 (hash_free_key)free_user_stats, 0))
++    return;
++  sql_print_error("Initializing global_client_stats failed.");
++  exit(1);
++}
++
++extern "C" byte *get_key_table_stats(TABLE_STATS *table_stats, uint *length,
++                                     my_bool not_used __attribute__((unused))) {
++  byte *key = (byte*)table_stats->table;  // hash key is the entry's 'table' string
++  *length = strlen(table_stats->table);
++  return key;
++}
++
++extern "C" void free_table_stats(TABLE_STATS* table_stats)
++{
++ my_free((char*)table_stats, MYF(0));
++}
++
++void init_global_table_stats(void)
++{
++ if (hash_init(&global_table_stats, system_charset_info, max_connections,
++ 0, 0, (hash_get_key)get_key_table_stats,
++ (hash_free_key)free_table_stats, 0)) {
++ sql_print_error("Initializing global_table_stats failed.");
++ exit(1);
++ }
++}
++
++extern "C" byte *get_key_index_stats(INDEX_STATS *index_stats, uint *length,
++ my_bool not_used __attribute__((unused)))
++{
++ *length = strlen(index_stats->index);
++ return (byte*)index_stats->index;
++}
++
++extern "C" void free_index_stats(INDEX_STATS* index_stats)
++{
++ my_free((char*)index_stats, MYF(0));
++}
++
++void init_global_index_stats(void)
++{
++ if (hash_init(&global_index_stats, system_charset_info, max_connections,
++ 0, 0, (hash_get_key)get_key_index_stats,
++ (hash_free_key)free_index_stats, 0)) {
++ sql_print_error("Initializing global_index_stats failed.");
++ exit(1);
++ }
++}
++
+
+
+ /*
+@@ -599,7 +795,10 @@
+
+ end:
+ if (error)
++ {
++ statistic_increment(denied_connections, &LOCK_status);
+ uc->connections--; // no need for decrease_user_connections() here
++ }
+ (void) pthread_mutex_unlock(&LOCK_user_conn);
+ DBUG_RETURN(error);
+ }
+@@ -646,6 +845,25 @@
+ #endif /* NO_EMBEDDED_ACCESS_CHECKS */
+ }
+
++void free_global_user_stats(void)
++{
++ hash_free(&global_user_stats);
++}
++
++void free_global_table_stats(void)
++{
++ hash_free(&global_table_stats);
++}
++
++void free_global_index_stats(void)
++{
++ hash_free(&global_index_stats);
++}
++
++void free_global_client_stats(void)
++{
++ hash_free(&global_client_stats);
++}
+
+
+ /*
+@@ -698,6 +916,214 @@
+ return uc_update_queries[command] != 0;
+ }
+
++// 'mysql_system_user' is used for when the user is not defined for a THD.
++static char mysql_system_user[] = "#mysql_system#";
++
++// Returns 'user' if it's not NULL. Returns 'mysql_system_user' otherwise.
++static char* get_valid_user_string(char* user) {
++ return user ? user : mysql_system_user;
++}
++
++// Increments the global stats connection count for an entry from
++// global_client_stats or global_user_stats. Returns 0 on success
++// and 1 on error.
++static int increment_count_by_name(const char *name, const char *role_name,
++ HASH *users_or_clients, THD *thd)
++{
++ USER_STATS* user_stats;
++
++ if (!(user_stats = (USER_STATS*)hash_search(users_or_clients, name,
++ strlen(name))))
++ {
++ // First connection for this user or client
++ if (!(user_stats = ((USER_STATS*)
++ my_malloc(sizeof(USER_STATS), MYF(MY_WME | MY_ZEROFILL)))))
++ {
++ return 1; // Out of memory
++ }
++
++ init_user_stats(user_stats, name, role_name,
++ 0, 0, // connections
++ 0, 0, 0, // time
++ 0, 0, 0, // bytes sent, received and written
++ 0, 0, 0, // rows fetched, updated and read
++ 0, 0, 0, // select, update and other commands
++ 0, 0, // commit and rollback trans
++ thd->diff_denied_connections,
++ 0, // lost connections
++ 0, // access denied errors
++ 0); // empty queries
++
++ if (my_hash_insert(users_or_clients, (byte*)user_stats))
++ {
++ my_free((char*)user_stats, 0);
++ return 1; // Out of memory
++ }
++ }
++ user_stats->total_connections++;
++ return 0;
++}
++
++// Increments the global user and client stats connection count. If 'use_lock'
++// is true, LOCK_global_user_client_stats will be locked/unlocked. Returns
++// 0 on success, 1 on error.
++static int increment_connection_count(THD* thd, bool use_lock)
++{
++ char* user_string = get_valid_user_string(thd->main_security_ctx.user);
++ const char* client_string = get_client_host(thd);
++ int return_value = 0;
++
++ if (!opt_userstat_running)
++ return return_value;
++
++ if (use_lock) pthread_mutex_lock(&LOCK_global_user_client_stats);
++
++ if (increment_count_by_name(user_string, user_string,
++ &global_user_stats, thd))
++ {
++ return_value = 1;
++ goto end;
++ }
++ if (increment_count_by_name(client_string,
++ user_string,
++ &global_client_stats, thd))
++ {
++ return_value = 1;
++ goto end;
++ }
++
++end:
++ if (use_lock) pthread_mutex_unlock(&LOCK_global_user_client_stats);
++ return return_value;
++}
++
++// Used to update the global user and client stats.
++static void update_global_user_stats_with_user(THD* thd,
++ USER_STATS* user_stats,
++ time_t now)
++{
++ user_stats->connected_time += now - thd->last_global_update_time;
++ thd->last_global_update_time = now;
++ user_stats->busy_time += thd->diff_total_busy_time;
++ user_stats->cpu_time += thd->diff_total_cpu_time;
++ user_stats->bytes_received += thd->diff_total_bytes_received;
++ user_stats->bytes_sent += thd->diff_total_bytes_sent;
++ user_stats->binlog_bytes_written += thd->diff_total_binlog_bytes_written;
++ user_stats->rows_fetched += thd->diff_total_sent_rows;
++ user_stats->rows_updated += thd->diff_total_updated_rows;
++ user_stats->rows_read += thd->diff_total_read_rows;
++ user_stats->select_commands += thd->diff_select_commands;
++ user_stats->update_commands += thd->diff_update_commands;
++ user_stats->other_commands += thd->diff_other_commands;
++ user_stats->commit_trans += thd->diff_commit_trans;
++ user_stats->rollback_trans += thd->diff_rollback_trans;
++ user_stats->denied_connections += thd->diff_denied_connections;
++ user_stats->lost_connections += thd->diff_lost_connections;
++ user_stats->access_denied_errors += thd->diff_access_denied_errors;
++ user_stats->empty_queries += thd->diff_empty_queries;
++}
++
++// Updates the global stats of a user or client
++void update_global_user_stats(THD* thd, bool create_user, time_t now)
++{
++ if (opt_userstat_running) {
++ char* user_string = get_valid_user_string(thd->main_security_ctx.user);
++ const char* client_string = get_client_host(thd);
++
++ USER_STATS* user_stats;
++ pthread_mutex_lock(&LOCK_global_user_client_stats);
++
++ // Update by user name
++ if ((user_stats = (USER_STATS*)hash_search(&global_user_stats,
++ (byte*)user_string,
++ strlen(user_string)))) {
++ // Found user.
++ update_global_user_stats_with_user(thd, user_stats, now);
++ } else {
++ // Create the entry
++ if (create_user) {
++ increment_count_by_name(user_string, user_string,
++ &global_user_stats, thd);
++ }
++ }
++
++ // Update by client IP
++ if ((user_stats = (USER_STATS*)hash_search(&global_client_stats,
++ (byte*)client_string,
++ strlen(client_string)))) {
++ // Found by client IP
++ update_global_user_stats_with_user(thd, user_stats, now);
++ } else {
++ // Create the entry
++ if (create_user) {
++ increment_count_by_name(client_string,
++ user_string,
++ &global_client_stats, thd);
++ }
++ }
++ thd->reset_diff_stats();
++
++ pthread_mutex_unlock(&LOCK_global_user_client_stats);
++ } else {
++ thd->reset_diff_stats();
++ }
++}
++
++// Determines the concurrent number of connections of current threads.
++static void set_connections_stats()
++{
++ USER_STATS* user_stats;
++
++ pthread_mutex_lock(&LOCK_global_user_client_stats);
++ pthread_mutex_lock(&LOCK_thread_count);
++
++ // Resets all concurrent connections to 0.
++ for (int i = 0; i < global_user_stats.records; ++i) {
++ user_stats = (USER_STATS*)hash_element(&global_user_stats, i);
++ user_stats->concurrent_connections = 0;
++ }
++ for (int i = 0; i < global_client_stats.records; ++i) {
++ user_stats = (USER_STATS*)hash_element(&global_client_stats, i);
++ user_stats->concurrent_connections = 0;
++ }
++
++ I_List_iterator<THD> it(threads);
++ THD* thd;
++ time_t now = time(NULL);
++ // Iterates through the current threads.
++ while ((thd = it++)) {
++ char* user_string = get_valid_user_string(thd->main_security_ctx.user);
++ if ((user_stats = (USER_STATS*)hash_search(&global_user_stats,
++ (byte*)user_string,
++ strlen(user_string)))) {
++ // Found user.
++ user_stats->concurrent_connections++;
++ update_global_user_stats_with_user(thd, user_stats, now);
++ } else {
++ // The user name should exist.
++ if (user_string == mysql_system_user) {
++ // Only create the user if it is the mysql_system_user
++ increment_count_by_name(user_string, user_string,
++ &global_user_stats, thd);
++ }
++ }
++
++ const char* client_string = get_client_host(thd);
++ if ((user_stats = (USER_STATS*)hash_search(&global_client_stats,
++ (byte*)client_string,
++ strlen(client_string)))) {
++ // Found client.
++ user_stats->concurrent_connections++;
++ update_global_user_stats_with_user(thd, user_stats, now);
++ } else {
++ // Do nothing, unlike what is done for global_user_stats
++ }
++ thd->reset_diff_stats();
++ }
++ pthread_mutex_unlock(&LOCK_thread_count);
++ pthread_mutex_unlock(&LOCK_global_user_client_stats);
++}
++
+ /*
+ Reset per-hour user resource limits when it has been more than
+ an hour since they were last checked
+@@ -1184,6 +1610,8 @@
+ my_net_set_read_timeout(net, connect_timeout);
+ my_net_set_write_timeout(net, connect_timeout);
+
++ bool create_user = true;
++
+ if ((error=check_connection(thd)))
+ { // Wrong permissions
+ if (error > 0)
+@@ -1193,8 +1621,22 @@
+ my_sleep(1000); /* must wait after eof() */
+ #endif
+ statistic_increment(aborted_connects,&LOCK_status);
++ thd->diff_denied_connections++;
++ if (error == -2) {
++ // Do not create statistics for a user who does not exist, or failed
++ // to authenticate.
++ create_user = false;
++ }
+ goto end_thread;
+ }
++
++ thd->reset_stats();
++ // Updates global user connection stats.
++ if (increment_connection_count(thd, true)) {
++ net_send_error(thd, ER_OUTOFMEMORY); // Out of memory
++ goto end_thread;
++ }
++
+ #ifdef __NETWARE__
+ netware_reg_user(sctx->ip, sctx->user, "MySQL");
+ #endif
+@@ -1251,6 +1693,7 @@
+ (net->vio && net->error && net->report_error))
+ {
+ statistic_increment(aborted_threads, &LOCK_status);
++ thd->diff_lost_connections++;
+ }
+
+ if (net->error && net->vio != 0 && net->report_error)
+@@ -1270,6 +1713,8 @@
+
+ end_thread:
+ close_connection(thd, 0, 1);
++ thd->update_stats(false);
++ update_global_user_stats(thd, create_user, time(NULL));
+ end_thread(thd,1);
+ /*
+ If end_thread returns, we are either running with --one-thread
+@@ -1601,6 +2046,13 @@
+
+ thd->clear_error(); // Clear error message
+
++ thd->updated_row_count=0;
++ thd->busy_time=0;
++ thd->cpu_time=0;
++ thd->bytes_received=0;
++ thd->bytes_sent=0;
++ thd->binlog_bytes_written=0;
++
+ net_new_transaction(net);
+
+ packet_length= my_net_read(net);
+@@ -1759,6 +2211,9 @@
+ }
+
+ thd->command=command;
++ // To increment the correct command counter for user stats, 'command' must
++ // be saved because it is set to COM_SLEEP at the end of this function.
++ thd->old_command = command;
+ /*
+ Commands which always take a long time are logged into
+ the slow log only if opt_log_slow_admin_statements is set.
+@@ -4539,6 +4994,15 @@
+ if (check_global_access(thd,RELOAD_ACL))
+ goto error;
+
++ if(lex->type & REFRESH_SLOW_QUERY_LOG) {
++ /* We are only flushing slow query log */
++ mysql_slow_log.new_file(1);
++
++ send_ok(thd);
++ break;
++ }
++
++
+ /*
+ reload_acl_and_cache() will tell us if we are allowed to write to the
+ binlog or not.
+@@ -4847,6 +5311,7 @@
+ {
+ if (check_global_access(thd, SUPER_ACL))
+ {
++ thd->diff_access_denied_errors++;
+ my_error(ER_SPECIFIC_ACCESS_DENIED_ERROR, MYF(0), "SUPER");
+ goto create_sp_error;
+ }
+@@ -5691,6 +6156,7 @@
+ if (!no_errors)
+ {
+ const char *db_name= db ? db : thd->db;
++ thd->diff_access_denied_errors++;
+ my_error(ER_DBACCESS_DENIED_ERROR, MYF(0),
+ sctx->priv_user, sctx->priv_host, db_name);
+ }
+@@ -5726,6 +6192,7 @@
+ { // We can never grant this
+ DBUG_PRINT("error",("No possible access"));
++ if (!no_errors) { thd->diff_access_denied_errors++; } // own statement: keeps my_error() below guarded
+ if (!no_errors)
+ my_error(ER_ACCESS_DENIED_ERROR, MYF(0),
+ sctx->priv_user,
+ sctx->priv_host,
+@@ -5758,11 +6225,15 @@
+
+ DBUG_PRINT("error",("Access denied"));
+ if (!no_errors)
++ {
++ // increment needs !no_errors condition, otherwise double counting.
++ thd->diff_access_denied_errors++;
+ my_error(ER_DBACCESS_DENIED_ERROR, MYF(0),
+ sctx->priv_user, sctx->priv_host,
+ (db ? db : (thd->db ?
+ thd->db :
+ "unknown"))); /* purecov: tested */
++ }
+ DBUG_RETURN(TRUE); /* purecov: tested */
+ #endif /* NO_EMBEDDED_ACCESS_CHECKS */
+ }
+@@ -5796,6 +6267,7 @@
+ if ((thd->security_ctx->master_access & want_access))
+ return 0;
+ get_privilege_desc(command, sizeof(command), want_access);
++ thd->diff_access_denied_errors++;
+ my_error(ER_SPECIFIC_ACCESS_DENIED_ERROR, MYF(0), command);
+ return 1;
+ #endif /* NO_EMBEDDED_ACCESS_CHECKS */
+@@ -5828,6 +6300,7 @@
+
+ if (!thd->col_access && check_grant_db(thd, dst_db_name))
+ {
++ thd->diff_access_denied_errors++;
+ my_error(ER_DBACCESS_DENIED_ERROR, MYF(0),
+ thd->security_ctx->priv_user,
+ thd->security_ctx->priv_host,
+@@ -5859,6 +6332,12 @@
+ check_grant(thd, SELECT_ACL, dst_table, 2, UINT_MAX, FALSE);
+ }
+
++
++ case SCH_USER_STATS:
++ case SCH_CLIENT_STATS:
++ return check_global_access(thd, SUPER_ACL | PROCESS_ACL);
++ case SCH_TABLE_STATS:
++ case SCH_INDEX_STATS:
+ case SCH_OPEN_TABLES:
+ case SCH_VARIABLES:
+ case SCH_STATUS:
+@@ -5912,8 +6391,8 @@
+ #ifndef NO_EMBEDDED_ACCESS_CHECKS
+ TABLE_LIST *org_tables= tables;
+ #endif
++ Security_context *sctx= thd->security_ctx, *backup_ctx= thd->security_ctx;
+ TABLE_LIST *first_not_own_table= thd->lex->first_not_own_table();
+- Security_context *sctx= thd->security_ctx, *backup_ctx= thd->security_ctx;
+ /*
+ The check that first_not_own_table is not reached is for the case when
+ the given table list refers to the list for prelocking (contains tables
+@@ -5930,9 +6409,12 @@
+ (want_access & ~(SELECT_ACL | EXTRA_ACL | FILE_ACL)))
+ {
+ if (!no_errors)
++ {
++ thd->diff_access_denied_errors++;
+ my_error(ER_DBACCESS_DENIED_ERROR, MYF(0),
+ sctx->priv_user, sctx->priv_host,
+ INFORMATION_SCHEMA_NAME.str);
++ }
+ return TRUE;
+ }
+ /*
+@@ -6442,6 +6924,30 @@
+ lex_start(thd);
+ mysql_reset_thd_for_next_command(thd);
+
++ int start_time_error = 0;
++ int end_time_error = 0;
++ struct timeval start_time, end_time;
++ double start_usecs = 0;
++ double end_usecs = 0;
++ /* cpu time */
++ int cputime_error = 0;
++ struct timespec tp;
++ double start_cpu_nsecs = 0;
++ double end_cpu_nsecs = 0;
++
++ if (opt_userstat_running) {
++#ifdef HAVE_CLOCK_GETTIME
++ /* get start cputime */
++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++
++ // Gets the start time, in order to measure how long this command takes.
++ if (!(start_time_error = gettimeofday(&start_time, NULL))) {
++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec;
++ }
++ }
++
+ if (query_cache_send_result_to_client(thd, (char*) inBuf, length) <= 0)
+ {
+ LEX *lex= thd->lex;
+@@ -6520,6 +7026,43 @@
+ *found_semicolon= NULL;
+ }
+
++ if (opt_userstat_running) {
++ // Gets the end time.
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) {
++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec;
++ }
++
++ // Calculates the difference between the end and start times.
++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) {
++ thd->busy_time = (end_usecs - start_usecs) / 1000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->busy_time > 2629743) {
++ thd->busy_time = 0;
++ }
++ } else {
++ // end time went back in time, or gettimeofday() failed.
++ thd->busy_time = 0;
++ }
++
++#ifdef HAVE_CLOCK_GETTIME
++ /* get end cputime */
++ if (!cputime_error &&
++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++ if (start_cpu_nsecs && !cputime_error) {
++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->cpu_time > 2629743) {
++ thd->cpu_time = 0;
++ }
++ } else
++ thd->cpu_time = 0;
++ }
++ // Updates THD stats and the global user stats.
++ thd->update_stats(true);
++ update_global_user_stats(thd, true, time(NULL));
++
+ DBUG_VOID_RETURN;
+ }
+
+@@ -7531,8 +8074,37 @@
+ pthread_mutex_unlock(&LOCK_active_mi);
+ }
+ #endif
+ if (options & REFRESH_USER_RESOURCES)
+ reset_mqh((LEX_USER *) NULL);
++ if (options & REFRESH_TABLE_STATS) // empty the table-stats hash by recreating it
++ {
++ pthread_mutex_lock(&LOCK_global_table_stats);
++ free_global_table_stats();
++ init_global_table_stats();
++ pthread_mutex_unlock(&LOCK_global_table_stats);
++ }
++ if (options & REFRESH_INDEX_STATS) // empty the index-stats hash by recreating it
++ {
++ pthread_mutex_lock(&LOCK_global_index_stats);
++ free_global_index_stats();
++ init_global_index_stats();
++ pthread_mutex_unlock(&LOCK_global_index_stats);
++ }
++ if (options & (REFRESH_USER_STATS | REFRESH_CLIENT_STATS)) // both hashes share one mutex
++ {
++ pthread_mutex_lock(&LOCK_global_user_client_stats);
++ if (options & REFRESH_USER_STATS)
++ {
++ free_global_user_stats();
++ init_global_user_stats();
++ }
++ if (options & REFRESH_CLIENT_STATS)
++ {
++ free_global_client_stats();
++ init_global_client_stats();
++ }
++ pthread_mutex_unlock(&LOCK_global_user_client_stats);
++ }
+ *write_to_binlog= tmp_write_to_binlog;
+ return result;
+ }
+diff -r 592f6c3641ba sql/sql_prepare.cc
+--- a/sql/sql_prepare.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_prepare.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -81,6 +81,9 @@
+ #include <mysql_com.h>
+ #endif
+
++// Uses the THD to update the global stats by user name and client IP
++void update_global_user_stats(THD* thd, bool create_user, time_t now);
++
+ /* A result class used to send cursor rows using the binary protocol. */
+
+ class Select_fetch_protocol_prep: public select_send
+@@ -1910,8 +1913,32 @@
+ /* First of all clear possible warnings from the previous command */
+ mysql_reset_thd_for_next_command(thd);
+
++ int start_time_error = 0;
++ int end_time_error = 0;
++ struct timeval start_time, end_time;
++ double start_usecs = 0;
++ double end_usecs = 0;
++ /* cpu time */
++ int cputime_error = 0;
++ struct timespec tp;
++ double start_cpu_nsecs = 0;
++ double end_cpu_nsecs = 0;
++
++ if (opt_userstat_running) {
++#ifdef HAVE_CLOCK_GETTIME
++ /* get start cputime */
++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++
++ // Gets the start time, in order to measure how long this command takes.
++ if (!(start_time_error = gettimeofday(&start_time, NULL))) {
++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec;
++ }
++ }
++
+ if (! (stmt= new Prepared_statement(thd, &thd->protocol_prep)))
+- DBUG_VOID_RETURN; /* out of memory: error is set in Sql_alloc */
++ goto end; /* out of memory: error is set in Sql_alloc */
+
+ if (thd->stmt_map.insert(thd, stmt))
+ {
+@@ -1919,7 +1946,7 @@
+ The error is set in the insert. The statement itself
+ will be also deleted there (this is how the hash works).
+ */
+- DBUG_VOID_RETURN;
++ goto end;
+ }
+
+ /* Reset warnings from previous command */
+@@ -1941,6 +1968,44 @@
+ thd->stmt_map.erase(stmt);
+ }
+ /* check_prepared_statemnt sends the metadata packet in case of success */
++end:
++ if (opt_userstat_running) {
++ // Gets the end time.
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) {
++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec;
++ }
++
++ // Calculates the difference between the end and start times.
++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) {
++ thd->busy_time = (end_usecs - start_usecs) / 1000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->busy_time > 2629743) {
++ thd->busy_time = 0;
++ }
++ } else {
++ // end time went back in time, or gettimeofday() failed.
++ thd->busy_time = 0;
++ }
++
++#ifdef HAVE_CLOCK_GETTIME
++ /* get end cputime */
++ if (!cputime_error &&
++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++ if (start_cpu_nsecs && !cputime_error) {
++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->cpu_time > 2629743) {
++ thd->cpu_time = 0;
++ }
++ } else
++ thd->cpu_time = 0;
++ }
++ // Updates THD stats and the global user stats.
++ thd->update_stats(true);
++ update_global_user_stats(thd, true, time(NULL));
++
+ DBUG_VOID_RETURN;
+ }
+
+@@ -2281,8 +2346,32 @@
+ /* First of all clear possible warnings from the previous command */
+ mysql_reset_thd_for_next_command(thd);
+
++ int start_time_error = 0;
++ int end_time_error = 0;
++ struct timeval start_time, end_time;
++ double start_usecs = 0;
++ double end_usecs = 0;
++ /* cpu time */
++ int cputime_error = 0;
++ struct timespec tp;
++ double start_cpu_nsecs = 0;
++ double end_cpu_nsecs = 0;
++
++ if (opt_userstat_running) {
++#ifdef HAVE_CLOCK_GETTIME
++ /* get start cputime */
++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++
++ // Gets the start time, in order to measure how long this command takes.
++ if (!(start_time_error = gettimeofday(&start_time, NULL))) {
++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec;
++ }
++ }
++
+ if (!(stmt= find_prepared_statement(thd, stmt_id, "mysql_stmt_execute")))
+- DBUG_VOID_RETURN;
++ goto end;
+
+ #if defined(ENABLED_PROFILING) && defined(COMMUNITY_SERVER)
+ thd->profiling.set_query_source(stmt->query, stmt->query_length);
+@@ -2325,11 +2414,50 @@
+ test(flags & (ulong) CURSOR_TYPE_READ_ONLY));
+ if (!(specialflag & SPECIAL_NO_PRIOR))
+ my_pthread_setprio(pthread_self(), WAIT_PRIOR);
+- DBUG_VOID_RETURN;
++ goto end;
+
+ set_params_data_err:
+ my_error(ER_WRONG_ARGUMENTS, MYF(0), "mysql_stmt_execute");
+ reset_stmt_params(stmt);
++
++end:
++ if (opt_userstat_running) {
++ // Gets the end time.
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) {
++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec;
++ }
++
++ // Calculates the difference between the end and start times.
++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) {
++ thd->busy_time = (end_usecs - start_usecs) / 1000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->busy_time > 2629743) {
++ thd->busy_time = 0;
++ }
++ } else {
++ // end time went back in time, or gettimeofday() failed.
++ thd->busy_time = 0;
++ }
++
++#ifdef HAVE_CLOCK_GETTIME
++ /* get end cputime */
++ if (!cputime_error &&
++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++ if (start_cpu_nsecs && !cputime_error) {
++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->cpu_time > 2629743) {
++ thd->cpu_time = 0;
++ }
++ } else
++ thd->cpu_time = 0;
++ }
++ // Updates THD stats and the global user stats.
++ thd->update_stats(true);
++ update_global_user_stats(thd, true, time(NULL));
++
+ DBUG_VOID_RETURN;
+ }
+
+@@ -2423,6 +2551,31 @@
+
+ /* First of all clear possible warnings from the previous command */
+ mysql_reset_thd_for_next_command(thd);
++
++ int start_time_error = 0;
++ int end_time_error = 0;
++ struct timeval start_time, end_time;
++ double start_usecs = 0;
++ double end_usecs = 0;
++ /* cpu time */
++ int cputime_error = 0;
++ struct timespec tp;
++ double start_cpu_nsecs = 0;
++ double end_cpu_nsecs = 0;
++
++ if (opt_userstat_running) {
++#ifdef HAVE_CLOCK_GETTIME
++ /* get start cputime */
++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++
++ // Gets the start time, in order to measure how long this command takes.
++ if (!(start_time_error = gettimeofday(&start_time, NULL))) {
++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec;
++ }
++ }
++
+ statistic_increment(thd->status_var.com_stmt_fetch, &LOCK_status);
+ if (!(stmt= find_prepared_statement(thd, stmt_id, "mysql_stmt_fetch")))
+ DBUG_VOID_RETURN;
+@@ -2455,6 +2608,43 @@
+ thd->restore_backup_statement(stmt, &stmt_backup);
+ thd->stmt_arena= thd;
+
++ if (opt_userstat_running) {
++ // Gets the end time.
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) {
++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec;
++ }
++
++ // Calculates the difference between the end and start times.
++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) {
++ thd->busy_time = (end_usecs - start_usecs) / 1000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->busy_time > 2629743) {
++ thd->busy_time = 0;
++ }
++ } else {
++ // end time went back in time, or gettimeofday() failed.
++ thd->busy_time = 0;
++ }
++
++#ifdef HAVE_CLOCK_GETTIME
++ /* get end cputime */
++ if (!cputime_error &&
++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++ if (start_cpu_nsecs && !cputime_error) {
++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->cpu_time > 2629743) {
++ thd->cpu_time = 0;
++ }
++ } else
++ thd->cpu_time = 0;
++ }
++ // Updates THD stats and the global user stats.
++ thd->update_stats(true);
++ update_global_user_stats(thd, true, time(NULL));
++
+ DBUG_VOID_RETURN;
+ }
+
+@@ -2487,6 +2677,30 @@
+ /* First of all clear possible warnings from the previous command */
+ mysql_reset_thd_for_next_command(thd);
+
++ int start_time_error = 0;
++ int end_time_error = 0;
++ struct timeval start_time, end_time;
++ double start_usecs = 0;
++ double end_usecs = 0;
++ /* cpu time */
++ int cputime_error = 0;
++ struct timespec tp;
++ double start_cpu_nsecs = 0;
++ double end_cpu_nsecs = 0;
++
++ if (opt_userstat_running) {
++#ifdef HAVE_CLOCK_GETTIME
++ /* get start cputime */
++ if (!(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ start_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++
++ // Gets the start time, in order to measure how long this command takes.
++ if (!(start_time_error = gettimeofday(&start_time, NULL))) {
++ start_usecs = start_time.tv_sec * 1000000.0 + start_time.tv_usec;
++ }
++ }
++
+ statistic_increment(thd->status_var.com_stmt_reset, &LOCK_status);
+ if (!(stmt= find_prepared_statement(thd, stmt_id, "mysql_stmt_reset")))
+ DBUG_VOID_RETURN;
+@@ -2503,6 +2717,43 @@
+
+ send_ok(thd);
+
++ if (opt_userstat_running) {
++ // Gets the end time.
++ if (!(end_time_error = gettimeofday(&end_time, NULL))) {
++ end_usecs = end_time.tv_sec * 1000000.0 + end_time.tv_usec;
++ }
++
++ // Calculates the difference between the end and start times.
++ if (start_usecs && end_usecs >= start_usecs && !start_time_error && !end_time_error) {
++ thd->busy_time = (end_usecs - start_usecs) / 1000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->busy_time > 2629743) {
++ thd->busy_time = 0;
++ }
++ } else {
++ // end time went back in time, or gettimeofday() failed.
++ thd->busy_time = 0;
++ }
++
++#ifdef HAVE_CLOCK_GETTIME
++ /* get end cputime */
++ if (!cputime_error &&
++ !(cputime_error = clock_gettime(CLOCK_THREAD_CPUTIME_ID, &tp)))
++ end_cpu_nsecs = tp.tv_sec*1000000000.0+tp.tv_nsec;
++#endif
++ if (start_cpu_nsecs && !cputime_error) {
++ thd->cpu_time = (end_cpu_nsecs - start_cpu_nsecs) / 1000000000;
++ // In case there are bad values, 2629743 is the #seconds in a month.
++ if (thd->cpu_time > 2629743) {
++ thd->cpu_time = 0;
++ }
++ } else
++ thd->cpu_time = 0;
++ }
++ // Updates THD stats and the global user stats.
++ thd->update_stats(true);
++ update_global_user_stats(thd, true, time(NULL));
++
+ DBUG_VOID_RETURN;
+ }
+
+diff -r 592f6c3641ba sql/sql_show.cc
+--- a/sql/sql_show.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_show.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -540,6 +540,7 @@
+ sctx->master_access);
+ if (!(db_access & DB_ACLS) && (!grant_option || check_grant_db(thd,dbname)))
+ {
++ thd->diff_access_denied_errors++;
+ my_error(ER_DBACCESS_DENIED_ERROR, MYF(0),
+ sctx->priv_user, sctx->host_or_ip, dbname);
+ mysql_log.write(thd,COM_INIT_DB,ER(ER_DBACCESS_DENIED_ERROR),
+@@ -1890,6 +1891,300 @@
+ DBUG_RETURN(FALSE);
+ }
+
++/*
++ Aggregate values for mapped_user entries by their role.
++
++ SYNOPSIS
++ aggregate_user_stats
++ all_user_stats - input to aggregate
++ agg_user_stats - returns aggregated values
++
++ RETURN
++ 0 - OK
++ 1 - error
++ */
++static int
++aggregate_user_stats(HASH *all_user_stats, HASH *agg_user_stats)
++{
++ DBUG_ENTER("aggregate_user_stats");
++ if (hash_init(agg_user_stats, system_charset_info,
++ max(all_user_stats->records, 1),
++ 0, 0, (hash_get_key)get_key_user_stats,
++ (hash_free_key)free_user_stats, 0))
++ {
++ sql_print_error("Malloc in aggregate_user_stats failed");
++ DBUG_RETURN(1);
++ }
++
++ for (int i = 0; i < all_user_stats->records; ++i) {
++ USER_STATS *user = (USER_STATS*)hash_element(all_user_stats, i);
++ USER_STATS *agg_user;
++ if (!(agg_user = (USER_STATS*)hash_search(agg_user_stats,
++ (byte*)user->priv_user,
++ strlen(user->priv_user))))
++ {
++ // First entry for this role.
++ if (!(agg_user =
++ (USER_STATS*) my_malloc(sizeof(USER_STATS), MYF(MY_WME | MY_ZEROFILL))))
++ {
++ sql_print_error("Malloc in aggregate_user_stats failed");
++ DBUG_RETURN(1);
++ }
++
++ init_user_stats(agg_user, user->priv_user, user->priv_user,
++ user->total_connections, user->concurrent_connections,
++ user->connected_time, user->busy_time, user->cpu_time,
++ user->bytes_received, user->bytes_sent,
++ user->binlog_bytes_written,
++ user->rows_fetched, user->rows_updated, user->rows_read,
++ user->select_commands, user->update_commands,
++ user->other_commands,
++ user->commit_trans, user->rollback_trans,
++ user->denied_connections, user->lost_connections,
++ user->access_denied_errors, user->empty_queries);
++
++ if (my_hash_insert(agg_user_stats, (byte*)agg_user))
++ {
++ // Out of memory.
++ my_free((char*)agg_user, 0);
++ sql_print_error("Malloc in aggregate_user_stats failed");
++ DBUG_RETURN(1);
++ }
++ }
++ else
++ {
++ // Aggregate with existing values for this role.
++ add_user_stats(agg_user,
++ user->total_connections, user->concurrent_connections,
++ user->connected_time, user->busy_time, user->cpu_time,
++ user->bytes_received, user->bytes_sent,
++ user->binlog_bytes_written,
++ user->rows_fetched, user->rows_updated, user->rows_read,
++ user->select_commands, user->update_commands,
++ user->other_commands,
++ user->commit_trans, user->rollback_trans,
++ user->denied_connections, user->lost_connections,
++ user->access_denied_errors, user->empty_queries);
++ }
++ }
++ DBUG_PRINT("exit", ("aggregated %d input into %d output entries",
++ all_user_stats->records, agg_user_stats->records));
++ DBUG_RETURN(0);
++}
++
++/*
++ Copy every USER_STATS entry of a hash into rows of an I_S table.
++ Counters are stored with the two-argument store(longlong, TRUE) form.
++ SYNOPSIS
++ send_user_stats
++ thd - current thread
++ all_user_stats - hash of USER_STATS entries to emit
++ table - I_S table that receives one row per entry
++ RETURN
++ 0 - OK
++ 1 - error (schema_table_store_record failed)
++ */
++int send_user_stats(THD* thd, HASH *all_user_stats, TABLE *table)
++{
++ DBUG_ENTER("send_user_stats");
++ for (uint i = 0; i < all_user_stats->records; ++i) {
++ restore_record(table, s->default_values);
++ USER_STATS *user_stats = (USER_STATS*)hash_element(all_user_stats, i);
++ table->field[0]->store(user_stats->user, strlen(user_stats->user), system_charset_info);
++ table->field[1]->store((longlong)user_stats->total_connections, TRUE); // same call form as fill_schema_table_stats()
++ table->field[2]->store((longlong)user_stats->concurrent_connections, TRUE);
++ table->field[3]->store((longlong)user_stats->connected_time, TRUE);
++ table->field[4]->store((longlong)user_stats->busy_time, TRUE); // double seconds, truncated by the cast
++ table->field[5]->store((longlong)user_stats->cpu_time, TRUE); // double seconds, truncated by the cast
++ table->field[6]->store((longlong)user_stats->bytes_received, TRUE);
++ table->field[7]->store((longlong)user_stats->bytes_sent, TRUE);
++ table->field[8]->store((longlong)user_stats->binlog_bytes_written, TRUE);
++ table->field[9]->store((longlong)user_stats->rows_fetched, TRUE);
++ table->field[10]->store((longlong)user_stats->rows_updated, TRUE);
++ table->field[11]->store((longlong)user_stats->rows_read, TRUE);
++ table->field[12]->store((longlong)user_stats->select_commands, TRUE);
++ table->field[13]->store((longlong)user_stats->update_commands, TRUE);
++ table->field[14]->store((longlong)user_stats->other_commands, TRUE);
++ table->field[15]->store((longlong)user_stats->commit_trans, TRUE);
++ table->field[16]->store((longlong)user_stats->rollback_trans, TRUE);
++ table->field[17]->store((longlong)user_stats->denied_connections, TRUE);
++ table->field[18]->store((longlong)user_stats->lost_connections, TRUE);
++ table->field[19]->store((longlong)user_stats->access_denied_errors, TRUE);
++ table->field[20]->store((longlong)user_stats->empty_queries, TRUE);
++ if (schema_table_store_record(thd, table))
++ {
++ DBUG_PRINT("error", ("store record error"));
++ DBUG_RETURN(1);
++ }
++ }
++ DBUG_RETURN(0);
++}
++
++/*
++ Fill the USER_STATISTICS I_S table (used by SHOW USER_STATISTICS)
++
++ SYNOPSIS
++ fill_schema_user_stats
++ thd - current thread
++ tables - table list whose ->table receives the rows
++ cond - not used by this function
++
++ RETURN
++ 0 - OK
++ 1 - error (access check failed or a row could not be stored)
++ */
++
++
++int fill_schema_user_stats(THD* thd, TABLE_LIST* tables, COND* cond)
++{
++ TABLE *table= tables->table;
++ DBUG_ENTER("fill_schema_user_stats");
++
++ // The caller must pass the SUPER_ACL | PROCESS_ACL global access check.
++ if (check_global_access(thd, SUPER_ACL | PROCESS_ACL))
++ DBUG_RETURN(1);
++
++ // Emit every entry of global_user_stats; the hash is only read while
++ // LOCK_global_user_client_stats is held.
++ pthread_mutex_lock(&LOCK_global_user_client_stats);
++ const int failed= send_user_stats(thd, &global_user_stats, table);
++ pthread_mutex_unlock(&LOCK_global_user_client_stats);
++
++ if (failed)
++ {
++ DBUG_PRINT("exit", ("fill_schema_user_stats result is 1"));
++ DBUG_RETURN(1);
++ }
++
++ DBUG_PRINT("exit", ("fill_schema_user_stats result is 0"));
++ DBUG_RETURN(0);
++}
++
++/*
++ Fill the CLIENT_STATISTICS I_S table (used by SHOW CLIENT_STATISTICS)
++
++ SYNOPSIS
++ fill_schema_client_stats
++ thd - current thread
++ tables - table list whose ->table receives the rows
++ cond - not used by this function
++ RETURN
++ 0 - OK
++ 1 - error (access check failed or a row could not be stored)
++ */
++
++
++int fill_schema_client_stats(THD* thd, TABLE_LIST* tables, COND* cond)
++{
++ TABLE *table= tables->table;
++ DBUG_ENTER("fill_schema_client_stats");
++
++ // The caller must pass the SUPER_ACL | PROCESS_ACL global access check.
++ if (check_global_access(thd, SUPER_ACL | PROCESS_ACL))
++ DBUG_RETURN(1);
++
++ // Emit every entry of global_client_stats; the hash is only read while
++ // LOCK_global_user_client_stats is held.
++ pthread_mutex_lock(&LOCK_global_user_client_stats);
++ const int failed= send_user_stats(thd, &global_client_stats, table);
++ pthread_mutex_unlock(&LOCK_global_user_client_stats);
++
++ if (failed)
++ {
++ DBUG_PRINT("exit", ("mysqld_show_client_stats result is 1"));
++ DBUG_RETURN(1);
++ }
++
++ DBUG_PRINT("exit", ("mysqld_show_client_stats result is 0"));
++ DBUG_RETURN(0);
++}
++
++
++// Fills the TABLE_STATISTICS I_S table from global_table_stats; rows failing the SELECT access check are skipped. Returns 0 on success, 1 on error.
++int fill_schema_table_stats(THD* thd, TABLE_LIST* tables, COND* cond)
++{
++ TABLE *table= tables->table;
++ DBUG_ENTER("fill_schema_table_stats");
++ char *table_full_name, *table_schema;
++
++ pthread_mutex_lock(&LOCK_global_table_stats);
++ for (uint i = 0; i < global_table_stats.records; ++i) {
++ restore_record(table, s->default_values);
++ TABLE_STATS *table_stats =
++ (TABLE_STATS*)hash_element(&global_table_stats, i);
++
++ // Split a copy of the "db.table" key at its first '.'; strsep() leaves the table part in table_full_name.
++ table_full_name= thd->strdup(table_stats->table);
++ table_schema= strsep(&table_full_name, ".");
++ if (!table_schema || !table_full_name) // strdup() failed or the key has no '.'
++ goto err; // strlen() below must never see NULL
++ TABLE_LIST tmp_table;
++ bzero((char*) &tmp_table,sizeof(tmp_table));
++ tmp_table.table_name= table_full_name;
++ tmp_table.db= table_schema;
++ tmp_table.grant.privilege= 0;
++ if (check_access(thd, SELECT_ACL | EXTRA_ACL, tmp_table.db,
++ &tmp_table.grant.privilege, 0, 0,
++ is_schema_db(table_schema)) ||
++ (grant_option && check_grant(thd, SELECT_ACL, &tmp_table, 1, UINT_MAX, 1)))
++ continue; // access check failed: do not expose this row
++ table->field[0]->store(table_schema, strlen(table_schema), system_charset_info);
++ table->field[1]->store(table_full_name, strlen(table_full_name), system_charset_info);
++ table->field[2]->store((longlong)table_stats->rows_read, TRUE);
++ table->field[3]->store((longlong)table_stats->rows_changed, TRUE);
++ table->field[4]->store((longlong)table_stats->rows_changed_x_indexes, TRUE);
++ if (schema_table_store_record(thd, table))
++ goto err;
++ }
++ pthread_mutex_unlock(&LOCK_global_table_stats);
++ DBUG_RETURN(0);
++ err: // single exit for failures: always release the stats mutex
++ pthread_mutex_unlock(&LOCK_global_table_stats);
++ DBUG_RETURN(1);
++}
++
++// Fills the INDEX_STATISTICS I_S table from global_index_stats; rows failing the SELECT access check are skipped. Returns 0 on success, 1 on error.
++int fill_schema_index_stats(THD* thd, TABLE_LIST* tables, COND* cond)
++{
++ TABLE *table= tables->table;
++ DBUG_ENTER("fill_schema_index_stats");
++ char *index_full_name, *table_schema, *table_name;
++
++ pthread_mutex_lock(&LOCK_global_index_stats);
++ for (uint i = 0; i < global_index_stats.records; ++i) {
++ restore_record(table, s->default_values);
++ INDEX_STATS *index_stats =
++ (INDEX_STATS*)hash_element(&global_index_stats, i);
++
++ // Split a copy of the "db.table.index" key at its first two '.'; the index part stays in index_full_name.
++ index_full_name= thd->strdup(index_stats->index);
++ table_schema= strsep(&index_full_name, ".");
++ table_name= strsep(&index_full_name, ".");
++ if (!table_schema || !table_name || !index_full_name) // strdup() failed or a '.' is missing
++ goto err; // strlen() below must never see NULL
++ TABLE_LIST tmp_table;
++ bzero((char*) &tmp_table,sizeof(tmp_table));
++ tmp_table.table_name= table_name;
++ tmp_table.db= table_schema;
++ tmp_table.grant.privilege= 0;
++ if (check_access(thd, SELECT_ACL | EXTRA_ACL, tmp_table.db,
++ &tmp_table.grant.privilege, 0, 0,
++ is_schema_db(table_schema)) ||
++ (grant_option && check_grant(thd, SELECT_ACL, &tmp_table, 1, UINT_MAX, 1)))
++ continue; // access check failed: do not expose this row
++ table->field[0]->store(table_schema, strlen(table_schema), system_charset_info);
++ table->field[1]->store(table_name, strlen(table_name), system_charset_info);
++ table->field[2]->store(index_full_name, strlen(index_full_name), system_charset_info);
++ table->field[3]->store((longlong)index_stats->rows_read, TRUE);
++ if (schema_table_store_record(thd, table))
++ goto err;
++ }
++ pthread_mutex_unlock(&LOCK_global_index_stats);
++ DBUG_RETURN(0);
++ err: // single exit for failures: always release the stats mutex
++ pthread_mutex_unlock(&LOCK_global_index_stats);
++ DBUG_RETURN(1);
++}
+
+ /* collect status for all running threads */
+
+@@ -4500,6 +4795,77 @@
+ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
+ };
+
++ST_FIELD_INFO user_stats_fields_info[]= // Columns of USER_STATISTICS; order must match the field[0..20] indexes written by send_user_stats()
++{
++ {"USER", USERNAME_LENGTH, MYSQL_TYPE_STRING, 0, 0, "User"},
++ {"TOTAL_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Total_connections"},
++ {"CONCURRENT_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Concurrent_connections"},
++ {"CONNECTED_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Connected_time"},
++ {"BUSY_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Busy_time"},
++ {"CPU_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Cpu_time"},
++ {"BYTES_RECEIVED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_received"},
++ {"BYTES_SENT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_sent"},
++ {"BINLOG_BYTES_WRITTEN", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Binlog_bytes_written"},
++ {"ROWS_FETCHED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_fetched"},
++ {"ROWS_UPDATED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_updated"},
++ {"TABLE_ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Table_rows_read"},
++ {"SELECT_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Select_commands"},
++ {"UPDATE_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Update_commands"},
++ {"OTHER_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Other_commands"},
++ {"COMMIT_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Commit_transactions"},
++ {"ROLLBACK_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rollback_transactions"},
++ {"DENIED_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Denied_connections"},
++ {"LOST_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Lost_connections"},
++ {"ACCESS_DENIED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Access_denied"},
++ {"EMPTY_QUERIES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Empty_queries"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
++};
++
++ST_FIELD_INFO client_stats_fields_info[]= // Columns of CLIENT_STATISTICS; rows are also written by send_user_stats(), so the order must match its field[0..20] indexes
++{
++ {"CLIENT", LIST_PROCESS_HOST_LEN, MYSQL_TYPE_STRING, 0, 0, "Client"},
++ {"TOTAL_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Total_connections"},
++ {"CONCURRENT_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Concurrent_connections"},
++ {"CONNECTED_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Connected_time"},
++ {"BUSY_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Busy_time"},
++ {"CPU_TIME", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Cpu_time"},
++ {"BYTES_RECEIVED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_received"},
++ {"BYTES_SENT", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Bytes_sent"},
++ {"BINLOG_BYTES_WRITTEN", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Binlog_bytes_written"},
++ {"ROWS_FETCHED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_fetched"},
++ {"ROWS_UPDATED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_updated"},
++ {"TABLE_ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Table_rows_read"},
++ {"SELECT_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Select_commands"},
++ {"UPDATE_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Update_commands"},
++ {"OTHER_COMMANDS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Other_commands"},
++ {"COMMIT_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Commit_transactions"},
++ {"ROLLBACK_TRANSACTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rollback_transactions"},
++ {"DENIED_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Denied_connections"},
++ {"LOST_CONNECTIONS", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Lost_connections"},
++ {"ACCESS_DENIED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Access_denied"},
++ {"EMPTY_QUERIES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Empty_queries"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
++};
++
++
++ST_FIELD_INFO table_stats_fields_info[]= // Columns of TABLE_STATISTICS; order must match the field[0..4] indexes written by fill_schema_table_stats()
++{
++ {"TABLE_SCHEMA", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_schema"},
++ {"TABLE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_name"},
++ {"ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_read"},
++ {"ROWS_CHANGED", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_changed"},
++ {"ROWS_CHANGED_X_INDEXES", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_changed_x_#indexes"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
++};
++
++ST_FIELD_INFO index_stats_fields_info[]= // Columns of INDEX_STATISTICS; order must match the field[0..3] indexes written by fill_schema_index_stats()
++{
++ {"TABLE_SCHEMA", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_schema"},
++ {"TABLE_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Table_name"},
++ {"INDEX_NAME", NAME_LEN, MYSQL_TYPE_STRING, 0, 0, "Index_name"},
++ {"ROWS_READ", MY_INT64_NUM_DECIMAL_DIGITS, MYSQL_TYPE_LONG, 0, 0, "Rows_read"},
++ {0, 0, MYSQL_TYPE_STRING, 0, 0, 0}
++};
+
+ /*
+ Description of ST_FIELD_INFO in table.h
+@@ -4509,6 +4875,8 @@
+ {
+ {"CHARACTER_SETS", charsets_fields_info, create_schema_table,
+ fill_schema_charsets, make_character_sets_old_format, 0, -1, -1, 0},
++ {"CLIENT_STATISTICS", client_stats_fields_info, create_schema_table,
++ fill_schema_client_stats, make_old_format, 0, -1, -1, 0},
+ {"COLLATIONS", collation_fields_info, create_schema_table,
+ fill_schema_collation, make_old_format, 0, -1, -1, 0},
+ {"COLLATION_CHARACTER_SET_APPLICABILITY", coll_charset_app_fields_info,
+@@ -4517,6 +4885,8 @@
+ get_all_tables, make_columns_old_format, get_schema_column_record, 1, 2, 0},
+ {"COLUMN_PRIVILEGES", column_privileges_fields_info, create_schema_table,
+ fill_schema_column_privileges, 0, 0, -1, -1, 0},
++ {"INDEX_STATISTICS", index_stats_fields_info, create_schema_table,
++ fill_schema_index_stats, make_old_format, 0, -1, -1, 0},
+ {"KEY_COLUMN_USAGE", key_column_usage_fields_info, create_schema_table,
+ get_all_tables, 0, get_schema_key_column_usage_record, 4, 5, 0},
+ {"OPEN_TABLES", open_tables_fields_info, create_schema_table,
+@@ -4542,10 +4912,14 @@
+ get_all_tables, make_table_names_old_format, 0, 1, 2, 1},
+ {"TABLE_PRIVILEGES", table_privileges_fields_info, create_schema_table,
+ fill_schema_table_privileges, 0, 0, -1, -1, 0},
++ {"TABLE_STATISTICS", table_stats_fields_info, create_schema_table,
++ fill_schema_table_stats, make_old_format, 0, -1, -1, 0},
+ {"TRIGGERS", triggers_fields_info, create_schema_table,
+ get_all_tables, make_old_format, get_schema_triggers_record, 5, 6, 0},
+ {"USER_PRIVILEGES", user_privileges_fields_info, create_schema_table,
+ fill_schema_user_privileges, 0, 0, -1, -1, 0},
++ {"USER_STATISTICS", user_stats_fields_info, create_schema_table,
++ fill_schema_user_stats, make_old_format, 0, -1, -1, 0},
+ {"VARIABLES", variables_fields_info, create_schema_table, fill_variables,
+ make_old_format, 0, -1, -1, 1},
+ {"VIEWS", view_fields_info, create_schema_table,
+diff -r 592f6c3641ba sql/sql_update.cc
+--- a/sql/sql_update.cc Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_update.cc Wed Jul 29 13:34:11 2009 -0700
+@@ -601,7 +601,8 @@
+ (thd->client_capabilities & CLIENT_FOUND_ROWS) ? found : updated;
+ send_ok(thd, (ulong) thd->row_count_func,
+ thd->insert_id_used ? thd->last_insert_id : 0L,buff);
+- DBUG_PRINT("info",("%ld records updated", (long) updated));
++ thd->updated_row_count += thd->row_count_func;
++ DBUG_PRINT("info",("%ld records updated", (long) updated));
+ }
+ thd->count_cuted_fields= CHECK_FIELD_IGNORE; /* calc cuted fields */
+ thd->abort_on_warning= 0;
+@@ -1832,5 +1833,6 @@
+ (thd->client_capabilities & CLIENT_FOUND_ROWS) ? found : updated;
+ ::send_ok(thd, (ulong) thd->row_count_func,
+ thd->insert_id_used ? thd->last_insert_id : 0L,buff);
++ thd->updated_row_count += thd->row_count_func;
+ return FALSE;
+ }
+diff -r 592f6c3641ba sql/sql_yacc.yy
+--- a/sql/sql_yacc.yy Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/sql_yacc.yy Wed Jul 29 13:34:11 2009 -0700
+@@ -523,6 +523,7 @@
+ %token CHECK_SYM
+ %token CIPHER_SYM
+ %token CLIENT_SYM
++%token CLIENT_STATS_SYM
+ %token CLOSE_SYM
+ %token COALESCE
+ %token CODE_SYM
+@@ -680,6 +681,7 @@
+ %token IMPORT
+ %token INDEXES
+ %token INDEX_SYM
++%token INDEX_STATS_SYM
+ %token INFILE
+ %token INNER_SYM
+ %token INNOBASE_SYM
+@@ -909,6 +911,7 @@
+ %token SIGNED_SYM
+ %token SIMPLE_SYM
+ %token SLAVE
++%token SLOW_SYM
+ %token SMALLINT
+ %token SNAPSHOT_SYM
+ %token SOUNDS_SYM
+@@ -949,6 +952,7 @@
+ %token TABLES
+ %token TABLESPACE
+ %token TABLE_SYM
++%token TABLE_STATS_SYM
+ %token TEMPORARY
+ %token TEMPTABLE_SYM
+ %token TERMINATED
+@@ -991,6 +995,7 @@
+ %token UPGRADE_SYM
+ %token USAGE
+ %token USER
++%token USER_STATS_SYM
+ %token USE_FRM
+ %token USE_SYM
+ %token USING
+@@ -8255,6 +8260,38 @@
+ {
+ Lex->sql_command = SQLCOM_SHOW_SLAVE_STAT;
+ }
++ | CLIENT_STATS_SYM wild_and_where
++ {
++ LEX *lex= Lex;
++ Lex->sql_command = SQLCOM_SELECT;
++ lex->orig_sql_command= SQLCOM_SHOW_CLIENT_STATS;
++ if (prepare_schema_table(YYTHD, lex, 0, SCH_CLIENT_STATS))
++ MYSQL_YYABORT;
++ }
++ | USER_STATS_SYM wild_and_where
++ {
++ LEX *lex= Lex;
++ lex->sql_command = SQLCOM_SELECT;
++ lex->orig_sql_command= SQLCOM_SHOW_USER_STATS;
++ if (prepare_schema_table(YYTHD, lex, 0, SCH_USER_STATS))
++ MYSQL_YYABORT;
++ }
++ | TABLE_STATS_SYM wild_and_where
++ {
++ LEX *lex= Lex;
++ lex->sql_command= SQLCOM_SELECT;
++ lex->orig_sql_command= SQLCOM_SHOW_TABLE_STATS;
++ if (prepare_schema_table(YYTHD, lex, 0, SCH_TABLE_STATS))
++ MYSQL_YYABORT;
++ }
++ | INDEX_STATS_SYM wild_and_where
++ {
++ LEX *lex= Lex;
++ lex->sql_command= SQLCOM_SELECT;
++ lex->orig_sql_command= SQLCOM_SHOW_INDEX_STATS;
++ if (prepare_schema_table(YYTHD, lex, 0, SCH_INDEX_STATS))
++ MYSQL_YYABORT;
++ }
+ | CREATE PROCEDURE sp_name
+ {
+ LEX *lex= Lex;
+@@ -8459,9 +8496,14 @@
+ | LOGS_SYM { Lex->type|= REFRESH_LOG; }
+ | STATUS_SYM { Lex->type|= REFRESH_STATUS; }
+ | SLAVE { Lex->type|= REFRESH_SLAVE; }
++ | SLOW_SYM QUERY_SYM LOGS_SYM { Lex->type |= REFRESH_SLOW_QUERY_LOG; }
+ | MASTER_SYM { Lex->type|= REFRESH_MASTER; }
+ | DES_KEY_FILE { Lex->type|= REFRESH_DES_KEY_FILE; }
+- | RESOURCES { Lex->type|= REFRESH_USER_RESOURCES; };
++ | RESOURCES { Lex->type|= REFRESH_USER_RESOURCES; }
++ | CLIENT_STATS_SYM { Lex->type|= REFRESH_CLIENT_STATS; }
++ | USER_STATS_SYM { Lex->type|= REFRESH_USER_STATS; }
++ | TABLE_STATS_SYM { Lex->type|= REFRESH_TABLE_STATS; }
++ | INDEX_STATS_SYM { Lex->type|= REFRESH_INDEX_STATS; };
+
+ opt_table_list:
+ /* empty */ {;}
+@@ -9450,6 +9492,7 @@
+ | CHAIN_SYM {}
+ | CHANGED {}
+ | CIPHER_SYM {}
++ | CLIENT_STATS_SYM {}
+ | CLIENT_SYM {}
+ | CODE_SYM {}
+ | COLLATION_SYM {}
+@@ -9502,6 +9545,7 @@
+ | HOSTS_SYM {}
+ | HOUR_SYM {}
+ | IDENTIFIED_SYM {}
++ | INDEX_STATS_SYM {}
+ | INVOKER_SYM {}
+ | IMPORT {}
+ | INDEXES {}
+@@ -9611,6 +9655,7 @@
+ | SIMPLE_SYM {}
+ | SHARE_SYM {}
+ | SHUTDOWN {}
++ | SLOW_SYM {}
+ | SNAPSHOT_SYM {}
+ | SOUNDS_SYM {}
+ | SOURCE_SYM {}
+@@ -9627,6 +9672,7 @@
+ | SUSPEND_SYM {}
+ | SWAPS_SYM {}
+ | SWITCHES_SYM {}
++ | TABLE_STATS_SYM {}
+ | TABLES {}
+ | TABLESPACE {}
+ | TEMPORARY {}
+@@ -9647,6 +9693,7 @@
+ | UNKNOWN_SYM {}
+ | UNTIL_SYM {}
+ | USER {}
++ | USER_STATS_SYM {}
+ | USE_FRM {}
+ | VARIABLES {}
+ | VIEW_SYM {}
+diff -r 592f6c3641ba sql/structs.h
+--- a/sql/structs.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/structs.h Wed Jul 29 13:34:11 2009 -0700
+@@ -273,6 +273,98 @@
+ time_t intime;
+ } USER_CONN;
+
++typedef struct st_user_stats { // Counter record exported row-by-row by send_user_stats() (USER_STATISTICS and CLIENT_STATISTICS)
++ char user[max(USERNAME_LENGTH, LIST_PROCESS_HOST_LEN) + 1]; // sized for the longer of a user name and a client host, plus '\0'
++ // Account name the user is mapped to when this is a user from mapped_user.
++ // Otherwise, the same value as user.
++ char priv_user[max(USERNAME_LENGTH, LIST_PROCESS_HOST_LEN) + 1];
++ uint total_connections;
++ uint concurrent_connections;
++ time_t connected_time; // in seconds
++ double busy_time; // in seconds
++ double cpu_time; // in seconds
++ ulonglong bytes_received;
++ ulonglong bytes_sent;
++ ulonglong binlog_bytes_written;
++ ha_rows rows_fetched, rows_updated, rows_read; // rows_read is exported as the TABLE_ROWS_READ column
++ ulonglong select_commands, update_commands, other_commands;
++ ulonglong commit_trans, rollback_trans;
++ ulonglong denied_connections, lost_connections;
++ ulonglong access_denied_errors; // exported as the ACCESS_DENIED column
++ ulonglong empty_queries;
++} USER_STATS;
++
++/* Lookup function for hash tables with USER_STATS entries */
++extern byte *get_key_user_stats(USER_STATS *user_stats, uint *length,
++ my_bool not_used __attribute__((unused)));
++
++/* Free all memory for a hash table with USER_STATS entries */
++extern void free_user_stats(USER_STATS* user_stats);
++
++/* Initialize an instance of USER_STATS */
++extern void
++init_user_stats(USER_STATS *user_stats,
++ const char *user,
++ const char *priv_user,
++ uint total_connections,
++ uint concurrent_connections,
++ time_t connected_time,
++ double busy_time,
++ double cpu_time,
++ ulonglong bytes_received,
++ ulonglong bytes_sent,
++ ulonglong binlog_bytes_written,
++ ha_rows rows_fetched,
++ ha_rows rows_updated,
++ ha_rows rows_read,
++ ulonglong select_commands,
++ ulonglong update_commands,
++ ulonglong other_commands,
++ ulonglong commit_trans,
++ ulonglong rollback_trans,
++ ulonglong denied_connections,
++ ulonglong lost_connections,
++ ulonglong access_denied_errors,
++ ulonglong empty_queries);
++
++/* Increment values of an instance of USER_STATS */
++extern void
++add_user_stats(USER_STATS *user_stats,
++ uint total_connections,
++ uint concurrent_connections,
++ time_t connected_time,
++ double busy_time,
++ double cpu_time,
++ ulonglong bytes_received,
++ ulonglong bytes_sent,
++ ulonglong binlog_bytes_written,
++ ha_rows rows_fetched,
++ ha_rows rows_updated,
++ ha_rows rows_read,
++ ulonglong select_commands,
++ ulonglong update_commands,
++ ulonglong other_commands,
++ ulonglong commit_trans,
++ ulonglong rollback_trans,
++ ulonglong denied_connections,
++ ulonglong lost_connections,
++ ulonglong access_denied_errors,
++ ulonglong empty_queries);
++
++typedef struct st_table_stats { // One entry of global_table_stats; exported by fill_schema_table_stats()
++ char table[NAME_LEN * 2 + 2]; // [db] + '.' + [table] + '\0'
++ ulonglong rows_read, rows_changed;
++ ulonglong rows_changed_x_indexes; // NOTE(review): presumably rows changed multiplied by the table's index count - confirm at the update site
++ /* Stores enum db_type, but forward declarations cannot be done */
++ int engine_type;
++} TABLE_STATS;
++
++typedef struct st_index_stats { // One entry of global_index_stats; exported by fill_schema_index_stats()
++ char index[NAME_LEN * 3 + 3]; // [db] + '.' + [table] + '.' + [index] + '\0'
++ ulonglong rows_read;
++} INDEX_STATS;
++
++
+ /* Bits in form->update */
+ #define REG_MAKE_DUPP 1 /* Make a copy of record when read */
+ #define REG_NEW_RECORD 2 /* Write a new record if not found */
+diff -r 592f6c3641ba sql/table.h
+--- a/sql/table.h Wed Jul 29 13:33:34 2009 -0700
++++ b/sql/table.h Wed Jul 29 13:34:11 2009 -0700
+@@ -371,10 +371,12 @@
+ enum enum_schema_tables
+ {
+ SCH_CHARSETS= 0,
++ SCH_CLIENT_STATS,
+ SCH_COLLATIONS,
+ SCH_COLLATION_CHARACTER_SET_APPLICABILITY,
+ SCH_COLUMNS,
+ SCH_COLUMN_PRIVILEGES,
++ SCH_INDEX_STATS,
+ SCH_KEY_COLUMN_USAGE,
+ SCH_OPEN_TABLES,
+ SCH_PROFILES,
+@@ -387,8 +389,10 @@
+ SCH_TABLE_CONSTRAINTS,
+ SCH_TABLE_NAMES,
+ SCH_TABLE_PRIVILEGES,
++ SCH_TABLE_STATS,
+ SCH_TRIGGERS,
+ SCH_USER_PRIVILEGES,
++ SCH_USER_STATS,
+ SCH_VARIABLES,
+ SCH_VIEWS
+ };
+diff -r 592f6c3641ba strings/Makefile.in
+--- a/strings/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/strings/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -342,6 +342,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba support-files/MacOSX/Makefile.in
+--- a/support-files/MacOSX/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/support-files/MacOSX/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -148,6 +148,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba support-files/Makefile.in
+--- a/support-files/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/support-files/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -171,6 +171,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba support-files/RHEL4-SElinux/Makefile.in
+--- a/support-files/RHEL4-SElinux/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/support-files/RHEL4-SElinux/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -146,6 +146,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba tests/Makefile.in
+--- a/tests/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/tests/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -193,6 +193,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @CLIENT_LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba tools/Makefile.in
+--- a/tools/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/tools/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -167,6 +167,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba vio/Makefile.in
+--- a/vio/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/vio/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -176,6 +176,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba win/Makefile.in
+--- a/win/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/win/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -144,6 +144,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = @LIBS@
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -r 592f6c3641ba zlib/Makefile.in
+--- a/zlib/Makefile.in Wed Jul 29 13:33:34 2009 -0700
++++ b/zlib/Makefile.in Wed Jul 29 13:34:11 2009 -0700
+@@ -187,6 +187,7 @@
+ LIBDL = @LIBDL@
+ LIBEDIT_LOBJECTS = @LIBEDIT_LOBJECTS@
+ LIBOBJS = @LIBOBJS@
++LIBRT = @LIBRT@
+ LIBS = $(NON_THREADED_LIBS)
+ LIBTOOL = @LIBTOOL@
+ LIB_EXTRA_CCFLAGS = @LIB_EXTRA_CCFLAGS@
+diff -Nur a/include/mysql_com.h b/include/mysql_com.h
+--- a/include/mysql_com.h 2010-05-22 00:26:45.000000000 -0700
++++ b/include/mysql_com.h 2010-05-22 00:27:14.000000000 -0700
+@@ -228,7 +228,7 @@
+
+ my_bool report_error; /* We should report error (we have unreported error) */
+ my_bool return_errno;
+-#if defined(MYSQL_SERVER) && !defined(EMBEDDED_LIBRARY)
++#if defined(MYSQL_SERVER)
+ /*
+ Controls whether a big packet should be skipped.
+