From 0802e5da694b3105dd723eacd3d7dc92b24f1f0b Mon Sep 17 00:00:00 2001 From: Calvin Sun Date: Tue, 20 Jul 2010 15:42:31 -0500 Subject: [PATCH] Improve InnoDB synchronization primitives on Windows This patch was originally developed by Vladislav Vaintroub. The main changes are: * Use TryEnterCriticalSection in os_fast_mutex_trylock(). * Use lightweight condition variables on Vista or later Windows; but fall back to events on older Windows, such as XP. This patch also fixes the following bugs: bug# 52102 InnoDB Plugin shows performance drop compared to InnoDB on Windows bug# 53204 os_fastmutex_trylock is implemented incorrectly on Windows rb://363 approved by Inaam Rana --- storage/innobase/CMakeLists.txt | 6 +- storage/innobase/include/os0file.h | 10 +- storage/innobase/include/os0sync.h | 73 +---- storage/innobase/include/os0sync.ic | 11 +- storage/innobase/include/srv0srv.h | 3 + storage/innobase/os/os0file.c | 62 ++-- storage/innobase/os/os0sync.c | 444 ++++++++++++++++------------ storage/innobase/os/os0thread.c | 2 +- storage/innobase/srv/srv0srv.c | 15 + storage/innobase/srv/srv0start.c | 12 +- storage/innobase/sync/sync0arr.c | 2 +- 11 files changed, 349 insertions(+), 291 deletions(-) diff --git a/storage/innobase/CMakeLists.txt b/storage/innobase/CMakeLists.txt index 8e3e4efbb0e..fbb59b07dfe 100644 --- a/storage/innobase/CMakeLists.txt +++ b/storage/innobase/CMakeLists.txt @@ -188,11 +188,7 @@ IF(SIZEOF_PTHREAD_T) ENDIF() IF(MSVC) - # Windows atomics do not perform well. Disable Windows atomics by default. - # See bug#52102 for details. 
- - #ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DINNODB_RW_LOCKS_USE_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION) - ADD_DEFINITIONS(-DHAVE_IB_PAUSE_INSTRUCTION) + ADD_DEFINITIONS(-DHAVE_WINDOWS_ATOMICS -DHAVE_IB_PAUSE_INSTRUCTION) ENDIF() diff --git a/storage/innobase/include/os0file.h b/storage/innobase/include/os0file.h index a112cb06697..7a9b4cffa2b 100644 --- a/storage/innobase/include/os0file.h +++ b/storage/innobase/include/os0file.h @@ -177,6 +177,13 @@ log. */ #define OS_WIN95 2 /*!< Microsoft Windows 95 */ #define OS_WINNT 3 /*!< Microsoft Windows NT 3.x */ #define OS_WIN2000 4 /*!< Microsoft Windows 2000 */ +#define OS_WINXP 5 /*!< Microsoft Windows XP + or Windows Server 2003 */ +#define OS_WINVISTA 6 /*!< Microsoft Windows Vista + or Windows Server 2008 */ +#define OS_WIN7 7 /*!< Microsoft Windows 7 + or Windows Server 2008 R2 */ + extern ulint os_n_file_reads; extern ulint os_n_file_writes; @@ -368,7 +375,8 @@ typedef DIR* os_file_dir_t; /*!< directory stream */ /***********************************************************************//** Gets the operating system version. Currently works only on Windows. -@return OS_WIN95, OS_WIN31, OS_WINNT, or OS_WIN2000 */ +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. 
*/ UNIV_INTERN ulint os_get_os_version(void); diff --git a/storage/innobase/include/os0sync.h b/storage/innobase/include/os0sync.h index 0c22162b900..0b600c80ce3 100644 --- a/storage/innobase/include/os0sync.h +++ b/storage/innobase/include/os0sync.h @@ -38,28 +38,18 @@ Created 9/6/1995 Heikki Tuuri #include "ut0lst.h" #ifdef __WIN__ - +/** Native event (slow)*/ +typedef HANDLE os_native_event_t; /** Native mutex */ -#define os_fast_mutex_t CRITICAL_SECTION - -/** Native event */ -typedef HANDLE os_native_event_t; - -/** Operating system event */ -typedef struct os_event_struct os_event_struct_t; -/** Operating system event handle */ -typedef os_event_struct_t* os_event_t; - -/** An asynchronous signal sent between threads */ -struct os_event_struct { - os_native_event_t handle; - /*!< Windows event */ - UT_LIST_NODE_T(os_event_struct_t) os_event_list; - /*!< list of all created events */ -}; +typedef CRITICAL_SECTION os_fast_mutex_t; +/** Native condition variable. */ +typedef CONDITION_VARIABLE os_cond_t; #else /** Native mutex */ -typedef pthread_mutex_t os_fast_mutex_t; +typedef pthread_mutex_t os_fast_mutex_t; +/** Native condition variable */ +typedef pthread_cond_t os_cond_t; +#endif /** Operating system event */ typedef struct os_event_struct os_event_struct_t; @@ -68,6 +58,10 @@ typedef os_event_struct_t* os_event_t; /** An asynchronous signal sent between threads */ struct os_event_struct { +#ifdef __WIN__ + HANDLE handle; /*!< kernel event object, slow, + used on older Windows */ +#endif os_fast_mutex_t os_mutex; /*!< this mutex protects the next fields */ ibool is_set; /*!< this is TRUE when the event is @@ -76,24 +70,17 @@ struct os_event_struct { this event */ ib_int64_t signal_count; /*!< this is incremented each time the event becomes signaled */ - pthread_cond_t cond_var; /*!< condition variable is used in + os_cond_t cond_var; /*!< condition variable is used in waiting for the event */ UT_LIST_NODE_T(os_event_struct_t) os_event_list; /*!< list of 
all created events */ }; -#endif /** Operating system mutex */ typedef struct os_mutex_struct os_mutex_str_t; /** Operating system mutex handle */ typedef os_mutex_str_t* os_mutex_t; -/** Denotes an infinite delay for os_event_wait_time() */ -#define OS_SYNC_INFINITE_TIME ((ulint)(-1)) - -/** Return value of os_event_wait_time() when the time is exceeded */ -#define OS_SYNC_TIME_EXCEEDED 1 - /** Mutex protecting counts and the event and OS 'slow' mutex lists */ extern os_mutex_t os_sync_mutex; @@ -187,42 +174,14 @@ os_event_wait_low( #define os_event_wait(event) os_event_wait_low(event, 0) -/**********************************************************//** -Waits for an event object until it is in the signaled state or -a timeout is exceeded. In Unix the timeout is always infinite. -@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ -UNIV_INTERN -ulint -os_event_wait_time( -/*===============*/ - os_event_t event, /*!< in: event to wait */ - ulint time); /*!< in: timeout in microseconds, or - OS_SYNC_INFINITE_TIME */ -#ifdef __WIN__ -/**********************************************************//** -Waits for any event in an OS native event array. Returns if even a single -one is signaled or becomes signaled. -@return index of the event which was signaled */ -UNIV_INTERN -ulint -os_event_wait_multiple( -/*===================*/ - ulint n, /*!< in: number of events in the - array */ - os_native_event_t* native_event_array); - /*!< in: pointer to an array of event - handles */ -#endif /*********************************************************//** Creates an operating system mutex semaphore. Because these are slow, the mutex semaphore of InnoDB itself (mutex_t) should be used where possible. 
@return the mutex handle */ UNIV_INTERN os_mutex_t -os_mutex_create( -/*============*/ - const char* name); /*!< in: the name of the mutex, if NULL - the mutex is created without a name */ +os_mutex_create(void); +/*=================*/ /**********************************************************//** Acquires ownership of a mutex semaphore. */ UNIV_INTERN diff --git a/storage/innobase/include/os0sync.ic b/storage/innobase/include/os0sync.ic index 1f3ce38fa65..c33f13aaad6 100644 --- a/storage/innobase/include/os0sync.ic +++ b/storage/innobase/include/os0sync.ic @@ -28,8 +28,7 @@ Created 9/6/1995 Heikki Tuuri #endif /**********************************************************//** -Acquires ownership of a fast mutex. Currently in Windows this is the same -as os_fast_mutex_lock! +Acquires ownership of a fast mutex. @return 0 if success, != 0 if was reserved by another thread */ UNIV_INLINE ulint @@ -38,9 +37,13 @@ os_fast_mutex_trylock( os_fast_mutex_t* fast_mutex) /*!< in: mutex to acquire */ { #ifdef __WIN__ - EnterCriticalSection(fast_mutex); + if (TryEnterCriticalSection(fast_mutex)) { - return(0); + return(0); + } else { + + return(1); + } #else /* NOTE that the MySQL my_pthread.h redefines pthread_mutex_trylock so that it returns 0 on success. In the operating system diff --git a/storage/innobase/include/srv0srv.h b/storage/innobase/include/srv0srv.h index 5fbb59b14ff..d78c8113aee 100644 --- a/storage/innobase/include/srv0srv.h +++ b/storage/innobase/include/srv0srv.h @@ -112,6 +112,9 @@ OS (provided we compiled Innobase with it in), otherwise we will use simulated aio we build below with threads. 
Currently we support native aio on windows and linux */ extern my_bool srv_use_native_aio; +#ifdef __WIN__ +extern ibool srv_use_native_conditions; +#endif extern ulint srv_n_data_files; extern char** srv_data_file_names; extern ulint* srv_data_file_sizes; diff --git a/storage/innobase/os/os0file.c b/storage/innobase/os/os0file.c index 0045b8731e4..6b17dccd2bf 100644 --- a/storage/innobase/os/os0file.c +++ b/storage/innobase/os/os0file.c @@ -183,7 +183,7 @@ struct os_aio_slot_struct{ which pending aio operation was completed */ #ifdef WIN_ASYNC_IO - os_event_t event; /*!< event object we need in the + HANDLE handle; /*!< handle object we need in the OVERLAPPED struct */ OVERLAPPED control; /*!< Windows control block for the aio request */ @@ -225,7 +225,7 @@ struct os_aio_array_struct{ aio array outside the ibuf segment */ os_aio_slot_t* slots; /*!< Pointer to the slots in the array */ #ifdef __WIN__ - os_native_event_t* native_events; + HANDLE* handles; /*!< Pointer to an array of OS native event handles where we copied the handles from slots, in the same @@ -304,7 +304,8 @@ UNIV_INTERN ulint os_n_pending_reads = 0; /***********************************************************************//** Gets the operating system version. Currently works only on Windows. -@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000 */ +@return OS_WIN95, OS_WIN31, OS_WINNT, OS_WIN2000, OS_WINXP, OS_WINVISTA, +OS_WIN7. */ UNIV_INTERN ulint os_get_os_version(void) @@ -322,10 +323,18 @@ os_get_os_version(void) } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_WINDOWS) { return(OS_WIN95); } else if (os_info.dwPlatformId == VER_PLATFORM_WIN32_NT) { - if (os_info.dwMajorVersion <= 4) { - return(OS_WINNT); - } else { - return(OS_WIN2000); + switch (os_info.dwMajorVersion) { + case 3: + case 4: + return OS_WINNT; + case 5: + return (os_info.dwMinorVersion == 0) ? OS_WIN2000 + : OS_WINXP; + case 6: + return (os_info.dwMinorVersion == 0) ? 
OS_WINVISTA + : OS_WIN7; + default: + return OS_WIN7; } } else { ut_error; @@ -673,10 +682,10 @@ os_io_init_simple(void) { ulint i; - os_file_count_mutex = os_mutex_create(NULL); + os_file_count_mutex = os_mutex_create(); for (i = 0; i < OS_FILE_N_SEEK_MUTEXES; i++) { - os_file_seek_mutexes[i] = os_mutex_create(NULL); + os_file_seek_mutexes[i] = os_mutex_create(); } } @@ -3217,7 +3226,7 @@ os_aio_array_create( array = ut_malloc(sizeof(os_aio_array_t)); - array->mutex = os_mutex_create(NULL); + array->mutex = os_mutex_create(); array->not_full = os_event_create(NULL); array->is_empty = os_event_create(NULL); @@ -3229,7 +3238,7 @@ os_aio_array_create( array->cur_seg = 0; array->slots = ut_malloc(n * sizeof(os_aio_slot_t)); #ifdef __WIN__ - array->native_events = ut_malloc(n * sizeof(os_native_event_t)); + array->handles = ut_malloc(n * sizeof(HANDLE)); #endif #if defined(LINUX_NATIVE_AIO) @@ -3273,13 +3282,13 @@ skip_native_aio: slot->pos = i; slot->reserved = FALSE; #ifdef WIN_ASYNC_IO - slot->event = os_event_create(NULL); + slot->handle = CreateEvent(NULL,TRUE, FALSE, NULL); over = &(slot->control); - over->hEvent = slot->event->handle; + over->hEvent = slot->handle; - *((array->native_events) + i) = over->hEvent; + *((array->handles) + i) = over->hEvent; #elif defined(LINUX_NATIVE_AIO) @@ -3305,12 +3314,12 @@ os_aio_array_free( for (i = 0; i < array->n_slots; i++) { os_aio_slot_t* slot = os_aio_array_get_nth_slot(array, i); - os_event_free(slot->event); + CloseHandle(slot->handle); } #endif /* WIN_ASYNC_IO */ #ifdef __WIN__ - ut_free(array->native_events); + ut_free(array->handles); #endif /* __WIN__ */ os_mutex_free(array->mutex); os_event_free(array->not_full); @@ -3463,7 +3472,7 @@ os_aio_array_wake_win_aio_at_shutdown( for (i = 0; i < array->n_slots; i++) { - os_event_set((array->slots + i)->event); + SetEvent((array->slots + i)->handle); } } #endif @@ -3702,7 +3711,7 @@ found: control = &(slot->control); control->Offset = (DWORD)offset; control->OffsetHigh = 
(DWORD)offset_high; - os_event_reset(slot->event); + ResetEvent(slot->handle); #elif defined(LINUX_NATIVE_AIO) @@ -3774,7 +3783,7 @@ os_aio_array_free_slot( #ifdef WIN_ASYNC_IO - os_event_reset(slot->event); + ResetEvent(slot->handle); #elif defined(LINUX_NATIVE_AIO) @@ -4208,13 +4217,20 @@ os_aio_windows_handle( n = array->n_slots / array->n_segments; if (array == os_aio_sync_array) { - os_event_wait(os_aio_array_get_nth_slot(array, pos)->event); + WaitForSingleObject( + os_aio_array_get_nth_slot(array, pos)->handle, + INFINITE); i = pos; } else { srv_set_io_thread_op_info(orig_seg, "wait Windows aio"); - i = os_event_wait_multiple(n, - (array->native_events) - + segment * n); + i = WaitForMultipleObjects((DWORD) n, + array->handles + segment * n, + FALSE, + INFINITE); + } + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); } os_mutex_enter(array->mutex); diff --git a/storage/innobase/os/os0sync.c b/storage/innobase/os/os0sync.c index 60467242e14..3c70e93aae0 100644 --- a/storage/innobase/os/os0sync.c +++ b/storage/innobase/os/os0sync.c @@ -35,6 +35,7 @@ Created 9/6/1995 Heikki Tuuri #include "ut0mem.h" #include "srv0start.h" +#include "srv0srv.h" /* Type definition for an operating system mutex struct */ struct os_mutex_struct{ @@ -76,6 +77,155 @@ event embedded inside a mutex, on free, this generates a recursive call. This version of the free event function doesn't acquire the global lock */ static void os_event_free_internal(os_event_t event); +/* On Windows (Vista and later), load function pointers for condition +variable handling. Those functions are not available in prior versions, +so we have to use them via runtime loading, as long as we support XP. 
*/ +static void os_cond_module_init(void); + +#ifdef __WIN__ +/* Prototypes and function pointers for condition variable functions */ +typedef VOID (WINAPI* InitializeConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static InitializeConditionVariableProc initialize_condition_variable; + +typedef BOOL (WINAPI* SleepConditionVariableCSProc) + (PCONDITION_VARIABLE ConditionVariable, + PCRITICAL_SECTION CriticalSection, + DWORD dwMilliseconds); +static SleepConditionVariableCSProc sleep_condition_variable; + +typedef VOID (WINAPI* WakeAllConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static WakeAllConditionVariableProc wake_all_condition_variable; + +typedef VOID (WINAPI* WakeConditionVariableProc) + (PCONDITION_VARIABLE ConditionVariable); +static WakeConditionVariableProc wake_condition_variable; +#endif + +/*********************************************************//** +Initialize condition variable */ +UNIV_INLINE +void +os_cond_init( +/*=========*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(initialize_condition_variable != NULL); + initialize_condition_variable(cond); +#else + ut_a(pthread_cond_init(cond, NULL) == 0); +#endif +} + +/*********************************************************//** +Wait on condition variable */ +UNIV_INLINE +void +os_cond_wait( +/*=========*/ + os_cond_t* cond, /*!< in: condition variable. */ + os_fast_mutex_t* mutex) /*!< in: fast mutex */ +{ + ut_a(cond); + ut_a(mutex); + +#ifdef __WIN__ + ut_a(sleep_condition_variable != NULL); + ut_a(sleep_condition_variable(cond, mutex, INFINITE)); +#else + ut_a(pthread_cond_wait(cond, mutex) == 0); +#endif +} + +/*********************************************************//** +Wakes all threads waiting for condition variable */ +UNIV_INLINE +void +os_cond_broadcast( +/*==============*/ + os_cond_t* cond) /*!< in: condition variable. 
*/ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(wake_all_condition_variable != NULL); + wake_all_condition_variable(cond); +#else + ut_a(pthread_cond_broadcast(cond) == 0); +#endif +} + +/*********************************************************//** +Wakes one thread waiting for condition variable */ +UNIV_INLINE +void +os_cond_signal( +/*==========*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ + ut_a(cond); + +#ifdef __WIN__ + ut_a(wake_condition_variable != NULL); + wake_condition_variable(cond); +#else + ut_a(pthread_cond_signal(cond) == 0); +#endif +} + +/*********************************************************//** +Destroys condition variable */ +UNIV_INLINE +void +os_cond_destroy( +/*============*/ + os_cond_t* cond) /*!< in: condition variable. */ +{ +#ifdef __WIN__ + /* Do nothing */ +#else + ut_a(pthread_cond_destroy(cond) == 0); +#endif +} + +/*********************************************************//** +On Windows (Vista and later), load function pointers for condition variable +handling. Those functions are not available in prior versions, so we have to +use them via runtime loading, as long as we support XP. 
*/ +static +void +os_cond_module_init(void) +/*=====================*/ +{ +#ifdef __WIN__ + HMODULE h_dll; + + if (!srv_use_native_conditions) + return; + + h_dll = GetModuleHandle("kernel32"); + + initialize_condition_variable = (InitializeConditionVariableProc) + GetProcAddress(h_dll, "InitializeConditionVariable"); + sleep_condition_variable = (SleepConditionVariableCSProc) + GetProcAddress(h_dll, "SleepConditionVariableCS"); + wake_all_condition_variable = (WakeAllConditionVariableProc) + GetProcAddress(h_dll, "WakeAllConditionVariable"); + wake_condition_variable = (WakeConditionVariableProc) + GetProcAddress(h_dll, "WakeConditionVariable"); + + /* When using native condition variables, check function pointers */ + ut_a(initialize_condition_variable); + ut_a(sleep_condition_variable); + ut_a(wake_all_condition_variable); + ut_a(wake_condition_variable); +#endif +} + /*********************************************************//** Initializes global event and OS 'slow' mutex lists. */ UNIV_INTERN @@ -89,7 +239,10 @@ os_sync_init(void) os_sync_mutex = NULL; os_sync_mutex_inited = FALSE; - os_sync_mutex = os_mutex_create(NULL); + /* Now for Windows only */ + os_cond_module_init(); + + os_sync_mutex = os_mutex_create(); os_sync_mutex_inited = TRUE; } @@ -143,42 +296,45 @@ os_event_create( const char* name) /*!< in: the name of the event, if NULL the event is created without a name */ { -#ifdef __WIN__ - os_event_t event; - - event = ut_malloc(sizeof(struct os_event_struct)); - - event->handle = CreateEvent(NULL, /* No security attributes */ - TRUE, /* Manual reset */ - FALSE, /* Initial state nonsignaled */ - (LPCTSTR) name); - if (!event->handle) { - fprintf(stderr, - "InnoDB: Could not create a Windows event semaphore;" - " Windows error %lu\n", - (ulong) GetLastError()); - } -#else /* Unix */ os_event_t event; - UT_NOT_USED(name); +#ifdef __WIN__ + if(!srv_use_native_conditions) { - event = ut_malloc(sizeof(struct os_event_struct)); + event = 
ut_malloc(sizeof(struct os_event_struct)); - os_fast_mutex_init(&(event->os_mutex)); + event->handle = CreateEvent(NULL, + TRUE, + FALSE, + (LPCTSTR) name); + if (!event->handle) { + fprintf(stderr, + "InnoDB: Could not create a Windows event" + " semaphore; Windows error %lu\n", + (ulong) GetLastError()); + } + } else /* Windows with condition variables */ +#endif - ut_a(0 == pthread_cond_init(&(event->cond_var), NULL)); + { + UT_NOT_USED(name); - event->is_set = FALSE; + event = ut_malloc(sizeof(struct os_event_struct)); - /* We return this value in os_event_reset(), which can then be - be used to pass to the os_event_wait_low(). The value of zero - is reserved in os_event_wait_low() for the case when the - caller does not want to pass any signal_count value. To - distinguish between the two cases we initialize signal_count - to 1 here. */ - event->signal_count = 1; -#endif /* __WIN__ */ + os_fast_mutex_init(&(event->os_mutex)); + + os_cond_init(&(event->cond_var)); + + event->is_set = FALSE; + + /* We return this value in os_event_reset(), which can then + be used to pass to the os_event_wait_low(). The value of zero + is reserved in os_event_wait_low() for the case when the + caller does not want to pass any signal_count value. To + distinguish between the two cases we initialize signal_count + to 1 here. 
*/ + event->signal_count = 1; + } /* The os_sync_mutex can be NULL because during startup an event can be created [ because it's embedded in the mutex/rwlock ] before @@ -208,10 +364,15 @@ os_event_set( /*=========*/ os_event_t event) /*!< in: event to set */ { -#ifdef __WIN__ ut_a(event); - ut_a(SetEvent(event->handle)); -#else + +#ifdef __WIN__ + if (!srv_use_native_conditions) { + ut_a(SetEvent(event->handle)); + return; + } +#endif + ut_a(event); os_fast_mutex_lock(&(event->os_mutex)); @@ -221,11 +382,10 @@ os_event_set( } else { event->is_set = TRUE; event->signal_count += 1; - ut_a(0 == pthread_cond_broadcast(&(event->cond_var))); + os_cond_broadcast(&(event->cond_var)); } os_fast_mutex_unlock(&(event->os_mutex)); -#endif } /**********************************************************//** @@ -244,12 +404,14 @@ os_event_reset( { ib_int64_t ret = 0; -#ifdef __WIN__ ut_a(event); - ut_a(ResetEvent(event->handle)); -#else - ut_a(event); +#ifdef __WIN__ + if(!srv_use_native_conditions) { + ut_a(ResetEvent(event->handle)); + return(0); + } +#endif os_fast_mutex_lock(&(event->os_mutex)); @@ -261,7 +423,6 @@ os_event_reset( ret = event->signal_count; os_fast_mutex_unlock(&(event->os_mutex)); -#endif return(ret); } @@ -274,19 +435,21 @@ os_event_free_internal( os_event_t event) /*!< in: event to free */ { #ifdef __WIN__ - ut_a(event); - - ut_a(CloseHandle(event->handle)); -#else - ut_a(event); - - /* This is to avoid freeing the mutex twice */ - os_fast_mutex_free(&(event->os_mutex)); - - ut_a(0 == pthread_cond_destroy(&(event->cond_var))); + if(!srv_use_native_conditions) { + ut_a(event); + ut_a(CloseHandle(event->handle)); + } else #endif - /* Remove from the list of events */ + { + ut_a(event); + /* This is to avoid freeing the mutex twice */ + os_fast_mutex_free(&(event->os_mutex)); + + os_cond_destroy(&(event->cond_var)); + } + + /* Remove from the list of events */ UT_LIST_REMOVE(os_event_list, os_event_list, event); os_event_count--; @@ -303,18 +466,19 @@ 
os_event_free( os_event_t event) /*!< in: event to free */ { + ut_a(event); #ifdef __WIN__ - ut_a(event); - - ut_a(CloseHandle(event->handle)); -#else - ut_a(event); - - os_fast_mutex_free(&(event->os_mutex)); - ut_a(0 == pthread_cond_destroy(&(event->cond_var))); + if(!srv_use_native_conditions){ + ut_a(CloseHandle(event->handle)); + } else /*Windows with condition variables */ #endif - /* Remove from the list of events */ + { + os_fast_mutex_free(&(event->os_mutex)); + os_cond_destroy(&(event->cond_var)); + } + + /* Remove from the list of events */ os_mutex_enter(os_sync_mutex); UT_LIST_REMOVE(os_event_list, os_event_list, event); @@ -355,24 +519,28 @@ os_event_wait_low( returned by previous call of os_event_reset(). */ { -#ifdef __WIN__ - DWORD err; - - ut_a(event); - - UT_NOT_USED(reset_sig_count); - - /* Specify an infinite time limit for waiting */ - err = WaitForSingleObject(event->handle, INFINITE); - - ut_a(err == WAIT_OBJECT_0); - - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { - os_thread_exit(NULL); - } -#else ib_int64_t old_signal_count; +#ifdef __WIN__ + if(!srv_use_native_conditions) { + DWORD err; + + ut_a(event); + + UT_NOT_USED(reset_sig_count); + + /* Specify an infinite wait */ + err = WaitForSingleObject(event->handle, INFINITE); + + ut_a(err == WAIT_OBJECT_0); + + if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { + os_thread_exit(NULL); + } + return; + } +#endif + os_fast_mutex_lock(&(event->os_mutex)); if (reset_sig_count) { @@ -396,123 +564,29 @@ os_event_wait_low( return; } - pthread_cond_wait(&(event->cond_var), &(event->os_mutex)); + os_cond_wait(&(event->cond_var), &(event->os_mutex)); /* Solaris manual said that spurious wakeups may occur: we have to check if the event really has been signaled after we came here to wait */ } -#endif } -/**********************************************************//** -Waits for an event object until it is in the signaled state or -a timeout is exceeded. 
In Unix the timeout is always infinite. -@return 0 if success, OS_SYNC_TIME_EXCEEDED if timeout was exceeded */ -UNIV_INTERN -ulint -os_event_wait_time( -/*===============*/ - os_event_t event, /*!< in: event to wait */ - ulint time) /*!< in: timeout in microseconds, or - OS_SYNC_INFINITE_TIME */ -{ -#ifdef __WIN__ - DWORD err; - - ut_a(event); - - if (time != OS_SYNC_INFINITE_TIME) { - err = WaitForSingleObject(event->handle, (DWORD) time / 1000); - } else { - err = WaitForSingleObject(event->handle, INFINITE); - } - - if (err == WAIT_OBJECT_0) { - - return(0); - } else if (err == WAIT_TIMEOUT) { - - return(OS_SYNC_TIME_EXCEEDED); - } else { - ut_error; - return(1000000); /* dummy value to eliminate compiler warn. */ - } -#else - UT_NOT_USED(time); - - /* In Posix this is just an ordinary, infinite wait */ - - os_event_wait(event); - - return(0); -#endif -} - -#ifdef __WIN__ -/**********************************************************//** -Waits for any event in an OS native event array. Returns if even a single -one is signaled or becomes signaled. -@return index of the event which was signaled */ -UNIV_INTERN -ulint -os_event_wait_multiple( -/*===================*/ - ulint n, /*!< in: number of events in the - array */ - os_native_event_t* native_event_array) - /*!< in: pointer to an array of event - handles */ -{ - DWORD index; - - ut_a(native_event_array); - ut_a(n > 0); - - index = WaitForMultipleObjects((DWORD) n, native_event_array, - FALSE, /* Wait for any 1 event */ - INFINITE); /* Infinite wait time - limit */ - ut_a(index >= WAIT_OBJECT_0); /* NOTE: Pointless comparison */ - ut_a(index < WAIT_OBJECT_0 + n); - - if (srv_shutdown_state == SRV_SHUTDOWN_EXIT_THREADS) { - os_thread_exit(NULL); - } - - return(index - WAIT_OBJECT_0); -} -#endif - /*********************************************************//** Creates an operating system mutex semaphore. Because these are slow, the mutex semaphore of InnoDB itself (mutex_t) should be used where possible. 
@return the mutex handle */ UNIV_INTERN os_mutex_t -os_mutex_create( -/*============*/ - const char* name) /*!< in: the name of the mutex, if NULL - the mutex is created without a name */ +os_mutex_create(void) +/*=================*/ { -#ifdef __WIN__ - HANDLE mutex; - os_mutex_t mutex_str; - - mutex = CreateMutex(NULL, /* No security attributes */ - FALSE, /* Initial state: no owner */ - (LPCTSTR) name); - ut_a(mutex); -#else os_fast_mutex_t* mutex; os_mutex_t mutex_str; - UT_NOT_USED(name); - mutex = ut_malloc(sizeof(os_fast_mutex_t)); os_fast_mutex_init(mutex); -#endif mutex_str = ut_malloc(sizeof(os_mutex_str_t)); mutex_str->handle = mutex; @@ -543,25 +617,11 @@ os_mutex_enter( /*===========*/ os_mutex_t mutex) /*!< in: mutex to acquire */ { -#ifdef __WIN__ - DWORD err; - - ut_a(mutex); - - /* Specify infinite time limit for waiting */ - err = WaitForSingleObject(mutex->handle, INFINITE); - - ut_a(err == WAIT_OBJECT_0); - - (mutex->count)++; - ut_a(mutex->count == 1); -#else os_fast_mutex_lock(mutex->handle); (mutex->count)++; ut_a(mutex->count == 1); -#endif } /**********************************************************//** @@ -577,11 +637,7 @@ os_mutex_exit( ut_a(mutex->count == 1); (mutex->count)--; -#ifdef __WIN__ - ut_a(ReleaseMutex(mutex->handle)); -#else os_fast_mutex_unlock(mutex->handle); -#endif } /**********************************************************//** @@ -610,15 +666,9 @@ os_mutex_free( os_mutex_exit(os_sync_mutex); } -#ifdef __WIN__ - ut_a(CloseHandle(mutex->handle)); - - ut_free(mutex); -#else os_fast_mutex_free(mutex->handle); ut_free(mutex->handle); ut_free(mutex); -#endif } /*********************************************************//** diff --git a/storage/innobase/os/os0thread.c b/storage/innobase/os/os0thread.c index 78df66d7834..b41d873a129 100644 --- a/storage/innobase/os/os0thread.c +++ b/storage/innobase/os/os0thread.c @@ -252,7 +252,7 @@ os_thread_yield(void) /*=================*/ { #if defined(__WIN__) - Sleep(0); + 
SwitchToThread(); #elif (defined(HAVE_SCHED_YIELD) && defined(HAVE_SCHED_H)) sched_yield(); #elif defined(HAVE_PTHREAD_YIELD_ZERO_ARG) diff --git a/storage/innobase/srv/srv0srv.c b/storage/innobase/srv/srv0srv.c index 6354689105a..7430ff73668 100644 --- a/storage/innobase/srv/srv0srv.c +++ b/storage/innobase/srv/srv0srv.c @@ -142,6 +142,21 @@ use simulated aio we build below with threads. Currently we support native aio on windows and linux */ UNIV_INTERN my_bool srv_use_native_aio = TRUE; +#ifdef __WIN__ +/* Windows native condition variables. We use runtime loading / function +pointers, because they are not available on Windows Server 2003 and +Windows XP/2000. + +We use condition variables for events on Windows if possible, even though +os_event resembles the Windows kernel event object well API-wise. The reason is +performance: kernel objects are heavyweight, and WaitForSingleObject() is a +performance killer that forces the calling thread to context switch. Besides, +InnoDB preallocates a large number (often millions) of os_events. With kernel +event objects this takes a big chunk out of the non-paged pool, which is better +suited for tasks like IO than for storing idle event objects. */ +UNIV_INTERN ibool srv_use_native_conditions = FALSE; +#endif /* __WIN__ */ + UNIV_INTERN ulint srv_n_data_files = 0; UNIV_INTERN char** srv_data_file_names = NULL; /* size in database pages */ diff --git a/storage/innobase/srv/srv0start.c b/storage/innobase/srv/srv0start.c index 4a0ecc5154f..d6ad554d41e 100644 --- a/storage/innobase/srv/srv0start.c +++ b/storage/innobase/srv/srv0start.c @@ -1160,10 +1160,18 @@ innobase_start_or_create_for_mysql(void) srv_use_native_aio = FALSE; break; - default: - /* On Win 2000 and XP use async i/o */ + + case OS_WIN2000: + case OS_WINXP: + /* On 2000 and XP, async IO is available. 
*/ srv_use_native_aio = TRUE; break; + + default: + /* Vista and later have both async IO and condition variables */ + srv_use_native_aio = TRUE; + srv_use_native_conditions = TRUE; + break; } #elif defined(LINUX_NATIVE_AIO) diff --git a/storage/innobase/sync/sync0arr.c b/storage/innobase/sync/sync0arr.c index 248bd2cd25d..753ebd958ac 100644 --- a/storage/innobase/sync/sync0arr.c +++ b/storage/innobase/sync/sync0arr.c @@ -250,7 +250,7 @@ sync_array_create( /* Then create the mutex to protect the wait array complex */ if (protection == SYNC_ARRAY_OS_MUTEX) { - arr->os_mutex = os_mutex_create(NULL); + arr->os_mutex = os_mutex_create(); } else if (protection == SYNC_ARRAY_MUTEX) { mutex_create(syn_arr_mutex_key, &arr->mutex, SYNC_NO_ORDER_CHECK);