1
0
mirror of https://github.com/MariaDB/server.git synced 2025-08-07 00:04:31 +03:00

Bug #14637: trim trailing spaces processes data only byte wise

(From: gkodinov)
Use and int * where possible to scan for trailing space in a
string instead of always iterating char-by-char.
Using the attached benchmark file on a 32 bit Intel Core 2 
Duo CPU I've got 43485 ms run with the fix compared to 44373 
without it.

Backported to 5.6.0 (next-mr-runtime)
6.0-codebase revid: 2476.1362.1


include/m_string.h:
  scan for space through ints
strings/ctype-bin.c:
  scan for space through ints
strings/ctype-latin1.c:
  scan for space through ints
strings/ctype-mb.c:
  scan for space through ints
strings/ctype-simple.c:
  scan for space through ints
This commit is contained in:
Magne Mahre
2009-11-11 17:03:02 +01:00
parent 1c3c72ce17
commit 3c3b11c3b8
5 changed files with 78 additions and 17 deletions

View File

@@ -266,4 +266,74 @@ typedef struct st_mysql_lex_string LEX_STRING;
#define USTRING_WITH_LEN(X) ((uchar*) X), ((size_t) (sizeof(X) - 1))
#define C_STRING_WITH_LEN(X) ((char *) (X)), ((size_t) (sizeof(X) - 1))
/* SPACE_INT is a word that contains only spaces */
#if SIZEOF_INT == 4
#define SPACE_INT 0x20202020
#elif SIZEOF_INT == 8
#define SPACE_INT 0x2020202020202020
#else
#error define the appropriate constant for a word full of spaces
#endif
/**
Skip trailing space.
On most systems reading memory in larger chunks (ideally equal to the size of
the chinks that the machine physically reads from memory) causes fewer memory
access loops and hence increased performance.
This is why the 'int' type is used : it's closest to that (according to how
it's defined in C).
So when we determine the amount of whitespace at the end of a string we do
the following :
1. We divide the string into 3 zones :
a) from the start of the string (__start) to the first multiple
of sizeof(int) (__start_words)
b) from the end of the string (__end) to the last multiple of sizeof(int)
(__end_words)
c) a zone that is aligned to sizeof(int) and can be safely accessed
through an int *
2. We start comparing backwards from (c) char-by-char. If all we find is
space then we continue
3. If there are elements in zone (b) we compare them as unsigned ints to a
int mask (SPACE_INT) consisting of all spaces
4. Finally we compare the remaining part (a) of the string char by char.
This covers for the last non-space unsigned int from 3. (if any)
This algorithm works well for relatively larger strings, but it will slow
the things down for smaller strings (because of the additional calculations
and checks compared to the naive method). Thus the barrier of length 20
is added.
@param ptr pointer to the input string
@param len the length of the string
@return the last non-space character
*/
static inline const uchar *skip_trailing_space(const uchar *ptr,size_t len)
{
const uchar *start= ptr;
const uchar *end= ptr + len;
if (len > 20)
{
const uchar *end_words= (const uchar *)
(((intptr)end) / SIZEOF_INT * SIZEOF_INT);
const uchar *start_words= (const uchar *)
((((intptr)start) + SIZEOF_INT - 1) / SIZEOF_INT * SIZEOF_INT);
DBUG_ASSERT(((intptr)start) >= SIZEOF_INT);
if (end_words > start)
{
while (end > end_words && end[-1] == 0x20)
end--;
if (end[-1] == 0x20 && start_words < end_words)
while (end > start_words && ((unsigned *)end)[-1] == SPACE_INT)
end -= SIZEOF_INT;
}
}
while (end > start && end[-1] == 0x20)
end--;
return (end);
}
#endif