1
0
mirror of https://github.com/MariaDB/server.git synced 2025-04-18 21:44:20 +03:00
mariadb/strings/ctype-uca-scanner_next.inl
Alexander Barkov f6118acda9 A follow-up patch MDEV-27266 Improve UCA collation performance for utf8mb3 and utf8mb4
Moving these members:

   CHARSET_INFO *cs;
   const MY_UCA_WEIGHT_LEVEL *level;

from my_uca_scanner to a new separate structure my_uca_scanner_param.

Rationale:

During a comparison of two strings these members were initialized two times
(one time for every string).

After the change these members are initialized only once, inside
a shared instance of my_uca_scanner_param, and the instance is
shared between the two scanners (its const address is passed as a new parameter
to the underlying scanner functions).

This change gives a slight performance improvement (~5%).
2022-09-02 13:23:24 +04:00

224 lines
7.6 KiB
C++

/* Copyright (c) 2004, 2013, Oracle and/or its affiliates.
Copyright (c) 2009, 2021, MariaDB
This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Library General Public
License as published by the Free Software Foundation; version 2
of the License.
This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Library General Public License for more details.
You should have received a copy of the GNU Library General Public
License along with this library; if not, write to the Free
Software Foundation, Inc., 51 Franklin St, Fifth Floor, Boston,
MA 02110-1335 USA */
/*
  Return helpers used by the scanner function generated from this file.
  Both macros expand to a statement that leaves the enclosing function.

  SCANNER_NEXT_RETURN(_w,_n)
    Return the weight _w.
  SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars)
    Return the first weight of the contraction pointed to by _cnt.

  When SCANNER_NEXT_NCHARS is defined, the returned value is
  a weight_and_nchars_t initialized positionally with the weight and
  a character count: _n for a plain return, or
  _ignorable_nchars + my_contraction_char_length(_cnt) for a contraction.
  Note that _cnt is expanded twice in this variant, so it must be
  an expression without side effects.

  When SCANNER_NEXT_NCHARS is not defined, only the weight is returned.
  The count argument is intentionally NOT expanded in this variant:
  callers pass a variable that is declared only when SCANNER_NEXT_NCHARS
  is defined.

  Every expanded argument is parenthesized, so that arguments which are
  not simple identifiers (e.g. "&item" or "p + 1") bind as expected.
*/
#ifdef SCANNER_NEXT_NCHARS
#define SCANNER_NEXT_RETURN(_w,_n)                                  \
  do { weight_and_nchars_t rc= {(_w), (_n)}; return rc; } while(0)
#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars)     \
  do {                                                              \
    weight_and_nchars_t rc= { (_cnt)->weight[0],                    \
                              (_ignorable_nchars) +                 \
                              my_contraction_char_length(_cnt) };   \
    return rc;                                                      \
  } while(0)
#else
#define SCANNER_NEXT_RETURN(_w,_n) do { return (_w); } while (0)
#define SCANNER_NEXT_RETURN_CONTRACTION(_cnt,_ignorable_nchars)     \
  do { return (_cnt)->weight[0]; } while(0)
#endif
/*
  Return the next collation weight of the string being scanned.

  This single body is compiled into one of two functions, depending on
  whether SCANNER_NEXT_NCHARS is defined by the including file:

  - scanner_next_with_nchars(scanner, param, nchars)
      Returns a weight_and_nchars_t: the weight together with a character
      count (see the SCANNER_NEXT_RETURN* calls below for how the count is
      built in every case). "nchars" is passed to
      my_uca_context_weight_find() as the maximum contraction length.
  - scanner_next(scanner, param)
      Returns only the weight, as int. MY_UCA_MAX_CONTRACTION is used as
      the maximum contraction length.

  The function name is wrapped into MY_FUNCTION_NAME(), which is defined
  outside of this file (presumably it adds a per-character-set prefix -
  verify against the including file).

  @param scanner  Scanner state. The function advances scanner->sbeg
                  towards scanner->send and sets scanner->page and
                  scanner->code for the character being processed.
  @param param    Shared read-only parameters: param->cs and param->level.
  @param nchars   (SCANNER_NEXT_NCHARS only) see above.

  @return  The next non-zero weight. Characters and contractions whose
           weight is 0 are skipped as ignorable.
  @retval  -1      No more bytes left in the string.
  @retval  0xFFFF  A bad or incomplete byte sequence was consumed.
*/
static inline
#ifdef SCANNER_NEXT_NCHARS
weight_and_nchars_t
MY_FUNCTION_NAME(scanner_next_with_nchars)(my_uca_scanner *scanner,
const my_uca_scanner_param *param,
size_t nchars)
#else
int
MY_FUNCTION_NAME(scanner_next)(my_uca_scanner *scanner,
const my_uca_scanner_param *param)
#endif
{
#ifdef SCANNER_NEXT_NCHARS
/*
  Number of ignorable loop iterations passed so far. It exists only in the
  "with_nchars" variant. The other variant still mentions it in
  SCANNER_NEXT_RETURN* calls, but there the macros do not expand it.
*/
uint ignorable_nchars;
#define LOCAL_MAX_CONTRACTION_LENGTH nchars
#else
#define LOCAL_MAX_CONTRACTION_LENGTH MY_UCA_MAX_CONTRACTION
#endif
uint16 weight= my_uca_scanner_next_expansion_weight(scanner);
if (weight)
{
/*
More weights left from the previous step.
Return the next weight from the current expansion.
Return "0" as "nchars". The real nchars was set on a previous
iteration.
*/
SCANNER_NEXT_RETURN(weight, 0);
}
/*
  Read characters until one of them produces a non-zero weight.
  Every "continue" below means that an ignorable character (or an ignorable
  contraction) was consumed; in the "with_nchars" variant each of them
  increments ignorable_nchars by one.
*/
#ifdef SCANNER_NEXT_NCHARS
for (ignorable_nchars= 0 ; ; ignorable_nchars++)
#else
for ( ; ; )
#endif
{
const uint16 *wpage;
int mblen;
my_wc_t currwc= 0;
const uint16 *cweight;
/*
  Fast path: try to handle two bytes at once using the booster table.
  It is compiled only for the variant without nchars.
*/
#if MY_UCA_ASCII_OPTIMIZE && !defined(SCANNER_NEXT_NCHARS)
if (scanner->sbeg + 1 < scanner->send)
{
const MY_UCA_2BYTES_ITEM *ww;
ww= my_uca_level_booster_2bytes_item_addr_const(param->level->booster,
scanner->sbeg[0],
scanner->sbeg[1]);
if (my_uca_2bytes_item_is_applicable(ww))
{
/*
Byte pairs that make 2-byte head characters in previous
context pairs are marked as not applicable for optimization
during the collation initialization. So when we come here
sbeg[0] and sbeg[1] are:
- either two ASCII characters
- or one 2-byte character which IS NOT a previous context head
Just remember sbeg[1] as the previous character for simplicity.
This may erroneously interpret bytes 0x80..0x9F as previous context
head characters U+0080..U+009F. However, CLDR does not have any real
collations that use these characters as previous context heads.
*/
scanner->page= 0;
scanner->code= (int) scanner->sbeg[1];
scanner->sbeg+= 2;
if ((weight= my_uca_scanner_set_weight(scanner, ww->weight)))
{
/*
TODO: add support for scanner_next_with_nchars and do this:
SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
*/
return weight;
}
continue; /* Ignorable character */
}
/* 2 byte optimization is not applicable, go the slow path */
}
#endif
/* Get next character */
#if MY_UCA_ASCII_OPTIMIZE
/* Get next ASCII character */
if (scanner->sbeg < scanner->send && scanner->sbeg[0] < 0x80)
{
/* A single byte below 0x80 is taken as the character code itself */
currwc= scanner->sbeg[0];
scanner->sbeg+= 1;
#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(param->level, currwc))
{
const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param,
currwc,
LOCAL_MAX_CONTRACTION_LENGTH);
if (cnt)
{
/*
  A contraction was found. Return its first weight; the character count
  is ignorable_nchars plus the length of the contraction.
*/
if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
continue; /* Ignorable contraction */
}
}
#endif
/* Not a contraction: weights of a single-byte character are on page 0 */
scanner->page= 0;
scanner->code= (int) currwc;
cweight= param->level->weights[0] + scanner->code * param->level->lengths[0];
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
continue; /* Ignorable character */
}
else
#endif
/*
  Note: when MY_UCA_ASCII_OPTIMIZE is enabled, the "if" below is the body
  of the "else" branch above.
*/
/* Get next MB character */
if (((mblen= MY_MB_WC(scanner, param, &currwc, scanner->sbeg,
scanner->send)) <= 0))
{
if (scanner->sbeg >= scanner->send)
{
/* No more bytes, end of line reached */
SCANNER_NEXT_RETURN(-1, ignorable_nchars);
}
/*
There are some more bytes left. Non-positive mblen means that
we got an incomplete or a bad byte sequence. Consume mbminlen bytes.
*/
if ((scanner->sbeg+= param->cs->mbminlen) > scanner->send)
{
/* For safety purposes don't go beyond the string range. */
scanner->sbeg= scanner->send;
}
/*
Treat every complete or incomplete mbminlen unit as a weight which is
greater than weight for any possible normal character.
0xFFFF is greater than any possible weight in the UCA weight table.
*/
SCANNER_NEXT_RETURN(0xFFFF, ignorable_nchars + 1);
}
/* A valid character was decoded: step over its bytes */
scanner->sbeg+= mblen;
/* Characters above level->maxchar are handled by a dedicated helper */
if (currwc > param->level->maxchar)
{
SCANNER_NEXT_RETURN(my_uca_scanner_set_weight_outside_maxchar(scanner),
ignorable_nchars + 1);
}
#if MY_UCA_COMPILE_CONTRACTIONS
if (my_uca_needs_context_handling(param->level, currwc))
{
const MY_CONTRACTION *cnt= my_uca_context_weight_find(scanner, param, currwc,
LOCAL_MAX_CONTRACTION_LENGTH);
if (cnt)
{
/* Same contraction handling as in the ASCII branch above */
if ((weight= my_uca_scanner_set_weight(scanner, cnt->weight)))
SCANNER_NEXT_RETURN_CONTRACTION(cnt, ignorable_nchars);
continue; /* Ignorable contraction */
}
}
#endif
/* Process single character */
/* The high bits select the weight page, the low 8 bits the offset in it */
scanner->page= currwc >> 8;
scanner->code= currwc & 0xFF;
/* If weight page for w[0] does not exist, then calculate algorithmically */
if (!(wpage= param->level->weights[scanner->page]))
SCANNER_NEXT_RETURN(my_uca_scanner_next_implicit(scanner, param),
ignorable_nchars + 1);
/* Calculate pointer to w[0]'s weight, using page and offset */
cweight= wpage + scanner->code * param->level->lengths[scanner->page];
if ((weight= my_uca_scanner_set_weight(scanner, cweight)))
SCANNER_NEXT_RETURN(weight, ignorable_nchars + 1);
continue; /* Ignorable character */
}
/* The loop above has no exit condition and leaves only through "return" */
SCANNER_NEXT_RETURN(0, 0); /* Not reachable */
}
/*
  Remove the configuration switch (SCANNER_NEXT_NCHARS, which is set by the
  including file) and the helper macros defined in this file, so that they
  do not leak into the code that follows the #include. Presumably this also
  allows including this file again with a different configuration - verify
  against the including file.
*/
#undef SCANNER_NEXT_NCHARS
#undef SCANNER_NEXT_RETURN
#undef SCANNER_NEXT_RETURN_CONTRACTION
#undef LOCAL_MAX_CONTRACTION_LENGTH