Add mbverifystr() functions specific to each encoding.

This makes the pg_verify_mbstr() function faster by allowing more efficient encoding-specific implementations. All the implementations included in this commit are pretty naive: they just call the same encoding-specific verifychar functions that were used previously. Even so, that already gives a performance boost, because the tight character-at-a-time loop is simpler.

Reviewed-by: John Naylor
Discussion: https://www.postgresql.org/message-id/e7861509-3960-538a-9025-b75a61188e01@iki.fi
2025-11-24 00:23:06 +03:00 · 2021-01-28 14:40:07 +02:00
parent a3367aa3c4
commit b80e10638e
9 changed files with 493 additions and 101 deletions
--- a/src/backend/utils/mb/mbutils.c
+++ b/src/backend/utils/mb/mbutils.c
@@ -519,7 +519,7 @@ pg_convert(PG_FUNCTION_ARGS)
 	/* make sure that source string is valid */
 	len = VARSIZE_ANY_EXHDR(string);
 	src_str = VARDATA_ANY(string);
-	pg_verify_mbstr_len(src_encoding, src_str, len, false);
+	(void) pg_verify_mbstr(src_encoding, src_str, len, false);

 	/* perform conversion */
 	dest_str = (char *) pg_do_encoding_conversion((unsigned char *) unconstify(char *, src_str),
@@ -1215,10 +1215,10 @@ static bool
 pg_generic_charinc(unsigned char *charptr, int len)
 {
 	unsigned char *lastbyte = charptr + len - 1;
-	mbverifier	mbverify;
+	mbchar_verifier mbverify;

 	/* We can just invoke the character verifier directly. */
-	mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
+	mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverifychar;

 	while (*lastbyte < (unsigned char) 255)
 	{
@@ -1445,8 +1445,7 @@ pg_database_encoding_max_length(void)
 bool
 pg_verifymbstr(const char *mbstr, int len, bool noError)
 {
-	return
-		pg_verify_mbstr_len(GetDatabaseEncoding(), mbstr, len, noError) >= 0;
+	return pg_verify_mbstr(GetDatabaseEncoding(), mbstr, len, noError);
 }

 /*
@@ -1456,7 +1455,18 @@ pg_verifymbstr(const char *mbstr, int len, bool noError)
 bool
 pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 {
-	return pg_verify_mbstr_len(encoding, mbstr, len, noError) >= 0;
+	int			oklen;
+
+	Assert(PG_VALID_ENCODING(encoding));
+
+	oklen = pg_wchar_table[encoding].mbverifystr((const unsigned char *) mbstr, len);
+	if (oklen != len)
+	{
+		if (noError)
+			return false;
+		report_invalid_encoding(encoding, mbstr + oklen, len - oklen);
+	}
+	return true;
 }

 /*
@@ -1469,11 +1479,14 @@ pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
 * If OK, return length of string in the encoding.
 * If a problem is found, return -1 when noError is
 * true; when noError is false, ereport() a descriptive message.
+ *
+ * Note: We cannot use the faster encoding-specific mbverifystr() function
+ * here, because we need to count the number of characters in the string.
 */
 int
 pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 {
-	mbverifier	mbverify;
+	mbchar_verifier mbverifychar;
 	int			mb_len;

 	Assert(PG_VALID_ENCODING(encoding));
@@ -1493,7 +1506,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 	}

 	/* fetch function pointer just once */
-	mbverify = pg_wchar_table[encoding].mbverify;
+	mbverifychar = pg_wchar_table[encoding].mbverifychar;

 	mb_len = 0;

@@ -1516,7 +1529,7 @@ pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
 			report_invalid_encoding(encoding, mbstr, len);
 		}

-		l = (*mbverify) ((const unsigned char *) mbstr, len);
+		l = (*mbverifychar) ((const unsigned char *) mbstr, len);

 		if (l < 0)
 		{