1
0
mirror of https://github.com/MariaDB/server.git synced 2025-07-30 16:24:05 +03:00

Adding thorough tests covering what happens with escaped sequences

in the SQL parser.

Various backslash escapes and quote-quote escaped sequences are covered
in combination with single and multi-byte characters.

This is especially important for the character sets that can have 0x5C
as the second byte in a multi-byte character (big5, cp932, gbk, sjis).

swe7 is also a special character set, because in swe7 0x5C is used for
both escape character and for "LATIN CAPITAL LETTER O WITH DIAERESIS".
This commit is contained in:
Alexander Barkov
2014-09-12 12:57:27 +04:00
parent d2ae40ac2e
commit 6a576f1a58
17 changed files with 21909 additions and 0 deletions

View File

@ -0,0 +1,9 @@
#
# Create a table with all byte values
#
CREATE TABLE halfs (a INT);
INSERT INTO halfs VALUES (0x00),(0x01),(0x02),(0x03),(0x04),(0x05),(0x06),(0x07);
INSERT INTO halfs VALUES (0x08),(0x09),(0x0A),(0x0B),(0x0C),(0x0D),(0x0E),(0x0F);
CREATE TEMPORARY TABLE bytes (a BINARY(1), KEY(a)) ENGINE=MyISAM;
INSERT INTO bytes SELECT CHAR((t1.a << 4) | t2.a USING BINARY) FROM halfs t1, halfs t2;
DROP TABLE halfs;

View File

@ -0,0 +1,21 @@
#
# Create a table with all 2 byte sequence values
#
--source include/bytes.inc
CREATE TABLE halfs (a BINARY(1));
# "bytes" is a temporary table, hence is not allowed in joins
# Create a non-temporary copy.
INSERT INTO halfs SELECT * FROM bytes;
CREATE TEMPORARY TABLE bytes2 (
a BINARY(2),
hi BINARY(1),
lo BINARY(1),
KEY(a),
KEY(lo)
) ENGINE=MyISAM;
INSERT INTO bytes2
SELECT CONCAT(t1.a, t2.a), t1.a, t2.a FROM halfs t1, halfs t2
ORDER BY t1.a, t2.a;
DROP TABLE halfs;

View File

@ -0,0 +1,327 @@
--echo # Start of ctype_unescape.inc
#
# Testing how string literals with backslash and quote-quote are unescaped.
# The tests assume that single quote (') is used as a delimiter.
#
#
# Make sure that the parser really works using the character set we need.
# We use binary strings to compose strings, to be able to test get malformed
# sequences, which are possible as a result of mysql_real_escape_string().
# The important thing for this test is to make the parser unescape using
# the client character set, rather than binary. Currently it works exactly
# that way by default, so the query below should return @@character_set_client
#
SET @query=_binary'SELECT CHARSET(\'test\'),@@character_set_client,@@character_set_connection';
PREPARE stmt FROM @query;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
let $CHARSET=`SELECT @@character_set_connection`;
CREATE TABLE allbytes (a VARBINARY(10));
#
# Create various byte sequences to test. Testing the full banch of
# possible combinations takes about 2 minutes. So this test provides
# variants to run with:
# - the full set of possible combinations
# - a reduced test of combinations for selected bytes only
#
# Create selected byte combinations
if ($ctype_unescape_combinations == 'selected')
{
--echo # Using selected bytes combinations
--source include/bytes.inc
#
# Populate "selected_bytes" with bytes that have a special meaning.
# We'll use "selected_bytes" to generate byte seqeunces,
# instead of the full possible byte combinations, to reduce test time.
#
CREATE TABLE selected_bytes (a VARBINARY(10));
# Bytes that have a special meaning in all character sets:
# 0x00 - mysql_real_escape_string() quotes this to '\0'
# 0x0D - mysql_real_escape_string() quotes this to '\r'
# 0x0A - mysql_real_escape_string() quotes this to '\n'
# 0x1A - mysql_real_escape_string() quotes this to '\Z'
# 0x08 - mysql_real_escape_string() does not quote this,
# but '\b' is unescaped to 0x08.
# 0x09 - mysql_real_escape_string() does not quote this,
# but '\t' is unescaped to 0x09.
# 0x30 - '0', as in '\0'
# 0x5A - 'Z', as in '\Z'
# 0x62 - 'b', as in '\b'
# 0x6E - 'n', as in '\n'
# 0x72 - 't', as in '\r'
# 0x74 - 't', as in '\t'
INSERT INTO selected_bytes (a) VALUES ('\0'),('\b'),('\t'),('\r'),('\n'),('\Z');
INSERT INTO selected_bytes (a) VALUES ('0'),('b'),('t'),('r'),('n'),('Z');
# 0x22 - double quote
# 0x25 - percent sign, '\%' is preserved as is for LIKE.
# 0x27 - single quote
# 0x5C - backslash
# 0x5F - underscore, '\_' is preserved as is for LIKE.
INSERT INTO selected_bytes (a) VALUES ('\\'),('_'),('%'),(0x22),(0x27);
# Some bytes do not have any special meaning, for example basic Latin letters.
# Let's add, one should be enough for a good enough coverage.
INSERT INTO selected_bytes (a) VALUES ('a');
#
# This maps summarizes bytes that have a special
# meaning in various character sets:
#
# MBHEAD MBTAIL NONASCII-8BIT BAD
# ------ ------ -------------- ----------
# big5: [A1..F9] [40..7E,A1..FE] N/A [80..A0,FA..FF]
# cp932: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
# gbk: [81..FE] [40..7E,80..FE] N/A [FF]
# sjis: [81..9F,E0..FC] [40..7E,80..FC] [A1..DF] [FD..FF]
# swe7: N/A N/A [5B..5E,7B..7E] [80..FF]
#
INSERT INTO selected_bytes (a) VALUES
(0x3F), # 7bit
(0x40), # 7bit mbtail
(0x7E), # 7bit mbtail nonascii-8bit
(0x7F), # 7bit nonascii-8bit
(0x80), # mbtail bad-mb
(0x81), # mbhead mbtail
(0x9F), # mbhead mbtail bad-mb
(0xA0), # mbhead mbtail bad-mb
(0xA1), # mbhead mbtail nonascii-8bit
(0xE0), # mbhead mbtai
(0xEF), # mbhead mbtail
(0xF9), # mbhead mbtail
(0xFA), # mbhead mbtail bad-mb
(0xFC), # mbhead mbtail bad-mb
(0xFD), # mbhead mbtail bad-mb
(0xFE), # mbhead mbtial bad-mb
(0xFF); # bad-mb
#
# Now populate the test table
#
# Use all single bytes, this is cheap, there are only 256 values.
INSERT INTO allbytes (a) SELECT a FROM bytes;
# Add selected bytes combinations
INSERT INTO allbytes (a) SELECT CONCAT(t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,t2.a) FROM selected_bytes t1,selected_bytes t2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,t1.a,0x5C,t2.a) FROM selected_bytes t1,selected_bytes t2;
DROP TABLE selected_bytes;
# Delete all non-single byte sequences that do not have
# backslashes or quotes at all. There is nothing special with these strings.
DELETE FROM allbytes WHERE
OCTET_LENGTH(a)>1 AND
LOCATE(0x5C,a)=0 AND
a NOT LIKE '%\'%' AND
a NOT LIKE '%"%';
}
if ($ctype_unescape_combinations=='')
{
--echo # Using full byte combinations
--source include/bytes2.inc
INSERT INTO allbytes (a) SELECT a FROM bytes;
INSERT INTO allbytes (a) SELECT CONCAT(hi,lo) FROM bytes2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,lo) FROM bytes2;
INSERT INTO allbytes (a) SELECT CONCAT(0x5C,hi,0x5C,lo) FROM bytes2;
}
DELIMITER //;
#
# A procedure that make an SQL query using 'val' as a string literal.
# The result of the query execution is written into the table 't1'.
# NULL in t1.b means that query failed due to syntax error,
# typically because of mis-interpreted closing quote delimiter.
#
CREATE PROCEDURE p1(val VARBINARY(10))
BEGIN
DECLARE EXIT HANDLER FOR SQLSTATE '42000' INSERT INTO t1 (a,b) VALUES(val,NULL);
SET @query=CONCAT(_binary"INSERT INTO t1 (a,b) VALUES (0x",HEX(val),",'",val,"')");
PREPARE stmt FROM @query;
EXECUTE stmt;
DEALLOCATE PREPARE stmt;
END//
#
# A procedure that iterates through all records in "allbytes".
# And runs p1() for every record.
#
CREATE PROCEDURE p2()
BEGIN
DECLARE val VARBINARY(10);
DECLARE done INT DEFAULT FALSE;
DECLARE stmt CURSOR FOR SELECT a FROM allbytes;
DECLARE CONTINUE HANDLER FOR NOT FOUND SET done=TRUE;
OPEN stmt;
read_loop1: LOOP
FETCH stmt INTO val;
IF done THEN
LEAVE read_loop1;
END IF;
CALL p1(val);
END LOOP;
CLOSE stmt;
END//
# A function that converts the value from binary to $CHARSET
# and check if it has changed. CONVERT() fixes malformed strings.
# So if the string changes in CONVERT(), it means it was not wellformed.
--eval CREATE FUNCTION iswellformed(a VARBINARY(256)) RETURNS INT RETURN a=BINARY CONVERT(a USING $CHARSET);
#
# A function that approximately reproduces how the SQL parser
# would unescape a binary string.
#
CREATE FUNCTION unescape(a VARBINARY(256)) RETURNS VARBINARY(256)
BEGIN
# We need to do it in a way to avoid producing new escape sequences
# First, enclose all known escsape sequences to '{{xx}}'
# - Backslash not followed by a LIKE pattern characters _ and %
# - Double escapes
# This uses PCRE Branch Reset Groups: (?|(alt1)|(alt2)|(alt3)).
# So '\\1' in the last argument always means the match, no matter
# which alternative it came from.
SET a=REGEXP_REPLACE(a,'(?|(\\\\[^_%])|(\\x{27}\\x{27}))','{{\\1}}');
# Now unescape all enclosed standard escape sequences
SET a=REPLACE(a,'{{\\0}}', '\0');
SET a=REPLACE(a,'{{\\b}}', '\b');
SET a=REPLACE(a,'{{\\t}}', '\t');
SET a=REPLACE(a,'{{\\r}}', '\r');
SET a=REPLACE(a,'{{\\n}}', '\n');
SET a=REPLACE(a,'{{\\Z}}', '\Z');
SET a=REPLACE(a,'{{\\\'}}', '\'');
# Unescape double quotes
SET a=REPLACE(a,'{{\'\'}}', '\'');
# Unescape the rest: all other \x sequences mean just 'x'
SET a=REGEXP_REPLACE(a, '{{\\\\(.|\\R)}}', '\\1');
RETURN a;
END//
#
# A function that checks what happened during unescaping.
#
# @param a - the value before unescaping
# @param b - the value after unescaping
#
# The following return values are possible:
# - SyntErr - b IS NULL, which means syntax error happened in p1().
# - Preserv - the value was not modified during unescaping.
# This is possible if 0x5C was treated as mbtail.
# Or only LIKE escape sequences were found: '\_' and '\%'.
# - Trivial - only 0x5C were removed.
# - Regular - the value was unescaped like a binary string.
# Some standard escape sequences were found.
# No special multi-byte handling happened.
# - Special - Something else happened. Should not happen.
#
CREATE FUNCTION unescape_type(a VARBINARY(256),b VARBINARY(256)) RETURNS VARBINARY(256)
BEGIN
RETURN CASE
WHEN b IS NULL THEN '[SyntErr]'
WHEN a=b THEN CASE
WHEN OCTET_LENGTH(a)=1 THEN '[Preserve]'
WHEN a RLIKE '\\\\[_%]' THEN '[Preserve][LIKE]'
WHEN a RLIKE '^[[:ascii:]]+$' THEN '[Preserve][ASCII]'
ELSE '[Preserv][MB]' END
WHEN REPLACE(a,0x5C,'')=b THEN '[Trivial]'
WHEN UNESCAPE(a)=b THEN '[Regular]'
ELSE '[Special]' END;
END//
#
# Check what happened with wellformedness during unescaping
# @param a - the value before unescaping
# @param b - the value after unescaping
#
# Returned values:
# [FIXED] - the value was malformed and become wellformed after unescaping
# [BROKE] - the value was wellformed and become malformed after unescaping
# [ILSEQ] - both values (before unescaping and after unescaping) are malformed
# '' - both values are wellformed
#
CREATE FUNCTION wellformedness(a VARBINARY(256), b VARBINARY(256))
RETURNS VARBINARY(256)
BEGIN
RETURN CASE
WHEN b IS NULL THEN ''
WHEN NOT iswellformed(a) AND iswellformed(b) THEN '[FIXED]'
WHEN iswellformed(a) AND NOT iswellformed(b) THEN '[BROKE]'
WHEN NOT iswellformed(a) AND NOT iswellformed(b) THEN '[ILSEQ]'
ELSE ''
END;
END//
#
# Check if the value could be generated by mysql_real_escape_string(),
# or can only come from a direct user input.
#
# @param a - the value before unescaping
#
# Returns:
# [USER] - if the value could not be generated by mysql_real_escape_string()
# '' - if the value was possibly generated by mysql_real_escape_string()
#
#
CREATE FUNCTION mysql_real_escape_string_generated(a VARBINARY(256))
RETURNS VARBINARY(256)
BEGIN
DECLARE a1 BINARY(1) DEFAULT SUBSTR(a,1,1);
DECLARE a2 BINARY(1) DEFAULT SUBSTR(a,2,1);
DECLARE a3 BINARY(1) DEFAULT SUBSTR(a,3,1);
DECLARE a4 BINARY(1) DEFAULT SUBSTR(a,4,1);
DECLARE a2a4 BINARY(2) DEFAULT CONCAT(a2,a4);
RETURN CASE
WHEN (a1=0x5C) AND
(a3=0x5C) AND
(a2>0x7F) AND
(a4 NOT IN ('_','%','0','t','r','n','Z')) AND
iswellformed(a2a4) THEN '[USER]'
ELSE ''
END;
END//
DELIMITER ;//
CREATE TABLE t1 (a VARBINARY(10),b VARBINARY(10));
CALL p2();
# Avoid "Invalid XXX character string" warnings
# We mark malformed strings in the output anyway
--disable_warnings
# All records marked with '[BAD]' mean that the string was unescaped
# in a unexpected way, that means there is a bug in UNESCAPE() above.
SELECT HEX(a),HEX(b),
CONCAT(unescape_type(a,b),
wellformedness(a,b),
mysql_real_escape_string_generated(a),
IF(UNESCAPE(a)<>b,CONCAT('[BAD',HEX(UNESCAPE(a)),']'),'')) AS comment
FROM t1 ORDER BY LENGTH(a),a;
--enable_warnings
DROP TABLE t1;
DROP PROCEDURE p1;
DROP PROCEDURE p2;
DROP FUNCTION unescape;
DROP FUNCTION unescape_type;
DROP FUNCTION wellformedness;
DROP FUNCTION mysql_real_escape_string_generated;
DROP FUNCTION iswellformed;
DROP TABLE allbytes;
--echo # End of ctype_backslash.inc