mirror of
				https://github.com/postgres/postgres.git
				synced 2025-10-24 01:29:19 +03:00 
			
		
		
		
	Improve code clarity in epilogue of UTF-8 verification fast path
The previous coding was correct, but the style and commentary were a bit vague about which operations had to happen, in what circumstances, and in what order. Rearrange so that the epilogue does nothing in the DFA END state. That allows turning some conditional statements in the backtracking logic into asserts. With that, we can be more explicit about needing to backtrack at least one byte in non-END states to ensure checking the current byte sequence in the slow path. No change to the regression tests, since they should be able to catch deficiencies here already. In passing, improve the comments around DFA states where the first continuation byte has a restricted range.
This commit is contained in:
		| @@ -1807,12 +1807,11 @@ pg_utf8_verifychar(const unsigned char *s, int len) | |||||||
| #define	CS1 16 | #define	CS1 16 | ||||||
| #define	CS2  1 | #define	CS2  1 | ||||||
| #define	CS3  5 | #define	CS3  5 | ||||||
| /* Leading byte was E0/ED, expect 1 more continuation byte */ | /* Partial states, where the first continuation byte has a restricted range */ | ||||||
| #define	P3A  6 | #define	P3A  6					/* Lead was E0, check for 3-byte overlong */ | ||||||
| #define	P3B 20 | #define	P3B 20					/* Lead was ED, check for surrogate */ | ||||||
| /* Leading byte was F0/F4, expect 2 more continuation bytes */ | #define	P4A 25					/* Lead was F0, check for 4-byte overlong */ | ||||||
| #define	P4A 25 | #define	P4B 30					/* Lead was F4, check for too-large */ | ||||||
| #define	P4B 30 |  | ||||||
| /* Begin and End are the same state */ | /* Begin and End are the same state */ | ||||||
| #define	END BGN | #define	END BGN | ||||||
|  |  | ||||||
| @@ -1941,31 +1940,32 @@ pg_utf8_verifystr(const unsigned char *s, int len) | |||||||
| 			len -= STRIDE_LENGTH; | 			len -= STRIDE_LENGTH; | ||||||
| 		} | 		} | ||||||
|  |  | ||||||
| 		/* | 		/* The error state persists, so we only need to check for it here. */ | ||||||
| 		 * The error state persists, so we only need to check for it here. In |  | ||||||
| 		 * case of error we start over from the beginning with the slow path |  | ||||||
| 		 * so we can count the valid bytes. |  | ||||||
| 		 */ |  | ||||||
| 		if (state == ERR) | 		if (state == ERR) | ||||||
| 		{ | 		{ | ||||||
|  | 			/* | ||||||
|  | 			 * Start over from the beginning with the slow path so we can | ||||||
|  | 			 * count the valid bytes. | ||||||
|  | 			 */ | ||||||
| 			len = orig_len; | 			len = orig_len; | ||||||
| 			s = start; | 			s = start; | ||||||
| 		} | 		} | ||||||
|  | 		else if (state != END) | ||||||
| 		/* |  | ||||||
| 		 * We treat all other states as success, but it's possible the fast |  | ||||||
| 		 * path exited in the middle of a multibyte sequence, since that |  | ||||||
| 		 * wouldn't have caused an error. Before checking the remaining bytes, |  | ||||||
| 		 * walk backwards to find the last byte that could have been the start |  | ||||||
| 		 * of a valid sequence. |  | ||||||
| 		 */ |  | ||||||
| 		while (s > start) |  | ||||||
| 		{ | 		{ | ||||||
| 			s--; | 			/* | ||||||
| 			len++; | 			 * The fast path exited in the middle of a multibyte sequence. | ||||||
|  | 			 * Walk backwards to find the leading byte so that the slow path | ||||||
| 			if (!IS_HIGHBIT_SET(*s) || pg_utf_mblen(s) > 1) | 			 * can resume checking from there. We must always backtrack at | ||||||
| 				break; | 			 * least one byte, since the current byte could be e.g. an ASCII | ||||||
|  | 			 * byte after a 2-byte lead, which is invalid. | ||||||
|  | 			 */ | ||||||
|  | 			do | ||||||
|  | 			{ | ||||||
|  | 				Assert(s > start); | ||||||
|  | 				s--; | ||||||
|  | 				len++; | ||||||
|  | 				Assert(IS_HIGHBIT_SET(*s)); | ||||||
|  | 			} while (pg_utf_mblen(s) <= 1); | ||||||
| 		} | 		} | ||||||
| 	} | 	} | ||||||
|  |  | ||||||
|   | |||||||
		Reference in New Issue
	
	Block a user