Allow psql multi-line column values to align in the proper columns

If the second output column value is 'a\nb', the 'b' should appear in the second display column, rather than the first column as it does now. Change libpq's PQdsplen() to return more useful values. > Note: this changes the PQdsplen function, it can now return zero or > minus one which was not possible before. It doesn't appear anyone is > actually using the functions other than psql but it is a change. The > functions are not actually documentated anywhere so it's not like we're > breaking a defined interface. The new semantics follow the Unicode > standard. BACKWARD COMPATIBLE CHANGE. The only user-visible change I saw in the regression tests is that a SELECT * on a table where all the columns have been dropped doesn't return a blank line like before. This seems like a step forward. Martijn van Oosterhout
2025-12-10 14:22:35 +03:00 · 2006-02-10 00:39:04 +00:00
parent 593763c086
commit c01999a557
6 changed files with 714 additions and 406 deletions
--- a/src/bin/psql/mbprint.c
+++ b/src/bin/psql/mbprint.c
@@ -3,7 +3,7 @@
 *
 * Copyright (c) 2000-2005, PostgreSQL Global Development Group
 *
- * $PostgreSQL: pgsql/src/bin/psql/mbprint.c,v 1.18 2005/10/15 02:49:40 momjian Exp $
+ * $PostgreSQL: pgsql/src/bin/psql/mbprint.c,v 1.19 2006/02/10 00:39:04 momjian Exp $
 */

 #include "postgres_fe.h"
@@ -14,149 +14,6 @@

 #include "mb/pg_wchar.h"

-/*
- * This is an implementation of wcwidth() and wcswidth() as defined in
- * "The Single UNIX Specification, Version 2, The Open Group, 1997"
- * <http://www.UNIX-systems.org/online.html>
- *
- * Markus Kuhn -- 2001-09-08 -- public domain
- *
- * customised for PostgreSQL
- *
- * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
- */
-
-struct mbinterval
-{
-	unsigned short first;
-	unsigned short last;
-};
-
-/* auxiliary function for binary search in interval table */
-static int
-mbbisearch(pg_wchar ucs, const struct mbinterval * table, int max)
-{
-	int			min = 0;
-	int			mid;
-
-	if (ucs < table[0].first || ucs > table[max].last)
-		return 0;
-	while (max >= min)
-	{
-		mid = (min + max) / 2;
-		if (ucs > table[mid].last)
-			min = mid + 1;
-		else if (ucs < table[mid].first)
-			max = mid - 1;
-		else
-			return 1;
-	}
-
-	return 0;
-}
-
-
-/* The following functions define the column width of an ISO 10646
- * character as follows:
- *
- *	  - The null character (U+0000) has a column width of 0.
- *
- *	  - Other C0/C1 control characters and DEL will lead to a return
- *		value of -1.
- *
- *	  - Non-spacing and enclosing combining characters (general
- *		category code Mn or Me in the Unicode database) have a
- *		column width of 0.
- *
- *	  - Other format characters (general category code Cf in the Unicode
- *		database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
- *
- *	  - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
- *		have a column width of 0.
- *
- *	  - Spacing characters in the East Asian Wide (W) or East Asian
- *		FullWidth (F) category as defined in Unicode Technical
- *		Report #11 have a column width of 2.
- *
- *	  - All remaining characters (including all printable
- *		ISO 8859-1 and WGL4 characters, Unicode control characters,
- *		etc.) have a column width of 1.
- *
- * This implementation assumes that wchar_t characters are encoded
- * in ISO 10646.
- */
-
-static int
-ucs_wcwidth(pg_wchar ucs)
-{
-	/* sorted list of non-overlapping intervals of non-spacing characters */
-	static const struct mbinterval combining[] = {
-		{0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486},
-		{0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9},
-		{0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
-		{0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
-		{0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
-		{0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
-		{0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
-		{0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954},
-		{0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC},
-		{0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3},
-		{0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
-		{0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71},
-		{0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5},
-		{0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01},
-		{0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
-		{0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82},
-		{0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40},
-		{0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
-		{0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
-		{0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA},
-		{0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31},
-		{0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
-		{0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
-		{0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37},
-		{0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84},
-		{0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC},
-		{0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
-		{0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059},
-		{0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
-		{0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9},
-		{0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F},
-		{0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A},
-		{0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF},
-		{0xFFF9, 0xFFFB}
-	};
-
-	/* test for 8-bit control characters */
-	if (ucs == 0)
-		return 0;
-
-	if (ucs < 32 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
-		return -1;
-
-	/* binary search in table of non-spacing characters */
-	if (mbbisearch(ucs, combining,
-				   sizeof(combining) / sizeof(struct mbinterval) - 1))
-		return 0;
-
-	/*
-	 * if we arrive here, ucs is not a combining or C0/C1 control character
-	 */
-
-	return 1 +
-		(ucs >= 0x1100 &&
-		 (ucs <= 0x115f ||		/* Hangul Jamo init. consonants */
-		  (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
-		   ucs != 0x303f) ||	/* CJK ... Yi */
-		  (ucs >= 0xac00 && ucs <= 0xd7a3) ||	/* Hangul Syllables */
-		  (ucs >= 0xf900 && ucs <= 0xfaff) ||	/* CJK Compatibility
-												 * Ideographs */
-		  (ucs >= 0xfe30 && ucs <= 0xfe6f) ||	/* CJK Compatibility Forms */
-		  (ucs >= 0xff00 && ucs <= 0xff5f) ||	/* Fullwidth Forms */
-		  (ucs >= 0xffe0 && ucs <= 0xffe6) ||
-		  (ucs >= 0x20000 && ucs <= 0x2ffff)));
-}
-
 static pg_wchar
 utf2ucs(const unsigned char *c)
 {
@@ -191,35 +48,16 @@ utf2ucs(const unsigned char *c)
 	}
 }

-/* mb_utf_wcwidth : calculate column length for the utf8 string pwcs
+
+/*
+ * Unicode 3.1 compliant validation : for each category, it checks the
+ * combination of each byte to make sure it maps to a valid range. It also
+ * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe =
+ * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates)
 */
-static int
-mb_utf_wcswidth(const unsigned char *pwcs, size_t len)
-{
-	int			w,
-				l = 0;
-	int			width = 0;
-
-	for (; *pwcs && len > 0; pwcs += l)
-	{
-		l = pg_utf_mblen(pwcs);
-		if ((len < (size_t) l) || ((w = ucs_wcwidth(utf2ucs(pwcs))) < 0))
-			return width;
-		len -= l;
-		width += w;
-	}
-	return width;
-}
-
 static int
 utf_charcheck(const unsigned char *c)
 {
-	/*
-	 * Unicode 3.1 compliant validation : for each category, it checks the
-	 * combination of each byte to make sur it maps to a valid range. It also
-	 * returns -1 for the following UCS values: ucs > 0x10ffff ucs & 0xfffe =
-	 * 0xfffe 0xfdd0 < ucs < 0xfdef ucs & 0xdb00 = 0xd800 (surrogates)
-	 */
 	if ((*c & 0x80) == 0)
 		return 1;
 	else if ((*c & 0xe0) == 0xc0)
@@ -270,6 +108,7 @@ utf_charcheck(const unsigned char *c)
 	return -1;
 }

+
 static void
 mb_utf_validate(unsigned char *pwcs)
 {
@@ -277,28 +116,26 @@ mb_utf_validate(unsigned char *pwcs)

 	while (*pwcs)
 	{
-		int			l;
+		int			len;

-		if ((l = utf_charcheck(pwcs)) > 0)
+		if ((len = utf_charcheck(pwcs)) > 0)
 		{
 			if (p != pwcs)
 			{
 				int			i;

-				for (i = 0; i < l; i++)
+				for (i = 0; i < len; i++)
 					*p++ = *pwcs++;
 			}
 			else
 			{
-				pwcs += l;
-				p += l;
+				pwcs += len;
+				p += len;
 			}
 		}
 		else
-		{
 			/* we skip the char */
 			pwcs++;
-		}
 	}
 	if (p != pwcs)
 		*p = '\0';
@@ -308,23 +145,193 @@ mb_utf_validate(unsigned char *pwcs)
 * public functions : wcswidth and mbvalidate
 */

+/*
+ * pg_wcswidth is the dumb width function. It assumes that everything will
+ * only appear on one line. OTOH it is easier to use if this applies to you.
+ */
 int
-pg_wcswidth(const char *pwcs, size_t len, int encoding)
+pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding)
 {
-	if (encoding == PG_UTF8)
-		return mb_utf_wcswidth((const unsigned char *) pwcs, len);
-	else
+	int width = 0;
+
+	while (len > 0)
 	{
-		/*
-		 * obviously, other encodings may want to fix this, but I don't know
-		 * them myself, unfortunately.
-		 */
-		return len;
+		int chlen, chwidth;
+
+		chlen = PQmblen(pwcs, encoding);
+		if (chlen > len)
+			break;     /* Invalid string */
+			
+		chwidth = PQdsplen(pwcs, encoding);
+		
+		if (chwidth > 0)
+			width += chwidth;
+		pwcs += chlen;
 	}
+	return width;
 }

-char *
-mbvalidate(char *pwcs, int encoding)
+/*
+ * pg_wcssize takes the given string in the given encoding and returns three
+ * values:
+ *    result_width: Width in display character of longest line in string
+ *    result_hieght: Number of lines in display output
+ *    result_format_size: Number of bytes required to store formatted representation of string
+ */
+int
+pg_wcssize(unsigned char *pwcs, size_t len, int encoding, int *result_width,
+			int *result_height, int *result_format_size)
+{
+	int	w,
+		chlen = 0,
+		linewidth = 0;
+	int width = 0;
+	int height = 1;
+	int format_size = 0;
+
+	for (; *pwcs && len > 0; pwcs += chlen)
+	{
+		chlen = PQmblen(pwcs, encoding);
+		if (len < (size_t)chlen)
+			break;
+		w = PQdsplen(pwcs, encoding);
+
+		if (chlen == 1)   /* ASCII char */
+		{
+			if (*pwcs == '\n') /* Newline */
+			{
+				if (linewidth > width)
+					width = linewidth;
+				linewidth = 0;
+				height += 1;
+				format_size += 1;  /* For NUL char */
+			}
+			else if (*pwcs == '\r')   /* Linefeed */
+			{
+				linewidth += 2;
+				format_size += 2;
+			}
+			else if (w <= 0)  /* Other control char */
+			{
+				linewidth += 4;
+				format_size += 4;
+			}
+			else  /* Output itself */
+			{
+				linewidth++;
+				format_size += 1;
+			}
+		}
+		else if (w <= 0)  /* Non-ascii control char */
+		{
+			linewidth += 6;   /* \u0000 */
+			format_size += 6;
+		}
+		else  /* All other chars */
+		{
+			linewidth += w;
+			format_size += chlen;
+		}
+		len -= chlen;
+	}
+	if (linewidth > width)
+		width = linewidth;
+	format_size += 1;
+	
+	/* Set results */
+	if (result_width)
+		*result_width = width;
+	if (result_height)
+		*result_height = height;
+	if (result_format_size)
+		*result_format_size = format_size;
+		
+	return width;
+}
+
+void
+pg_wcsformat(unsigned char *pwcs, size_t len, int encoding,
+			 struct lineptr *lines, int count)
+{
+	int			w,
+				chlen = 0;
+	int linewidth = 0;
+	
+	char *ptr = lines->ptr;   /* Pointer to data area */
+
+	for (; *pwcs && len > 0; pwcs += chlen)
+	{
+		chlen = PQmblen(pwcs,encoding);
+		if (len < (size_t)chlen)
+			break;
+		w = PQdsplen(pwcs,encoding);
+
+		if (chlen == 1)   /* single byte char char */
+		{
+			if (*pwcs == '\n') /* Newline */
+			{
+				*ptr++ = 0;   /* NULL char */
+				lines->width = linewidth;
+				linewidth = 0;
+				lines++;
+				count--;
+				if (count == 0)
+					exit(1);   /* Screwup */	
+					
+				lines->ptr = ptr;
+			}
+			else if (*pwcs == '\r')   /* Linefeed */
+			{
+				strcpy(ptr, "\\r");
+				linewidth += 2;
+				ptr += 2;
+			}
+			else if (w <= 0)  /* Other control char */
+			{
+				sprintf(ptr, "\\x%02X", *pwcs);
+				linewidth += 4;
+				ptr += 4;
+			}
+			else  /* Output itself */
+			{
+				linewidth++;
+				*ptr++ = *pwcs;
+			}
+		}
+		else if (w <= 0)  /* Non-ascii control char */
+		{
+			if (encoding == PG_UTF8)
+				sprintf(ptr, "\\u%04X", utf2ucs(pwcs));
+			else
+				/* This case cannot happen in the current
+				 * code because only UTF-8 signals multibyte
+				 * control characters. But we may need to
+				 * support it at some stage */
+				sprintf(ptr, "\\u????");
+				
+			ptr += 6;
+			linewidth += 6;
+		}
+		else  /* All other chars */
+		{
+			int i;
+			for (i=0; i < chlen; i++)
+				*ptr++ = pwcs[i];
+			linewidth += w;
+		}
+		len -= chlen;
+	}
+	*ptr++ = 0;
+	lines->width = linewidth;
+	lines++;
+	count--;
+	if (count > 0)
+		lines->ptr = NULL;
+	return;
+}
+
+unsigned char *
+mbvalidate(unsigned char *pwcs, int encoding)
 {
 	if (encoding == PG_UTF8)
 		mb_utf_validate((unsigned char *) pwcs);