1
0
mirror of https://github.com/postgres/postgres.git synced 2025-06-08 22:02:03 +03:00
postgres/src/fe_utils/mbprint.c
Tom Lane 382ceffdf7 Phase 3 of pgindent updates.
Don't move parenthesized lines to the left, even if that means they
flow past the right margin.

By default, BSD indent lines up statement continuation lines that are
within parentheses so that they start just to the right of the preceding
left parenthesis.  However, traditionally, if that resulted in the
continuation line extending to the right of the desired right margin,
then indent would push it left just far enough to not overrun the margin,
if it could do so without making the continuation line start to the left of
the current statement indent.  That makes for a weird mix of indentations
unless one has been completely rigid about never violating the 80-column
limit.

This behavior has been pretty universally panned by Postgres developers.
Hence, disable it with indent's new -lpl switch, so that parenthesized
lines are always lined up with the preceding left paren.

This patch is much less interesting than the first round of indent
changes, but also bulkier, so I thought it best to separate the effects.

Discussion: https://postgr.es/m/E1dAmxK-0006EE-1r@gemulon.postgresql.org
Discussion: https://postgr.es/m/30527.1495162840@sss.pgh.pa.us
2017-06-21 15:35:54 -04:00

406 lines
8.6 KiB
C

/*-------------------------------------------------------------------------
*
* Multibyte character printing support for frontend code
*
*
* Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
* Portions Copyright (c) 1994, Regents of the University of California
*
* src/fe_utils/mbprint.c
*
*-------------------------------------------------------------------------
*/
#include "postgres_fe.h"
#include "fe_utils/mbprint.h"
#include "libpq-fe.h"
/*
* To avoid version-skew problems, this file must not use declarations
* from pg_wchar.h: the encoding IDs we are dealing with are determined
* by the libpq.so we are linked with, and that might not match the
* numbers we see at compile time. (If this file were inside libpq,
* the problem would go away...)
*
* Hence, we have our own definition of pg_wchar, and we get the values
* of any needed encoding IDs on-the-fly.
*/
typedef unsigned int pg_wchar;

/*
 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
 *
 * The lookup is done on the first call only; the answer is kept in a
 * function-local static and handed back on every later call.  A negative
 * lookup result is not cached, so in that case the lookup is repeated on
 * the next call.
 */
static int
pg_get_utf8_id(void)
{
	static int	cached_id = -1;

	if (cached_id >= 0)
		return cached_id;

	cached_id = pg_char_to_encoding("utf8");
	return cached_id;
}

#define PG_UTF8 pg_get_utf8_id()
/*
* Convert a UTF-8 character to a Unicode code point.
* This is a one-character version of pg_utf2wchar_with_len.
*
* No error checks here, c must point to a long-enough string.
*/
static pg_wchar
utf8_to_unicode(const unsigned char *c)
{
if ((*c & 0x80) == 0)
return (pg_wchar) c[0];
else if ((*c & 0xe0) == 0xc0)
return (pg_wchar) (((c[0] & 0x1f) << 6) |
(c[1] & 0x3f));
else if ((*c & 0xf0) == 0xe0)
return (pg_wchar) (((c[0] & 0x0f) << 12) |
((c[1] & 0x3f) << 6) |
(c[2] & 0x3f));
else if ((*c & 0xf8) == 0xf0)
return (pg_wchar) (((c[0] & 0x07) << 18) |
((c[1] & 0x3f) << 12) |
((c[2] & 0x3f) << 6) |
(c[3] & 0x3f));
else
/* that is an invalid code on purpose */
return 0xffffffff;
}
/*
 * Unicode 3.1 compliant validation of the UTF-8 sequence starting at c.
 *
 * Returns the length of the sequence in bytes (1..4) if it is acceptable,
 * or -1 if it is not.  A sequence is rejected when its lead or continuation
 * bytes are malformed, when it is an overlong encoding, or when it decodes
 * to one of the following UCS values:
 *		ucs > 0x10ffff
 *		(ucs & 0xfffe) == 0xfffe		(U+xxFFFE and U+xxFFFF)
 *		0xfdd0 <= ucs <= 0xfdef
 *		0xd800 <= ucs <= 0xdfff			(surrogates)
 *
 * c must point into a NUL-terminated string.  The continuation bytes are
 * tested strictly left to right with short-circuit evaluation, so a byte is
 * only read once every byte before it is known to be non-NUL.
 */
static int
utf_charcheck(const unsigned char *c)
{
	if ((*c & 0x80) == 0)
		return 1;
	else if ((*c & 0xe0) == 0xc0)
	{
		/* two-byte char; payload bits <= 1 would be an overlong encoding */
		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
			return 2;
		return -1;
	}
	else if ((*c & 0xf0) == 0xe0)
	{
		/*
		 * three-byte char; the middle condition rejects overlong encodings
		 * (code points below 0x800)
		 */
		if (((c[1] & 0xc0) == 0x80) &&
			(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
			((c[2] & 0xc0) == 0x80))
		{
			/* all three bytes are known good, so decode the code point */
			unsigned int ucs = ((unsigned int) (c[0] & 0x0f) << 12) |
				((unsigned int) (c[1] & 0x3f) << 6) |
				(unsigned int) (c[2] & 0x3f);

			/* 0xfffe and 0xffff */
			if ((ucs & 0xfffe) == 0xfffe)
				return -1;
			/* 0xfdd0..0xfdef */
			if (ucs >= 0xfdd0 && ucs <= 0xfdef)
				return -1;
			/* surrogates */
			if (ucs >= 0xd800 && ucs <= 0xdfff)
				return -1;
			return 3;
		}
		return -1;
	}
	else if ((*c & 0xf8) == 0xf0)
	{
		/*
		 * u is the plane number (code point bits 16..20).  Reading c[1]
		 * here is safe because c[0] is known to be non-NUL.
		 */
		int			u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

		/* four-byte char; 1 <= u <= 0x10 means 0x10000 <= ucs <= 0x10ffff */
		if (((c[1] & 0xc0) == 0x80) &&
			(u > 0x00) && (u <= 0x10) &&
			((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
		{
			/* test for 0xzzzzfffe/0xzzzzffff */
			if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
				((c[3] & 0x3e) == 0x3e))
				return -1;
			return 4;
		}
		return -1;
	}
	return -1;
}
/*
 * Strip, in place, every byte of the NUL-terminated string pwcs that
 * utf_charcheck() does not accept as part of a valid sequence.
 *
 * Accepted sequences are moved down over any gap left by dropped bytes;
 * a rejected byte is skipped one byte at a time.  Nothing is written as
 * long as no byte has been dropped, and the terminator is only rewritten
 * when the string actually got shorter.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
	unsigned char *dst = pwcs;	/* where the next kept byte goes */

	while (*pwcs != '\0')
	{
		int			nbytes = utf_charcheck(pwcs);

		if (nbytes <= 0)
		{
			/* we skip the char */
			pwcs++;
			continue;
		}

		if (dst == pwcs)
		{
			/* nothing dropped so far: the bytes are already in place */
			dst += nbytes;
			pwcs += nbytes;
			continue;
		}

		/* close the gap by copying the sequence down */
		while (nbytes-- > 0)
			*dst++ = *pwcs++;
	}

	if (dst != pwcs)
		*dst = '\0';
}
/*
* public functions : wcswidth and mbvalidate
*/
/*
 * pg_wcswidth: the simple display-width function.
 *
 * Adds up the display width, as reported by PQdsplen(), of each character
 * in the first len bytes of pwcs, treating the text as one single line.
 * Characters whose reported width is zero or negative contribute nothing.
 * Scanning stops early if a character claims more bytes than are left.
 * Easier to use than pg_wcssize when the one-line assumption holds.
 */
int
pg_wcswidth(const char *pwcs, size_t len, int encoding)
{
	const char *cur = pwcs;
	size_t		remaining = len;
	int			total = 0;

	while (remaining != 0)
	{
		int			nbytes = PQmblen(cur, encoding);
		int			cols;

		if ((size_t) nbytes > remaining)
			break;				/* Invalid string */

		cols = PQdsplen(cur, encoding);
		total += (cols > 0) ? cols : 0;

		cur += nbytes;
		remaining -= nbytes;
	}

	return total;
}
/*
 * pg_wcssize measures the given string in the given encoding and reports,
 * through whichever of the output pointers are non-NULL:
 *	result_width: display width of the widest line in the string
 *	result_height: number of lines in the display output
 *	result_format_size: number of bytes needed to hold the formatted
 *		representation of the string
 *
 * At most len bytes are examined, and scanning also stops at a NUL byte or
 * at a character that claims more bytes than remain.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(const unsigned char *pwcs, size_t len, int encoding,
		   int *result_width, int *result_height, int *result_format_size)
{
	int			cur_width = 0;	/* width of the line being scanned */
	int			max_width = 0;	/* widest completed line so far */
	int			nlines = 1;
	int			nbytes = 0;

	while (*pwcs != '\0' && len > 0)
	{
		int			mblen = PQmblen((const char *) pwcs, encoding);
		int			dsplen;

		if (len < (size_t) mblen)
			break;
		dsplen = PQdsplen((const char *) pwcs, encoding);

		if (mblen != 1)
		{
			if (dsplen < 0)
			{
				/* Non-ascii control char, shown as \u0000 */
				cur_width += 6;
				nbytes += 6;
			}
			else
			{
				/* Any other multibyte char is copied verbatim */
				cur_width += dsplen;
				nbytes += mblen;
			}
		}
		else
		{
			/* single-byte char */
			switch (*pwcs)
			{
				case '\n':
					/* Newline: close the current line */
					if (cur_width > max_width)
						max_width = cur_width;
					cur_width = 0;
					nlines++;
					nbytes++;	/* For NUL char */
					break;

				case '\r':
					/* Carriage return, shown as a two-character escape */
					cur_width += 2;
					nbytes += 2;
					break;

				case '\t':
					{
						/* Tab: pad with 1..8 blanks up to the next stop */
						int			pad = 8 - cur_width % 8;

						cur_width += pad;
						nbytes += pad;
					}
					break;

				default:
					if (dsplen < 0)
					{
						/* Other control char, shown as a 4-char escape */
						cur_width += 4;
						nbytes += 4;
					}
					else
					{
						/* Output it as-is */
						cur_width += dsplen;
						nbytes += 1;
					}
					break;
			}
		}

		len -= mblen;
		pwcs += mblen;
	}

	/* account for the last (unterminated) line */
	if (cur_width > max_width)
		max_width = cur_width;
	nbytes += 1;				/* For NUL char */

	/* Set results */
	if (result_width)
		*result_width = max_width;
	if (result_height)
		*result_height = nlines;
	if (result_format_size)
		*result_format_size = nbytes;
}
/*
* Format a string into one or more "struct lineptr" lines.
* lines[i].ptr == NULL indicates the end of the array.
*
* This MUST be kept in sync with pg_wcssize!
*/
void
pg_wcsformat(const unsigned char *pwcs, size_t len, int encoding,
struct lineptr *lines, int count)
{
int w,
chlen = 0;
int linewidth = 0;
unsigned char *ptr = lines->ptr; /* Pointer to data area */
for (; *pwcs && len > 0; pwcs += chlen)
{
chlen = PQmblen((const char *) pwcs, encoding);
if (len < (size_t) chlen)
break;
w = PQdsplen((const char *) pwcs, encoding);
if (chlen == 1) /* single-byte char */
{
if (*pwcs == '\n') /* Newline */
{
*ptr++ = '\0';
lines->width = linewidth;
linewidth = 0;
lines++;
count--;
if (count <= 0)
exit(1); /* Screwup */
/* make next line point to remaining memory */
lines->ptr = ptr;
}
else if (*pwcs == '\r') /* Linefeed */
{
strcpy((char *) ptr, "\\r");
linewidth += 2;
ptr += 2;
}
else if (*pwcs == '\t') /* Tab */
{
do
{
*ptr++ = ' ';
linewidth++;
} while (linewidth % 8 != 0);
}
else if (w < 0) /* Other control char */
{
sprintf((char *) ptr, "\\x%02X", *pwcs);
linewidth += 4;
ptr += 4;
}
else /* Output it as-is */
{
linewidth += w;
*ptr++ = *pwcs;
}
}
else if (w < 0) /* Non-ascii control char */
{
if (encoding == PG_UTF8)
sprintf((char *) ptr, "\\u%04X", utf8_to_unicode(pwcs));
else
{
/*
* This case cannot happen in the current code because only
* UTF-8 signals multibyte control characters. But we may need
* to support it at some stage
*/
sprintf((char *) ptr, "\\u????");
}
ptr += 6;
linewidth += 6;
}
else /* All other chars */
{
int i;
for (i = 0; i < chlen; i++)
*ptr++ = pwcs[i];
linewidth += w;
}
len -= chlen;
}
lines->width = linewidth;
*ptr++ = '\0'; /* Terminate formatted string */
if (count <= 0)
exit(1); /* Screwup */
(lines + 1)->ptr = NULL; /* terminate line array */
}
/*
 * Encoding validation: remove from pwcs, in place, any characters that
 * cannot be validated, and return pwcs.
 *
 * Only UTF-8 input is actually checked; a string in any other encoding is
 * returned untouched.
 *
 * This seems redundant with existing functionality elsewhere?
 */
unsigned char *
mbvalidate(unsigned char *pwcs, int encoding)
{
	/*
	 * other encodings needing validation should add their own routines
	 * here
	 */
	if (encoding != PG_UTF8)
		return pwcs;

	mb_utf_validate(pwcs);
	return pwcs;
}