mirror of
https://github.com/postgres/postgres.git
synced 2025-04-25 21:42:33 +03:00
the time, rather than hoping we can tell whether the terminal supports UTF8 characters. Per discussion.
404 lines
8.3 KiB
C
404 lines
8.3 KiB
C
/*
|
|
* psql - the PostgreSQL interactive terminal
|
|
*
|
|
* Copyright (c) 2000-2009, PostgreSQL Global Development Group
|
|
*
|
|
* $PostgreSQL: pgsql/src/bin/psql/mbprint.c,v 1.37 2009/11/25 20:26:31 tgl Exp $
|
|
*
|
|
* XXX this file does not really belong in psql/. Perhaps move to libpq?
|
|
* It also seems that the mbvalidate function is redundant with existing
|
|
* functionality.
|
|
*/
|
|
|
|
#include "postgres_fe.h"
|
|
#include "mbprint.h"
|
|
#include "libpq-fe.h"
|
|
#ifndef PGSCRIPTS
|
|
#include "settings.h"
|
|
#endif
|
|
|
|
/*
|
|
* To avoid version-skew problems, this file must not use declarations
|
|
* from pg_wchar.h: the encoding IDs we are dealing with are determined
|
|
* by the libpq.so we are linked with, and that might not match the
|
|
* numbers we see at compile time. (If this file were inside libpq,
|
|
* the problem would go away...)
|
|
*
|
|
* Hence, we have our own definition of pg_wchar, and we get the values
|
|
* of any needed encoding IDs on-the-fly.
|
|
*/
|
|
|
|
typedef unsigned int pg_wchar;

/*
 * Return the encoding ID that pg_char_to_encoding() reports for "utf8".
 *
 * Once a non-negative ID has been obtained it is remembered in a static
 * variable and returned directly on later calls; a negative result is not
 * remembered, so the lookup is retried next time.
 */
static int
pg_get_utf8_id(void)
{
	static int	cached_id = -1;

	if (cached_id >= 0)
		return cached_id;

	cached_id = pg_char_to_encoding("utf8");
	return cached_id;
}

#define PG_UTF8		pg_get_utf8_id()
|
|
|
|
|
|
static pg_wchar
|
|
utf2ucs(const unsigned char *c)
|
|
{
|
|
/*
|
|
* one char version of pg_utf2wchar_with_len. no control here, c must
|
|
* point to a large enough string
|
|
*/
|
|
if ((*c & 0x80) == 0)
|
|
return (pg_wchar) c[0];
|
|
else if ((*c & 0xe0) == 0xc0)
|
|
{
|
|
return (pg_wchar) (((c[0] & 0x1f) << 6) |
|
|
(c[1] & 0x3f));
|
|
}
|
|
else if ((*c & 0xf0) == 0xe0)
|
|
{
|
|
return (pg_wchar) (((c[0] & 0x0f) << 12) |
|
|
((c[1] & 0x3f) << 6) |
|
|
(c[2] & 0x3f));
|
|
}
|
|
else if ((*c & 0xf0) == 0xf0)
|
|
{
|
|
return (pg_wchar) (((c[0] & 0x07) << 18) |
|
|
((c[1] & 0x3f) << 12) |
|
|
((c[2] & 0x3f) << 6) |
|
|
(c[3] & 0x3f));
|
|
}
|
|
else
|
|
{
|
|
/* that is an invalid code on purpose */
|
|
return 0xffffffff;
|
|
}
|
|
}
|
|
|
|
|
|
/*
 * Validate the single UTF-8 encoded character starting at c.
 *
 * Returns the length in bytes (1..4) of the character if it is well formed
 * and acceptable, or -1 otherwise.  Besides malformed and overlong
 * sequences, the following code points are rejected:
 *		ucs > 0x10ffff
 *		(ucs & 0xfffe) == 0xfffe		(noncharacters U+xxFFFE / U+xxFFFF)
 *		0xfdd0 <= ucs <= 0xfdef			(noncharacters)
 *		0xd800 <= ucs <= 0xdfff			(surrogates)
 *
 * c must point at a non-NUL byte of a NUL-terminated string.  A trailing
 * byte is only examined once the byte before it has been examined and, for
 * the third and fourth bytes, found to be a continuation byte, so the
 * terminator is never read past.
 */
static int
utf_charcheck(const unsigned char *c)
{
	if ((*c & 0x80) == 0)
		return 1;
	else if ((*c & 0xe0) == 0xc0)
	{
		/* two-byte char; lead bytes 0xc0 and 0xc1 would be overlong */
		if (((c[1] & 0xc0) == 0x80) && ((c[0] & 0x1f) > 0x01))
			return 2;
		return -1;
	}
	else if ((*c & 0xf0) == 0xe0)
	{
		/*
		 * three-byte char; the middle condition rejects overlong forms
		 * (code points below 0x800)
		 */
		if (((c[1] & 0xc0) == 0x80) &&
			(((c[0] & 0x0f) != 0x00) || ((c[1] & 0x20) == 0x20)) &&
			((c[2] & 0xc0) == 0x80))
		{
			/* decode the full code point; low six bits come from c[2] */
			int			ucs = ((c[0] & 0x0f) << 12) |
							  ((c[1] & 0x3f) << 6) |
							  (c[2] & 0x3f);

			/* noncharacters U+FFFE and U+FFFF */
			if ((ucs & 0xfffe) == 0xfffe)
				return -1;
			/* noncharacters U+FDD0..U+FDEF */
			if (ucs >= 0xfdd0 && ucs <= 0xfdef)
				return -1;
			/* surrogates U+D800..U+DFFF */
			if (ucs >= 0xd800 && ucs <= 0xdfff)
				return -1;
			return 3;
		}
		return -1;
	}
	else if ((*c & 0xf8) == 0xf0)
	{
		/* u is the plane number (bits 16..20 of the code point) */
		int			u = ((c[0] & 0x07) << 2) | ((c[1] & 0x30) >> 4);

		/* four-byte char; u == 0 is overlong, u > 0x10 is beyond U+10FFFF */
		if (((c[1] & 0xc0) == 0x80) &&
			(u > 0x00) && (u <= 0x10) &&
			((c[2] & 0xc0) == 0x80) && ((c[3] & 0xc0) == 0x80))
		{
			/* test for 0xzzzzfffe/0xzzzzffff */
			if (((c[1] & 0x0f) == 0x0f) && ((c[2] & 0x3f) == 0x3f) &&
				((c[3] & 0x3e) == 0x3e))
				return -1;
			return 4;
		}
		return -1;
	}
	return -1;
}
|
|
|
|
|
|
/*
 * Strip, in place, every byte of the NUL-terminated string pwcs that
 * utf_charcheck() does not accept as the start of a character.
 *
 * Accepted characters are kept in order and moved down over any rejected
 * bytes; the string is re-terminated if anything was removed.
 */
static void
mb_utf_validate(unsigned char *pwcs)
{
	unsigned char *src = pwcs;	/* next byte to examine */
	unsigned char *dst = pwcs;	/* where the next kept byte goes */

	while (*src)
	{
		int			nbytes = utf_charcheck(src);

		if (nbytes <= 0)
		{
			/* rejected: drop this one byte and try again at the next */
			src++;
			continue;
		}

		if (dst == src)
		{
			/* nothing dropped so far, the character is already in place */
			src += nbytes;
			dst += nbytes;
		}
		else
		{
			/* shift the character down over the gap */
			while (nbytes-- > 0)
				*dst++ = *src++;
		}
	}

	if (dst != src)
		*dst = '\0';
}
|
|
|
|
/*
|
|
* public functions : wcswidth and mbvalidate
|
|
*/
|
|
|
|
/*
 * pg_wcswidth is the dumb width function. It assumes that everything will
 * only appear on one line. OTOH it is easier to use if this applies to you.
 *
 * Returns the sum of the display widths, as reported by PQdsplen(), of the
 * characters found in the first len bytes of pwcs.  Characters for which
 * PQdsplen() reports a width <= 0 add nothing.  Scanning stops early if a
 * character, per PQmblen(), would extend past the remaining len bytes.
 */
int
pg_wcswidth(const unsigned char *pwcs, size_t len, int encoding)
{
	int			width = 0;

	while (len > 0)
	{
		int			chlen,
					chwidth;

		chlen = PQmblen((const char *) pwcs, encoding);
		if (len < (size_t) chlen)
			break;				/* Invalid string */

		chwidth = PQdsplen((const char *) pwcs, encoding);
		if (chwidth > 0)
			width += chwidth;

		/* consume the character; without this the loop never ends at len */
		pwcs += chlen;
		len -= chlen;
	}
	return width;
}
|
|
|
|
/*
 * pg_wcssize measures the given string in the given encoding and reports
 * up to three values (each output pointer may be NULL to skip it):
 *	  result_width:		  display width of the widest line in the string
 *	  result_height:	  number of lines in the display output
 *	  result_format_size: number of bytes needed to hold the formatted
 *						  representation of the string, terminators included
 *
 * Scanning stops at a NUL byte, after len bytes, or when a character would
 * extend past the remaining len bytes.
 *
 * This MUST be kept in sync with pg_wcsformat!
 */
void
pg_wcssize(unsigned char *pwcs, size_t len, int encoding,
		   int *result_width, int *result_height, int *result_format_size)
{
	int			maxwidth = 0;	/* widest completed line so far */
	int			curwidth = 0;	/* width of the line being scanned */
	int			nlines = 1;
	int			nbytes = 0;

	while (*pwcs && len > 0)
	{
		int			chlen = PQmblen((char *) pwcs, encoding);
		int			dsplen;

		if (len < (size_t) chlen)
			break;
		dsplen = PQdsplen((char *) pwcs, encoding);

		if (chlen != 1)
		{
			if (dsplen < 0)
			{
				/* non-ASCII control char, shown as \uXXXX */
				curwidth += 6;
				nbytes += 6;
			}
			else
			{
				/* ordinary multibyte char, copied through unchanged */
				curwidth += dsplen;
				nbytes += chlen;
			}
		}
		else
		{
			switch (*pwcs)
			{
				case '\n':
					/* newline: close the current line, start another */
					if (curwidth > maxwidth)
						maxwidth = curwidth;
					curwidth = 0;
					nlines += 1;
					nbytes += 1;	/* for the line's NUL terminator */
					break;

				case '\r':
					/* carriage return, shown as the two characters \r */
					curwidth += 2;
					nbytes += 2;
					break;

				case '\t':
					/* tab: pad with at least one space to a multiple of 8 */
					do
					{
						curwidth++;
						nbytes++;
					} while (curwidth % 8 != 0);
					break;

				default:
					if (dsplen < 0)
					{
						/* other control char, shown as \xXX */
						curwidth += 4;
						nbytes += 4;
					}
					else
					{
						/* plain single-byte char, output as-is */
						curwidth += dsplen;
						nbytes += 1;
					}
					break;
			}
		}

		pwcs += chlen;
		len -= chlen;
	}

	/* account for the last (unterminated) line */
	if (curwidth > maxwidth)
		maxwidth = curwidth;
	nbytes += 1;				/* for the final NUL terminator */

	/* hand back whichever results the caller asked for */
	if (result_width)
		*result_width = maxwidth;
	if (result_height)
		*result_height = nlines;
	if (result_format_size)
		*result_format_size = nbytes;
}
|
|
|
|
/*
|
|
* Format a string into one or more "struct lineptr" lines.
|
|
* lines[i].ptr == NULL indicates the end of the array.
|
|
*
|
|
* This MUST be kept in sync with pg_wcssize!
|
|
*/
|
|
void
|
|
pg_wcsformat(unsigned char *pwcs, size_t len, int encoding,
|
|
struct lineptr * lines, int count)
|
|
{
|
|
int w,
|
|
chlen = 0;
|
|
int linewidth = 0;
|
|
unsigned char *ptr = lines->ptr; /* Pointer to data area */
|
|
|
|
for (; *pwcs && len > 0; pwcs += chlen)
|
|
{
|
|
chlen = PQmblen((char *) pwcs, encoding);
|
|
if (len < (size_t) chlen)
|
|
break;
|
|
w = PQdsplen((char *) pwcs, encoding);
|
|
|
|
if (chlen == 1) /* single-byte char */
|
|
{
|
|
if (*pwcs == '\n') /* Newline */
|
|
{
|
|
*ptr++ = '\0';
|
|
lines->width = linewidth;
|
|
linewidth = 0;
|
|
lines++;
|
|
count--;
|
|
if (count <= 0)
|
|
exit(1); /* Screwup */
|
|
|
|
/* make next line point to remaining memory */
|
|
lines->ptr = ptr;
|
|
}
|
|
else if (*pwcs == '\r') /* Linefeed */
|
|
{
|
|
strcpy((char *) ptr, "\\r");
|
|
linewidth += 2;
|
|
ptr += 2;
|
|
}
|
|
else if (*pwcs == '\t') /* Tab */
|
|
{
|
|
do
|
|
{
|
|
*ptr++ = ' ';
|
|
linewidth++;
|
|
} while (linewidth % 8 != 0);
|
|
}
|
|
else if (w < 0) /* Other control char */
|
|
{
|
|
sprintf((char *) ptr, "\\x%02X", *pwcs);
|
|
linewidth += 4;
|
|
ptr += 4;
|
|
}
|
|
else /* Output it as-is */
|
|
{
|
|
linewidth += w;
|
|
*ptr++ = *pwcs;
|
|
}
|
|
}
|
|
else if (w < 0) /* Non-ascii control char */
|
|
{
|
|
if (encoding == PG_UTF8)
|
|
sprintf((char *) ptr, "\\u%04X", utf2ucs(pwcs));
|
|
else
|
|
{
|
|
/*
|
|
* This case cannot happen in the current code because only
|
|
* UTF-8 signals multibyte control characters. But we may need
|
|
* to support it at some stage
|
|
*/
|
|
sprintf((char *) ptr, "\\u????");
|
|
}
|
|
ptr += 6;
|
|
linewidth += 6;
|
|
}
|
|
else /* All other chars */
|
|
{
|
|
int i;
|
|
|
|
for (i = 0; i < chlen; i++)
|
|
*ptr++ = pwcs[i];
|
|
linewidth += w;
|
|
}
|
|
len -= chlen;
|
|
}
|
|
lines->width = linewidth;
|
|
*ptr++ = '\0'; /* Terminate formatted string */
|
|
|
|
if (count <= 0)
|
|
exit(1); /* Screwup */
|
|
|
|
(lines + 1)->ptr = NULL; /* terminate line array */
|
|
}
|
|
|
|
unsigned char *
|
|
mbvalidate(unsigned char *pwcs, int encoding)
|
|
{
|
|
if (encoding == PG_UTF8)
|
|
mb_utf_validate((unsigned char *) pwcs);
|
|
else
|
|
{
|
|
/*
|
|
* other encodings needing validation should add their own routines
|
|
* here
|
|
*/
|
|
}
|
|
|
|
return pwcs;
|
|
}
|