1
0
mirror of https://github.com/postgres/postgres.git synced 2025-07-30 11:03:19 +03:00

Support RN (roman-numeral format) in to_number().

We've long had roman-numeral output support in to_char(),
but lacked the reverse conversion.  Here it is.

Author: Hunaid Sohail <hunaidpgml@gmail.com>
Reviewed-by: Maciek Sakrejda <m.sakrejda@gmail.com>
Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us>
Reviewed-by: Tomas Vondra <tomas@vondra.me>
Discussion: https://postgr.es/m/CAMWA6ybh4M1VQqpmnu2tfSwO+3gAPeA8YKnMHVADeB=XDEvT_A@mail.gmail.com
This commit is contained in:
Tom Lane
2025-01-22 15:18:40 -05:00
parent f0ee648527
commit 172e6b3adb
4 changed files with 350 additions and 53 deletions

View File

@ -8669,8 +8669,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
<entry>plus/minus sign in specified position</entry> <entry>plus/minus sign in specified position</entry>
</row> </row>
<row> <row>
<entry><literal>RN</literal></entry> <entry><literal>RN</literal> or <literal>rn</literal></entry>
<entry>Roman numeral (input between 1 and 3999)</entry> <entry>Roman numeral (values between 1 and 3999)</entry>
</row> </row>
<row> <row>
<entry><literal>TH</literal> or <literal>th</literal></entry> <entry><literal>TH</literal> or <literal>th</literal></entry>
@ -8798,6 +8798,19 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
(e.g., <literal>9.99EEEE</literal> is a valid pattern). (e.g., <literal>9.99EEEE</literal> is a valid pattern).
</para> </para>
</listitem> </listitem>
<listitem>
<para>
In <function>to_number()</function>, the <literal>RN</literal>
pattern converts Roman numerals (in standard form) to numbers.
Input is case-insensitive, so <literal>RN</literal>
and <literal>rn</literal> are equivalent. <literal>RN</literal>
cannot be used in combination with any other formatting patterns or
modifiers except <literal>FM</literal>, which is applicable only
in <function>to_char()</function> and is ignored
in <function>to_number()</function>.
</para>
</listitem>
</itemizedlist> </itemizedlist>
</para> </para>

View File

@ -49,7 +49,6 @@
* - better number building (formatting) / parsing, now it isn't * - better number building (formatting) / parsing, now it isn't
* ideal code * ideal code
* - use Assert() * - use Assert()
* - add support for roman number to standard number conversion
* - add support for number spelling * - add support for number spelling
* - add support for string to string formatting (we must be better * - add support for string to string formatting (we must be better
* than Oracle :-), * than Oracle :-),
@ -257,13 +256,39 @@ static const char *const rm_months_lower[] =
{"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL}; {"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL};
/* ---------- /* ----------
* Roman numbers * Roman numerals
* ---------- * ----------
*/ */
static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL}; static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL};
static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL}; static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL};
static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM", NULL}; static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM", NULL};
/*
 * IS_VALID_SUB_COMB(curr, next)
 *		Nonzero (1) when numeral "curr" immediately followed by numeral "next"
 *		forms one of the subtractive pairs IV, IX, XL, XC, CD or CM; zero for
 *		every other pair.
 *
 * The comparison is against upper-case letters only.  The arguments may be
 * evaluated more than once, so pass side-effect-free expressions.
 */
#define IS_VALID_SUB_COMB(curr, next) \
	((curr) == 'I' ? ((next) == 'V' || (next) == 'X') : \
	 (curr) == 'X' ? ((next) == 'L' || (next) == 'C') : \
	 (curr) == 'C' ? ((next) == 'D' || (next) == 'M') : \
	 0)
/*
 * ROMAN_VAL(r)
 *		Integer value of the upper-case roman numeral character "r":
 *		M=1000, D=500, C=100, L=50, X=10, V=5, I=1.  Any other character
 *		(including lower-case letters) yields 0.
 *
 * The argument may be evaluated several times, so pass a side-effect-free
 * expression.
 */
#define ROMAN_VAL(r) \
	((r) == 'M' ? 1000 : \
	 (r) == 'D' ? 500 : \
	 (r) == 'C' ? 100 : \
	 (r) == 'L' ? 50 : \
	 (r) == 'X' ? 10 : \
	 (r) == 'V' ? 5 : \
	 (r) == 'I' ? 1 : 0)
/*
 * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 characters).
 *
 * The length does not include a string terminator, so a buffer that holds a
 * numeral as a NUL-terminated string needs MAX_ROMAN_LEN + 1 bytes.
 */
#define MAX_ROMAN_LEN 15
/* ---------- /* ----------
* Ordinal postfixes * Ordinal postfixes
* ---------- * ----------
@ -1028,6 +1053,15 @@ typedef struct NUMProc
#define DCH_TIMED 0x02 #define DCH_TIMED 0x02
#define DCH_ZONED 0x04 #define DCH_ZONED 0x04
/*
 * These macros are used in NUM_processor() and its subsidiary routines.
 * OVERLOAD_TEST: true if we've reached end of input string
 * AMOUNT_TEST(s): true if at least s bytes remain in string
 *
 * Both expand to expressions that refer to "Np" and "input_len" by name, so
 * they can only be used where a pointer named Np (with inout and inout_p
 * members) and an input_len value are in scope.
 */
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
/* ---------- /* ----------
* Functions * Functions
* ---------- * ----------
@ -1075,6 +1109,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
static char *fill_str(char *str, int c, int max); static char *fill_str(char *str, int c, int max);
static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree); static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
static char *int_to_roman(int number); static char *int_to_roman(int number);
static int roman_to_int(NUMProc *Np, int input_len);
static void NUM_prepare_locale(NUMProc *Np); static void NUM_prepare_locale(NUMProc *Np);
static char *get_last_relevant_decnum(char *num); static char *get_last_relevant_decnum(char *num);
static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len); static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@ -1285,6 +1320,10 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
case NUM_rn: case NUM_rn:
case NUM_RN: case NUM_RN:
if (IS_ROMAN(num))
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("cannot use \"RN\" twice")));
num->flag |= NUM_F_ROMAN; num->flag |= NUM_F_ROMAN;
break; break;
@ -1316,6 +1355,13 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
num->flag |= NUM_F_EEEE; num->flag |= NUM_F_EEEE;
break; break;
} }
if (IS_ROMAN(num) &&
(num->flag & ~(NUM_F_ROMAN | NUM_F_FILLMODE)) != 0)
ereport(ERROR,
(errcode(ERRCODE_SYNTAX_ERROR),
errmsg("\"RN\" is incompatible with other formats"),
errdetail("\"RN\" may only be used together with \"FM\".")));
} }
/* ---------- /* ----------
@ -4956,7 +5002,7 @@ int_to_roman(int number)
*result, *result,
numstr[12]; numstr[12];
result = (char *) palloc(16); result = (char *) palloc(MAX_ROMAN_LEN + 1);
*result = '\0'; *result = '\0';
/* /*
@ -4966,7 +5012,7 @@ int_to_roman(int number)
*/ */
if (number > 3999 || number < 1) if (number > 3999 || number < 1)
{ {
fill_str(result, '#', 15); fill_str(result, '#', MAX_ROMAN_LEN);
return result; return result;
} }
@ -5000,6 +5046,157 @@ int_to_roman(int number)
return result; return result;
} }
/*
 * Helper for roman_to_int(): apply the V/L/D rules to a single numeral.
 *
 * V, L and D may each be used at most once, and once one of them has been
 * seen, no numeral of equal or greater value may follow it.  Returns false
 * if "numeral" (whose value is "value") breaks that rule; otherwise notes
 * the numeral in the matching counter and returns true.
 */
static bool
roman_note_vld(char numeral, int value, int *vSeen, int *lSeen, int *dSeen)
{
	if (*vSeen && value >= ROMAN_VAL('V'))
		return false;
	if (*lSeen && value >= ROMAN_VAL('L'))
		return false;
	if (*dSeen && value >= ROMAN_VAL('D'))
		return false;

	switch (numeral)
	{
		case 'V':
			(*vSeen)++;
			break;
		case 'L':
			(*lSeen)++;
			break;
		case 'D':
			(*dSeen)++;
			break;
		default:
			break;
	}

	return true;
}

/*
 * Parse a standard-form roman numeral from the input and return its value.
 *
 * Leading whitespace is skipped, then up to MAX_ROMAN_LEN roman numeral
 * characters (either case) are consumed.  Np->inout_p is left pointing just
 * past the last character consumed; input_len is the total length of the
 * input string, as needed by OVERLOAD_TEST.
 *
 * Returns the value (1 .. 3999), or -1 if no numeral characters were found
 * or the characters do not form a valid numeral.  Note that Np->inout_p has
 * already been advanced when -1 is returned.
 */
static int
roman_to_int(NUMProc *Np, int input_len)
{
	char		numerals[MAX_ROMAN_LEN];
	int			values[MAX_ROMAN_LEN];
	int			count = 0;
	int			total = 0;
	int			runLength = 1;
	int			vSeen = 0;
	int			lSeen = 0;
	int			dSeen = 0;
	bool		afterSubtraction = false;
	int			subtractedValue = 0;
	int			pos;

	/*
	 * Step over leading whitespace.  No limit is placed on how much is
	 * skipped; bounding it by MAX_ROMAN_LEN seems unnecessarily picky.
	 */
	for (;;)
	{
		if (OVERLOAD_TEST)
			break;
		if (!isspace((unsigned char) *Np->inout_p))
			break;
		Np->inout_p++;
	}

	/*
	 * First pass: gather the upper-cased numerals and their values, stopping
	 * at end of input, at the first non-numeral character, or after
	 * MAX_ROMAN_LEN characters.  Decoding up front avoids repeated decoding
	 * and lets the validation pass know when it is at the last numeral.
	 */
	while (count < MAX_ROMAN_LEN && !OVERLOAD_TEST)
	{
		char		upper = pg_ascii_toupper(*Np->inout_p);
		int			value = ROMAN_VAL(upper);

		if (value == 0)
			break;				/* not a roman numeral character */

		numerals[count] = upper;
		values[count] = value;
		count++;
		Np->inout_p++;
	}

	if (count == 0)
		return -1;				/* nothing to convert */

	/* Second pass: validate the sequence and accumulate its value. */
	pos = 0;
	while (pos < count)
	{
		char		thisChar = numerals[pos];
		int			thisValue = values[pos];
		char		followChar;
		int			followValue;

		/*
		 * After a subtractive pair, every later numeral must be smaller than
		 * the numeral that was subtracted.
		 */
		if (afterSubtraction && thisValue >= subtractedValue)
			return -1;

		/* V, L and D: no repeats, and nothing as large may follow them. */
		if (!roman_note_vld(thisChar, thisValue, &vSeen, &lSeen, &dSeen))
			return -1;

		/* The final numeral has no successor to compare with; just add it. */
		if (pos == count - 1)
		{
			total += thisValue;
			break;
		}

		followChar = numerals[pos + 1];
		followValue = values[pos + 1];

		if (thisValue >= followValue)
		{
			/* Plain additive numeral; allow at most three in a row. */
			if (thisChar == followChar)
			{
				runLength++;
				if (runLength > 3)
					return -1;
			}
			else
				runLength = 1;

			total += thisValue;
			pos++;
			continue;
		}

		/*
		 * The numeral is smaller than its successor, so the two must form a
		 * subtractive pair.
		 */
		if (!IS_VALID_SUB_COMB(thisChar, followChar))
			return -1;

		/*
		 * A repeated numeral cannot also be used subtractively (e.g. 'MCCM'
		 * or 'DCCCD').
		 */
		if (runLength > 1)
			return -1;

		/*
		 * The successor is consumed here and never visited on its own, so
		 * apply to it now the V/L/D checks it would otherwise have received.
		 */
		if (!roman_note_vld(followChar, followValue, &vSeen, &lSeen, &dSeen))
			return -1;

		runLength = 1;
		afterSubtraction = true;
		subtractedValue = thisValue;
		total += (followValue - thisValue);

		/* Move past both numerals of the pair. */
		pos += 2;
	}

	return total;
}
/* ---------- /* ----------
@ -5112,14 +5309,6 @@ get_last_relevant_decnum(char *num)
return result; return result;
} }
/*
* These macros are used in NUM_processor() and its subsidiary routines.
* OVERLOAD_TEST: true if we've reached end of input string
* AMOUNT_TEST(s): true if at least s bytes remain in string
*/
#define OVERLOAD_TEST (Np->inout_p >= Np->inout + input_len)
#define AMOUNT_TEST(s) (Np->inout_p <= Np->inout + (input_len - (s)))
/* ---------- /* ----------
* Number extraction for TO_NUMBER() * Number extraction for TO_NUMBER()
* ---------- * ----------
@ -5576,29 +5765,6 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
return strcpy(inout, number); return strcpy(inout, number);
} }
/*
* Roman correction
*/
if (IS_ROMAN(Np->Num))
{
if (!Np->is_to_char)
ereport(ERROR,
(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
errmsg("\"RN\" not supported for input")));
Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
if (IS_FILLMODE(Np->Num))
{
Np->Num->flag = 0;
Np->Num->flag |= NUM_F_FILLMODE;
}
else
Np->Num->flag = 0;
Np->Num->flag |= NUM_F_ROMAN;
}
/* /*
* Sign * Sign
*/ */
@ -5849,28 +6015,35 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
break; break;
case NUM_RN: case NUM_RN:
if (IS_FILLMODE(Np->Num))
{
strcpy(Np->inout_p, Np->number_p);
Np->inout_p += strlen(Np->inout_p) - 1;
}
else
{
sprintf(Np->inout_p, "%15s", Np->number_p);
Np->inout_p += strlen(Np->inout_p) - 1;
}
break;
case NUM_rn: case NUM_rn:
if (IS_FILLMODE(Np->Num)) if (Np->is_to_char)
{ {
strcpy(Np->inout_p, asc_tolower_z(Np->number_p)); const char *number_p;
if (n->key->id == NUM_rn)
number_p = asc_tolower_z(Np->number_p);
else
number_p = Np->number_p;
if (IS_FILLMODE(Np->Num))
strcpy(Np->inout_p, number_p);
else
sprintf(Np->inout_p, "%15s", number_p);
Np->inout_p += strlen(Np->inout_p) - 1; Np->inout_p += strlen(Np->inout_p) - 1;
} }
else else
{ {
sprintf(Np->inout_p, "%15s", asc_tolower_z(Np->number_p)); int roman_result = roman_to_int(Np, input_len);
Np->inout_p += strlen(Np->inout_p) - 1; int numlen;
if (roman_result < 0)
ereport(ERROR,
(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
errmsg("invalid Roman numeral")));
numlen = sprintf(Np->number_p, "%d", roman_result);
Np->number_p += numlen;
Np->Num->pre = numlen;
Np->Num->post = 0;
continue; /* roman_to_int ate all the chars */
} }
break; break;

View File

@ -2384,6 +2384,84 @@ SELECT to_number('123456', '99999V99');
1234.560000000000000000 1234.560000000000000000
(1 row) (1 row)
-- Test for correct conversion between numbers and Roman numerals
WITH rows AS
(SELECT i, to_char(i, 'RN') AS roman FROM generate_series(1, 3999) AS i)
SELECT
bool_and(to_number(roman, 'RN') = i) as valid
FROM rows;
valid
-------
t
(1 row)
-- Some additional tests for RN input
SELECT to_number('CvIiI', 'rn');
to_number
-----------
108
(1 row)
SELECT to_number('MMXX ', 'RN');
to_number
-----------
2020
(1 row)
SELECT to_number(' XIV', ' RN');
to_number
-----------
14
(1 row)
SELECT to_number(' XIV ', ' RN');
to_number
-----------
14
(1 row)
SELECT to_number('M CC', 'RN');
to_number
-----------
1000
(1 row)
-- error cases
SELECT to_number('viv', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('DCCCD', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('XIXL', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('MCCM', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('MMMM', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('VV', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('IL', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('VIX', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('LXC', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('DCM', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('MMMDCM', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('CLXC', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('CM', 'MIRN');
ERROR: "RN" is incompatible with other formats
DETAIL: "RN" may only be used together with "FM".
SELECT to_number('CM', 'RNRN');
ERROR: cannot use "RN" twice
SELECT to_number('qiv', 'RN');
ERROR: invalid Roman numeral
SELECT to_number('', 'RN');
ERROR: invalid input syntax for type numeric: " "
SELECT to_number(' ', 'RN');
ERROR: invalid Roman numeral
RESET lc_numeric; RESET lc_numeric;
-- --
-- Input syntax -- Input syntax

View File

@ -1085,6 +1085,39 @@ SELECT to_number('1234.56','L99,999.99');
SELECT to_number('1,234.56','L99,999.99'); SELECT to_number('1,234.56','L99,999.99');
SELECT to_number('42nd', '99th'); SELECT to_number('42nd', '99th');
SELECT to_number('123456', '99999V99'); SELECT to_number('123456', '99999V99');
-- Test for correct conversion between numbers and Roman numerals
WITH rows AS
(SELECT i, to_char(i, 'RN') AS roman FROM generate_series(1, 3999) AS i)
SELECT
bool_and(to_number(roman, 'RN') = i) as valid
FROM rows;
-- Some additional tests for RN input
SELECT to_number('CvIiI', 'rn');
SELECT to_number('MMXX ', 'RN');
SELECT to_number(' XIV', ' RN');
SELECT to_number(' XIV ', ' RN');
SELECT to_number('M CC', 'RN');
-- error cases
SELECT to_number('viv', 'RN');
SELECT to_number('DCCCD', 'RN');
SELECT to_number('XIXL', 'RN');
SELECT to_number('MCCM', 'RN');
SELECT to_number('MMMM', 'RN');
SELECT to_number('VV', 'RN');
SELECT to_number('IL', 'RN');
SELECT to_number('VIX', 'RN');
SELECT to_number('LXC', 'RN');
SELECT to_number('DCM', 'RN');
SELECT to_number('MMMDCM', 'RN');
SELECT to_number('CLXC', 'RN');
SELECT to_number('CM', 'MIRN');
SELECT to_number('CM', 'RNRN');
SELECT to_number('qiv', 'RN');
SELECT to_number('', 'RN');
SELECT to_number(' ', 'RN');
RESET lc_numeric; RESET lc_numeric;
-- --