Support RN (roman-numeral format) in to_number().

We've long had roman-numeral output support in to_char(), but lacked the reverse conversion. Here it is. Author: Hunaid Sohail <hunaidpgml@gmail.com> Reviewed-by: Maciek Sakrejda <m.sakrejda@gmail.com> Reviewed-by: Tom Lane <tgl@sss.pgh.pa.us> Reviewed-by: Tomas Vondra <tomas@vondra.me> Discussion: https://postgr.es/m/CAMWA6ybh4M1VQqpmnu2tfSwO+3gAPeA8YKnMHVADeB=XDEvT_A@mail.gmail.com
2025-11-19 13:42:17 +03:00 · 2025-01-22 15:18:40 -05:00
parent f0ee648527
commit 172e6b3adb
4 changed files with 350 additions and 53 deletions
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -8669,8 +8669,8 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
        <entry>plus/minus sign in specified position</entry>
       </row>
       <row>
-        <entry><literal>RN</literal></entry>
+        <entry><literal>RN</literal> or <literal>rn</literal></entry>
-        <entry>Roman numeral (input between 1 and 3999)</entry>
+        <entry>Roman numeral (values between 1 and 3999)</entry>
       </row>
       <row>
        <entry><literal>TH</literal> or <literal>th</literal></entry>
@@ -8798,6 +8798,19 @@ SELECT regexp_match('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}');
       (e.g., <literal>9.99EEEE</literal> is a valid pattern).
      </para>
     </listitem>
     <listitem>
      <para>
       In <function>to_number()</function>, the <literal>RN</literal>
       pattern converts Roman numerals (in standard form) to numbers.
       Input is case-insensitive, so <literal>RN</literal>
       and <literal>rn</literal> are equivalent.  <literal>RN</literal>
       cannot be used in combination with any other formatting patterns or
       modifiers except <literal>FM</literal>, which is applicable only
       in <function>to_char()</function> and is ignored
       in <function>to_number()</function>.
      </para>
     </listitem>
    </itemizedlist>
   </para>
--- a/src/backend/utils/adt/formatting.c
+++ b/src/backend/utils/adt/formatting.c
@@ -49,7 +49,6 @@
 *	- better number building (formatting) / parsing, now it isn't
 *		  ideal code
 *	- use Assert()
 *	- add support for roman number to standard number conversion
 *	- add support for number spelling
 *	- add support for string to string formatting (we must be better
 *	  than Oracle :-),
@@ -257,13 +256,39 @@ static const char *const rm_months_lower[] =
 {"xii", "xi", "x", "ix", "viii", "vii", "vi", "v", "iv", "iii", "ii", "i", NULL};
 /* ----------
- * Roman numbers
+ * Roman numerals
 * ----------
 */
 static const char *const rm1[] = {"I", "II", "III", "IV", "V", "VI", "VII", "VIII", "IX", NULL};
 static const char *const rm10[] = {"X", "XX", "XXX", "XL", "L", "LX", "LXX", "LXXX", "XC", NULL};
 static const char *const rm100[] = {"C", "CC", "CCC", "CD", "D", "DC", "DCC", "DCCC", "CM", NULL};
 /*
 * MACRO: Check if the current and next characters form a valid subtraction
 * combination for roman numerals.
 */
 #define IS_VALID_SUB_COMB(curr, next) \
 	(((curr) == 'I' && ((next) == 'V' || (next) == 'X')) || \
 	 ((curr) == 'X' && ((next) == 'L' || (next) == 'C')) || \
 	 ((curr) == 'C' && ((next) == 'D' || (next) == 'M')))
 /*
 * MACRO: Roman numeral value, or 0 if character isn't a roman numeral.
 */
 #define ROMAN_VAL(r) \
 	((r) == 'I' ? 1 : \
 	 (r) == 'V' ? 5 : \
 	 (r) == 'X' ? 10 : \
 	 (r) == 'L' ? 50 : \
 	 (r) == 'C' ? 100 : \
 	 (r) == 'D' ? 500 : \
 	 (r) == 'M' ? 1000 : 0)
 /*
 * 'MMMDCCCLXXXVIII' (3888) is the longest valid roman numeral (15 characters).
 */
 #define MAX_ROMAN_LEN	15
 /* ----------
 * Ordinal postfixes
 * ----------
@@ -1028,6 +1053,15 @@ typedef struct NUMProc
 #define DCH_TIMED	0x02
 #define DCH_ZONED	0x04
 /*
 * These macros are used in NUM_processor() and its subsidiary routines.
 * OVERLOAD_TEST: true if we've reached end of input string
 * AMOUNT_TEST(s): true if at least s bytes remain in string
 */
 #define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
 #define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
 /* ----------
 * Functions
 * ----------
@@ -1075,6 +1109,7 @@ static bool do_to_timestamp(text *date_txt, text *fmt, Oid collid, bool std,
 static char *fill_str(char *str, int c, int max);
 static FormatNode *NUM_cache(int len, NUMDesc *Num, text *pars_str, bool *shouldFree);
 static char *int_to_roman(int number);
 static int	roman_to_int(NUMProc *Np, int input_len);
 static void NUM_prepare_locale(NUMProc *Np);
 static char *get_last_relevant_decnum(char *num);
 static void NUM_numpart_from_char(NUMProc *Np, int id, int input_len);
@@ -1285,6 +1320,10 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 		case NUM_rn:
 		case NUM_RN:
 			if (IS_ROMAN(num))
 				ereport(ERROR,
 						(errcode(ERRCODE_SYNTAX_ERROR),
 						 errmsg("cannot use \"RN\" twice")));
 			num->flag |= NUM_F_ROMAN;
 			break;
@@ -1316,6 +1355,13 @@ NUMDesc_prepare(NUMDesc *num, FormatNode *n)
 			num->flag |= NUM_F_EEEE;
 			break;
 	}
 	if (IS_ROMAN(num) &&
 		(num->flag & ~(NUM_F_ROMAN | NUM_F_FILLMODE)) != 0)
 		ereport(ERROR,
 				(errcode(ERRCODE_SYNTAX_ERROR),
 				 errmsg("\"RN\" is incompatible with other formats"),
 				 errdetail("\"RN\" may only be used together with \"FM\".")));
 }
 /* ----------
@@ -4956,7 +5002,7 @@ int_to_roman(int number)
 			   *result,
 				numstr[12];
-	result = (char *) palloc(16);
+	result = (char *) palloc(MAX_ROMAN_LEN + 1);
 	*result = '\0';
 	/*
@@ -4966,7 +5012,7 @@ int_to_roman(int number)
 	 */
 	if (number > 3999 || number < 1)
 	{
-		fill_str(result, '#', 15);
+		fill_str(result, '#', MAX_ROMAN_LEN);
 		return result;
 	}
@@ -5000,6 +5046,157 @@ int_to_roman(int number)
 	return result;
 }
 /*
 * Convert a roman numeral (standard form) to an integer.
 * Result is an integer between 1 and 3999.
 * Np->inout_p is advanced past the characters consumed.
 *
 * If input is invalid, return -1.
 */
 static int
 roman_to_int(NUMProc *Np, int input_len)
 {
 	int			result = 0;
 	int			len;
 	char		romanChars[MAX_ROMAN_LEN];
 	int			romanValues[MAX_ROMAN_LEN];
 	int			repeatCount = 1;
 	int			vCount = 0,
 				lCount = 0,
 				dCount = 0;
 	bool		subtractionEncountered = false;
 	int			lastSubtractedValue = 0;
 	/*
 	 * Skip any leading whitespace.  Perhaps we should limit the amount of
 	 * space skipped to MAX_ROMAN_LEN, but that seems unnecessarily picky.
 	 */
 	while (!OVERLOAD_TEST && isspace((unsigned char) *Np->inout_p))
 		Np->inout_p++;
 	/*
 	 * Collect and decode valid roman numerals, consuming at most
 	 * MAX_ROMAN_LEN characters.  We do this in a separate loop to avoid
 	 * repeated decoding and because the main loop needs to know when it's at
 	 * the last numeral.
 	 */
 	for (len = 0; len < MAX_ROMAN_LEN && !OVERLOAD_TEST; len++)
 	{
 		char		currChar = pg_ascii_toupper(*Np->inout_p);
 		int			currValue = ROMAN_VAL(currChar);
 		if (currValue == 0)
 			break;				/* Not a valid roman numeral. */
 		romanChars[len] = currChar;
 		romanValues[len] = currValue;
 		Np->inout_p++;
 	}
 	if (len == 0)
 		return -1;				/* No valid roman numerals. */
 	/* Check for valid combinations and compute the represented value. */
 	for (int i = 0; i < len; i++)
 	{
 		char		currChar = romanChars[i];
 		int			currValue = romanValues[i];
 		/*
 		 * Ensure no numeral greater than or equal to the subtracted numeral
 		 * appears after a subtraction.
 		 */
 		if (subtractionEncountered && currValue >= lastSubtractedValue)
 			return -1;
 		/*
 		 * V, L, and D should not appear before a larger numeral, nor should
 		 * they be repeated.
 		 */
 		if ((vCount && currValue >= ROMAN_VAL('V')) ||
 			(lCount && currValue >= ROMAN_VAL('L')) ||
 			(dCount && currValue >= ROMAN_VAL('D')))
 			return -1;
 		if (currChar == 'V')
 			vCount++;
 		else if (currChar == 'L')
 			lCount++;
 		else if (currChar == 'D')
 			dCount++;
 		if (i < len - 1)
 		{
 			/* Compare current numeral to next numeral. */
 			char		nextChar = romanChars[i + 1];
 			int			nextValue = romanValues[i + 1];
 			/*
 			 * If the current value is less than the next value, handle
 			 * subtraction. Verify valid subtractive combinations and update
 			 * the result accordingly.
 			 */
 			if (currValue < nextValue)
 			{
 				if (!IS_VALID_SUB_COMB(currChar, nextChar))
 					return -1;
 				/*
 				 * Reject cases where same numeral is repeated with
 				 * subtraction (e.g. 'MCCM' or 'DCCCD').
 				 */
 				if (repeatCount > 1)
 					return -1;
 				/*
 				 * We are going to skip nextChar, so first make checks needed
 				 * for V, L, and D.  These are the same as we'd have applied
 				 * if we reached nextChar without a subtraction.
 				 */
 				if ((vCount && nextValue >= ROMAN_VAL('V')) ||
 					(lCount && nextValue >= ROMAN_VAL('L')) ||
 					(dCount && nextValue >= ROMAN_VAL('D')))
 					return -1;
 				if (nextChar == 'V')
 					vCount++;
 				else if (nextChar == 'L')
 					lCount++;
 				else if (nextChar == 'D')
 					dCount++;
 				/*
 				 * Skip the next numeral as it is part of the subtractive
 				 * combination.
 				 */
 				i++;
 				/* Update state. */
 				repeatCount = 1;
 				subtractionEncountered = true;
 				lastSubtractedValue = currValue;
 				result += (nextValue - currValue);
 			}
 			else
 			{
 				/* For same numerals, check for repetition. */
 				if (currChar == nextChar)
 				{
 					repeatCount++;
 					if (repeatCount > 3)
 						return -1;
 				}
 				else
 					repeatCount = 1;
 				result += currValue;
 			}
 		}
 		else
 		{
 			/* This is the last numeral; just add it to the result. */
 			result += currValue;
 		}
 	}
 	return result;
 }
 /* ----------
@@ -5112,14 +5309,6 @@ get_last_relevant_decnum(char *num)
 	return result;
 }
 /*
 * These macros are used in NUM_processor() and its subsidiary routines.
 * OVERLOAD_TEST: true if we've reached end of input string
 * AMOUNT_TEST(s): true if at least s bytes remain in string
 */
 #define OVERLOAD_TEST	(Np->inout_p >= Np->inout + input_len)
 #define AMOUNT_TEST(s)	(Np->inout_p <= Np->inout + (input_len - (s)))
 /* ----------
 * Number extraction for TO_NUMBER()
 * ----------
@@ -5576,29 +5765,6 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 		return strcpy(inout, number);
 	}
 	/*
 	 * Roman correction
 	 */
 	if (IS_ROMAN(Np->Num))
 	{
 		if (!Np->is_to_char)
 			ereport(ERROR,
 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
 					 errmsg("\"RN\" not supported for input")));
 		Np->Num->lsign = Np->Num->pre_lsign_num = Np->Num->post =
 			Np->Num->pre = Np->out_pre_spaces = Np->sign = 0;
 		if (IS_FILLMODE(Np->Num))
 		{
 			Np->Num->flag = 0;
 			Np->Num->flag |= NUM_F_FILLMODE;
 		}
 		else
 			Np->Num->flag = 0;
 		Np->Num->flag |= NUM_F_ROMAN;
 	}
 	/*
 	 * Sign
 	 */
@@ -5849,28 +6015,35 @@ NUM_processor(FormatNode *node, NUMDesc *Num, char *inout,
 					break;
 				case NUM_RN:
 					if (IS_FILLMODE(Np->Num))
 					{
 						strcpy(Np->inout_p, Np->number_p);
 						Np->inout_p += strlen(Np->inout_p) - 1;
 					}
 					else
 					{
 						sprintf(Np->inout_p, "%15s", Np->number_p);
 						Np->inout_p += strlen(Np->inout_p) - 1;
 					}
 					break;
 				case NUM_rn:
-					if (IS_FILLMODE(Np->Num))
+					if (Np->is_to_char)
 					{
-						strcpy(Np->inout_p, asc_tolower_z(Np->number_p));
+						const char *number_p;
 						if (n->key->id == NUM_rn)
 							number_p = asc_tolower_z(Np->number_p);
 						else
 							number_p = Np->number_p;
 						if (IS_FILLMODE(Np->Num))
 							strcpy(Np->inout_p, number_p);
 						else
 							sprintf(Np->inout_p, "%15s", number_p);
 						Np->inout_p += strlen(Np->inout_p) - 1;
 					}
 					else
 					{
-						sprintf(Np->inout_p, "%15s", asc_tolower_z(Np->number_p));
+						int			roman_result = roman_to_int(Np, input_len);
-						Np->inout_p += strlen(Np->inout_p) - 1;
+						int			numlen;
 						if (roman_result < 0)
 							ereport(ERROR,
 									(errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
 									 errmsg("invalid Roman numeral")));
 						numlen = sprintf(Np->number_p, "%d", roman_result);
 						Np->number_p += numlen;
 						Np->Num->pre = numlen;
 						Np->Num->post = 0;
 						continue;	/* roman_to_int ate all the chars */
 					}
 					break;
--- a/src/test/regress/expected/numeric.out
+++ b/src/test/regress/expected/numeric.out
@@ -2384,6 +2384,84 @@ SELECT to_number('123456', '99999V99');
 1234.560000000000000000
 (1 row)
 -- Test for correct conversion between numbers and Roman numerals
 WITH rows AS
  (SELECT i, to_char(i, 'RN') AS roman FROM generate_series(1, 3999) AS i)
 SELECT
  bool_and(to_number(roman, 'RN') = i) as valid
 FROM rows;
 valid 
 -------
 t
 (1 row)
 -- Some additional tests for RN input
 SELECT to_number('CvIiI', 'rn');
 to_number 
 -----------
       108
 (1 row)
 SELECT to_number('MMXX  ', 'RN');
 to_number 
 -----------
      2020
 (1 row)
 SELECT to_number('  XIV', '  RN');
 to_number 
 -----------
        14
 (1 row)
 SELECT to_number('  XIV  ', '  RN');
 to_number 
 -----------
        14
 (1 row)
 SELECT to_number('M CC', 'RN');
 to_number 
 -----------
      1000
 (1 row)
 -- error cases
 SELECT to_number('viv', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('DCCCD', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('XIXL', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('MCCM', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('MMMM', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('VV', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('IL', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('VIX', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('LXC', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('DCM', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('MMMDCM', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('CLXC', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('CM', 'MIRN');
 ERROR:  "RN" is incompatible with other formats
 DETAIL:  "RN" may only be used together with "FM".
 SELECT to_number('CM', 'RNRN');
 ERROR:  cannot use "RN" twice
 SELECT to_number('qiv', 'RN');
 ERROR:  invalid Roman numeral
 SELECT to_number('', 'RN');
 ERROR:  invalid input syntax for type numeric: " "
 SELECT to_number(' ', 'RN');
 ERROR:  invalid Roman numeral
 RESET lc_numeric;
 --
 -- Input syntax
--- a/src/test/regress/sql/numeric.sql
+++ b/src/test/regress/sql/numeric.sql
@@ -1085,6 +1085,39 @@ SELECT to_number('1234.56','L99,999.99');
 SELECT to_number('1,234.56','L99,999.99');
 SELECT to_number('42nd', '99th');
 SELECT to_number('123456', '99999V99');
 -- Test for correct conversion between numbers and Roman numerals
 WITH rows AS
  (SELECT i, to_char(i, 'RN') AS roman FROM generate_series(1, 3999) AS i)
 SELECT
  bool_and(to_number(roman, 'RN') = i) as valid
 FROM rows;
 -- Some additional tests for RN input
 SELECT to_number('CvIiI', 'rn');
 SELECT to_number('MMXX  ', 'RN');
 SELECT to_number('  XIV', '  RN');
 SELECT to_number('  XIV  ', '  RN');
 SELECT to_number('M CC', 'RN');
 -- error cases
 SELECT to_number('viv', 'RN');
 SELECT to_number('DCCCD', 'RN');
 SELECT to_number('XIXL', 'RN');
 SELECT to_number('MCCM', 'RN');
 SELECT to_number('MMMM', 'RN');
 SELECT to_number('VV', 'RN');
 SELECT to_number('IL', 'RN');
 SELECT to_number('VIX', 'RN');
 SELECT to_number('LXC', 'RN');
 SELECT to_number('DCM', 'RN');
 SELECT to_number('MMMDCM', 'RN');
 SELECT to_number('CLXC', 'RN');
 SELECT to_number('CM', 'MIRN');
 SELECT to_number('CM', 'RNRN');
 SELECT to_number('qiv', 'RN');
 SELECT to_number('', 'RN');
 SELECT to_number(' ', 'RN');
 RESET lc_numeric;
 --