1
0
mirror of https://github.com/nlohmann/json.git synced 2025-07-28 12:02:00 +03:00

Merge pull request #1314 from nlohmann/feature/codec_errors

Allow to set error handler for decoding errors
This commit is contained in:
Niels Lohmann
2018-10-24 08:40:58 +02:00
committed by GitHub
5 changed files with 455 additions and 30 deletions

View File

@ -10122,6 +10122,14 @@ namespace detail
// serialization //
///////////////////
/// how to treat decoding errors
enum class error_handler_t
{
strict, ///< throw a type_error exception in case of invalid UTF-8
replace, ///< replace invalid UTF-8 sequences with U+FFFD
ignore ///< ignore invalid UTF-8 sequences
};
template<typename BasicJsonType>
class serializer
{
@ -10136,12 +10144,17 @@ class serializer
/*!
@param[in] s output stream to serialize to
@param[in] ichar indentation character to use
@param[in] error_handler_ how to react on decoding errors
*/
serializer(output_adapter_t<char> s, const char ichar)
: o(std::move(s)), loc(std::localeconv()),
thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep)),
decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point)),
indent_char(ichar), indent_string(512, indent_char)
serializer(output_adapter_t<char> s, const char ichar,
error_handler_t error_handler_ = error_handler_t::strict)
: o(std::move(s))
, loc(std::localeconv())
, thousands_sep(loc->thousands_sep == nullptr ? '\0' : * (loc->thousands_sep))
, decimal_point(loc->decimal_point == nullptr ? '\0' : * (loc->decimal_point))
, indent_char(ichar)
, indent_string(512, indent_char)
, error_handler(error_handler_)
{}
// delete because of pointer members
@ -10381,6 +10394,10 @@ class serializer
uint8_t state = UTF8_ACCEPT;
std::size_t bytes = 0; // number of bytes written to string_buffer
// number of bytes written at the point of the last valid byte
std::size_t bytes_after_last_accept = 0;
std::size_t undumped_chars = 0;
for (std::size_t i = 0; i < s.size(); ++i)
{
const auto byte = static_cast<uint8_t>(s[i]);
@ -10478,14 +10495,68 @@ class serializer
o->write_characters(string_buffer.data(), bytes);
bytes = 0;
}
// remember the byte position of this accept
bytes_after_last_accept = bytes;
undumped_chars = 0;
break;
}
case UTF8_REJECT: // decode found invalid UTF-8 byte
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", byte);
JSON_THROW(type_error::create(316, "invalid UTF-8 byte at index " + std::to_string(i) + ": 0x" + sn));
}
case error_handler_t::ignore:
case error_handler_t::replace:
{
// in case we saw this character the first time, we
// would like to read it again, because the byte
// may be OK for itself, but just not OK for the
// previous sequence
if (undumped_chars > 0)
{
--i;
}
// reset length buffer to the last accepted index;
// thus removing/ignoring the invalid characters
bytes = bytes_after_last_accept;
if (error_handler == error_handler_t::replace)
{
// add a replacement character
if (ensure_ascii)
{
string_buffer[bytes++] = '\\';
string_buffer[bytes++] = 'u';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'f';
string_buffer[bytes++] = 'd';
}
else
{
string_buffer[bytes++] = '\xEF';
string_buffer[bytes++] = '\xBF';
string_buffer[bytes++] = '\xBD';
}
bytes_after_last_accept = bytes;
}
undumped_chars = 0;
// continue processing the string
state = UTF8_ACCEPT;
continue;
}
}
}
default: // decode found yet incomplete multi-byte code point
@ -10495,11 +10566,13 @@ class serializer
// code point will not be escaped - copy byte to buffer
string_buffer[bytes++] = s[i];
}
++undumped_chars;
break;
}
}
}
// we finished processing the string
if (JSON_LIKELY(state == UTF8_ACCEPT))
{
// write buffer
@ -10511,9 +10584,38 @@ class serializer
else
{
// we finish reading, but do not accept: string was incomplete
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
switch (error_handler)
{
case error_handler_t::strict:
{
std::string sn(3, '\0');
snprintf(&sn[0], sn.size(), "%.2X", static_cast<uint8_t>(s.back()));
JSON_THROW(type_error::create(316, "incomplete UTF-8 string; last byte: 0x" + sn));
}
case error_handler_t::ignore:
{
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes_after_last_accept);
break;
}
case error_handler_t::replace:
{
// write all accepted bytes
o->write_characters(string_buffer.data(), bytes_after_last_accept);
// add a replacement character
if (ensure_ascii)
{
o->write_characters("\\ufffd", 6);
}
else
{
o->write_characters("\xEF\xBF\xBD", 3);
}
break;
}
}
}
}
@ -10723,6 +10825,9 @@ class serializer
const char indent_char;
/// the indentation string
string_t indent_string;
/// error_handler how to react on decoding errors
const error_handler_t error_handler;
};
} // namespace detail
} // namespace nlohmann
@ -11692,6 +11797,8 @@ class basic_json
using json_pointer = ::nlohmann::json_pointer<basic_json>;
template<typename T, typename SFINAE>
using json_serializer = JSONSerializer<T, SFINAE>;
/// how to treat decoding errors
using error_handler_t = detail::error_handler_t;
/// helper type for initializer lists of basic_json values
using initializer_list_t = std::initializer_list<detail::json_ref<basic_json>>;
@ -13416,6 +13523,10 @@ class basic_json
@param[in] ensure_ascii If @a ensure_ascii is true, all non-ASCII characters
in the output are escaped with `\uXXXX` sequences, and the result consists
of ASCII characters only.
@param[in] error_handler how to react on decoding errors; there are three
possible values: `strict` (throws and exception in case a decoding error
occurs; default), `replace` (replace invalid UTF-8 sequences with U+FFFD),
and `ignore` (ignore invalid UTF-8 sequences during serialization).
@return string containing the serialization of the JSON value
@ -13434,13 +13545,16 @@ class basic_json
@see https://docs.python.org/2/library/json.html#json.dump
@since version 1.0.0; indentation character @a indent_char, option
@a ensure_ascii and exceptions added in version 3.0.0
@a ensure_ascii and exceptions added in version 3.0.0; error
handlers added in version 3.4.0.
*/
string_t dump(const int indent = -1, const char indent_char = ' ',
const bool ensure_ascii = false) const
string_t dump(const int indent = -1,
const char indent_char = ' ',
const bool ensure_ascii = false,
const error_handler_t error_handler = error_handler_t::strict) const
{
string_t result;
serializer s(detail::output_adapter<char, string_t>(result), indent_char);
serializer s(detail::output_adapter<char, string_t>(result), indent_char, error_handler);
if (indent >= 0)
{