1
0
mirror of https://github.com/nlohmann/json.git synced 2025-07-31 10:24:23 +03:00

Support UBJSON-derived Binary JData (BJData) format (#3336)

* support UBJSON-derived Binary JData (BJData) format

* fix Codacy warning

* partially fix VS compilation errors

* fix additional VS errors

* fix more VS compilation errors

* fix additional warnings and errors for clang and msvc

* add more tests to cover the new bjdata types

* add tests for optimized ndarray, improve coverage, fix clang/gcc warnings

* gcc warns about a useless conversion but msvc gives an error

* fix ci_test errors

* complete test coverage, fix ci_test errors

* add half precision error test

* fix No newline at end of file error by clang

* simplify endian condition, format unit-bjdata

* remove broken test due to alloc limit

* full coverage, I hope

* move bjdata new markers from default to the same level as ubjson markers

* fix ci errors, add tests for new bjdata switch structure

* make is_bjdata const after using initializer list

* remove the unwanted assert

* move is_bjdata to an optional param to write_ubjson

* pass use_bjdata via output adapter

* revert order to avoid msvc 2015 unreferenced formal param error

* update BJData Spec V1 Draft-2 URL after spec release

* amalgamate code

* code polishing following @gregmarr's feedback

* make use_bjdata a non-default parameter

* fix ci error, remove unwanted param comment

* encode and decode bjdata ndarray in jdata annotations, enable roundtrip tests

* partially fix ci errors, add tests to improve coverage

* polish patch to remove ci errors

* fix a ndarray dim vector condition

* fix clang tidy error

* add sax test cases for ndarray

* add additional sax event tests

* adjust sax event numbering

* fix sax tests

* ndarray can only be used with array containers, discard if used in object

* complete test coverage

* disable [{SHTFNZ in optimized type due to security risks in #2793 and hampered readability

* fix ci error

* move OutputIsLittleEndian from tparam to param to replace use_bjdata

* fix ci clang gcc error

* fix ci static analysis error

* update json_test_data to 3.1.0, enable file-based bjdata unit tests

* fix stack overflow error on msvc 2019 and 2022

* use https link, update sax_parse_error after rebase

* make input_format const and use initializer

* return bool for write_bjdata_ndarray

* test write_bjdata_ndarray return value as boolean

* fix ci error
This commit is contained in:
Qianqian Fang
2022-04-29 15:17:30 -04:00
committed by GitHub
parent a6ee8bf9d9
commit ee51661481
12 changed files with 4877 additions and 227 deletions

View File

@ -12,6 +12,7 @@
#include <string> // char_traits, string
#include <utility> // make_pair, move
#include <vector> // vector
#include <map> // map
#include <nlohmann/detail/exceptions.hpp>
#include <nlohmann/detail/input/input_adapters.hpp>
@ -74,7 +75,7 @@ class binary_reader
@param[in] adapter input adapter to read from
*/
explicit binary_reader(InputAdapterType&& adapter) noexcept : ia(std::move(adapter))
explicit binary_reader(InputAdapterType&& adapter, const input_format_t format = input_format_t::json) noexcept : ia(std::move(adapter)), input_format(format)
{
(void)detail::is_sax_static_asserts<SAX, BasicJsonType> {};
}
@ -118,6 +119,7 @@ class binary_reader
break;
case input_format_t::ubjson:
case input_format_t::bjdata:
result = parse_ubjson_internal();
break;
@ -129,7 +131,7 @@ class binary_reader
// strict mode: next byte must be EOF
if (result && strict)
{
if (format == input_format_t::ubjson)
if (input_format == input_format_t::ubjson || input_format == input_format_t::bjdata)
{
get_ignore_noop();
}
@ -141,7 +143,7 @@ class binary_reader
if (JSON_HEDLEY_UNLIKELY(current != std::char_traits<char_type>::eof()))
{
return sax->parse_error(chars_read, get_token_string(), parse_error::create(110, chars_read,
exception_message(format, concat("expected end of input; last byte: 0x", get_token_string()), "value"), nullptr));
exception_message(input_format, concat("expected end of input; last byte: 0x", get_token_string()), "value"), nullptr));
}
}
@ -1844,7 +1846,7 @@ class binary_reader
get(); // TODO(niels): may we ignore N here?
}
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "value")))
{
return false;
}
@ -1854,52 +1856,154 @@ class binary_reader
case 'U':
{
std::uint8_t len{};
return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'i':
{
std::int8_t len{};
return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'I':
{
std::int16_t len{};
return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'l':
{
std::int32_t len{};
return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'L':
{
std::int64_t len{};
return get_number(input_format_t::ubjson, len) && get_string(input_format_t::ubjson, len, result);
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'u':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint16_t len{};
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'm':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint32_t len{};
return get_number(input_format, len) && get_string(input_format, len, result);
}
case 'M':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint64_t len{};
return get_number(input_format, len) && get_string(input_format, len, result);
}
default:
auto last_token = get_token_string();
return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
exception_message(input_format_t::ubjson, concat("expected length type specification (U, i, I, l, L); last byte: 0x", last_token), "string"), nullptr));
break;
}
auto last_token = get_token_string();
std::string message;
if (input_format != input_format_t::bjdata)
{
message = "expected length type specification (U, i, I, l, L); last byte: 0x" + last_token;
}
else
{
message = "expected length type specification (U, i, u, I, m, l, M, L); last byte: 0x" + last_token;
}
return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format, message, "string"), nullptr));
}
/*!
@param[out] dim an integer vector storing the ND array dimensions
@return whether reading ND array size vector is successful
*/
bool get_ubjson_ndarray_size(std::vector<size_t>& dim)
{
std::pair<std::size_t, char_int_type> size_and_type;
size_t dimlen = 0;
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_type(size_and_type)))
{
return false;
}
if (size_and_type.first != string_t::npos)
{
if (size_and_type.second != 0)
{
if (size_and_type.second != 'N')
{
for (std::size_t i = 0; i < size_and_type.first; ++i)
{
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_value(dimlen, size_and_type.second)))
{
return false;
}
dim.push_back(dimlen);
}
}
}
else
{
for (std::size_t i = 0; i < size_and_type.first; ++i)
{
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_value(dimlen)))
{
return false;
}
dim.push_back(dimlen);
}
}
}
else
{
while (current != ']')
{
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_size_value(dimlen, current)))
{
return false;
}
dim.push_back(dimlen);
get_ignore_noop();
}
}
return true;
}
/*!
@param[out] result determined size
@return whether size determination completed
*/
bool get_ubjson_size_value(std::size_t& result)
bool get_ubjson_size_value(std::size_t& result, char_int_type prefix = 0)
{
switch (get_ignore_noop())
if (prefix == 0)
{
prefix = get_ignore_noop();
}
switch (prefix)
{
case 'U':
{
std::uint8_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
@ -1910,7 +2014,7 @@ class binary_reader
case 'i':
{
std::int8_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
@ -1921,7 +2025,7 @@ class binary_reader
case 'I':
{
std::int16_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
@ -1932,7 +2036,7 @@ class binary_reader
case 'l':
{
std::int32_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
@ -1943,7 +2047,7 @@ class binary_reader
case 'L':
{
std::int64_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format_t::ubjson, number)))
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
@ -1951,13 +2055,105 @@ class binary_reader
return true;
}
default:
case 'u':
{
auto last_token = get_token_string();
return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
exception_message(input_format_t::ubjson, concat("expected length type specification (U, i, I, l, L) after '#'; last byte: 0x", last_token), "size"), nullptr));
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint16_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
result = static_cast<std::size_t>(number);
return true;
}
case 'm':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint32_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
result = static_cast<std::size_t>(number);
return true;
}
case 'M':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint64_t number{};
if (JSON_HEDLEY_UNLIKELY(!get_number(input_format, number)))
{
return false;
}
result = detail::conditional_static_cast<std::size_t>(number);
return true;
}
case '[':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::vector<size_t> dim;
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_ndarray_size(dim)))
{
return false;
}
if (dim.size() == 1 || (dim.size() == 2 && dim.at(0) == 1)) // return normal array size if 1D row vector
{
result = dim.at(dim.size() - 1);
return true;
}
if (!dim.empty()) // if ndarray, convert to an object in JData annotated array format
{
string_t key = "_ArraySize_";
if (JSON_HEDLEY_UNLIKELY(!sax->start_object(3) || !sax->key(key) || !sax->start_array(dim.size())))
{
return false;
}
result = 1;
for (auto i : dim)
{
result *= i;
if (JSON_HEDLEY_UNLIKELY(!sax->number_integer(static_cast<number_integer_t>(i))))
{
return false;
}
}
result |= (1ull << (sizeof(result) * 8 - 1)); // low 63 bit of result stores the total element count, sign-bit indicates ndarray
return sax->end_array();
}
result = 0;
return true;
}
default:
break;
}
auto last_token = get_token_string();
std::string message;
if (input_format != input_format_t::bjdata)
{
message = "expected length type specification (U, i, I, l, L) after '#'; last byte: 0x" + last_token;
}
else
{
message = "expected length type specification (U, i, u, I, m, l, M, L) after '#'; last byte: 0x" + last_token;
}
return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read, exception_message(input_format, message, "size"), nullptr));
}
/*!
@ -1979,8 +2175,10 @@ class binary_reader
if (current == '$')
{
std::vector<char_int_type> bjdx = {'[', '{', 'S', 'H', 'T', 'F', 'N', 'Z'}; // excluded markers in bjdata optimized type
result.second = get(); // must not ignore 'N', because 'N' maybe the type
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "type")))
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "type") || (input_format == input_format_t::bjdata && std::find(bjdx.begin(), bjdx.end(), result.second) != bjdx.end() )))
{
return false;
}
@ -1988,13 +2186,13 @@ class binary_reader
get_ignore_noop();
if (JSON_HEDLEY_UNLIKELY(current != '#'))
{
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "value")))
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "value")))
{
return false;
}
auto last_token = get_token_string();
return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
exception_message(input_format_t::ubjson, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr));
exception_message(input_format, concat("expected '#' after type information; last byte: 0x", last_token), "size"), nullptr));
}
return get_ubjson_size_value(result.first);
@ -2017,7 +2215,7 @@ class binary_reader
switch (prefix)
{
case std::char_traits<char_type>::eof(): // EOF
return unexpect_eof(input_format_t::ubjson, "value");
return unexpect_eof(input_format, "value");
case 'T': // true
return sax->boolean(true);
@ -2030,43 +2228,125 @@ class binary_reader
case 'U':
{
std::uint8_t number{};
return get_number(input_format_t::ubjson, number) && sax->number_unsigned(number);
return get_number(input_format, number) && sax->number_unsigned(number);
}
case 'i':
{
std::int8_t number{};
return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
return get_number(input_format, number) && sax->number_integer(number);
}
case 'I':
{
std::int16_t number{};
return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
return get_number(input_format, number) && sax->number_integer(number);
}
case 'l':
{
std::int32_t number{};
return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
return get_number(input_format, number) && sax->number_integer(number);
}
case 'L':
{
std::int64_t number{};
return get_number(input_format_t::ubjson, number) && sax->number_integer(number);
return get_number(input_format, number) && sax->number_integer(number);
}
case 'u':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint16_t number{};
return get_number(input_format, number) && sax->number_unsigned(number);
}
case 'm':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint32_t number{};
return get_number(input_format, number) && sax->number_unsigned(number);
}
case 'M':
{
if (input_format != input_format_t::bjdata)
{
break;
}
std::uint64_t number{};
return get_number(input_format, number) && sax->number_unsigned(number);
}
case 'h':
{
if (input_format != input_format_t::bjdata)
{
break;
}
const auto byte1_raw = get();
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "number")))
{
return false;
}
const auto byte2_raw = get();
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "number")))
{
return false;
}
const auto byte1 = static_cast<unsigned char>(byte1_raw);
const auto byte2 = static_cast<unsigned char>(byte2_raw);
// code from RFC 7049, Appendix D, Figure 3:
// As half-precision floating-point numbers were only added
// to IEEE 754 in 2008, today's programming platforms often
// still only have limited support for them. It is very
// easy to include at least decoding support for them even
// without such support. An example of a small decoder for
// half-precision floating-point numbers in the C language
// is shown in Fig. 3.
const auto half = static_cast<unsigned int>((byte2 << 8u) + byte1);
const double val = [&half]
{
const int exp = (half >> 10u) & 0x1Fu;
const unsigned int mant = half & 0x3FFu;
JSON_ASSERT(0 <= exp&& exp <= 32);
JSON_ASSERT(mant <= 1024);
switch (exp)
{
case 0:
return std::ldexp(mant, -24);
case 31:
return (mant == 0)
? std::numeric_limits<double>::infinity()
: std::numeric_limits<double>::quiet_NaN();
default:
return std::ldexp(mant + 1024, exp - 25);
}
}();
return sax->number_float((half & 0x8000u) != 0
? static_cast<number_float_t>(-val)
: static_cast<number_float_t>(val), "");
}
case 'd':
{
float number{};
return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
return get_number(input_format, number) && sax->number_float(static_cast<number_float_t>(number), "");
}
case 'D':
{
double number{};
return get_number(input_format_t::ubjson, number) && sax->number_float(static_cast<number_float_t>(number), "");
return get_number(input_format, number) && sax->number_float(static_cast<number_float_t>(number), "");
}
case 'H':
@ -2077,7 +2357,7 @@ class binary_reader
case 'C': // char
{
get();
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "char")))
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "char")))
{
return false;
}
@ -2085,7 +2365,7 @@ class binary_reader
{
auto last_token = get_token_string();
return sax->parse_error(chars_read, last_token, parse_error::create(113, chars_read,
exception_message(input_format_t::ubjson, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr));
exception_message(input_format, concat("byte after 'C' must be in range 0x00..0x7F; last byte: 0x", last_token), "char"), nullptr));
}
string_t s(1, static_cast<typename string_t::value_type>(current));
return sax->string(s);
@ -2104,12 +2384,10 @@ class binary_reader
return get_ubjson_object();
default: // anything else
{
auto last_token = get_token_string();
return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read,
exception_message(input_format_t::ubjson, concat("invalid byte: 0x", last_token), "value"), nullptr));
}
break;
}
auto last_token = get_token_string();
return sax->parse_error(chars_read, last_token, parse_error::create(112, chars_read, exception_message(input_format, "invalid byte: 0x" + last_token, "value"), nullptr));
}
/*!
@ -2123,6 +2401,44 @@ class binary_reader
return false;
}
// detect and encode bjdata ndarray as an object in JData annotated array format (https://github.com/NeuroJSON/jdata):
// {"_ArrayType_" : "typeid", "_ArraySize_" : [n1, n2, ...], "_ArrayData_" : [v1, v2, ...]}
if (input_format == input_format_t::bjdata && size_and_type.first != string_t::npos && size_and_type.first >= (1ull << (sizeof(std::size_t) * 8 - 1)))
{
std::map<char_int_type, string_t> bjdtype = {{'U', "uint8"}, {'i', "int8"}, {'u', "uint16"}, {'I', "int16"},
{'m', "uint32"}, {'l', "int32"}, {'M', "uint64"}, {'L', "int64"}, {'d', "single"}, {'D', "double"}, {'C', "char"}
};
string_t key = "_ArrayType_";
if (JSON_HEDLEY_UNLIKELY(bjdtype.count(size_and_type.second) == 0 || !sax->key(key) || !sax->string(bjdtype[size_and_type.second]) ))
{
return false;
}
if (size_and_type.second == 'C')
{
size_and_type.second = 'U';
}
size_and_type.first &= ~(1ull << (sizeof(std::size_t) * 8 - 1));
key = "_ArrayData_";
if (JSON_HEDLEY_UNLIKELY(!sax->key(key) || !sax->start_array(size_and_type.first) ))
{
return false;
}
for (std::size_t i = 0; i < size_and_type.first; ++i)
{
if (JSON_HEDLEY_UNLIKELY(!get_ubjson_value(size_and_type.second)))
{
return false;
}
}
return (sax->end_array() && sax->end_object());
}
if (size_and_type.first != string_t::npos)
{
if (JSON_HEDLEY_UNLIKELY(!sax->start_array(size_and_type.first)))
@ -2185,6 +2501,11 @@ class binary_reader
return false;
}
if (input_format == input_format_t::bjdata && size_and_type.first != string_t::npos && size_and_type.first >= (1ull << (sizeof(std::size_t) * 8 - 1)))
{
return false;
}
string_t key;
if (size_and_type.first != string_t::npos)
{
@ -2267,7 +2588,7 @@ class binary_reader
for (std::size_t i = 0; i < size; ++i)
{
get();
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format_t::ubjson, "number")))
if (JSON_HEDLEY_UNLIKELY(!unexpect_eof(input_format, "number")))
{
return false;
}
@ -2286,7 +2607,7 @@ class binary_reader
if (JSON_HEDLEY_UNLIKELY(result_remainder != token_type::end_of_input))
{
return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read,
exception_message(input_format_t::ubjson, concat("invalid number text: ", number_lexer.get_token_string()), "high-precision number"), nullptr));
exception_message(input_format, concat("invalid number text: ", number_lexer.get_token_string()), "high-precision number"), nullptr));
}
switch (result_number)
@ -2313,7 +2634,7 @@ class binary_reader
case token_type::literal_or_value:
default:
return sax->parse_error(chars_read, number_string, parse_error::create(115, chars_read,
exception_message(input_format_t::ubjson, concat("invalid number text: ", number_lexer.get_token_string()), "high-precision number"), nullptr));
exception_message(input_format, concat("invalid number text: ", number_lexer.get_token_string()), "high-precision number"), nullptr));
}
}
@ -2362,6 +2683,8 @@ class binary_reader
@note This function needs to respect the system's endianness, because
bytes in CBOR, MessagePack, and UBJSON are stored in network order
(big endian) and therefore need reordering on little endian systems.
On the other hand, BSON and BJData use little endian and should reorder
on big endian systems.
*/
template<typename NumberType, bool InputIsLittleEndian = false>
bool get_number(const input_format_t format, NumberType& result)
@ -2377,7 +2700,7 @@ class binary_reader
}
// reverse byte order prior to conversion if necessary
if (is_little_endian != InputIsLittleEndian)
if (is_little_endian != (InputIsLittleEndian || format == input_format_t::bjdata))
{
vec[sizeof(NumberType) - i - 1] = static_cast<std::uint8_t>(current);
}
@ -2514,6 +2837,10 @@ class binary_reader
error_msg += "BSON";
break;
case input_format_t::bjdata:
error_msg += "BJData";
break;
case input_format_t::json: // LCOV_EXCL_LINE
default: // LCOV_EXCL_LINE
JSON_ASSERT(false); // NOLINT(cert-dcl03-c,hicpp-static-assert,misc-static-assert) LCOV_EXCL_LINE
@ -2535,6 +2862,9 @@ class binary_reader
/// whether we can assume little endianness
const bool is_little_endian = little_endianness();
/// input format
const input_format_t input_format = input_format_t::json;
/// the SAX parser
json_sax_t* sax = nullptr;
};