mirror of
https://github.com/nlohmann/json.git
synced 2025-07-19 17:03:16 +03:00
🚧 added input adapter for wide strings #1031
This commit is contained in:
@ -1738,6 +1738,179 @@ class input_buffer_adapter : public input_adapter_protocol
|
||||
const char* start;
|
||||
};
|
||||
|
||||
template<typename WideStringType>
|
||||
class wide_string_input_adapter : public input_adapter_protocol
|
||||
{
|
||||
private:
|
||||
using char_t = typename WideStringType::value_type;
|
||||
|
||||
public:
|
||||
wide_string_input_adapter(const WideStringType& w) : str(w) {}
|
||||
|
||||
std::char_traits<char>::int_type get_character() noexcept override
|
||||
{
|
||||
// unget_character() was called previously: return the last character
|
||||
if (next_unget)
|
||||
{
|
||||
next_unget = false;
|
||||
return last_char;
|
||||
}
|
||||
|
||||
// check if buffer needs to be filled
|
||||
if (utf8_bytes_index == utf8_bytes_filled)
|
||||
{
|
||||
if (sizeof(char_t) == 2)
|
||||
{
|
||||
fill_buffer_utf16();
|
||||
}
|
||||
else
|
||||
{
|
||||
fill_buffer_utf32();
|
||||
}
|
||||
|
||||
assert(utf8_bytes_filled > 0);
|
||||
assert(utf8_bytes_index == 0);
|
||||
}
|
||||
|
||||
// use buffer
|
||||
assert(utf8_bytes_filled > 0);
|
||||
assert(utf8_bytes_index < utf8_bytes_filled);
|
||||
return (last_char = utf8_bytes[utf8_bytes_index++]);
|
||||
}
|
||||
|
||||
void unget_character() noexcept override
|
||||
{
|
||||
next_unget = true;
|
||||
}
|
||||
|
||||
private:
|
||||
void fill_buffer_utf16()
|
||||
{
|
||||
utf8_bytes_index = 0;
|
||||
|
||||
if (current_wchar == str.size())
|
||||
{
|
||||
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// get the current character
|
||||
const char_t wc = str[current_wchar++];
|
||||
|
||||
// UTF-16 to UTF-8 encoding
|
||||
if (wc < 0x80)
|
||||
{
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else if (wc <= 0x7FF)
|
||||
{
|
||||
utf8_bytes[0] = 0xC0 | ((wc >> 6));
|
||||
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 2;
|
||||
}
|
||||
else if (0xD800 > wc or wc >= 0xE000)
|
||||
{
|
||||
utf8_bytes[0] = 0xE0 | ((wc >> 12));
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 3;
|
||||
}
|
||||
else
|
||||
{
|
||||
if (current_wchar < str.size())
|
||||
{
|
||||
const char_t wc2 = str[current_wchar++];
|
||||
const int charcode = 0x10000 + (((wc & 0x3FF) << 10) | (wc2 & 0x3FF));
|
||||
utf8_bytes[0] = 0xf0 | (charcode >> 18);
|
||||
utf8_bytes[1] = 0x80 | ((charcode >> 12) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | ((charcode >> 6) & 0x3F);
|
||||
utf8_bytes[3] = 0x80 | (charcode & 0x3F);
|
||||
utf8_bytes_filled = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// unknown character
|
||||
++current_wchar;
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
void fill_buffer_utf32()
|
||||
{
|
||||
utf8_bytes_index = 0;
|
||||
|
||||
if (current_wchar == str.size())
|
||||
{
|
||||
utf8_bytes[0] = std::char_traits<char>::eof();
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else
|
||||
{
|
||||
// get the current character
|
||||
const char_t wc = str[current_wchar++];
|
||||
|
||||
// UTF-32 to UTF-8 encoding
|
||||
if (wc < 0x80)
|
||||
{
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
else if (wc <= 0x7FF)
|
||||
{
|
||||
utf8_bytes[0] = 0xC0 | ((wc >> 6) & 0x1F);
|
||||
utf8_bytes[1] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 2;
|
||||
}
|
||||
else if (wc <= 0xFFFF)
|
||||
{
|
||||
utf8_bytes[0] = 0xE0 | ((wc >> 12) & 0x0F);
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 3;
|
||||
}
|
||||
else if (wc <= 0x10FFFF)
|
||||
{
|
||||
utf8_bytes[0] = 0xF0 | ((wc >> 18 ) & 0x07);
|
||||
utf8_bytes[1] = 0x80 | ((wc >> 12) & 0x3F);
|
||||
utf8_bytes[2] = 0x80 | ((wc >> 6) & 0x3F);
|
||||
utf8_bytes[3] = 0x80 | (wc & 0x3F);
|
||||
utf8_bytes_filled = 4;
|
||||
}
|
||||
else
|
||||
{
|
||||
// unknown character
|
||||
utf8_bytes[0] = wc;
|
||||
utf8_bytes_filled = 1;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
/// the wstring to process
|
||||
const WideStringType& str;
|
||||
|
||||
/// index of the current wchar in str
|
||||
std::size_t current_wchar = 0;
|
||||
|
||||
/// a buffer for UTF-8 bytes
|
||||
std::array<std::char_traits<char>::int_type, 4> utf8_bytes = {{0, 0, 0, 0}};
|
||||
|
||||
/// index to the utf8_codes array for the next valid byte
|
||||
std::size_t utf8_bytes_index = 0;
|
||||
/// number of valid bytes in the utf8_codes array
|
||||
std::size_t utf8_bytes_filled = 0;
|
||||
|
||||
/// the last character (returned after unget_character() is called)
|
||||
std::char_traits<char>::int_type last_char = 0;
|
||||
/// whether get_character() should return last_char
|
||||
bool next_unget = false;
|
||||
};
|
||||
|
||||
class input_adapter
|
||||
{
|
||||
public:
|
||||
@ -1751,6 +1924,15 @@ class input_adapter
|
||||
input_adapter(std::istream&& i)
|
||||
: ia(std::make_shared<input_stream_adapter>(i)) {}
|
||||
|
||||
/// input adapter for std::wstring: initializes ia with a
/// wide_string_input_adapter<std::wstring> created via std::make_shared.
/// The wrapped adapter keeps only a reference to ws, so ws must remain
/// alive for as long as the adapter is read from.
input_adapter(const std::wstring& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::wstring>>(ws)) {}
|
||||
|
||||
/// input adapter for std::u16string: initializes ia with a
/// wide_string_input_adapter<std::u16string> created via std::make_shared.
/// The wrapped adapter keeps only a reference to ws, so ws must remain
/// alive for as long as the adapter is read from.
input_adapter(const std::u16string& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::u16string>>(ws)) {}
|
||||
|
||||
/// input adapter for std::u32string: initializes ia with a
/// wide_string_input_adapter<std::u32string> created via std::make_shared.
/// The wrapped adapter keeps only a reference to ws, so ws must remain
/// alive for as long as the adapter is read from.
input_adapter(const std::u32string& ws)
|
||||
: ia(std::make_shared<wide_string_input_adapter<std::u32string>>(ws)) {}
|
||||
|
||||
/// input adapter for buffer
|
||||
template<typename CharT,
|
||||
typename std::enable_if<
|
||||
|
Reference in New Issue
Block a user