# py-kms/upy/codecs.py

# Only ``encode`` is part of the public API of this module.
__all__ = ['encode']
# The UTF-16 encoder below is adapted from the PyPy sources:
#   pypy/pypy/module/_codecs/interp_codecs.py
#   pypy/rpython/rlib/runicode.py
import sys

# Byte order of the running interpreter, exactly as reported by sys.byteorder.
BYTEORDER = sys.byteorder
# Two-letter form: first letter of BYTEORDER followed by "e" ("le" or "be").
BYTEORDER2 = '{}e'.format(BYTEORDER[0])
assert BYTEORDER2 == 'le' or BYTEORDER2 == 'be'


def _storechar(result, CH, byteorder):
    """Append the 16-bit code unit *CH* to *result* as two bytes.

    When *byteorder* is ``'little'`` the low byte is stored first; for any
    other value the high byte is stored first.
    """
    hi = (CH >> 8) & 0xff
    lo = CH & 0xff
    if byteorder == 'little':
        result.append(lo)
        result.append(hi)
    else:
        result.append(hi)
        result.append(lo)


def _default_encode_errorhandler(errors, encoding, msg, s, startpos, endpos):
    """Error handler used by ``encode_utf_16`` when the caller supplies none.

    Returns ``(replacement_text, replacement_bytes, resume_position)`` for
    the ``'replace'`` and ``'ignore'`` policies; raises
    ``UnicodeEncodeError`` for every other value of *errors*.
    """
    if errors == 'replace':
        return '?', None, endpos
    if errors == 'ignore':
        return '', None, endpos
    raise UnicodeEncodeError(encoding, s, startpos, endpos, msg)


def encode_utf_16(s, errors,
                  errorhandler=None,
                  allow_surrogates=True,
                  byteorder='little',
                  public_encoding_name='utf16'):
    """Encode the string *s* as UTF-16 and return the result as ``bytes``.

    Parameters:
        s: the text to encode.
        errors: error policy, forwarded unchanged to *errorhandler*.
        errorhandler: callable
            ``(errors, encoding, msg, s, start, end) -> (text, data, resume)``
            invoked for a lone surrogate when *allow_surrogates* is false.
            ``data`` (bytes, or ``None``) is copied to the output verbatim;
            otherwise every character of ``text`` is encoded.  ``resume`` is
            the index at which encoding continues.  Defaults to a handler
            that implements ``'replace'`` and ``'ignore'`` and raises
            ``UnicodeEncodeError`` otherwise.
        allow_surrogates: when true, code points in U+D800..U+DFFF are
            written out as-is instead of being reported as errors.
        byteorder: ``'little'``, ``'native'`` or anything else for
            big-endian.  ``'native'`` writes a byte-order mark first and
            then uses the interpreter's own byte order.
        public_encoding_name: codec name passed to *errorhandler*.
    """
    if errorhandler is None:
        errorhandler = _default_encode_errorhandler

    result = bytearray()
    if byteorder == 'native':
        _storechar(result, 0xFEFF, BYTEORDER)
        byteorder = BYTEORDER

    # Index at which encoding resumes after an error handler ran; characters
    # before it are skipped.  (Only forward jumps can be honoured because the
    # string is walked with a plain iterator.)
    resume = 0
    for pos, c in enumerate(s):
        if pos < resume:
            continue
        ch = ord(c)

        if ch < 0xD800:
            _storechar(result, ch, byteorder)
        elif ch >= 0x10000:
            # Outside the BMP: emit a high/low surrogate pair.
            _storechar(result, 0xD800 | ((ch - 0x10000) >> 10), byteorder)
            _storechar(result, 0xDC00 | ((ch - 0x10000) & 0x3FF), byteorder)
        elif ch >= 0xE000 or allow_surrogates:
            _storechar(result, ch, byteorder)
        else:
            # Lone surrogate that is not allowed: the offending character
            # occupies s[pos:pos + 1].
            ru, rs, resume = errorhandler(errors, public_encoding_name,
                                          'surrogates not allowed',
                                          s, pos, pos + 1)
            if rs is not None:
                # Replacement bytes must consist of whole 16-bit units.
                if len(rs) % 2 != 0:
                    errorhandler('strict', public_encoding_name,
                                 'surrogates not allowed',
                                 s, pos, pos + 1)
                result.extend(rs)
                continue
            for replacement in ru:
                code = ord(replacement)
                if code < 0xD800:
                    _storechar(result, code, byteorder)
                else:
                    errorhandler('strict', public_encoding_name,
                                 'surrogates not allowed',
                                 s, pos, pos + 1)

    return bytes(result)

def encode(obj, encoding='utf_8', errors='strict'):
    """Encode the string *obj* and return the resulting bytes.

    Supported encodings are ``utf_8``, ``utf_16`` (byte-order mark followed
    by native byte order), ``utf_16_be`` and ``utf_16_le``.  The name is
    matched case-insensitively and hyphens are treated like underscores, so
    ``'UTF-16-LE'`` selects the same codec as ``'utf_16_le'``.

    *errors* is passed on to the underlying encoder.

    Raises:
        NotImplementedError: if *encoding* names any other codec.
    """
    # Normalise spelling variants; non-string values are left untouched so
    # they fall through to NotImplementedError as before.
    if isinstance(encoding, str):
        codec = encoding.lower().replace('-', '_')
    else:
        codec = encoding

    if codec == 'utf_8':
        return obj.encode(codec, errors)
    if codec == 'utf_16':
        return encode_utf_16(obj, errors, None,
                             True, 'native',
                             'utf-16-' + BYTEORDER2)
    if codec == 'utf_16_be':
        return encode_utf_16(obj, errors, None,
                             True, 'big',
                             'utf-16-be')
    if codec == 'utf_16_le':
        return encode_utf_16(obj, errors, None,
                             True, 'little',
                             'utf-16-le')
    raise NotImplementedError('Encoding of {} not implemented'.format(encoding))