The following code examples, extracted from open-source Python projects, illustrate how to use sys.maxunicode.
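Before the examples, one point of orientation: sys.maxunicode is an integer attribute, not a function. It gives the largest code point the interpreter can represent: since Python 3.3 (PEP 393) it is always 0x10FFFF, while older narrow (UCS-2) builds reported 0xFFFF, which is why so much of the code below branches on its value. A minimal sketch:

import sys

# 0x10FFFF on Python 3.3+ and on wide (UCS-4) builds of older Pythons;
# 0xFFFF on narrow (UCS-2) builds of Python <= 3.2.
if sys.maxunicode == 0x10FFFF:
    # Wide build: every code point, astral or not, is one string element.
    assert len(u'\U0001F600') == 1
else:
    # Narrow build: astral characters are stored as UTF-16 surrogate pairs.
    assert len(u'\U0001F600') == 2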
def test_invalid_escape_sequences(self):
    # incomplete escape sequence
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u1')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u12')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u123')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u1234')
    # invalid escape sequence
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u123x"')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u12x4"')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\u1x34"')
    self.assertRaises(json.JSONDecodeError, json.loads, '"\\ux234"')
    if sys.maxunicode > 65535:
        # invalid escape sequence for low surrogate
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u000x"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u00x0"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\u0x00"')
        self.assertRaises(json.JSONDecodeError, json.loads, '"\\ud800\\ux000"')
def parse_entity(self, entity):
    """
    Returns the text from a given :class:`telegram.MessageEntity`.

    Note:
        This method is present because Telegram calculates the offset and length
        in UTF-16 codepoint pairs, which some versions of Python don't handle
        automatically. (That is, you can't just slice ``Message.text`` with the
        offset and length.)

    Args:
        entity (telegram.MessageEntity): The entity to extract the text from.
            It must be an entity that belongs to this message.

    Returns:
        str: The text of the given entity
    """
    # On a narrow build the string already consists of UTF-16 code units,
    # so no conversion is needed
    if sys.maxunicode == 0xffff:
        return self.text[entity.offset:entity.offset + entity.length]
    else:
        entity_text = self.text.encode('utf-16-le')
        entity_text = entity_text[entity.offset * 2:(entity.offset + entity.length) * 2]
        return entity_text.decode('utf-16-le')
def parse_text_entity(self, entity):
    """
    Returns the text from a given :class:`telegram.MessageEntity`.

    Note:
        This method is present because Telegram calculates the offset and length
        in UTF-16 codepoint pairs, which some versions of Python don't handle
        automatically. (That is, you can't just slice ``Message.text`` with the
        offset and length.)

    Args:
        entity (telegram.MessageEntity): The entity to extract the text from.
            It must be an entity that belongs to this message.

    Returns:
        str: The text of the given entity
    """
    # On a narrow build the string already consists of UTF-16 code units,
    # so no conversion is needed
    if sys.maxunicode == 0xffff:
        return self.text[entity.offset:entity.offset + entity.length]
    else:
        entity_text = self.text.encode('utf-16-le')
        entity_text = entity_text[entity.offset * 2:(entity.offset + entity.length) * 2]
        return entity_text.decode('utf-16-le')
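Both methods above exist because Telegram reports entity offsets and lengths in UTF-16 code units, which differ from Python 3 string indices whenever the text contains astral (non-BMP) characters. A self-contained sketch of the mismatch, assuming Python 3; the text, offset, and length are made up for illustration:

# An astral character occupies two UTF-16 code units but one Python 3
# string element (on a wide build, i.e. sys.maxunicode == 0x10FFFF).
text = '\U0001F600 hello'        # emoji, space, "hello"

# A Telegram-style entity covering "hello": offset 3 in UTF-16 units
# (2 for the emoji + 1 for the space), length 5.
offset, length = 3, 5
utf16 = text.encode('utf-16-le')
entity = utf16[offset * 2:(offset + length) * 2].decode('utf-16-le')
assert entity == 'hello'
assert text[offset:offset + length] != 'hello'   # naive slicing gives 'ello'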
def build_index(self, chars=None):
    if chars is None:
        chars = (chr(i) for i in range(32, sys.maxunicode))
    index = {}
    for char in chars:
        try:
            name = unicodedata.name(char)
        except ValueError:
            continue
        if name.startswith(CJK_UNI_PREFIX):
            name = CJK_UNI_PREFIX
        elif name.startswith(CJK_CMP_PREFIX):
            name = CJK_CMP_PREFIX
        for word in tokenize(name):
            index.setdefault(word, set()).add(char)
    self.index = index
def unicode_iter(val):
    """Provides an iterator over the *code points* of the given Unicode sequence.

    Notes:
        Before PEP 393, Python could be built to store Unicode as either UTF-16
        or UTF-32; the choice is reflected in ``sys.maxunicode``. As a result,
        naive iteration over a Unicode sequence on a narrow build can yield
        UTF-16 surrogates rather than full code points.

    Args:
        val (unicode): The unicode sequence to iterate over as integer code
            points in the range ``0x0`` to ``0x10FFFF``.
    """
    val_iter = iter(val)
    while True:
        code_point = next(_next_code_point(val, val_iter, to_int=ord))
        if code_point is None:
            raise ValueError('Unpaired high surrogate at end of Unicode sequence: %r' % val)
        yield code_point
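The _next_code_point helper used above is internal to the surrounding library and is not shown here. As a rough illustration of the same idea, here is a self-contained, surrogate-aware code point iterator; this is a sketch of the technique, not the library's implementation:

def code_points(text):
    """Yield integer code points, pairing UTF-16 surrogates when present
    (e.g. on narrow builds, where astral characters appear as two chars)."""
    it = iter(text)
    for ch in it:
        cp = ord(ch)
        if 0xD800 <= cp <= 0xDBFF:  # high surrogate: a low surrogate must follow
            try:
                low = ord(next(it))
            except StopIteration:
                raise ValueError('Unpaired high surrogate at end of sequence')
            if not 0xDC00 <= low <= 0xDFFF:
                raise ValueError('High surrogate not followed by a low surrogate')
            cp = 0x10000 + ((cp - 0xD800) << 10) + (low - 0xDC00)
        yield cp

# Holds on both wide and narrow builds:
assert list(code_points(u'a\U0001F600')) == [0x61, 0x1F600]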
def test_bug1251300(self):
    # Decoding with unicode_internal used to not correctly handle "code
    # points" above 0x10ffff on UCS-4 builds.
    if sys.maxunicode > 0xffff:
        ok = [
            (b"\x00\x10\xff\xff", "\U0010ffff"),
            (b"\x00\x00\x01\x01", "\U00000101"),
            (b"", ""),
        ]
        not_ok = [
            b"\x7f\xff\xff\xff",
            b"\x80\x00\x00\x00",
            b"\x81\x00\x00\x00",
            b"\x00",
            b"\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = bytes(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode, "unicode_internal")
def test_backslashescape(self):
    # Does the same as the "unicode-escape" encoding, but with different
    # base encodings.
    sin = "a\xac\u1234\u20ac\u8000"
    if sys.maxunicode > 0xffff:
        sin += chr(sys.maxunicode)
    sout = b"a\\xac\\u1234\\u20ac\\u8000"
    if sys.maxunicode > 0xffff:
        sout += bytes("\\U%08x" % sys.maxunicode, "ascii")
    self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)

    sout = b"a\xac\\u1234\\u20ac\\u8000"
    if sys.maxunicode > 0xffff:
        sout += bytes("\\U%08x" % sys.maxunicode, "ascii")
    self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)

    sout = b"a\xac\\u1234\xa4\\u8000"
    if sys.maxunicode > 0xffff:
        sout += bytes("\\U%08x" % sys.maxunicode, "ascii")
    self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
def test_chr(self):
    self.assertEqual(chr(32), ' ')
    self.assertEqual(chr(65), 'A')
    self.assertEqual(chr(97), 'a')
    self.assertEqual(chr(0xff), '\xff')
    self.assertRaises(ValueError, chr, 1 << 24)
    self.assertEqual(chr(sys.maxunicode),
                     str(('\\U%08x' % (sys.maxunicode)).encode("ascii"),
                         'unicode-escape'))
    self.assertRaises(TypeError, chr)
    self.assertEqual(chr(0x0000FFFF), "\U0000FFFF")
    self.assertEqual(chr(0x00010000), "\U00010000")
    self.assertEqual(chr(0x00010001), "\U00010001")
    self.assertEqual(chr(0x000FFFFE), "\U000FFFFE")
    self.assertEqual(chr(0x000FFFFF), "\U000FFFFF")
    self.assertEqual(chr(0x00100000), "\U00100000")
    self.assertEqual(chr(0x00100001), "\U00100001")
    self.assertEqual(chr(0x0010FFFE), "\U0010FFFE")
    self.assertEqual(chr(0x0010FFFF), "\U0010FFFF")
    self.assertRaises(ValueError, chr, -1)
    self.assertRaises(ValueError, chr, 0x00110000)
    self.assertRaises((OverflowError, ValueError), chr, 2**32)
def test_hasattr(self):
    self.assertTrue(hasattr(sys, 'stdout'))
    self.assertRaises(TypeError, hasattr, sys, 1)
    self.assertRaises(TypeError, hasattr)
    self.assertEqual(False, hasattr(sys, chr(sys.maxunicode)))

    # Check that hasattr propagates all exceptions outside of
    # AttributeError.
    class A:
        def __getattr__(self, what):
            raise SystemExit
    self.assertRaises(SystemExit, hasattr, A(), "b")

    class B:
        def __getattr__(self, what):
            raise ValueError
    self.assertRaises(ValueError, hasattr, B(), "b")
def get_cext_path(dist_path):
    """ Get the directory of dist/cext. """
    # Python version of the current platform
    python_version = 'cp' + str(sys.version_info.major) + str(sys.version_info.minor)
    dirname = os.path.join(dist_path, 'cext/' + python_version)
    platform = util.get_platform()
    # For Linux systems with cpython < 3.3, there can be ABI tags 'm' and 'mu'
    if 'linux' in platform and int(python_version[2:]) < 33:
        dirname = os.path.join(dirname, 'linux')
        if sys.maxunicode == 65535:
            # narrow build: encoded with UCS-2
            dirname = os.path.join(dirname, python_version + 'm')
        else:
            # wide build: encoded with UCS-4
            dirname = os.path.join(dirname, python_version + 'mu')
    elif 'macosx' in platform:
        platform = 'macosx'
        dirname = os.path.join(dirname, platform)
    sys.path = [os.path.realpath(str(dirname))] + sys.path
def test_bug1251300(self):
    # Decoding with unicode_internal used to not correctly handle "code
    # points" above 0x10ffff on UCS-4 builds.
    if sys.maxunicode > 0xffff:
        ok = [
            ("\x00\x10\xff\xff", u"\U0010ffff"),
            ("\x00\x00\x01\x01", u"\U00000101"),
            ("", u""),
        ]
        not_ok = [
            "\x7f\xff\xff\xff",
            "\x80\x00\x00\x00",
            "\x81\x00\x00\x00",
            "\x00",
            "\x00\x00\x00\x00\x00",
        ]
        for internal, uni in ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertEqual(uni, internal.decode("unicode_internal"))
        for internal in not_ok:
            if sys.byteorder == "little":
                internal = "".join(reversed(internal))
            self.assertRaises(UnicodeDecodeError, internal.decode, "unicode_internal")
def test_utf8_decode_valid_sequences(self):
    sequences = [
        # single byte
        ('\x00', u'\x00'), ('a', u'a'), ('\x7f', u'\x7f'),
        # 2 bytes
        ('\xc2\x80', u'\x80'), ('\xdf\xbf', u'\u07ff'),
        # 3 bytes
        ('\xe0\xa0\x80', u'\u0800'), ('\xed\x9f\xbf', u'\ud7ff'),
        ('\xee\x80\x80', u'\uE000'), ('\xef\xbf\xbf', u'\uffff'),
        # 4 bytes
        ('\xF0\x90\x80\x80', u'\U00010000'),
        ('\xf4\x8f\xbf\xbf', u'\U0010FFFF')
    ]
    for seq, res in sequences:
        self.assertEqual(seq.decode('utf-8'), res)

    for ch in map(unichr, range(0, sys.maxunicode)):
        self.assertEqual(ch, ch.encode('utf-8').decode('utf-8'))
def test_hasattr(self):
    import sys
    self.assertTrue(hasattr(sys, 'stdout'))
    self.assertRaises(TypeError, hasattr, sys, 1)
    self.assertRaises(TypeError, hasattr)
    if have_unicode:
        self.assertRaises(UnicodeError, hasattr, sys, unichr(sys.maxunicode))

    # Check that hasattr lets SystemExit and KeyboardInterrupt propagate
    class A:
        def __getattr__(self, what):
            raise KeyboardInterrupt
    self.assertRaises(KeyboardInterrupt, hasattr, A(), "b")

    class B:
        def __getattr__(self, what):
            raise SystemExit
    self.assertRaises(SystemExit, hasattr, B(), "b")
def test_backslashescape(self):
    # Does the same as the "unicode-escape" encoding, but with different
    # base encodings.
    sin = u"a\xac\u1234\u20ac\u8000"
    if sys.maxunicode > 0xffff:
        sin += unichr(sys.maxunicode)
    sout = "a\\xac\\u1234\\u20ac\\u8000"
    if sys.maxunicode > 0xffff:
        sout += "\\U%08x" % sys.maxunicode
    self.assertEqual(sin.encode("ascii", "backslashreplace"), sout)

    sout = "a\xac\\u1234\\u20ac\\u8000"
    if sys.maxunicode > 0xffff:
        sout += "\\U%08x" % sys.maxunicode
    self.assertEqual(sin.encode("latin-1", "backslashreplace"), sout)

    sout = "a\xac\\u1234\xa4\\u8000"
    if sys.maxunicode > 0xffff:
        sout += "\\U%08x" % sys.maxunicode
    self.assertEqual(sin.encode("iso-8859-15", "backslashreplace"), sout)
def _test_matching_pattern(self, pattern, isvalidchar, unicode=False):
    r = unicode_regex(pattern) if unicode else ascii_regex(pattern)

    codepoints = six.moves.range(0, sys.maxunicode + 1) \
        if unicode else six.moves.range(1, 128)

    for c in [six.unichr(x) for x in codepoints]:
        if isvalidchar(c):
            assert r.match(c), (
                '"%s" supposed to match "%s" (%r, category "%s"), '
                "but it doesn't" % (pattern, c, c, unicodedata.category(c))
            )
        else:
            assert not r.match(c), (
                '"%s" supposed not to match "%s" (%r, category "%s"), '
                'but it does' % (pattern, c, c, unicodedata.category(c))
            )
def test_hasattr(self):
    self.assertTrue(hasattr(sys, 'stdout'))
    self.assertRaises(TypeError, hasattr, sys, 1)
    self.assertRaises(TypeError, hasattr)
    # Fails on Py2:
    # self.assertEqual(False, hasattr(sys, chr(sys.maxunicode)))

    # Check that hasattr propagates all exceptions outside of
    # AttributeError.
    class A(object):
        def __getattr__(self, what):
            raise SystemExit
    self.assertRaises(SystemExit, hasattr, A(), "b")

    class B(object):
        def __getattr__(self, what):
            raise ValueError
    # Was: self.assertRaises(ValueError, hasattr, B(), "b")  # Fails on Py2
def get_abi_tag():
    """Return the ABI tag based on SOABI (if available) or emulate SOABI
    (CPython 2, PyPy)."""
    soabi = get_config_var('SOABI')
    impl = get_abbr_impl()
    if not soabi and impl in ('cp', 'pp') and hasattr(sys, 'maxunicode'):
        d = ''
        m = ''
        u = ''
        if get_flag('Py_DEBUG',
                    lambda: hasattr(sys, 'gettotalrefcount'),
                    warn=(impl == 'cp')):
            d = 'd'
        if get_flag('WITH_PYMALLOC',
                    lambda: impl == 'cp',
                    warn=(impl == 'cp')):
            m = 'm'
        if get_flag('Py_UNICODE_SIZE',
                    lambda: sys.maxunicode == 0x10ffff,
                    expected=4,
                    warn=(impl == 'cp' and sys.version_info < (3, 3))) \
                and sys.version_info < (3, 3):
            u = 'u'
        abi = '%s%s%s%s%s' % (impl, get_impl_ver(), d, m, u)
    elif soabi and soabi.startswith('cpython-'):
        abi = 'cp' + soabi.split('-')[1]
    elif soabi:
        abi = soabi.replace('.', '_').replace('-', '_')
    else:
        abi = None
    return abi
def get_abi_tag():
    """Return the ABI tag based on SOABI (if available) or emulate SOABI
    (CPython 2, PyPy)."""
    soabi = get_config_var('SOABI')
    impl = get_abbr_impl()
    if not soabi and impl in {'cp', 'pp'} and hasattr(sys, 'maxunicode'):
        d = ''
        m = ''
        u = ''
        if get_flag('Py_DEBUG',
                    lambda: hasattr(sys, 'gettotalrefcount'),
                    warn=(impl == 'cp')):
            d = 'd'
        if get_flag('WITH_PYMALLOC',
                    lambda: impl == 'cp',
                    warn=(impl == 'cp')):
            m = 'm'
        if get_flag('Py_UNICODE_SIZE',
                    lambda: sys.maxunicode == 0x10ffff,
                    expected=4,
                    warn=(impl == 'cp' and sys.version_info < (3, 3))) \
                and sys.version_info < (3, 3):
            u = 'u'
        abi = '%s%s%s%s%s' % (impl, get_impl_ver(), d, m, u)
    elif soabi and soabi.startswith('cpython-'):
        abi = 'cp' + soabi.split('-')[1]
    elif soabi:
        abi = soabi.replace('.', '_').replace('-', '_')
    else:
        abi = None
    return abi
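Both variants above emulate the SOABI string when the interpreter does not expose one (CPython 2, PyPy); on newer CPythons the SOABI branch applies directly, and the wide/narrow distinction is what sys.maxunicode contributes to the tag. For orientation, a usage sketch; the tag values in the comments are typical of such builds, not guaranteed:

# Illustrative outputs (actual values depend on the interpreter and build):
#   CPython 2.7, pymalloc, wide (UCS-4) build   -> 'cp27mu' (emulated branch)
#   CPython 2.7, pymalloc, narrow (UCS-2) build -> 'cp27m'
#   CPython 3.9, SOABI 'cpython-39-x86_64-linux-gnu' -> 'cp39'
print(get_abi_tag())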
def pinyinify(string):
    # TODO: Use a static file instead of constructing the table in real time
    table = dict()
    for i in range(sys.maxunicode):
        # Map punctuation (P), symbols (S), separators (Z) and
        # control/other (C) characters to '-'
        if re.match('P|S|Z|C', unicodedata.category(chr(i))) is not None:
            table[i] = '-'
    string = string.translate(table)
    for char in [x for x in string if unicodedata.name(x).startswith('CJK')]:
        string = string.replace(char, pinyin.get(char, format='strip') + '-')
    string = re.sub(r'-+', '-', string)
    return pinyin.get(string, delimiter='', format='strip').lower()
def unirange(a, b):
    """Returns a regular expression string to match the given non-BMP range."""
    if b < a:
        raise ValueError("Bad character range")
    if a < 0x10000 or b < 0x10000:
        raise ValueError("unirange is only defined for non-BMP ranges")

    if sys.maxunicode > 0xffff:
        # wide build
        return u'[%s-%s]' % (unichr(a), unichr(b))
    else:
        # narrow build stores surrogates, and the 're' module handles them
        # (incorrectly) as characters. Since there is still ordering among
        # these characters, expand the range to one that it understands. Some
        # background in http://bugs.python.org/issue3665 and
        # http://bugs.python.org/issue12749
        #
        # Additionally, the lower constants are using unichr rather than
        # literals because jython [which uses the wide path] can't load this
        # file if they are literals.
        ah, al = _surrogatepair(a)
        bh, bl = _surrogatepair(b)
        if ah == bh:
            return u'(?:%s[%s-%s])' % (unichr(ah), unichr(al), unichr(bl))
        else:
            buf = []
            buf.append(u'%s[%s-%s]' % (unichr(ah), unichr(al),
                                       ah == bh and unichr(bl) or unichr(0xdfff)))
            if bh - ah > 1:
                buf.append(u'[%s-%s][%s-%s]' % (unichr(ah + 1), unichr(bh - 1),
                                                unichr(0xdc00), unichr(0xdfff)))
            if ah != bh:
                buf.append(u'%s[%s-%s]' % (unichr(bh), unichr(0xdc00), unichr(bl)))
            return u'(?:' + u'|'.join(buf) + u')'
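On a wide build, unirange collapses to a single character class. A small usage sketch, assuming the surrounding module defines _surrogatepair and a Python 2-style unichr (the function predates Python 3):

import re
import sys

if sys.maxunicode > 0xffff:
    # Wide build: a plain character class covering the astral range.
    pattern = unirange(0x1F600, 0x1F64F)   # the Emoticons block
    assert pattern == u'[\U0001F600-\U0001F64F]'
    assert re.match(pattern, u'\U0001F600')
    assert not re.match(pattern, u'A')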
def test_unescape(self):
    self.assertEqual(unescape('&quot;'), '"')
    self.assertEqual(unescape('&amp;'), '&')
    self.assertEqual(unescape('&#x3042;'), '\u3042')
    if sys.maxunicode > 0xFFFF:
        # Python 3 or UCS-4 build of Python 2
        self.assertEqual(unescape('&#x1D546;'), '\U0001D546')
        self.assertEqual(unescape('&#x1d4c1;'), '\U0001d4c1')
    else:
        # UCS-2 build of Python 2: astral entities stay unexpanded
        self.assertEqual(unescape('&#x1D546;'), '&#x1D546;')
        self.assertEqual(unescape('&#x1d4c1;'), '&#x1d4c1;')
def test_conjunctions(self):
    n_code_points = len(set.union(*[
        v for k, v in UNICODE_CATEGORIES.items() if len(k) > 1
    ]))
    self.assertTrue(
        n_code_points == maxunicode + 1,
        "The Unicode categories have a wrong number of elements: %d (!= %d)"
        % (n_code_points, maxunicode + 1)
    )
def test_max_value(self):
    max_code_point = max([max(s) for s in UNICODE_CATEGORIES.values()])
    self.assertTrue(
        max_code_point <= maxunicode,
        "The Unicode categories have a code point greater than %d: %d"
        % (maxunicode, max_code_point)
    )
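The two tests above check that UNICODE_CATEGORIES, defined elsewhere in the project under test, covers the full code point space without exceeding sys.maxunicode. As a sketch of how such a table could be built with the standard library (an assumption about its shape, not necessarily how the project builds it):

import sys
import unicodedata
from collections import defaultdict

def build_unicode_categories():
    """Map category names to sets of code points: two-letter subcategories
    (e.g. 'Lu', 'Nd') plus one-letter majors as unions of their children."""
    categories = defaultdict(set)
    for cp in range(sys.maxunicode + 1):
        cat = unicodedata.category(chr(cp))   # 'Cn' for unassigned code points
        categories[cat].add(cp)               # subcategory, e.g. 'Lu'
        categories[cat[0]].add(cp)            # major category, e.g. 'L'
    return dict(categories)

UNICODE_CATEGORIES = build_unicode_categories()

# Every code point falls in exactly one subcategory, so the subcategories
# together cover maxunicode + 1 code points, as test_conjunctions expects.
subcats = [v for k, v in UNICODE_CATEGORIES.items() if len(k) > 1]
assert len(set.union(*subcats)) == sys.maxunicode + 1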