我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用unicodedata.lookup()。
def cleanup_command_line():
    """Replace typographic dashes and quotes in ``sys.argv`` with ASCII.

    Does nothing when ``sys.stdin.encoding`` is unset or is ``'ascii'``.
    Otherwise every argument is decoded with that encoding, the characters
    listed below are substituted, and the result is encoded back in place.
    """
    encoding = sys.stdin.encoding
    if not encoding or encoding == 'ascii':
        return

    # Unicode character name -> plain ASCII stand-in.
    replacements = {
        'EN DASH': '-',
        'EM DASH': '--',
        'LEFT DOUBLE QUOTATION MARK': '"',
        'RIGHT DOUBLE QUOTATION MARK': '"',
        'LEFT SINGLE QUOTATION MARK': "'",
        'RIGHT SINGLE QUOTATION MARK': "'",
    }

    for index, raw_arg in enumerate(sys.argv):
        # Work on a unicode copy of the raw argument.
        text = unicode(raw_arg, encoding)
        for char_name, stand_in in replacements.iteritems():
            text = text.replace(unicodedata.lookup(char_name), stand_in)
        # Open question from the original author: should this use
        # 'strict' instead of 'replace'?
        sys.argv[index] = text.encode(encoding, 'replace')
def parse_repl_named_char(source):
    """Parse a ``{NAME}`` named character in a replacement string.

    Returns the code point of the named character.  When the text at the
    current position is not a complete ``{NAME}`` group, ``source.pos`` is
    restored and None is returned.  Raises ``error`` when the name is not
    known to ``unicodedata``.
    """
    start = source.pos

    def rewind():
        # Undo whatever was consumed and report "no named character here".
        source.pos = start
        return None

    if not source.match("{"):
        return rewind()

    name = source.get_while(ALPHA | set(" "))
    if not source.match("}"):
        return rewind()

    try:
        return ord(unicodedata.lookup(name))
    except KeyError:
        raise error("undefined character name", source.string, source.pos)
def unicode_name_matches(self, text):
    u"""Complete a backslash-prefixed Unicode character name.

    The text after the last backslash is looked up as a Unicode character
    name, e.g. ``\\GREEK SMALL LETTER ETA`` yields the character U+03B7.

    A match is reported only when the character is valid inside a Python 3
    identifier (combining characters included, since the check is made on
    ``'a' + char``).  Returns ``(matched_text, [char])`` on success and
    ``(u'', [])`` otherwise.
    """
    _, backslash, name = text.rpartition('\\')
    if not backslash:
        return u'', []

    try:
        char = unicodedata.lookup(name)
    except KeyError:
        return u'', []

    # Prefixing 'a' lets combining characters pass the identifier check.
    if ('a' + char).isidentifier():
        return '\\' + name, [char]
    return u'', []
def _token_splittable(token): """ Predicate for whether a token name can be split into multiple tokens. A token is splittable if it does not contain an underscore character and it is not the name of a Greek letter. This is used to implicitly convert expressions like 'xyz' into 'x*y*z'. """ if '_' in token: return False else: try: return not unicodedata.lookup('GREEK SMALL LETTER ' + token) except KeyError: pass if len(token) > 1: return True return False
def __init__(self, msg='', maxspin=0, minspin=10, speed=5):
    """Set up the spinner state and initialise the underlying thread.

    msg     -- message stored in ``self.msg`` (to be printed first, if any)
    maxspin -- stored as ``self.max``
    minspin -- stored as ``self.min``
    speed   -- number of spins per second; used to derive the wait time
    """
    # Counters, flags and limits
    self.count = 0
    self.flag = False
    self.max = maxspin
    self.min = minspin

    # Output stream, optional leading message, and the complete printed
    # string built so far
    self.out = sys.stdout
    self.msg = msg
    self.string = ''

    # Each spin consists of four characters, hence the factor of 4.
    steps_per_second = float(speed * 4)
    self.waittime = 1.0 / steps_per_second

    # The unicode dash character does not show up properly in the Windows
    # console, so it is only used on posix systems.
    dash = unicodedata.lookup('FIGURE DASH') if os.name == 'posix' else u'-'
    self.spinchars = (dash, u'\\ ', u'| ', u'/ ')

    threading.Thread.__init__(self, None, None, "Spin Thread")
def escape(m):
    """Convert one matched backslash escape into the character it denotes.

    ``m`` is a regex match: group 0 is the whole escape (it must start with
    a backslash) and group 1 is the text following the backslash.  Simple
    escapes come from ``simple_escapes``; ``x``, ``u`` and ``U`` escapes are
    decoded via ``convert_hex``; ``N`` escapes are looked up by character
    name; anything else is treated as an octal escape.

    Raises ValueError for an unknown character name or invalid octal text.
    """
    whole, tail = m.group(0, 1)
    assert whole.startswith("\\")

    simple = simple_escapes.get(tail)
    if simple is not None:
        return simple

    if tail.startswith("x"):
        return chr(convert_hex(tail, 2))
    if tail.startswith('u'):
        return unichr(convert_hex(tail, 4))
    if tail.startswith('U'):
        return unichr(convert_hex(tail, 8))

    if tail.startswith('N'):
        import unicodedata
        # Drop the first and last characters of the tail to get the name.
        char_name = tail[1:-1]
        try:
            return unicodedata.lookup(char_name)
        except KeyError:
            raise ValueError("undefined character name %r" % char_name)

    try:
        return chr(int(tail, 8))
    except ValueError:
        raise ValueError("invalid octal string escape ('\\%s')" % tail)
def test_aliases(self):
    # Verify that the aliases from NameAliases.txt are usable.
    # The table must be updated when new aliases are added, or the file
    # should be downloaded and parsed instead.  See #12753.
    alias_table = (
        ('LATIN CAPITAL LETTER GHA', 0x01A2),
        ('LATIN SMALL LETTER GHA', 0x01A3),
        ('KANNADA LETTER LLLA', 0x0CDE),
        ('LAO LETTER FO FON', 0x0E9D),
        ('LAO LETTER FO FAY', 0x0E9F),
        ('LAO LETTER RO', 0x0EA3),
        ('LAO LETTER LO', 0x0EA5),
        ('TIBETAN MARK BKA- SHOG GI MGO RGYAN', 0x0FD0),
        ('YI SYLLABLE ITERATION MARK', 0xA015),
        ('PRESENTATION FORM FOR VERTICAL RIGHT WHITE LENTICULAR BRACKET',
         0xFE18),
        ('BYZANTINE MUSICAL SYMBOL FTHORA SKLIRON CHROMA VASIS', 0x1D0C5),
    )
    for alias_name, code in alias_table:
        self.checkletter(alias_name, chr(code))
        # The alias must differ from the primary name yet resolve to the
        # same character.
        primary_name = unicodedata.name(chr(code))
        self.assertNotEqual(primary_name, alias_name)
        self.assertEqual(unicodedata.lookup(alias_name),
                         unicodedata.lookup(primary_name))
        # The 3.2.0 database is expected to reject the alias.
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(alias_name)
def test_named_sequences_full(self):
    # Verify every named sequence listed in NamedSequences.txt.
    version = unicodedata.unidata_version
    url = "http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" % version
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (IOError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)

    for raw_line in testdata:
        entry = raw_line.strip()
        # Only non-blank, non-comment lines carry data.
        if entry and not entry.startswith('#'):
            seq_name, hex_codes = entry.split(';')
            expected = ''.join([chr(int(h, 16)) for h in hex_codes.split()])
            self.assertEqual(unicodedata.lookup(seq_name), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seq_name, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seq_name)
def start_unichar(self, attr):
    """Handle a ``<unichar/>`` tag given its attribute mapping ``attr``.

    The character is taken from the ``name`` attribute (a Unicode
    character name) or, failing that, from the ``code`` attribute (an
    expression evaluating to a code point).  Invalid input is reported via
    ``self._syntax_error`` and a NUL character is used instead.  Any
    resulting character is passed to ``self.handle_data`` and the tag is
    pushed as self-closing.
    """
    if 'name' in attr:
        if 'code' in attr:
            self._syntax_error('<unichar/> invalid with both name and code attributes')
        try:
            v = unicodedata.lookup(attr['name'])
        except KeyError:
            self._syntax_error('<unichar/> invalid name attribute\n"%s"' % ascii(attr['name']))
            v = '\0'
    elif 'code' in attr:
        try:
            # NOTE(review): eval() runs arbitrary code taken from the
            # attribute text.  If markup can come from an untrusted source
            # this needs a restricted parser; left as-is to keep the
            # accepted syntax unchanged.
            v = int(eval(attr['code']))
            v = chr(v) if isPy3 else unichr(v)
        except Exception:
            # Was a bare ``except:``, which also swallowed
            # KeyboardInterrupt and SystemExit.
            self._syntax_error('<unichar/> invalid code attribute %s' % ascii(attr['code']))
            v = '\0'
    else:
        v = None
        if attr:
            self._syntax_error('<unichar/> invalid attribute %s' % list(attr.keys())[0])

    if v is not None:
        self.handle_data(v)
    self._push('unichar', _selfClosingTag='unichar')
def unicode_name_matches(self, text):
    u"""Match a LaTeX-like ``\\NAME`` reference to a Unicode character.

    Whatever follows the last backslash in ``text`` is treated as a Unicode
    character name; for instance \\GREEK SMALL LETTER ETA resolves to the
    character U+03B7.

    Only characters usable in a Python 3 identifier are accepted; the test
    is done on ``'a' + char`` so combining characters qualify as well.
    Returns ``(matched_text, [char])`` or ``(u'', [])`` when nothing
    matches.
    """
    no_match = (u'', [])

    _, backslash, name = text.rpartition('\\')
    if not backslash:
        return no_match

    try:
        char = unicodedata.lookup(name)
    except KeyError:
        return no_match

    # allow combining chars
    if not ('a' + char).isidentifier():
        return no_match
    return '\\' + name, [char]
def test_named_sequences_full(self):
    # Verify every named sequence listed in NamedSequences.txt.
    version = unicodedata.unidata_version
    url = "http://www.pythontest.net/unicode/%s/NamedSequences.txt" % version
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (OSError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)

    for raw_line in testdata:
        entry = raw_line.strip()
        # Only non-blank, non-comment lines carry data.
        if entry and not entry.startswith('#'):
            seq_name, hex_codes = entry.split(';')
            expected = ''.join([chr(int(h, 16)) for h in hex_codes.split()])
            self.assertEqual(unicodedata.lookup(seq_name), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seq_name, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seq_name)
def _greekletters(letterlist):
    """Append Greek-letter macros to ``_default_macro_list``.

    For each name in ``letterlist`` two ``(macro_name, character)`` pairs
    are appended: the name as given, mapped to the small letter (or its
    symbol variant for epsilon and phi), and the name with its first
    character upper-cased, mapped to the capital letter.
    """
    # Small-letter names that do not follow the regular pattern.
    irregular_small = {
        'EPSILON': "GREEK LUNATE EPSILON SYMBOL",
        'PHI': "GREEK PHI SYMBOL",
    }

    for letter in letterlist:
        upper_name = letter.upper()
        # Unicode spells this letter "LAMDA".
        if upper_name == 'LAMBDA':
            upper_name = 'LAMDA'

        small_name = irregular_small.get(upper_name,
                                         "GREEK SMALL LETTER " + upper_name)
        _default_macro_list.append((letter, unicodedata.lookup(small_name)))

        capitalised = letter[0].upper() + letter[1:]
        capital_name = "GREEK CAPITAL LETTER " + upper_name
        _default_macro_list.append(
            (capitalised, unicodedata.lookup(capital_name)))
def _check_files():
    """Create the triggers JSON file, or migrate an existing one in place.

    When ``TRIGGERS_PATH`` does not hold valid JSON it is written with
    ``DEFAULT_SETTINGS``.  Otherwise each entry in the lists under
    ``text_triggers`` and ``user_triggers`` is passed to ``lookup``;
    entries it accepts are replaced by its result, entries raising
    KeyError are kept unchanged, and the file is saved back.
    """
    if not dataIO.is_valid_json(TRIGGERS_PATH):
        _LOGGER.info("Creating json: " + TRIGGERS_PATH)
        dataIO.save_json(TRIGGERS_PATH, DEFAULT_SETTINGS)
        return

    # Backwards compatibility check
    triggers = dataIO.load_json(TRIGGERS_PATH)
    for section in ('text_triggers', 'user_triggers'):
        for key, emoji_list in triggers[section].items():
            for idx, emoji in enumerate(emoji_list):
                try:
                    converted = lookup(emoji)
                except KeyError:
                    continue
                emoji_list[idx] = converted
            triggers[section][key] = emoji_list
    dataIO.save_json(TRIGGERS_PATH, triggers)
def test_named_sequences_full(self):
    # Verify every named sequence listed in NamedSequences.txt.
    version = unicodedata.unidata_version
    url = "http://www.unicode.org/Public/%s/ucd/NamedSequences.txt" % version
    try:
        testdata = support.open_urlresource(url, encoding="utf-8",
                                            check=check_version)
    except (OSError, HTTPException):
        self.skipTest("Could not retrieve " + url)
    self.addCleanup(testdata.close)

    for raw_line in testdata:
        entry = raw_line.strip()
        # Only non-blank, non-comment lines carry data.
        if entry and not entry.startswith('#'):
            seq_name, hex_codes = entry.split(';')
            expected = ''.join([chr(int(h, 16)) for h in hex_codes.split()])
            self.assertEqual(unicodedata.lookup(seq_name), expected)
            with self.assertRaises(SyntaxError):
                self.checkletter(seq_name, None)
            with self.assertRaises(KeyError):
                unicodedata.ucd_3_2_0.lookup(seq_name)
def insert_accented(self, c, accent):
    """Insert the accented form of letter ``c`` at the insertion cursor.

    ``accent`` is the accent part of a Unicode character name, combined as
    "latin <capital|small> letter <c> with <accent>".

    Returns "break" when an accented character was inserted, so that Tk
    stops handling the event and the plain letter is not inserted too.
    Returns None when ``c`` is not alphabetic or no such character exists.
    """
    if not c.isalpha():
        return None

    case = 'capital' if c.isupper() else 'small'
    try:
        accented = lookup("latin %s letter %c with %s" % (case, c, accent))
    except KeyError:
        # No such accented letter; let the plain key press go through.
        # (Was ``except KeyError, e:`` -- Python-2-only syntax with an
        # unused variable.)
        return None

    self.insert(INSERT, accented)
    # Prevent plain letter from being inserted too, tell Tk to
    # stop handling this event
    return "break"
def expand_unicode(s):
    """
    Convert a Unicode reference into a Unicode string.

    Accepted forms are ``\\uXXXX`` / ``\\UXXXXXXXX`` (hexadecimal code
    point) and ``\\N{NAME}`` (Unicode character name).  Any other string
    is returned unchanged.

    Raises ValueError when the hexadecimal digits are invalid and
    ConfigError when the character name is unknown.
    """
    if s.startswith((r'\u', r'\U')):
        # Skip the two-character prefix: int() cannot parse the backslash
        # and letter, so passing the whole string always raised ValueError.
        return chr(int(s[2:], 16))
    if s.startswith(r'\N{'):
        # Drop the leading '\N{' and the final character (the closing
        # brace).
        name = s[3:-1]
        try:
            return unicodedata.lookup(name)
        except KeyError:
            raise ConfigError("Failed to find unicode value with name {}\n".format(name))
    return s
def u(s):
    """Build a Unicode string from ``s``, expanding Unicode escapes.

    Intended to behave like ``u'<string>'`` in Python 2.x, although it is
    not completely robust because it relies on a simple set of regexps.
    Matches of ``_U16_RE``, ``_U32_RE`` and ``_UNAME_RE`` are substituted,
    in that order.
    """
    def from_hex(match):
        # Hex code point captured in the 'hexval' group.
        return unichr(int(match.group('hexval'), 16))

    def from_name(match):
        # Character name captured in the 'name' group.
        return unicodedata.lookup(match.group('name'))

    text = unicode(s)
    substitutions = (
        (_U16_RE, from_hex),
        (_U32_RE, from_hex),
        (_UNAME_RE, from_name),
    )
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)
    return text
def dia_to_unicode(s):
    """
    Convert CELEX diacritic encodings in a string to Unicode.

    A diacritic marker applies to the next ordinary character: the marker
    is remembered, and the matching combining character is emitted right
    after that character.  The result is NFC-normalised.

    Parameters
    ----------
    s : string
        A string containing CELEX diacritics (see
        CELEX/english/eol/README for details)

    Returns
    -------
    s : string
        The corresponding unicode string
    """
    # CELEX marker -> name of the Unicode combining character.
    marker_names = {
        "#": "COMBINING ACUTE ACCENT",
        "`": "COMBINING GRAVE ACCENT",
        '"': "COMBINING DIAERESIS",
        "^": "COMBINING CIRCUMFLEX ACCENT",
        ",": "COMBINING CEDILLA",
        "~": "COMBINING TILDE",
        "@": "COMBINING RING ABOVE"}

    pending = None
    pieces = []
    for ch in s:
        combining_name = marker_names.get(ch)
        if combining_name is not None:
            pending = unicodedata.lookup(combining_name)
            continue
        pieces.append(ch)
        # Attach the remembered diacritic to the character just added.
        if pending:
            pieces.append(pending)
            pending = None

    # Join and normalise the characters.
    return unicodedata.normalize("NFC", "".join(pieces))
def parse_named_char(source, info, in_set):
    """Parse a ``{NAME}`` named character.

    Returns the character built by ``make_character`` for the named code
    point.  When the text at the current position is not a complete
    ``{NAME}`` group, ``source.pos`` is restored and a character for the
    letter "N" is returned instead.  Raises ``error`` for an unknown name.
    """
    start = source.pos

    def literal_n():
        # Undo whatever was consumed and fall back to a plain "N".
        source.pos = start
        return make_character(info, ord("N"), in_set)

    if not source.match("{"):
        return literal_n()

    name = source.get_while(NAMED_CHAR_PART)
    if not source.match("}"):
        return literal_n()

    try:
        return make_character(info, ord(unicodedata.lookup(name)), in_set)
    except KeyError:
        raise error("undefined character name", source.string, source.pos)
def unicode_name(self, name):
    """Return the text to insert for the Unicode character called ``name``.

    Code points up to 0xFF are rendered as a backslash followed by three
    octal digits; larger ones are returned via ``compat.uchr``.
    """
    code_point = ord(unicodedata.lookup(name))
    if code_point > 0xFF:
        return compat.uchr(code_point)
    return '\\%03o' % code_point
def U(name):
    """Return the unicode character called ``name``, or None if unknown.

    An unknown name also appends a message to the module-level
    ``unicode_warnings`` string.
    """
    global unicode_warnings
    try:
        return unicodedata.lookup(name)
    except KeyError:
        unicode_warnings += 'No \'%s\' in unicodedata\n' % name
        return None
def test_ascii_letters(self):
    """Check the lookup()/name() round trip for every letter 'a'..'z'."""
    import unicodedata

    # range() excludes its upper bound, so add 1 to cover 'z' as well
    # (the original loop stopped at 'y').
    for code_point in range(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
def test_bmp_characters(self):
    """Check that lookup() inverts name() for every named BMP character."""
    import unicodedata

    count = 0
    for code in range(0x10000):
        char = chr(code)
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)
            count += 1
    # The counter was accumulated but never checked; without this the
    # test would pass vacuously if name() returned None for everything.
    self.assertTrue(count > 0)
def test_errors(self):
    """Check the exceptions raised by name() and lookup() on bad input."""
    import unicodedata

    # (expected exception, callable, positional arguments)
    cases = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, ('xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, ('unknown',)),
    )
    for expected, func, args in cases:
        self.assertRaises(expected, func, *args)
def test_ascii_letters(self):
    """Check the lookup()/name() round trip for every letter 'a'..'z'."""
    import unicodedata

    # xrange() excludes its upper bound, so add 1 to cover 'z' as well
    # (the original loop stopped at 'y').
    for code_point in xrange(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
def test_bmp_characters(self):
    """Check that lookup() inverts name() for every named BMP character."""
    import unicodedata

    count = 0
    for code in xrange(0x10000):
        char = unichr(code)
        name = unicodedata.name(char, None)
        if name is not None:
            self.assertEqual(unicodedata.lookup(name), char)
            count += 1
    # The counter was accumulated but never checked; without this the
    # test would pass vacuously if name() returned None for everything.
    self.assertTrue(count > 0)
def test_errors(self):
    """Check the exceptions raised by name() and lookup() on bad input."""
    import unicodedata

    # (expected exception, callable, positional arguments)
    cases = (
        (TypeError, unicodedata.name, ()),
        (TypeError, unicodedata.name, (u'xx',)),
        (TypeError, unicodedata.lookup, ()),
        (KeyError, unicodedata.lookup, (u'unknown',)),
    )
    for expected, func, args in cases:
        self.assertRaises(expected, func, *args)
def cmd_do(expr, msg=None):
    """Answer a "do" command.

    ``expr`` is matched case-insensitively, in this order:

    1. a key of the canned ``actions`` table -> its text;
    2. ``help`` -> the comma-separated list of action names;
    3. a Unicode character name -> that character;
    4. anything of at most 10 characters -> the comma-separated Unicode
       names of the characters as originally typed;
    5. otherwise -> 'Something happened.'

    ``msg`` is accepted but not used here.
    """
    # NOTE(review): many of these literals contain '?' where non-ASCII
    # characters were presumably lost to an encoding problem -- confirm
    # against the upstream source before relying on them.
    actions = collections.OrderedDict((
        ('shrug', '¯\\_(?)_/¯'),
        ('lenny', '( ?° ?? ?°)'),
        ('flip', '??°?°??? ???'),
        ('homo', '?????o???'),
        ('look', '?_?'),
        ('cn', '[citation needed]'),
        ('boom', '??'),
        ('tweet', '??'),
        ('blink', '??'),
        ('see-no-evil', '??'),
        ('hear-no-evil', '??'),
        ('speak-no-evil', '??'),
        ('evil', '??????'),
        ('table', '(?>_<)?</?lq??>'),
        ('release-upgrade', '????'),
        ('however', ('???????????\n??????????\n'
                     'Something happened\n???????\n'
                     '?????????????\n???????\n???????')),
        ('mac', ('?????\n????\n???????\n????\n'
                 '?????\n??????\n??\n????'))
    ))
    origexpr = expr
    expr = expr.lower()

    res = actions.get(expr)
    if res:
        return res
    if expr == 'help':
        return ', '.join(actions.keys())

    try:
        return unicodedata.lookup(expr)
    except KeyError:
        pass

    if len(expr) <= 10:
        # unicodedata.name() raises ValueError for characters without a
        # name (e.g. control characters); fall back to the U+XXXX form
        # instead of crashing.
        return ', '.join(
            unicodedata.name(ch, 'U+%04X' % ord(ch)) for ch in origexpr)
    return 'Something happened.'
def _get_base_character(c):
    """Return the character named by the part of c's name before ' WITH '.

    When the Unicode name of ``c`` contains no ' WITH ', the whole name is
    looked up again.
    """
    full_name = unicodedata.name(unicode(c))
    # partition() returns the whole name as the head when ' WITH ' is
    # absent.
    base_name, _, _ = full_name.partition(' WITH ')
    return unicodedata.lookup(base_name)
def test_ascii_letters(self):
    """Check the lookup()/name() round trip for every letter 'a'..'z'."""
    # range() excludes its upper bound, so add 1 to cover 'z' as well
    # (the original loop stopped at 'y').
    for code_point in range(ord("a"), ord("z") + 1):
        char = chr(code_point)
        name = "LATIN SMALL LETTER %s" % char.upper()
        code = unicodedata.lookup(name)
        self.assertEqual(unicodedata.name(code), name)
def test_bmp_characters(self):
    """Check that lookup() inverts name() for every named BMP character."""
    for char in map(chr, range(0x10000)):
        char_name = unicodedata.name(char, None)
        if char_name is None:
            # Unnamed code point: nothing to round-trip.
            continue
        self.assertEqual(unicodedata.lookup(char_name), char)
def test_named_sequences_sample(self):
    # Spot-check a handful of named sequences.  See #12753.
    samples = (
        ('LATIN SMALL LETTER R WITH TILDE', '\u0072\u0303'),
        ('TAMIL SYLLABLE SAI', '\u0BB8\u0BC8'),
        ('TAMIL SYLLABLE MOO', '\u0BAE\u0BCB'),
        ('TAMIL SYLLABLE NNOO', '\u0BA3\u0BCB'),
        ('TAMIL CONSONANT KSS', '\u0B95\u0BCD\u0BB7\u0BCD'),
    )
    for seq_name, expected in samples:
        self.assertEqual(unicodedata.lookup(seq_name), expected)
        with self.assertRaises(SyntaxError):
            self.checkletter(seq_name, None)
        # The 3.2.0 database is expected to reject the sequence name.
        with self.assertRaises(KeyError):
            unicodedata.ucd_3_2_0.lookup(seq_name)
def test_unicode(self):
    # See GH 6885 - get_dummies chokes on unicode values
    import unicodedata

    plain_e = 'e'
    e_acute = unicodedata.lookup('LATIN SMALL LETTER E WITH ACUTE')
    values = [plain_e, e_acute, e_acute]

    result = get_dummies(values, prefix='letter', sparse=self.sparse)

    # One indicator column per distinct value, keyed by row position.
    expected_columns = {
        'letter_e': {0: 1.0, 1: 0.0, 2: 0.0},
        u('letter_%s') % e_acute: {0: 0.0, 1: 1.0, 2: 1.0},
    }
    expected = DataFrame(expected_columns)
    assert_frame_equal(result, expected)
def normalize_char(c):
    """Return the character named by the part of c's name before ' WITH'.

    ``c`` itself is returned when its name has no ' WITH' part, or when a
    ValueError or KeyError is raised while resolving it.
    """
    try:
        full_name = unicodedata.name(unicode(c))
        base_name, marker, _ = full_name.partition(' WITH')
        if not marker:
            # No ' WITH' in the name: nothing to strip.
            return c
        return unicodedata.lookup(base_name)
    except (ValueError, KeyError):
        return c
def unicode(self, name):
    """Return the result of ``lookup(name)``.

    NOTE(review): ``lookup`` is presumably ``unicodedata.lookup`` imported
    at module level -- confirm against the file's imports.
    """
    return lookup(name)

# Safe, fast math parser