The following 50 code examples, extracted from open-source Python projects, illustrate how to use unicodedata.category().
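unicodedata.category() returns the two-letter Unicode general category of a single character, such as 'Lu' (uppercase letter), 'Mn' (nonspacing mark), or 'Cc' (control character); many of the examples below test only the first letter of that code. As a quick orientation before the project examples, here is a minimal standalone sketch (the helper strip_marks_and_controls is hypothetical and is not taken from any of the listed projects):

import unicodedata

print(unicodedata.category('A'))        # 'Lu' -- Letter, uppercase
print(unicodedata.category('\u0301'))   # 'Mn' -- Mark, nonspacing (combining acute accent)
print(unicodedata.category('\n'))       # 'Cc' -- Other, control

def strip_marks_and_controls(text):
    """Hypothetical helper: drop combining marks ('M*') and control characters ('C*')."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed
                   if unicodedata.category(ch)[0] not in ('M', 'C'))

print(strip_marks_and_controls('café\t'))  # 'cafe'
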
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8
    encoded bytestring. Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)

def remove_nonprinting_characters(input, encoding='utf-8'):
    input_was_unicode = True
    if isinstance(input, basestring):
        if not isinstance(input, unicode):
            input_was_unicode = False

    unicode_input = to_unicode_or_bust(input)

    # see http://www.fileformat.info/info/unicode/category/index.htm
    char_classes_to_remove = ["C", "M", "Z"]

    response = u''.join(c for c in unicode_input
                        if unicodedata.category(c)[0] not in char_classes_to_remove)

    if not input_was_unicode:
        response = response.encode(encoding)

    return response

# getting a "decoding Unicode is not supported" error in this function?
# might need to reinstall libraries as per
# http://stackoverflow.com/questions/17092849/flask-login-typeerror-decoding-unicode-is-not-supported

def _is_safe_url(url, host):
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    url_info = urlparse(url)
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
    # In that URL, example.com is not the hostname but, a path component. However,
    # Chrome will still consider example.com to be the hostname, so we must not
    # allow this syntax.
    if not url_info.netloc and url_info.scheme:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    return ((not url_info.netloc or url_info.netloc == host) and
            (not url_info.scheme or url_info.scheme in ['http', 'https']))

def XetexBody(self):
    data = ''
    prevcode = 0
    for code in sorted(self.font.chars):
        try:
            uniname = unicodedata.name(unichr(code))
        except ValueError:
            uniname = ''
        if code - prevcode > 1:
            gaps = len([x for x in range(prevcode + 1, code)
                        if unicodedata.category(unichr(x))[0] != 'C'])
            if gaps:
                data += ('\\rowcolor{missing}\\multicolumn{3}{|c|}'
                         '{\\small %d visible characters not mapped to glyphs} \\\\\n') % (gaps)
        prevcode = code
        data += ('\\texttt{%04X} & {\\customfont\\symbol{%d}} &'
                 '{\\small %s}\\\\\n') % (code, code, uniname)
    return data

def weekday_portuguese_to_english(string):
    string = string.lower()
    string = string.strip()
    string = string.replace("-", " ")
    string = ''.join((c for c in unicodedata.normalize('NFD', string)
                      if unicodedata.category(c) != 'Mn'))
    string = string.replace(",", " ")
    string = string.split(" ")[0]
    if string in [u"dom", u"domingo"]:
        return "Sunday"
    elif string in [u"seg", u"segunda", u"segunda-feira"]:
        return "Monday"
    elif string in [u"ter", u"terca", u"terça", u"terca-feira", u"terça-feira"]:
        return "Tuesday"
    elif string in [u"qua", u"quarta", u"quarta-feira"]:
        return "Wednesday"
    elif string in [u"qui", u"quinta", u"quinta-feira"]:
        return "Thursday"
    elif string in [u"sex", u"sexta", u"sexta-feira"]:
        return "Friday"
    elif string in [u"sab", u"sáb", u"sabado", u"sábado"]:
        return "Saturday"

def push(self, evt):
    trace("[input] pushed {!r}", evt.data)
    key = evt.data
    d = self.k.get(key)
    if isinstance(d, dict):
        trace("[input] transition")
        self.stack.append(key)
        self.k = d
    else:
        if d is None:
            trace("[input] invalid")
            if self.stack or len(key) > 1 or unicodedata.category(key) == 'C':
                self.results.append(
                    (self.invalid_cls, self.stack + [key]))
            else:
                # small optimization:
                self.k[key] = self.character_cls
                self.results.append(
                    (self.character_cls, [key]))
        else:
            trace("[input] matched {}", d)
            self.results.append((d, self.stack + [key]))
        self.stack = []
        self.k = self.ck

def get_cc(nunichar):
    """Computes CharCase for a Unicode character.

    This function computes the CharCase of a Unicode character.

    Args:
        nunichar: A Unicode character whose casing is to be computed.

    Returns:
        The CharCase for the input character.
    """
    catstr = unicodedata.category(nunichar)
    if catstr == "Ll":
        return CharCase.lower
    elif catstr == "Lu":
        return CharCase.upper
    else:
        return CharCase.dc

def __init__(self, emoji):
    self.raw = emoji
    if isinstance(emoji, str):
        self.id = 0
        self.unicode = emoji
        self.custom = False
        self.managed = False
        self.name = [unicodedata.name(ch) for ch in emoji]
        self.category = [unicodedata.category(ch) for ch in emoji]
        self.roles = []
        self.guild = None
    else:
        self.id = emoji.id
        self.unicode = ''
        self.custom = True
        self.managed = getattr(emoji, 'managed', None)
        self.name = [emoji.name]
        self.category = ['custom']
        self.roles = getattr(emoji, 'roles', None)
        self.guild = getattr(emoji, 'guild', None)

def read_identifier(self):
    self.j = self.i + 1
    while unicodedata.category(self.data[self.j]) in self.IDENT_PART_CATEGORIES:
        self.j += 1

    ident = self.data[self.i:self.j]

    if ident in Keyword.VALUES:
        token_type = Keyword

        if ident in BasicType.VALUES:
            token_type = BasicType
        elif ident in Modifier.VALUES:
            token_type = Modifier

    elif ident in Boolean.VALUES:
        token_type = Boolean
    elif ident == 'null':
        token_type = Null
    else:
        token_type = Identifier

    return token_type

def tokenize(text, splits='COPZ'):
    token = []
    if PY3:
        for c in str(text, 'utf-8'):
            if category(c)[0] in splits:
                if len(token):
                    yield u''.join(token)
                token = []
            else:
                token.append(c)
    else:
        for c in unicode(text):
            if category(c)[0] in splits:
                if len(token):
                    yield u''.join(token)
                token = []
            else:
                token.append(c)

    if len(token):
        yield u''.join(token)

def _consume_alpha_utf8(self, text, offset):
    """Consume a sequence of utf8 bytes forming an alphabetic character."""
    incr = 2
    u = ""
    while not u and incr <= 4:
        try:
            try:
                # In the common case this will be a string
                u = text[offset:offset+incr].decode("utf8")
            except AttributeError:
                # Looks like it was e.g. a mutable char array.
                try:
                    s = text[offset:offset+incr].tostring()
                except AttributeError:
                    s = "".join([c for c in text[offset:offset+incr]])
                u = s.decode("utf8")
        except UnicodeDecodeError:
            incr += 1
    if not u:
        return 0
    if u.isalpha():
        return incr
    if unicodedata.category(u)[0] == "M":
        return incr
    return 0

def _consume_alpha_u(self, text, offset):
    """Consume an alphabetic character from the given unicode string.

    Given a unicode string and the current offset, this method returns
    the number of characters occupied by the next alphabetic character
    in the string.  Trailing combining characters are consumed as a
    single letter.
    """
    assert offset < len(text)
    incr = 0
    if text[offset].isalpha():
        incr = 1
        while offset + incr < len(text):
            if unicodedata.category(text[offset+incr])[0] != "M":
                break
            incr += 1
    return incr

def value_for(self, association):
    value = str(self.prefix)
    try:
        if self.prop:
            val = str(eval("association." + self.prop))
        else:
            val = str(self.value)
        if 'accents' in self.options:
            val = ''.join((c for c in unicodedata.normalize('NFD', val)
                           if unicodedata.category(c) != 'Mn'))
        if 'caps' in self.options:
            val = val.upper()
        return value + val
    except AttributeError:
        return ''

def _escape_text(self, s):
    """Escape text

    In addition to escaping text, unicode characters are replaced with a
    span that will display the glyph using CSS.  This is to ensure that
    the text has a consistent width.
    """
    tpl = ('<span class="u"><span class="g">&#x{0:x};</span>'
           '<span class="ns">{1}</span></span>')
    out = ''
    for c in s:
        w = utils.str_width(c)
        if unicodedata.category(c) in ('Co', 'Cn', 'So'):
            out += tpl.format(ord(c), ' ')
        elif w > 1 or ord(c) > 255:
            out += tpl.format(ord(c), ' ' * w)
        else:
            out += escape(c)
    return out

def _splitByControlCharacters(self, val):
    # extract non-control characters
    output = []
    s = ''
    for c in unicode(val):
        if unicodedata.category(c)[0] == 'C':
            if len(s) > 0:
                # start a new string if we found a control character
                output.append(str(s))
                s = ''
        else:
            s += c

    # clean up any left over string
    if len(s) > 0:
        output.append(str(s))

    # return extracts strings
    return output

def calibrate(self):
    data = (u'a', u'1', u' ', u'\u1234', u'\uFFFF')
    len_data = len(data)

    digit = unicodedata.digit
    numeric = unicodedata.numeric
    decimal = unicodedata.decimal
    category = unicodedata.category
    bidirectional = unicodedata.bidirectional
    decomposition = unicodedata.decomposition
    mirrored = unicodedata.mirrored
    combining = unicodedata.combining

    for i in xrange(self.rounds):
        c = data[i % len_data]

def _test_matching_pattern(self, pattern, isvalidchar, unicode=False):
    r = unicode_regex(pattern) if unicode else ascii_regex(pattern)

    codepoints = six.moves.range(0, sys.maxunicode+1) \
        if unicode else six.moves.range(1, 128)

    for c in [six.unichr(x) for x in codepoints]:
        if isvalidchar(c):
            assert r.match(c), (
                '"%s" supposed to match "%s" (%r, category "%s"), '
                'but it doesnt' % (pattern, c, c, unicodedata.category(c))
            )
        else:
            assert not r.match(c), (
                '"%s" supposed not to match "%s" (%r, category "%s"), '
                'but it does' % (pattern, c, c, unicodedata.category(c))
            )

def domains(self):
    with app.app_context():
        if not r.table_list().contains('domains').run(db.conn):
            log.info("Table domains not found, creating...")
            r.table_create('domains', primary_key="id").run(db.conn)

            r.table('domains').index_create("status").run(db.conn)
            r.table('domains').index_wait("status").run(db.conn)
            r.table('domains').index_create("hyp_started").run(db.conn)
            r.table('domains').index_wait("hyp_started").run(db.conn)
            r.table('domains').index_create("user").run(db.conn)
            r.table('domains').index_wait("user").run(db.conn)
            r.table('domains').index_create("group").run(db.conn)
            r.table('domains').index_wait("group").run(db.conn)
            r.table('domains').index_create("category").run(db.conn)
            r.table('domains').index_wait("category").run(db.conn)
            r.table('domains').index_create("kind").run(db.conn)
            r.table('domains').index_wait("kind").run(db.conn)

    return True

def lstrip_token(token):
    '''Strips some characters from the left side of a token

    Characters which have a type listed in CATEGORIES_TO_STRIP_FROM_TOKENS
    are stripped from the left side of a token.

    The stripped token is returned.

    :param token: The token where characters may be stripped from
    :type token: String
    :rtype: String

    Examples:

    >>> lstrip_token(".'foo'.")
    "foo'."
    '''
    token = token.lstrip()
    while (len(token) > 0
           and unicodedata.category(token[0]) in CATEGORIES_TO_STRIP_FROM_TOKENS):
        token = token[1:]
    return token

def rstrip_token(token):
    '''Strips some characters from the right side of a token

    Characters which have a type listed in CATEGORIES_TO_STRIP_FROM_TOKENS
    are stripped from the right side of a token.

    The stripped token is returned.

    :param token: The token where characters may be stripped from
    :type token: String
    :rtype: String

    Examples:

    >>> rstrip_token(".'foo'.")
    ".'foo"
    '''
    token = token.rstrip()
    while (len(token) > 0
           and unicodedata.category(token[-1]) in CATEGORIES_TO_STRIP_FROM_TOKENS):
        token = token[0:-1]
    return token

def remove_accents(text):
    '''Removes accents from the text

    Returns the text with all accents removed

    Using “from unidecode import unidecode” is more sophisticated,
    but I am not sure whether I can require “unidecode”.

    :param text: The text to change
    :type text: string
    :rtype: string

    Examples:

    >>> remove_accents('Ångstrøm')
    'Angstrom'

    >>> remove_accents('ÅÆæŒœĳøßẞü')
    'AAEaeOEoeijossSSu'
    '''
    return ''.join([
        x for x in unicodedata.normalize('NFKD', text)
        if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)

def contains_letter(text):
    '''Returns whether “text” contains a “letter” type character

    :param text: The text to check
    :type text: string
    :rtype: boolean

    Examples:

    >>> contains_letter('Hi!')
    True

    >>> contains_letter(':-)')
    False
    '''
    for char in text:
        category = unicodedata.category(char)
        if category in ('Ll', 'Lu', 'Lo',):
            return True
    return False

def _is_safe_url(url, allowed_hosts, require_https=False):
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    url_info = urlparse(url)
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
    # In that URL, example.com is not the hostname but, a path component. However,
    # Chrome will still consider example.com to be the hostname, so we must not
    # allow this syntax.
    if not url_info.netloc and url_info.scheme:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    scheme = url_info.scheme
    # Consider URLs without a scheme (e.g. //example.com/p) to be http.
    if not url_info.scheme and url_info.netloc:
        scheme = 'http'
    valid_schemes = ['https'] if require_https else ['http', 'https']
    return ((not url_info.netloc or url_info.netloc in allowed_hosts) and
            (not scheme or scheme in valid_schemes))

def capitulos(item):
    logger.info()
    itemlist = []

    data = item.extra
    thumbnail = scrapertools.get_match(data, 'background-image:url\(\'([^"]+)\'')
    thumbnail = re.sub(r"w185", "original", thumbnail)
    patron = '<a href="([^"]+)".*?<br\/><i>(.*?)<\/i>'
    matches = re.compile(patron, re.DOTALL).findall(data)

    for url, capitulo in matches:
        capitulo = re.sub(r"Cap.*?tulo", "", capitulo)
        capitulo = "[COLOR floralwhite][B]" + capitulo + "[/B][/COLOR]"
        if capitulo == item.extra.split("|")[4]:
            continue
        if not ".jpg" in item.extra.split("|")[2]:
            fanart = item.show.split("|")[0]
        else:
            fanart = item.extra.split("|")[2]
        itemlist.append(Item(channel=item.channel, title=capitulo, action="findvideos", url=url,
                             thumbnail=thumbnail, extra="fv2" + "|" + item.extra.split("|")[3],
                             show=item.show, category=item.category, fanart=fanart, folder=True))

    return itemlist

def findvideos(item):
    logger.info()
    itemlist = []

    temp = item.fulltitle.split("|")[0]
    epi = item.fulltitle.split("|")[1]
    url_temp = "http://api.themoviedb.org/3/tv/" + item.show.split("|")[5] + "/season/" + temp + "/images?api_key=" + api_key + ""
    data = httptools.downloadpage(url_temp).data
    data = re.sub(r"\n|\r|\t|\s{2}| ", "", data)
    patron = '{"id".*?"file_path":"(.*?)","height"'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if len(matches) == 0:
        thumbnail = item.thumbnail
    for thumtemp in matches:
        thumbnail = "https://image.tmdb.org/t/p/original" + thumtemp

    title = item.show.split("|")[3] + " " + temp + "x" + epi
    title = "[COLOR lightgreen]" + title + "[/COLOR]"
    itemlist.append(Item(channel=item.channel, title=title, action="play", url=item.url,
                         server="torrent", thumbnail=item.show.split("|")[4], extra=item.extra,
                         show=item.show, fanart=item.show.split("|")[0], fulltitle=title,
                         folder=False))

    extra = item.extra + "|" + temp + "|" + epi
    title_info = " Info"
    title_info = "[COLOR darkseagreen]" + title_info + "[/COLOR]"
    itemlist.append(Item(channel=item.channel, action="info_capitulos", title=title_info,
                         url=item.url, thumbnail=thumbnail, fanart=item.show.split("|")[1],
                         extra=extra, show=item.show, category=item.category, folder=False))

    return itemlist

def play(item):
    logger.info()
    itemlist = servertools.find_video_items(data=item.url)

    data = scrapertools.cache_page(item.url)
    listavideos = servertools.findvideos(data)

    for video in listavideos:
        videotitle = scrapertools.unescape(video[0])
        url = item.url
        server = video[2]
        # xbmctools.addnewvideo( item.channel , "play" , category , server , , url , thumbnail , plot )
        itemlist.append(Item(channel=item.channel, action="play", server=server,
                             title="Trailer - " + videotitle, url=url, thumbnail=item.thumbnail,
                             plot=item.plot, fulltitle=item.title,
                             fanart="http://s23.postimg.org/84vkeq863/movietrailers.jpg",
                             folder=False))

    return itemlist

def check_initial_combiner(label):
    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True

def normalize(title):
    try:
        try:
            return title.decode('ascii').encode("utf-8")
        except:
            pass
        return str(''.join(c for c in unicodedata.normalize('NFKD', unicode(title.decode('utf-8')))
                           if unicodedata.category(c) != 'Mn'))
    except:
        return title

def pinyinify(string):
    # TODO: Use static file instead of constructing table in real time
    table = dict()
    for i in range(sys.maxunicode):
        if re.match('P|S|Z|C', unicodedata.category(chr(i))) is not None:
            table[i] = '-'
    string = string.translate(table)
    for char in [x for x in string if unicodedata.name(x).startswith('CJK')]:
        string = string.replace(char, pinyin.get(char, format='strip') + '-')
    string = re.sub('\-+', '-', string)
    return pinyin.get(string, delimiter='', format='strip').lower()

def replace_punctuation(text, sub):
    punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'])
    chars = []
    for my_char in text:
        if unicodedata.category(my_char) in punctutation_cats:
            chars.append(sub)
        else:
            chars.append(my_char)
    return u"".join(chars)

# from http://stackoverflow.com/a/22238613/596939

def sanitize(name):
    name = unicode(name)
    printable = set(('Lu', 'Ll', 'Lm', 'Lo', 'Nd', 'Nl', 'No', 'Pc', 'Pd',
                     'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sm', 'Sc', 'Sk', 'So',
                     'Zs'))
    return utf8(''.join(c for c in name
                        if unicodedata.category(c) in printable and c != '@'))

def Plaintext(self):
    data = ''
    for category, code in sorted(FontFile.NAME_CODES.items(), key=lambda x: x[1]):
        if code in self.font._names:
            data += '%15s: %s\n' % (category, self.font._names[code])
    return data

def XetexBody(self):
    data = ''
    for category, code in sorted(FontFile.NAME_CODES.items(), key=lambda x: x[1]):
        if code in self.font._names:
            data += '%s & %s \\\\\n' % (category, TexEscape(self.font._names[code]))
    return data

def _double_width_char_count(word):
    dw_count = 0
    for char in word:
        if _unicode_data.category(char) in _double_width_type:
            dw_count += 1
    return dw_count

def _make_unctrl_map():
    uc_map = {}
    for c in map(unichr, range(256)):
        if unicodedata.category(c)[0] != 'C':
            uc_map[c] = c
    for i in range(32):
        c = unichr(i)
        uc_map[c] = '^' + unichr(ord('A') + i - 1)
    uc_map[b'\t'] = '    '  # display TABs as 4 characters
    uc_map[b'\177'] = unicode('^?')
    for i in range(256):
        c = unichr(i)
        if c not in uc_map:
            uc_map[c] = unicode('\\%03o') % i
    return uc_map

def _my_unctrl(c, u=_make_unctrl_map()):
    if c in u:
        return u[c]
    else:
        if unicodedata.category(c).startswith('C'):
            return b'\u%04x' % ord(c)
        else:
            return c

def is_punct(text):
    for char in text:
        if not unicodedata.category(char).startswith('P'):
            return False
    else:
        return True