The following 50 code examples, extracted from open-source Python projects, illustrate how to use unicodedata.category().
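unicodedata.category() returns the two-letter Unicode general category of a single character, such as 'Lu' (uppercase letter), 'Mn' (nonspacing mark), or 'Cc' (control character); many of the examples below test only the first letter of that code. As a quick orientation before the project examples, here is a minimal standalone sketch (the helper strip_marks_and_controls is hypothetical and is not taken from any of the listed projects):

import unicodedata

print(unicodedata.category('A'))        # 'Lu' -- Letter, uppercase
print(unicodedata.category('\u0301'))   # 'Mn' -- Mark, nonspacing (combining acute accent)
print(unicodedata.category('\n'))       # 'Cc' -- Other, control

def strip_marks_and_controls(text):
    """Hypothetical helper: drop combining marks ('M*') and control characters ('C*')."""
    decomposed = unicodedata.normalize('NFD', text)
    return ''.join(ch for ch in decomposed
                   if unicodedata.category(ch)[0] not in ('M', 'C'))

print(strip_marks_and_controls('café\t'))  # 'cafe'
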
def deaccent(text):
    """
    Remove accentuation from the given string. Input text is either a unicode string or utf8
    encoded bytestring. Return input string with accents removed, as unicode.

    >>> deaccent("Šéf chomutovských komunistů dostal poštou bílý prášek")
    u'Sef chomutovskych komunistu dostal postou bily prasek'

    """
    if not isinstance(text, unicode):
        # assume utf8 for byte strings, use default (strict) error handling
        text = text.decode('utf8')
    norm = unicodedata.normalize("NFD", text)
    result = u('').join(ch for ch in norm if unicodedata.category(ch) != 'Mn')
    return unicodedata.normalize("NFC", result)

def remove_nonprinting_characters(input, encoding='utf-8'):
    input_was_unicode = True
    if isinstance(input, basestring):
        if not isinstance(input, unicode):
            input_was_unicode = False

    unicode_input = to_unicode_or_bust(input)

    # see http://www.fileformat.info/info/unicode/category/index.htm
    char_classes_to_remove = ["C", "M", "Z"]

    response = u''.join(c for c in unicode_input
                        if unicodedata.category(c)[0] not in char_classes_to_remove)

    if not input_was_unicode:
        response = response.encode(encoding)

    return response

# getting a "decoding Unicode is not supported" error in this function?
# might need to reinstall libraries as per
# http://stackoverflow.com/questions/17092849/flask-login-typeerror-decoding-unicode-is-not-supported

def _is_safe_url(url, host):
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    url_info = urlparse(url)
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
    # In that URL, example.com is not the hostname but, a path component. However,
    # Chrome will still consider example.com to be the hostname, so we must not
    # allow this syntax.
    if not url_info.netloc and url_info.scheme:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    return ((not url_info.netloc or url_info.netloc == host) and
            (not url_info.scheme or url_info.scheme in ['http', 'https']))

def XetexBody(self):
    data = ''
    prevcode = 0
    for code in sorted(self.font.chars):
        try:
            uniname = unicodedata.name(unichr(code))
        except ValueError:
            uniname = ''
        if code - prevcode > 1:
            gaps = len([x for x in range(prevcode + 1, code)
                        if unicodedata.category(unichr(x))[0] != 'C'])
            if gaps:
                data += ('\\rowcolor{missing}\\multicolumn{3}{|c|}'
                         '{\\small %d visible characters not mapped to glyphs} \\\\\n') % (gaps)
        prevcode = code
        data += ('\\texttt{%04X} & {\\customfont\\symbol{%d}} &'
                 '{\\small %s}\\\\\n') % (code, code, uniname)
    return data

def weekday_portuguese_to_english(string):
    string = string.lower()
    string = string.strip()
    string = string.replace("-", " ")
    string = ''.join((c for c in unicodedata.normalize('NFD', string)
                      if unicodedata.category(c) != 'Mn'))
    string = string.replace(",", " ")
    string = string.split(" ")[0]
    if string in [u"dom", u"domingo"]:
        return "Sunday"
    elif string in [u"seg", u"segunda", u"segunda-feira"]:
        return "Monday"
    elif string in [u"ter", u"terca", u"terça", u"terca-feira", u"terça-feira"]:
        return "Tuesday"
    elif string in [u"qua", u"quarta", u"quarta-feira"]:
        return "Wednesday"
    elif string in [u"qui", u"quinta", u"quinta-feira"]:
        return "Thursday"
    elif string in [u"sex", u"sexta", u"sexta-feira"]:
        return "Friday"
    elif string in [u"sab", u"sáb", u"sabado", u"sábado"]:
        return "Saturday"

def push(self, evt):
    trace("[input] pushed {!r}", evt.data)
    key = evt.data
    d = self.k.get(key)
    if isinstance(d, dict):
        trace("[input] transition")
        self.stack.append(key)
        self.k = d
    else:
        if d is None:
            trace("[input] invalid")
            if self.stack or len(key) > 1 or unicodedata.category(key) == 'C':
                self.results.append(
                    (self.invalid_cls, self.stack + [key]))
            else:
                # small optimization:
                self.k[key] = self.character_cls
                self.results.append(
                    (self.character_cls, [key]))
        else:
            trace("[input] matched {}", d)
            self.results.append((d, self.stack + [key]))
        self.stack = []
        self.k = self.ck

def get_cc(nunichar):
    """Computes CharCase for a Unicode character.

    This function computes the CharCase of a Unicode character.

    Args:
        nunichar: A Unicode character whose casing is to be computed.

    Returns:
        The CharCase for the input character.
    """
    catstr = unicodedata.category(nunichar)
    if catstr == "Ll":
        return CharCase.lower
    elif catstr == "Lu":
        return CharCase.upper
    else:
        return CharCase.dc

def __init__(self, emoji):
    self.raw = emoji
    if isinstance(emoji, str):
        self.id = 0
        self.unicode = emoji
        self.custom = False
        self.managed = False
        self.name = [unicodedata.name(ch) for ch in emoji]
        self.category = [unicodedata.category(ch) for ch in emoji]
        self.roles = []
        self.guild = None
    else:
        self.id = emoji.id
        self.unicode = ''
        self.custom = True
        self.managed = getattr(emoji, 'managed', None)
        self.name = [emoji.name]
        self.category = ['custom']
        self.roles = getattr(emoji, 'roles', None)
        self.guild = getattr(emoji, 'guild', None)

def read_identifier(self):
    self.j = self.i + 1
    while unicodedata.category(self.data[self.j]) in self.IDENT_PART_CATEGORIES:
        self.j += 1

    ident = self.data[self.i:self.j]

    if ident in Keyword.VALUES:
        token_type = Keyword

        if ident in BasicType.VALUES:
            token_type = BasicType
        elif ident in Modifier.VALUES:
            token_type = Modifier

    elif ident in Boolean.VALUES:
        token_type = Boolean
    elif ident == 'null':
        token_type = Null
    else:
        token_type = Identifier

    return token_type

def tokenize(text, splits='COPZ'):
    token = []
    if PY3:
        for c in str(text, 'utf-8'):
            if category(c)[0] in splits:
                if len(token):
                    yield u''.join(token)
                token = []
            else:
                token.append(c)
    else:
        for c in unicode(text):
            if category(c)[0] in splits:
                if len(token):
                    yield u''.join(token)
                token = []
            else:
                token.append(c)

    if len(token):
        yield u''.join(token)

def _consume_alpha_utf8(self, text, offset):
    """Consume a sequence of utf8 bytes forming an alphabetic character."""
    incr = 2
    u = ""
    while not u and incr <= 4:
        try:
            try:
                # In the common case this will be a string
                u = text[offset:offset+incr].decode("utf8")
            except AttributeError:
                # Looks like it was e.g. a mutable char array.
                try:
                    s = text[offset:offset+incr].tostring()
                except AttributeError:
                    s = "".join([c for c in text[offset:offset+incr]])
                u = s.decode("utf8")
        except UnicodeDecodeError:
            incr += 1
    if not u:
        return 0
    if u.isalpha():
        return incr
    if unicodedata.category(u)[0] == "M":
        return incr
    return 0

def _consume_alpha_u(self, text, offset):
    """Consume an alphabetic character from the given unicode string.

    Given a unicode string and the current offset, this method returns
    the number of characters occupied by the next alphabetic character
    in the string.  Trailing combining characters are consumed as a
    single letter.
    """
    assert offset < len(text)
    incr = 0
    if text[offset].isalpha():
        incr = 1
        while offset + incr < len(text):
            if unicodedata.category(text[offset+incr])[0] != "M":
                break
            incr += 1
    return incr

def value_for(self, association):
    value = str(self.prefix)
    try:
        if self.prop:
            val = str(eval("association." + self.prop))
        else:
            val = str(self.value)
        if 'accents' in self.options:
            val = ''.join((c for c in unicodedata.normalize('NFD', val)
                           if unicodedata.category(c) != 'Mn'))
        if 'caps' in self.options:
            val = val.upper()
        return value + val
    except AttributeError:
        return ''

def _escape_text(self, s):
    """Escape text

    In addition to escaping text, unicode characters are replaced with a
    span that will display the glyph using CSS.  This is to ensure that
    the text has a consistent width.
    """
    tpl = ('<span class="u"><span class="g">&#x{0:x};</span>'
           '<span class="ns">{1}</span></span>')
    out = ''
    for c in s:
        w = utils.str_width(c)
        if unicodedata.category(c) in ('Co', 'Cn', 'So'):
            out += tpl.format(ord(c), ' ')
        elif w > 1 or ord(c) > 255:
            out += tpl.format(ord(c), ' ' * w)
        else:
            out += escape(c)
    return out

def _splitByControlCharacters(self, val):
    # extract non-control characters
    output = []
    s = ''
    for c in unicode(val):
        if unicodedata.category(c)[0] == 'C':
            if len(s) > 0:
                # start a new string if we found a control character
                output.append(str(s))
                s = ''
        else:
            s += c

    # clean up any left over string
    if len(s) > 0:
        output.append(str(s))

    # return extracts strings
    return output

def calibrate(self):
    data = (u'a', u'1', u' ', u'\u1234', u'\uFFFF')
    len_data = len(data)

    digit = unicodedata.digit
    numeric = unicodedata.numeric
    decimal = unicodedata.decimal
    category = unicodedata.category
    bidirectional = unicodedata.bidirectional
    decomposition = unicodedata.decomposition
    mirrored = unicodedata.mirrored
    combining = unicodedata.combining

    for i in xrange(self.rounds):
        c = data[i % len_data]

def _test_matching_pattern(self, pattern, isvalidchar, unicode=False):
    r = unicode_regex(pattern) if unicode else ascii_regex(pattern)

    codepoints = six.moves.range(0, sys.maxunicode+1) \
        if unicode else six.moves.range(1, 128)

    for c in [six.unichr(x) for x in codepoints]:
        if isvalidchar(c):
            assert r.match(c), (
                '"%s" supposed to match "%s" (%r, category "%s"), '
                'but it doesnt' % (pattern, c, c, unicodedata.category(c))
            )
        else:
            assert not r.match(c), (
                '"%s" supposed not to match "%s" (%r, category "%s"), '
                'but it does' % (pattern, c, c, unicodedata.category(c))
            )

def domains(self):
    with app.app_context():
        if not r.table_list().contains('domains').run(db.conn):
            log.info("Table domains not found, creating...")
            r.table_create('domains', primary_key="id").run(db.conn)

            r.table('domains').index_create("status").run(db.conn)
            r.table('domains').index_wait("status").run(db.conn)
            r.table('domains').index_create("hyp_started").run(db.conn)
            r.table('domains').index_wait("hyp_started").run(db.conn)
            r.table('domains').index_create("user").run(db.conn)
            r.table('domains').index_wait("user").run(db.conn)
            r.table('domains').index_create("group").run(db.conn)
            r.table('domains').index_wait("group").run(db.conn)
            r.table('domains').index_create("category").run(db.conn)
            r.table('domains').index_wait("category").run(db.conn)
            r.table('domains').index_create("kind").run(db.conn)
            r.table('domains').index_wait("kind").run(db.conn)

    return True

def lstrip_token(token):
    '''Strips some characters from the left side of a token

    Characters which have a type listed in CATEGORIES_TO_STRIP_FROM_TOKENS
    are stripped from the left side of a token.

    The stripped token is returned.

    :param token: The token where characters may be stripped from
    :type token: String
    :rtype: String

    Examples:

    >>> lstrip_token(".'foo'.")
    "foo'."
    '''
    token = token.lstrip()
    while (len(token) > 0
           and unicodedata.category(token[0]) in CATEGORIES_TO_STRIP_FROM_TOKENS):
        token = token[1:]
    return token

def rstrip_token(token):
    '''Strips some characters from the right side of a token

    Characters which have a type listed in CATEGORIES_TO_STRIP_FROM_TOKENS
    are stripped from the right side of a token.

    The stripped token is returned.

    :param token: The token where characters may be stripped from
    :type token: String
    :rtype: String

    Examples:

    >>> rstrip_token(".'foo'.")
    ".'foo"
    '''
    token = token.rstrip()
    while (len(token) > 0
           and unicodedata.category(token[-1]) in CATEGORIES_TO_STRIP_FROM_TOKENS):
        token = token[0:-1]
    return token

def remove_accents(text):
    '''Removes accents from the text

    Returns the text with all accents removed

    Using “from unidecode import unidecode” is more sophisticated,
    but I am not sure whether I can require “unidecode”.

    :param text: The text to change
    :type text: string
    :rtype: string

    Examples:

    >>> remove_accents('Ångstrøm')
    'Angstrom'

    >>> remove_accents('ÅÆæŒœĳøßẞü')
    'AAEaeOEoeijossSSu'
    '''
    return ''.join([
        x for x in unicodedata.normalize('NFKD', text)
        if unicodedata.category(x) != 'Mn']).translate(TRANS_TABLE)

def contains_letter(text):
    '''Returns whether “text” contains a “letter” type character

    :param text: The text to check
    :type text: string
    :rtype: boolean

    Examples:

    >>> contains_letter('Hi!')
    True

    >>> contains_letter(':-)')
    False
    '''
    for char in text:
        category = unicodedata.category(char)
        if category in ('Ll', 'Lu', 'Lo',):
            return True
    return False

def _is_safe_url(url, allowed_hosts, require_https=False):
    # Chrome considers any URL with more than two slashes to be absolute, but
    # urlparse is not so flexible. Treat any url with three slashes as unsafe.
    if url.startswith('///'):
        return False
    url_info = urlparse(url)
    # Forbid URLs like http:///example.com - with a scheme, but without a hostname.
    # In that URL, example.com is not the hostname but, a path component. However,
    # Chrome will still consider example.com to be the hostname, so we must not
    # allow this syntax.
    if not url_info.netloc and url_info.scheme:
        return False
    # Forbid URLs that start with control characters. Some browsers (like
    # Chrome) ignore quite a few control characters at the start of a
    # URL and might consider the URL as scheme relative.
    if unicodedata.category(url[0])[0] == 'C':
        return False
    scheme = url_info.scheme
    # Consider URLs without a scheme (e.g. //example.com/p) to be http.
    if not url_info.scheme and url_info.netloc:
        scheme = 'http'
    valid_schemes = ['https'] if require_https else ['http', 'https']
    return ((not url_info.netloc or url_info.netloc in allowed_hosts) and
            (not scheme or scheme in valid_schemes))

def capitulos(item):
    logger.info()
    itemlist = []

    data = item.extra
    thumbnail = scrapertools.get_match(data, 'background-image:url\(\'([^"]+)\'')
    thumbnail = re.sub(r"w185", "original", thumbnail)
    patron = '<a href="([^"]+)".*?<br\/><i>(.*?)<\/i>'
    matches = re.compile(patron, re.DOTALL).findall(data)

    for url, capitulo in matches:
        capitulo = re.sub(r"Cap.*?tulo", "", capitulo)
        capitulo = "[COLOR floralwhite][B]" + capitulo + "[/B][/COLOR]"
        if capitulo == item.extra.split("|")[4]:
            continue
        if not ".jpg" in item.extra.split("|")[2]:
            fanart = item.show.split("|")[0]
        else:
            fanart = item.extra.split("|")[2]
        itemlist.append(Item(channel=item.channel, title=capitulo, action="findvideos", url=url,
                             thumbnail=thumbnail, extra="fv2" + "|" + item.extra.split("|")[3],
                             show=item.show, category=item.category, fanart=fanart, folder=True))

    return itemlist

def findvideos(item):
    logger.info()
    itemlist = []

    temp = item.fulltitle.split("|")[0]
    epi = item.fulltitle.split("|")[1]
    url_temp = "http://api.themoviedb.org/3/tv/" + item.show.split("|")[5] + "/season/" + temp + "/images?api_key=" + api_key + ""
    data = httptools.downloadpage(url_temp).data
    data = re.sub(r"\n|\r|\t|\s{2}| ", "", data)
    patron = '{"id".*?"file_path":"(.*?)","height"'
    matches = re.compile(patron, re.DOTALL).findall(data)
    if len(matches) == 0:
        thumbnail = item.thumbnail
    for thumtemp in matches:
        thumbnail = "https://image.tmdb.org/t/p/original" + thumtemp

    title = item.show.split("|")[3] + " " + temp + "x" + epi
    title = "[COLOR lightgreen]" + title + "[/COLOR]"
    itemlist.append(Item(channel=item.channel, title=title, action="play", url=item.url,
                         server="torrent", thumbnail=item.show.split("|")[4], extra=item.extra,
                         show=item.show, fanart=item.show.split("|")[0], fulltitle=title,
                         folder=False))

    extra = item.extra + "|" + temp + "|" + epi
    title_info = " Info"
    title_info = "[COLOR darkseagreen]" + title_info + "[/COLOR]"
    itemlist.append(Item(channel=item.channel, action="info_capitulos", title=title_info,
                         url=item.url, thumbnail=thumbnail, fanart=item.show.split("|")[1],
                         extra=extra, show=item.show, category=item.category, folder=False))

    return itemlist

def play(item):
    logger.info()
    itemlist = servertools.find_video_items(data=item.url)

    data = scrapertools.cache_page(item.url)
    listavideos = servertools.findvideos(data)

    for video in listavideos:
        videotitle = scrapertools.unescape(video[0])
        url = item.url
        server = video[2]
        # xbmctools.addnewvideo( item.channel , "play" , category , server , , url , thumbnail , plot )
        itemlist.append(Item(channel=item.channel, action="play", server=server,
                             title="Trailer - " + videotitle, url=url, thumbnail=item.thumbnail,
                             plot=item.plot, fulltitle=item.title,
                             fanart="http://s23.postimg.org/84vkeq863/movietrailers.jpg",
                             folder=False))

    return itemlist

def check_initial_combiner(label):
    if unicodedata.category(label[0])[0] == 'M':
        raise IDNAError('Label begins with an illegal combining character')
    return True

def normalize(title):
    try:
        try:
            return title.decode('ascii').encode("utf-8")
        except:
            pass
        return str(''.join(c for c in unicodedata.normalize('NFKD', unicode(title.decode('utf-8')))
                           if unicodedata.category(c) != 'Mn'))
    except:
        return title

def pinyinify(string):
    # TODO: Use static file instead of constructing table in real time
    table = dict()
    for i in range(sys.maxunicode):
        if re.match('P|S|Z|C', unicodedata.category(chr(i))) is not None:
            table[i] = '-'
    string = string.translate(table)
    for char in [x for x in string if unicodedata.name(x).startswith('CJK')]:
        string = string.replace(char, pinyin.get(char, format='strip') + '-')
    string = re.sub('\-+', '-', string)
    return pinyin.get(string, delimiter='', format='strip').lower()

def replace_punctuation(text, sub):
    punctutation_cats = set(['Pc', 'Pd', 'Ps', 'Pe', 'Pi', 'Pf', 'Po'])
    chars = []
    for my_char in text:
        if unicodedata.category(my_char) in punctutation_cats:
            chars.append(sub)
        else:
            chars.append(my_char)
    return u"".join(chars)

# from http://stackoverflow.com/a/22238613/596939

def sanitize(name):
    name = unicode(name)
    printable = set(('Lu', 'Ll', 'Lm', 'Lo', 'Nd', 'Nl', 'No', 'Pc', 'Pd',
                     'Ps', 'Pe', 'Pi', 'Pf', 'Po', 'Sm', 'Sc', 'Sk', 'So',
                     'Zs'))
    return utf8(''.join(c for c in name
                        if unicodedata.category(c) in printable and c != '@'))

def Plaintext(self):
    data = ''
    for category, code in sorted(FontFile.NAME_CODES.items(), key=lambda x: x[1]):
        if code in self.font._names:
            data += '%15s: %s\n' % (category, self.font._names[code])
    return data

def XetexBody(self):
    data = ''
    for category, code in sorted(FontFile.NAME_CODES.items(), key=lambda x: x[1]):
        if code in self.font._names:
            data += '%s & %s \\\\\n' % (category, TexEscape(self.font._names[code]))
    return data

def _double_width_char_count(word):
    dw_count = 0
    for char in word:
        if _unicode_data.category(char) in _double_width_type:
            dw_count += 1
    return dw_count

def _make_unctrl_map():
    uc_map = {}
    for c in map(unichr, range(256)):
        if unicodedata.category(c)[0] != 'C':
            uc_map[c] = c
    for i in range(32):
        c = unichr(i)
        uc_map[c] = '^' + unichr(ord('A') + i - 1)
    uc_map[b'\t'] = '    '  # display TABs as 4 characters
    uc_map[b'\177'] = unicode('^?')
    for i in range(256):
        c = unichr(i)
        if c not in uc_map:
            uc_map[c] = unicode('\\%03o') % i
    return uc_map

def _my_unctrl(c, u=_make_unctrl_map()):
    if c in u:
        return u[c]
    else:
        if unicodedata.category(c).startswith('C'):
            return b'\u%04x' % ord(c)
        else:
            return c

def is_punct(text):
    for char in text:
        if not unicodedata.category(char).startswith('P'):
            return False
    else:
        return True