我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用unicodedata.normalize()。
def fold_to_ascii(self, text):
    """Convert non-ASCII characters to their closest ASCII equivalents.

    .. versionadded:: 1.3

    .. note:: Only a subset of European languages is supported.

    :param text: text to convert
    :type text: ``unicode``
    :returns: text containing only ASCII characters
    :rtype: ``unicode``
    """
    # Fast path: nothing to fold.
    if isascii(text):
        return text
    # Substitute known characters first, then decompose and drop the rest.
    substituted = ''.join(ASCII_REPLACEMENTS.get(ch, ch) for ch in text)
    folded = unicodedata.normalize('NFKD', substituted).encode('ascii', 'ignore')
    return unicode(folded)
def clean_filename(filename):
    """Return a sanitized filename (replace / strip out illegal characters)

    :param filename: string used for a filename
    :type filename: str
    :return: sanitized filename
    :rtype: str
    """
    allowed = '-_.() {0}{1}'.format(string.ascii_letters, string.digits)
    substituted = ''.join(REPLACEMENT_CHAR.get(ch, ch) for ch in filename)
    decomposed = unicodedata.normalize('NFKD', substituted)
    # Drop combining marks (accents) and anything outside the whitelist.
    return ''.join(ch for ch in decomposed
                   if not unicodedata.combining(ch) and ch in allowed)
def filename(self):
    """Client-side filename, normalized for file-system compatibility.

    Only ASCII letters, digits, dashes, underscores and dots survive.
    Accents are stripped where possible, whitespace collapses to a
    single dash, leading/trailing dots and dashes are removed, and the
    result is capped at 255 characters. An empty name yields 'empty'.
    """
    name = self.raw_filename
    if not isinstance(name, unicode):
        name = name.decode('utf8', 'ignore')
    # Decompose accents, then drop everything outside ASCII.
    name = normalize('NFKD', name)
    name = name.encode('ASCII', 'ignore').decode('ASCII')
    # Strip any client-supplied directory components.
    name = os.path.basename(name.replace('\\', os.path.sep))
    name = re.sub(r'[^a-zA-Z0-9-_.\s]', '', name).strip()
    name = re.sub(r'[-\s]+', '-', name).strip('.-')
    return name[:255] or 'empty'
def decode_as_string(text, encoding=None):
    """Decode console or file output to NFC-normalized unicode.

    ``text`` should be an encoded byte string; unicode input skips the
    decode step. When ``encoding`` is None, getpreferredencoding (the
    console default) is used; pass an explicit encoding for sources
    with a known one — e.g. SVN ``--xml`` output is UTF-8 (SVN issue
    2938 and the 2007 Subversion dev-list discussion agree).
    """
    if encoding is None:
        encoding = _console_encoding
    if not isinstance(text, unicode):
        text = text.decode(encoding)
    # Explicitly return composed (NFC) form.
    return unicodedata.normalize('NFC', text)
def delete_friends(request):
    """Remove ``friendUsername`` from ``username``'s stored friend list.

    Reads both names from POST, filters the friend out of the stored
    list, saves the updated JSON back on the FriendList row, and
    returns the remaining friends as a JSON array. Any failure
    (missing user, bad stored data) yields an empty JSON list.
    """
    current_username = request.POST.get('username')
    current_friendname = request.POST.get('friendUsername')
    remaining = []
    try:
        existing_user = FriendList.objects.get(user__username=current_username)
        for friend in existing_user.getfoo():
            # Fold each stored name to plain ASCII before comparing.
            friend = unicodedata.normalize('NFKD', friend).encode('ascii', 'ignore')
            if friend == current_friendname:
                continue
            remaining.append(friend)
        existing_user.friendList = json.dumps(remaining)
        existing_user.save()
    except Exception:
        # Was a bare ``except:``, which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed while keeping the best-effort
        # empty-list fallback.
        remaining = []
    return HttpResponse(json.dumps(remaining))
def GetLineWidth(line):
    """Determines the width of the line in column positions.

    Args:
      line: A string, which may be a Unicode string.

    Returns:
      The width of the line in column positions, accounting for Unicode
      combining characters and wide characters.
    """
    if not isinstance(line, unicode):
        # Byte strings: one column per byte.
        return len(line)
    width = 0
    for char in unicodedata.normalize('NFC', line):
        if unicodedata.east_asian_width(char) in ('W', 'F'):
            width += 2          # wide / fullwidth characters take two columns
        elif not unicodedata.combining(char):
            width += 1          # combining marks take no column of their own
    return width
def append_utf8(self, text):
    """Append NFKD-normalized unicode ``text`` to the file at self.filepath.

    Raises IOError when the target file does not already exist.
    """
    try:
        from Naked.toolshed.system import file_exists
        if not file_exists(self.filepath):
            raise IOError("The file specified for the text append does not exist (Naked.toolshed.file.py:append_utf8).")
        import codecs
        import unicodedata
        # Normalize to NFKD before writing, matching the other UTF-8 writers.
        normalized = unicodedata.normalize('NFKD', text)
        with codecs.open(self.filepath, mode='a', encoding="utf_8") as appender:
            appender.write(normalized)
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to append text to the file with the append_utf8 method (Naked.toolshed.file.py).")
        raise e
#------------------------------------------------------------------------------
# [ gzip method (writer) ]
#   writes data to gzip compressed file
#   Note: adds .gz extension to filename if user did not specify it in the FileWriter class constructor
#   Note: uses compresslevel = 6 as default to balance speed and compression level (which in general is not significantly less than 9)
#   Tests: test_IO.py :: test_file_gzip_ascii_readwrite, test_file_gzip_utf8_readwrite,
#          test_file_gzip_utf8_readwrite_explicit_decode
#------------------------------------------------------------------------------
def gzip(self, text, compression_level=6):
    """Write ``text`` to self.filepath as gzip data, appending '.gz' if missing.

    If the text cannot be written directly (UnicodeEncodeError), it is
    NFKD-normalized, encoded as UTF-8, and the bytes are written instead.
    """
    try:
        import gzip
        if not self.filepath.endswith(".gz"):
            self.filepath += ".gz"
        with gzip.open(self.filepath, 'wb', compresslevel=compression_level) as gzip_writer:
            gzip_writer.write(text)
    except UnicodeEncodeError as ue:
        # Unicode fallback: normalize, encode, and retry with bytes.
        import unicodedata
        import codecs
        normalized = unicodedata.normalize('NFKD', text)
        encoded = codecs.encode(normalized, "utf_8")
        with gzip.open(self.filepath, 'wb', compresslevel=compression_level) as gzip_writer:
            gzip_writer.write(encoded)
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: unable to gzip compress the file with the gzip method (Naked.toolshed.file.py).")
        raise e
#------------------------------------------------------------------------------
# [ write method ]
#   Universal text file writer that writes by system default or utf-8 encoded unicode if throws UnicdeEncodeError
#   Tests: test_IO.py :: test_file_ascii_readwrite, test_file_ascii_readwrite_missing_file,
#          test_file_utf8_write_raises_unicodeerror
#------------------------------------------------------------------------------
def readlines_utf8(self):
    """Read a UTF-8 file and return its lines, each NFKD-normalized."""
    try:
        import codecs
        import unicodedata
        with codecs.open(self.filepath, encoding='utf-8', mode='r') as uni_reader:
            # Normalize each line to NFKD before handing it back.
            return [unicodedata.normalize('NFKD', line) for line in uni_reader]
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: unable to read lines in the unicode file with the readlines_utf8 method (Naked.toolshed.file.py)")
        raise e
#------------------------------------------------------------------------------
# [ read_gzip ] (byte string)
#   reads data from a gzip compressed file
#   returns the decompressed binary data from the file
#   Note: if decompressing unicode file, set encoding="utf-8"
#   Tests: test_IO.py :: test_file_gzip_ascii_readwrite, test_file_gzip_utf8_readwrite,
#          test_file_read_gzip_missing_file
#------------------------------------------------------------------------------
def read_utf8(self):
    """Read the file as UTF-8 and return its full text, NFKD-normalized."""
    try:
        import codecs
        f = codecs.open(self.filepath, encoding='utf_8', mode='r')
    except IOError as ioe:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to open file for read with read_utf8() method (Naked.toolshed.file.py).")
        raise ioe
    try:
        import unicodedata
        # NFKD normalization of the unicode data before returning it.
        return unicodedata.normalize('NFKD', f.read())
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to read the file with UTF-8 encoding using the read_utf8() method (Naked.toolshed.file.py).")
        raise e
    finally:
        f.close()
def write_utf8(self, text):
    """Write NFKD-normalized unicode ``text`` to self.filepath as UTF-8."""
    try:
        import codecs
        f = codecs.open(self.filepath, encoding='utf_8', mode='w')
    except IOError as ioe:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to open file for write with the write_utf8() method (Naked.toolshed.file.py).")
        raise ioe
    try:
        import unicodedata
        # NFKD normalization of the unicode data before the write.
        f.write(unicodedata.normalize('NFKD', text))
    except Exception as e:
        if DEBUG_FLAG:
            sys.stderr.write("Naked Framework Error: Unable to write UTF-8 encoded text to file with the write_utf8() method (Naked.toolshed.file.py).")
        raise e
    finally:
        f.close()
#------------------------------------------------------------------------------
# [ FileReader class ]
#   reads data from local files
#   filename assigned in constructor (inherited from IO class interface)
#------------------------------------------------------------------------------
def sanitize_separators(value):
    """
    Sanitizes a value according to the current decimal and thousand
    separator setting. Used with form field input.
    """
    if not (settings.USE_L10N and isinstance(value, six.string_types)):
        return value
    parts = []
    decimal_separator = get_format('DECIMAL_SEPARATOR')
    if decimal_separator in value:
        value, decimals = value.split(decimal_separator, 1)
        parts.append(decimals)
    if settings.USE_THOUSAND_SEPARATOR:
        thousand_sep = get_format('THOUSAND_SEPARATOR')
        if (thousand_sep == '.' and value.count('.') == 1 and
                len(value.split('.')[-1]) != 3):
            # Special case where we suspect a dot meant decimal separator (see #22171)
            pass
        else:
            # Remove the separator in both its literal and NFKD form
            # (e.g. non-breaking space vs plain space).
            for replacement in {thousand_sep,
                                unicodedata.normalize('NFKD', thousand_sep)}:
                value = value.replace(replacement, '')
    parts.append(value)
    # Reassemble as integer-part '.' decimal-part.
    return '.'.join(reversed(parts))
def chars(self, num, truncate=None, html=False):
    """
    Return the text truncated to at most the given number of characters.

    ``truncate`` optionally overrides the suffix used to signal that
    truncation happened (default: a translatable ellipsis).
    """
    length = int(num)
    text = unicodedata.normalize('NFC', self._wrapped)

    # Budget for content characters: subtract the truncation suffix
    # length, ignoring combining marks (they add no visible width).
    truncate_len = length
    for char in self.add_truncation_text('', truncate):
        if not unicodedata.combining(char):
            truncate_len -= 1
            if truncate_len == 0:
                break

    if html:
        return self._truncate_html(length, truncate, text, truncate_len, False)
    return self._text_chars(length, truncate, text, truncate_len)
def getLcdPiconName(serviceName):
    """Locate an LCD picon file for a service reference, with fallbacks."""
    # Drop the path and name fields and join the reference with '_' instead of ':'.
    sname = '_'.join(GetWithAlternative(serviceName).split(':', 10)[:10])
    pngname = findLcdPicon(sname)
    if not pngname:
        fields = sname.split('_', 3)
        # Fall back to service type 1 for services with different service types.
        if len(fields) > 2 and fields[2] != '1':
            fields[2] = '1'
        # Fall back to reftype 1 for other reftypes.
        if len(fields) > 0 and fields[0] != '1':
            fields[0] = '1'
        pngname = findLcdPicon('_'.join(fields))
    if not pngname:
        # Last resort: look the picon up by the ASCII-folded channel name.
        name = ServiceReference(serviceName).getServiceName()
        name = unicodedata.normalize('NFKD', unicode(name, 'utf_8', errors='ignore')).encode('ASCII', 'ignore')
        for old, new in (('&', 'and'), ('+', 'plus'), ('*', 'star')):
            name = name.replace(old, new)
        name = re.sub('[^a-z0-9]', '', name.lower())
        if len(name) > 0:
            pngname = findLcdPicon(name)
            if not pngname and len(name) > 2 and name.endswith('hd'):
                # Retry without a trailing 'hd' suffix.
                pngname = findLcdPicon(name[:-2])
    return pngname
def getPiconLName(serviceName):
    """Locate a large picon file for a service reference, with fallbacks."""
    # Drop the path and name fields and join the reference with '_' instead of ':'.
    sname = '_'.join(GetWithAlternative(serviceName).split(':', 10)[:10])
    pngname = findPiconL(sname)
    if not pngname:
        fields = sname.split('_', 3)
        # Fall back to service type 1 for tv services with nonstandard service types.
        if len(fields) > 2 and fields[2] != '2':
            fields[2] = '1'
        pngname = findPiconL('_'.join(fields))
    if not pngname:
        # Last resort: look up by the ASCII-folded channel name.
        name = ServiceReference(serviceName).getServiceName()
        name = unicodedata.normalize('NFKD', unicode(name, 'utf_8', errors='ignore')).encode('ASCII', 'ignore')
        excludeChars = ['/', '\\', '\'', '"', '`', '?', ' ', '(', ')', ':', '<', '>', '|', '.', '\n']
        name = re.sub('[%s]' % ''.join(excludeChars), '', name)
        for old, new in (('&', 'and'), ('+', 'plus'), ('*', 'star')):
            name = name.replace(old, new)
        name = name.lower()
        if len(name) > 0:
            # NOTE(review): the name-based fallback calls findPicon rather than
            # findPiconL — possibly intentional (reuse regular picons when no
            # large one exists by name), but worth confirming.
            pngname = findPicon(name)
            if not pngname and len(name) > 2 and name.endswith('hd'):
                pngname = findPicon(name[:-2])
    return pngname
def weekday_portuguese_to_english(string):
    """Translate a Portuguese weekday name or abbreviation to English.

    Accepts mixed case, surrounding whitespace, accents, and hyphenated
    or comma-suffixed forms, e.g. "Segunda-feira" -> "Monday",
    "sáb" -> "Saturday". Returns None for unrecognized input (same as
    the original's implicit fall-through).
    """
    string = string.lower().strip()
    # Hyphens become spaces, so "segunda-feira" reduces to its first word.
    string = string.replace("-", " ")
    # Strip accents: decompose, then drop combining marks.
    string = ''.join(c for c in unicodedata.normalize('NFD', string)
                     if unicodedata.category(c) != 'Mn')
    string = string.replace(",", " ")
    word = string.split(" ")[0]
    # Dict dispatch replaces the original if/elif ladder. Keys are
    # accent-free, first-word-only forms: accents are stripped and
    # hyphens replaced above, so the original's accented/hyphenated
    # variants ("terça", "terca-feira", ...) were unreachable dead code.
    days = {
        "dom": "Sunday", "domingo": "Sunday",
        "seg": "Monday", "segunda": "Monday",
        "ter": "Tuesday", "terca": "Tuesday",
        "qua": "Wednesday", "quarta": "Wednesday",
        "qui": "Thursday", "quinta": "Thursday",
        "sex": "Friday", "sexta": "Friday",
        "sab": "Saturday", "sabado": "Saturday",
    }
    return days.get(word)
def normalize_string(text):
    '''normalize string, strip all special chars'''
    # Drop unsafe punctuation outright.
    for char in ('<', '>', '*', '?', '|', '(', ')', '"'):
        text = text.replace(char, "")
    # Path-like separators: colon removed, slashes become hyphens.
    text = text.replace(":", "")
    text = text.replace("/", "-")
    text = text.replace("\\", "-")
    text = text.strip().rstrip('.')
    if not isinstance(text, unicode):
        text = text.decode("utf-8")
    return unicodedata.normalize('NFKD', text)
def to_unicode(source, encoding="utf-8", param="value"):
    """Coerce input to unicode.

    :arg source: bytes/unicode value to process.
    :arg encoding: encoding used to decode bytes instances.
    :param param: name of the variable/noun referenced in error messages.

    :raises TypeError: if source is neither unicode nor bytes.

    :returns: unicode input unchanged; bytes decoded via *encoding*.
    """
    assert encoding
    if isinstance(source, unicode):
        return source
    if isinstance(source, bytes):
        return source.decode(encoding)
    raise ExpectedStringError(source, param)
def slugify(value, allow_unicode=False):
    """Turn ``value`` into a slug safe for use as a filename.

    Whitespace runs collapse to single hyphens; characters other than
    alphanumerics, underscores and hyphens are dropped, and leading or
    trailing whitespace is stripped. With ``allow_unicode`` the text is
    NFKC-normalized and kept as-is; otherwise it is NFKD-folded down to
    plain ASCII.
    """
    import unicodedata
    value = str(value)
    if allow_unicode:
        normalized = unicodedata.normalize('NFKC', value)
    else:
        folded = unicodedata.normalize('NFKD', value)
        normalized = folded.encode('ascii', 'ignore').decode('ascii')
    cleaned = re.sub(r'[^\w\s-]', '', normalized).strip()
    return re.sub(r'[-\s]+', '-', cleaned)
# Below from
# http://stackoverflow.com/questions/2333872/atomic-writing-to-file-with-python
async def charinfo(self, ctx, *, chars):
    """Get unicode character info.

    Sends an embed with one field per character: its Unicode name, the
    character itself (spaces shown quoted), its escape sequence, and a
    link to fileformat.info. Rejects input longer than 25 characters.

    Fix: restored the ``async`` keyword — the body uses ``await``,
    which is a SyntaxError inside a plain ``def``.
    """
    if not chars:
        return
    chars = unicodedata.normalize('NFC', chars)
    if len(chars) > 25:
        await ctx.send('Too many emoji.')
        return
    embed = discord.Embed()
    for char in chars:
        codepoint = hex(ord(char))[2:]
        name = unicodedata.name(char, 'unknown')
        # Space-like characters would render invisibly — show quotes instead.
        if name in {'SPACE', 'EM QUAD', 'EN QUAD'} or ' SPACE' in name:
            char = '" "'
        # BMP code points use \uXXXX; astral ones need \UXXXXXXXX.
        is_bmp = len(codepoint) <= 4
        escape = f'`\\{"u" if is_bmp else "U"}{codepoint.lower().zfill(4 if is_bmp else 8)}`'
        embed.add_field(name=name, value=f'{char} [{escape}](http://www.fileformat.info/info/unicode/char/{codepoint}/index.htm)')
    await ctx.send(embed=embed)
def filename(self):
    '''Normalized, file-system-safe version of the client-side filename.

    Keeps only ASCII letters, digits, dashes, underscores and dots;
    strips accents where possible, collapses whitespace to single
    dashes, trims leading/trailing dots and dashes, and limits the
    result to 255 characters. An empty name is returned as 'empty'.
    '''
    fname = self.raw_filename
    if not isinstance(fname, unicode):
        fname = fname.decode('utf8', 'ignore')
    # Decompose accented characters, then keep only their ASCII base.
    fname = normalize('NFKD', fname).encode('ASCII', 'ignore').decode('ASCII')
    # Never trust client-supplied directory components.
    fname = os.path.basename(fname.replace('\\', os.path.sep))
    fname = re.sub(r'[^a-zA-Z0-9-_.\s]', '', fname).strip()
    fname = re.sub(r'[-\s]+', '-', fname).strip('.-')
    return fname[:255] or 'empty'
def __init__(self, form='NFKC', strip=True, collapse=True, hyphens=False,
             quotes=False, ellipsis=False, slashes=False, tildes=False):
    """Configure a text normalizer.

    :param string form: Unicode normal form to apply (e.g. 'NFKC').
    :param bool strip: Strip whitespace from start and end.
    :param bool collapse: Collapse all whitespace (tabs, newlines) down
        to single spaces.
    :param bool hyphens: Map hyphen, minus and dash variants to the
        ASCII hyphen-minus.
    :param bool quotes: Map apostrophes, quotes and primes to the plain
        ASCII quote character.
    :param bool ellipsis: Map ellipsis characters to three full stops.
    :param bool slashes: Map slash variants to the ASCII slash.
    :param bool tildes: Map tilde variants to the ASCII tilde.
    """
    self.form = form
    self.strip = strip
    self.collapse = collapse
    self.hyphens = hyphens
    self.quotes = quotes
    self.ellipsis = ellipsis
    self.slashes = slashes
    self.tildes = tildes
def normalize_string(text):
    """Strip filesystem-unsafe characters and return NFKD-normalized text."""
    import unicodedata
    # Slashes map to hyphens; the rest of the unsafe set is removed.
    replacements = {":": "", "/": "-", "\\": "-", "<": "", ">": "", "*": "",
                    "?": "", "|": "", "(": "", ")": "", "\"": ""}
    for old, new in replacements.items():
        text = text.replace(old, new)
    text = text.strip().rstrip('.')
    return unicodedata.normalize('NFKD', try_decode(text))
def filename(self):
    """File-system-compatible form of the uploaded file's name.

    Result contains only ASCII letters, digits, dashes, underscores
    and dots; accents are folded away, whitespace becomes a single
    dash, edge dots/dashes are trimmed, and length is capped at 255.
    Falls back to 'empty' for a name that sanitizes to nothing.
    """
    raw = self.raw_filename
    if not isinstance(raw, unicode):
        raw = raw.decode('utf8', 'ignore')
    ascii_name = normalize('NFKD', raw).encode('ASCII', 'ignore').decode('ASCII')
    # Drop client-supplied path components (both separators).
    base = os.path.basename(ascii_name.replace('\\', os.path.sep))
    base = re.sub(r'[^a-zA-Z0-9-_.\s]', '', base).strip()
    base = re.sub(r'[-\s]+', '-', base).strip('.-')
    return base[:255] or 'empty'
def urlify(s, maxlen=80, keep_underscores=False):
    """
    Converts incoming string to a simplified ASCII subset.
    if (keep_underscores): underscores are retained in the string
    else: underscores are translated to hyphens (default)
    """
    s = to_unicode(s).lower()                        # unicode, lowercase
    s = unicodedata.normalize('NFKD', s)             # decompose special characters
    s = to_native(s, charset='ascii', errors='ignore')  # fold down to ASCII
    s = re.sub(r'&\w+?;', '', s)                     # strip html entities
    if keep_underscores:
        s = re.sub(r'\s+', '-', s)                   # whitespace to hyphens
        s = re.sub(r'[^\w\-]', '', s)                # keep alphanumeric/underscore/hyphen
    else:
        s = re.sub(r'[\s_]+', '-', s)                # whitespace & underscores to hyphens
        s = re.sub(r'[^a-z0-9\-]', '', s)            # keep alphanumeric/hyphen
    s = re.sub(r'[-_][-_]+', '-', s)                 # collapse runs of hyphens
    s = s.strip('-')                                 # trim edge hyphens
    return s[:maxlen]                                # enforce maximum length
def remove_accents(self, string):
    """Return ``string`` with accents stripped via NFKD decomposition."""
    decomposed = unicodedata.normalize('NFKD', str(string))
    # Combining marks carry the accents; keep everything else.
    return ''.join(ch for ch in decomposed if not unicodedata.combining(ch))
def _convert_transaction(transaction):
    """Build an OFX STMTTRN XML fragment from a transaction dict."""
    stamp = transaction['date'].strftime("%Y%m%d%H%M%S")
    # Memo is accent-folded down to plain ASCII bytes.
    memo = unicodedata.normalize('NFD', transaction['description']).encode('ascii', 'ignore')
    # NOTE(review): FITID reuses the timestamp, so two transactions posted in
    # the same second would collide — confirm uniqueness is acceptable upstream.
    return dict2xml.convert("STMTTRN", {
        "DTPOSTED": stamp,
        "FITID": stamp,
        "TRNAMT": transaction['signal'] + transaction['amount'],
        "MEMO": memo,
    })
def decode(self, text, encoding=None, normalization=None):
    """Return ``text`` as normalised unicode.

    If ``encoding`` and/or ``normalization`` is ``None``, the
    ``input_encoding`` and ``normalization`` parameters passed to
    :class:`Workflow` are used.

    :param text: string
    :type text: encoded or Unicode string. If ``text`` is already a
        Unicode string, it will only be normalised.
    :param encoding: The text encoding to use to decode ``text`` to Unicode.
    :type encoding: ``unicode`` or ``None``
    :param normalization: The normalisation form to apply to ``text``.
    :type normalization: ``unicode`` or ``None``
    :returns: decoded and normalised ``unicode``

    :class:`Workflow` uses "NFC" normalisation by default. This is the
    standard for Python and will work well with data from the web (via
    :mod:`~workflow.web` or :mod:`json`). OS X, on the other hand, uses
    "NFD" normalisation (nearly), so data coming from the system (e.g.
    via :mod:`subprocess` or :func:`os.listdir`/:mod:`os.path`) may not
    match. You should either normalise this data, too, or change the
    default normalisation used by :class:`Workflow`.
    """
    encoding = encoding or self._input_encoding
    # NOTE(review): '_normalizsation' looks like a typo, but it must match
    # whatever name the (unseen) constructor assigns — confirm before renaming.
    normalization = normalization or self._normalizsation
    if not isinstance(text, unicode):
        text = unicode(text, encoding)
    return unicodedata.normalize(normalization, text)
def uni(s):
    """Decode UTF-8 bytes ``s`` and return the NFD-normalised unicode."""
    return normalize('NFD', s.decode('utf-8'))
def text(self):
    """Unicode-decoded content of response body.

    Falls back to the raw encoded body when no encoding could be
    determined from the HTTP headers or the content itself.

    :returns: Body of HTTP response
    :rtype: :class:`unicode` or :class:`str`
    """
    if not self.encoding:
        return self.content
    decoded = unicode(self.content, self.encoding)
    return unicodedata.normalize('NFC', decoded)
def decompose(path):
    """Return ``path`` in NFD form; non-UTF-8 byte strings pass through unchanged."""
    if isinstance(path, six.text_type):
        return unicodedata.normalize('NFD', path)
    try:
        decoded = path.decode('utf-8')
        return unicodedata.normalize('NFD', decoded).encode('utf-8')
    except UnicodeError:
        return path  # Not UTF-8: leave the bytes as-is
def check_nfc(label):
    """Raise IDNAError unless ``label`` is already in Normalization Form C."""
    if label != unicodedata.normalize('NFC', label):
        raise IDNAError('Label must be in Normalization Form C')
def uts46_remap(domain, std3_rules=True, transitional=False):
    """Re-map the characters in the string according to UTS46 processing.

    Each code point is looked up in the uts46data table and either kept,
    replaced, dropped, or rejected based on its status column:
      V = valid, M = mapped, D = deviation, I = ignored, 3 = STD3-dependent.
    Raises InvalidCodepoint for disallowed code points.
    """
    from .uts46data import uts46data
    output = u""
    try:
        for pos, char in enumerate(domain):
            code_point = ord(char)
            # Rows for code points < 256 are indexed directly; otherwise
            # bisect to the last row whose range starts at or before it.
            uts46row = uts46data[code_point if code_point < 256 else bisect.bisect_left(uts46data, (code_point, "Z")) - 1]
            status = uts46row[1]
            replacement = uts46row[2] if len(uts46row) == 3 else None
            # Keep the character when it is valid, a deviation in
            # non-transitional mode, or STD3-dependent with no mapping.
            if (status == "V" or
                (status == "D" and not transitional) or
                (status == "3" and std3_rules and replacement is None)):
                output += char
            # Substitute when a mapping applies under the current rules.
            elif replacement is not None and (status == "M" or
                (status == "3" and std3_rules) or
                (status == "D" and transitional)):
                output += replacement
            # Ignored characters are dropped; anything else is disallowed.
            elif status != "I":
                raise IndexError()
        return unicodedata.normalize("NFC", output)
    except IndexError:
        # Reuses the loop variables (code_point, pos) from the failing iteration.
        raise InvalidCodepoint(
            "Codepoint {0} not allowed at position {1} in {2}".format(
                _unot(code_point), pos + 1, repr(domain)))