我们从Python开源项目中,提取了以下19个代码示例,用于说明如何使用regex.UNICODE。
def __init__(self, pattern):
    # type: (Union[Text, regex._pattern_type, re._pattern_type]) -> None
    """
    :param pattern: String pattern, or a pre-compiled regex object.

        IMPORTANT: if you pass your own compiled regex, be sure to
        compile it with the ``UNICODE`` flag so Unicode input is
        handled correctly!
    """
    super(Regex, self).__init__()

    if isinstance(pattern, (regex._pattern_type, re._pattern_type)):
        # Already compiled; use it as-is and trust the caller's flags.
        self.regex = pattern
    else:
        # Compile string patterns ourselves, with Unicode support.
        self.regex = regex.compile(pattern, regex.UNICODE)
def __init__(self, pattern, keys=None):
    # type: (Union[Text, regex._pattern_type, re._pattern_type], Optional[Sequence[Text]]) -> None
    """
    :param pattern: Regex used to split incoming string values.

        IMPORTANT: if you pass your own compiled regex, be sure to
        compile it with the ``UNICODE`` flag so Unicode input is
        handled correctly!

    :param keys: If set, the resulting list will be converted into an
        OrderedDict, using the specified keys.

        IMPORTANT: when ``keys`` is set, the split value's length must
        be less than or equal to ``len(keys)``.
    """
    super(Split, self).__init__()

    if isinstance(pattern, (regex._pattern_type, re._pattern_type)):
        # Pre-compiled regex supplied by the caller.
        self.regex = pattern
    else:
        # Compile string patterns ourselves, with Unicode support.
        self.regex = regex.compile(pattern, regex.UNICODE)

    self.keys = keys
def __init__(self, **kwargs):
    """
    Build the master tokenization regex from the class-level patterns.

    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    # CONSISTENCY FIX: the original mixed one .NET-style group,
    # (?<ellipses>...), in with fifteen (?P<name>...) groups.  The
    # `regex` module accepts both spellings, but stdlib `re` does not,
    # so normalize everything to the portable (?P<name>...) form.
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?P<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)'
        % (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION,
           self.HYPHEN, self.CONTRACTION1, self.ALPHA_NUM,
           self.CONTRACTION2, self.START_DQUOTE, self.END_DQUOTE,
           self.START_SQUOTE, self.END_SQUOTE, self.DASH,
           self.ELLIPSES, self.PUNCT, self.NON_WS),
        # Flags are bit masks: combine them with bitwise OR.  `+` only
        # happens to work while no flag appears twice.
        flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE,
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
def __init__(self, leading=r'[\p{C}\s]+', trailing=r'[\p{C}\s]+'):
    # type: (Text, Text) -> None
    """
    :param leading: Regex to match at the start of the string.
    :param trailing: Regex to match at the end of the string.
    """
    super(Strip, self).__init__()

    # Either matcher may be disabled by passing a falsy pattern.
    self.leading = None
    self.trailing = None

    if leading:
        # Anchor the pattern to the beginning of the value.
        self.leading = regex.compile('^' + leading, regex.UNICODE)

    if trailing:
        # Anchor the pattern to the end of the value.
        self.trailing = regex.compile(trailing + '$', regex.UNICODE)
def __init__(self, encoding='utf-8', normalize=True):
    # type: (Text, bool) -> None
    """
    :param encoding: Used to decode non-unicode values.
    :param normalize: Whether to normalize the resulting value:
        - Convert to NFC form.
        - Remove non-printable characters.
        - Convert all line endings to unix-style ('\n').
    """
    super(Unicode, self).__init__()

    self.encoding = encoding
    self.normalize = normalize

    if self.normalize:
        # Compiled once here and reused to strip non-printables from
        # the resulting unicode.
        # http://www.regular-expressions.info/unicode.html#category
        #
        # The class is a double negative ("not a non-control-or-
        # whitespace char") so that newlines -- technically control
        # characters -- survive the scrub.
        # http://stackoverflow.com/a/3469155
        self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
def test_special_escapes(self):
    # Exercises zero-width escapes and shorthand classes under the three
    # flag regimes (default, LOCALE, UNICODE).  Python 2 file: ur"" / u""
    # literals distinguish unicode patterns from byte patterns.
    # \b / \B word boundaries on byte strings with default semantics.
    self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx")[1], 'bx')
    self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd")[1], 'bx')
    # Same boundaries under LOCALE semantics.
    self.assertEqual(regex.search(r"\b(b.)\b", "abcd abc bcd bx",
        regex.LOCALE)[1], 'bx')
    self.assertEqual(regex.search(r"\B(b.)\B", "abc bcd bc abxd",
        regex.LOCALE)[1], 'bx')
    # Same boundaries on unicode strings with an explicit UNICODE flag.
    self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx",
        regex.UNICODE)[1], u'bx')
    self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd",
        regex.UNICODE)[1], u'bx')
    # ^/$ in MULTILINE mode, combined with the absolute \A / \Z anchors
    # (which must not match past embedded newlines).
    self.assertEqual(regex.search(r"^abc$", "\nabc\n", regex.M)[0], 'abc')
    self.assertEqual(regex.search(r"^\Aabc\Z$", "abc", regex.M)[0], 'abc')
    self.assertEqual(regex.search(r"^\Aabc\Z$", "\nabc\n", regex.M), None)
    # Unicode-pattern variants of the boundary and anchor checks.
    self.assertEqual(regex.search(ur"\b(b.)\b", u"abcd abc bcd bx")[1],
        u'bx')
    self.assertEqual(regex.search(ur"\B(b.)\B", u"abc bcd bc abxd")[1],
        u'bx')
    self.assertEqual(regex.search(ur"^abc$", u"\nabc\n", regex.M)[0],
        u'abc')
    self.assertEqual(regex.search(ur"^\Aabc\Z$", u"abc", regex.M)[0],
        u'abc')
    self.assertEqual(regex.search(ur"^\Aabc\Z$", u"\nabc\n", regex.M),
        None)
    # Shorthand classes \d \D \w \W \s \S under each flag regime.
    self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a")[0], '1aa! a')
    self.assertEqual(regex.search(r"\d\D\w\W\s\S", "1aa! a",
        regex.LOCALE)[0], '1aa! a')
    self.assertEqual(regex.search(ur"\d\D\w\W\s\S", u"1aa! a",
        regex.UNICODE)[0], u'1aa! a')
def test_bigcharset(self):
    # Verifies large/unicode character sets.  Python 2 file: ur"" / u""
    # literals are unicode patterns and strings.
    # Inline (?u) flag, alone and combined with an explicit UNICODE flag.
    self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222")[1],
        u'\u2222')
    self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222",
        regex.UNICODE)[1], u'\u2222')
    # '.' matches each accented character individually under UNICODE.
    self.assertEqual(u"".join(regex.findall(u".",
        u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117",
        flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
    # The same set written as a character class spanning Latin-1 and
    # Latin Extended...
    self.assertEqual(u"".join(regex.findall(
        ur"[e\xe8\xe9\xea\xeb\u0113\u011b\u0117]",
        u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117",
        flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
    # ...and as an equivalent alternation; all three must agree.
    self.assertEqual(u"".join(regex.findall(
        ur"e|\xe8|\xe9|\xea|\xeb|\u0113|\u011b|\u0117",
        u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117",
        flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
def test_ascii_and_unicode_flag(self):
    # Contrasts UNICODE vs ASCII matching semantics for unicode patterns
    # and byte ("string") patterns.  Python 2 file: byte patterns default
    # to ASCII semantics, which is why the byte-string cases expect None.
    # Unicode patterns.
    for flags in (0, regex.UNICODE):
        # Case-insensitive matching crosses Latin-1 case pairs by
        # default for unicode patterns (\xc0 'A-grave' vs \xe0
        # 'a-grave'), and \w covers accented letters.
        pat = regex.compile(u'\xc0', flags | regex.IGNORECASE)
        self.assertEqual(bool(pat.match(u'\xe0')), True)
        pat = regex.compile(u'\w', flags)
        self.assertEqual(bool(pat.match(u'\xe0')), True)
    # ASCII semantics -- requested via the ASCII flag or the inline (?a)
    # flag -- restrict both case folding and \w to ASCII.
    pat = regex.compile(u'\xc0', regex.ASCII | regex.IGNORECASE)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'(?a)\xc0', regex.IGNORECASE)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'\w', regex.ASCII)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'(?a)\w')
    self.assertEqual(pat.match(u'\xe0'), None)
    # String patterns.
    for flags in (0, regex.ASCII):
        # Byte patterns get ASCII semantics whether or not the flag is
        # passed explicitly.
        pat = regex.compile('\xc0', flags | regex.IGNORECASE)
        self.assertEqual(pat.match('\xe0'), None)
    pat = regex.compile('\w')
    self.assertEqual(pat.match('\xe0'), None)
    # Requesting both ASCII and UNICODE inline is contradictory and
    # must raise.
    self.assertRaisesRegex(ValueError, self.MIXED_FLAGS, lambda:
        regex.compile('(?au)\w'))
def regex_match(text, pattern):
    """Test if a regex pattern is contained within a text.

    Args:
        text: The string to scan.
        pattern: An (uncompiled) regular-expression string.

    Returns:
        True if ``pattern`` matches anywhere in ``text``; False when
        there is no match or when the pattern cannot be compiled.
    """
    try:
        compiled = re.compile(
            pattern,
            # Flags are bit masks: combine them with bitwise OR.
            flags=re.IGNORECASE | re.UNICODE | re.MULTILINE,
        )
    except (re.error, TypeError):
        # Treat an invalid pattern (or non-string input) as "no match".
        # The original `except BaseException` also swallowed
        # KeyboardInterrupt/SystemExit, which must propagate.
        return False
    return compiled.search(text) is not None
def regex_match_score(prediction, pattern):
    """Check if the prediction matches the given regular expression.

    Args:
        prediction: The string to test.
        pattern: An (uncompiled) regular-expression string.

    Returns:
        True if ``pattern`` matches at the start of ``prediction``;
        False on no match or when the pattern fails to compile.
    """
    try:
        compiled = re.compile(
            pattern,
            # Flags are bit masks: combine them with bitwise OR.
            flags=re.IGNORECASE | re.UNICODE | re.MULTILINE,
        )
    except (re.error, TypeError):
        # `logger.warn` is a deprecated alias -- use `warning`, and let
        # logging do the (lazy) interpolation.  Narrowed from the
        # original blanket `except BaseException`.
        logger.warning('Regular expression failed to compile: %s', pattern)
        return False
    # `match` anchors at the beginning of the prediction (not a full
    # substring search).
    return compiled.match(prediction) is not None
def __init__(self, **kwargs):
    """
    Build the whitespace/alphanumeric tokenization regex.

    Args:
        annotators: None or empty set (only tokenizes).
    """
    # Group 1: alphanumeric runs; group 2: any other non-whitespace.
    self._regexp = regex.compile(
        '(%s)|(%s)' % (self.ALPHA_NUM, self.NON_WS),
        # Flags are bit masks: combine them with bitwise OR.  `+` only
        # happens to work while no flag appears twice.
        flags=regex.IGNORECASE | regex.UNICODE | regex.MULTILINE,
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
def remove_elongation(text):
    """Collapse any character repeated four or more times down to two.

    e.g. 'soooooo' -> 'soo'.
    """
    # (.)\1{3,} = a character followed by 3+ copies of itself.
    elongation_re = regex.compile(r'(.)\1{3,}', flags=regex.UNICODE)
    return elongation_re.sub(r'\1\1', text)
def clean(text):
    """Collapse every run of whitespace (spaces, tabs, newlines) into a
    single space.

    :param text: The string to clean.
    :return: The cleaned string.
    """
    # removing extra spaces -- `\s` already includes `\n`, so the
    # original class `[\s\n]+` was redundant; `\s+` matches the exact
    # same runs.
    text = regex.sub(r'\s+', ' ', text, flags=regex.UNICODE)
    # todo : add more cleaning methods
    return text
def make_xpath_ranges(html, phrase):
    '''Given a HTML string and a `phrase`, build a regex to find offsets
    for the phrase, and then build a list of `XPathRange` objects for
    it.  If this fails, return empty list.
    '''
    if not html:
        return []
    if not isinstance(phrase, unicode):
        try:
            phrase = phrase.decode('utf8')
        except Exception:
            # BUG FIX: the original log call had a %r placeholder but
            # never passed `phrase`, so the message could not format.
            logger.info('failed %r.decode("utf8")', phrase, exc_info=True)
            return []
    # NOTE(review): `phrase` is compiled verbatim, so any regex
    # metacharacters in it change matching; escape upstream if the
    # phrase is meant to be literal text.
    phrase_re = re.compile(
        phrase, flags=re.UNICODE | re.IGNORECASE | re.MULTILINE)
    # BUG FIX: stdlib re's finditer() does not accept the `overlapped`
    # keyword (that is a `regex`-module extension), so the original
    # call raised TypeError on every invocation.  Non-overlapping
    # matching -- what overlapped=False asked for -- is the default.
    spans = [match.span() for match in phrase_re.finditer(html)]
    # a list of tuple(start, end) char indexes

    # now run fancy aligner magic to get xpath info and format them as
    # XPathRange per above
    try:
        xpath_ranges = list(char_offsets_to_xpaths(html, spans))
    except Exception:
        # Narrowed from a bare `except:` so SystemExit/KeyboardInterrupt
        # still propagate; alignment failure remains best-effort.
        logger.info('failed to get xpaths', exc_info=True)
        return []
    ranges = []
    for xpath_range in filter(None, xpath_ranges):
        # idx is converted to 1-based for the start offset only.
        ranges.append(dict(
            start=dict(node=xpath_range.start_xpath,
                       idx=xpath_range.start_offset + 1),
            end=dict(node=xpath_range.end_xpath,
                     idx=xpath_range.end_offset)))
    return ranges
def eval_escapes(s):
    """
    Given a string, evaluate escape sequences starting with backslashes as
    they would be evaluated in Python source code. For a list of these
    sequences, see: https://docs.python.org/3/reference/lexical_analysis.html

    This is not the same as decoding the whole string with the
    'unicode-escape' codec, because that provides no way to handle
    non-ASCII characters that are literally present in the string.
    """
    # by Rob Speer
    # Comment fix inside the VERBOSE pattern: \xHH is a *hex* escape,
    # not a Unicode escape (the old label was misleading); \U/\u take
    # 8/4 hex digits respectively.
    escape_sequence_re = re.compile(
        r'''
        ( \\U........      # 8-digit hex escapes
        | \\u....          # 4-digit hex escapes
        | \\x..            # 2-digit hex escapes
        | \\[0-7]{1,3}     # Octal character escapes
        | \\N\{[^}]+\}     # Unicode characters by name
        | \\[\\'"abfnrtv]  # Single-character escapes
        )''',
        re.UNICODE | re.VERBOSE
    )

    def decode_match(match):
        # Decode each escape in isolation so literal non-ASCII
        # characters elsewhere in `s` pass through untouched.
        return codecs.decode(match.group(0), 'unicode-escape')

    return escape_sequence_re.sub(decode_match, s)