We extracted the following 50 code examples from open-source Python projects to illustrate how to use regex.compile().
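For orientation before the extracted examples: regex.compile() mirrors the standard library's re.compile() API, turning a pattern string into a reusable compiled pattern object, while adding features such as Unicode property classes, fuzzy matching, and named lists. A minimal sketch (the sample text is made up):

import regex

# Compile once, reuse many times. \p{L} ("any Unicode letter") is a
# regex-module feature that several examples below rely on.
word_re = regex.compile(r'\p{L}+')

print(word_re.findall('Hello, wörld!'))  # -> ['Hello', 'wörld']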
def parse_primary_source(text):
    """Given a primary source text reference, return the abbreviation
    of the primary source."""
    ref_regex = regex.compile(r"""
        (?P<psource>(\w+\.?\s?)+)
        (\(?\d\)\s?)+
        """, regex.VERBOSE)
    res = regex.search(ref_regex, text)
    # Note: the group name must be passed as a string; the original
    # `res.group(psource)` referenced an undefined name.
    if res and res.group('psource'):
        return res.group('psource')
    return None
def expect_regex(self, pattern):
    """Read until matches pattern or timeout."""
    # inspired by pexpect/pty_spawn and pexpect/expect.py expect_loop
    end_time = time.time() + self.timeout
    buf = ''
    prog = regex.compile(pattern)
    while (end_time - time.time()) > 0.0:
        # switch to nonblocking read
        reads, _, _ = select.select([self.fd], [], [], end_time - time.time())
        if len(reads) > 0:
            try:
                buf += self.read()
            except EOFError:
                assert prog.match(buf) is not None, \
                    'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
            if prog.match(buf):
                return True
        else:
            # do not eat up CPU when waiting for the timeout to expire
            time.sleep(self.timeout / 10)
    assert prog.match(buf) is not None, \
        'output was:\n%s\nexpect regex pattern:\n%s' % (buf, pattern)
def test_pass_precompiled_regex(self):
    """
    You can alternatively provide a precompiled regex to the Filter
    instead of a string pattern.
    """
    # Compile our own pattern so that we can specify the
    # ``IGNORECASE`` flag.
    # Note that you are responsible for adding the ``UNICODE`` flag
    # to your compiled regex!
    # noinspection SpellCheckingInspection
    pattern = re.compile(r'\btest\b', re.IGNORECASE | re.UNICODE)

    self.assertFilterPasses(
        self._filter('test march of the TEST penguins', pattern=pattern),
        ['test', 'TEST'],
    )
def test_pass_regex_library_support(self):
    """
    The Regex Filter also supports precompiled patterns using the
    ``regex`` library.
    """
    # Roughly, "Hi there!" in Burmese.
    word = '\u101f\u102d\u102f\u1004\u103a\u1038'

    # Note that :py:func:`regex.compile` automatically adds the
    # ``UNICODE`` flag for you when the pattern is a unicode.
    pattern = regex.compile(r'\w+')

    self.assertFilterPasses(
        self._filter(word, pattern=pattern),
        [word],
    )
def test_pass_precompiled_regex(self):
    """
    You can alternatively provide a precompiled regex to the Filter
    instead of a string pattern.
    """
    # Compile our own pattern so that we can specify the
    # ``IGNORECASE`` flag.
    # Note that you are responsible for adding the ``UNICODE`` flag
    # to your compiled regex!
    # noinspection SpellCheckingInspection
    pattern = re.compile(r'\btest\b', re.IGNORECASE | re.UNICODE)

    self.assertFilterPasses(
        self._filter('test march of the TEST penguins', pattern=pattern),
        ['', ' march of the ', ' penguins'],
    )
def test_pass_regex_library_support(self):
    """
    The Regex Filter also supports precompiled patterns using the
    ``regex`` library.
    """
    # Roughly, "Hi there!" in Burmese.
    word = '\u101f\u102d\u102f\u1004\u103a\u1038!'

    # Note that :py:func:`regex.compile` automatically adds the
    # ``UNICODE`` flag for you when the pattern is a unicode.
    pattern = regex.compile(r'\w+')

    self.assertFilterPasses(
        self._filter(word, pattern=pattern),
        ['', '!'],
    )
def __init__(self, pattern, keys=None):
    # type: (Union[Text, regex._pattern_type, re._pattern_type], Optional[Sequence[Text]]) -> None
    """
    :param pattern:
        Regex used to split incoming string values.

        IMPORTANT: If you specify your own compiled regex, be sure
        to add the ``UNICODE`` flag for Unicode support!

    :param keys:
        If set, the resulting list will be converted into an
        OrderedDict, using the specified keys.

        IMPORTANT: If ``keys`` is set, the split value's length must
        be less than or equal to ``len(keys)``.
    """
    super(Split, self).__init__()

    self.regex = (
        pattern
        if isinstance(pattern, (regex._pattern_type, re._pattern_type))
        else regex.compile(pattern, regex.UNICODE)
    )

    self.keys = keys
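Note that regex._pattern_type and re._pattern_type, used in the isinstance check above, are private attributes that have been removed in newer releases of both libraries. A more portable check, sketched here as an assumption rather than as either library's documented API, is to compare against the type of a compiled pattern:

import re
import regex

# Portable "is this already compiled?" check that avoids the private
# _pattern_type attributes.
PATTERN_TYPES = (type(regex.compile('')), type(re.compile('')))

def ensure_compiled(pattern):
    """Compile `pattern` with the UNICODE flag unless it is already compiled."""
    if isinstance(pattern, PATTERN_TYPES):
        return pattern
    return regex.compile(pattern, regex.UNICODE)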
def __exit__(self, exc_type, exc_value, tb):
    if exc_type is None:
        try:
            exc_name = self.expected.__name__
        except AttributeError:
            exc_name = str(self.expected)
        raise self.failureException(
            "%s not raised" % exc_name)
    if not issubclass(exc_type, self.expected):
        # let unexpected exceptions pass through
        return False
    self.exception = exc_value  # store for later retrieval
    if self.expected_regexp is None:
        return True

    expected_regexp = self.expected_regexp
    if isinstance(expected_regexp, basestring):
        expected_regexp = re.compile(expected_regexp)
    if not expected_regexp.search(str(exc_value)):
        raise self.failureException('"%s" does not match "%s"' %
                                    (expected_regexp.pattern, str(exc_value)))
    return True
def __init__(self):
    # List of contractions adapted from Robert MacIntyre's tokenizer.
    # These were in turn collected from the TreebankWordTokenizer in NLTK.
    self.CONTRACTIONS = [
        regex.compile(r"([^' ])('[sS]|'[mM]|'[dD]|')\b"),
        regex.compile(r"([^' ])('ll|'LL|'re|'RE|'ve|'VE|n't|N'T)\b")]
    self.CONTRACTIONS2 = [
        regex.compile(r"(?i)\b(can)(not)\b"),
        regex.compile(r"(?i)\b(d)('ye)\b"),
        regex.compile(r"(?i)\b(gim)(me)\b"),
        regex.compile(r"(?i)\b(gon)(na)\b"),
        regex.compile(r"(?i)\b(got)(ta)\b"),
        regex.compile(r"(?i)\b(lem)(me)\b"),
        regex.compile(r"(?i)\b(mor)('n)\b"),
        regex.compile(r"(?i)\b(wan)(na) ")]
    self.CONTRACTIONS3 = [
        regex.compile(r"(?i) ('t)(is)\b"),
        regex.compile(r"(?i) ('t)(was)\b")]
    self.CONTRACTIONS4 = [
        regex.compile(r"(?i)\b(whad)(dd)(ya)\b"),
        regex.compile(r"(?i)\b(wha)(t)(cha)\b")]
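Contraction patterns like these are typically applied with a substitution that re-inserts a space between the captured halves, splitting the contraction into separate tokens. A minimal hedged sketch using one of the CONTRACTIONS2 patterns above:

import regex

cannot_re = regex.compile(r"(?i)\b(can)(not)\b")

# \1 and \2 refer to the two captured halves of the contraction.
print(cannot_re.sub(r'\1 \2', 'I cannot go'))  # -> 'I can not go'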
def __init__(self, **kwargs):
    """
    Args:
        annotators: None or empty set (only tokenizes).
        substitutions: if true, normalizes some token types (e.g. quotes).
    """
    self._regexp = regex.compile(
        '(?P<digit>%s)|(?P<title>%s)|(?P<abbr>%s)|(?P<neg>%s)|(?P<hyph>%s)|'
        '(?P<contr1>%s)|(?P<alphanum>%s)|(?P<contr2>%s)|(?P<sdquote>%s)|'
        '(?P<edquote>%s)|(?P<ssquote>%s)|(?P<esquote>%s)|(?P<dash>%s)|'
        '(?<ellipses>%s)|(?P<punct>%s)|(?P<nonws>%s)' %
        (self.DIGIT, self.TITLE, self.ABBRV, self.NEGATION, self.HYPHEN,
         self.CONTRACTION1, self.ALPHA_NUM, self.CONTRACTION2,
         self.START_DQUOTE, self.END_DQUOTE, self.START_SQUOTE,
         self.END_SQUOTE, self.DASH, self.ELLIPSES, self.PUNCT,
         self.NON_WS),
        flags=regex.IGNORECASE + regex.UNICODE + regex.MULTILINE
    )
    if len(kwargs.get('annotators', {})) > 0:
        logger.warning('%s only tokenizes! Skipping annotators: %s' %
                       (type(self).__name__, kwargs.get('annotators')))
    self.annotators = set()
    self.substitutions = kwargs.get('substitutions', True)
def test_format_replace_unicode_name(self):
    """Test replacing format Unicode names."""
    pattern = regex.compile(r"(some)(.*?)(pattern)(!)")
    expandf = bregex.compile_replace(
        pattern,
        r'{1} \N{Black club suit}\l\N{Greek Capital Letter omega} and '
        r'\LSPAN \N{Greek Capital Letter omega}\E and Escaped \\N{{Greek Capital Letter omega}}\E {3}',
        bregex.FORMAT
    )
    results = expandf(pattern.match('some test pattern!'))

    self.assertEqual(
        'some \u2663\u03c9 and span \u03c9 and Escaped \\N{Greek Capital Letter omega} pattern',
        results
    )
def test_expand_wrong_values(self):
    """Test `expand` with wrong values."""
    pattern = regex.compile('test')
    replace = bregex.compile_replace(pattern, 'whatever', bregex.FORMAT)
    m = pattern.match('test')

    with pytest.raises(ValueError) as excinfo:
        bregex.expand(m, replace)

    assert "Replace should not be compiled as a format replace!" in str(excinfo.value)

    with pytest.raises(TypeError) as excinfo:
        bregex.expand(m, 0)

    assert "Expected string, buffer, or compiled replace!" in str(excinfo.value)
def test_expandf_wrong_values(self):
    """Test `expandf` with wrong values."""
    pattern = regex.compile('test')
    replace = bregex.compile_replace(pattern, 'whatever')
    m = pattern.match('test')

    with pytest.raises(ValueError) as excinfo:
        bregex.expandf(m, replace)

    assert "Replace not compiled as a format replace" in str(excinfo.value)

    with pytest.raises(TypeError) as excinfo:
        bregex.expandf(m, 0)

    assert "Expected string, buffer, or compiled replace!" in str(excinfo.value)
def test_octal_fail(self):
    """Test that octal fails properly."""
    pattern = regex.compile(b'Test')

    with pytest.raises(ValueError) as excinfo:
        bregex.compile_replace(pattern, br'\666')

    assert "octal escape value outside of range 0-0o377!" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        bregex.compile_replace(pattern, br'\C\666\E')

    assert "octal escape value outside of range 0-0o377!" in str(excinfo.value)

    with pytest.raises(ValueError) as excinfo:
        bregex.compile_replace(pattern, br'\c\666')

    assert "octal escape value outside of range 0-0o377!" in str(excinfo.value)
def __init__(self, manager, file_changes, prefix_sensitive=True):
    self.manager = manager
    self._matchers_prefix_sensitive = False
    self._transforms_prefix_sensitive = prefix_sensitive
    for (path_re, action) in file_changes:
        if not path_re.startswith('.*'):
            self._matchers_prefix_sensitive = True
    self._transforms = [(regex.compile(path_re + '$'), action)
                        for (path_re, action) in file_changes]
    self._stat_tree_cache_hits = 0
    self._stat_wrote_trees = 0
    self._stat_got_trees = 0
    self._stat_transforms = 0
def __init__(self, only_backfill=ONLY_BACKFILL, dont_backfill=DONT_BACKFILL,
             read_period=READ_PERIOD, clear_checkpoint=CLEAR_CHECKPOINT,
             read_pause=READ_PAUSE, temp_dir=TMP_DIR,
             start_of_record_re=None, filter_re=None, windows=None):
    self.config = collections.namedtuple('Args', self.ARGS)
    self.config.dont_backfill = dont_backfill
    self.config.only_backfill = only_backfill
    self.config.clear_checkpoint = clear_checkpoint
    self.config.read_period = read_period
    self.config.read_pause = read_pause
    self.config.temp_dir = temp_dir if temp_dir else tempfile.mkdtemp()
    self.config.windows = windows if windows is not None else self.is_windows()
    self.config.filter_re = regex.compile(filter_re) if filter_re else None
    if start_of_record_re:
        self.config.start_of_record_re = regex.compile(start_of_record_re)
        self.read_record = self.read_record_with_regex
    self.state = self.STARTING
    self.stats = collections.Counter()
def test_nginx_log(self):
    regexp = '<%{INT}>%{SYSLOGTIMESTAMP:syslog_timestamp} %{SYSLOGHOST:host} %{IPORHOST:remote_addr} - %{USERNAME:remote_user}?- \[%{HTTPDATE:time_local}\] \"(?P<method>(GET|PUT|PATCH|POST|DELETE|HEAD|OPTIONS)) %{URIPATH:path}%{URIPARAM:params} HTTP/%{NUMBER:httpversion}\" %{INT:status} %{INT:body_bytes_sent}\"-\" %{QS:http_user_agent}'
    other_log = '<13>Mar 25 12:26:57 myserver.io 62.73.84.230 - - [25/Mar/2016:12:26:57 +0000] "GET /orders?order_identifier=AB075081&consumer_name=&consumer_first_name= HTTP/1.1" 200 1499"-" "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"'
    grokked_re, types = pattern_matching.grok_re_preprocess(regexp)
    compiled = regex.compile(grokked_re)
    self.assertDictEqual(
        compiled.match(other_log).groupdict(),
        {"syslog_timestamp": "Mar 25 12:26:57",
         "host": 'myserver.io',
         "remote_addr": '62.73.84.230',
         "remote_user": None,
         "time_local": "25/Mar/2016:12:26:57 +0000",
         "method": "GET",
         "path": "/orders",
         "params": "?order_identifier=AB075081&consumer_name=&consumer_first_name=",
         "httpversion": "1.1",
         "status": "200",
         "body_bytes_sent": "1499",
         "http_user_agent": '"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.87 Safari/537.36"'})
def barcode_to_regex(barcode: str, error_rate: Optional[int]=None):
    """Convert a barcode string to a regex pattern
    barcode [str]           The barcode string to turn into a regex
    error_rate [int]=None   The error rate"""
    pattern = ''  # type: str
    umi = regex.findall(r'(N+)', barcode, regex.IGNORECASE)  # type: List[str]
    umi_lengths = tuple(map(len, umi))  # type: Tuple[int]
    filtered_barcode = filter(None, barcode.upper().split('N'))  # type: filter
    for index, subpattern in enumerate(filtered_barcode):  # type: int, str
        barcode_pattern = '(' + subpattern + ')'  # type: str
        if error_rate:
            barcode_pattern += '{e<=' + str(error_rate) + '}'
        pattern += barcode_pattern
        try:
            umi_pattern = '(' + ''.join(itertools.repeat('[ACGT]', umi_lengths[index])) + ')'  # type: str
        except IndexError:
            break
        else:
            if error_rate:
                umi_pattern += '{e<=' + str(error_rate) + '}'
            pattern += umi_pattern
    find_barcode = regex.compile(r'%s' % pattern, regex.ENHANCEMATCH)
    return find_barcode
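The {e<=n} quantifiers built up above are the regex module's fuzzy-matching syntax (not supported by the stdlib re): they allow up to n errors when matching the preceding group, and regex.ENHANCEMATCH asks the engine to improve the fit of a fuzzy match. A minimal sketch of the feature in isolation, with a made-up sequence:

import regex

# Allow at most one error (insertion, deletion, or substitution).
fuzzy = regex.compile(r'(ACGT){e<=1}', regex.ENHANCEMATCH)

print(bool(fuzzy.match('ACGT')))  # exact match      -> True
print(bool(fuzzy.match('ACCT')))  # one substitution -> True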
def __init__(self, lang=None, XSAMPA=False):
    self.lang = lang
    self.XSAMPA = XSAMPA
    self.api = "https://en.wiktionary.org/w/api.php"
    self.param = {
        "action": "expandtemplates",
        "text": None,
        "prop": "wikitext",
        "format": "json"
    }
    self.regex = {
        "lang": re.compile("\|lang=([^\|]+)"),
        "node": re.compile("(?<brackets>{{(?:[^{}]+|(?&brackets))*}})"),
        "IPA-node": re.compile("^(([\w]+\-)?(IPA|pron))(?=\||\n|\Z)"),
        "h2": re.compile("(?:\A|\n)={2}([\p{L}0-9 -]+)={2}\n"),
        "h3": re.compile("\n={3}([\p{L}0-9 -]+)={3}\n"),
        "h4": re.compile("\n={4}([\p{L}0-9 -]+)={4}\n"),
        "IPA": re.compile("<span[^>]*>([^<]+)<\/span>")
    }
def __init__(self, arpabet='arpabet', ligatures=False, cedict_file=None):
    """Construct a Flite "wrapper"

    Args:
        arpabet (str): file containing ARPAbet to IPA mapping
        ligatures (bool): if True, use non-standard ligatures instead of
                          standard IPA
        cedict_file (str): path to CC-CEDict dictionary (included for
                           compatibility)
    """
    arpabet = pkg_resources.resource_filename(__name__, os.path.join('data', arpabet + '.csv'))
    self.arpa_map = self._read_arpabet(arpabet)
    self.chunk_re = re.compile(r'(\p{L}+|[^\p{L}]+)', re.U)
    self.puncnorm = PuncNorm()
    self.ligatures = ligatures
    self.ft = panphon.FeatureTable()
def _read_cedict(self, cedict_file, traditional=False):
    comment_re = re.compile('\s*#')
    lemma_re = re.compile('(?P<hanzi>[^]]+) \[(?P<pinyin>[^]]+)\] /(?P<english>.+)/')
    cedict = {}
    with codecs.open(cedict_file, 'r', 'utf-8') as f:
        for line in f:
            if comment_re.match(line):
                pass
            elif lemma_re.match(line):
                match = lemma_re.match(line)
                hanzi = match.group('hanzi').split(' ')
                pinyin = match.group('pinyin').split(' ')
                english = match.group('english').split('/')
                if traditional:
                    cedict[hanzi[0]] = (pinyin, english)  # traditional characters only
                else:
                    cedict[hanzi[1]] = (pinyin, english)  # simplified characters only
    return cedict
def compile_regex_from_str(self, pat):
    """Given a string describing features masks for a sequence of segments,
    return a compiled regex matching the corresponding strings.

    Args:
        pat (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `pat`
    """
    s2n = {'-': -1, '0': 0, '+': 1}
    seg_res = []
    for mat in re.findall(r'\[[^]]+\]+', pat):
        ft_mask = {k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)}
        segs = self.all_segs_matching_fts(ft_mask)
        seg_res.append('({})'.format('|'.join(segs)))
    regexp = ''.join(seg_res)
    return re.compile(regexp)
def compile_regex_from_str(self, ft_str):
    """Given a string describing features masks for a sequence of segments,
    return a regex matching the corresponding strings.

    Args:
        ft_str (str): feature masks, each enclosed in square brackets, in
            which the features are delimited by any standard delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `ft_str`
    """
    sequence = []
    for m in re.finditer(r'\[([^]]+)\]', ft_str):
        ft_mask = fts(m.group(1))
        segs = self.all_segs_matching_fts(ft_mask)
        sub_pat = '({})'.format('|'.join(segs))
        sequence.append(sub_pat)
    pattern = ''.join(sequence)
    regex = re.compile(pattern)
    return regex
def match(regexp):
    return regex.compile(regexp)
def strip_html_tags(string, verbose=False):
    p = regex.compile(r'<.*?>')
    return p.sub(' ', string)
def __init__(self, candidate_regex, max_context, *args, **kwargs):
    self._candidate_regex = regex.compile(candidate_regex)
    self._max_context = max_context
    self._classifier = perceptronix.SparseBinomialClassifier(*args, **kwargs)
def read(cls, filename, candidate_regex, max_context):
    """Reads sentence tokenizer model from serialized model file."""
    result = cls.__new__(cls)
    result._candidate_regex = regex.compile(candidate_regex)
    result._max_context = max_context
    result._classifier = perceptronix.SparseBinomialClassifier.read(filename)
    return result
def __init__(self):
    super(Base64Decode, self).__init__()

    self.whitespace_re = regex.compile(b'[ \t\r\n]+', regex.ASCII)
    self.base64_re = regex.compile(b'^[-+_/A-Za-z0-9=]+$', regex.ASCII)
def __init__(self, leading=r'[\p{C}\s]+', trailing=r'[\p{C}\s]+'):
    # type: (Text, Text) -> None
    """
    :param leading: Regex to match at the start of the string.
    :param trailing: Regex to match at the end of the string.
    """
    super(Strip, self).__init__()

    if leading:
        self.leading = regex.compile(
            r'^{pattern}'.format(pattern=leading),
            regex.UNICODE,
        )
    else:
        self.leading = None

    if trailing:
        self.trailing = regex.compile(
            r'{pattern}$'.format(pattern=trailing),
            regex.UNICODE,
        )
    else:
        self.trailing = None
def __init__(self, encoding='utf-8', normalize=True):
    # type: (Text, bool) -> None
    """
    :param encoding: Used to decode non-unicode values.
    :param normalize: Whether to normalize the resulting value:
        - Convert to NFC form.
        - Remove non-printable characters.
        - Convert all line endings to unix-style ('\n').
    """
    super(Unicode, self).__init__()

    self.encoding = encoding
    self.normalize = normalize

    if self.normalize:
        #
        # Compile the regex that we will use to remove non-
        # printables from the resulting unicode.
        # http://www.regular-expressions.info/unicode.html#category
        #
        # Note: using a double negative so that we can exclude
        # newlines, which are technically considered control chars.
        # http://stackoverflow.com/a/3469155
        #
        self.npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)
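The double-negated class [^\P{C}\s]+ reads as "runs of characters that are neither non-control nor whitespace", i.e. control characters other than whitespace such as \n. A minimal sketch of the pattern on its own:

import regex

npr = regex.compile(r'[^\P{C}\s]+', regex.UNICODE)

# The NUL control character is stripped; the newline survives.
print(npr.sub('', 'ab\x00c\nd'))  # -> 'abc\nd'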
def test_weakref(self):
    s = 'QabbbcR'
    x = regex.compile('ab+c')
    y = proxy(x)
    if x.findall('QabbbcR') != y.findall('QabbbcR'):
        self.fail()
def test_bug_1661(self):
    # Verify that flags do not get silently ignored with compiled patterns
    pattern = regex.compile('.')
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.match(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.search(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.findall(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.compile(pattern, regex.I))
def test_bug_3629(self):
    # A regex that triggered a bug in the sre-code validator
    self.assertEqual(repr(type(regex.compile("(?P<quote>)(?(quote))"))),
        self.PATTERN_CLASS)
def test_re_match(self):
    self.assertEqual(regex.match('a', 'a')[:], ('a',))
    self.assertEqual(regex.match('(a)', 'a')[:], ('a', 'a'))
    self.assertEqual(regex.match(r'(a)', 'a')[0], 'a')
    self.assertEqual(regex.match(r'(a)', 'a')[1], 'a')
    self.assertEqual(regex.match(r'(a)', 'a').group(1, 1), ('a', 'a'))

    pat = regex.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a')[:], ('a', 'a', 'a', None, None))
    self.assertEqual(pat.match('b')[:], ('b', 'b', None, 'b', None))
    self.assertEqual(pat.match('ac')[:], ('ac', 'a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))

    # A single group.
    m = regex.match('(a)', 'a')
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))

    pat = regex.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'),
        (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
def test_getattr(self):
    self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)')
    self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.A | regex.I |
        regex.DEFAULT_VERSION)
    self.assertEqual(regex.compile(u"(?i)(a)(b)").flags, regex.I | regex.U |
        regex.DEFAULT_VERSION)
    self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2)
    self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {})

    self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
        {'first': 1, 'other': 2})

    self.assertEqual(regex.match("(a)", "a").pos, 0)
    self.assertEqual(regex.match("(a)", "a").endpos, 1)

    self.assertEqual(regex.search("b(c)", "abcdef").pos, 0)
    self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6)
    self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3))
    self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3))

    self.assertEqual(regex.match("(a)", "a").string, 'a')
    self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1)))
    self.assertEqual(repr(type(regex.match("(a)", "a").re)),
        self.PATTERN_CLASS)

    # Issue 14260.
    p = regex.compile(r'abc(?P<n>def)')
    p.groupindex["n"] = 0
    self.assertEqual(p.groupindex["n"], 1)
def test_re_escape(self):
    p = ""
    self.assertEqual(regex.escape(p), p)
    for i in range(0, 256):
        p += chr(i)
        self.assertEqual(bool(regex.match(regex.escape(chr(i)), chr(i))),
            True)
        self.assertEqual(regex.match(regex.escape(chr(i)), chr(i)).span(),
            (0, 1))

    pat = regex.compile(regex.escape(p))
    self.assertEqual(pat.match(p).span(), (0, 256))
def test_flags(self):
    for flag in [regex.I, regex.M, regex.X, regex.S, regex.L]:
        self.assertEqual(repr(type(regex.compile('^pattern$', flag))),
            self.PATTERN_CLASS)
def test_bug_545855(self):
    # Bug 545855 -- This pattern failed to cause a compile error as it
    # should, instead provoking a TypeError.
    self.assertRaisesRegex(regex.error, self.BAD_SET,
        lambda: regex.compile('foo[a-'))
def test_bug_612074(self):
    pat = u"[" + regex.escape(u"\u2039") + u"]"
    self.assertEqual(regex.compile(pat) and 1, 1)
def test_bug_926075(self):
    if regex.compile('bug_926075') is regex.compile(u'bug_926075'):
        self.fail()
def test_bug_931848(self):
    pattern = u"[\u002E\u3002\uFF0E\uFF61]"
    self.assertEqual(regex.compile(pattern).split("a.b.c"),
        ['a', 'b', 'c'])
def test_bug_581080(self):
    it = regex.finditer(r"\s", "a b")
    self.assertEqual(it.next().span(), (1, 2))
    self.assertRaises(StopIteration, lambda: it.next())

    scanner = regex.compile(r"\s").scanner("a b")
    self.assertEqual(scanner.search().span(), (1, 2))
    self.assertEqual(scanner.search(), None)
def test_empty_array(self):
    # SF bug 1647541.
    import array
    for typecode in 'cbBuhHiIlLfd':
        a = array.array(typecode)
        self.assertEqual(regex.compile("bla").match(a), None)
        self.assertEqual(regex.compile("").match(a)[1:], ())
def test_inline_flags(self):
    # Bug #1700.
    upper_char = unichr(0x1ea0)  # Latin Capital Letter A with Dot Below
    lower_char = unichr(0x1ea1)  # Latin Small Letter A with Dot Below

    p = regex.compile(upper_char, regex.I | regex.U)
    self.assertEqual(bool(p.match(lower_char)), True)

    p = regex.compile(lower_char, regex.I | regex.U)
    self.assertEqual(bool(p.match(upper_char)), True)

    p = regex.compile('(?i)' + upper_char, regex.U)
    self.assertEqual(bool(p.match(lower_char)), True)

    p = regex.compile('(?i)' + lower_char, regex.U)
    self.assertEqual(bool(p.match(upper_char)), True)

    p = regex.compile('(?iu)' + upper_char)
    self.assertEqual(bool(p.match(lower_char)), True)

    p = regex.compile('(?iu)' + lower_char)
    self.assertEqual(bool(p.match(upper_char)), True)

    self.assertEqual(bool(regex.match(r"(?i)a", "A")), True)
    self.assertEqual(bool(regex.match(r"a(?i)", "A")), True)
    self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True)
    self.assertEqual(regex.match(r"a(?iV1)", "A"), None)
def test_ascii_and_unicode_flag(self):
    # Unicode patterns.
    for flags in (0, regex.UNICODE):
        pat = regex.compile(u'\xc0', flags | regex.IGNORECASE)
        self.assertEqual(bool(pat.match(u'\xe0')), True)
        pat = regex.compile(u'\w', flags)
        self.assertEqual(bool(pat.match(u'\xe0')), True)

    pat = regex.compile(u'\xc0', regex.ASCII | regex.IGNORECASE)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'(?a)\xc0', regex.IGNORECASE)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'\w', regex.ASCII)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'(?a)\w')
    self.assertEqual(pat.match(u'\xe0'), None)

    # String patterns.
    for flags in (0, regex.ASCII):
        pat = regex.compile('\xc0', flags | regex.IGNORECASE)
        self.assertEqual(pat.match('\xe0'), None)
        pat = regex.compile('\w')
        self.assertEqual(pat.match('\xe0'), None)

    self.assertRaisesRegex(ValueError, self.MIXED_FLAGS,
        lambda: regex.compile('(?au)\w'))
def test_bug_10328(self):
    # Issue 10328.
    pat = regex.compile(r'(?mV0)(?P<trailing_ws>[ \t]+\r*$)|(?P<no_final_newline>(?<=[^\n])\Z)')
    self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>',
        'foobar '), ('foobar<trailing_ws>', 1))
    self.assertEqual([m.group() for m in pat.finditer('foobar ')],
        [' ', ''])

    pat = regex.compile(r'(?mV1)(?P<trailing_ws>[ \t]+\r*$)|(?P<no_final_newline>(?<=[^\n])\Z)')
    self.assertEqual(pat.subn(lambda m: '<' + m.lastgroup + '>',
        'foobar '), ('foobar<trailing_ws><no_final_newline>', 2))
    self.assertEqual([m.group() for m in pat.finditer('foobar ')],
        [' ', ''])
def test_common_prefix(self):
    # Very long common prefix
    all = string.ascii_lowercase + string.digits + string.ascii_uppercase
    side = all * 4
    regexp = '(' + side + '|' + side + ')'
    self.assertEqual(repr(type(regex.compile(regexp))),
        self.PATTERN_CLASS)
def pretranslate_dict_to_function(self, convert_dict):
    # add uppercase letters
    for letter, translation in list(convert_dict.items()):
        letter_upper = letter.upper()
        if letter_upper != letter and letter_upper not in convert_dict:
            convert_dict[letter_upper] = translation.capitalize()

    self.convert_dict = convert_dict

    PRETRANSLATE = re.compile(u'(\L<options>)', options=convert_dict)

    # translate some letters before translating
    return lambda text: PRETRANSLATE.sub(lambda m: convert_dict[m.group(1)], text)
def calc_unwanted_chars_re(self):
    unwanted_chars_re = u'[^\p{{AlNum}}{safe_chars}]+'.format(
        safe_chars=re.escape(self._safe_chars or ''))
    self.unwanted_chars_re = re.compile(unwanted_chars_re, re.IGNORECASE)

    if self._stop_words:
        unwanted_chars_and_words_re = (unwanted_chars_re +
            u'|(?<!\p{AlNum})(?:\L<stop_words>)(?!\p{AlNum})')
        self.unwanted_chars_and_words_re = re.compile(
            unwanted_chars_and_words_re, re.IGNORECASE,
            stop_words=self._stop_words)
    else:
        self.unwanted_chars_and_words_re = None
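The \L<name> syntax in the last two examples is the regex module's named-list feature; in both snippets, `re` is evidently the regex module imported under that name, since the stdlib re.compile() accepts no options= or stop_words= keyword. A minimal sketch with a hypothetical word list:

import regex

# \L<words> matches any literal string from the list passed as a
# keyword argument to regex.compile().
pattern = regex.compile(r'\b\L<words>\b', words=['cat', 'dog'])

print(pattern.findall('a cat, a dog, and a bird'))  # -> ['cat', 'dog']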