我们从 Python 开源项目中,提取了以下 50 个代码示例,用于说明如何使用 regex.match()。
def __exit__(self, exc_type, exc_value, tb):
    """Check that the expected exception (and optional message regexp) was raised.

    Returns False for unexpected exception types so they propagate; stores
    the caught exception on ``self.exception`` for later inspection.
    """
    if exc_type is None:
        # Nothing was raised at all: report which exception we expected.
        try:
            exc_name = self.expected.__name__
        except AttributeError:
            exc_name = str(self.expected)
        raise self.failureException(
            "%s not raised" % exc_name)
    if not issubclass(exc_type, self.expected):
        # let unexpected exceptions pass through
        return False
    self.exception = exc_value  # store for later retrieval
    if self.expected_regexp is None:
        return True
    expected_regexp = self.expected_regexp
    if isinstance(expected_regexp, basestring):
        # a plain string is compiled on the fly (Python 2 code: `basestring`)
        expected_regexp = re.compile(expected_regexp)
    if not expected_regexp.search(str(exc_value)):
        raise self.failureException('"%s" does not match "%s"' %
                                    (expected_regexp.pattern, str(exc_value)))
    return True
def test_re_groupref_exists(self):
    """Conditional references to groups: (?(n)yes|no)."""
    paren_pat = r'^(\()?([^()]+)(?(1)\))$'
    self.assertEqual(regex.match(paren_pat, '(a)')[:], ('(a)', '(', 'a'))
    self.assertEqual(regex.match(paren_pat, 'a')[:], ('a', None, 'a'))
    self.assertEqual(regex.match(paren_pat, 'a)'), None)
    self.assertEqual(regex.match(paren_pat, '(a'), None)
    self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'ab')[:], ('ab', 'a', 'b'))
    self.assertEqual(regex.match('^(?:(a)|c)((?(1)b|d))$', 'cd')[:], ('cd', None, 'd'))
    self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'cd')[:], ('cd', None, 'd'))
    self.assertEqual(regex.match('^(?:(a)|c)((?(1)|d))$', 'a')[:], ('a', 'a', ''))
    # Tests for bug #1177831: exercise groups other than the first group.
    p = regex.compile('(?P<g1>a)(?P<g2>b)?((?(g2)c|d))')
    self.assertEqual(p.match('abc')[:], ('abc', 'a', 'b', 'c'))
    self.assertEqual(p.match('ad')[:], ('ad', 'a', None, 'd'))
    self.assertEqual(p.match('abd'), None)
    self.assertEqual(p.match('ac'), None)
def test_ignore_case(self): self.assertEqual(regex.match("abc", "ABC", regex.I)[0], 'ABC') self.assertEqual(regex.match(u"abc", u"ABC", regex.I)[0], u'ABC') self.assertEqual(regex.match(r"(a\s[^a]*)", "a bb", regex.I)[1], 'a bb') self.assertEqual(regex.match(r"(a\s[abc])", "a b", regex.I)[1], 'a b') self.assertEqual(regex.match(r"(a\s[abc]*)", "a bb", regex.I)[1], 'a bb') self.assertEqual(regex.match(r"((a)\s\2)", "a a", regex.I)[1], 'a a') self.assertEqual(regex.match(r"((a)\s\2*)", "a aa", regex.I)[1], 'a aa') self.assertEqual(regex.match(r"((a)\s(abc|a))", "a a", regex.I)[1], 'a a') self.assertEqual(regex.match(r"((a)\s(abc|a)*)", "a aa", regex.I)[1], 'a aa') # Issue 3511. self.assertEqual(regex.match(r"[Z-a]", "_").span(), (0, 1)) self.assertEqual(regex.match(r"(?i)[Z-a]", "_").span(), (0, 1)) self.assertEqual(bool(regex.match(ur"(?iu)nao", u"nAo")), True) self.assertEqual(bool(regex.match(ur"(?iu)n\xE3o", u"n\xC3o")), True) self.assertEqual(bool(regex.match(ur"(?iu)n\xE3o", u"N\xC3O")), True) self.assertEqual(bool(regex.match(ur"(?iu)s", u"\u017F")), True)
def test_captures(self):
    """Match.captures(): every intermediate capture of a repeated group."""
    self.assertEqual(regex.search(r"(\w)+", "abc").captures(1),
                     ['a', 'b', 'c'])
    self.assertEqual(regex.search(r"(\w{3})+", "abcdef").captures(0, 1),
                     (['abcdef'], ['abc', 'def']))
    self.assertEqual(regex.search(r"^(\d{1,3})(?:\.(\d{1,3})){3}$",
                     "192.168.0.1").captures(1, 2),
                     (['192', ], ['168', '0', '1']))
    self.assertEqual(regex.match(r"^([0-9A-F]{2}){4} ([a-z]\d){5}$",
                     "3FB52A0C a2c4g3k9d3").captures(1, 2),
                     (['3F', 'B5', '2A', '0C'], ['a2', 'c4', 'g3', 'k9', 'd3']))
    self.assertEqual(regex.match("([a-z]W)([a-z]X)+([a-z]Y)",
                     "aWbXcXdXeXfY").captures(1, 2, 3),
                     (['aW'], ['bX', 'cX', 'dX', 'eX'], ['fY']))
    self.assertEqual(regex.search(r".*?(?=(.)+)b", "ab").captures(1), ['b'])
    self.assertEqual(regex.search(r".*?(?>(.){0,2})d", "abcd").captures(1),
                     ['b', 'c'])
    self.assertEqual(regex.search(r"(.)+", "a").captures(1), ['a'])
def test_subscripted_captures(self):
    """Subscripting repeated-group captures inside format templates."""
    m = regex.match(r'(?P<x>.)+', 'abc')
    self.assertEqual(m.expandf('{0} {0[0]} {0[-1]}'), 'abc abc abc')
    self.assertEqual(m.expandf('{1} {1[0]} {1[1]} {1[2]} {1[-1]} {1[-2]} {1[-3]}'),
                     'c a b c c b a')
    self.assertEqual(m.expandf('{x} {x[0]} {x[1]} {x[2]} {x[-1]} {x[-2]} {x[-3]}'),
                     'c a b c c b a')
    self.assertEqual(regex.subf(r'(?P<x>.)+', r'{0} {0[0]} {0[-1]}', 'abc'),
                     'abc abc abc')
    self.assertEqual(regex.subf(r'(?P<x>.)+',
                     '{1} {1[0]} {1[1]} {1[2]} {1[-1]} {1[-2]} {1[-3]}', 'abc'),
                     'c a b c c b a')
    self.assertEqual(regex.subf(r'(?P<x>.)+',
                     '{x} {x[0]} {x[1]} {x[2]} {x[-1]} {x[-2]} {x[-3]}', 'abc'),
                     'c a b c c b a')
def preprocess(article):
    """Clean one extracted Wikipedia article; return None if it should be skipped.

    Skips blacklisted ids, disambiguation pages, and List/Index/Outline pages.
    Returns a dict whose `id` is the article title.
    """
    # Take out HTML escaping WikiExtractor didn't clean
    for k, v in article.items():
        article[k] = PARSER.unescape(v)
    # Filter some disambiguation pages not caught by the WikiExtractor
    if article['id'] in BLACKLIST:
        return None
    title = article['title'].lower()
    if '(disambiguation)' in title:
        return None
    if '(disambiguation page)' in title:
        return None
    # Take out List/Index/Outline pages (mostly links)
    if re.match(r'(List of .+)|(Index of .+)|(Outline of .+)',
                article['title']):
        return None
    # Return doc with `id` set to `title`
    return {'id': article['title'], 'text': article['text']}
def iternext(self):
    """Return the next character of the string.

    An escape sequence (backslash followed by whatever
    ``self._regex_search_ref`` recognizes, e.g. Q/E or another backslash)
    is returned as a single token.
    """
    if self.index > self.max_index:
        raise StopIteration

    token = self.string[self.index:self.index + 1]
    if token == self._b_slash:
        m = self._regex_search_ref.match(self.string[self.index + 1:])
        if m:
            # group(1) and group(2) are mutually exclusive alternatives
            token += m.group(1) or m.group(2)

    self.index += len(token)
    self.current = token
    return self.current


# Break apart template patterns into char tokens
def __init__(self, match, template):
    """Initialize the expander with a match object and a parsed template.

    Picks byte or unicode casing tokens depending on the template's mode.
    """
    ctokens = ctok.btokens if template.binary else ctok.utokens

    self.template = template
    # Casing-control tokens pulled from the shared token table.
    self._esc_end = ctokens["esc_end"]
    self._end = ctokens["end"]
    self._lc = ctokens["lc"]
    self._lc_span = ctokens["lc_span"]
    self._uc = ctokens["uc"]
    self._uc_span = ctokens["uc_span"]
    self.index = -1
    self.end_found = False
    self.parent_span = []
    self.match = match
def expand(self):
    """Using the template, expand the string.

    Walks the template's literal slots; a ``None`` literal marks a group
    reference, which is resolved from the match's captures and has optional
    span/single-character case transforms applied.

    Raises:
        IndexError: if the requested capture index is out of range.
    """
    # Empty string/bytes of the right type, so join() matches the input type.
    sep = self.match.string[:0]
    text = []
    # enumerate() instead of range(len(...)) — same order, clearer intent.
    for index, l in enumerate(self.template.literals):
        if l is None:
            g_index = self.template.get_group_index(index)
            span_case, single_case, capture = self.template.get_group_attributes(index)
            try:
                l = self.match.captures(g_index)[capture]
            except IndexError:
                raise IndexError("'%d' is out of range!" % capture)
            if span_case is not None:
                # e.g. 'lower'/'upper' applied to the whole capture
                l = getattr(l, span_case)()
            if single_case is not None:
                # case transform applied to the first character only
                l = getattr(l[0:1], single_case)() + l[1:]
        text.append(l)
    return sep.join(text)
def decode_event_data(topic, data):
    """Decode ABI-encoded event *data* using the type list derived from *topic*."""
    if isinstance(data, str):
        data = data_decoder(data)
    name, types = _process_topic(topic)
    decoded = decode_abi(types, data)

    arguments = []
    for typ, val in zip(types, decoded):
        m = TYPES_RE.match(typ)
        if m is None:
            # unparseable type string: skip this argument
            continue
        atyp, arr = m.groups()
        if not arr:  # no array suffix (None or '')
            arguments.append(_convert_type(atyp, val))
        else:
            # '[2][3]' -> ['2', '3']
            arguments.append(_convert_array(atyp, arr[1:-1].split(']['), val))
    return arguments
def _read_rule(self, i, line):
    """Parse one rule line (1-based line number ``i + 1`` used in errors).

    A ``::sym:: = value`` line defines a symbol; otherwise the line is a
    ``a -> b / X _ Y`` rewrite rule compiled into a function.

    Raises:
        DatafileError: if the line cannot be parsed or compiled.
    """
    line = line.strip()
    if line:
        # Normalize to NFC after a round-trip through NFD.
        line = unicodedata.normalize('NFC', unicodedata.normalize('NFD', line))
        s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
        if s:
            self.symbols[s.group('symbol')] = s.group('value')
        else:
            line = self._sub_symbols(line)
            r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
            try:
                a, b, X, Y = r.groups()
            except AttributeError:
                raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
            # '#' marks a word boundary: anchor at start (X) / end (Y).
            X, Y = X.replace('#', '^'), Y.replace('#', '$')
            # '0' denotes the empty string in the datafile notation.
            a, b = a.replace('0', ''), b.replace('0', '')
            try:
                if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                    return self._fields_to_function_metathesis(a, X, Y)
                else:
                    return self._fields_to_function(a, b, X, Y)
            except Exception as e:
                raise DatafileError('Line {}: "{}" cannot be compiled as regex: ?{}'.format(i + 1, line, e))
def is_empty(line):
    # 'r' raw string so that doctest works with these special characters.
    r"""Detects blank lines.

    In the pdfbox conversion of the main table, blank lines seem to separate
    kanji entries.  But we can already detect the start of each kanji entry
    by the presence of the kanji itself, so we just skip blank lines.

    >>> is_empty('')
    True
    >>> is_empty("\n")
    True
    >>> is_empty(" \t \n")
    True
    >>> is_empty("\u3000") # IDEOGRAPHIC SPACE
    True
    >>> is_empty("\u3000?\t\n")
    False
    """
    # str.strip() removes all Unicode whitespace (including U+3000), so a
    # blank line strips to ''.  This replaces the old
    # `re.match('^$', line) != None`, which was both an unnecessary regex
    # and a `!= None` anti-idiom.
    return line.strip() == ''
def is_page_index(line):
    r"""Detects the page indices from the Joyo document.

    They usually look like this:

    >>> is_page_index('03?_???????_??NN.indd 107 2010/11/12 13:10:23')
    True

    Pdfbox also generated a single page number (?) like this:

    >>> is_page_index('163')
    True

    Content lines won't match:

    >>> is_page_index('\t \t \t ????\t ???\t')
    False
    """
    # We just test whether it starts with an ASCII digit.  Returning the
    # boolean expression directly replaces the old `return(True)` /
    # `return(False)` branches.
    return re.match(r'[0-9]', line.strip()) is not None
def __init__(self, kanji, reading, variation_of=None, kind=None):
    """Initialize one reading entry for *kanji*.

    A leading IDEOGRAPHIC SPACE (U+3000) in *reading* flags an uncommon
    reading.  When *kind* is not given, it is inferred from the script:
    katakana readings are On-readings, everything else Kun-readings.
    """
    self.kanji = kanji
    if reading[0] == "\u3000":
        # strip the uncommon-reading marker
        self.reading = reading[1:]
        self.uncommon = True
    else:
        self.reading = reading
        self.uncommon = False
    self.examples = list()
    if kind:
        self.kind = kind
    # NOTE(review): \p{Katakana} requires `re` to be the third-party
    # `regex` module (stdlib `re` has no \p) — confirm the file's import.
    elif re.match("\p{Katakana}", self.reading):
        self.kind = 'On'
    else:
        self.kind = 'Kun'
    self.variation_of = variation_of
    self.notes = list()
    self.alternate_orthographies = list()
def test_alternate_orthographies(self):
    """Every alternate orthography names exactly one kanji that links back."""
    for k in joyodb.loaded_data.kanjis:
        for r in k.readings:
            for a in r.alternate_orthographies:
                # one Han char followed by optional hiragana okurigana
                looks_like_alternate = re.match("^(\p{Han})\p{Hiragana}*$", a)
                assert(looks_like_alternate)
                alt_kanji_ch = looks_like_alternate[1]
                assert(alt_kanji_ch)
                alt_kanji_list = [obj for obj in joyodb.loaded_data.kanjis
                                  if obj.kanji == alt_kanji_ch]
                assert(len(alt_kanji_list) == 1)
                alt_kanji = alt_kanji_list[0]
                # the referenced kanji must reciprocate the link
                found = False
                for their_readings in alt_kanji.readings:
                    for their_alternates in their_readings.alternate_orthographies:
                        if k.kanji in their_alternates:
                            found = True
                assert(found)
def validate_username(username):
    """Return a match object (truthy) if *username* is valid, else None.

    Valid: starts with a letter, then 2-59 letters/digits/underscores
    (3-60 characters total).
    """
    pattern = '^[a-zA-Z][a-zA-Z0-9_]{2,59}$'
    return regex.match(pattern, username)
async def get(self, username):
    """Look up a user by ethereum address or username and write it as JSON.

    NOTE: declared ``async`` — the body awaits; the extraction that
    collapsed this file onto one line dropped the keyword.

    Raises:
        JSONHTTPError: 400 for an invalid username, 404 if no user matches.
    """
    sql = ("SELECT users.*, array_agg(app_categories.category_id) AS category_ids, "
           "array_agg(categories.tag) AS category_tags, "
           "array_agg(category_names.name) AS category_names "
           "FROM users LEFT JOIN app_categories "
           "ON users.toshi_id = app_categories.toshi_id "
           "LEFT JOIN category_names ON app_categories.category_id = category_names.category_id "
           "AND category_names.language = $1 "
           "LEFT JOIN categories ON app_categories.category_id = categories.category_id "
           "WHERE ")
    args = ['en']
    # check if ethereum address is given
    if regex.match('^0x[a-fA-F0-9]{40}$', username):
        sql += "users.toshi_id = $2"
        args.append(username)
    # otherwise verify that username is valid
    elif not regex.match('^[a-zA-Z][a-zA-Z0-9_]{2,59}$', username):
        raise JSONHTTPError(400, body={'errors': [{'id': 'invalid_username', 'message': 'Invalid Username'}]})
    else:
        sql += "lower(users.username) = lower($2)"
        args.append(username)
    if self.apps_only:
        sql += " AND users.is_app = $3 AND users.blocked = $4"
        args.extend([True, False])
    sql += " GROUP BY users.toshi_id"
    async with self.db:
        row = await self.db.fetchrow(sql, *args)
    if row is None:
        raise JSONHTTPError(404, body={'errors': [{'id': 'not_found', 'message': 'Not Found'}]})
    self.write(user_row_for_json(self.request, row))
async def put(self, username):
    """Update a user identified by ethereum address or username.

    NOTE: declared ``async`` — the body awaits; the extraction that
    collapsed this file onto one line dropped the keyword.

    Only the user themself or a superuser may update; dispatches to avatar
    or profile update depending on whether files were uploaded.
    """
    if regex.match('^0x[a-fA-F0-9]{40}$', username):
        address_to_update = username
    elif regex.match('^[a-zA-Z][a-zA-Z0-9_]{2,59}$', username):
        async with self.db:
            row = await self.db.fetchrow("SELECT * FROM users WHERE lower(username) = lower($1)", username)
        if row is None:
            raise JSONHTTPError(404, body={'errors': [{'id': 'not_found', 'message': 'Not Found'}]})
        address_to_update = row['toshi_id']
    else:
        raise JSONHTTPError(400, body={'errors': [{'id': 'invalid_username', 'message': 'Invalid Username'}]})

    request_address = self.verify_request()

    if not self.request.headers['Content-Type'].startswith('application/json') and not self.request.files:
        raise JSONHTTPError(400, body={'errors': [{'id': 'bad_data', 'message': 'Expected application/json or multipart/form-data'}]})

    if request_address != address_to_update:
        # check for superuser update
        if not self.is_superuser(request_address):
            raise JSONHTTPError(401, body={'errors': [{'id': 'permission_denied', 'message': 'Permission Denied'}]})

    if self.request.files:
        return await self.update_user_avatar(address_to_update)
    else:
        return await self.update_user(address_to_update)
def validate(self, document):
    """Raise ValidationError unless document.text looks like a US phone number."""
    ok = regex.match('^([01]{1})?[-.\s]?\(?(\d{3})\)?[-.\s]?(\d{3})[-.\s]?(\d{4})\s?((?:#|ext\.?\s?|x\.?\s?){1}(?:\d+)?)?$', document.text)
    if not ok:
        raise ValidationError(
            message='Please enter a valid phone number',
            cursor_position=len(document.text))  # Move cursor to end
def test_search_star_plus(self):
    """Spans produced by search/match with '*' and '+' quantifiers."""
    self.assertEqual(regex.search('a*', 'xxx').span(0), (0, 0))
    self.assertEqual(regex.search('x*', 'axx').span(), (0, 0))
    self.assertEqual(regex.search('x+', 'axx').span(0), (1, 3))
    self.assertEqual(regex.search('x+', 'axx').span(), (1, 3))
    self.assertEqual(regex.search('x', 'aaa'), None)
    self.assertEqual(regex.match('a*', 'xxx').span(0), (0, 0))
    self.assertEqual(regex.match('a*', 'xxx').span(), (0, 0))
    self.assertEqual(regex.match('x*', 'xxxa').span(0), (0, 3))
    self.assertEqual(regex.match('x*', 'xxxa').span(), (0, 3))
    self.assertEqual(regex.match('a+', 'xxx'), None)
def test_bug_1661(self):
    """Flags must not be silently ignored with precompiled patterns."""
    pattern = regex.compile('.')
    for call in (
        lambda: regex.match(pattern, 'A', regex.I),
        lambda: regex.search(pattern, 'A', regex.I),
        lambda: regex.findall(pattern, 'A', regex.I),
        lambda: regex.compile(pattern, regex.I),
    ):
        self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT, call)
def test_re_match(self):
    """Basic match-object indexing, group() access, and named groups."""
    self.assertEqual(regex.match('a', 'a')[:], ('a',))
    self.assertEqual(regex.match('(a)', 'a')[:], ('a', 'a'))
    self.assertEqual(regex.match(r'(a)', 'a')[0], 'a')
    self.assertEqual(regex.match(r'(a)', 'a')[1], 'a')
    self.assertEqual(regex.match(r'(a)', 'a').group(1, 1), ('a', 'a'))
    pat = regex.compile('((a)|(b))(c)?')
    self.assertEqual(pat.match('a')[:], ('a', 'a', 'a', None, None))
    self.assertEqual(pat.match('b')[:], ('b', 'b', None, 'b', None))
    self.assertEqual(pat.match('ac')[:], ('ac', 'a', 'a', None, 'c'))
    self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))
    self.assertEqual(pat.match('bc')[:], ('bc', 'b', None, 'b', 'c'))
    # A single group.
    m = regex.match('(a)', 'a')
    self.assertEqual(m.group(), 'a')
    self.assertEqual(m.group(0), 'a')
    self.assertEqual(m.group(1), 'a')
    self.assertEqual(m.group(1, 1), ('a', 'a'))
    pat = regex.compile('(?:(?P<a1>a)|(?P<b2>b))(?P<c3>c)?')
    self.assertEqual(pat.match('a').group(1, 2, 3), ('a', None, None))
    self.assertEqual(pat.match('b').group('a1', 'b2', 'c3'), (None, 'b', None))
    self.assertEqual(pat.match('ac').group(1, 'b2', 3), ('a', None, 'c'))
def test_groupdict(self):
    """Match.groupdict() maps named groups to their captured text."""
    m = regex.match('(?P<first>first) (?P<second>second)', 'first second')
    self.assertEqual(m.groupdict(), {'first': 'first', 'second': 'second'})
def test_expand(self):
    """Match.expand() supports numeric and named backreferences."""
    m = regex.match("(?P<first>first) (?P<second>second)", "first second")
    self.assertEqual(m.expand(r"\2 \1 \g<second> \g<first>"),
                     'second first second first')
def test_repeat_minmax(self):
    """Bounded repetition {m}, {m,n}, greedy and lazy variants."""
    self.assertEqual(regex.match(r"^(\w){1}$", "abc"), None)
    self.assertEqual(regex.match(r"^(\w){1}?$", "abc"), None)
    self.assertEqual(regex.match(r"^(\w){1,2}$", "abc"), None)
    self.assertEqual(regex.match(r"^(\w){1,2}?$", "abc"), None)
    self.assertEqual(regex.match(r"^(\w){3}$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){1,3}$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){1,4}$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){3}?$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){1,3}?$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){1,4}?$", "abc")[1], 'c')
    self.assertEqual(regex.match(r"^(\w){3,4}?$", "abc")[1], 'c')
    self.assertEqual(regex.match("^x{1}$", "xxx"), None)
    self.assertEqual(regex.match("^x{1}?$", "xxx"), None)
    self.assertEqual(regex.match("^x{1,2}$", "xxx"), None)
    self.assertEqual(regex.match("^x{1,2}?$", "xxx"), None)
    self.assertEqual(regex.match("^x{1}", "xxx")[0], 'x')
    self.assertEqual(regex.match("^x{1}?", "xxx")[0], 'x')
    self.assertEqual(regex.match("^x{0,1}", "xxx")[0], 'x')
    self.assertEqual(regex.match("^x{0,1}?", "xxx")[0], '')
    self.assertEqual(bool(regex.match("^x{3}$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{1,3}$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{1,4}$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{3}?$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{1,3}?$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{1,4}?$", "xxx")), True)
    self.assertEqual(bool(regex.match("^x{3,4}?$", "xxx")), True)
    # '{}' is treated literally, not as a quantifier.
    self.assertEqual(regex.match("^x{}$", "xxx"), None)
    self.assertEqual(bool(regex.match("^x{}$", "x{}")), True)
def test_getattr(self):
    """Attributes of compiled patterns and match objects."""
    self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)')
    self.assertEqual(regex.compile("(?i)(a)(b)").flags,
                     regex.A | regex.I | regex.DEFAULT_VERSION)
    self.assertEqual(regex.compile(u"(?i)(a)(b)").flags,
                     regex.I | regex.U | regex.DEFAULT_VERSION)
    self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2)
    self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {})
    self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
                     {'first': 1, 'other': 2})
    self.assertEqual(regex.match("(a)", "a").pos, 0)
    self.assertEqual(regex.match("(a)", "a").endpos, 1)
    self.assertEqual(regex.search("b(c)", "abcdef").pos, 0)
    self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6)
    self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3))
    self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3))
    self.assertEqual(regex.match("(a)", "a").string, 'a')
    self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1)))
    self.assertEqual(repr(type(regex.match("(a)", "a").re)),
                     self.PATTERN_CLASS)
    # Issue 14260: groupindex must be read-only.
    p = regex.compile(r'abc(?P<n>def)')
    p.groupindex["n"] = 0
    self.assertEqual(p.groupindex["n"], 1)
def test_bigcharset(self): self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222")[1], u'\u2222') self.assertEqual(regex.match(ur"(?u)([\u2222\u2223])", u"\u2222", regex.UNICODE)[1], u'\u2222') self.assertEqual(u"".join(regex.findall(u".", u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') self.assertEqual(u"".join(regex.findall(ur"[e\xe8\xe9\xea\xeb\u0113\u011b\u0117]", u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117') self.assertEqual(u"".join(regex.findall(ur"e|\xe8|\xe9|\xea|\xeb|\u0113|\u011b|\u0117", u"e\xe8\xe9\xea\xeb\u0113\u011b\u0117", flags=regex.UNICODE)), u'e\xe8\xe9\xea\xeb\u0113\u011b\u0117')
def test_non_consuming(self):
    """Lookahead assertions consume no input."""
    self.assertEqual(regex.match(r"(a(?=\s[^a]))", "a b")[1], 'a')
    self.assertEqual(regex.match(r"(a(?=\s[^a]*))", "a b")[1], 'a')
    self.assertEqual(regex.match(r"(a(?=\s[abc]))", "a b")[1], 'a')
    self.assertEqual(regex.match(r"(a(?=\s[abc]*))", "a bc")[1], 'a')
    self.assertEqual(regex.match(r"(a)(?=\s\1)", "a a")[1], 'a')
    self.assertEqual(regex.match(r"(a)(?=\s\1*)", "a aa")[1], 'a')
    self.assertEqual(regex.match(r"(a)(?=\s(abc|a))", "a a")[1], 'a')
    self.assertEqual(regex.match(r"(a(?!\s[^a]))", "a a")[1], 'a')
    self.assertEqual(regex.match(r"(a(?!\s[abc]))", "a d")[1], 'a')
    self.assertEqual(regex.match(r"(a)(?!\s\1)", "a b")[1], 'a')
    self.assertEqual(regex.match(r"(a)(?!\s(abc|a))", "a b")[1], 'a')
def test_category(self):
    r"""Character-category escape \s captures a literal space."""
    self.assertEqual(regex.match(r"(\s)", " ")[1], ' ')
def test_re_escape(self):
    """regex.escape() makes every byte 0-255 match itself literally."""
    p = ""
    self.assertEqual(regex.escape(p), p)
    for i in range(0, 256):
        p += chr(i)
        self.assertEqual(bool(regex.match(regex.escape(chr(i)), chr(i))),
                         True)
        self.assertEqual(regex.match(regex.escape(chr(i)), chr(i)).span(),
                         (0, 1))
    pat = regex.compile(regex.escape(p))
    self.assertEqual(pat.match(p).span(), (0, 256))
def test_sre_character_literals(self):
    """Octal and hex character escapes across the byte range."""
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertEqual(bool(regex.match(r"\%03o" % i, chr(i))), True)
        self.assertEqual(bool(regex.match(r"\%03o0" % i, chr(i) + "0")), True)
        self.assertEqual(bool(regex.match(r"\%03o8" % i, chr(i) + "8")), True)
        self.assertEqual(bool(regex.match(r"\x%02x" % i, chr(i))), True)
        self.assertEqual(bool(regex.match(r"\x%02x0" % i, chr(i) + "0")), True)
        self.assertEqual(bool(regex.match(r"\x%02xz" % i, chr(i) + "z")), True)
    # \911 is an invalid group reference, not an escape.
    self.assertRaisesRegex(regex.error, self.INVALID_GROUP_REF,
                           lambda: regex.match(r"\911", ""))
def test_bug_113254(self):
    """Unmatched groups report start/end/span of -1."""
    m = regex.match(r'(a)|(b)', 'b')
    self.assertEqual(m.start(1), -1)
    self.assertEqual(m.end(1), -1)
    self.assertEqual(m.span(1), (-1, -1))
def test_bug_527371(self):
    """Bugs/patches 527371 and 672491: lastindex/lastgroup semantics."""
    self.assertEqual(regex.match(r'(a)?a', 'a').lastindex, None)
    self.assertEqual(regex.match(r'(a)(b)?b', 'ab').lastindex, 1)
    self.assertEqual(regex.match(r'(?P<a>a)(?P<b>b)?b', 'ab').lastgroup, 'a')
    self.assertEqual(regex.match("(?P<a>a(b))", "ab").lastgroup, 'a')
    self.assertEqual(regex.match("((a))", "a").lastindex, 1)
def test_bug_418626(self):
    """Bugs 418626 et al.: lazy '.*?' on long strings must not recurse.

    Exercises Greg Chapman's SRE_OP_MIN_REPEAT_ONE opcode; also checks a
    non-simple lazy repeat that used to hit the recursion limit.
    """
    self.assertEqual(regex.match('.*?c', 10000 * 'ab' + 'cd').end(0), 20001)
    self.assertEqual(regex.match('.*?cd',
                     5000 * 'ab' + 'c' + 5000 * 'ab' + 'cde').end(0), 20003)
    self.assertEqual(regex.match('.*?cd', 20000 * 'abc' + 'de').end(0), 60001)
    self.assertEqual(regex.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0),
                     20001)
def test_stack_overflow(self):
    """Repeated groups on huge inputs must not overflow the stack."""
    self.assertEqual(regex.match('(x)*', 50000 * 'x')[1], 'x')
    self.assertEqual(regex.match('(x)*y', 50000 * 'x' + 'y')[1], 'x')
    self.assertEqual(regex.match('(x)*?y', 50000 * 'x' + 'y')[1], 'x')
def test_bug_448951(self):
    """Bug 448951 (single-char variant of 429357), plus greedy matches."""
    for op in '', '?', '*':
        self.assertEqual(regex.match(r'((.%s):)?z' % op, 'z')[:],
                         ('z', None, None))
        self.assertEqual(regex.match(r'((.%s):)?z' % op, 'a:z')[:],
                         ('a:z', 'a:', 'a'))
def test_bug_725149(self):
    """Bug 725149: mark_stack_base restored before restoring marks."""
    self.assertEqual(regex.match('(a)(?:(?=(b)*)c)*', 'abb')[:],
                     ('a', 'a', None))
    self.assertEqual(regex.match('(a)((?!(b)*))*', 'abb')[:],
                     ('a', 'a', None, None))
def test_bug_764548(self):
    """Bug 764548: regex.compile() must accept str/unicode subclasses."""
    class my_unicode(str):
        pass
    pat = regex.compile(my_unicode("abc"))
    self.assertEqual(pat.match("xyz"), None)
def test_empty_array(self):
    """SF bug 1647541: matching against empty array.array objects."""
    import array
    for typecode in 'cbBuhHiIlLfd':
        a = array.array(typecode)
        self.assertEqual(regex.compile("bla").match(a), None)
        self.assertEqual(regex.compile("").match(a)[1 : ], ())
def test_inline_flags(self):
    """Bug #1700: inline (?i) flags vs. compile-time flags (Python 2: unichr)."""
    upper_char = unichr(0x1ea0)  # Latin Capital Letter A with Dot Below
    lower_char = unichr(0x1ea1)  # Latin Small Letter A with Dot Below

    p = regex.compile(upper_char, regex.I | regex.U)
    self.assertEqual(bool(p.match(lower_char)), True)
    p = regex.compile(lower_char, regex.I | regex.U)
    self.assertEqual(bool(p.match(upper_char)), True)
    p = regex.compile('(?i)' + upper_char, regex.U)
    self.assertEqual(bool(p.match(lower_char)), True)
    p = regex.compile('(?i)' + lower_char, regex.U)
    self.assertEqual(bool(p.match(upper_char)), True)
    p = regex.compile('(?iu)' + upper_char)
    self.assertEqual(bool(p.match(lower_char)), True)
    p = regex.compile('(?iu)' + lower_char)
    self.assertEqual(bool(p.match(upper_char)), True)

    self.assertEqual(bool(regex.match(r"(?i)a", "A")), True)
    self.assertEqual(bool(regex.match(r"a(?i)", "A")), True)
    # In V1 mode, inline flags apply only from their position onward.
    self.assertEqual(bool(regex.match(r"(?iV1)a", "A")), True)
    self.assertEqual(regex.match(r"a(?iV1)", "A"), None)
def test_ascii_and_unicode_flag(self):
    """regex.ASCII vs regex.UNICODE semantics, both per-flag and inline."""
    # Unicode patterns.
    for flags in (0, regex.UNICODE):
        pat = regex.compile(u'\xc0', flags | regex.IGNORECASE)
        self.assertEqual(bool(pat.match(u'\xe0')), True)
        pat = regex.compile(u'\w', flags)
        self.assertEqual(bool(pat.match(u'\xe0')), True)
    pat = regex.compile(u'\xc0', regex.ASCII | regex.IGNORECASE)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'(?a)\xc0', regex.IGNORECASE)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'\w', regex.ASCII)
    self.assertEqual(pat.match(u'\xe0'), None)
    pat = regex.compile(u'(?a)\w')
    self.assertEqual(pat.match(u'\xe0'), None)
    # String patterns.
    for flags in (0, regex.ASCII):
        pat = regex.compile('\xc0', flags | regex.IGNORECASE)
        self.assertEqual(pat.match('\xe0'), None)
        pat = regex.compile('\w')
        self.assertEqual(pat.match('\xe0'), None)
    # (?a) and (?u) together are contradictory.
    self.assertRaisesRegex(ValueError, self.MIXED_FLAGS,
                           lambda: regex.compile('(?au)\w'))
def test_new_named_groups(self):
    """(?<name>...) is accepted as an alias of (?P<name>...)."""
    m0 = regex.match(r'(?P<a>\w)', 'x')
    m1 = regex.match(r'(?<a>\w)', 'x')
    if not (m0 and m1 and m0[:] == m1[:]):
        self.fail("Failed")
def test_grapheme(self): self.assertEqual(regex.match(ur"(?u)\X", u"\xE0").span(), (0, 1)) self.assertEqual(regex.match(ur"(?u)\X", u"a\u0300").span(), (0, 2)) self.assertEqual(regex.findall(ur"(?u)\X", u"a\xE0a\u0300e\xE9e\u0301"), [u'a', u'\xe0', u'a\u0300', u'e', u'\xe9', u'e\u0301']) self.assertEqual(regex.findall(ur"(?u)\X{3}", u"a\xE0a\u0300e\xE9e\u0301"), [u'a\xe0a\u0300', u'e\xe9e\u0301']) self.assertEqual(regex.findall(ur"(?u)\X", u"\r\r\n\u0301A\u0301"), [u'\r', u'\r\n', u'\u0301', u'A\u0301'])
def test_format(self):
    """subf/subfn/expandf: str.format-style templates with group fields."""
    self.assertEqual(regex.subf(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar"),
                     "foo bar => bar foo")
    self.assertEqual(regex.subf(r"(?<word1>\w+) (?<word2>\w+)",
                     "{word2} {word1}", "foo bar"), "bar foo")
    self.assertEqual(regex.subfn(r"(\w+) (\w+)", "{0} => {2} {1}", "foo bar"),
                     ("foo bar => bar foo", 1))
    self.assertEqual(regex.subfn(r"(?<word1>\w+) (?<word2>\w+)",
                     "{word2} {word1}", "foo bar"), ("bar foo", 1))
    self.assertEqual(regex.match(r"(\w+) (\w+)",
                     "foo bar").expandf("{0} => {2} {1}"),
                     "foo bar => bar foo")
def filter_word(text):
    """Take out english stopwords, punctuation, and compound endings.

    Returns True when *text* should be filtered out, False otherwise.
    """
    text = normalize(text)
    # pure punctuation (\p{P} requires the third-party `regex` module)
    if regex.match(r'^\p{P}+$', text):
        return True
    return text.lower() in STOPWORDS
def compile_replace(pattern, repl, flags=0):
    """Construct a method that can be used as a replace method for `sub`, `subn`, etc.

    *repl* may be a raw string/bytes template, an already-compiled Replace,
    or a ReplaceTemplate; flags are only meaningful for raw templates.

    Raises:
        TypeError: if *pattern* is not compiled or *repl* has an unknown type.
        ValueError: if flags are combined with a precompiled replacement, or
            a compiled Replace was built for a different pattern.
    """
    if pattern is None or not isinstance(pattern, REGEX_TYPE):
        raise TypeError("Pattern must be a compiled regular expression!")

    if isinstance(repl, (compat.string_type, compat.binary_type)):
        template = ReplaceTemplate(pattern, repl, bool(flags & FORMAT))
        return Replace(
            functools.partial(_apply_replace_backrefs, repl=template),
            template.use_format, template.pattern_hash
        )
    if isinstance(repl, Replace):
        if flags:
            raise ValueError("Cannot process flags argument with a compiled pattern!")
        if repl.pattern_hash != hash(pattern):
            raise ValueError("Pattern hash doesn't match hash in compiled replace!")
        return repl
    if isinstance(repl, ReplaceTemplate):
        if flags:
            raise ValueError("Cannot process flags argument with a ReplaceTemplate!")
        return Replace(
            functools.partial(_apply_replace_backrefs, repl=repl),
            repl.use_format, repl.pattern_hash
        )
    raise TypeError("Not a valid type!")


# Convenience methods like re has, but slower due to overhead on each call.
# It is recommended to use compile_search and compile_replace
def tone_determ(text):
    """Return the tone number for a pinyin syllable.

    Decomposes *text* to NFD so tone diacritics become separate combining
    marks, looks one up via the module-level ``tones`` pattern and the
    ``pinyin_tone`` mapping, and falls back to "5" (neutral tone).
    """
    text = unicodedata.normalize("NFD", text)
    match = re.search(tones, text)
    # `in pinyin_tone` instead of `in pinyin_tone.keys()` — same test,
    # without materializing the keys view.
    if match and match.group() in pinyin_tone:
        return pinyin_tone[match.group()]
    return "5"
# NOTE(review): this block is mojibake-damaged and left byte-identical.
# Several pattern/replacement literals contain bare '?' characters that were
# presumably CJK characters (e.g. 不/吗) before a lossy encoding pass:
# re.search("?", text) is not even a valid pattern ("nothing to repeat").
# Also "\p{Ll}" requires the third-party `regex` module, not stdlib `re`,
# and "\1 \2" in a non-raw string does not denote backreferences.
# Do not reformat or "fix" until the original source is recovered.
def pinyin_transform(text): if re.search("?", text): return "" text = re.sub( unicodedata.normalize("NFD", "ü"), "ü", re.sub( unicodedata.normalize("NFD", "ê"), "ê", unicodedata.normalize("NFD", text) ) ) if re.search( "[aeiouêü]" + tones + "[aeiou]?[aeiouêü]" + tones + "", text.lower()): return "" text = text.lower() if not re.search(tones, text) and re.match("[1-5]", text): return re.sub("(\d)(\p{Ll})", "\1 \2", text) if re.search("[??,.?]", text): text = re.sub( "([??])$", lambda x: " y?" if x.group() == "?" else " bù", text ) text = re.sub("([??])", r" \1 ", text) text = re.sub("([,.?])", r" \1 ", text) text = re.sub(" +", " ", text) text = re.sub("^ ", "", text) text = re.sub(" $", "", text) text = re.sub("\. \. \.", "...", text) text = re.sub("['\-]", " ", text) text = re.sub( "([aeiouêü]" + tones + "?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)", r"\1 \2", text ) text = re.sub(" ([grn])$", r"\1", text) text = re.sub(" ([grn]) ", r"\1 ", text) return unicodedata.normalize("NFC", text)