The following 26 code examples, extracted from open-source Python projects, illustrate how to use regex.findall().
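As a warm-up before the project examples, here is a minimal, self-contained sketch of the basic call (the patterns and inputs are illustrative, not taken from any of the projects below):

import regex

# findall() returns all non-overlapping matches, scanning left to right.
# Without groups it returns the matched text; with one group, that group's
# text; with several groups, tuples of the groups' texts.
print(regex.findall(r"\d+", "a1b22c333"))    # ['1', '22', '333']
print(regex.findall(r"(\d)(\w)", "1a 2b"))   # [('1', 'a'), ('2', 'b')]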
def test_re_findall(self):
    self.assertEqual(regex.findall(":+", "abc"), [])
    self.assertEqual(regex.findall(":+", "a:b::c:::d"), [':', '::', ':::'])
    self.assertEqual(regex.findall("(:+)", "a:b::c:::d"), [':', '::', ':::'])
    self.assertEqual(regex.findall("(:)(:*)", "a:b::c:::d"), [(':', ''),
      (':', ':'), (':', '::')])

    self.assertEqual(regex.findall(r"\((?P<test>.{0,5}?TEST)\)", "(MY TEST)"),
      ["MY TEST"])
    self.assertEqual(regex.findall(r"\((?P<test>.{0,3}?TEST)\)", "(MY TEST)"),
      ["MY TEST"])
    self.assertEqual(regex.findall(r"\((?P<test>.{0,3}?T)\)", "(MY T)"),
      ["MY T"])

    # Note: the flattened source collapsed runs of spaces and dropped the
    # lazy "?" from the third pattern; the strings contain two spaces, and
    # the third quantifier is "{2,3}?", matching the upstream test suite.
    self.assertEqual(regex.findall(r"[^a]{2}[A-Z]", "\n  S"), ['  S'])
    self.assertEqual(regex.findall(r"[^a]{2,3}[A-Z]", "\n  S"), ['\n  S'])
    self.assertEqual(regex.findall(r"[^a]{2,3}?[A-Z]", "\n  S"), ['  S'])

    self.assertEqual(regex.findall(r"X(Y[^Y]+?){1,2}( |Q)+DEF",
      "XYABCYPPQ\nQ DEF"), [('YPPQ\n', ' ')])
    self.assertEqual(regex.findall(r"(\nTest(\n+.+?){0,2}?)?\n+End",
      "\nTest\nxyz\nxyz\nEnd"), [('\nTest\nxyz\nxyz', '\nxyz')])
def test_overlapped(self):
    self.assertEqual(regex.findall(r"..", "abcde"), ['ab', 'cd'])
    self.assertEqual(regex.findall(r"..", "abcde", overlapped=True), ['ab',
      'bc', 'cd', 'de'])
    self.assertEqual(regex.findall(r"(?r)..", "abcde"), ['de', 'bc'])
    self.assertEqual(regex.findall(r"(?r)..", "abcde", overlapped=True),
      ['de', 'cd', 'bc', 'ab'])
    self.assertEqual(regex.findall(r"(.)(-)(.)", "a-b-c", overlapped=True),
      [("a", "-", "b"), ("b", "-", "c")])

    self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde")], ['ab',
      'cd'])
    self.assertEqual([m[0] for m in regex.finditer(r"..", "abcde",
      overlapped=True)], ['ab', 'bc', 'cd', 'de'])
    self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde")],
      ['de', 'bc'])
    self.assertEqual([m[0] for m in regex.finditer(r"(?r)..", "abcde",
      overlapped=True)], ['de', 'cd', 'bc', 'ab'])

    self.assertEqual([m.groups() for m in regex.finditer(r"(.)(-)(.)",
      "a-b-c", overlapped=True)], [("a", "-", "b"), ("b", "-", "c")])
    self.assertEqual([m.groups() for m in regex.finditer(r"(?r)(.)(-)(.)",
      "a-b-c", overlapped=True)], [("b", "-", "c"), ("a", "-", "b")])
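Note that the overlapped=True keyword exercised here is specific to the third-party regex package; the standard library's re.findall() accepts only a flags argument and cannot report overlapping matches. The (?r) flag, likewise regex-only, scans from right to left.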
import itertools
from typing import Optional

import regex


def barcode_to_regex(barcode: str, error_rate: Optional[int] = None):
    """Convert a barcode string to a regex pattern

    barcode [str]           The barcode string to turn into a regex
    error_rate [int]=None   The error rate
    """
    pattern = ''  # type: str
    umi = regex.findall(r'(N+)', barcode, regex.IGNORECASE)  # type: List[str]
    umi_lengths = tuple(map(len, umi))  # type: Tuple[int]
    filtered_barcode = filter(None, barcode.upper().split('N'))  # type: filter
    for index, subpattern in enumerate(filtered_barcode):  # type: int, str
        barcode_pattern = '(' + subpattern + ')'  # type: str
        if error_rate:
            barcode_pattern += '{e<=' + str(error_rate) + '}'
        pattern += barcode_pattern
        try:
            umi_pattern = '(' + ''.join(itertools.repeat('[ACGT]',
                umi_lengths[index])) + ')'  # type: str
        except IndexError:
            break
        else:
            if error_rate:
                umi_pattern += '{e<=' + str(error_rate) + '}'
            pattern += umi_pattern
    find_barcode = regex.compile(r'%s' % pattern, regex.ENHANCEMATCH)
    return find_barcode
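The '{e<=' ... '}' suffix assembled above is the regex module's fuzzy-matching syntax: it lets the preceding group match with at most the given number of errors (insertions, deletions, or substitutions), and the regex.ENHANCEMATCH flag asks the engine to improve the fit of such matches. A minimal sketch with a made-up barcode fragment:

import regex

# ACCT differs from ACGT by one substitution, so the fuzzy pattern matches
# while the exact pattern does not.
print(regex.match(r'(ACGT){e<=1}', 'ACCT') is not None)  # True
print(regex.match(r'ACGT', 'ACCT') is None)              # True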
def compile_regex_from_str(self, pat):
    """Given a string describing feature masks for a sequence of segments,
    return a compiled regex matching the corresponding strings.

    Args:
        pat (str): feature masks, each enclosed in square brackets, in
                   which the features are delimited by any standard
                   delimiter.

    Returns:
        Pattern: regular expression pattern equivalent to `pat`
    """
    s2n = {'-': -1, '0': 0, '+': 1}
    seg_res = []
    for mat in re.findall(r'\[[^]]+\]+', pat):
        ft_mask = {k: s2n[v] for (v, k) in re.findall(r'([+-])(\w+)', mat)}
        segs = self.all_segs_matching_fts(ft_mask)
        seg_res.append('({})'.format('|'.join(segs)))
    regexp = ''.join(seg_res)
    return re.compile(regexp)
import regex


def tokenize_field(value):
    """
    Extract normalized tokens from a field.

    Args:
        value (str): The field value.

    Returns:
        list: The cleaned tokens.
    """
    # Extract tokens: runs of 2+ Unicode letters. The \p{L} property class
    # requires the third-party `regex` module.
    tokens = regex.findall(r'\p{L}{2,}', value.lower())

    # Remove articles.
    tokens = [t for t in tokens if t not in [
        'a', 'an', 'the', 'and',
    ]]

    return tokens
import logging
import re

import requests
from bs4 import BeautifulSoup


def request_url(url, headers=None, name=''):
    print('req', url)
    data = set()
    servers = list()
    try:
        response = requests.get(url, headers=headers, verify=False).text
        data.update(map(lambda x: re.sub(r'\s', '', x),
                        re.findall(r'ssr?://[a-zA-Z0-9=]+', response)))
        soup = BeautifulSoup(response, 'html.parser')
        title = soup.find('title').text

        info = {'message': '', 'url': url, 'name': str(title)}
        for i, server in enumerate(data):
            try:
                # parse() is a helper defined elsewhere in the project.
                servers.append(parse(server, ' '.join([title, name, str(i)])))
            except Exception as e:
                logging.exception(e, stack_info=False)
                print('URL:', url, 'SERVER', server)
    except Exception as e:
        print(url)
        logging.exception(e, stack_info=False)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
import regex


def generate_char_list(string, strip_html=True):
    # strip_html_tags() is a helper defined elsewhere in the project.
    if strip_html:
        s = strip_html_tags(string.lower())
    else:
        s = string.lower()
    # Change any kind of whitespace to a single space.
    normalized_string = regex.sub(r'\s+', r' ', s)
    list_norm_chars = regex.findall(r"\w|[?!'#@$:\"&*=,]", normalized_string)
    return list_norm_chars
import regex


def generate_word_list(string, strip_html=True):
    # strip_html_tags() is a helper defined elsewhere in the project.
    if strip_html:
        s = strip_html_tags(string.lower())
    else:
        s = string.lower()
    # Change any kind of whitespace to a single space.
    normalized_string = regex.sub(r"\s+", r' ', s)
    # Keep all words seen during training, including strings like '!!!',
    # '??', '....', as these repeated punctuation marks tend to carry more
    # than their grammatical meaning.
    list_normalized_string = regex.findall(r"\b\w+[']?\w*\b|\!+|\?+|\.{3,}",
                                           normalized_string)
    return list_normalized_string
# The stdlib `re` module has no `overlapped` argument, so the original
# project presumably imported the third-party `regex` package under this
# name; without that, the call below would raise a TypeError.
import regex as re


def countOccurrences(text, searchFor):
    '''
    Count all occurrences of the string "searchFor" in the text "text"
    '''
    return len(re.findall(searchFor, text, overlapped=True))
def test_weakref(self):
    s = 'QabbbcR'
    x = regex.compile('ab+c')
    y = proxy(x)
    if x.findall('QabbbcR') != y.findall('QabbbcR'):
        self.fail()
def test_bug_1661(self):
    # Verify that flags do not get silently ignored with compiled patterns
    pattern = regex.compile('.')
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
      lambda: regex.match(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
      lambda: regex.search(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
      lambda: regex.findall(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
      lambda: regex.compile(pattern, regex.I))
def test_bug_117612(self):
    self.assertEqual(regex.findall(r"(a|(b))", "aba"), [('a', ''), ('b', 'b'),
      ('a', '')])
def test_re_groupref(self):
    self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a|')[:], ('|a|', '|',
      'a'))
    self.assertEqual(regex.match(r'^(\|)?([^()]+)\1?$', 'a')[:], ('a', None,
      'a'))
    self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', 'a|'), None)
    self.assertEqual(regex.match(r'^(\|)?([^()]+)\1$', '|a'), None)
    self.assertEqual(regex.match(r'^(?:(a)|c)(\1)$', 'aa')[:], ('aa', 'a',
      'a'))
    self.assertEqual(regex.match(r'^(?:(a)|c)(\1)?$', 'c')[:], ('c', None,
      None))

    self.assertEqual(regex.findall(
      r"(?i)(.{1,40}?),(.{1,40}?)(?:;)+(.{1,80}).{1,40}?\3(\ |;)+(.{1,80}?)\1",
      "TEST, BEST; LEST ; Lest 123 Test, Best"),
      [('TEST', ' BEST', ' LEST', ' ', '123 ')])
# Note: this example uses Python 2 string literals (ur"...", u"...").
def test_word_class(self):
    self.assertEqual(regex.findall(ur"(?u)\w+",
      u" \u0939\u093f\u0928\u094d\u0926\u0940,"),
      [u'\u0939\u093f\u0928\u094d\u0926\u0940'])
    self.assertEqual(regex.findall(ur"(?u)\W+",
      u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', u','])
    self.assertEqual(regex.split(ur"(?uV1)\b",
      u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ',
      u'\u0939\u093f\u0928\u094d\u0926\u0940', u','])
    self.assertEqual(regex.split(ur"(?uV1)\B",
      u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u'', u' \u0939', u'\u093f',
      u'\u0928', u'\u094d', u'\u0926', u'\u0940,', u''])
def test_search_anchor(self):
    self.assertEqual(regex.findall(r"\G\w{2}", "abcd ef"), ['ab', 'cd'])
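The \G anchor matches only at the point where the current search begins, which for findall() is the end of the previous match, so matching has to be contiguous: the space after "cd" breaks the chain and "ef" is never reached.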
def test_zerowidth(self):
    # Issue 3262.
    self.assertEqual(regex.split(r"\b", "a b"), ['a b'])
    self.assertEqual(regex.split(r"(?V1)\b", "a b"), ['', 'a', ' ', 'b', ''])

    # Issue 1647489.
    self.assertEqual(regex.findall(r"^|\w+", "foo bar"), ['', 'foo', 'bar'])
    self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")],
      ['', 'foo', 'bar'])
    self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"), ['bar', 'foo',
      ''])
    self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+", "foo bar")],
      ['bar', 'foo', ''])
    self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"), ['', 'foo',
      'bar'])
    self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+", "foo bar")],
      ['', 'foo', 'bar'])
    self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"), ['bar', 'foo',
      ''])
    self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+",
      "foo bar")], ['bar', 'foo', ''])

    self.assertEqual(regex.split("", "xaxbxc"), ['xaxbxc'])
    self.assertEqual([m for m in regex.splititer("", "xaxbxc")], ['xaxbxc'])

    self.assertEqual(regex.split("(?r)", "xaxbxc"), ['xaxbxc'])
    self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")],
      ['xaxbxc'])

    self.assertEqual(regex.split("(?V1)", "xaxbxc"), ['', 'x', 'a', 'x', 'b',
      'x', 'c', ''])
    self.assertEqual([m for m in regex.splititer("(?V1)", "xaxbxc")], ['',
      'x', 'a', 'x', 'b', 'x', 'c', ''])

    self.assertEqual(regex.split("(?rV1)", "xaxbxc"), ['', 'c', 'x', 'b',
      'x', 'a', 'x', ''])
    self.assertEqual([m for m in regex.splititer("(?rV1)", "xaxbxc")], ['',
      'c', 'x', 'b', 'x', 'a', 'x', ''])
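The (?V1) flag selects the regex module's VERSION1 behavior, in which split() honors zero-width matches; the default VERSION0 behavior retains the old re behavior of ignoring them, which is why the plain regex.split("", "xaxbxc") above returns the string unchanged.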
# Note: this example uses Python 2 string literals (ur"...", u"...").
def test_grapheme(self):
    self.assertEqual(regex.match(ur"(?u)\X", u"\xE0").span(), (0, 1))
    self.assertEqual(regex.match(ur"(?u)\X", u"a\u0300").span(), (0, 2))

    self.assertEqual(regex.findall(ur"(?u)\X",
      u"a\xE0a\u0300e\xE9e\u0301"), [u'a', u'\xe0', u'a\u0300', u'e',
      u'\xe9', u'e\u0301'])
    self.assertEqual(regex.findall(ur"(?u)\X{3}",
      u"a\xE0a\u0300e\xE9e\u0301"), [u'a\xe0a\u0300', u'e\xe9e\u0301'])
    self.assertEqual(regex.findall(ur"(?u)\X",
      u"\r\r\n\u0301A\u0301"), [u'\r', u'\r\n', u'\u0301', u'A\u0301'])
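\X matches one extended grapheme cluster rather than one code point, so a base letter plus its combining marks (e.g. u"a\u0300") counts as a single match, and the "\r\n" pair is likewise kept together.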
def findall(
    pattern, string, flags=0, pos=None, endpos=None, overlapped=False,
    concurrent=None, **kwargs
):
    """Wrapper for `findall`."""

    return regex.findall(
        _apply_search_backrefs(pattern, flags), string,
        flags, pos, endpos, overlapped, concurrent, **kwargs
    )
def expand_template(self, text):
    """Expand IPA Template through Wiktionary API.

    Used to expand ``{{*-IPA}}`` template in parser and return IPA list.

    Parameters
    ----------
    text : string
        String of template text inside "{{" and "}}".

    Returns
    -------
    list of string
        List of expanded IPA text.

    Examples
    --------
    >>> parser = Parser()
    >>> template = "{{la-IPA|eccl=yes|thēsaurus}}"
    >>> parser.expand_template(template)
    ['/tʰeːˈsau̯.rus/', '[tʰeːˈsau̯.rʊs]', '/teˈsau̯.rus/']
    """
    self.param["text"] = text.encode("utf-8")
    res = urlopen(self.api, urlencode(self.param).encode()).read()
    content = json.loads(res.decode("utf-8"))
    html = content["expandtemplates"]["wikitext"]
    # Use BeautifulSoup instead of raw regex expr
    # return self.regex["IPA"].findall(html)
    soup = BeautifulSoup(html, "html.parser")
    span = soup.find_all("span", {"class": "IPA"})
    return list(map(lambda x: x.text, span))
def parse(self, wiki_text, title=None):
    """Parse Wiktionary wiki text.

    Split Wiktionary wiki text into different languages and return
    parsed IPA result.

    Parameters
    ----------
    wiki_text : string
        String of Wiktionary wiki text, from XML dump or Wiktionary API.
    title : string
        String of wiki entry title.

    Returns
    -------
    dict
        Dict of parsed IPA results.
        Key: language name; Value: list of IPA text.
    """
    self.title = title
    parse_result = {}
    h2_lst = self.regex["h2"].findall(wiki_text)
    if self.lang and self.lang not in h2_lst:
        parse_result = {self.lang: "Language not found."}
        return parse_result
    h2_split = self.regex["h2"].split(wiki_text)
    i = 0
    while i < len(h2_split):
        if h2_split[i] in h2_lst:
            if not self.lang or h2_split[i] == self.lang:
                pronunciation = self.parse_detail(h2_split[i+1])
                if not pronunciation:
                    pronunciation = "IPA not found."
                parse_result[h2_split[i]] = pronunciation
            i += 1
        i += 1
    return parse_result
def parse_detail(self, wiki_text, depth=3):
    """Parse the section of a certain language in wiki text.

    Parse pronunciation section of the certain language recursively.

    Parameters
    ----------
    wiki_text : string
        String of wiki text in a language section.
    depth : int
        Integer indicating the depth of the pronunciation section.

    Returns
    -------
    list of dict
        List of extracted IPA text in
        ``{"IPA": "", "X-SAMPA": "", "lang": ""}`` format.
    """
    parse_result = []
    detail_lst = self.regex["h" + str(depth)].findall(wiki_text)
    detail_split = self.regex["h" + str(depth)].split(wiki_text)
    # To avoid maximum recursion depth exceeded.
    if len(detail_split) > 99999:
        return "Maximum recursion depth exceeded in wiki text."
    i = 0
    while i < len(detail_split):
        if detail_split[i] in detail_lst:
            header_name = detail_split[i].lower()
            if header_name == "pronunciation":
                parse_result += \
                    self.parse_pronunciation(detail_split[i+1])
            elif ("etymology" in header_name and
                    header_name != "etymology"):
                parse_result += \
                    self.parse_detail(detail_split[i+1], depth=4)
            i += 1
        i += 1
    return parse_result
import re


def get_href(string, pattern='.*'):
    found = re.findall(r'(?<=<a\s+href=")[^"]+(?=">%s</a>)' % pattern, string)
    if found:
        return found[0]
# Note: this example uses Python 2 string literals (ur"...", u"...").
def test_named_lists(self):
    options = [u"one", u"two", u"three"]
    self.assertEqual(regex.match(ur"333\L<bar>444", u"333one444",
      bar=options).group(), u"333one444")
    self.assertEqual(regex.match(ur"(?i)333\L<bar>444", u"333TWO444",
      bar=options).group(), u"333TWO444")
    self.assertEqual(regex.match(ur"333\L<bar>444", u"333four444",
      bar=options), None)

    options = ["one", "two", "three"]
    self.assertEqual(regex.match(r"333\L<bar>444", "333one444",
      bar=options).group(), "333one444")
    self.assertEqual(regex.match(r"(?i)333\L<bar>444", "333TWO444",
      bar=options).group(), "333TWO444")
    self.assertEqual(regex.match(r"333\L<bar>444", "333four444",
      bar=options), None)

    self.assertEqual(repr(type(regex.compile(r"3\L<bar>4\L<bar>+5",
      bar=["one", "two", "three"]))), self.PATTERN_CLASS)

    self.assertEqual(regex.findall(r"^\L<options>", "solid QWERT",
      options=set(['good', 'brilliant', '+s\\ol[i}d'])), [])
    self.assertEqual(regex.findall(r"^\L<options>", "+solid QWERT",
      options=set(['good', 'brilliant', '+solid'])), ['+solid'])

    options = [u"STRASSE"]
    self.assertEqual(regex.match(ur"(?fiu)\L<words>",
      u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0, 6))

    options = [u"STRASSE", u"stress"]
    self.assertEqual(regex.match(ur"(?fiu)\L<words>",
      u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(), (0, 6))

    options = [u"stra\N{LATIN SMALL LETTER SHARP S}e"]
    self.assertEqual(regex.match(ur"(?fiu)\L<words>", u"STRASSE",
      words=options).span(), (0, 7))

    options = ["kit"]
    self.assertEqual(regex.search(ur"(?iu)\L<words>", u"SKITS",
      words=options).span(), (1, 4))
    self.assertEqual(regex.search(ur"(?iu)\L<words>",
      u"SK\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}TS",
      words=options).span(), (1, 4))

    self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b",
      u" stra\N{LATIN SMALL LETTER SHARP S}e STRASSE ").span(), (1, 15))
    self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b",
      u" STRASSE stra\N{LATIN SMALL LETTER SHARP S}e ").span(), (1, 15))

    self.assertEqual(regex.search(r"^\L<options>$", "", options=[]).span(),
      (0, 0))
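The \L<name> references above are the regex module's named-list syntax: a list or set passed as a keyword argument is matched as if it were an alternation of its items. A minimal sketch with a made-up list name:

import regex

# \L<colors> matches any member of the list passed as the `colors` keyword.
print(regex.findall(r"\L<colors>", "red fish blue fish",
                    colors=["red", "blue"]))  # ['red', 'blue']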
# NOTE: this example lost its non-ASCII literals (Devanagari and combining
# characters) during extraction; every stray "?" below stands for one or more
# missing characters, so the snippet is not runnable as shown. It also relies
# on module-level tables from the source project (all_cons, special_cons,
# vowel, perm_cl, nasal_assim, syncope_pattern, conv).
import re
import unicodedata


def transliterate(text):
    def repl1(match):
        c, d = match.group(1), match.group(2)
        if d == "":
            return c + "a"
        return c + d

    def repl2(match):
        opt, first, second, third = \
            match.group(1), match.group(2), match.group(3), match.group(4)
        if (re.match("[" + special_cons + "]", first) and \
                re.match("?", second) and \
                (first + second + third) not in perm_cl.keys()) or \
                re.match("?[???]", first + second):
            return "a" + opt + first + second + third
        return "" + opt + first + second + third

    def repl3(match):
        succ, prev = match.group(1), match.group(2)
        if succ + prev == "a":
            return succ + "??" + prev
        if succ == "" and re.match("[" + vowel + "]", prev):
            return succ + "?" + prev
        if succ in nasal_assim.keys():
            return succ + nasal_assim[succ] + prev
        return succ + "n" + prev

    def repl4(match):
        k = match.group()
        if k in conv.keys():
            return conv[k]
        return k

    text = re.sub("([" + all_cons + "]??)([" + vowel + "?]?)", repl1, text)
    for word in re.findall("[?-?a]+", text):
        orig_word = str(word)
        rev_word = word[::-1]
        rev_word = re.sub("^a(??)([" + all_cons + "])(.)(.?)", repl2,
                          rev_word)
        while re.match(syncope_pattern, rev_word):
            rev_word = re.sub(syncope_pattern, r"\1\2\3\4", rev_word)
        rev_word = re.sub("(.?)?(.)", repl3, rev_word)
        text = re.sub(orig_word, rev_word[::-1], text)
    text = re.sub(".??", repl4, text)
    text = re.sub("a([iu])?", r"a?\1", text)
    text = re.sub("???", repl4, text)
    return unicodedata.normalize("NFC", text)
import logging
import re

import requests
from bs4 import BeautifulSoup


def request_iss(url='http://ss.ishadowx.com/'):
    print('req iss...')
    try:
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}

    try:
        info = {
            'message': soup.find('div', attrs={'id': 'portfolio'}).find(
                'div',
                attrs={'class': 'section-title text-center center'}).text,
            'name': 'ishadowx',
            'url': url}
        '''servers[-1]['name'] = tmp[0]
        servers[-1]['server'] = tmp[0]
        servers[-1]['server_port'] = tmp[0]
        servers[-1]['password'] = tmp[0]
        servers[-1]['method'] = tmp[0]
        servers[-1]['ssr_protocol'] = tmp[0]
        servers[-1]['obfs'] = tmp[0]'''
        soup = BeautifulSoup(data.text, 'html.parser')
        server_data = soup.find_all('div', attrs={'class': 'hover-text'})
        servers = list()
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}

    for i, server in enumerate(server_data):
        try:
            servers.append(dict())
            # Shadows the outer list; enumerate() keeps the original iterator.
            server_data = server.text.strip().split('\n')
            servers[-1]['server'] = server_data[0].split(':')[-1].strip()
            servers[-1]['server_port'] = re.findall(r'\d+', server_data[1])[0]
            servers[-1]['remarks'] = ' '.join(['ss.ishadowx.com', str(i)])
            servers[-1]['password'] = server_data[2].split(':')[-1].strip()
            servers[-1]['method'] = server_data[3].split(':')[-1].strip()
            if 'QR' not in server_data[4]:
                servers[-1]['ssr_protocol'], servers[-1]['obfs'] = \
                    server_data[4].strip().split(maxsplit=1)
                servers[-1]['remarks'] = ' '.join([servers[-1]['remarks'],
                                                   'SSR'])
        except Exception as e:
            logging.exception(e, stack_info=True)
    return servers, info
# NOTE: the Chinese label literals inside these patterns (the words for
# "server", "port", "password", etc.) were lost during extraction; every
# stray "?" below stands for one or more missing characters, so the patterns
# are not usable as shown.
import re

import requests
from bs4 import BeautifulSoup


def request_newpac(
        url='https://github.com/Alvin9999/new-pac/wiki/ss%E5%85%8D%E8%B4%B9%E8%B4%A6%E5%8F%B7'):
    data = requests.get(url)
    soup = BeautifulSoup(data.text, 'html.parser')
    ss_list = list()
    for i in soup.find_all('p'):
        if re.match('\<p\>\s*???\d+[^:?]*[:?]', str(i)):
            ss_list.append(str(i))
    servers = list()
    for i in ss_list:
        servers.append(dict())
        servers[-1]['string'] = i

        # name
        tmp = re.findall('???\d+[^:?]*(?=\s*[:?])', i)
        if tmp:
            servers[-1]['remarks'] = tmp[0]

        # server
        tmp = re.findall('(?<=???\s*\d+[^:?]*[:?]\s*[^a-zA-Z0-9_]*)[\w\d\.]+',
                         i)
        if tmp:
            servers[-1]['server'] = tmp[0]

        # server_port
        tmp = re.findall('(?<=??\s*[^:?]*[:?]\s*[^a-zA-Z0-9_]*)\d+', i)
        if tmp:
            servers[-1]['server_port'] = tmp[0]

        # password
        tmp = re.findall(
            '(?<=??\s*[^:?]*[:?]\s*[^a-zA-Z0-9_]*)[a-zA-Z\d\.\+\-_\*\\/]+', i)
        if tmp:
            servers[-1]['password'] = tmp[0]

        # method
        tmp = re.findall(
            '(?<=???[??]\s*[^:?]*[:?]\s*[^a-zA-Z0-9_]*)[a-zA-Z\d\.\+\-_\*\\/]+',
            i)
        if tmp:
            servers[-1]['method'] = tmp[0]

        # ssr_protocol
        tmp = re.findall(
            '(?<=SSR??\s*[^:?]*[:?]\s*[^a-zA-Z_0-9]*)[a-zA-Z\d\.\+\-_\*\\/]+',
            i)
        if tmp:
            servers[-1]['ssr_protocol'] = tmp[0]

        # obfs
        tmp = re.findall(
            '(?<=??\s*[^:?]*[:?]\s*[^a-zA-Z0-9_]*)[a-zA-Z\d\.\+\-_\*\\/]+', i)
        if tmp:
            servers[-1]['obfs'] = tmp[0]

    info = {'message': '', 'name': 'new-pac', 'url': url}
    return servers, info