我们从Python开源项目中，提取了以下21个代码示例，用于说明如何使用regex.split()。
def test_word_boundary(self):
    """Exercise regex.split() on the zero-width word-boundary pattern,
    comparing the default V1 boundary rules with the V1 'w' flag, which
    selects the Unicode word-break rules."""
    text = u'The quick ("brown") fox can\'t jump 32.3 feet, right?'
    # Default boundaries: the apostrophe and the decimal point each break a
    # word ("can"/"'"/"t" and "32"/"."/"3").
    self.assertEqual(regex.split(ur'(?V1)\b', text),
        [u'', u'The', u' ', u'quick', u' ("', u'brown', u'") ', u'fox',
         u' ', u'can', u"'", u't', u' ', u'jump', u' ', u'32', u'.', u'3',
         u' ', u'feet', u', ', u'right', u'?'])
    # Unicode word-break rules: "can't" and "32.3" stay whole and each
    # punctuation character is split out individually.
    self.assertEqual(regex.split(ur'(?V1w)\b', text),
        [u'', u'The', u' ', u'quick', u' ', u'(', u'"', u'brown', u'"',
         u')', u' ', u'fox', u' ', u"can't", u' ', u'jump', u' ', u'32.3',
         u' ', u'feet', u',', u' ', u'right', u'?', u''])
    text = u"The fox"
    # NOTE(review): the second expectation below contains two separate
    # single-space tokens, which suggests the original text was
    # u"The  fox" (double space) and the run of spaces was collapsed when
    # this example was extracted -- verify against the upstream test suite.
    self.assertEqual(regex.split(ur'(?V1)\b', text),
        [u'', u'The', u' ', u'fox', u''])
    self.assertEqual(regex.split(ur'(?V1w)\b', text),
        [u'', u'The', u' ', u' ', u'fox', u''])
    text = u"can't aujourd'hui l'objectif"
    # With the 'w' rules, the contractions stay intact and the French
    # elision "l'" keeps its apostrophe, separated from "objectif".
    self.assertEqual(regex.split(ur'(?V1)\b', text),
        [u'', u'can', u"'", u't', u' ', u'aujourd', u"'", u'hui', u' ',
         u'l', u"'", u'objectif', u''])
    self.assertEqual(regex.split(ur'(?V1w)\b', text),
        [u'', u"can't", u' ', u"aujourd'hui", u' ', u"l'", u'objectif',
         u''])
def test_turkic(self): # Turkish has dotted and dotless I/i. pairs = u"I=i;I=\u0131;i=\u0130" all_chars = set() matching = set() for pair in pairs.split(";"): ch1, ch2 = pair.split("=") all_chars.update((ch1, ch2)) matching.add((ch1, ch1)) matching.add((ch1, ch2)) matching.add((ch2, ch1)) matching.add((ch2, ch2)) for ch1 in all_chars: for ch2 in all_chars: m = regex.match(ur"(?iu)\A" + ch1 + ur"\Z", ch2) if m: if (ch1, ch2) not in matching: self.fail("%s matching %s" % (repr(ch1), repr(ch2))) else: if (ch1, ch2) in matching: self.fail("%s not matching %s" % (repr(ch1), repr(ch2)))
def search_docs(inputs, max_ex=5, opts=None):
    """Given a set of document ids (returned by ranking for a question),
    search for top N best matching (by heuristic) paragraphs that
    contain the answer.
    """
    if not opts:
        raise RuntimeError('Options dict must be supplied.')
    doc_ids, q_tokens, answer = inputs
    heap = []
    for doc_idx, doc_id in enumerate(doc_ids):
        paragraphs = re.split(r'\n+', fetch_text(doc_id))
        for para_idx, paragraph in enumerate(paragraphs):
            found = find_answer(paragraph, q_tokens, answer, opts)
            if not found:
                continue
            # Reverse ranking, giving priority to early docs + paragraphs;
            # random.random() is a tiebreaker.
            score = (found[0], -doc_idx, -para_idx, random.random())
            entry = (score, found[1])
            if len(heap) < max_ex:
                heapq.heappush(heap, entry)
            else:
                heapq.heappushpop(heap, entry)
    return [entry[1] for entry in heap]
def _split_doc(self, doc):
    """Given a doc, split it into chunks (by paragraph)."""
    buffer = []
    buffer_len = 0
    for piece in regex.split(r'\n+', doc):
        piece = piece.strip()
        if not piece:
            continue
        # Maybe group paragraphs together until we hit a length limit.
        if buffer and buffer_len + len(piece) > self.GROUP_LENGTH:
            yield ' '.join(buffer)
            buffer = []
            buffer_len = 0
        buffer.append(piece)
        buffer_len += len(piece)
    if buffer:
        yield ' '.join(buffer)
def _process_text_line(self, text):
    """Tokenise one line of text and normalise it into a single
    space-separated, lower-cased string."""
    tokens = [t for t in new_regex.split(self.tokenisation_pattern, text)
              if t != '']
    if self.replace_whitespace:
        # Swap each purely-whitespace token for the configured marker.
        tokens = [self.replace_whitespace if t.isspace() else t
                  for t in tokens]
    # Strip surrounding spaces and drop the resulting empties to prevent
    # multiple spaces in the output, then lowercase everything.
    tokens = [t.strip(u' ') for t in tokens]
    tokens = [t.lower() for t in tokens if t != u'']
    return ' '.join(tokens)
def tokenize(self, sentence):
    """ Tokenize the given sentence.
        You can also pass a generic text, but you will lose the sentence
        segmentation.

        :param str sentence: a natural language sentence or text to be tokenized
        :return: the list of tokens
        :rtype: list
    """
    pieces = regex.split(self.tokenization_regex, unicode(sentence))
    logger.debug("'%s' tokenized into %s using regex %s" % (sentence, pieces, self.tokenization_regex))
    # Empty tokens are produced by adjacent delimiters; drop them.
    result = [piece for piece in pieces if piece]
    return result
def test_re_split(self):
    """Exercise regex.split()/splititer(): plain patterns, capturing and
    non-capturing groups, zero-width matches, reverse mode (?r), and the
    V1 word/start/end boundaries."""
    self.assertEqual(regex.split(":", ":a:b::c"), ['', 'a', 'b', '', 'c'])
    # A pattern that can match empty still consumes the '::' run here.
    self.assertEqual(regex.split(":*", ":a:b::c"), ['', 'a', 'b', 'c'])
    # Capturing groups are interleaved into the result.
    self.assertEqual(regex.split("(:*)", ":a:b::c"),
        ['', ':', 'a', ':', 'b', '::', 'c'])
    self.assertEqual(regex.split("(?::*)", ":a:b::c"), ['', 'a', 'b', 'c'])
    self.assertEqual(regex.split("(:)*", ":a:b::c"),
        ['', ':', 'a', ':', 'b', ':', 'c'])
    self.assertEqual(regex.split("([b:]+)", ":a:b::c"),
        ['', ':', 'a', ':b::', 'c'])
    # Unmatched alternative groups appear as None.
    self.assertEqual(regex.split("(b)|(:+)", ":a:b::c"),
        ['', None, ':', 'a', None, ':', '', 'b', None, '', None, '::',
         'c'])
    self.assertEqual(regex.split("(?:b)|(?::+)", ":a:b::c"),
        ['', 'a', '', '', 'c'])
    self.assertEqual(regex.split("x", "xaxbxc"), ['', 'a', 'b', 'c'])
    self.assertEqual([m for m in regex.splititer("x", "xaxbxc")],
        ['', 'a', 'b', 'c'])
    # (?r) searches from the end of the string backwards.
    self.assertEqual(regex.split("(?r)x", "xaxbxc"), ['c', 'b', 'a', ''])
    self.assertEqual([m for m in regex.splititer("(?r)x", "xaxbxc")],
        ['c', 'b', 'a', ''])
    self.assertEqual(regex.split("(x)|(y)", "xaxbxc"),
        ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c'])
    self.assertEqual([m for m in regex.splititer("(x)|(y)", "xaxbxc")],
        ['', 'x', None, 'a', 'x', None, 'b', 'x', None, 'c'])
    self.assertEqual(regex.split("(?r)(x)|(y)", "xaxbxc"),
        ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, ''])
    self.assertEqual([m for m in regex.splititer("(?r)(x)|(y)", "xaxbxc")],
        ['c', 'x', None, 'b', 'x', None, 'a', 'x', None, ''])
    # V1 behaviour allows splitting on zero-width matches; \m and \M are
    # the start-of-word and end-of-word boundaries.
    self.assertEqual(regex.split(r"(?V1)\b", "a b c"),
        ['', 'a', ' ', 'b', ' ', 'c', ''])
    self.assertEqual(regex.split(r"(?V1)\m", "a b c"),
        ['', 'a ', 'b ', 'c'])
    self.assertEqual(regex.split(r"(?V1)\M", "a b c"),
        ['a', ' b', ' c', ''])
def test_qualified_re_split(self):
    """regex.split() with maxsplit=2 must stop after two splits, with or
    without capturing groups."""
    cases = [
        (":", ":a:b::c", ['', 'a', 'b::c']),
        (':', 'a:b:c:d', ['a', 'b', 'c:d']),
        ("(:)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
        ("(:*)", ":a:b::c", ['', ':', 'a', ':', 'b::c']),
    ]
    for pattern, string, expected in cases:
        self.assertEqual(regex.split(pattern, string, 2), expected)
def test_bug_931848(self):
    """A compiled pattern's split() must work with a character class of
    ASCII and fullwidth/ideographic full stops (issue 931848)."""
    dots = u"[\u002E\u3002\uFF0E\uFF61]"
    splitter = regex.compile(dots)
    self.assertEqual(splitter.split("a.b.c"), ['a', 'b', 'c'])
def test_word_class(self):
    """Unicode word classes and boundaries on Devanagari text: the
    sample string is a space, the word "Hindi" in Devanagari, and a
    trailing comma."""
    # \w+ must capture the full Devanagari sequence as one word.
    self.assertEqual(regex.findall(ur"(?u)\w+",
        u" \u0939\u093f\u0928\u094d\u0926\u0940,"),
        [u'\u0939\u093f\u0928\u094d\u0926\u0940'])
    self.assertEqual(regex.findall(ur"(?u)\W+",
        u" \u0939\u093f\u0928\u094d\u0926\u0940,"), [u' ', u','])
    # Splitting on the word boundary isolates the word from the
    # surrounding space and comma.
    self.assertEqual(regex.split(ur"(?uV1)\b",
        u" \u0939\u093f\u0928\u094d\u0926\u0940,"),
        [u' ', u'\u0939\u093f\u0928\u094d\u0926\u0940', u','])
    # \B splits at every non-boundary position, i.e. between the
    # individual code points inside the word.
    self.assertEqual(regex.split(ur"(?uV1)\B",
        u" \u0939\u093f\u0928\u094d\u0926\u0940,"),
        [u'', u' \u0939', u'\u093f', u'\u0928', u'\u094d', u'\u0926',
         u'\u0940,', u''])
def test_zerowidth(self):
    """Zero-width match handling in split/findall/finditer: V0 refuses
    zero-width splits while V1 performs them, in both forward and
    reverse (?r) direction."""
    # Issue 3262.
    self.assertEqual(regex.split(r"\b", "a b"), ['a b'])
    self.assertEqual(regex.split(r"(?V1)\b", "a b"),
        ['', 'a', ' ', 'b', ''])
    # Issue 1647489: an alternation where one branch matches empty must
    # still yield the zero-width match at the anchor.
    self.assertEqual(regex.findall(r"^|\w+", "foo bar"),
        ['', 'foo', 'bar'])
    self.assertEqual([m[0] for m in regex.finditer(r"^|\w+", "foo bar")],
        ['', 'foo', 'bar'])
    self.assertEqual(regex.findall(r"(?r)^|\w+", "foo bar"),
        ['bar', 'foo', ''])
    self.assertEqual([m[0] for m in regex.finditer(r"(?r)^|\w+",
        "foo bar")], ['bar', 'foo', ''])
    self.assertEqual(regex.findall(r"(?V1)^|\w+", "foo bar"),
        ['', 'foo', 'bar'])
    self.assertEqual([m[0] for m in regex.finditer(r"(?V1)^|\w+",
        "foo bar")], ['', 'foo', 'bar'])
    self.assertEqual(regex.findall(r"(?rV1)^|\w+", "foo bar"),
        ['bar', 'foo', ''])
    self.assertEqual([m[0] for m in regex.finditer(r"(?rV1)^|\w+",
        "foo bar")], ['bar', 'foo', ''])
    # An empty pattern: V0 performs no splits at all; V1 splits between
    # every character.
    self.assertEqual(regex.split("", "xaxbxc"), ['xaxbxc'])
    self.assertEqual([m for m in regex.splititer("", "xaxbxc")],
        ['xaxbxc'])
    self.assertEqual(regex.split("(?r)", "xaxbxc"), ['xaxbxc'])
    self.assertEqual([m for m in regex.splititer("(?r)", "xaxbxc")],
        ['xaxbxc'])
    self.assertEqual(regex.split("(?V1)", "xaxbxc"),
        ['', 'x', 'a', 'x', 'b', 'x', 'c', ''])
    self.assertEqual([m for m in regex.splititer("(?V1)", "xaxbxc")],
        ['', 'x', 'a', 'x', 'b', 'x', 'c', ''])
    self.assertEqual(regex.split("(?rV1)", "xaxbxc"),
        ['', 'c', 'x', 'b', 'x', 'a', 'x', ''])
    self.assertEqual([m for m in regex.splititer("(?rV1)", "xaxbxc")],
        ['', 'c', 'x', 'b', 'x', 'a', 'x', ''])
def split(pattern, string, maxsplit=0, flags=0, concurrent=None, **kwargs):
    """Wrapper around `regex.split` that first pre-processes `pattern`
    with `_apply_search_backrefs`."""
    prepared = _apply_search_backrefs(pattern, flags)
    return regex.split(prepared, string, maxsplit, flags, concurrent,
                       **kwargs)
def splitWord(str):
    """Split a camelCase/PascalCase identifier into its component words.

    Non-alphabetic characters are stripped first; the cleaned string is
    then split at lower->upper transitions and at acronym boundaries
    (e.g. "HTTPRequest" -> "HTTP", "Request").

    :param str: the identifier to split (the name shadows the builtin;
        kept unchanged for interface compatibility)
    :return: list of word fragments (a single-element list when there is
        no boundary; [''] for an empty/fully-stripped input)
    """
    # Drop everything that is not an ASCII letter.
    text = re.sub("[^A-Za-z]", "", str)
    # Bug fix: the original passed flags=re.V1, but the stdlib `re`
    # module has no V1 flag (it belongs to the third-party `regex`
    # module), so every call raised AttributeError.  The pattern uses
    # only standard lookarounds, so no flag is needed.  Zero-width
    # splits require Python 3.7+ in `re`.
    words = re.split(
        r'(?<=[a-z])(?=[A-Z])'        # camelCase boundary: word|Word
        r'|(?<=[A-Z])(?=[A-Z][a-z])'  # acronym boundary: HTTP|Server
        r'|(?<=[a-z]A)(?=[A-Z])',     # NOTE(review): likely a typo in the
                                      # original pattern; kept as-is for
                                      # compatibility (splits e.g. aA|B)
        text)
    return words
def splitting_function(self, instring):
    """Split `instring` with the configured regex, dropping empty
    tokens; optionally bracket the result with terminal markers."""
    pieces = [piece for piece in self.regex.split(instring)
              if piece != '']
    if not self.add_terminal_tokens:
        return pieces
    return [c.TERMINAL] + pieces + [c.TERMINAL]
def request_5752me(url='https://wget.5752.me/Computer/soft/socks5%E4%BB%A3%E7%90%86/%E5%85%8D%E8%B4%B9ss%E8%B4%A6%E5%8F%B7.html'):
    # Scrape free proxy-server entries from the 5752.me page.
    # Returns (servers, info): a list of server dicts and a site-info dict;
    # on any failure, ([], info-with-error-message).
    #
    # NOTE(review): the '?'-runs in the string literals below look like
    # mojibake from a lossy extraction of the original (presumably Chinese,
    # GB2312-encoded) source.  In particular the 'IP??' marker used to
    # detect valid lines may no longer match the live page -- verify
    # against the original project before relying on this code.
    print('req 5752...')
    servers = list()
    try:
        data = requests.get(url)
        # The page is GB2312-encoded; prefer an explicit decode and fall
        # back to requests' own text decoding.
        if 'IP??' in data.content.decode('gb2312'):
            data = data.content.decode('gb2312')
        elif 'IP??' in data.text:
            data = data.text
        else:
            raise Exception('???5752???' + url)
        info = {'message': '', 'name': '????', 'url': 'https://www.5752.me/'}
        # Entries are separated by <br/>; keep only lines carrying the
        # 'IP??' marker.
        data = data.split('<br/>')
        avail_data = list(filter(lambda x: 'IP??' in x, data))
        if len(avail_data) == 0:
            raise Exception('5752???????????' + '\n'.join(data))
        for i, server in enumerate(avail_data):
            servers.append(dict())
            servers[-1]['remarks'] = '???? {}'.format(i)
            # Each line alternates label/value tokens; [1::2] picks the
            # values in (server, password, port, method) order.
            (
                servers[-1]['server'],
                servers[-1]['password'],
                servers[-1]['server_port'],
                servers[-1]['method']) = server.split()[1::2]
    except Exception as e:
        # Best-effort scraper: log and return an empty result with the
        # error message instead of raising.
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
def request_nobey(url='https://raw.githubusercontent.com/NoBey/Shadowsocks-free/master/README.md'):
    """Scrape free proxy-server entries from the NoBey README.

    The README is split on '##'/'---' section markers; sections 2 and 4
    hold the server tables.  Returns (servers, info); on any failure,
    ([], info-with-error-message).
    """
    # Removed: a dead inner helper `strip_dot` whose body was a bare
    # `return` (always None) and which was never called.
    print('req nobey...')
    servers = list()
    try:
        data = re.split('##+|---+', requests.get(url).text)[2:5:2]
        info = {'message': '', 'name': 'NoBey',
                'url': 'https://github.com/NoBey/Shadowsocks-free'}
        for i, server in enumerate(data):
            server = server.split('\n')
            name = server[0].strip()
            # Lines 1..5 of each section are markdown bullet lines:
            # "- label `v1` `v2` ...".  Strip the bullet and backticks and
            # keep the value tokens.  (Renamed the lambda parameter so it
            # no longer shadows the outer `server`.)
            (
                ips,
                ports,
                _,
                method,
                password) = list(map(
                    lambda line: list(map(
                        lambda x: x.strip().strip('`').strip(),
                        line.strip('-').strip().split()[1:])),
                    server[1:6]))
            method = method[0]
            password = password[0]
            # Every ip/port combination becomes one server entry.
            for j, ip in enumerate(ips):
                for k, port in enumerate(ports):
                    servers.append(dict())
                    servers[-1]['remarks'] = 'NoBey {}-{}-{}'.format(name, j, k)
                    (
                        servers[-1]['server'],
                        servers[-1]['password'],
                        servers[-1]['server_port'],
                        servers[-1]['method']) = (ip, password, port, method)
    except Exception as e:
        # Best-effort scraper: log and return an empty result with the
        # error message instead of raising.
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
def request_xiaoshuang(url='https://xsjs.yhyhd.org/free-ss'):
    # Scrape free proxy-server entries from the xsjs.yhyhd.org page.
    # Returns (servers, info); on any failure, ([], info-with-error-message).
    #
    # NOTE(review): the '??' runs in the literals below look like mojibake
    # from a lossy extraction (originally Chinese text) -- verify the
    # remark strings against the original project.
    print('req xcud...')
    try:
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')
        # Server blocks live in <div id="ss-body">, separated by blank lines.
        data = soup.find('div', attrs={'id': 'ss-body'})
        data = data.text.strip().split('\n\n\n')
        # The first line of the first block is a site-wide message.
        info = {'message': data[0].split('\n')[0], 'name': '????', 'url': url}
        data[0] = data[0].split('\n', maxsplit=1)[-1]
        servers = list()
        for server in data:
            # Positional parsing of each block's lines -- fragile against
            # any layout change on the page:
            #   line 0: remark, line 1: host/port, line 2: method/password,
            #   line 3: protocol/obfs.
            server_data = server.strip().split('\n')
            servers.append(dict())
            servers[-1]['remarks'] = '??{}'.format(server_data[0]).strip()
            servers[-1]['server'] = server_data[1].split()[1].strip()
            servers[-1]['server_port'] = server_data[1].split()[3].strip()
            servers[-1]['password'] = server_data[2].split()[3].strip()
            servers[-1]['method'] = server_data[2].split()[1].strip()
            servers[-1]['ssr_protocol'] = server_data[3].split()[1].split(':')[1].strip()
            servers[-1]['obfs'] = server_data[3].split()[2].split(':')[1].strip()
    except Exception as e:
        # Best-effort scraper: log and return an empty result with the
        # error message instead of raising.
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info

# this cannot be used for now
def request_iss(url='http://ss.ishadowx.com/'):
    """Scrape free proxy-server entries from ss.ishadowx.com.

    Returns (servers, info); if the page cannot be fetched or parsed,
    ([], info-with-error-message).  Individual malformed entries are
    logged and skipped rather than aborting the whole scrape.
    """
    print('req iss...')
    try:
        data = requests.get(url)
        soup = BeautifulSoup(data.text, 'html.parser')
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    try:
        info = {
            'message': soup.find('div', attrs={'id': 'portfolio'}).find('div', attrs={'class': 'section-title text-center center'}).text,
            'name': 'ishadowx',
            'url': url}
        # Removed: a dead triple-quoted string (commented-out field
        # assignments) and a redundant second BeautifulSoup parse of the
        # same data.text.
        server_data = soup.find_all('div', attrs={'class': 'hover-text'})
        servers = list()
    except Exception as e:
        logging.exception(e, stack_info=True)
        return [], {'message': str(e), 'url': '', 'name': ''}
    for i, server in enumerate(server_data):
        try:
            # NOTE(review): the entry is appended before parsing, so a
            # partially-filled dict stays in `servers` when a field below
            # raises; preserved from the original behaviour.
            servers.append(dict())
            # Bug fix (latent): the original reassigned `server_data`
            # here, clobbering the list being iterated; renamed to
            # `fields`.
            fields = server.text.strip().split('\n')
            servers[-1]['server'] = fields[0].split(':')[-1].strip()
            servers[-1]['server_port'] = re.findall(r'\d+', fields[1])[0]
            servers[-1]['remarks'] = ' '.join(['ss.ishadowx.com', str(i)])
            servers[-1]['password'] = fields[2].split(':')[-1].strip()
            servers[-1]['method'] = fields[3].split(':')[-1].strip()
            # Line 4 holds "protocol obfs" for SSR entries; plain SS
            # entries show a QR code marker instead.
            if 'QR' not in fields[4]:
                servers[-1]['ssr_protocol'], servers[-1]['obfs'] = fields[4].strip().split(maxsplit=1)
                servers[-1]['remarks'] = ' '.join([servers[-1]['remarks'], 'SSR'])
        except Exception as e:
            logging.exception(e, stack_info=True)
    return servers, info