The code examples below, extracted from open-source Python projects, illustrate how to use regex.search().
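Before the project snippets, here is a minimal, self-contained sketch of the basic call (the pattern and test string are illustrative, not taken from any of the projects below):

import regex

# regex.search() scans the whole string and returns a Match object for the
# first occurrence of the pattern, or None if nothing matches.
m = regex.search(r'(?P<word>\w+)\s+(?P=word)', 'one two two three')
if m:
    print(m.span())          # (4, 11)
    print(m.group('word'))   # 'two'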
def parse_primary_source(text):
    """Given a primary source text reference, return the abbreviation of
    the primary source.
    """
    ref_regex = regex.compile(r"""
        (?P<psource>(\w+\.?\s?)+)
        (\(?\d\)\s?)+
        """, regex.VERBOSE)
    res = regex.search(ref_regex, text)
    if res:
        if res.group('psource'):
            return res.group('psource')
        else:
            return None
    else:
        return None
def __exit__(self, exc_type, exc_value, tb):
    if exc_type is None:
        try:
            exc_name = self.expected.__name__
        except AttributeError:
            exc_name = str(self.expected)
        raise self.failureException(
            "%s not raised" % exc_name)
    if not issubclass(exc_type, self.expected):
        # let unexpected exceptions pass through
        return False
    self.exception = exc_value  # store for later retrieval
    if self.expected_regexp is None:
        return True

    expected_regexp = self.expected_regexp
    if isinstance(expected_regexp, basestring):
        expected_regexp = re.compile(expected_regexp)
    if not expected_regexp.search(str(exc_value)):
        raise self.failureException('"%s" does not match "%s"' %
            (expected_regexp.pattern, str(exc_value)))
    return True
def test_scoped_and_inline_flags(self):
    # Issues 433028, 433024, 433027.
    self.assertEqual(regex.search(r"(?i)Ab", "ab").span(), (0, 2))
    self.assertEqual(regex.search(r"(?i:A)b", "ab").span(), (0, 2))
    self.assertEqual(regex.search(r"A(?i)b", "ab").span(), (0, 2))
    self.assertEqual(regex.search(r"A(?iV1)b", "ab"), None)

    self.assertRaisesRegex(regex.error, self.CANT_TURN_OFF, lambda:
        regex.search(r"(?V0-i)Ab", "ab", flags=regex.I))

    self.assertEqual(regex.search(r"(?V0)Ab", "ab"), None)
    self.assertEqual(regex.search(r"(?V1)Ab", "ab"), None)
    self.assertEqual(regex.search(r"(?V1-i)Ab", "ab", flags=regex.I), None)
    self.assertEqual(regex.search(r"(?-i:A)b", "ab", flags=regex.I), None)
    self.assertEqual(regex.search(r"A(?V1-i)b", "ab",
        flags=regex.I).span(), (0, 2))
def test_captures(self):
    self.assertEqual(regex.search(r"(\w)+", "abc").captures(1),
        ['a', 'b', 'c'])
    self.assertEqual(regex.search(r"(\w{3})+", "abcdef").captures(0, 1),
        (['abcdef'], ['abc', 'def']))
    self.assertEqual(regex.search(r"^(\d{1,3})(?:\.(\d{1,3})){3}$",
        "192.168.0.1").captures(1, 2), (['192', ], ['168', '0', '1']))
    self.assertEqual(regex.match(r"^([0-9A-F]{2}){4} ([a-z]\d){5}$",
        "3FB52A0C a2c4g3k9d3").captures(1, 2),
        (['3F', 'B5', '2A', '0C'], ['a2', 'c4', 'g3', 'k9', 'd3']))
    self.assertEqual(regex.match("([a-z]W)([a-z]X)+([a-z]Y)",
        "aWbXcXdXeXfY").captures(1, 2, 3),
        (['aW'], ['bX', 'cX', 'dX', 'eX'], ['fY']))

    self.assertEqual(regex.search(r".*?(?=(.)+)b", "ab").captures(1),
        ['b'])
    self.assertEqual(regex.search(r".*?(?>(.){0,2})d", "abcd").captures(1),
        ['b', 'c'])
    self.assertEqual(regex.search(r"(.)+", "a").captures(1), ['a'])
def encode(self, names):
    """Encode list of names into list of list of character IDs using the
    character encoder.

    :param names: list of names
    :return: list (each name) of list (each word) of character IDs
    """
    name_id2word_id2char_ids = list()
    for name in names:
        name = self._clean_characters(name)
        word_id2char_ids = list()
        for word in name.split(self._separator):
            word = '{}{}{}'.format(self._start_char, word, self._end_char)
            try:
                word_id2char_ids.append(
                    self._label_encoder.transform(list(word)).tolist())
            except ValueError as exception:
                unseen_chars = regex.search(
                    r'y contains new labels: (.*)$',
                    exception.args[0]).groups()[0]
                raise UnseenCharacterException(
                    'Unseen characters: {}'.format(unseen_chars))
        name_id2word_id2char_ids.append(word_id2char_ids)
    return name_id2word_id2char_ids
def compute_gender_probas(dir_path, start_year):
    year_prefix = 'yob'
    name2gender2count = defaultdict(lambda: defaultdict(int))
    for file_path in glob(os.path.join(dir_path, '*.txt')):
        year = int(regex.search(r'/{}(\d\d\d\d)'.format(year_prefix),
                                file_path).groups()[0])
        if year < start_year:
            continue
        with open(file_path, encoding='utf8') as file_:
            csv_reader = csv.reader(file_)
            for name, gender, count in csv_reader:
                name2gender2count[name][_CLASS_MAP[gender]] += int(count)
    name2proba = dict()
    for name, gender2count in name2gender2count.items():
        name2proba[name] = float(gender2count[POSITIVE_CLASS]) / (
            gender2count[POSITIVE_CLASS] + gender2count[NEGATIVE_CLASS])
    return name2proba
def apply(self):
    """Apply search template."""
    i = RegexSearchTokens(self.search, self.verbose)
    iter(i)

    for t in i:
        if len(t) > 1:
            # handle our stuff
            c = t[1:]
            if c[0:1] in self._verbose_tokens:
                self.extended.append(t)
            elif c == self._quote:
                self.extended.extend(self.quoted(i))
            elif c != self._end:
                self.extended.append(t)
        elif self.verbose and t == self._hashtag and not self.in_group(i.index - 1):
            self.extended.append(t)
            self.extended.extend(self.comments(i))
        else:
            self.extended.append(t)

    return self._empty.join(self.extended)
def _apply_search_backrefs(pattern, flags=0):
    """Apply the search backrefs to the search pattern."""
    if isinstance(pattern, (compat.string_type, compat.binary_type)):
        re_verbose = VERBOSE & flags
        if flags & V0:
            re_version = V0
        elif flags & V1:
            re_version = V1
        else:
            re_version = 0
        pattern = RegexSearchTemplate(pattern, re_verbose, re_version).apply()
    elif isinstance(pattern, REGEX_TYPE):
        if flags:
            raise ValueError("Cannot process flags argument with a compiled pattern!")
    else:
        raise TypeError("Not a string or compiled pattern!")
    return pattern
def _read_rule(self, i, line):
    line = line.strip()
    if line:
        line = unicodedata.normalize('NFC', unicodedata.normalize('NFD', line))
        s = re.match(r'(?P<symbol>::\w+::)\s*=\s*(?P<value>.+)', line)
        if s:
            self.symbols[s.group('symbol')] = s.group('value')
        else:
            line = self._sub_symbols(line)
            r = re.match(r'(\S+)\s*->\s*(\S+)\s*/\s*(\S*)\s*[_]\s*(\S*)', line)
            try:
                a, b, X, Y = r.groups()
            except AttributeError:
                raise DatafileError('Line {}: "{}" cannot be parsed.'.format(i + 1, line))
            X, Y = X.replace('#', '^'), Y.replace('#', '$')
            a, b = a.replace('0', ''), b.replace('0', '')
            try:
                if re.search(r'[?]P[<]sw1[>].+[?]P[<]sw2[>]', a):
                    return self._fields_to_function_metathesis(a, X, Y)
                else:
                    return self._fields_to_function(a, b, X, Y)
            except Exception as e:
                raise DatafileError('Line {}: "{}" cannot be compiled as regex: ?{}'.format(i + 1, line, e))
def is_ichidan_verb(kanji, canonical_reading):
    """
    >>> is_ichidan_verb('?', '???')
    True
    >>> is_ichidan_verb('?', '??')
    False
    >>> is_ichidan_verb('?', '??')
    True
    >>> is_ichidan_verb('?', '??')
    False
    """
    if kanji in ICHIDAN_EXCEPTIONS:
        return False
    elif re.search(ICHIDAN_BASE_ENDING + '?$', canonical_reading):
        return True
    else:
        return False
@classmethod
def parse(cls, string):
    selectors = []
    combinator = None
    prev_selector = None

    while True:
        match = regex.search(cls.RE.comma, string)
        if match:
            # skip comma
            _, pos = match.span()
            string = string[pos:]
            continue

        match = regex.search(cls.RE.combinator, string)
        if match:
            _, pos = match.span()
            combinator = string[:pos].strip()
            string = string[pos:]
        else:
            combinator = None

        match = regex.search(cls.RE.selector, string)
        if match:
            _, pos = match.span()
            seltext = string[:pos]
            string = string[pos:]
            selector = cls(seltext, combinator=combinator)
            if combinator is not None and prev_selector:
                prev_selector.next_selector = prev_selector = selector
            else:
                prev_selector = selector
                selectors.append(selector)
            continue

        break

    return selectors
def test_search_star_plus(self):
    self.assertEqual(regex.search('a*', 'xxx').span(0), (0, 0))
    self.assertEqual(regex.search('x*', 'axx').span(), (0, 0))
    self.assertEqual(regex.search('x+', 'axx').span(0), (1, 3))
    self.assertEqual(regex.search('x+', 'axx').span(), (1, 3))
    self.assertEqual(regex.search('x', 'aaa'), None)
    self.assertEqual(regex.match('a*', 'xxx').span(0), (0, 0))
    self.assertEqual(regex.match('a*', 'xxx').span(), (0, 0))
    self.assertEqual(regex.match('x*', 'xxxa').span(0), (0, 3))
    self.assertEqual(regex.match('x*', 'xxxa').span(), (0, 3))
    self.assertEqual(regex.match('a+', 'xxx'), None)
def test_bug_1661(self):
    # Verify that flags do not get silently ignored with compiled patterns
    pattern = regex.compile('.')
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.match(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.search(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.findall(pattern, 'A', regex.I))
    self.assertRaisesRegex(ValueError, self.FLAGS_WITH_COMPILED_PAT,
        lambda: regex.compile(pattern, regex.I))
def test_bug_14462(self):
    # chr(255) is not a valid identifier in Python 2.
    group_name = u'\xFF'
    self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
        regex.search(ur'(?P<' + group_name + '>a)', u'a'))
def test_getattr(self):
    self.assertEqual(regex.compile("(?i)(a)(b)").pattern, '(?i)(a)(b)')
    self.assertEqual(regex.compile("(?i)(a)(b)").flags, regex.A | regex.I |
        regex.DEFAULT_VERSION)
    self.assertEqual(regex.compile(u"(?i)(a)(b)").flags, regex.I | regex.U |
        regex.DEFAULT_VERSION)
    self.assertEqual(regex.compile("(?i)(a)(b)").groups, 2)
    self.assertEqual(regex.compile("(?i)(a)(b)").groupindex, {})
    self.assertEqual(regex.compile("(?i)(?P<first>a)(?P<other>b)").groupindex,
        {'first': 1, 'other': 2})

    self.assertEqual(regex.match("(a)", "a").pos, 0)
    self.assertEqual(regex.match("(a)", "a").endpos, 1)
    self.assertEqual(regex.search("b(c)", "abcdef").pos, 0)
    self.assertEqual(regex.search("b(c)", "abcdef").endpos, 6)
    self.assertEqual(regex.search("b(c)", "abcdef").span(), (1, 3))
    self.assertEqual(regex.search("b(c)", "abcdef").span(1), (2, 3))
    self.assertEqual(regex.match("(a)", "a").string, 'a')
    self.assertEqual(regex.match("(a)", "a").regs, ((0, 1), (0, 1)))
    self.assertEqual(repr(type(regex.match("(a)", "a").re)),
        self.PATTERN_CLASS)

    # Issue 14260.
    p = regex.compile(r'abc(?P<n>def)')
    p.groupindex["n"] = 0
    self.assertEqual(p.groupindex["n"], 1)
def test_not_literal(self):
    self.assertEqual(regex.search(r"\s([^a])", " b")[1], 'b')
    self.assertEqual(regex.search(r"\s([^a]*)", " bb")[1], 'bb')
def test_search_coverage(self):
    self.assertEqual(regex.search(r"\s(b)", " b")[1], 'b')
    self.assertEqual(regex.search(r"a\s", "a ")[0], 'a ')
def test_bug_418626(self):
    # Bugs 418626 et al. -- Testing Greg Chapman's addition of op code
    # SRE_OP_MIN_REPEAT_ONE for eliminating recursion on simple uses of
    # pattern '*?' on a long string.
    self.assertEqual(regex.match('.*?c', 10000 * 'ab' + 'cd').end(0), 20001)
    self.assertEqual(regex.match('.*?cd', 5000 * 'ab' + 'c' + 5000 * 'ab' +
        'cde').end(0), 20003)
    self.assertEqual(regex.match('.*?cd', 20000 * 'abc' + 'de').end(0),
        60001)
    # Non-simple '*?' still used to hit the recursion limit, before the
    # non-recursive scheme was implemented.
    self.assertEqual(regex.search('(a|b)*?c', 10000 * 'ab' + 'cd').end(0),
        20001)
def test_bug_581080(self):
    it = regex.finditer(r"\s", "a b")
    self.assertEqual(it.next().span(), (1, 2))
    self.assertRaises(StopIteration, lambda: it.next())

    scanner = regex.compile(r"\s").scanner("a b")
    self.assertEqual(scanner.search().span(), (1, 2))
    self.assertEqual(scanner.search(), None)
def test_atomic(self):
    # Issue 433030.
    self.assertEqual(regex.search(r"(?>a*)a", "aa"), None)
def test_repeated_repeats(self):
    # Issue 2537.
    self.assertEqual(regex.search(r"(?:a+)+", "aaa").span(), (0, 3))
    self.assertEqual(regex.search(r"(?:(?:ab)+c)+", "abcabc").span(), (0, 6))
def test_line_boundary(self):
    self.assertEqual(regex.findall(r".+", "Line 1\nLine 2\n"),
        ["Line 1", "Line 2"])
    self.assertEqual(regex.findall(r".+", "Line 1\rLine 2\r"),
        ["Line 1\rLine 2\r"])
    self.assertEqual(regex.findall(r".+", "Line 1\r\nLine 2\r\n"),
        ["Line 1\r", "Line 2\r"])
    self.assertEqual(regex.findall(r"(?w).+", "Line 1\nLine 2\n"),
        ["Line 1", "Line 2"])
    self.assertEqual(regex.findall(r"(?w).+", "Line 1\rLine 2\r"),
        ["Line 1", "Line 2"])
    self.assertEqual(regex.findall(r"(?w).+", "Line 1\r\nLine 2\r\n"),
        ["Line 1", "Line 2"])

    self.assertEqual(regex.search(r"^abc", "abc").start(), 0)
    self.assertEqual(regex.search(r"^abc", "\nabc"), None)
    self.assertEqual(regex.search(r"^abc", "\rabc"), None)
    self.assertEqual(regex.search(r"(?w)^abc", "abc").start(), 0)
    self.assertEqual(regex.search(r"(?w)^abc", "\nabc"), None)
    self.assertEqual(regex.search(r"(?w)^abc", "\rabc"), None)

    self.assertEqual(regex.search(r"abc$", "abc").start(), 0)
    self.assertEqual(regex.search(r"abc$", "abc\n").start(), 0)
    self.assertEqual(regex.search(r"abc$", "abc\r"), None)
    self.assertEqual(regex.search(r"(?w)abc$", "abc").start(), 0)
    self.assertEqual(regex.search(r"(?w)abc$", "abc\n").start(), 0)
    self.assertEqual(regex.search(r"(?w)abc$", "abc\r").start(), 0)

    self.assertEqual(regex.search(r"(?m)^abc", "abc").start(), 0)
    self.assertEqual(regex.search(r"(?m)^abc", "\nabc").start(), 1)
    self.assertEqual(regex.search(r"(?m)^abc", "\rabc"), None)
    self.assertEqual(regex.search(r"(?mw)^abc", "abc").start(), 0)
    self.assertEqual(regex.search(r"(?mw)^abc", "\nabc").start(), 1)
    self.assertEqual(regex.search(r"(?mw)^abc", "\rabc").start(), 1)

    self.assertEqual(regex.search(r"(?m)abc$", "abc").start(), 0)
    self.assertEqual(regex.search(r"(?m)abc$", "abc\n").start(), 0)
    self.assertEqual(regex.search(r"(?m)abc$", "abc\r"), None)
    self.assertEqual(regex.search(r"(?mw)abc$", "abc").start(), 0)
    self.assertEqual(regex.search(r"(?mw)abc$", "abc\n").start(), 0)
    self.assertEqual(regex.search(r"(?mw)abc$", "abc\r").start(), 0)
def _filter(names, predictions, return_proba):
    """Filter bad results."""
    neutral_pred = ({POSITIVE_CLASS: 0.5, NEGATIVE_CLASS: 0.5}
                    if return_proba else NEUTRAL_CLASS)
    for name_id, name in enumerate(names):
        if not regex.search(r'\w', name):
            predictions[name_id] = copy(neutral_pred)
    return predictions
def compile_search(pattern, flags=0, **kwargs):
    """Compile with extended search references."""
    return regex.compile(_apply_search_backrefs(pattern, flags), flags, **kwargs)
def search(pattern, string, flags=0, pos=None, endpos=None, partial=False,
           concurrent=None, **kwargs):
    """Wrapper for `search`."""
    return regex.search(
        _apply_search_backrefs(pattern, flags), string,
        flags, pos, endpos, partial, concurrent, **kwargs
    )
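Judging from the _quote token and quoted() handling in the template class shown further down, this wrapper accepts backrefs-style \Q...\E literal quoting on top of the normal regex syntax. A minimal usage sketch under that assumption (the pattern and string are illustrative, not from the project):

result = search(r'Price: \Q$4.99\E', 'Price: $4.99')
# Assuming \Q...\E quotes the enclosed span before compiling,
# '$' and '.' are matched literally here.
print(result.span())  # (0, 12)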
def expand_iupac(barcode):
    '''Expand IUPAC codes, i.e. turn 'AY' into ['AC', 'AT']; also removes 'N's.'''
    barcode = barcode.upper()
    if all(i in 'ACGTN' for i in set(barcode)):
        return barcode.replace('N', '')
    else:
        pos = regex.search(r'[%s]' % ''.join(IUPAC_CODES.keys()), barcode).start()
        code = barcode[pos]
        return (expand_iupac(barcode.replace(code, i, 1))
                for i in IUPAC_CODES[code])
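expand_iupac() returns a plain string for an unambiguous barcode but a (possibly nested) generator when it recurses, so callers have to flatten the result. A minimal usage sketch, assuming a two-entry IUPAC_CODES mapping and a hypothetical flatten() helper (neither is from the original project):

# Assumed minimal mapping for illustration; the project's real table
# covers all IUPAC ambiguity codes.
IUPAC_CODES = {'Y': 'CT', 'R': 'AG'}

def flatten(result):
    # expand_iupac yields a string for unambiguous input and a nested
    # generator otherwise, so recurse until strings are reached.
    if isinstance(result, str):
        yield result
    else:
        for sub in result:
            yield from flatten(sub)

print(list(flatten(expand_iupac('AY'))))    # ['AC', 'AT']
print(list(flatten(expand_iupac('ACGN'))))  # ['ACG'] ('N' is dropped)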
def test_list_cmd(awsclient, capsys):
    tooldata = get_tooldata(
        awsclient, 'kumo', 'list',
        config_base_name='gcdt_large',
        location=here('./resources/simple_cloudformation_stack/'))
    list_cmd(**tooldata)
    out, err = capsys.readouterr()
    # using regular expression search in captured output
    assert regex.search(r'listed \d+ stacks', out) is not None
def tone_determ(text):
    text = unicodedata.normalize("NFD", text)
    match = re.search(tones, text)
    if match and match.group() in pinyin_tone.keys():
        return pinyin_tone[match.group()]
    return "5"
def pinyin_transform(text):
    if re.search("?", text):
        return ""
    text = re.sub(
        unicodedata.normalize("NFD", "ü"), "ü",
        re.sub(
            unicodedata.normalize("NFD", "ê"), "ê",
            unicodedata.normalize("NFD", text)
        )
    )
    if re.search(
            "[aeiouêü]" + tones + "[aeiou]?[aeiouêü]" + tones,
            text.lower()):
        return ""
    text = text.lower()
    if not re.search(tones, text) and re.match("[1-5]", text):
        return re.sub(r"(\d)(\p{Ll})", r"\1 \2", text)
    if re.search("[??,.?]", text):
        text = re.sub(
            "([??])$",
            lambda x: " y?" if x.group() == "?" else " bù",
            text
        )
        text = re.sub("([??])", r" \1 ", text)
        text = re.sub("([,.?])", r" \1 ", text)
        text = re.sub(" +", " ", text)
        text = re.sub("^ ", "", text)
        text = re.sub(" $", "", text)
        text = re.sub(r"\. \. \.", "...", text)
    text = re.sub(r"['\-]", " ", text)
    text = re.sub(
        "([aeiouêü]" + tones + "?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)",
        r"\1 \2", text
    )
    text = re.sub(" ([grn])$", r"\1", text)
    text = re.sub(" ([grn]) ", r"\1 ", text)
    return unicodedata.normalize("NFC", text)
def is_monosyllabic(word):
    return not re.search("[" + vowels + "].*[" + vowels + "]", word)


# Apply transformations to the Cyrillic to more closely match pronunciation.
# Return two arguments: the "original" text (after decomposing composed
# grave characters), and the transformed text. If the two are different,
# {{ru-IPA}} should display a "phonetic respelling" notation.
# NOADJ disables special-casing for adjectives in -го, while FORCEADJ forces
# special-casing for adjectives, including those in -аго (pre-reform spelling)
# and disables checking for exceptions (e.g. много, ого). NOSHTO disables
# special-casing for что and related words.
def _sub_symbols(self, line):
    while re.search(r'::\w+::', line):
        s = re.search(r'::\w+::', line).group(0)
        if s in self.symbols:
            line = line.replace(s, self.symbols[s])
        else:
            raise RuleFileError('Undefined symbol: {}'.format(s))
    return line
def ids_and_clean_visible_from_streamcorpus_chunk_path(corpus_path):
    '''converts a streamcorpus.Chunk file into the structure that is
    passed by the search engine to find_soft_selectors
    '''
    ch = clean_html(clean_html.default_config)
    cv = clean_visible(clean_visible.default_config)
    ids_and_clean_visible = []
    for si in streamcorpus.Chunk(path=corpus_path):
        if not si.body.clean_visible:
            ## attempt to make clean_visible
            if not si.body.raw:
                logger.critical('no raw content, so skipping: %r', si.abs_url)
                continue
            abs_url = si.abs_url
            si = ch(si, {})
            if not si:
                logger.critical(
                    'failed to make clean_html, so skipping: %r', abs_url)
                continue
            si = cv(si, {})
            if not si or not si.body.clean_visible:
                logger.critical(
                    'failed to make clean_visible, so skipping: %r', abs_url)
                continue
        rec = (si.stream_id, si.body.clean_visible.decode('utf8'), {})
        ids_and_clean_visible.append(rec)
    return ids_and_clean_visible
def parse_arguments(args):
    query = ArgumentParser(prog="poirot",
                           description="""Poirot: Mind Your Language""")
    query.add_argument("--url", "-u", dest="url", default="", action="store",
                       help="""The repository's git URL,
                       e.g. 'https://github.com/dcgov/poirot.git'.""")
    query.add_argument("--dir", "-d", dest="dir", default=os.getcwd(),
                       help="""The path to the local directory where the git
                       repo is located or should be stored; defaults to the
                       current directory.""")
    query.add_argument("--term", "-t", dest="term", required=False,
                       action="store",
                       help="""A single string or regular expression to
                       search for.""")
    query.add_argument("--patterns", "-p", dest="patterns", action="store",
                       help="""The path to the local file(s) containing
                       strings or regular expressions to match against, each
                       on a new line. Accepts a comma-separated list of file
                       paths.""")
    query.add_argument("--output", "-o", dest="output", required=False,
                       help="""Output results as JSON to FILE.""")
    query.add_argument("--revlist", "-rl", dest="revlist", required=False,
                       default="HEAD^!",
                       help="""A comma-delimited list of revision (commit)
                       ranges to search. Defaults to HEAD^!. Specify 'all'
                       to search the entire revision history.""")
    query.add_argument("--before", "-b", dest="before", required=False,
                       help="""Search commits prior to a given date,
                       e.g., Dec-12-2015""")
    query.add_argument("--after", "-a", dest="after", required=False,
                       help="""Search commits after a given date,
                       e.g., Jan-01-2015""")
    query.add_argument("--author", "-au", dest="author", required=False,
                       help="""Restrict to commits made by an AUTHOR.
                       An email address is fine.""")
    query.add_argument("--staged", "-st", dest="staged", action="store_true",
                       help="""Flag to search staged modifications, instead
                       of already committed ones.""")
    query.add_argument("--verbose", "-v", dest="verbose", action="store_true",
                       help="""Flag to output colorful, verbose results.""")
    parsed_args = query.parse_args(args)
    formatted_args = format_arguments(parsed_args)
    return formatted_args
def parse_patterns(path):
    """Reads in patterns from pattern file at path."""
    result = {}
    try:
        if regex.search(r"^http[s]?://", path):
            response = requests.get(path)
            if response.status_code == 200:
                lines = response.text.split("\n")
            else:
                sys.exit(1)
        else:
            with open(path) as infile:
                lines = infile.readlines()
        label = None
        for line in lines:
            line = str(line).strip()
            if line.startswith("#"):
                label = line.lstrip("# ")
            elif not line:
                label = ""
            else:
                result[line] = label
    except:
        out = """Pattern file {file} does not exist.\n
              Specify the correct path with --patterns""".format(file=path)
        print(style(out, "red"))
    return result
def test_okurigana_delimit(self):
    """Simple test to look for suspicious non-delimited readings."""
    for k in joyodb.loaded_data.kanjis:
        for r in filter(lambda r: r.kind == 'Kun', k.readings):
            examples = [e.example for e in r.examples]
            for e in examples:
                match = re.search(k.kanji + r"(\p{Hiragana}+)", e)
                if match and re.search(match[1] + '$', r.reading):
                    self.assertIn('.', r.reading)
def __init__(self, name, combinator=None):
    self.name = name
    self.combinator = combinator
    self.next_selector = None

    selector_patterns = {
        'types': self.RE.type_selector,
        'ids': self.RE.id_selector,
        'classes': self.RE.class_selector,
        'pseudos': self.RE.pseudo_selector,
        'attrs': self.RE.attr_selector,
    }

    matches = {}
    while True:
        pattern_matched = False
        for key, pattern in selector_patterns.items():
            match = regex.search(r'^{}'.format(pattern), name)
            if match:
                i, pos = match.span()
                if key not in matches:
                    matches[key] = []
                matches[key].append(match.groups())
                name = name[pos:]
                pattern_matched = True
        if not pattern_matched:
            break

    self.typ = None
    for types in matches.pop('types', []):
        self.typ = types[0]

    self.id_ = None
    for ids in matches.pop('ids', []):
        self.id_ = ids[0]

    self.classes = [a[0] for a in matches.pop('classes', [])]
    self.attrs = [
        Attr(l, o, r.strip())
        for l, o, r in matches.pop('attrs', [])
    ]
    self.pseudos = [
        Pseudo(*a[1:])
        for a in matches.pop('pseudos', [])
    ]
def test_lookbehind(self):
    self.assertEqual(regex.search(r"123(?<=a\d+)", "a123").span(), (1, 4))
    self.assertEqual(regex.search(r"123(?<=a\d+)", "b123"), None)
    self.assertEqual(regex.search(r"123(?<!a\d+)", "a123"), None)
    self.assertEqual(regex.search(r"123(?<!a\d+)", "b123").span(), (1, 4))

    self.assertEqual(bool(regex.match("(a)b(?<=b)(c)", "abc")), True)
    self.assertEqual(regex.match("(a)b(?<=c)(c)", "abc"), None)
    self.assertEqual(bool(regex.match("(a)b(?=c)(c)", "abc")), True)
    self.assertEqual(regex.match("(a)b(?=b)(c)", "abc"), None)

    self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(2)x|c))c", "abc"), None)
    self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(2)b|x))c", "abc"), None)
    self.assertEqual(bool(regex.match("(?:(a)|(x))b(?<=(?(2)x|b))c", "abc")),
        True)
    self.assertEqual(regex.match("(?:(a)|(x))b(?<=(?(1)c|x))c", "abc"), None)
    self.assertEqual(bool(regex.match("(?:(a)|(x))b(?<=(?(1)b|x))c", "abc")),
        True)

    self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(2)x|c))c", "abc")),
        True)
    self.assertEqual(regex.match("(?:(a)|(x))b(?=(?(2)c|x))c", "abc"), None)
    self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(2)x|c))c", "abc")),
        True)
    self.assertEqual(regex.match("(?:(a)|(x))b(?=(?(1)b|x))c", "abc"), None)
    self.assertEqual(bool(regex.match("(?:(a)|(x))b(?=(?(1)c|x))c", "abc")),
        True)

    self.assertEqual(regex.match("(a)b(?<=(?(2)x|c))(c)", "abc"), None)
    self.assertEqual(regex.match("(a)b(?<=(?(2)b|x))(c)", "abc"), None)
    self.assertEqual(regex.match("(a)b(?<=(?(1)c|x))(c)", "abc"), None)
    self.assertEqual(bool(regex.match("(a)b(?<=(?(1)b|x))(c)", "abc")), True)

    self.assertEqual(bool(regex.match("(a)b(?=(?(2)x|c))(c)", "abc")), True)
    self.assertEqual(regex.match("(a)b(?=(?(2)b|x))(c)", "abc"), None)
    self.assertEqual(bool(regex.match("(a)b(?=(?(1)c|x))(c)", "abc")), True)

    self.assertEqual(repr(type(regex.compile(r"(a)\2(b)"))),
        self.PATTERN_CLASS)
def test_named_lists(self):
    options = [u"one", u"two", u"three"]
    self.assertEqual(regex.match(ur"333\L<bar>444", u"333one444",
        bar=options).group(), u"333one444")
    self.assertEqual(regex.match(ur"(?i)333\L<bar>444", u"333TWO444",
        bar=options).group(), u"333TWO444")
    self.assertEqual(regex.match(ur"333\L<bar>444", u"333four444",
        bar=options), None)

    options = ["one", "two", "three"]
    self.assertEqual(regex.match(r"333\L<bar>444", "333one444",
        bar=options).group(), "333one444")
    self.assertEqual(regex.match(r"(?i)333\L<bar>444", "333TWO444",
        bar=options).group(), "333TWO444")
    self.assertEqual(regex.match(r"333\L<bar>444", "333four444",
        bar=options), None)

    self.assertEqual(repr(type(regex.compile(r"3\L<bar>4\L<bar>+5",
        bar=["one", "two", "three"]))), self.PATTERN_CLASS)

    self.assertEqual(regex.findall(r"^\L<options>", "solid QWERT",
        options=set(['good', 'brilliant', '+s\\ol[i}d'])), [])
    self.assertEqual(regex.findall(r"^\L<options>", "+solid QWERT",
        options=set(['good', 'brilliant', '+solid'])), ['+solid'])

    options = [u"STRASSE"]
    self.assertEqual(regex.match(ur"(?fiu)\L<words>",
        u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(),
        (0, 6))

    options = [u"STRASSE", u"stress"]
    self.assertEqual(regex.match(ur"(?fiu)\L<words>",
        u"stra\N{LATIN SMALL LETTER SHARP S}e", words=options).span(),
        (0, 6))

    options = [u"stra\N{LATIN SMALL LETTER SHARP S}e"]
    self.assertEqual(regex.match(ur"(?fiu)\L<words>", u"STRASSE",
        words=options).span(), (0, 7))

    options = ["kit"]
    self.assertEqual(regex.search(ur"(?iu)\L<words>", u"SKITS",
        words=options).span(), (1, 4))
    self.assertEqual(regex.search(ur"(?iu)\L<words>",
        u"SK\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}TS",
        words=options).span(), (1, 4))

    self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b",
        u" stra\N{LATIN SMALL LETTER SHARP S}e STRASSE ").span(), (1, 15))
    self.assertEqual(regex.search(ur"(?fiu)\b(\w+) +\1\b",
        u" STRASSE stra\N{LATIN SMALL LETTER SHARP S}e ").span(), (1, 15))

    self.assertEqual(regex.search(r"^\L<options>$", "",
        options=[]).span(), (0, 0))
def split_if_contraction(self, word):
    # Handle preposition+determiner contractions.
    word = regex.sub(ur'^(A|a)l$', ur'a el', word)
    word = regex.sub(ur'^(D|d)el$', ur'de el', word)

    # Before looking at clitic regexes, check if the word is in a blacklist.
    if word in self.non_contractions:
        return word

    # Before looking at clitic regexes, check if the word is in a whitelist.
    if word in self.contractions:
        return ' '.join(self.contractions[word])

    # Right now excludes capitalized words. Might fail if the word is at the
    # beginning of the sentence, but avoids catching a lot of proper nouns,
    # such as "Charles", "Bonaparte", etc.
    if regex.search(ur'^[^\p{IsLower}]', word) is not None:
        return word

    # Handle clitics.
    word = regex.sub(
        ur'(ar|ir|ír)(me|te|se|nos|le|lo|la|les|los|las)$',
        ur'\1 \2', word)
    word = regex.sub(
        ur'(er)(se|le|lo|la|les|los|las)$',
        ur'\1 \2', word)
    word = regex.sub(
        ur'á(ndo)(me|te|se|nos|os|le|lo|la|les|los|las)$',
        ur'a\1 \2', word)
    word = regex.sub(
        ur'é(ndo)(me|te|se|nos|os|le|lo|la|les|los|las)$',
        ur'e\1 \2', word)
    word = regex.sub(
        ur'í(ndo)(me|te|se|nos|os|le|lo|la|les|los|las)$',
        ur'i\1 \2', word)
    word = regex.sub(
        ur'á(r|ndo)(se)(me|te|nos|os|le|lo|la|les|los|las)$',
        ur'a\1 \2 \3', word)
    word = regex.sub(
        ur'é(r|ndo)(se)(me|te|nos|os|le|lo|la|les|los|las)$',
        ur'e\1 \2 \3', word)
    word = regex.sub(
        ur'í(r|ndo)(se)(me|te|nos|os|le|lo|la|les|los|las)$',
        ur'i\1 \2 \3', word)
    word = regex.sub(
        ur'á(r)(os)(le|lo|la|les|los|las)$',
        ur'a\1 \2 \3', word)
    word = regex.sub(
        ur'é(r)(os)(le|lo|la|les|los|las)$',
        ur'e\1 \2 \3', word)
    word = regex.sub(
        ur'í(r)(os)(le|lo|la|les|los|las)$',
        ur'i\1 \2 \3', word)

    # In AnCora, all contractions have two words only.
    word = ' '.join(word.split(' ')[:2])
    return word
def check_spaces(self, tokens, original_text):
    """Compare the tokens with the original text to see which tokens had
    trailing whitespace (to be able to annotate SpaceAfter=No) and which
    tokens contained internal whitespace (to be able to annotate
    OriginalSpelling="...").
    """
    extra_info = ["" for _ in tokens]
    normalized = self.spaces.sub(" ", original_text)
    normalized = self.junk_between_spaces.sub(" ", normalized)
    normalized = normalized.strip()
    for token_index, t in enumerate(tokens):
        original_spelling = None
        token = t.token
        token_length = len(token)
        if normalized.startswith(token):
            normalized = normalized[token_length:]
        else:
            orig = []
            for char in token:
                first_char = None
                while first_char != char:
                    try:
                        first_char = normalized[0]
                        normalized = normalized[1:]
                        orig.append(first_char)
                    except IndexError:
                        warnings.warn("IndexError in this paragraph: '%s'\nTokens: %s" % (original_text, tokens))
            original_spelling = "".join(orig)
        m = self.starts_with_junk.search(normalized)
        if m:
            if original_spelling is None:
                original_spelling = token
            original_spelling += normalized[:m.end()]
            normalized = normalized[m.end():]
        if original_spelling is not None:
            extra_info[token_index] = 'OriginalSpelling="%s"' % original_spelling
        if len(normalized) > 0:
            if normalized.startswith(" "):
                normalized = normalized[1:]
            else:
                if len(extra_info[token_index]) > 0:
                    extra_info[token_index] = ", " + extra_info[token_index]
                extra_info[token_index] = "SpaceAfter=No" + extra_info[token_index]
    try:
        assert len(normalized) == 0
    except AssertionError:
        warnings.warn("AssertionError in this paragraph: '%s'\nTokens: %s\nRemaining normalized text: '%s'" % (original_text, tokens, normalized))
    return extra_info
def __init__(self, search, re_verbose=False, re_version=0):
    """Initialize."""
    if isinstance(search, compat.binary_type):
        self.binary = True
        tokens = btokens
        ctokens = ctok.btokens
    else:
        self.binary = False
        tokens = utokens
        ctokens = ctok.utokens

    self._verbose_flag = ctokens["verbose_flag"]
    self._empty = ctokens["empty"]
    self._b_slash = ctokens["b_slash"]
    self._ls_bracket = ctokens["ls_bracket"]
    self._rs_bracket = ctokens["rs_bracket"]
    self._esc_end = ctokens["esc_end"]
    self._end = ctokens["end"]
    self._quote = ctokens["quote"]
    self._negate = ctokens["negate"]
    self._regex_flags = tokens["regex_flags"]
    self._nl = ctokens["nl"]
    self._hashtag = ctokens["hashtag"]
    self._V0 = tokens["v0"]
    self._V1 = tokens["v1"]
    self.search = search

    if regex.DEFAULT_VERSION == V0:
        self.groups, quotes = self.find_char_groups_v0(search)
    else:  # pragma: no cover
        self.groups, quotes = self.find_char_groups_v1(search)

    self.verbose, self.version = self.find_flags(search, quotes,
                                                 re_verbose, re_version)
    if self.version != regex.DEFAULT_VERSION:
        if self.version == V0:  # pragma: no cover
            self.groups = self.find_char_groups_v0(search)[0]
        else:
            self.groups = self.find_char_groups_v1(search)[0]

    if self.verbose:
        self._verbose_tokens = ctokens["verbose_tokens"]
    else:
        self._verbose_tokens = tuple()
    self.extended = []
def write_script(script, movie):
    script = script.split('\n')
    scenes = []
    characters = OrderedDict()
    bool = False
    first_scene = True
    scene_count = 1
    with open('./output/' + movie + '_results.txt', 'w') as f:
        final = []
        for line in script:
            result = re.search(r'((?:EXT|INT).+)', line)
            if result:
                if first_scene:
                    first_scene = False
                else:
                    if characters[scene]:
                        final.append(str(scene_count) + ':\t' + str(scene) +
                                     ':\t' + str(characters[scene]))
                        scene_count += 1
                    else:
                        final.append(str(scene_count) + ':\t' + str(scene) +
                                     ':\t' + 'None')
                        scene_count += 1
                bool = True
                scene = re.sub(r'\s+\d+', '', result.group(1))
                scene = re.sub(r'\r', '', scene)
                scenes.append(scene)
                characters[scene] = []
            elif bool:
                result2 = re.search(r'^\s+([A-Z]{2}.+)(?<![a-z]+)', line)
                if result2:
                    if ('!' not in result2.group(1)
                            and ',' not in result2.group(1)
                            and ' ...' not in result2.group(1)
                            and ' - ' not in result2.group(1)
                            and ':' not in result2.group(1)
                            and len(result2.group(1)) < 25
                            and 'FADE' not in result2.group(1)
                            and 'THE END' not in result2.group(1)):
                        character = re.sub(r'^\s+', '', result2.group(1))
                        character = re.sub(r'\r', '', character)
                        characters[scene].append(character)
        if characters[scene]:
            final.append(str(scene_count) + ':\t' + str(scene) + ':\t' +
                         str(characters[scene]))
            scene_count += 1
        else:
            final.append(str(scene_count) + ':\t' + str(scene) + ':\t' +
                         'None')
            scene_count += 1
        for line in final:
            f.write(line + '\n')