The following 49 code examples, extracted from open source Python projects, illustrate how to use regex.sub().
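Before the project code, here is a minimal sketch of the call (not taken from any of the projects below; the sample strings are invented for illustration). Like re.sub(), regex.sub() accepts either a replacement string or a callable, and the regex module additionally understands Unicode property classes such as \p{P}:

import regex

# Runs of Unicode punctuation or symbol characters are collapsed to one space.
cleaned = regex.sub(r"[\p{P}\p{S}]+", " ", "Hello, world! (2024)")

# The replacement may also be a callable that receives each match object.
bumped = regex.sub(r"\d+", lambda m: str(int(m.group()) + 1), "version 41")
# bumped == "version 42"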
def extract_video_titles(input_file, output_file):
    with open(input_file, encoding='utf-8') as file:
        titles = json.load(file)
    count = 0
    for video_id, title in titles.items():
        # Remove all punctuation and symbols in unicode
        title = re.sub(r'[\p{P}\p{S}]+', '', title)
        titles[video_id] = title.split(' ')
        count += 1
        print('{}: {}'.format(count, video_id), end='\r', flush=True)
    with open(output_file, 'w', encoding='utf-8') as file:
        json.dump(titles, file)
def test_bug_1140(self):
    # regex.sub(x, y, u'') should return u'', not '', and
    # regex.sub(x, y, '') should return '', not u''.
    # Also:
    # regex.sub(x, y, unicode(x)) should return unicode(y), and
    # regex.sub(x, y, str(x)) should return
    #     str(y) if isinstance(y, str) else unicode(y).
    for x in 'x', u'x':
        for y in 'y', u'y':
            z = regex.sub(x, y, u'')
            self.assertEqual((type(z), z), (unicode, u''))

            z = regex.sub(x, y, '')
            self.assertEqual((type(z), z), (str, ''))

            z = regex.sub(x, y, unicode(x))
            self.assertEqual((type(z), z), (unicode, unicode(y)))

            z = regex.sub(x, y, str(x))
            self.assertEqual((type(z), z), (type(y), y))
def test_symbolic_refs(self):
    self.assertRaisesRegex(regex.error, self.MISSING_GT, lambda:
      regex.sub('(?P<a>x)', r'\g<a', 'xx'))
    self.assertRaisesRegex(regex.error, self.MISSING_GROUP_NAME, lambda:
      regex.sub('(?P<a>x)', r'\g<', 'xx'))
    self.assertRaisesRegex(regex.error, self.MISSING_LT, lambda:
      regex.sub('(?P<a>x)', r'\g', 'xx'))
    self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
      regex.sub('(?P<a>x)', r'\g<a a>', 'xx'))
    self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
      regex.sub('(?P<a>x)', r'\g<1a1>', 'xx'))
    self.assertRaisesRegex(IndexError, self.UNKNOWN_GROUP_I, lambda:
      regex.sub('(?P<a>x)', r'\g<ab>', 'xx'))

    # The new behaviour of unmatched but valid groups is to treat them like
    # empty matches in the replacement template, like in Perl.
    self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\g<b>', 'xx'), '')
    self.assertEqual(regex.sub('(?P<a>x)|(?P<b>y)', r'\2', 'xx'), '')

    # The old behaviour was to raise it as an IndexError.
    self.assertRaisesRegex(regex.error, self.BAD_GROUP_NAME, lambda:
      regex.sub('(?P<a>x)', r'\g<-1>', 'xx'))
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def prepare(self):
    # log the full request and headers if the log level is set to debug
    if log.level == 10:
        log.debug("Preparing request: {} {}".format(self.request.method, self.request.path))
        for k, v in self.request.headers.items():
            log.debug("{}: {}".format(k, v))

    if 'X-Forwarded-Proto' in self.request.headers:
        proto = self.request.headers['X-Forwarded-Proto']
    else:
        proto = self.request.protocol

    if proto != 'https' and 'enforce_https' in self.application.config['general']:
        mode = self.application.config['general']['enforce_https']
        if mode == 'reject':
            self.set_status(404)
            self.finish()
        else:
            # default to redirect
            self.redirect(regex.sub(r'^([^:]+)', 'https', self.request.full_url()), permanent=True)
        return

    super().prepare()
def eval_expr(expr):
    import ast
    import operator as op
    op = {
        ast.Add: op.add,
        ast.Sub: op.sub,
        ast.Mult: op.mul,
        ast.Div: op.truediv,
        ast.Pow: op.pow,
        ast.BitXor: op.xor,
        ast.USub: op.neg,
    }

    def eval_(node):
        if isinstance(node, ast.Num):
            return fractions.Fraction(node.n)
        elif isinstance(node, ast.BinOp):
            return op[type(node.op)](eval_(node.left), eval_(node.right))
        elif isinstance(node, ast.UnaryOp):
            return op[type(node.op)](eval_(node.operand))
        raise TypeError(node)

    return eval_(ast.parse(str(expr), mode='eval').body)
def test_against_kanjidic(self):
    kanjidic_data = {}
    with open(kanjidic_file, 'rt') as f:
        for line in f:
            kanji, *fields = line.strip().split()
            if kanji in TestLoadedData.kanjis.keys():
                # kanjidic marks bound affixes with '-', but we don't
                fields = [re.sub('-$', '', f) for f in fields]
                fields = [re.sub('^-', '', f) for f in fields]
                kanjidic_data[kanji] = fields

    for kanji in joyodb.loaded_data.kanjis:
        for reading in kanji.readings:
            if reading.variation_of:
                continue  # variations are not in kanjidic
            if (kanji.kanji, reading.reading) not in KANJIDIC_MISSING_READINGS:
                self.assertIn(reading.reading, kanjidic_data[kanji.kanji])
def make_vocab(fpath, fname):
    '''Constructs vocabulary.

    Args:
      fpath: A string. Input file path.
      fname: A string. Output file name.

    Writes vocabulary line by line to `preprocessed/fname`
    '''
    text = codecs.open(fpath, 'r', 'utf-8').read()
    text = regex.sub("[^\s\p{Latin}']", "", text)
    words = text.split()
    word2cnt = Counter(words)
    if not os.path.exists('preprocessed'):
        os.mkdir('preprocessed')
    with codecs.open('preprocessed/{}'.format(fname), 'w', 'utf-8') as fout:
        fout.write("{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n{}\t1000000000\n".format("<PAD>", "<UNK>", "<S>", "</S>"))
        for word, cnt in word2cnt.most_common(len(word2cnt)):
            fout.write(u"{}\t{}\n".format(word, cnt))
def request_url(url, headers=None, name=''):
    print('req', url)
    data = set()
    servers = list()
    try:
        response = requests.get(url, headers=headers, verify=False).text
        data.update(map(lambda x: re.sub('\s', '', x), re.findall('ssr?://[a-zA-Z0-9=]+', response)))
        soup = BeautifulSoup(response, 'html.parser')
        title = soup.find('title').text

        info = {'message': '', 'url': url, 'name': str(title)}
        for i, server in enumerate(data):
            try:
                servers.append(parse(server, ' '.join([title, name, str(i)])))
            except Exception as e:
                logging.exception(e, stack_info=False)
                print('URL:', url, 'SERVER', server)
    except Exception as e:
        print(url)
        logging.exception(e, stack_info=False)
        return [], {'message': str(e), 'url': '', 'name': ''}
    return servers, info
def generate_char_list(string, strip_html=True):
    if strip_html:
        s = strip_html_tags(string.lower())
    else:
        s = string.lower()
    normalized_string = regex.sub(r'\s+', r' ', s)  # change any kind of whitespace to a single space
    list_norm_chars = regex.findall(r"\w|[?!'#@$:\"&*=,]", normalized_string)
    return list_norm_chars
def generate_word_list(string, strip_html=True):
    if strip_html:
        s = strip_html_tags(string.lower())
    else:
        s = string.lower()
    normalized_string = regex.sub(r"\s+", r' ', s)  # change any kind of whitespace to a single space
    # list of all words seen during training, including strings like '!!!', '??', '....',
    # as these repeated punctuation marks tend to imply more than their grammatical meaning
    list_normalized_string = regex.findall(r"\b\w+[']?\w*\b|\!+|\?+|\.{3,}", normalized_string)
    return list_normalized_string
def strip_html_tags(string, verbose=False):
    p = regex.compile(r'<.*?>')
    return p.sub(' ', string)
def remove_ansi_escape_sequences(text):
    # http://stackoverflow.com/questions/14693701/how-can-i-remove-the-ansi-escape-sequences-from-a-string-in-python
    # also clean up the line endings
    return regex.sub(r'(\x9b|\x1b\[)[0-?]*[ -\/]*[@-~]|\ *\r', '', text)


# helper for running sut as subprocess within pty
# does two things
#   * test app running in pty in subprocess
#   * get test coverage from subprocess
# docu:
# http://blog.fizyk.net.pl/blog/gathering-tests-coverage-for-subprocesses-in-python.html
def test_basic_regex_sub(self):
    self.assertEqual(regex.sub("(?i)b+", "x", "bbbb BBBB"), 'x x')
    self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y'),
      '9.3 -3 24x100y')
    self.assertEqual(regex.sub(r'\d+', self.bump_num, '08.2 -2 23x99y', 3),
      '9.3 -3 23x99y')

    self.assertEqual(regex.sub('.', lambda m: r"\n", 'x'), "\\n")
    self.assertEqual(regex.sub('.', r"\n", 'x'), "\n")

    self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<a>', 'xx'), 'xxxx')
    self.assertEqual(regex.sub('(?P<a>x)', r'\g<a>\g<1>', 'xx'), 'xxxx')
    self.assertEqual(regex.sub('(?P<unk>x)', r'\g<unk>\g<unk>', 'xx'), 'xxxx')
    self.assertEqual(regex.sub('(?P<unk>x)', r'\g<1>\g<1>', 'xx'), 'xxxx')

    self.assertEqual(regex.sub('a', r'\t\n\v\r\f\a\b\B\Z\a\A\w\W\s\S\d\D',
      'a'), "\t\n\v\r\f\a\b\\B\\Z\a\\A\\w\\W\\s\\S\\d\\D")
    self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), "\t\n\v\r\f\a")
    self.assertEqual(regex.sub('a', '\t\n\v\r\f\a', 'a'), chr(9) + chr(10) +
      chr(11) + chr(13) + chr(12) + chr(7))

    self.assertEqual(regex.sub(r'^\s*', 'X', 'test'), 'Xtest')

    self.assertEqual(regex.sub(ur"x", ur"\x0A", u"x"), u"\n")
    self.assertEqual(regex.sub(ur"x", ur"\u000A", u"x"), u"\n")
    self.assertEqual(regex.sub(ur"x", ur"\U0000000A", u"x"), u"\n")
    self.assertEqual(regex.sub(ur"x", ur"\N{LATIN CAPITAL LETTER A}", u"x"),
      u"A")

    self.assertEqual(regex.sub(r"x", r"\x0A", "x"), "\n")
    self.assertEqual(regex.sub(r"x", r"\u000A", "x"), "\\u000A")
    self.assertEqual(regex.sub(r"x", r"\U0000000A", "x"), "\\U0000000A")
    self.assertEqual(regex.sub(r"x", r"\N{LATIN CAPITAL LETTER A}", "x"),
      "\\N{LATIN CAPITAL LETTER A}")
def test_bug_449964(self):
    # Fails for group followed by other escape.
    self.assertEqual(regex.sub(r'(?P<unk>x)', r'\g<1>\g<1>\b', 'xx'),
      "xx\bxx\b")
def test_bug_449000(self):
    # Test for sub() on escaped characters.
    self.assertEqual(regex.sub(r'\r\n', r'\n', 'abc\r\ndef\r\n'),
      "abc\ndef\n")
    self.assertEqual(regex.sub('\r\n', r'\n', 'abc\r\ndef\r\n'),
      "abc\ndef\n")
    self.assertEqual(regex.sub(r'\r\n', '\n', 'abc\r\ndef\r\n'),
      "abc\ndef\n")
    self.assertEqual(regex.sub('\r\n', '\n', 'abc\r\ndef\r\n'),
      "abc\ndef\n")
def test_bug_114660(self):
    self.assertEqual(regex.sub(r'(\S)\s+(\S)', r'\1 \2', 'hello there'),
      'hello there')
def test_bug_462270(self):
    # Test for empty sub() behaviour, see SF bug #462270
    self.assertEqual(regex.sub('(?V0)x*', '-', 'abxd'), '-a-b-d-')
    self.assertEqual(regex.sub('(?V1)x*', '-', 'abxd'), '-a-b--d-')
    self.assertEqual(regex.sub('x+', '-', 'abxd'), 'ab-d')
def test_dollar_matches_twice(self):
    # $ matches the end of string, and just before the terminating \n.
    pattern = regex.compile('$')
    self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a\nb#\n#')
    self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a\nb\nc#')
    self.assertEqual(pattern.sub('#', '\n'), '#\n#')

    pattern = regex.compile('$', regex.MULTILINE)
    self.assertEqual(pattern.sub('#', 'a\nb\n'), 'a#\nb#\n#')
    self.assertEqual(pattern.sub('#', 'a\nb\nc'), 'a#\nb#\nc#')
    self.assertEqual(pattern.sub('#', '\n'), '#\n#')
def test_unmatched_in_sub(self):
    # Issue 1519638.
    self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "xy"), 'y-x')
    self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "xy"), 'y-x-')
    self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "x"), '-x')
    self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "x"), '-x-')
    self.assertEqual(regex.sub(r"(?V0)(x)?(y)?", r"\2-\1", "y"), 'y-')
    self.assertEqual(regex.sub(r"(?V1)(x)?(y)?", r"\2-\1", "y"), 'y--')
def split_if_contraction(self, word):
    # Handle preposition+determiner contractions.
    word = regex.sub(ur'^(A|a)u$', ur'à le', word)
    word = regex.sub(ur'^(A|a)uquel$', ur'à lequel', word)
    word = regex.sub(ur'^(A|a)ux$', ur'à les', word)
    word = regex.sub(ur'^(A|a)uxquels$', ur'à lesquels', word)
    word = regex.sub(ur'^(A|a)uxquelles$', ur'à lesquelles', word)
    word = regex.sub(ur'^(D|d)u$', ur'de le', word)
    word = regex.sub(ur'^(D|d)uquel$', ur'de lequel', word)
    word = regex.sub(ur'^(D|d)es$', ur'de les', word)
    word = regex.sub(ur'^(D|d)esquels$', ur'de lesquels', word)
    word = regex.sub(ur'^(D|d)esquelles$', ur'de lesquelles', word)
    return word
def _clean_characters(self, characters):
    """Clean characters (e.g. convert \t to a space)."""
    if self._lower:
        characters = characters.lower()
    characters = regex.sub(r'\t|\s+|\u200d', ' ', characters)
    characters = regex.sub(r'`', "'", characters)
    characters = regex.sub(r'–', "-", characters)
    return characters
def compile_replace(pattern, repl, flags=0):
    """Construct a method that can be used as a replace method for `sub`, `subn`, etc."""
    call = None
    if pattern is not None and isinstance(pattern, REGEX_TYPE):
        if isinstance(repl, (compat.string_type, compat.binary_type)):
            repl = ReplaceTemplate(pattern, repl, bool(flags & FORMAT))
            call = Replace(
                functools.partial(_apply_replace_backrefs, repl=repl), repl.use_format, repl.pattern_hash
            )
        elif isinstance(repl, Replace):
            if flags:
                raise ValueError("Cannot process flags argument with a compiled pattern!")
            if repl.pattern_hash != hash(pattern):
                raise ValueError("Pattern hash doesn't match hash in compiled replace!")
            call = repl
        elif isinstance(repl, ReplaceTemplate):
            if flags:
                raise ValueError("Cannot process flags argument with a ReplaceTemplate!")
            call = Replace(
                functools.partial(_apply_replace_backrefs, repl=repl), repl.use_format, repl.pattern_hash
            )
        else:
            raise TypeError("Not a valid type!")
    else:
        raise TypeError("Pattern must be a compiled regular expression!")
    return call


# Convenience methods like re has, but slower due to overhead on each call.
# It is recommended to use compile_search and compile_replace
def sub(pattern, repl, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):
    """Wrapper for `sub`."""
    is_replace = _is_replace(repl)
    is_string = isinstance(repl, (compat.string_type, compat.binary_type))
    if is_replace and repl.use_format:
        raise ValueError("Compiled replace cannot be a format object!")

    pattern = compile_search(pattern, flags)
    return regex.sub(
        pattern, (compile_replace(pattern, repl) if is_replace or is_string else repl), string,
        count, flags, pos, endpos, concurrent, **kwargs
    )
def subf(pattern, format, string, count=0, flags=0, pos=None, endpos=None, concurrent=None, **kwargs):  # noqa B002
    """Wrapper for `subf`."""
    is_replace = _is_replace(format)
    is_string = isinstance(format, (compat.string_type, compat.binary_type))
    if is_replace and not format.use_format:
        raise ValueError("Compiled replace is not a format object!")

    pattern = compile_search(pattern, flags)
    rflags = FORMAT if is_string else 0
    return regex.sub(
        pattern, (compile_replace(pattern, format, flags=rflags) if is_replace or is_string else format),
        string, count, flags, pos, endpos, concurrent, **kwargs
    )
def build_html(fragment, css=False):
    fragment = regex.sub(r'<p([^>])*></p>', r'<p\1> </p>', fragment)
    css_link = ''
    if css:
        css_link = LINK_TEXT
    new = HTML.format(css_link, fragment)
    soup = gumbo_bs4.parse(new)
    return soup.serialize_xhtml()
def splitWord(str):
    str = re.sub("[^A-Za-z]", "", str)
    words = re.split(r'(?<=[a-z])(?=[A-Z])|(?<=[A-Z])(?=[A-Z][a-z])|(?<=[a-z]A)(?=[A-Z])', str, flags=re.V1)
    return words
def remove_punctuation(self, text):
    ''' Get rid of punctuation except apostrophes '''
    return re.sub(r"[^\P{P}\']+", "", text)
def slugify(value):
    """
    Parameters
    ----------
    value: str
        the value to slug

    Convert spaces to hyphens. Remove characters that aren't alphanumerics,
    underscores, or hyphens. Convert to lowercase. Also strip leading and
    trailing whitespace.
    """
    value = regex.sub('[^\w\s-]', '', value).strip().lower()
    return regex.sub('[-\s]+', '-', value)
def jsonFormat(year=2014, month=1, day=1, newsType=0):
    text = getJson(year, month, day, newsType)
    returnValue = list()
    if text.startswith('var data=') is True:
        tmp = re.sub(',*,', ',', text.lstrip('var data=').rstrip(';').replace('\n', '').replace(',[]', ''))
        if newsType != 0:
            tmp = re.sub(r'(,|\{)([a-z]*?)(:)', r'\1"\2"\3', tmp)
        tmp = re.sub(r'(\[),(\{)', r'\1\2', tmp.replace('\\', '/'))
        try:
            tmpValue = json.loads(tmp, strict=False)
        except:
            return list()
        childClassification = getChildClassification(tmpValue[u'category'])
        if newsType == 1:
            valuelist = tmpValue[u'ent']
        else:
            valuelist = tmpValue[u'news']
        for list0 in valuelist:
            for list1 in list0:
                if list1 is not None:
                    if list1[u'l'].find('photoview') == -1 and list1[u'l'].find('blog') == -1:
                        returnValue.append(
                            [list1[u'p'].split()[0], list1[u'p'].split()[1], getSiteURL(newsType)[0],
                             childClassification[list1[u'c']], list1[u'l'], list1[u't']])
        del tmp
        del text
        del tmpValue
        del childClassification
        del valuelist
        gc.collect()
    return returnValue
def getnews(URL):
    date = str()
    html = networkExceptionCatch(URL)
    soup = BeautifulSoup(html, 'html.parser')
    alls = soup.find_all('div', id="endText")
    for div in alls:
        if div.find('script'):
            div = re.sub(r'<script.*?</script>', '', div)
        p_in_div = div.find_all('p')
        if len(p_in_div) == 0:
            p_in_div = re.sub(r'(<div id="endText">)(.*?)(</p>)(<p>)', r'\1\2\4', p_in_div)
        for p_tag in p_in_div:
            if p_tag.text is not None:
                date += p_tag.text + u'\n'
    return date
def sub_pattern(self, match):
    match_dict = match.groupdict()
    pattern = GROK_PATTERNS[match_dict['pattern_name']]
    pattern_output_raw = match_dict['pattern_output']
    pattern_type_raw = match_dict['pattern_type']
    if pattern_output_raw:
        pattern_output = pattern_output_raw.lstrip(':')
        new_pattern = GROK_NEW_PATTERN.format(name=pattern_output, pattern=pattern)
        if pattern_type_raw:
            pattern_type = pattern_type_raw.lstrip(':')
            self.pattern_types[pattern_output] = __builtins__[pattern_type]
    else:
        new_pattern = pattern
    return regex.sub(GROK_REPLACE_PATTERN, self.sub_pattern, new_pattern)
def grok_re_preprocess(re_pattern):
    traverser = PatternTraverser()
    new_pattern = regex.sub(GROK_REPLACE_PATTERN, traverser.sub_pattern, re_pattern)
    return new_pattern, traverser.pattern_types
def test_collect_types(self):
    traverser = pattern_matching.PatternTraverser()
    re = regex.sub(pattern_matching.GROK_REPLACE_PATTERN, traverser.sub_pattern,
                   'This is process %{POSINT:processid:int} running in %{PATH:process_dir}')
    self.assertDictEqual(traverser.pattern_types, {'processid': int})
def pinyin_transform(text):
    if re.search("?", text):
        return ""
    text = re.sub(
        unicodedata.normalize("NFD", "ü"), "ü",
        re.sub(
            unicodedata.normalize("NFD", "ê"), "ê",
            unicodedata.normalize("NFD", text)
        )
    )
    if re.search(
            "[aeiouêü]" + tones + "[aeiou]?[aeiouêü]" + tones + "",
            text.lower()):
        return ""
    text = text.lower()
    if not re.search(tones, text) and re.match("[1-5]", text):
        return re.sub("(\d)(\p{Ll})", "\1 \2", text)
    if re.search("[??,.?]", text):
        text = re.sub(
            "([??])$",
            lambda x: " y?" if x.group() == "?" else " bù",
            text
        )
        text = re.sub("([??])", r" \1 ", text)
        text = re.sub("([,.?])", r" \1 ", text)
        text = re.sub(" +", " ", text)
        text = re.sub("^ ", "", text)
        text = re.sub(" $", "", text)
        text = re.sub("\. \. \.", "...", text)
    text = re.sub("['\-]", " ", text)
    text = re.sub(
        "([aeiouêü]" + tones + "?n?g?r?)([bpmfdtnlgkhjqxzcsywr]h?)",
        r"\1 \2", text
    )
    text = re.sub(" ([grn])$", r"\1", text)
    text = re.sub(" ([grn]) ", r"\1 ", text)
    return unicodedata.normalize("NFC", text)
def sub_repeatedly(pattern, repl, term):
    while True:
        new_term = re.sub(pattern, repl, term)
        if new_term == term:
            return term
        term = new_term
def decompose(text):
    def repl(match):
        k = match.group()
        if k in recomposer.keys():
            return recomposer[k]
        return k
    text = unicodedata.normalize("NFD", text)
    text = re.sub(".[" + BREVE + DIA + CARON + "]", repl, text)
    return text


# Remove grave accents; don't affect acute or composed diaeresis in ?? or
# uncomposed diaeresis in -??- (as in plural ?????? of ??????).
# NOTE: Translit must already be decomposed! See comment at top.
def remove_grave_accents(word):
    def repl(match):
        k = match.group()
        if k in grave_deaccenter.keys():
            return grave_deaccenter[k]
        return k
    ru_removed = re.sub("[?????]", repl, word)
    return ru_removed
def tr_sub(text, include_monosyllabic_jo_accent="", noadj="", noshto="", sub="", forceadj=""):
    if sub:
        subs = sub.split(",")
        for subpair in subs:
            subsplit = subpair.split("/")
            text = re.sub(subsplit[0], subsplit[1], text)
    return tr(text, None, None, include_monosyllabic_jo_accent, noadj, noshto, forceadj)


# for adjectives, pronouns
def sub_repeatedly(pattern, repl, term):
    """apply sub() repeatedly until no change"""
    while True:
        new_term = re.sub(pattern, repl, term)
        if new_term == term:
            return term
        term = new_term


# If enabled, compare this module with new version of module in
# Module:User:Benwing2/ru-pron to make sure all pronunciations are the same.
# To check for differences, go to Template:tracking/ru-pron/different-pron
# and look at what links to the page.
# test_new_ru_pron_module = False

# If enabled, do new code for final -?; else, the old way
def phon_respelling(text, remove_grave):
    text = re.sub("[" + CFLEX + DUBGR + DOTABOVE + DOTBELOW + "?]", "", text)
    # Remove grave accents from annotations but maybe not from phonetic respelling
    if remove_grave:
        text = com.remove_grave_accents(text)
    return text


# Return the actual IPA corresponding to Cyrillic text. ADJ, GEN, BRACKET
# and POS are as in [[Template:ru-IPA]]. If IS_TRANFORMED is true, the text
# has already been passed through m_ru_translit.apply_tr_fixes(); otherwise,
# this will be done.
def IPA_to_CMUBET(text):
    """Convert IPA to CMUBET for US English.

    Use `IPA`_ and symbol set used in Wiktionary and `CMUBET`_ symbol set
    used in CMUDict.

    .. _IPA: https://en.wiktionary.org/wiki/Module:IPA/data/symbols
    .. _CMUBET: https://cmusphinx.github.io/wiki/cmubet/

    Parameters
    ----------
    text : string
        String of IPA text parsed from Wiktionary.

    Returns
    -------
    string
        Converted CMUBET text.
    """
    text = re.sub("??", ":", text)
    text = text.lstrip("/[").rstrip("]/")
    text = text.strip("'-!$")
    text += " "
    CMUBET_lst = []
    i = 0
    while i < len(text) - 1:
        if text[i:i+2] in i2c_lookup.keys():
            CMUBET_lst.append(i2c_lookup[text[i:i+2]])
            i += 1
        elif text[i] in i2c_lookup.keys():
            CMUBET_lst.append(i2c_lookup[text[i]])
        i += 1
    return " ".join(CMUBET_lst)
def syllabify(text):
    def repl(match):
        a, b, c, d = \
            match.group(1), match.group(2), match.group(3), match.group(4)
        if re.match(weak_h, b + c) or re.match(aspirate + "h", b + " " + c):
            b, c = "", b + c
        if c == "" and b != "":
            c, b = b, ""
        return a + b + "." + c + d

    for _ in range(2):
        text = re.sub(syllabify_pattern, repl, text)
    return text
def normalize_quotes(token):
    token = re.sub(r"-$", '', token)
    token = re.sub(r"``", '\u201c', token)
    token = re.sub(r"''", '\u201d', token)
    return token
def remove_elongation(text):
    return regex.sub(r'(.)\1{3,}', r'\1\1', text, flags=regex.UNICODE)
def clean(text):
    # removing extra spaces
    text = regex.sub(r'[\s\n]+', ' ', text, flags=regex.UNICODE)
    # todo : add more cleaning methods
    return text