我们从Python开源项目中,提取了以下33个代码示例,用于说明如何使用re.L。
def preprocessing(content):
    """Strip digits, punctuation and line breaks from *content*.

    The text is cleaned in three steps:

    1. every run of digits is removed (letters are kept);
    2. every character in ``string.punctuation`` is removed;
    3. newlines and the extra punctuation marks in ``remove_punc`` are
       removed, then ``parsing.strip_multiple_whitespaces`` is applied.

    Step 3's whitespace clean-up is best effort: if it fails, a warning is
    printed and the text is returned as it stands.

    :param content: text to clean.
    :return: the cleaned text.
    """
    # NOTE(review): the '?' entries look like full-width punctuation that was
    # mis-encoded before it reached this file -- confirm against the original.
    remove_punc = ('? ? ? ? ? ? ? ? ? —').split(' ')

    # Step 1: digits only.  re.L is deliberately not passed: it never affected
    # \d, and Python 3.6+ rejects it for str patterns with a ValueError.
    content = re.sub(r'\d+', "", content)

    # Step 2: ASCII punctuation.
    content = re.sub('[%s]' % re.escape(string.punctuation), "", content)

    # Step 3: line breaks, extra punctuation, repeated whitespace.
    content = content.replace('\n', '')
    for punc in remove_punc:
        content = content.replace(punc, '')
    try:
        # `parsing` is provided by the surrounding module.
        content = parsing.strip_multiple_whitespaces(content)
    except Exception:
        # Keep going with un-collapsed whitespace rather than failing.
        print('Warning : failed to strip whitespaces @ ')
    return content
def defSyntax(self):
    """Compile the image and header regexes for ``self.syntax``.

    Sets three attributes on the instance:

    * ``_img_re``     -- compiled pattern for an inline image,
    * ``_h_re_base``  -- verbose header pattern with a ``%s`` placeholder for
      the allowed repetition count of the header marker,
    * ``_all_h_re``   -- ``_h_re_base`` compiled for 1 to 6 markers.

    Supported values of ``self.syntax`` are ``'markdown'`` and ``'zim'``.

    :raises Exception: if ``self.syntax`` is any other value.
    """
    # re.L is intentionally not used: these are str patterns (Python 3.6+
    # raises ValueError for re.L on str), and nothing in them is
    # locale-dependent (no \w, \b or case-insensitive matching).
    if self.syntax == 'markdown':
        self._img_re = re.compile(r'^(.*)!\[(.+?)\]\((.+?)\)', re.M)
        # First alternative: underlined header; second: '#'-prefixed header.
        self._h_re_base = r'''
            (^(.+)[ \t]*\n(=+|-+)[ \t]*\n+)
            |
            (^(\#{%s})  # \1 = string of #'s
            [ \t]*
            (.+?)       # \2 = Header text
            [ \t]*
            (?<!\\)     # ensure not an escaped trailing '#'
            \#*         # optional closing #'s (not counted)
            \n+
            )
            '''
        self._all_h_re = re.compile(self._h_re_base % '1,6', re.X | re.M)
    elif self.syntax == 'zim':
        self._img_re = re.compile(r'^(.*)\{\{(.+?)\}\}(.*)$', re.M)
        self._h_re_base = r'''
            ^(\={%s})   # \1 = string of ='s
            [ \t]*
            (.+?)       # \2 = Header text
            [ \t]*
            \1
            \n+
            '''
        self._all_h_re = re.compile(self._h_re_base % '1,6', re.X | re.M)
    else:
        raise Exception("Unknown syntax %s" % self.syntax)
def test_from_re(self):
    """Check how ``RegExp.from_re`` treats each flag of a compiled pattern."""
    # re.U and re.S are implicitly set, so they yield the plain expression.
    for implicit_flag in (re.U, re.S):
        converted = RegExp.from_re(re.compile("a", implicit_flag))
        self.assertEqual(converted, RegExp("a"))

    # re.I can be set explicitly and maps to ignore_case.
    case_insensitive = RegExp.from_re(re.compile("a", re.I))
    self.assertEqual(case_insensitive, RegExp("a", ignore_case=True))

    # re.M, re.L and re.X are forbidden.
    # NOTE(review): on Python 3.6+ re.compile itself raises ValueError for a
    # str pattern combined with re.L, before from_re is reached -- confirm
    # which Python version this test targets.
    for forbidden_flag in (re.M, re.L, re.X):
        compiled = re.compile("a", forbidden_flag)
        self.assertRaises(ValueError, RegExp.from_re, compiled)
def iternext(self):
    """
    Return the next token of ``self.string`` and advance ``self.index``.

    Escaped l, L, c, C, E, N, p, P and backslash count as a single token,
    as does a bracket expression matched by ``self._re_posix``.

    Raises StopIteration once ``self.index`` has passed ``self.max_index``,
    and SyntaxError for a bare long reference (N, p or P) that is missing
    its ``{...}`` part.
    """
    if self.index > self.max_index:
        raise StopIteration

    token = self.string[self.index:self.index + 1]

    if token == self._b_slash:
        found = self._re_search_ref.match(self.string[self.index + 1:])
        if found:
            escaped = found.group(0)
            # A one-character match of a long reference means its braces
            # are missing.
            if len(escaped) == 1 and escaped in self._long_search_refs:
                if escaped == self._unicode_name:
                    raise SyntaxError('Format for Unicode name is \\N{name}!')
                if escaped == self._uni_prop:
                    raise SyntaxError('Format for Unicode property is \\p{property}!')
                if escaped == self._inverse_uni_prop:
                    raise SyntaxError('Format for inverse Unicode property is \\P{property}!')
            token += found.group(1) or found.group(2)
    elif token == self._ls_bracket:
        found = self._re_posix.match(self.string[self.index:])
        if found:
            token = found.group(0)

    self.index += len(token)
    self.current = token
    return self.current

# Templates
def test_constants(self): self.assertEqual(re.I, re.IGNORECASE) self.assertEqual(re.L, re.LOCALE) self.assertEqual(re.M, re.MULTILINE) self.assertEqual(re.S, re.DOTALL) self.assertEqual(re.X, re.VERBOSE)
def test_flags(self): for flag in [re.I, re.M, re.X, re.S, re.L]: self.assertNotEqual(re.compile('^pattern$', flag), None)
def test_flags(self): for flag in [re.I, re.M, re.X, re.S, re.L]: self.assertTrue(re.compile('^pattern$', flag))
def check_en_US_iso88591(self):
    """Check locale-aware, case-insensitive byte matching under en_US.iso88591.

    Switches LC_CTYPE to ``en_US.iso88591`` and asserts that the bytes
    0xC5 and 0xE5 match each other case-insensitively, both with the
    flags passed as an argument and with the inline ``(?Li)`` form.
    The locale is not restored here.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.iso88591')

    cases = (
        (b'\xc5\xe5', b'\xc5\xe5'),
        (b'\xc5', b'\xe5'),
        (b'\xe5', b'\xc5'),
    )
    locale_ignorecase = re.L | re.I

    # Flags supplied as an argument.
    for pattern, subject in cases:
        self.assertTrue(re.match(pattern, subject, locale_ignorecase))

    # Same cases with the flags embedded in the pattern.
    for pattern, subject in cases:
        self.assertTrue(re.match(b'(?Li)' + pattern, subject))
def check_en_US_utf8(self):
    """Check locale-aware, case-insensitive byte matching under en_US.utf8.

    Switches LC_CTYPE to ``en_US.utf8`` and asserts that the two-byte
    sequence matches itself, while the single bytes 0xC5 and 0xE5 do not
    match each other, both with the flags passed as an argument and with
    the inline ``(?Li)`` form.  The locale is not restored here.
    """
    locale.setlocale(locale.LC_CTYPE, 'en_US.utf8')

    # (pattern, subject, whether a match is expected)
    cases = (
        (b'\xc5\xe5', b'\xc5\xe5', True),
        (b'\xc5', b'\xe5', False),
        (b'\xe5', b'\xc5', False),
    )
    locale_ignorecase = re.L | re.I

    # Flags supplied as an argument.
    for pattern, subject, should_match in cases:
        outcome = re.match(pattern, subject, locale_ignorecase)
        if should_match:
            self.assertTrue(outcome)
        else:
            self.assertIsNone(outcome)

    # Same cases with the flags embedded in the pattern.
    for pattern, subject, should_match in cases:
        outcome = re.match(b'(?Li)' + pattern, subject)
        if should_match:
            self.assertTrue(outcome)
        else:
            self.assertIsNone(outcome)
def parse_string(self, txt):
    """Validate *txt* and return an accent-free, underscore-separated copy.

    *txt* may be text or UTF-8 encoded bytes.  It is accepted only if it
    consists of ASCII letters, digits, space, ``-``, ``_``, ``.`` and the
    accented letters listed in the pattern below.

    :param txt: text (or UTF-8 bytes) to validate.
    :return: ``False`` if *txt* contains any other character (or is empty);
        otherwise *txt* with combining accents removed and spaces replaced
        by underscores.
    """
    import re
    import unicodedata

    if not isinstance(txt, str):
        txt = txt.decode('utf-8')

    # No re.L: the allowed characters are listed explicitly, so locale is
    # irrelevant, and Python 3.6+ raises ValueError for re.L on a str pattern.
    # NOTE(review): `$` also matches just before a trailing newline, so
    # "abc\n" is accepted -- confirm whether that is intended.
    prog = re.compile("[-_àèìòùáéíóúñçÀÈÌÒÙÁÉÍÓÚÑÇ .a-zA-Z0-9]+$")
    if not prog.match(txt):
        return False

    # Decompose accented letters (NFD) and drop the combining marks ('Mn').
    txt = ''.join(
        c for c in unicodedata.normalize('NFD', txt)
        if unicodedata.category(c) != 'Mn'
    )
    return txt.replace(" ", "_")
def validCharacters(txt):
    """Return *txt* as text if it contains only allowed characters.

    *txt* may be UTF-8 encoded bytes or already-decoded text.  Allowed
    characters are ASCII letters, digits, space, ``-``, ``_``, ``.`` and the
    accented letters listed in the pattern below.

    :param txt: UTF-8 bytes or text to validate.
    :return: the decoded text when it is valid, otherwise ``False``.
    """
    import re

    # Decode only real byte strings; calling .decode() on text fails on
    # Python 3.
    if isinstance(txt, bytes):
        txt = txt.decode('utf-8')

    # The pattern is a text literal rather than "...".decode('UTF-8'), which
    # only works on Python 2.  re.L and locale.setlocale('ca_ES') are not
    # used: the accepted characters are listed explicitly, re.L is rejected
    # for text patterns on Python 3.6+, and setlocale changes the whole
    # process and raises where the 'ca_ES' locale is not installed.
    prog = re.compile(u"[-_àèìòùáéíóúñçÀÈÌÒÙÁÉÍÓÚÑÇ .a-zA-Z0-9]+$")
    if not prog.match(txt):
        return False
    return txt
def _parseString(self, txt): import re, unicodedata, locale if type(txt) is not str: txt = txt.decode('utf-8') locale.setlocale(locale.LC_ALL, 'ca_ES') prog = re.compile("[-_àèìòùáéíóúñçÀÈÌÒÙÁÉÍÓÚÑÇ .a-zA-Z0-9]+$", re.L) if not prog.match(txt): return False else: # ~ Replace accents txt = ''.join((c for c in unicodedata.normalize('NFD', txt) if unicodedata.category(c) != 'Mn')) return txt.replace(" ", "_")
def get_info(host):
    """Fetch *host* and return the text of its first ``<title>`` element.

    The response is decoded with the encoding ``requests`` detects from the
    body (``apparent_encoding``).

    :param host: URL to request (10 second timeout).
    :return: the title text, or ``None`` if the request fails or the page
        has no ``<title>...</title>`` on a single line.
    """
    try:
        req = requests.get(host, timeout=10)
        req.encoding = req.apparent_encoding
        # No re.L: req.text is str, and Python 3.6+ raises ValueError for
        # re.L on a str pattern, which previously made every call return None.
        found = re.search('<title>(.*?)</title>', req.text)
    except Exception as e:
        # Best effort: report the failure and signal "no info".
        print(e)
        return None
    # A page without a title is not an error; just report that none exists.
    return found.group(1) if found else None
#----------------------------------------------------------------------
def iternext(self):
    """
    Return the next token of ``self.string`` and advance ``self.index``.

    Escaped l, L, c, C, E and backslash count as a single token.  When
    ``self.use_format`` is set, a curly-bracket sequence matched by
    ``self._format_replace_group`` is handled as one token as well.

    Raises StopIteration once ``self.index`` has passed ``self.max_index``,
    SyntaxError for a bare long reference (x, g, N, u, U) that lacks its
    argument, and ValueError for an unmatched curly bracket in format mode.
    """
    if self.index > self.max_index:
        raise StopIteration

    token = self.string[self.index:self.index + 1]

    if token == self._b_slash:
        found = self._replace_ref.match(self.string[self.index + 1:])
        if found:
            escaped = found.group(0)
            # A one-character match of a long reference means its argument
            # is missing.
            if len(escaped) == 1 and escaped in self._long_replace_refs:
                if escaped == self._hex:
                    raise SyntaxError('Format for byte is \\xXX!')
                if escaped == self._group:
                    raise SyntaxError('Format for group is \\g<group_name_or_index>!')
                if escaped == self._unicode_name:
                    raise SyntaxError('Format for Unicode name is \\N{name}!')
                if escaped == self._unicode_narrow:  # pragma: no cover
                    raise SyntaxError('Format for Unicode is \\uXXXX!')
                if escaped == self._unicode_wide:  # pragma: no cover
                    raise SyntaxError('Format for wide Unicode is \\UXXXXXXXX!')
            if self.use_format and (found.group(3) or found.group(4)):
                # The added backslash is not part of the input, so step the
                # index back to offset the len(token) advance below.
                token += self._b_slash
                self.index -= 1
            if not self.use_format or not found.group(4):
                token += found.group(1) or found.group(2)
    elif self.use_format and token in (self._lc_bracket, self._rc_bracket):
        found = self._format_replace_group.match(self.string[self.index:])
        if not found:
            raise ValueError("Single unmatched curly bracket!")
        if found.group(2):
            token = found.group(2)
        else:
            # Group 2 did not match: emit a single bracket but consume one
            # extra character of input.
            self.index += 1

    self.index += len(token)
    self.current = token
    return self.current