The following 49 code examples, extracted from open-source Python projects, illustrate how to use re.U.
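Before the project code, here is a minimal, self-contained sketch (written for this page, not taken from any of the projects below) of what re.U / re.UNICODE actually changes: it makes the character classes \w, \W, \b, \B, \s, \S and \d follow the Unicode database instead of plain ASCII. On Python 3 this behaviour is already the default for str patterns, so the flag mainly matters for Python 2 code and for explicitness.

# -*- coding: utf-8 -*-
# Minimal illustration of re.U (assumed example, not from the projects below).
import re

text = u"naïve café"

# Without re.U, \w only matches [a-zA-Z0-9_] on Python 2,
# so the accented letters split the words.
ascii_words = re.findall(r"\w+", text)          # Python 2: [u'na', u've', u'caf']

# With re.U, \w follows the Unicode word-character definition.
unicode_words = re.findall(r"\w+", text, re.U)  # [u'naïve', u'café']

# On Python 3, str patterns are Unicode-aware by default,
# so both calls return ['naïve', 'café'] and re.U is redundant.
print(ascii_words, unicode_words)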
def setup(self, config):
    """
    Compile configured regular expressions.

    :param config: Configuration object.
    :type config: ``dict``
    """
    self.matches = {}
    patterns = []
    for entity_type, pattern_conf in config.get(helper.ENTITIES, {}).items():
        patterns.append(
            r'\b(?P<{}>{})\b'.format(entity_type, pattern_conf[helper.PATTERN]))
    self.pattern = regex.compile('|'.join(patterns), regex.I | regex.U)

def split_into_sentences(text):
    potential_end_pat = re.compile(r"".join([
        r"([\w\.'’&\]\)]+[\.\?!])",  # A word that ends with punctuation
        r"([‘’“”'\"\)\]]*)",         # Followed by optional quote/parens/etc
        r"(\s+(?![a-z\-–—]))",       # Followed by whitespace + non-(lowercase or dash)
    ]), re.U)
    dot_iter = re.finditer(potential_end_pat, text)
    end_indices = [
        (x.start() + len(x.group(1)) + len(x.group(2)))
        for x in dot_iter
        if is_sentence_ender(x.group(1))
    ]
    spans = zip([None] + end_indices, end_indices + [None])
    sentences = [
        text[start:end].strip()
        for start, end in spans
    ]
    return sentences

def replace_wiki_links(text, raw_link=False):
    """
    Replace wiki-style links of the form '[user_id|link_text]' with HTML.

    :param text: text to process
    :param raw_link: render the link as plain text instead of an <a> tag
    """
    link_format = "{1} (vk.com/{0})" if raw_link else "<a href=\"https://vk.com/{0}\">{1}</a>"
    pattern = re.compile(r"\[([^|]+)\|([^|]+)\]", re.U)
    # The pattern is already compiled with re.U; findall() on a compiled
    # pattern takes a position, not flags, so no second argument here.
    results = pattern.findall(text)
    for i in results:
        user_id = i[0]
        link_text = i[1]
        before = "[{0}|{1}]".format(user_id, link_text)
        after = link_format.format(user_id, link_text)
        text = text.replace(before, after)
    return text

def _extract_info(self, soup):
    empty_info = {'from': 0, 'to': 0, 'total': 0}
    div_ssb = soup.find('div', id='ssb')
    if not div_ssb:
        self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
        return empty_info
    p = div_ssb.find('p')
    if not p:
        self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
        return empty_info
    txt = ''.join(p.findAll(text=True))
    txt = txt.replace(',', '')
    matches = re.search(r'%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
    if not matches:
        return empty_info
    return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}

def _html_unescape(self, str):
    def entity_replacer(m):
        entity = m.group(1)
        if entity in name2codepoint:
            return unichr(name2codepoint[entity])
        else:
            return m.group(0)

    def ascii_replacer(m):
        cp = int(m.group(1))
        if cp <= 255:
            return unichr(cp)
        else:
            return m.group(0)

    # The fourth positional argument of re.sub() is the replacement count,
    # so the Unicode flag must be passed as flags=.
    s = re.sub(r'&#(\d+);', ascii_replacer, str, flags=re.U)
    return re.sub(r'&([^;]+);', entity_replacer, s, flags=re.U)

def _extract_info(self, soup):
    empty_info = {'from': 0, 'to': 0, 'total': 0}
    td_rsb = soup.find('td', 'rsb')
    if not td_rsb:
        self._maybe_raise(ParseError, "Td with number of results was not found on Blogs search page", soup)
        return empty_info
    font = td_rsb.find('font')
    if not font:
        self._maybe_raise(ParseError, """<p> tag within <tr class='rsb'> was not found on Blogs search page""", soup)
        return empty_info
    txt = ''.join(font.findAll(text=True))
    txt = txt.replace(',', '')
    if self.hl == 'es':
        matches = re.search(r'Resultados (\d+) - (\d+) de (?:aproximadamente )?(\d+)', txt, re.U)
    elif self.hl == 'en':
        matches = re.search(r'Results (\d+) - (\d+) of (?:about )?(\d+)', txt, re.U)
    if not matches:
        return empty_info
    return {'from': int(matches.group(1)), 'to': int(matches.group(2)), 'total': int(matches.group(3))}

def preprocess_simple(docs, stopwords, min_df=3, min_term_length=2, ngram_range=(1, 1),
                      apply_tfidf=True, apply_norm=True):
    """
    Preprocess a list containing text documents stored as strings, where the documents
    have already been tokenized and are separated by whitespace
    """
    token_pattern = re.compile(r"[\s\-]+", re.U)

    def custom_tokenizer(s):
        return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length)]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
    else:
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode",
                            tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function,
                            min_df=min_df, ngram_range=ngram_range)
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
        terms.append("")
    for term in v.keys():
        terms[v[term]] = term
    return (X, terms)

def search_filename(fname, fields):
    """Extract movie title/date from filename and return dict with movie infos"""
    path_tokens = os.path.normpath(fname).split(os.sep)
    candidate = path_tokens[-1]
    res = re.split(FNAME_SPLIT_RE, candidate, flags=re.I | re.U)[0].strip()
    res = scrub(res, '[({])}', ' ')
    res = ' '.join([x for x in re.split(r'[\s\._]', res, flags=re.U) if x])
    years = re.findall(r'((?:19|20)\d\d)', res)
    if years:
        toks = re.split(r'(%s)' % years[-1], res)
    else:
        toks = [res]
    title = toks[0].strip()
    year = toks[1] if len(toks) > 1 else None
    item = search_by(title, year, fields)
    if item:
        item['filename'] = fname
    return item

def __get_series_data(program, ext_info):
    episode = int(program['episode'])
    season = int(program['season'])
    desc = ext_info['synopsis'] if ext_info else u'Año: %s' % program['year']
    if season == 0:
        sn = re.findall(r'.*\sT(\d*/?\d+).*', program['full_title'], re.U)
        season = int(sn[0].replace('/', '')) if sn else season
    if 'episode_title' in program:
        title = program['serie']
        stitle = '%ix%02d %s' % (season, episode, program['episode_title'])
    else:
        title = re.findall(r'(.*)\sT\d*/?\d+.*', program['full_title'], re.U)
        title = title[0] if title else program['full_title']
        stitle = '%ix%02d %s' % (
            season, episode,
            ext_info['originalTitle'] if ext_info and 'originalTitle' in ext_info else 'Episodio %i' % episode
        )
    return {
        'title': title,
        'sub-title': stitle,
        'season': season if season > 0 else '',
        'episode': episode,
        'desc': desc
    }

def test_ignore_case(self):
    self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
    self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
    self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
    self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
    self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
    self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
    self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
    self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
    self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")
    if have_unicode:
        assert u(r'\u212a').lower() == u'k'  # 'K' (KELVIN SIGN)
        self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
        assert u(r'\u017f').upper() == u'S'  # 'ſ' (LATIN SMALL LETTER LONG S)
        self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))

def test_ignore_case_set(self):
    self.assertTrue(re.match(r'[19A]', 'A', re.I))
    self.assertTrue(re.match(r'[19a]', 'a', re.I))
    self.assertTrue(re.match(r'[19a]', 'A', re.I))
    self.assertTrue(re.match(r'[19A]', 'a', re.I))
    if have_unicode:
        self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
        self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
        self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
        self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
        assert u(r'\u212a').lower() == u'k'  # 'K' (KELVIN SIGN)
        self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
        assert u(r'\u017f').upper() == u'S'  # 'ſ' (LATIN SMALL LETTER LONG S)
        self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
        self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))

def test_sre_character_class_literals(self):
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
        if i < 0x10000:
            self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
        self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
        self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
        self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
    self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
    self.assertRaises(re.error, re.match, r"[\911]", "")
    self.assertRaises(re.error, re.match, r"[\x1z]", "")
    self.assertRaises(re.error, re.match, r"[\u123z]", "")
    self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
    self.assertRaises(re.error, re.match, r"[\U00110000]", "")

def test_sre_byte_literals(self):
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
        self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
        self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
        self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
        self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
        self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
    self.assertIsNotNone(re.match(br"\u", b'u'))
    self.assertIsNotNone(re.match(br"\U", b'U'))
    self.assertIsNotNone(re.match(br"\0", b"\000"))
    self.assertIsNotNone(re.match(br"\08", b"\0008"))
    self.assertIsNotNone(re.match(br"\01", b"\001"))
    self.assertIsNotNone(re.match(br"\018", b"\0018"))
    self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
    self.assertRaises(re.error, re.match, br"\911", b"")
    self.assertRaises(re.error, re.match, br"\x1", b"")
    self.assertRaises(re.error, re.match, br"\x1z", b"")

def m3u2list(data):
    """convert an m3u data to a list"""
    matches = re.compile('^#EXTINF:-?[0-9]*(.*?),(.*?)\n(.*?)$',
                         re.I + re.M + re.U + re.S).findall(data)
    li = []
    for params, display_name, url in matches:
        item_data = {'params': params, 'display_name': display_name, 'url': url}
        li.append(item_data)
    playlist = []
    for channel in li:
        item_data = {'display_name': channel['display_name'], 'url': channel['url']}
        matches = re.compile(' (.+?)="(.+?)"',
                             re.I + re.M + re.U + re.S).findall(channel['params'])
        for field, value in matches:
            item_data[field.strip().lower().replace('-', '_')] = value.strip()
        playlist.append(item_data)
    return playlist

def is_hebrew(string):
    'A hacky way to check if our string is in Hebrew - check the first char'
    # Drop digits from the string
    string = re.sub(r'\d', '', string)
    # Drop special characters from the string
    string = re.sub(r'\W', '', string, flags=re.U)
    # Strip the string
    string = string.strip()
    # Support empty strings
    if not string:
        return None
    # Make sure the string is UTF-8
    if type(string) != unicode:
        string = string.decode('utf-8')
    # The Hebrew alphabet, including final letter forms
    HEBREW_AB = unicode(u'אבגדהוזחטיכךלמםנןסעפףצץקרשת')
    if string[0] in HEBREW_AB:
        return True
    else:
        return False

def test_sre_character_class_literals(self):
    for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
        if i < 256:
            self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
        if i < 0x10000:
            self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
            self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
        self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
        self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
        self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
    self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
    self.assertRaises(re.error, re.match, r"[\911]", "")
    self.assertRaises(re.error, re.match, r"[\x1z]", "")
    self.assertRaises(re.error, re.match, r"[\u123z]", "")
    self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
    self.assertRaises(re.error, re.match, r"[\U00110000]", "")

def test_sre_byte_literals(self):
    for i in [0, 8, 16, 32, 64, 127, 128, 255]:
        self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
        self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
        self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
        self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
    self.assertTrue(re.match(br"\u", b'u'))
    self.assertTrue(re.match(br"\U", b'U'))
    self.assertTrue(re.match(br"\0", b"\000"))
    self.assertTrue(re.match(br"\08", b"\0008"))
    self.assertTrue(re.match(br"\01", b"\001"))
    self.assertTrue(re.match(br"\018", b"\0018"))
    self.assertTrue(re.match(br"\567", bytes([0o167])))
    self.assertRaises(re.error, re.match, br"\911", b"")
    self.assertRaises(re.error, re.match, br"\x1", b"")
    self.assertRaises(re.error, re.match, br"\x1z", b"")

def has_omnifunc(self, ft):
    if ft not in self.trigger_cache:
        name = '{}_omni_trigger'.format(ft)
        option = self.get_option(name)
        if not option:
            return False
        try:
            self.trigger_cache[ft] = re.compile(
                to_unicode(option, 'utf-8'), re.X | re.U)
        except Exception:
            return False
    try:
        return bool(vim.current.buffer.options['omnifunc'])
    except vim.error:
        return False

def compile(searchString):
    r"""
    Return the user's searchString compiled to a regular expression.

    Example terms: @call +work (A) carrots
    Term may be prefixed with ! or ~ for negation.
    Terms may be combined with "," or " " (AND) or with "|" (OR).
    Terms only match the beginning of a word in the task.
    Terms are case-insensitive.
    Expressions may NOT be nested with parentheses.
    Only \-character special regular expression sets are allowed,
    everything else is escaped.
    """
    if not searchString:
        return None
    terms = SimpleTextFilter._splitter.split(searchString)
    terms = [SimpleTextFilter._term2re(term) for term in terms]
    return re.compile("".join(terms), re.I | re.U)

def slugify(string, separator=r'-'):
    r"""
    Slugify a unicode string using unicodedata to normalize the string.

    :Example:

    >>> slugify(u"H\xe9ll\xf8 W\xc3\xb6rld")
    'hell-world'
    >>> slugify("Bonjour, tout l'monde !", separator="_")
    'bonjour_tout_lmonde'
    >>> slugify("\tStuff with -- dashes and... spaces \n")
    'stuff-with-dashes-and-spaces'
    """
    string = normalize(string)
    string = re.sub(r'[^\w\s' + separator + ']', '', string, flags=re.U)
    string = string.strip().lower()
    return re.sub(r'[' + separator + '\s]+', separator, string, flags=re.U)

def token_words(self, source):
    list_words = []
    source_without_urls = u''
    # remove urls from tweet
    urls = re.findall(r'(http[s]*://\S+)', source, re.U)
    for url in urls:
        start = source.find(url)
        end = len(url)
        source_without_urls = source_without_urls + source[0:start-1]
        source = source[start+end:]
    source_without_urls = source_without_urls + source
    list_tokens = re.findall(r'[#@]*\w+', source_without_urls, re.U)
    # remove users and hashtags
    for token in list_tokens:
        if (token.find('#') == -1) and (token.find('@') == -1):
            number = re.search(r'\d+', token)
            if not number:
                token = token.lower()
                list_words.append(token)
    return list_words

def set_user_mention_day(self, date, text):
    list_mentions = re.findall(r'@\w+', text)
    if len(list_mentions) > 0:
        user = list_mentions[0]
        if re.match(r'[\.]*(@\w+)[^\t\n]+', text):
            if user in self.top_users_reply:
                index = self.top_users_reply.index(user)
                self.dict_top_users_reply_day.store(date, index, 1)
        elif re.match('[rt[\s]*(@\w+)[:]*', text, re.U):
            if user in self.top_users_RT:
                index = self.top_users_RT.index(user)
                self.dict_top_users_RT_day.store(date, index, 1)
    for user in list_mentions:
        if user in self.top_users_mention:
            index = self.top_users_mention.index(user)
            self.dict_top_users_mention_day.store(date, index, 1)
    return

def get_tweet(tweet):
    data = tweet.split('\t')
    if len(data) >= 10:
        id_tweet = data[0]
        timestamp = data[1]
        date_hour = re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)', timestamp, re.U)
        (year, month, day, hour, minutes, seconds) = date_hour[0]
        author = data[2]
        text = data[3]
        app = data[4]
        user_id = data[6]
        followers = data[6]
        following = data[7]
        statuses = data[8]
        loc = data[9]
        return (year, month, day, hour, minutes, seconds,
                author, text, app, user_id, followers, following, statuses, loc)
    else:
        print ' tweet not match'
        return None

def get_tweet(tweet):
    data = tweet.split('\t')
    if len(data) >= 8:
        id_tweet = data[0]
        timestamp = data[1]
        date_hour = re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)', timestamp, re.U)
        (year, month, day, hour, minutes, seconds) = date_hour[0]
        author = data[2]
        text = data[3]
        app = data[4]
        id_user = data[5]
        followers = data[6]
        following = data[7]
        return (id_tweet, year, month, day, hour, minutes, seconds,
                author, text, app, id_user, followers, following)
    else:
        print ' tweet not match'
        return None

def get_tweet_source(text):
    source = None
    text_aux = text
    start = text_aux.find('RT')
    while start != -1:
        # print start
        text = text_aux[start:]
        # print text
        RT = re.match('[RT[\s]*(@\w+)[:]*', text, re.U)
        if RT:
            source = RT.group(1)
            text_aux = text[len(RT.group(0)):]
            # print text_aux
            # print source
            start = text_aux.find('RT')
        else:
            break
    return (source, text_aux)

def __parse_episode_number(self, eps_title):
    '''
    Parse the episode number from an episode title using a list of regular expressions.
    The position in the list is the priority of the regular expression.
    :param eps_title: the title of the episode.
    :return: episode number if matched, otherwise -1
    '''
    try:
        for regex in episode_regex_tuple:
            search_result = re.search(regex, eps_title, re.U | re.I)
            if search_result is not None:
                return int(search_result.group(1))
        return -1
    except Exception:
        return -1

def parse_episode_number(self, eps_title):
    '''
    Parse the episode number from an episode title using a list of regular expressions.
    The position in the list is the priority of the regular expression.
    :param eps_title: the title of the episode.
    :return: episode number if matched, otherwise -1
    '''
    try:
        for regex in episode_regex_tuple:
            search_result = re.search(regex, eps_title, re.U | re.I)
            if search_result is not None:
                matched_number = int(search_result.group(1))
                if self.bangumi.eps_no_offset is not None:
                    matched_number = matched_number + self.bangumi.eps_no_offset
                return matched_number
        return -1
    except Exception as error:
        logger.warn(error)
        return -1

def parse_episode_number(self, eps_title):
    '''
    Parse the episode number from an episode title using a list of regular expressions.
    The position in the list is the priority of the regular expression.
    :param eps_title: the title of the episode.
    :return: episode number if matched, otherwise -1
    '''
    try:
        for regex in episode_regex_tuple:
            search_result = re.search(regex, eps_title, re.U | re.I)
            if search_result is not None:
                return int(search_result.group(1))
        return -1
    except Exception:
        return -1

def normalize(text):
    """
    ?????????????????????????????????? normalize(text) ?????? str ????????

    >>> print(normalize("?????")=="????")  # ? ? ? ? ? ??? ????
    True
    """
    # re.sub()'s fourth positional argument is the replacement count,
    # so the Unicode flag is passed as flags= here.
    if six.PY2:
        for data in rule2py2:
            text = re.sub(data[0].replace(u"t", u"[????]"), data[1], text, flags=re.U)
    else:
        for data in rule2:
            text = re.sub(data[0].replace("t", "[????]"), data[1], text, flags=re.U)
    for data in list(zip(rule1, rule1)):
        text = re.sub(data[0].replace(u"t", u"[????]") + "+", data[1], text, flags=re.U)
    return text

def __init__(self,
             width=70,
             initial_indent="",
             subsequent_indent="",
             expand_tabs=True,
             replace_whitespace=True,
             fix_sentence_endings=False,
             break_long_words=True,
             drop_whitespace=True,
             break_on_hyphens=True):
    self.width = width
    self.initial_indent = initial_indent
    self.subsequent_indent = subsequent_indent
    self.expand_tabs = expand_tabs
    self.replace_whitespace = replace_whitespace
    self.fix_sentence_endings = fix_sentence_endings
    self.break_long_words = break_long_words
    self.drop_whitespace = drop_whitespace
    self.break_on_hyphens = break_on_hyphens

    # recompile the regexes for Unicode mode -- done in this clumsy way for
    # backwards compatibility because it's rather common to monkey-patch
    # the TextWrapper class' wordsep_re attribute.
    self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
    self.wordsep_simple_re_uni = re.compile(
        self.wordsep_simple_re.pattern, re.U)

# -- Private methods -----------------------------------------------
# (possibly useful for subclasses to override)

def __init__(self):
    # Initialize the standard TreebankWordTokenizer.
    super(self.__class__, self).__init__()
    # Adding to TreebankWordTokenizer, the splits on
    # - chevron quotes u'\xab' and u'\xbb'
    # - unicode quotes u'\u2018', u'\u2019', u'\u201c' and u'\u201d'
    improved_open_quote_regex = re.compile(u'([«“‘])', re.U)
    improved_close_quote_regex = re.compile(u'([»”’])', re.U)
    improved_punct_regex = re.compile(r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$', re.U)
    self.STARTING_QUOTES.insert(0, (improved_open_quote_regex, r' \1 '))
    self.ENDING_QUOTES.insert(0, (improved_close_quote_regex, r' \1 '))
    self.PUNCTUATION.insert(0, (improved_punct_regex, r'\1 \2 \3 '))

def translate(original):  # original <type 'unicode'>
    waittrans = re.findall(u'[?-?|?-?|?-?|?|?]+', original, re.U)
    findnum = len(waittrans)
    subnum = 0
    waitfill = original
    while (findnum != subnum):
        waitfill = re.sub(u'[?-?|?-?|?-?|?|?]+', "%s", waitfill, re.U)
        subnum = len(re.findall('%s', waitfill))
    # if len(re.findall('%',waitfill)) != subnum:
    waitfill = re.sub(u'%(?!s)', '?'.decode("utf-8"), waitfill)
    filltext = []
    print "workload", len(waittrans)
    for line in waittrans:
        if line in {u"?": "", u"?": ""}:
            filltext.append(line)
            continue
        send = line.encode("utf-8")
        gettrans = baidufanyi(send)
        if re.search(u"[???]", gettrans[-1]):
            gettrans = gettrans[0:-1]
        filltext.append(gettrans)
    translation = waitfill % tuple(filltext)
    translation = re.sub("?".decode("utf-8"), '%', translation)
    return translation

def tweetassembler(**args):
    in_reply_to_status = args['in_reply_to_status']
    if in_reply_to_status is not None:
        regex = u'.*??.*'
        if re.match(regex, in_reply_to_status.text, re.U):
            # ??????ID???
            id = in_reply_to_status.in_reply_to_status_id
            # ??????????????
            qkou_status = api.get_status(id)
            entities = qkou_status.entities['hashtags']
            # ????????????????
            if len(entities) > 0:
                hashtag = entities[0]['text']
                # ??????????????
                info_num = re.search("(?<=lec)[0-9]*", hashtag)
                news_num = re.search("(?<=news)[0-9]*", hashtag)
                if info_num is not None:
                    qkou_id = info_num.group()
                    log.debug("[ Stream ] Info??????")
                    dm_text = get_info(qkou_id)
                elif news_num is not None:
                    news_id = news_num.group()
                    log.debug("[ Stream ] News??????")
                    dm_text = get_news(news_id)
                else:
                    pass
                try:
                    api.send_direct_message(
                        user_id=in_reply_to_status.user.id,
                        text=dm_text)
                    log.debug('[ Stream ] DM???')
                except Exception as e:
                    log.exception(e)
    else:
        pass

def load(self):
    Pref.view = False
    Pref.elapsed_time = 0.4
    Pref.running = False
    Pref.wrdRx = re.compile(s.get('word_regexp', "^[^\w]?`*\w+[^\w]*$"), re.U)
    Pref.wrdRx = Pref.wrdRx.match
    Pref.splitRx = s.get('word_split', None)
    if Pref.splitRx:
        Pref.splitRx = re.compile(Pref.splitRx, re.U)
        Pref.splitRx = Pref.splitRx.findall
    Pref.enable_live_count = s.get('enable_live_count', True)
    Pref.enable_readtime = s.get('enable_readtime', False)
    Pref.enable_line_word_count = s.get('enable_line_word_count', False)
    Pref.enable_line_char_count = s.get('enable_line_char_count', False)
    Pref.enable_count_lines = s.get('enable_count_lines', False)
    Pref.enable_count_chars = s.get('enable_count_chars', False)
    Pref.enable_count_pages = s.get('enable_count_pages', True)
    Pref.words_per_page = s.get('words_per_page', 300)
    Pref.page_count_mode_count_words = s.get('page_count_mode_count_words', True)
    Pref.char_ignore_whitespace = s.get('char_ignore_whitespace', True)
    Pref.readtime_wpm = s.get('readtime_wpm', 200)
    Pref.whitelist = [x.lower() for x in s.get('whitelist_syntaxes', []) or []]
    Pref.blacklist = [x.lower() for x in s.get('blacklist_syntaxes', []) or []]
    Pref.strip = s.get('strip', [])
    for window in sublime.windows():
        for view in window.views():
            view.erase_status('WordCount')
            view.settings().erase('WordCount')

def __init__(self, version, pattern):
    self._version = version
    self._ip_rex = re.compile(r"(" + pattern + r")", re.U | re.I)
    self._cidr_rex = re.compile(r"\s*/\s*(\d{1,5})", re.U | re.I)
    self._range_rex = re.compile(r"\s*-\s*(" + pattern + r")", re.U | re.I)

def test_from_re(self):
    # re.U and re.S flags are implicitly set
    self.assertEqual(RegExp.from_re(re.compile("a", re.U)), RegExp("a"))
    self.assertEqual(RegExp.from_re(re.compile("a", re.S)), RegExp("a"))
    # re.I flag can be set explicitly
    self.assertEqual(
        RegExp.from_re(re.compile("a", re.I)),
        RegExp("a", ignore_case=True))
    # re.M, re.L and re.X are forbidden
    for flag in [re.M, re.L, re.X]:
        self.assertRaises(ValueError, RegExp.from_re, re.compile("a", flag))

def init(self, pattern, ignore_case=False):
    Atom.init(self)
    flags = re.U | re.S | (re.I if ignore_case else 0)
    self._regexp = re.compile(pattern, flags)

def format_regexp(format, regexp):
    escape_slash_rex = re.compile(r"((?:^|[^\\])(?:\\\\)*?)(/+)", re.U)

    def escape_slash(match):
        return match.group(1) + match.group(2).replace("/", "\\/")

    pattern = regexp.pattern
    pattern = escape_slash_rex.sub(escape_slash, pattern)
    result = "/" + pattern + "/"
    if regexp.ignore_case:
        result += "i"
    yield result

def escape_whitespace(unicode_string):
    r"""
    Return the given unicode string with the whitespace escaped using
    'unicode-escape' encoding.

    >>> escape_whitespace(u"space is not escaped")
    u'space is not escaped'
    >>> escape_whitespace(u"multi\nline\nwith\ttabs")
    u'multi\\nline\\nwith\\ttabs'
    """
    # re.sub()'s fourth positional argument is the replacement count,
    # so the Unicode flag must be passed as flags=.
    return re.sub(r"\s",
                  lambda x: unicode(x.group(0).encode("unicode-escape")),
                  unicode_string, flags=re.U)

def slugify(value, allow_unicode=False):
    """
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """
    value = force_text(value)
    if allow_unicode:
        value = unicodedata.normalize('NFKC', value)
        value = re.sub('[^\w\s-]', '', value, flags=re.U).strip().lower()
        return mark_safe(re.sub('[-\s]+', '-', value, flags=re.U))
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub('[^\w\s-]', '', value).strip().lower()
    return mark_safe(re.sub('[-\s]+', '-', value))

def remove_tags(html, tags):
    """Returns the given HTML with given tags removed."""
    warnings.warn(
        "django.utils.html.remove_tags() and the removetags template filter "
        "are deprecated. Consider using the bleach library instead.",
        RemovedInDjango110Warning, stacklevel=3
    )
    tags = [re.escape(tag) for tag in tags.split()]
    tags_re = '(%s)' % '|'.join(tags)
    starttag_re = re.compile(r'<%s(/?>|(\s+[^>]*>))' % tags_re, re.U)
    endtag_re = re.compile('</%s>' % tags_re)
    html = starttag_re.sub('', html)
    html = endtag_re.sub('', html)
    return html