Python re 模块,U 实例源码


项目:gransk    作者:pcbje    | 项目源码 | 文件源码
def setup(self, config):
    Compile configured regular expressions.

    :param config: Configuration object.
    :type config: ``dict``
    self.matches = {}

    patterns = []

    for entity_type, pattern_conf in config.get(helper.ENTITIES, {}).items():
          r'\b(?P<{}>{})\b'.format(entity_type, pattern_conf[helper.PATTERN]))

    self.pattern = regex.compile(
        regex.I | regex.U)
项目:SerpScrap    作者:ecoron    | 项目源码 | 文件源码
def split_into_sentences(text):
    potential_end_pat = re.compile(r"".join([
        r"([\w\.'’&\]\)]+[\.\?!])",  # A word that ends with punctuation
        r"([‘’“”'\"\)\]]*)",  # Followed by optional quote/parens/etc
        r"(\s+(?![a-z\-–—]))",  # Followed by whitespace + non-(lowercase or dash)
    dot_iter = re.finditer(potential_end_pat, text)
    end_indices = [
        (x.start() + len( + len(
        for x in dot_iter
        if is_sentence_ender(
    spans = zip([None] + end_indices, end_indices + [None])
    sentences = [
        text[start:end].strip() for start, end in spans
    return sentences
项目:mm-randbot    作者:arvego    | 项目源码 | 文件源码
def replace_wiki_links(text, raw_link=False):
    """Replace wiki-style ``[user_id|link_text]`` links found in *text*.

    :param text: text to process
    :param raw_link: when true, render each link as plain
        ``link_text (user_id)`` instead of an HTML anchor
    :return: *text* with every wiki link replaced

    Neither part of the link is HTML-escaped.
    """
    link_format = "{1} ({0})" if raw_link else "<a href=\"{0}\">{1}</a>"
    pattern = re.compile(r"\[([^|]+)\|([^|]+)\]", re.U)
    # Flags belong to re.compile() only.  The second positional argument of a
    # compiled pattern's findall()/sub() is a start position or replacement,
    # so passing re.U (== 32) there used to skip the first 32 characters.
    return pattern.sub(
        lambda match: link_format.format(match.group(1), match.group(2)),
        text)
项目:doork    作者:AeonDave    | 项目源码 | 文件源码
def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        div_ssb = soup.find('div', id='ssb')
        if not div_ssb:
            self._maybe_raise(ParseError, "Div with number of results was not found on Google search page", soup)
            return empty_info
        p = div_ssb.find('p')
        if not p:
            self._maybe_raise(ParseError, """<p> tag within <div id="ssb"> was not found on Google search page""", soup)
            return empty_info
        txt = ''.join(p.findAll(text=True))
        txt = txt.replace(',', '')
        matches ='%s (\d+) - (\d+) %s (?:%s )?(\d+)' % self._re_search_strings, txt, re.U)
        if not matches:
            return empty_info
        return {'from': int(, 'to': int(, 'total': int(}
项目:doork    作者:AeonDave    | 项目源码 | 文件源码
def _html_unescape(self, str):
        def entity_replacer(m):
            entity =
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])

        def ascii_replacer(m):
            cp = int(
            if cp <= 255:
                return unichr(cp)

        s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
项目:doork    作者:AeonDave    | 项目源码 | 文件源码
def _extract_info(self, soup):
        empty_info = {'from': 0, 'to': 0, 'total': 0}
        td_rsb = soup.find('td', 'rsb')
        if not td_rsb:
            self._maybe_raise(ParseError, "Td with number of results was not found on Blogs search page", soup)
            return empty_info
        font = td_rsb.find('font')
        if not font:
            self._maybe_raise(ParseError, """<p> tag within <tr class='rsb'> was not found on Blogs search page""", soup)
            return empty_info
        txt = ''.join(font.findAll(text=True))
        txt = txt.replace(',', '')
        if self.hl == 'es':
            matches ='Resultados (\d+) - (\d+) de (?:aproximadamente )?(\d+)', txt, re.U)
        elif self.hl == 'en':
            matches ='Results (\d+) - (\d+) of (?:about )?(\d+)', txt, re.U)
        if not matches:
            return empty_info
        return {'from': int(, 'to': int(, 'total': int(}
项目:doork    作者:AeonDave    | 项目源码 | 文件源码
def _html_unescape(self, str):
        def entity_replacer(m):
            entity =
            if entity in name2codepoint:
                return unichr(name2codepoint[entity])

        def ascii_replacer(m):
            cp = int(
            if cp <= 255:
                return unichr(cp)

        s =    re.sub(r'&#(\d+);',  ascii_replacer, str, re.U)
        return re.sub(r'&([^;]+);', entity_replacer, s, re.U)
项目:topic-ensemble    作者:derekgreene    | 项目源码 | 文件源码
def preprocess_simple( docs, stopwords, min_df = 3, min_term_length = 2, ngram_range = (1,1), apply_tfidf = True, apply_norm = True ):
    Preprocess a list containing text documents stored as strings, where the documents have already been tokenized and are separated by whitespace
    token_pattern = re.compile(r"[\s\-]+", re.U)

    def custom_tokenizer( s ):
        return [x.lower() for x in token_pattern.split(s) if (len(x) >= min_term_length) ]

    # Build the Vector Space Model, apply TF-IDF and normalize lines to unit length all in one call
    if apply_norm:
        norm_function = "l2"
        norm_function = None
    tfidf = TfidfVectorizer(stop_words=stopwords, lowercase=True, strip_accents="unicode", tokenizer=custom_tokenizer, use_idf=apply_tfidf, norm=norm_function, min_df = min_df, ngram_range = ngram_range) 
    X = tfidf.fit_transform(docs)
    terms = []
    # store the vocabulary map
    v = tfidf.vocabulary_
    for i in range(len(v)):
    for term in v.keys():
        terms[ v[term] ] = term
    return (X,terms)
项目:flinck    作者:Kraymer    | 项目源码 | 文件源码
def search_filename(fname, fields):
    """Extract movie title/date from filename and return dict with movies infos
    path_tokens = os.path.normpath(fname).split(os.sep)
    candidate = path_tokens[-1]
    res = re.split(FNAME_SPLIT_RE, candidate,
                   flags=re.I | re.U)[0].strip()
    res = scrub(res, '[({])}', ' ')
    res = ' '.join([x for x in re.split(r'[\s\._]', res, flags=re.U) if x])
    years = re.findall(r'((?:19|20)\d\d)', res)
    if years:
        toks = re.split(r'(%s)' % years[-1], res)
        toks = [res]
    title = toks[0].strip()
    year = toks[1] if len(toks) > 1 else None
    item = search_by(title, year, fields)
    if item:
        item['filename'] = fname
        return item
项目:tv_grab_es_movistartv    作者:MovistarTV    | 项目源码 | 文件源码
def __get_series_data(program, ext_info):
        episode = int(program['episode'])
        season = int(program['season'])
        desc = ext_info['synopsis'] if ext_info else u'Año: %s' % program['year']
        if season == 0:
            sn = re.findall(r'.*\sT(\d*/?\d+).*', program['full_title'], re.U)
            season = int(sn[0].replace('/', '')) if sn else season
        if 'episode_title' in program:
            title = program['serie']
            stitle = '%ix%02d %s' % (season, episode, program['episode_title'])
            title = re.findall(r'(.*)\sT\d*/?\d+.*', program['full_title'], re.U)
            title = title[0] if title else program['full_title']
            stitle = '%ix%02d %s' % (
                season, episode, ext_info['originalTitle']
                if ext_info and 'originalTitle' in ext_info else 'Episodio %i' % episode
        return {
            'title': title,
            'sub-title': stitle,
            'season': season if season > 0 else '',
            'episode': episode,
            'desc': desc
项目:oil    作者:oilshell    | 项目源码 | 文件源码
def test_ignore_case(self):
        """Check case-insensitive matching with ``re.I``.

        Covers plain literals, character sets, back-references and
        alternation.  When ``have_unicode`` is set, also checks that
        U+212A and U+017F match their ASCII case counterparts (and vice
        versa) under ``re.U | re.I``.
        """
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            assert u(r'\u212a').lower() == u'k' # U+212A KELVIN SIGN
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # U+017F LATIN SMALL LETTER LONG S
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
项目:oil    作者:oilshell    | 项目源码 | 文件源码
def test_ignore_case_set(self):
        """Check case-insensitive matching of character sets with ``re.I``.

        When ``have_unicode`` is set, repeats the checks with unicode
        patterns under ``re.U | re.I`` and verifies that U+212A and
        U+017F inside or against a set match their ASCII case
        counterparts.
        """
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # U+212A KELVIN SIGN
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # U+017F LATIN SMALL LETTER LONG S
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
项目:python2-tracer    作者:extremecoders-re    | 项目源码 | 文件源码
def test_ignore_case(self):
        """Check case-insensitive matching with ``re.I``.

        Covers plain literals, character sets, back-references and
        alternation.  When ``have_unicode`` is set, also checks that
        U+212A and U+017F match their ASCII case counterparts (and vice
        versa) under ``re.U | re.I``.
        """
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            assert u(r'\u212a').lower() == u'k' # U+212A KELVIN SIGN
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # U+017F LATIN SMALL LETTER LONG S
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
项目:python2-tracer    作者:extremecoders-re    | 项目源码 | 文件源码
def test_ignore_case_set(self):
        """Check case-insensitive matching of character sets with ``re.I``.

        When ``have_unicode`` is set, repeats the checks with unicode
        patterns under ``re.U | re.I`` and verifies that U+212A and
        U+017F inside or against a set match their ASCII case
        counterparts.
        """
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # U+212A KELVIN SIGN
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # U+017F LATIN SMALL LETTER LONG S
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
项目:web_ctp    作者:molebot    | 项目源码 | 文件源码
def test_sre_character_class_literals(self):
        """Check numeric character escapes inside a character class.

        For a spread of code points, verifies that octal, two-digit hex,
        four-digit ``u`` and eight-digit ``U`` escapes match the
        corresponding character, also when the escape is directly
        followed by another literal character.  Finally checks an
        astral-plane range and that malformed or out-of-range escapes
        raise ``re.error``.
        """
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            # Octal and \xhh escapes are only exercised for values below 256.
            if i < 256:
                self.assertIsNotNone(re.match(r"[\%o]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%o8]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%03o]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\x%02xz]" % i, chr(i)))
            # \uhhhh is only exercised for values that fit in four hex digits.
            if i < 0x10000:
                self.assertIsNotNone(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertIsNotNone(re.match(r"[\u%04xz]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertIsNotNone(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertIsNotNone(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        self.assertIsNotNone(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
        # Malformed or out-of-range escapes must be rejected.
        self.assertRaises(re.error, re.match, r"[\911]", "")
        self.assertRaises(re.error, re.match, r"[\x1z]", "")
        self.assertRaises(re.error, re.match, r"[\u123z]", "")
        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
        self.assertRaises(re.error, re.match, r"[\U00110000]", "")
项目:web_ctp    作者:molebot    | 项目源码 | 文件源码
def test_sre_byte_literals(self):
        """Check numeric escapes in bytes patterns.

        Verifies that three-digit octal and two-digit hex escapes match
        the corresponding byte, also when followed by another literal
        byte; that ``u`` and ``U`` escapes match the plain letters in a
        bytes pattern; several short octal forms; and that malformed
        escapes raise ``re.error``.
        """
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertIsNotNone(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertIsNotNone(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertIsNotNone(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertIsNotNone(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertIsNotNone(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertIsNotNone(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        # In a bytes pattern \u and \U are expected to match the letters themselves.
        self.assertIsNotNone(re.match(br"\u", b'u'))
        self.assertIsNotNone(re.match(br"\U", b'U'))
        self.assertIsNotNone(re.match(br"\0", b"\000"))
        self.assertIsNotNone(re.match(br"\08", b"\0008"))
        self.assertIsNotNone(re.match(br"\01", b"\001"))
        self.assertIsNotNone(re.match(br"\018", b"\0018"))
        # \567 is expected to wrap to the low eight bits (0o167).
        self.assertIsNotNone(re.match(br"\567", bytes([0o167])))
        self.assertRaises(re.error, re.match, br"\911", b"")
        self.assertRaises(re.error, re.match, br"\x1", b"")
        self.assertRaises(re.error, re.match, br"\x1z", b"")
项目    作者:Ideneal    | 项目源码 | 文件源码
def m3u2list(data):
    """Convert m3u playlist text to a list of channel dicts.

    Every ``#EXTINF`` entry yields one dict with the keys ``display_name``
    and ``url``, plus one key per ``name="value"`` attribute found on the
    ``#EXTINF`` line.  Attribute names are lower-cased and ``-`` is
    replaced by ``_``; attribute values are stripped of whitespace.

    :param data: full text of the m3u file
    :return: list of dicts, in file order (empty if no entry matches)
    """
    flags = re.I | re.M | re.U | re.S
    entry_re = re.compile(r'^#EXTINF:-?[0-9]*(.*?),(.*?)\n(.*?)$', flags)
    attr_re = re.compile(r' (.+?)="(.+?)"', flags)

    playlist = []
    for params, display_name, url in entry_re.findall(data):
        item_data = {'display_name': display_name, 'url': url}
        for field, value in attr_re.findall(params):
            item_data[field.strip().lower().replace('-', '_')] = value.strip()
        # Each parsed entry has to be collected; without this the function
        # always returned an empty list.
        playlist.append(item_data)
    return playlist
项目:pefile.pypy    作者:cloudtracer    | 项目源码 | 文件源码
def test_ignore_case(self):
        """Check case-insensitive matching with ``re.I``.

        Covers plain literals, character sets, back-references and
        alternation.  When ``have_unicode`` is set, also checks that
        U+212A and U+017F match their ASCII case counterparts (and vice
        versa) under ``re.U | re.I``.
        """
        self.assertEqual(re.match("abc", "ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match("abc", u"ABC", re.I).group(0), "ABC")
        self.assertEqual(re.match(r"(a\s[^a])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[^a]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"(a\s[abc])", "a b", re.I).group(1), "a b")
        self.assertEqual(re.match(r"(a\s[abc]*)", "a bb", re.I).group(1), "a bb")
        self.assertEqual(re.match(r"((a)\s\2)", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s\2*)", "a aa", re.I).group(1), "a aa")
        self.assertEqual(re.match(r"((a)\s(abc|a))", "a a", re.I).group(1), "a a")
        self.assertEqual(re.match(r"((a)\s(abc|a)*)", "a aa", re.I).group(1), "a aa")

        if have_unicode:
            assert u(r'\u212a').lower() == u'k' # U+212A KELVIN SIGN
            self.assertTrue(re.match(ur'K', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(ur'k', u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'\u212a'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # U+017F LATIN SMALL LETTER LONG S
            self.assertTrue(re.match(ur'S', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur's', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'\u017f'), u's', re.U | re.I))
项目:pefile.pypy    作者:cloudtracer    | 项目源码 | 文件源码
def test_ignore_case_set(self):
        """Check case-insensitive matching of character sets with ``re.I``.

        When ``have_unicode`` is set, repeats the checks with unicode
        patterns under ``re.U | re.I`` and verifies that U+212A and
        U+017F inside or against a set match their ASCII case
        counterparts.
        """
        self.assertTrue(re.match(r'[19A]', 'A', re.I))
        self.assertTrue(re.match(r'[19a]', 'a', re.I))
        self.assertTrue(re.match(r'[19a]', 'A', re.I))
        self.assertTrue(re.match(r'[19A]', 'a', re.I))
        if have_unicode:
            self.assertTrue(re.match(ur'[19A]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'a', re.U | re.I))
            self.assertTrue(re.match(ur'[19a]', u'A', re.U | re.I))
            self.assertTrue(re.match(ur'[19A]', u'a', re.U | re.I))
            assert u(r'\u212a').lower() == u'k' # U+212A KELVIN SIGN
            self.assertTrue(re.match(u(r'[19K]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19k]'), u(r'\u212a'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'K', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u212a]'), u'k', re.U | re.I))
            assert u(r'\u017f').upper() == u'S' # U+017F LATIN SMALL LETTER LONG S
            self.assertTrue(re.match(ur'[19S]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(ur'[19s]', u(r'\u017f'), re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u'S', re.U | re.I))
            self.assertTrue(re.match(u(r'[19\u017f]'), u's', re.U | re.I))
项目:dbs-back    作者:Beit-Hatfutsot    | 项目源码 | 文件源码
def is_hebrew(string):
    """Report whether *string* starts with a Hebrew letter.

    A hacky check: digits and non-word characters are dropped first, then
    only the first remaining character is inspected.

    :param string: text, or UTF-8 encoded bytes, to classify
    :return: ``True`` if the first remaining character is a letter of the
        Hebrew alphabet (U+05D0 alef .. U+05EA tav, final forms included),
        ``False`` if it is anything else, and ``None`` if nothing remains
        after stripping
    """
    # Decode before filtering so the Unicode-aware patterns below operate on
    # characters rather than on the individual bytes of a UTF-8 sequence.
    if isinstance(string, bytes):
        string = string.decode('utf-8')
    # Drop digits from the string
    string = re.sub(r'\d', '', string)
    # Drop special characters from the string
    string = re.sub(r'\W', '', string, flags=re.U)
    # Strip the string
    string = string.strip()
    # Support empty strings
    if not string:
        return None
    # The 27 letters of the Hebrew alphabet occupy one contiguous range.
    return u'\u05d0' <= string[0] <= u'\u05ea'
项目:ouroboros    作者:pybee    | 项目源码 | 文件源码
def test_sre_character_class_literals(self):
        """Check numeric character escapes inside a character class.

        For a spread of code points, verifies that octal, two-digit hex,
        four-digit ``u`` and eight-digit ``U`` escapes match the
        corresponding character, also when the escape is directly
        followed by another literal character.  Finally checks an
        astral-plane range and that malformed or out-of-range escapes
        raise ``re.error``.
        """
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            # Octal and \xhh escapes are only exercised for values below 256.
            if i < 256:
                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
            # \uhhhh is only exercised for values that fit in four hex digits.
            if i < 0x10000:
                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
        # Malformed or out-of-range escapes must be rejected.
        self.assertRaises(re.error, re.match, r"[\911]", "")
        self.assertRaises(re.error, re.match, r"[\x1z]", "")
        self.assertRaises(re.error, re.match, r"[\u123z]", "")
        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
        self.assertRaises(re.error, re.match, r"[\U00110000]", "")
项目:ouroboros    作者:pybee    | 项目源码 | 文件源码
def test_sre_byte_literals(self):
        """Check numeric escapes in bytes patterns.

        Verifies that three-digit octal and two-digit hex escapes match
        the corresponding byte, also when followed by another literal
        byte; that ``u`` and ``U`` escapes match the plain letters in a
        bytes pattern; several short octal forms; and that malformed
        escapes raise ``re.error``.
        """
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        # In a bytes pattern \u and \U are expected to match the letters themselves.
        self.assertTrue(re.match(br"\u", b'u'))
        self.assertTrue(re.match(br"\U", b'U'))
        self.assertTrue(re.match(br"\0", b"\000"))
        self.assertTrue(re.match(br"\08", b"\0008"))
        self.assertTrue(re.match(br"\01", b"\001"))
        self.assertTrue(re.match(br"\018", b"\0018"))
        # \567 is expected to wrap to the low eight bits (0o167).
        self.assertTrue(re.match(br"\567", bytes([0o167])))
        self.assertRaises(re.error, re.match, br"\911", b"")
        self.assertRaises(re.error, re.match, br"\x1", b"")
        self.assertRaises(re.error, re.match, br"\x1z", b"")
项目:completor.vim    作者:maralla    | 项目源码 | 文件源码
def has_omnifunc(self, ft):
        if ft not in self.trigger_cache:
            name = '{}_omni_trigger'.format(ft)
            option = self.get_option(name)
            if not option:
                return False

                self.trigger_cache[ft] = re.compile(
                    to_unicode(option, 'utf-8'), re.X | re.U)
            except Exception:
                return False

            return bool(vim.current.buffer.options['omnifunc'])
        except vim.error:
            return False
项目:kbe_server    作者:xiaohaoppy    | 项目源码 | 文件源码
def test_sre_character_class_literals(self):
        """Check numeric character escapes inside a character class.

        For a spread of code points, verifies that octal, two-digit hex,
        four-digit ``u`` and eight-digit ``U`` escapes match the
        corresponding character, also when the escape is directly
        followed by another literal character.  Finally checks an
        astral-plane range and that malformed or out-of-range escapes
        raise ``re.error``.
        """
        for i in [0, 8, 16, 32, 64, 127, 128, 255, 256, 0xFFFF, 0x10000, 0x10FFFF]:
            # Octal and \xhh escapes are only exercised for values below 256.
            if i < 256:
                self.assertTrue(re.match(r"[\%o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\%03o8]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\x%02xz]" % i, chr(i)))
            # \uhhhh is only exercised for values that fit in four hex digits.
            if i < 0x10000:
                self.assertTrue(re.match(r"[\u%04x]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04x0]" % i, chr(i)))
                self.assertTrue(re.match(r"[\u%04xz]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x]" % i, chr(i)))
            self.assertTrue(re.match(r"[\U%08x0]" % i, chr(i)+"0"))
            self.assertTrue(re.match(r"[\U%08xz]" % i, chr(i)+"z"))
        self.assertTrue(re.match(r"[\U0001d49c-\U0001d4b5]", "\U0001d49e"))
        # Malformed or out-of-range escapes must be rejected.
        self.assertRaises(re.error, re.match, r"[\911]", "")
        self.assertRaises(re.error, re.match, r"[\x1z]", "")
        self.assertRaises(re.error, re.match, r"[\u123z]", "")
        self.assertRaises(re.error, re.match, r"[\U0001234z]", "")
        self.assertRaises(re.error, re.match, r"[\U00110000]", "")
项目:kbe_server    作者:xiaohaoppy    | 项目源码 | 文件源码
def test_sre_byte_literals(self):
        """Check numeric escapes in bytes patterns.

        Verifies that three-digit octal and two-digit hex escapes match
        the corresponding byte, also when followed by another literal
        byte; that ``u`` and ``U`` escapes match the plain letters in a
        bytes pattern; several short octal forms; and that malformed
        escapes raise ``re.error``.
        """
        for i in [0, 8, 16, 32, 64, 127, 128, 255]:
            self.assertTrue(re.match((r"\%03o" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\%03o0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\%03o8" % i).encode(), bytes([i])+b"8"))
            self.assertTrue(re.match((r"\x%02x" % i).encode(), bytes([i])))
            self.assertTrue(re.match((r"\x%02x0" % i).encode(), bytes([i])+b"0"))
            self.assertTrue(re.match((r"\x%02xz" % i).encode(), bytes([i])+b"z"))
        # In a bytes pattern \u and \U are expected to match the letters themselves.
        self.assertTrue(re.match(br"\u", b'u'))
        self.assertTrue(re.match(br"\U", b'U'))
        self.assertTrue(re.match(br"\0", b"\000"))
        self.assertTrue(re.match(br"\08", b"\0008"))
        self.assertTrue(re.match(br"\01", b"\001"))
        self.assertTrue(re.match(br"\018", b"\0018"))
        # \567 is expected to wrap to the low eight bits (0o167).
        self.assertTrue(re.match(br"\567", bytes([0o167])))
        self.assertRaises(re.error, re.match, br"\911", b"")
        self.assertRaises(re.error, re.match, br"\x1", b"")
        self.assertRaises(re.error, re.match, br"\x1z", b"")
项目:QTodoTxt2    作者:QTodoTxt    | 项目源码 | 文件源码
def compile(searchString):
        r"""
        Return the user's searchString compiled to a regular expression.

        Example terms: @call +work (A) carrots
        Term may be prefixed with ! or ~ for negation.
        Terms may be combined with "," or " " (AND) or with "|" (OR).
        Terms only match the beginning of a word in the task.
        Terms are case-insensitive.
        Expressions may NOT be nested with parentheses.
        Only \-character special regular expression sets are allowed, everything else is escaped.

        Returns None when searchString is empty or None.
        """
        if not searchString:
            return None

        # Split the query into terms, then translate each term into its
        # regular-expression fragment before joining them into one pattern.
        terms = SimpleTextFilter._splitter.split(searchString)
        terms = [SimpleTextFilter._term2re(term) for term in terms]

        return re.compile("".join(terms), re.I | re.U)
项目:formpack    作者:kobotoolbox    | 项目源码 | 文件源码
def slugify(string, separator=r'-'):
    """Slugify a unicode string using unicodedata to normalize the string.

    The string is passed through ``normalize()``, every character that is
    not a word character, whitespace or part of *separator* is removed, the
    result is stripped and lower-cased, and each run of whitespace and/or
    separator characters is collapsed into a single *separator*.

    :param string: text to slugify
    :param separator: text placed between words (default ``-``); it is
        treated literally, never as regular-expression syntax
    :return: the slug
    """
    string = normalize(string)
    # Escape the separator so characters such as "^", "]" or "\" cannot
    # change the meaning of the character classes built below.
    sep_class = re.escape(separator)
    string = re.sub(r'[^\w\s' + sep_class + r']', '', string, flags=re.U)
    string = string.strip().lower()
    # A callable replacement keeps the separator literal (no backslash
    # processing of the replacement template).
    return re.sub(r'[' + sep_class + r'\s]+', lambda _match: separator,
                  string, flags=re.U)
项目:t-hoarder_kit    作者:congosto    | 项目源码 | 文件源码
def token_words (self,source):
  #renove urls from tweet
    urls=re.findall (r'(http[s]*://\S+)', source,re.U)
    for url in urls:
    list_tokens=re.findall (r'[#@]*\w+', source_without_urls,re.U) 
#  remove users and hashtags
    for token in list_tokens:
      if (token.find('#') == -1) and (token.find('@') == -1):
        if not number:
    return list_words
项目:t-hoarder_kit    作者:congosto    | 项目源码 | 文件源码
def set_user_mention_day(self,date,text):
    list_mentions=re.findall (r'@\w+', text)
    if len (list_mentions) >0:
      if re.match(r'[\.]*(@\w+)[^\t\n]+',text):
        if user in self.top_users_reply:
          index= self.top_users_reply.index(user)
      elif re.match('[rt[\s]*(@\w+)[:]*',text,re.U):
        if user in self.top_users_RT:
          index= self.top_users_RT.index(user)
      for user in list_mentions:
        if user in self.top_users_mention:
          index= self.top_users_mention.index(user)
项目:t-hoarder_kit    作者:congosto    | 项目源码 | 文件源码
def get_tweet (tweet):
   """Parse one tab-separated tweet record.

   The record must have at least 10 fields: tweet id, timestamp
   (``YYYY-MM-DD HH:MM:SS``), author, text, app, user id, followers,
   following, statuses and location.

   Returns the tuple ``(year, month, day, hour, minutes, seconds, author,
   text, app, user_id, followers, following, statuses, loc)`` with every
   element as a string, or ``None`` (after printing a notice) when the
   record has fewer than 10 fields.  Raises ``IndexError`` when the
   timestamp field does not contain a date and time.
   """
   data = tweet.split('\t')
   if len (data) < 10:
     # Parenthesised form prints the same text on Python 2 and Python 3.
     print (' tweet not match')
     return None
   timestamp = data[1]
   date_hour = re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)',timestamp,re.U)
   (year,month,day,hour,minutes,seconds) = date_hour[0]
   author = data[2]
   text = data[3]
   app = data[4]
   # Field 5 is the user id; it used to be read from field 6, which
   # duplicated the followers count.
   user_id = data[5]
   followers = data[6]
   following = data[7]
   statuses = data[8]
   loc = data[9]
   return (year,month,day,hour,minutes,seconds, author,text,app,user_id,followers,following,statuses,loc)
项目:t-hoarder_kit    作者:congosto    | 项目源码 | 文件源码
def get_tweet (tweet):
   """Parse one tab-separated tweet record.

   The record must have at least 8 fields: tweet id, timestamp
   (``YYYY-MM-DD HH:MM:SS``), author, text, app, user id, followers and
   following.

   Returns the tuple ``(id_tweet, year, month, day, hour, minutes, seconds,
   author, text, app, id_user, followers, following)`` with every element
   as a string, or ``None`` (after printing a notice) when the record has
   fewer than 8 fields.  Raises ``IndexError`` when the timestamp field
   does not contain a date and time.
   """
   data = tweet.split('\t')
   if len (data) < 8:
     # Parenthesised form prints the same text on Python 2 and Python 3.
     print (' tweet not match')
     return None
   id_tweet = data[0]
   timestamp = data[1]
   date_hour = re.findall(r'(\d\d\d\d)-(\d\d)-(\d\d)\s(\d\d):(\d\d):(\d\d)',timestamp,re.U)
   (year,month,day,hour,minutes,seconds) = date_hour[0]
   author = data[2]
   text = data[3]
   app = data[4]
   id_user = data[5]
   followers = data[6]
   following = data[7]
   return (id_tweet,year,month,day,hour,minutes,seconds, author,text,app,id_user,followers,following)
项目:t-hoarder_kit    作者:congosto    | 项目源码 | 文件源码
def get_tweet_source (text):
  while  start !=  -1:
    #print start
    #print text
    RT = re.match('[RT[\s]*(@\w+)[:]*',text,re.U)
    if RT:
      #print text_aux
      #print source
  return (source, text_aux)
项目:Albireo    作者:lordfriend    | 项目源码 | 文件源码
def __parse_episode_number(self, eps_title):
        parse the episode number from episode title, it use a list of regular expressions. the position in the list
        is the priority of the regular expression.
        :param eps_title: the title of episode.
        :return: episode number if matched, otherwise, -1
            for regex in episode_regex_tuple:
                search_result =, eps_title, re.U | re.I)
                if search_result is not None:
                    return int(

            return -1
        except Exception:
            return -1
项目:Albireo    作者:lordfriend    | 项目源码 | 文件源码
def parse_episode_number(self, eps_title):
        parse the episode number from episode title, it use a list of regular expressions. the position in the list
        is the priority of the regular expression.
        :param eps_title: the title of episode.
        :return: episode number if matched, otherwise, -1
            for regex in episode_regex_tuple:
                search_result =, eps_title, re.U | re.I)
                if search_result is not None:
                    matched_number = int(
                    if self.bangumi.eps_no_offset is not None:
                        matched_number = matched_number + self.bangumi.eps_no_offset
                    return matched_number

            return -1
        except Exception as error:
            return -1
项目:Albireo    作者:lordfriend    | 项目源码 | 文件源码
def parse_episode_number(self, eps_title):
        # NOTE(review): this listing is incomplete -- the docstring delimiters,
        # the `try:` that pairs with the `except` below, and parts of the
        # `search_result =` and `int(` expressions are missing, so the block
        # does not parse as written.
        # Visible intent: walk `episode_regex_tuple` in order (earlier entries
        # take priority), apply each pattern to `eps_title` with
        # re.U | re.I, and return the first hit converted to int.
        parse the episode number from episode title, it use a list of regular expressions. the position in the list
        is the priority of the regular expression.
        :param eps_title: the title of episode.
        :return: episode number if matched, otherwise, -1
            for regex in episode_regex_tuple:
                search_result =, eps_title, re.U | re.I)
                if search_result is not None:
                    return int(

            # No pattern matched.
            return -1
        except Exception:
            # Any failure is reported the same way as "no match".
            return -1
项目:pythainlp    作者:PyThaiNLP    | 项目源码 | 文件源码
def normalize(text):
    # Returns `text` after iterating over the rule tables `rule2py2` (when
    # six.PY2 is true), `rule2`, and `rule1` zipped with itself.
    # NOTE(review): this listing is incomplete -- the docstring delimiters,
    # the `else:` between the two `rule2*` loops and every loop body are
    # missing, and the non-ASCII docstring text shows up as '?'. What each
    # rule does to `text` cannot be determined from this excerpt.
    ?????? str
    >>> print(normalize("?????")=="????") # ? ? ? ? ? ??? ????
    if six.PY2:
        for data in rule2py2:
        for data in rule2:
    for data in list(zip(rule1,rule1)):
    return text
项目:kinect-2-libras    作者:inessadl    | 项目源码 | 文件源码
def __init__(self,
             width=70,
             initial_indent="",
             subsequent_indent="",
             expand_tabs=True,
             replace_whitespace=True,
             fix_sentence_endings=False,
             break_long_words=True,
             drop_whitespace=True,
             break_on_hyphens=True):
        """Store the wrapping options and build the Unicode word splitters.

        The parameter list mirrors the standard library's
        ``textwrap.TextWrapper`` constructor; every argument is stored
        unchanged on the instance under the same name.

        :param width: maximum length of a wrapped line.
        :param initial_indent: prefix for the first output line.
        :param subsequent_indent: prefix for every line after the first.
        :param expand_tabs: expand tabs before wrapping.
        :param replace_whitespace: replace each whitespace character with a space.
        :param fix_sentence_endings: force two spaces after sentence endings.
        :param break_long_words: allow splitting words longer than ``width``.
        :param drop_whitespace: drop whitespace at line starts and ends.
        :param break_on_hyphens: allow breaking compound words at hyphens.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)
项目:earthy    作者:alvations    | 项目源码 | 文件源码
def __init__(self):
        """Extend the inherited tokenizer rules with Unicode quote handling.

        After the parent initializer has run, one extra (regex, replacement)
        pair is pushed to the front of each of ``STARTING_QUOTES``,
        ``ENDING_QUOTES`` and ``PUNCTUATION`` so that chevron quotes
        (u'\\xab', u'\\xbb') and curly quotes (u'\\u2018', u'\\u2019',
        u'\\u201c', u'\\u201d') are split off like their ASCII counterparts.
        """
        # NOTE(review): super(self.__class__, self) resolves against the
        # runtime class, so it recurses forever if this class is subclassed.
        super(self.__class__, self).__init__()

        # (target rule list, pattern, replacement) -- each entry is inserted
        # at index 0 so it takes precedence over the inherited rules.
        extra_rules = (
            (self.STARTING_QUOTES, u'([«“‘])', r' \1 '),
            (self.ENDING_QUOTES, u'([»”’])', r' \1 '),
            (self.PUNCTUATION,
             r'([^\.])(\.)([\]\)}>"\'' u'»”’ ' r']*)\s*$',
             r'\1 \2 \3 '),
        )
        for rule_list, pattern_text, replacement in extra_rules:
            rule_list.insert(0, (re.compile(pattern_text, re.U), replacement))
项目:idol    作者:nondanee    | 项目源码 | 文件源码
def translate(original):#original <type 'unicode'>
    # Collects the runs of `original` matched by the character-class pattern,
    # sends each one to `baidufanyi`, and returns `original` with the
    # translated pieces filled back in through a "%s" template.
    # NOTE(review): this listing is incomplete -- several loop/branch bodies
    # and the assignment of `filltext` are missing, and the non-ASCII literals
    # appear to have been replaced by '?'. Python 2 code (print statement,
    # str.decode).
    waittrans = re.findall(u'[?-?|?-?|?-?|?|?]+',original,re.U)
    findnum = len(waittrans)
    subnum = 0
    waitfill = original
        # NOTE(review): re.U lands in re.sub's fourth positional parameter,
        # which is `count`, not `flags` -- confirm against the re.sub signature.
        waitfill = re.sub(u'[?-?|?-?|?-?|?|?]+',"%s",waitfill,re.U)
        subnum = len(re.findall('%s',waitfill))
    # if len(re.findall('%',waitfill)) != subnum:
    # Replace every '%' not followed by 's' so the later %-formatting only
    # sees the "%s" placeholders.
    waitfill = re.sub(u'%(?!s)','?'.decode("utf-8"),waitfill)

    print "workload",len(waittrans)
    for line in waittrans:     

        if line in { u"?" : "", u"?" : ""}:

        send = line.encode("utf-8") 
        gettrans = baidufanyi(send)

            # Drops the last character of the translation.
            gettrans = gettrans[0:-1]              


    translation = waitfill %tuple(filltext)
    # Restore the '%' characters that were swapped out before formatting.
    translation = re.sub("?".decode("utf-8"),'%',translation)
    return translation
项目:Qkou_kit    作者:pddg    | 项目源码 | 文件源码
def tweetassembler(**args):
    # Reads `in_reply_to_status` from the keyword arguments and, when its text
    # matches `regex`, looks up the status it replies to and builds a direct
    # message from that status's first hashtag.
    # NOTE(review): this listing is incomplete -- the `info_num =` /
    # `news_num =` / `qkou_id =` / `news_id =` expressions, the `try:` that
    # pairs with the final `except`, and the call that sends the message are
    # truncated, and the excerpt ends inside the `except` clause. Non-ASCII
    # text shows up as '?'.
    in_reply_to_status = args['in_reply_to_status']
    if in_reply_to_status is not None:
        regex = u'.*??.*'
        if re.match(regex, in_reply_to_status.text, re.U):
            # ID of the status that the incoming status replies to
            id = in_reply_to_status.in_reply_to_status_id
            # Fetch that status through the API
            qkou_status = api.get_status(id)
            entities = qkou_status.entities['hashtags']
            # Only proceed when the fetched status carries at least one hashtag
            if len(entities) > 0:
                hashtag = entities[0]['text']
                # Digits following "lec" / "news" in the hashtag (lookbehind patterns)
                info_num ="(?<=lec)[0-9]*", hashtag)
                news_num ="(?<=news)[0-9]*", hashtag)
                if info_num is not None:
                    qkou_id =
                    log.debug("[ Stream ] Info??????")
                    dm_text = get_info(qkou_id)
                elif news_num is not None:
                    news_id =
                    log.debug("[ Stream ] News??????")
                    dm_text = get_news(news_id)
              , text=dm_text)
                    log.debug('[ Stream ] DM???')
                except Exception as e:
项目:sublime-text-3-packages    作者:nickjj    | 项目源码 | 文件源码
def load(self):
        # Resets the runtime state on `Pref` and copies every option from the
        # settings object `s` onto it, falling back to the defaults given here.
        # NOTE(review): `s` is not defined in this excerpt, and the trailing
        # `for window in` loop is cut off before its iterable and body.
        Pref.view                   = False
        Pref.elapsed_time           = 0.4
        Pref.running                = False

        # Compile the word pattern once and keep only its bound `match` method.
        Pref.wrdRx                  = re.compile(s.get('word_regexp', "^[^\w]?`*\w+[^\w]*$"), re.U)
        Pref.wrdRx                  = Pref.wrdRx.match
        # The split pattern is optional; when present keep its bound `findall`.
        Pref.splitRx                = s.get('word_split', None)
        if Pref.splitRx:
            Pref.splitRx            = re.compile(Pref.splitRx, re.U)
            Pref.splitRx            = Pref.splitRx.findall

        Pref.enable_live_count      = s.get('enable_live_count', True)
        Pref.enable_readtime        = s.get('enable_readtime', False)
        Pref.enable_line_word_count = s.get('enable_line_word_count', False)
        Pref.enable_line_char_count = s.get('enable_line_char_count', False)
        Pref.enable_count_lines     = s.get('enable_count_lines', False)
        Pref.enable_count_chars     = s.get('enable_count_chars', False)
        Pref.enable_count_pages     = s.get('enable_count_pages', True)

        Pref.words_per_page         = s.get('words_per_page', 300)
        Pref.page_count_mode_count_words = s.get('page_count_mode_count_words', True)
        Pref.char_ignore_whitespace = s.get('char_ignore_whitespace', True)
        Pref.readtime_wpm           = s.get('readtime_wpm', 200)
        # `or []` also covers a setting that is present but falsy (e.g. None).
        Pref.whitelist              = [x.lower() for x in s.get('whitelist_syntaxes', []) or []]
        Pref.blacklist              = [x.lower() for x in s.get('blacklist_syntaxes', []) or []]
        Pref.strip                  = s.get('strip', [])

        for window in
            for view in window.views():
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def __init__(self, version, pattern):
        """Remember *version* and precompile the three matchers built on *pattern*.

        :param version: stored unchanged as ``self._version``.
        :param pattern: regular-expression source for a single address; it is
            wrapped in a capturing group wherever it is used.
        """
        self._version = version

        # All three expressions are compiled Unicode-aware and case-insensitive.
        flags = re.U | re.I
        captured = r"(" + pattern + r")"

        # A bare address.
        self._ip_rex = re.compile(captured, flags)
        # A "/<1-5 digits>" suffix, whitespace allowed around the slash.
        self._cidr_rex = re.compile(r"\s*/\s*(\d{1,5})", flags)
        # A "- <address>" suffix, whitespace allowed around the dash.
        self._range_rex = re.compile(r"\s*-\s*" + captured, flags)
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def test_from_re(self):
        """RegExp.from_re accepts only the flags RegExp itself can express."""
        # re.U and re.S flags are implicitly set
        self.assertEqual(RegExp.from_re(re.compile("a", re.U)), RegExp("a"))
        self.assertEqual(RegExp.from_re(re.compile("a", re.S)), RegExp("a"))

        # re.I flag can be set explicitly
        self.assertEqual(
            RegExp.from_re(re.compile("a", re.I)),
            RegExp("a", ignore_case=True))

        # re.M, re.L and re.X are forbidden
        for flag in [re.M, re.L, re.X]:
            self.assertRaises(ValueError, RegExp.from_re, re.compile("a", flag))
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def init(self, pattern, ignore_case=False):
        """Compile *pattern* and keep the result on ``self._regexp``.

        The expression is always compiled with ``re.U`` and ``re.S``;
        ``re.I`` is added only when *ignore_case* is truthy.
        """
        flags = re.U | re.S
        if ignore_case:
            flags = flags | re.I
        self._regexp = re.compile(pattern, flags)
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def format_regexp(format, regexp):
    """Yield *regexp* rendered as a single ``/pattern/`` literal.

    Slashes inside the pattern that are not already backslash-escaped are
    escaped so they cannot terminate the literal early; an ``i`` suffix is
    appended when ``regexp.ignore_case`` is true.

    :param format: not used by this function.
    :param regexp: object exposing ``pattern`` and ``ignore_case``.
    :return: generator producing exactly one string.
    """
    # Group 1: start of string or a non-backslash character followed by an
    # even number of backslashes (i.e. nothing that escapes what follows).
    # Group 2: the run of bare slashes that needs escaping.
    escape_slash_rex = re.compile(r"((?:^|[^\\])(?:\\\\)*?)(/+)", re.U)

    def escape_slash(match):
        # Keep the prefix untouched and escape every slash in the run.
        return match.group(1) + match.group(2).replace("/", "\\/")

    pattern = regexp.pattern
    pattern = escape_slash_rex.sub(escape_slash, pattern)

    result = "/" + pattern + "/"
    if regexp.ignore_case:
        result += "i"
    yield result
项目:abusehelper    作者:Exploit-install    | 项目源码 | 文件源码
def escape_whitespace(unicode_string):
    r"""
    Return the given unicode string with the whitespace escaped
    using 'unicode-escape' encoding.

    >>> escape_whitespace(u"space is not escaped")
    u'space is not escaped'

    >>> escape_whitespace(u"multi\nline\nwith\ttabs")
    u'multi\\nline\\nwith\\ttabs'
    """

    def _escape(match):
        # 'unicode-escape' output is pure ASCII, so decoding it as ASCII is
        # lossless and gives a text string on both Python 2 and Python 3.
        return match.group(0).encode("unicode-escape").decode("ascii")

    # `flags` must be passed by keyword: the fourth positional parameter of
    # re.sub is `count`, so a positional re.U would cap the number of
    # replacements instead of enabling Unicode matching.
    return re.sub(r"\s", _escape, unicode_string, flags=re.U)
项目:CodingDojo    作者:ComputerSocietyUNB    | 项目源码 | 文件源码
def slugify(value, allow_unicode=False):
    """
    Convert to ASCII if 'allow_unicode' is False. Convert spaces to hyphens.
    Remove characters that aren't alphanumerics, underscores, or hyphens.
    Convert to lowercase. Also strip leading and trailing whitespace.
    """
    value = force_text(value)
    if allow_unicode:
        # NFKC keeps non-ASCII letters; \w and \s then need Unicode semantics.
        value = unicodedata.normalize('NFKC', value)
        value = re.sub(r'[^\w\s-]', '', value, flags=re.U).strip().lower()
        return mark_safe(re.sub(r'[-\s]+', '-', value, flags=re.U))
    # NFKD splits accented characters so the ASCII round-trip drops only the
    # combining marks, not the base letters.
    value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
    value = re.sub(r'[^\w\s-]', '', value).strip().lower()
    return mark_safe(re.sub(r'[-\s]+', '-', value))
项目:CodingDojo    作者:ComputerSocietyUNB    | 项目源码 | 文件源码
def remove_tags(html, tags):
    """Returns the given HTML with given tags removed.

    Deprecated: emits ``RemovedInDjango110Warning`` on every call.

    :param html: markup to strip.
    :param tags: whitespace-separated tag names, e.g. ``"b span"``.
    :return: ``html`` without the start and end tags of the listed elements;
        the text between those tags is kept.
    """
    import warnings

    warnings.warn(
        "django.utils.html.remove_tags() and the removetags template filter "
        "are deprecated. Consider using the bleach library instead.",
        RemovedInDjango110Warning, stacklevel=3
    )
    # Escape each name so it is matched literally inside the alternation.
    tags = [re.escape(tag) for tag in tags.split()]
    tags_re = '(%s)' % '|'.join(tags)
    # Start tags: "<tag>", "<tag/>" or "<tag ...attributes...>".
    starttag_re = re.compile(r'<%s(/?>|(\s+[^>]*>))' % tags_re, re.U)
    endtag_re = re.compile('</%s>' % tags_re)
    html = starttag_re.sub('', html)
    html = endtag_re.sub('', html)
    return html
项目:hostapd-mana    作者:adde88    | 项目源码 | 文件源码
def __init__(self,
             width=70,
             initial_indent="",
             subsequent_indent="",
             expand_tabs=True,
             replace_whitespace=True,
             fix_sentence_endings=False,
             break_long_words=True,
             drop_whitespace=True,
             break_on_hyphens=True):
        """Store the wrapping options and build the Unicode word splitters.

        The parameter list mirrors the standard library's
        ``textwrap.TextWrapper`` constructor; every argument is stored
        unchanged on the instance under the same name.

        :param width: maximum length of a wrapped line.
        :param initial_indent: prefix for the first output line.
        :param subsequent_indent: prefix for every line after the first.
        :param expand_tabs: expand tabs before wrapping.
        :param replace_whitespace: replace each whitespace character with a space.
        :param fix_sentence_endings: force two spaces after sentence endings.
        :param break_long_words: allow splitting words longer than ``width``.
        :param drop_whitespace: drop whitespace at line starts and ends.
        :param break_on_hyphens: allow breaking compound words at hyphens.
        """
        self.width = width
        self.initial_indent = initial_indent
        self.subsequent_indent = subsequent_indent
        self.expand_tabs = expand_tabs
        self.replace_whitespace = replace_whitespace
        self.fix_sentence_endings = fix_sentence_endings
        self.break_long_words = break_long_words
        self.drop_whitespace = drop_whitespace
        self.break_on_hyphens = break_on_hyphens

        # recompile the regexes for Unicode mode -- done in this clumsy way for
        # backwards compatibility because it's rather common to monkey-patch
        # the TextWrapper class' wordsep_re attribute.
        self.wordsep_re_uni = re.compile(self.wordsep_re.pattern, re.U)
        self.wordsep_simple_re_uni = re.compile(
            self.wordsep_simple_re.pattern, re.U)

    # -- Private methods -----------------------------------------------
    # (possibly useful for subclasses to override)