Python cchardet 模块，detect() 实例源码

我们从Python开源项目中，提取了以下50个代码示例，用于说明如何使用cchardet.detect()。

项目：talonspider 作者：howie6879 | 项目源码 | 文件源码

def _get_html(cls, html, url, html_etree, params, **kwargs):
    """Return a parsed HTML tree from raw markup, a URL, or an existing tree.

    The sources are tried in this order: ``html``, then ``url``, then
    ``html_etree``.

    :param html: raw HTML markup; when truthy it is parsed with ``etree.HTML``
    :param url: address to download when ``html`` is falsy
    :param html_etree: already-parsed tree, returned unchanged when neither
        ``html`` nor ``url`` is given
    :param params: passed as the second positional argument of ``requests.get``
    :param kwargs: forwarded to ``requests.get``; a ``headers`` entry with a
        random User-Agent is added when none (or a falsy one) is supplied
    :return: the result of ``etree.HTML`` or ``html_etree`` itself
    :raises ValueError: when ``html``, ``url`` and ``html_etree`` are all missing
    """
    if html:
        return etree.HTML(html)

    if url:
        if not kwargs.get('headers'):
            kwargs['headers'] = {
                "User-Agent": get_random_user_agent()
            }
        response = requests.get(url, params, **kwargs)
        response.raise_for_status()
        content = response.content
        # cchardet reports an encoding of None when it cannot identify the
        # payload; decode(None) would raise TypeError, so fall back to UTF-8.
        encoding = cchardet.detect(content)['encoding'] or 'utf-8'
        return etree.HTML(content.decode(encoding))

    if html_etree is not None:
        return html_etree

    raise ValueError("html(url or html_etree) is expected")

项目：zmirror 作者：aploium | 项目源码 | 文件源码

def encoding_detect(byte_content):
    """
    Try to determine the text encoding of ``byte_content``.

    The checks run in this order:

    1. ``force_decode_remote_using_encode`` is returned when it is not None.
    2. Each entry of ``possible_charsets`` is tried; the first one that decodes
       the content without error is returned.
    3. When ``cchardet_available`` is truthy, the ``'encoding'`` value reported
       by ``c_chardet`` is returned.

    :param byte_content: the content whose encoding should be detected
    :type byte_content: bytes
    :return: the encoding name, or None
    :rtype: Union[str, None]
    """

    if force_decode_remote_using_encode is not None:
        return force_decode_remote_using_encode
    if possible_charsets:
        for charset in possible_charsets:
            try:
                byte_content.decode(encoding=charset)
            except (UnicodeError, LookupError):
                # UnicodeError: the bytes are not valid in this charset.
                # LookupError: the charset name is not a known codec.
                # Either way this candidate is out; try the next one.
                continue
            else:
                return charset
    if cchardet_available:  # detect the encoding using cchardet (if we have)
        return c_chardet(byte_content)['encoding']

    return None

项目：mercure 作者：synhack | 项目源码 | 文件源码

def clone_url(url):
    """Download a page and return its HTML with a ``<base>`` tag added.

    :param url: url to clone; ``http://`` is prepended when it contains no
        ``://``
    :return: the decoded HTML; when the markup contains no ``<base``, every
        ``<head>`` is followed by a ``<base href="...">`` pointing at ``url``
    """
    # get html
    if '://' not in url:
        url = 'http://' + url

    r = requests.get(url)
    # We don't trust requests encoding so we use cchardet
    # to detect real encoding
    # Without it we got decode error (for example: baidu.com)
    # cchardet reports None when it cannot identify the content; decoding with
    # None would raise TypeError, so fall back to UTF-8 in that case.
    r.encoding = cchardet.detect(r.content)['encoding'] or 'utf-8'
    html = r.content.decode(r.encoding)

    # set relative url rule
    if '<base' not in html:
        html = html.replace('<head>', '<head><base href="%s" />' % url)

    return html

项目：ParseLawDocuments 作者：FanhuaandLuomu | 项目源码 | 文件源码

def extract(filename,key_part=['# ??']):
    """Build a Document from the line that follows each key line of a file.

    The file is read with ``get_text(filename)``. For every entry of
    ``key_part`` the line right after the first matching line is decoded as
    UTF-8; its whitespace-separated words are counted and its text is
    collected (each piece followed by one space).

    :param filename: passed to ``get_text`` and on to ``Document``
    :param key_part: key lines to look up in the file
    :return: ``Document(word_counts, filename, collected_text)``
    """
    lines=get_text(filename)
    tokens=[]
    pieces=[]
    for key in key_part:
        # the content of interest is the line right after the key line
        following=lines[lines.index(key)+1].decode('utf-8')
        tokens.extend(following.split())
        pieces.append(following+' ')
    counts={}
    for token in tokens:
        if token in counts:
            counts[token]+=1
        else:
            counts[token]=1
    return Document(counts,filename,''.join(pieces))

项目：ParseLawDocuments 作者：FanhuaandLuomu | 项目源码 | 文件源码

def extract(lines,filename,key_part=['# ??']):
    """Count the words on the line that follows each key line.

    For every entry of ``key_part`` the first matching element of ``lines``
    is located, its index is printed, and the next element is decoded as
    UTF-8 and split on whitespace. A key that fails (not found, no following
    line, undecodable text, ...) is reported with ``print`` and skipped.

    :param lines: sequence of lines to search
    :param filename: passed through to ``Document``
    :param key_part: key lines to look up in ``lines``
    :return: ``Document(word_counts, filename)``
    """
    words=[]
    for key in key_part:
        try:
            index=lines.index(key)
            # Parenthesised single-argument print and "except ... as" parse
            # on both Python 2.6+ and Python 3 and print the same output.
            print(index)
            words+=lines[index+1].decode('utf-8').split()
        except Exception as e:
            # best effort: report the problem and carry on with the next key
            print(e)
    words_dict={}
    for w in words:
        words_dict[w]=words_dict.get(w,0)+1   # word -> occurrence count
    return Document(words_dict,filename)

项目：owllook 作者：howie6879 | 项目源码 | 文件源码

def requests_target_fetch(url):
    """
    Download ``url`` and return its body decoded with the encoding that
    cchardet detects.

    The request carries a random user-agent header and is made with
    ``verify=False``.

    :param url: address passed to ``requests.get``
    :return: the decoded body, or None when any exception was raised (the
        exception is logged through ``LOGGER.exception``)
    """
    try:
        response = requests.get(
            url=url,
            headers={'user-agent': get_random_user_agent()},
            verify=False,
        )
        response.raise_for_status()
        raw = response.content
        detected = cchardet.detect(raw)
        return raw.decode(detected['encoding'])
    except Exception as e:
        LOGGER.exception(e)
    return None

项目：Projects 作者：it2school | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：Projects 作者：it2school | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：TACTIC-Handler 作者：listyque | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：TACTIC-Handler 作者：listyque | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：UPBGE-CommunityAddon 作者：elmeunick9 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：UPBGE-CommunityAddon 作者：elmeunick9 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：llk 作者：Tycx2ry | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：llk 作者：Tycx2ry | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：harbour-sailfinder 作者：DylanVanAssche | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：harbour-sailfinder 作者：DylanVanAssche | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：harbour-sailfinder 作者：DylanVanAssche | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：harbour-sailfinder 作者：DylanVanAssche | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：B.E.N.J.I. 作者：the-ethan-hunt | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：B.E.N.J.I. 作者：the-ethan-hunt | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：Taigabot 作者：FrozenPigs | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：Taigabot 作者：FrozenPigs | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：weeman 作者：evait-security | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：weeman 作者：evait-security | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：flickr_downloader 作者：Denisolt | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：flickr_downloader 作者：Denisolt | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：isar 作者：ilbers | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：isar 作者：ilbers | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：Crunchyroll-XML-Decoder 作者：jaw20 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：Crunchyroll-XML-Decoder 作者：jaw20 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：catchup4kodi 作者：catchup4kodi | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：catchup4kodi 作者：catchup4kodi | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：catchup4kodi 作者：catchup4kodi | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：catchup4kodi 作者：catchup4kodi | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：ShelbySearch 作者：Agentscreech | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：ShelbySearch 作者：Agentscreech | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：respeaker_virtualenv 作者：respeaker | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：respeaker_virtualenv 作者：respeaker | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：tellmeabout.coffee 作者：billyfung | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：tellmeabout.coffee 作者：billyfung | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：Price-Comparator 作者：Thejas-1 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：Price-Comparator 作者：Thejas-1 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：ParseLawDocuments 作者：FanhuaandLuomu | 项目源码 | 文件源码

def write2file(item_parts):
    """Write every group of documents to its own ``<name>.txt`` file.

    Each document becomes one ``filename<TAB>text`` row ending in a newline;
    the rows are additionally joined with a newline, so rows are separated by
    an empty line in the output.

    :param item_parts: iterable of ``(documents, name)`` pairs; each document
        needs a ``filename`` attribute (decoded as GB18030) and a ``text``
        attribute
    :return: None
    """
    for items in item_parts:
        documents=items[0]
        name=items[1]  # base name of the output file
        # "with" closes the handle even when formatting or writing raises
        with open(u'%s.txt' %(name),'w') as f:
            rows=['%s\t%s\n' %(item.filename.decode('GB18030'),item.text)
                  for item in documents]
            f.write('\n'.join(rows))

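# NOTE: the original comment was lost to mis-encoding; it presumably introduced the clustering step (2 clusters by default).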

项目：ParseLawDocuments 作者：FanhuaandLuomu | 项目源码 | 文件源码

def cluster_process(filenames,key_part,s,n_clusters=2):
    """Cluster the documents extracted from ``filenames`` and write each cluster to a file.

    Every file is turned into a document with ``extract``, the documents are
    converted with ``get_tfidf`` and grouped with ``clustering``. The key
    parts, the size of every cluster and the clustering score are printed,
    and the grouped documents are handed to ``write2file``.

    :param filenames: files to process; each one is passed to ``extract``
    :param key_part: sequence of key strings, forwarded to ``extract`` and
        joined with ``_`` to build the group names
    :param s: prefix of every group name
    :param n_clusters: number of clusters requested from ``clustering``
    :return: list of ``(filenames_in_cluster, group_name)`` tuples, one per
        cluster, where ``group_name`` is ``u'%s_%s_%d' % (s, joined_keys, i)``
    """
    documents=[]
    texts=[]  # NOTE(review): never used in this function
    for fname in filenames:
        # (garbled in the source) the original comments here listed example
        # key_part values, e.g. key_part=['# ??']
        # extract the requested key_part sections of this file
        d=extract(fname,key_part=key_part)
        documents.append(d)

    # earlier hand-written tf-idf pipeline, kept for reference:
    # documents,words=tfidf(documents)
    # print len(documents),len(words)
    # docs=create_format_mat(documents,words)

    docs=get_tfidf(documents)  # presumably tf-idf computed with gensim -- TODO confirm

    # clustering
    # labels [0,1,0,1,1,...]  (one entry per document, judging by the indexing below)
    labels,score=clustering(docs,n_clusters)

    print 'key_part:','_'.join(key_part).decode('utf-8')
    item_parts=[]
    filename_parts=[]
    for i in range(n_clusters):
        # item=[filenames[j] for j in range(len(labels)) if labels[j]==i]
        # documents whose label equals this cluster index
        item=[documents[j] for j in range(len(labels)) if labels[j]==i]
        # print cchardet.detect(s)
        # file names of the same cluster, paired with the group name
        filename_parts.append(([filenames[j] for j in range(len(labels)) if labels[j]==i],u'%s_%s_%d' %(s,'_'.join(key_part),i)))
        item_parts.append((item,u'%s_%s_%d' %(s,'_'.join(key_part),i)))
        print 'class_%d:%d' %(i,len(item))
    # score returned by clustering()
    print 'score:',score
    print '-'*20
    write2file(item_parts)  # write every cluster to its own file
    return filename_parts

项目：ParseLawDocuments 作者：FanhuaandLuomu | 项目源码 | 文件源码

def get_text(f_path,filename):
    """Read ``f_path + os.sep + filename`` and return its lines, stripped.

    Lines that are empty after stripping are replaced by the string
    ``'None'``.

    :param f_path: directory part of the path
    :param filename: file name inside ``f_path``
    :return: list with one stripped string per line of the file
    """
    full_path=f_path+os.sep+filename
    with open(full_path,'r') as handle:
        stripped_lines=(raw.strip() for raw in handle)
        # an empty string is falsy, so blank lines become the 'None' marker
        return [text or 'None' for text in stripped_lines]

项目：script.quasar.t411-rik91 作者：rik91 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：script.quasar.t411-rik91 作者：rik91 | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：-PunkScan 作者：swordli | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']

项目：-PunkScan 作者：swordli | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of chardet's detection result for ``s``."""
    detection = chardet.detect(s)
    return detection['encoding']
        #import chardet.constants
        #chardet.constants._debug = 1

项目：-PunkScan 作者：swordli | 项目源码 | 文件源码

def chardet_dammit(s):
    """Return the 'encoding' entry of cchardet's detection result for ``s``."""
    detection = cchardet.detect(s)
    return detection['encoding']