我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用cchardet.detect()。
def _get_html(cls, html, url, html_etree, params, **kwargs):
    """Return a parsed HTML tree built from markup, a URL, or an existing tree.

    The sources are tried in this order: ``html``, then ``url``, then
    ``html_etree``; the first usable one wins.

    :param html: HTML markup; parsed with ``etree.HTML`` when truthy.
    :param url: address to download when ``html`` is falsy.
    :param html_etree: already-parsed tree, returned unchanged when neither
        ``html`` nor ``url`` is given.
    :param params: passed to ``requests.get`` as its second positional
        argument.
    :param kwargs: extra keyword arguments forwarded to ``requests.get``.
        When no truthy ``headers`` entry is present, a ``User-Agent`` header
        taken from ``get_random_user_agent()`` is added.
    :return: the result of ``etree.HTML`` or ``html_etree`` itself.
    :raises ValueError: if none of the three sources is provided.
    """
    if html:
        return etree.HTML(html)

    if url:
        if not kwargs.get('headers'):
            kwargs['headers'] = {"User-Agent": get_random_user_agent()}
        response = requests.get(url, params, **kwargs)
        response.raise_for_status()
        content = response.content
        # The detector may report ``None`` as the encoding when it cannot
        # make a guess; ``decode(None)`` would raise TypeError, so fall back
        # to UTF-8 in that case.
        encoding = cchardet.detect(content)['encoding'] or 'utf-8'
        return etree.HTML(content.decode(encoding))

    if html_etree is not None:
        return html_etree

    raise ValueError("html(url or html_etree) is expected")
def encoding_detect(byte_content):
    """
    Try to determine the character encoding of a byte string.

    The checks run in this order:

    1. ``force_decode_remote_using_encode`` is returned when it is not None.
    2. Each entry of ``possible_charsets`` is tried in turn; the first one
       that decodes ``byte_content`` without error is returned.
    3. If ``cchardet_available`` is truthy, the ``'encoding'`` entry of the
       ``c_chardet`` result is returned (which may itself be None).

    :param byte_content: the content whose encoding should be detected
    :type byte_content: bytes
    :return: the encoding name, or None if it could not be determined
    :rtype: Union[str, None]
    """
    if force_decode_remote_using_encode is not None:
        return force_decode_remote_using_encode

    if possible_charsets:
        for charset in possible_charsets:
            try:
                byte_content.decode(encoding=charset)
            except (UnicodeError, LookupError):
                # UnicodeError: the bytes are not valid in this charset.
                # LookupError: the charset name is unknown to Python.
                # Either way, move on to the next candidate. A bare
                # ``except`` here would also hide KeyboardInterrupt and
                # genuine programming errors.
                continue
            else:
                return charset

    if cchardet_available:
        # detect the encoding using cchardet (if we have it)
        return c_chardet(byte_content)['encoding']

    return None
def clone_url(url):
    """Download a page and return its HTML with a ``<base>`` tag added.

    :param url: url to clone; ``http://`` is prepended when the value
        contains no ``://``.
    :return: the decoded HTML. When the page contains no ``<base`` text,
        every literal ``<head>`` is followed by a ``<base href=...>`` tag
        pointing at ``url`` so that relative links resolve against it.
    """
    # get html
    if '://' not in url:
        url = 'http://' + url
    r = requests.get(url)

    # We don't trust requests encoding so we use cchardet
    # to detect real encoding
    # Without it we got decode error (for example: baidu.com)
    # The detector can report ``None`` when it has no guess; decoding with
    # ``None`` raises TypeError, so fall back to UTF-8 in that case.
    r.encoding = cchardet.detect(r.content)['encoding'] or 'utf-8'
    html = r.content.decode(r.encoding)

    # set relative url rule
    if '<base' not in html:
        html = html.replace('<head>', '<head><base href="%s" />' % url)

    return html
def extract(filename, key_part=['# ??']):
    """Build a ``Document`` from the lines that follow each key line of a file.

    For every entry of ``key_part`` the line directly after the matching line
    is decoded as UTF-8, split on whitespace into words, and appended
    (followed by a space) to the collected text.

    :param filename: passed to ``get_text`` to obtain the list of lines, and
        forwarded to ``Document``.
    :param key_part: marker lines to look up; each must be present in the
        file, otherwise ``list.index`` raises ValueError.
    :return: ``Document(word_counts, filename, collected_text)`` where
        ``word_counts`` maps each word to its number of occurrences.
    """
    lines = get_text(filename)
    tokens = []
    texts = ''
    for marker in key_part:
        # The content of a section is the line right after its marker line.
        content = lines[lines.index(marker) + 1].decode('utf-8')
        tokens.extend(content.split())
        texts = texts + content + ' '

    # Count how often each word occurs.
    counts = {}
    for token in tokens:
        if token in counts:
            counts[token] += 1
        else:
            counts[token] = 1

    return Document(counts, filename, texts)
def extract(lines,filename,key_part=['# ??']): # ??????????? ??????? # print filename words=[] for key in key_part: try: # print key # print cchardet.detect(key) index=lines.index(key) print index words+=lines[index+1].decode('utf-8').split() # ?????? except Exception,e: print e words_dict={} for w in words: words_dict[w]=words_dict.get(w,0)+1 # dict # return words return Document(words_dict,filename)
def requests_target_fetch(url):
    """
    Download ``url`` and return its body decoded to text.

    :param url: address to fetch
    :return: the decoded response body, or None when any step raises
        (the exception is logged through ``LOGGER``)
    """
    try:
        headers = {'user-agent': get_random_user_agent()}
        # NOTE(review): verify=False disables TLS certificate verification;
        # confirm this is intentional.
        response = requests.get(url=url, headers=headers, verify=False)
        response.raise_for_status()
        content = response.content
        charset = cchardet.detect(content)
        # The detector can report ``None`` when it has no guess. Decoding
        # with ``None`` raises TypeError, which would discard a page that was
        # fetched successfully, so fall back to UTF-8 instead.
        text = content.decode(charset['encoding'] or 'utf-8')
        return text
    except Exception as e:
        LOGGER.exception(e)
        return None
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``cchardet.detect(s)``."""
    detection = cchardet.detect(s)
    return detection['encoding']
def chardet_dammit(s):
    """Return the ``'encoding'`` entry of ``chardet.detect(s)``."""
    guess = chardet.detect(s)
    return guess['encoding']
    # Disabled debugging lines kept from the original:
    # import chardet.constants
    # chardet.constants._debug = 1
def write2file(item_parts):
    """Write every group of items to its own ``<label>.txt`` file.

    :param item_parts: sequence of ``(items, label)`` pairs. Each item must
        provide a ``filename`` attribute (a byte string decodable as GB18030)
        and a ``text`` attribute. ``label`` becomes the output file name
        (with ``.txt`` appended), relative to the current directory.
    """
    for part in item_parts:
        label = part[1]  # output file name without extension
        # ``with`` guarantees the file is closed even when formatting or
        # writing a row raises; the plain open()/close() pair leaked the
        # handle in that case.
        with open(u'%s.txt' % (label), 'w') as f:
            rows = []
            for item in part[0]:
                rows.append('%s\t%s\n' % (item.filename.decode('GB18030'), item.text))
            # Each row already ends with a newline, so joining with '\n'
            # leaves a blank line between consecutive rows.
            f.write('\n'.join(rows))
def cluster_process(filenames,key_part,s,n_clusters=2): documents=[] texts=[] for fname in filenames: # key_part: # ?? ???# ?? ???# ???# ???# ???# ??? # # ?? ???# ?? ???# ???# ???# ?? # = = = ?? = = = = = = ?? = = = = = = ?? = = = # key_part=['# ??'] # ??key_part?? d=extract(fname,key_part=key_part) documents.append(d) # documents,words=tfidf(documents) # print len(documents),len(words) # docs=create_format_mat(documents,words) docs=get_tfidf(documents) # ??gensim??tfidf # ?? # labels [0,1,0,1,1,...] labels,score=clustering(docs,n_clusters) print 'key_part:','_'.join(key_part).decode('utf-8') item_parts=[] filename_parts=[] for i in range(n_clusters): # item=[filenames[j] for j in range(len(labels)) if labels[j]==i] item=[documents[j] for j in range(len(labels)) if labels[j]==i] # print cchardet.detect(s) # ????? filename_parts.append(([filenames[j] for j in range(len(labels)) if labels[j]==i],u'%s_%s_%d' %(s,'_'.join(key_part),i))) item_parts.append((item,u'%s_%s_%d' %(s,'_'.join(key_part),i))) print 'class_%d:%d' %(i,len(item)) # ????? print 'score:',score print '-'*20 write2file(item_parts) # ???? return filename_parts
def get_text(f_path, filename):
    """Read a file and return its lines with surrounding whitespace removed.

    :param f_path: directory containing the file.
    :param filename: name of the file inside ``f_path``.
    :return: list with one entry per line of the file; lines that are empty
        after stripping are replaced by the string ``'None'``.
    """
    full_path = f_path + os.sep + filename
    with open(full_path, 'r') as handle:
        # An empty stripped line is falsy, so ``or`` substitutes the
        # 'None' placeholder for it.
        return [raw.strip() or 'None' for raw in handle]