The following 17 code examples, extracted from open-source Python projects, illustrate how to use bs4.UnicodeDammit().
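Before the examples, here is a minimal sketch of the pattern most of them share: construct UnicodeDammit from raw bytes (optionally with a list of candidate encodings), then read unicode_markup for the decoded text and original_encoding for the detected encoding. The byte string and candidate encodings below are purely illustrative.

from bs4 import UnicodeDammit

# Illustrative input: bytes whose encoding we pretend not to know (actually UTF-8).
raw = u"caf\u00e9 \u2013 na\u00efve".encode("utf-8")

dammit = UnicodeDammit(raw, ["utf-8", "gbk"])  # the list of candidate encodings is optional
print(dammit.unicode_markup)      # decoded unicode text
print(dammit.original_encoding)   # detected encoding, e.g. 'utf-8'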
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        raise UnicodeDecodeError('Failed to detect encoding, tried [%s]')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
def __init__(self, data, encoding=None):
    """
    Initialize serializer class
    :param data: original data
    :param encoding: encoding type of your original data
    """
    self.data = data
    if not self.data:
        raise ValueError("You must input origin data to this class")
    # if no encoding is supplied, let UnicodeDammit (chardet) detect it
    self.encoding = encoding if encoding else UnicodeDammit(self.data).original_encoding
    self.encoding = None if self.encoding == "utf-8" else self.encoding
    # initialize beautiful soup
    # only_content_div = SoupStrainer("body")
    self.obj = BeautifulSoup(data, features="lxml", from_encoding=self.encoding)
def simplify_quotes(text):
    """
    Even though UnicodeDammit smart_quotes_to="ascii" takes care of many cases,
    some crap can still be left... In addition to the smart-quotes, on *output*
    we also want to catch the case of `` -> " and '' -> " (NLTK has some
    tokenizers that convert like that).

    So, this can be used in the input cleaners chain, AFTER UnicodeDammit;
    it can also be used from OutputProofreader.

    >>> text = b'Have some ``weird" “quotes” and curlies,” won’t you please. Quotes are ‘fun’'.decode('utf8')
    >>> print simplify_quotes(text)
    Have some "weird" "quotes" and curlies," won't you please. Quotes are 'fun'
    >>> print simplify_quotes(unichr(8220) + u"foo" + unichr(8221) + unichr(8216) + u"bar" + unichr(8217))
    "foo"'bar'
    >>> text = b'``weird" “quotes” aren’t very ‘fun’ I don’t think'.decode('utf8')
    >>> print simplify_quotes(text)
    "weird" "quotes" aren't very 'fun' I don't think
    """
    return (text
            .replace(u"``", u'"')
            .replace(u"''", u'"')
            .replace(u'“', u'"')
            .replace(u'”', u'"')
            .replace(u'’', u"'")
            .replace(u'‘', u"'"))
def parse(self, response):
    """
    default parse method, rule is not useful now
    """
    # import pdb; pdb.set_trace()
    response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    log.msg("Parse: index level:" + str(index_level))
    if index_level in [1, 2, 3, 4]:
        self.save_to_file_system(index_level, response)
        relative_urls = self.get_follow_links(index_level, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                log.msg('yield process, url:' + url)
                yield Request(url, callback=self.parse)
    elif index_level == 5:
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['_id'] = linkedin_id
            personProfile['url'] = UnicodeDammit(response.url).markup
            yield personProfile
def decode_html(html_string):
    """Decode an HTML byte string to unicode with BS4's UnicodeDammit.
    Encoding detection is not 100% reliable, so a fallback may be needed.
    """
    dammit = UnicodeDammit(html_string, ['GB2312', 'GBK', 'GB18030'], smart_quotes_to="html", is_html=True)
    doc = dammit.unicode_markup
    # print("dammit —— ", dammit.original_encoding)
    # FIXME: detection sometimes wrongly reports 'ISO-8859-2'; re-check the encoding and fall back to utf-8
    if dammit.original_encoding == 'ISO-8859-2':
        enc = get_encoding(html_string)
        print(enc)
        enc = "utf-8"
        doc = html_string.decode(enc)
    elif not dammit.unicode_markup:
        raise UnicodeDecodeError("Failed to detect encoding, tried [%s]", ', '.join(dammit.tried_encodings))
    # print(doc.encode('utf-8'))
    return doc
def clean_unicode(comment_str):
    comment_str = comment_str.replace('\n', '').replace('\r', '').strip()
    comment_str = ' '.join(comment_str.split())
    return UnicodeDammit(comment_str).unicode_markup
def _get_encoding(cls, input_string, encoding):
    converted = UnicodeDammit(input_string, [encoding] if encoding else [])
    # Not worth raising exception? lxml will raise if parse fails.
    # if not converted.unicode_markup:
    #     raise UnicodeDecodeError('Failed to detect encoding')
    return converted.original_encoding
def get_encoding(input_string, guesses=None, is_html=False):
    """Return the encoding of a byte string. Uses bs4 UnicodeDammit.

    :param string input_string: Encoded byte string.
    :param list[string] guesses: (Optional) List of encoding guesses to prioritize.
    :param bool is_html: Whether the input is HTML.
    """
    converted = UnicodeDammit(input_string, override_encodings=[guesses] if guesses else [], is_html=is_html)
    return converted.original_encoding
def parse_html(html_file):
    """
    Read the HTML file using lxml's HTML parser, but convert to Unicode using
    Beautiful Soup's UnicodeDammit class.

    Can raise LxmlError or TypeError if the file can't be opened or parsed.
    """
    unicode_html = UnicodeDammit(html_file, smart_quotes_to="html", is_html=True)
    if unicode_html.unicode_markup is None:
        raise ValueError("no HTML provided")
    if not unicode_html.unicode_markup:
        raise ValueError("could not detect character encoding")
    return lxml.html.fromstring(unicode_html.unicode_markup)
def unicode_dammit(s,
                   override_encodings=('utf-8', 'windows-1252', 'iso-8859-1', 'latin-1'),
                   smart_quotes_to="ascii"):
    """
    using bs4.UnicodeDammit, "coerce" text to unicode.
    replaces (some) 'smart quotes'.
    fixes (some) mixed encodings

    What's it do under the hood? The docs explain some, the source explains even more of course.
    https://www.crummy.com/software/BeautifulSoup/bs4/doc/#unicode-dammit

    >>> with_smart_quotes = b"I just \x93love\x94 your word processor\x92s smart quotes"
    >>> assert unicode_dammit(with_smart_quotes) == 'I just "love" your word processor\\'s smart quotes'

    :param override_encodings: why these defaults - in short, they are commonly seen in input
        texts I've played with, whether they are mixed or not. someday-maybe this can be
        configured with better control if needed.
    """
    cleaned = UnicodeDammit(s, smart_quotes_to=smart_quotes_to, override_encodings=override_encodings).unicode_markup
    return cleaned
def decode_html(html_string):
    converted = UnicodeDammit(html_string)
    if not converted.unicode_markup:
        raise UnicodeDecodeError(
            "Failed to detect encoding, tried [%s]",
            ', '.join(converted.tried_encodings))
    # print converted.original_encoding
    return converted.unicode_markup
def to_unicode_or_bust(self, obj, encoding='utf-8'):
    try:
        if isinstance(obj, basestring):
            if not isinstance(obj, unicode):
                obj = unicode(obj, encoding)
        return obj
    except:
        return bs4.UnicodeDammit(obj, is_html=False).unicode_markup
def pycurl_get_resp(data_buf, headers, payload, resp):
    charset = None
    if 'content-type' in headers:
        content_type = headers['content-type'].lower()
        match = re.search('charset=(\S+)', content_type)
        if match:
            charset = match.group(1)
            print('Decoding using %s' % charset)
    body = data_buf.getvalue()
    if len(body) == 0:
        data = ''
        charset = 'utf-8'
    else:
        if charset is None:
            dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
            data = dammit.unicode_markup
            charset = dammit.original_encoding
        else:
            data = body.decode(charset, 'ignore')
    # headers.remove({})
    headers['content'] = [h for h in headers['content'] if len(h) > 0]
    soup_lxml = BeautifulSoup(data, 'lxml')
    soup_html = BeautifulSoup(data, 'html.parser')
    resp.update({
        'url': payload.get('url'),
        # 'soup': soup,
        'title': get_title(soup_lxml),
        'links': get_links(soup_lxml),
        'links2': get_links2(soup_lxml),
        'metas': get_metas(soup_lxml),
        'images': get_images(soup_lxml),
        'scripts': get_scripts(soup_lxml),
        'text': get_text(soup_html),
        'data': data,
        'headers': headers,
        'charset': charset,
        'spider': 'pycurl',
        'payload': payload,
    })
def beautify(self, data, charset):
    dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
    data = dammit.unicode_markup
    return data
def normalize_email_name(name):
    name = UnicodeDammit(name).unicode_markup
    # sanitize, keep only words, spaces and minimal punctuation
    # includes unicode apostrophes, though.
    name = re.sub(
        r"[^-\w\s'\u2019\u2032\u00b4\.\(\)]", '', name, 0, re.UNICODE)
    return name
async def get_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_get(c, payload, data_buf, headers, share)  # header_buf)
        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        charset = None
        if 'content-type' in headers:
            content_type = headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        body = data_buf.getvalue()
        if len(body) == 0:
            data = ''
            charset = 'utf-8'
        else:
            if charset is None:
                dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
                data = dammit.unicode_markup
                charset = dammit.original_encoding
            else:
                data = body.decode(charset, 'ignore')
        # headers.remove({})
        headers['content'] = [h for h in headers['content'] if len(h) > 0]
        soup_lxml = BeautifulSoup(data, 'lxml')
        soup_html = BeautifulSoup(data, 'html.parser')
        resp.update({
            'url': payload.get('url'),
            # 'soup': soup,
            'title': get_title(soup_lxml),
            'links': get_links(soup_lxml),
            'links2': get_links2(soup_lxml),
            'metas': get_metas(soup_lxml),
            'images': get_images(soup_lxml),
            'scripts': get_scripts(soup_lxml),
            'text': get_text(soup_html),
            'data': data,
            'headers': headers,
            'charset': charset,
            'spider': 'pycurl',
            'payload': payload,
        })
        post_func = payload.get('post_func')
        if post_func:
            post_func = load(post_func)
            resp = post_func(payload, resp)
        return resp
    finally:
        c.close()
async def post_request(payload, share=None):
    c = pycurl.Curl()
    data_buf = BytesIO()
    # header_buf = BytesIO()
    headers = {'count': 0, 'content': [{}]}
    try:
        setup_curl_for_post(c, payload, data_buf, headers, share)  # header_buf)
        with aiohttp.Timeout(payload.get('aiohttp_timeout', 60)):
            resp = await CurlLoop.handler_ready(c)
        # encoding = None
        # if 'content-type' in headers:
        #     content_type = headers['content-type'].lower()
        #     match = re.search('charset=(\S+)', content_type)
        #     if match:
        #         encoding = match.group(1)
        #         print('Decoding using %s' % encoding)
        body = data_buf.getvalue()
        encoding = 'utf-8'
        data = body.decode(encoding, 'ignore') if len(body) > 0 else ''
        # if encoding is None:
        #     dammit = UnicodeDammit(body, ["utf-8", "gb2312", "gbk", "big5", "gb18030"], smart_quotes_to="html")
        #     data = dammit.unicode_markup
        #     encoding = dammit.original_encoding
        # else:
        #     data = body.decode(encoding, 'ignore')
        # headers.remove({})
        headers['content'] = [h for h in headers['content'] if len(h) > 0]
        resp.update({
            # 'url': payload.get('url'),
            'data': data,
            'headers': headers,
            'encoding': encoding,
        })
        post_func = payload.get('post_func')
        if type(post_func) == str:
            post_func = load(post_func)
        if post_func:
            resp = post_func(payload, resp)
        # post_func = payload.get('post_func')
        # if post_func:
        #     post_func = load(post_func)
        #     resp = post_func(payload, resp)
        return resp
    finally:
        c.close()