我们从Python开源项目中,提取了以下50个代码示例,用于说明如何使用bs4.dammit.original_encoding()。
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, str)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set to 'utf-8', a superset of ASCII. chardet = bs4.dammit.chardet_dammit logging.disable(logging.WARNING) try: def noop(str): return None # Disable chardet, which will realize that the ASCII is ASCII. bs4.dammit.chardet_dammit = noop ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding.lower(), "utf-8") finally: logging.disable(logging.NOTSET) bs4.dammit.chardet_dammit = chardet
def test_fromEncoding_renamed_to_from_encoding(self): with warnings.catch_warnings(record=True) as w: utf8 = b"\xc3\xa9" soup = self.soup(utf8, fromEncoding="utf8") msg = str(w[0].message) self.assertTrue("fromEncoding" in msg) self.assertTrue("from_encoding" in msg) self.assertEqual("utf8", soup.original_encoding)
def test_ascii_in_unicode_out(self): # ASCII input is converted to Unicode. The original_encoding # attribute is set. ascii = b"<foo>a</foo>" soup_from_ascii = self.soup(ascii) unicode_output = soup_from_ascii.decode() self.assertTrue(isinstance(unicode_output, unicode)) self.assertEqual(unicode_output, self.document_for(ascii.decode())) self.assertEqual(soup_from_ascii.original_encoding, "ascii")
def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, u'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, u'Sacr\xe9 bleu!')
def test_detect_utf8(self): utf8 = b"\xc3\xa9" dammit = UnicodeDammit(utf8) self.assertEqual(dammit.unicode_markup, u'\xe9') self.assertEqual(dammit.original_encoding, 'utf-8')
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding, 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding, 'utf-8')
def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding, 'utf-8')
def test_detect_html5_style_meta_tag(self): for data in ( b'<html><meta charset="euc-jp" /></html>', b"<html><meta charset='euc-jp' /></html>", b"<html><meta charset=euc-jp /></html>", b"<html><meta charset=euc-jp/></html>"): dammit = UnicodeDammit(data, is_html=True) self.assertEqual( "euc-jp", dammit.original_encoding)
def test_sniffed_xml_encoding(self): # A document written in UTF-16LE will be converted by a different # code path that sniffs the byte order markers. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
def test_exclude_encodings(self): utf8_data = "Räksmörgås".encode("utf-8") soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual("windows-1252", soup.original_encoding)
def test_unicode_in_unicode_out(self): # Unicode input is left alone. The original_encoding attribute # is not set. soup_from_unicode = self.soup(self.unicode_data) self.assertEqual(soup_from_unicode.decode(), self.unicode_data) self.assertEqual(soup_from_unicode.foo.string, 'Sacr\xe9 bleu!') self.assertEqual(soup_from_unicode.original_encoding, None)
def test_utf8_in_unicode_out(self): # UTF-8 input is converted to Unicode. The original_encoding # attribute is set. soup_from_utf8 = self.soup(self.utf8_data) self.assertEqual(soup_from_utf8.decode(), self.unicode_data) self.assertEqual(soup_from_utf8.foo.string, 'Sacr\xe9 bleu!')
def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.unicode_markup, '\u05dd\u05d5\u05dc\u05e9')
def test_dont_see_smart_quotes_where_there_are_none(self): utf_8 = b"\343\202\261\343\203\274\343\202\277\343\202\244 Watch" dammit = UnicodeDammit(utf_8) self.assertEqual(dammit.original_encoding.lower(), 'utf-8') self.assertEqual(dammit.unicode_markup.encode("utf-8"), utf_8)
def test_ignore_inappropriate_codecs(self): utf8_data = "Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self): utf8_data = "Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self): # This is UTF-8. utf8_data = "Räksmörgås".encode("utf-8") # But if we exclude UTF-8 from consideration, the guess is # Windows-1252. dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') # And if we exclude that, there is no valid guess at all. dammit = UnicodeDammit( utf8_data, exclude_encodings=["utf-8", "windows-1252"]) self.assertEqual(dammit.original_encoding, None)
def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual("<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)
def test_exclude_encodings(self): utf8_data = u"Räksmörgås".encode("utf-8") soup = self.soup(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual("windows-1252", soup.original_encoding)
def test_convert_hebrew(self): hebrew = b"\xed\xe5\xec\xf9" dammit = UnicodeDammit(hebrew, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'iso-8859-8') self.assertEqual(dammit.unicode_markup, u'\u05dd\u05d5\u05dc\u05e9')
def test_ignore_inappropriate_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") dammit = UnicodeDammit(utf8_data, ["iso-8859-8"]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_ignore_invalid_codecs(self): utf8_data = u"Räksmörgås".encode("utf-8") for bad_encoding in ['.utf8', '...', 'utF---16.!']: dammit = UnicodeDammit(utf8_data, [bad_encoding]) self.assertEqual(dammit.original_encoding.lower(), 'utf-8')
def test_exclude_encodings(self): # This is UTF-8. utf8_data = u"Räksmörgås".encode("utf-8") # But if we exclude UTF-8 from consideration, the guess is # Windows-1252. dammit = UnicodeDammit(utf8_data, exclude_encodings=["utf-8"]) self.assertEqual(dammit.original_encoding.lower(), 'windows-1252') # And if we exclude that, there is no valid guess at all. dammit = UnicodeDammit( utf8_data, exclude_encodings=["utf-8", "windows-1252"]) self.assertEqual(dammit.original_encoding, None)
def test_byte_order_mark_removed(self): # A document written in UTF-16LE will have its byte order marker stripped. data = b'\xff\xfe<\x00a\x00>\x00\xe1\x00\xe9\x00<\x00/\x00a\x00>\x00' dammit = UnicodeDammit(data) self.assertEqual(u"<a>áé</a>", dammit.unicode_markup) self.assertEqual("utf-16le", dammit.original_encoding)