The following 50 code examples, extracted from open-source Python projects, illustrate how to use codecs.BOM_UTF8.
def _read_content(self, file): first_bytes = min(32, os.path.getsize(file)) with open(file, 'rb') as f: raw = f.read(first_bytes) if raw.startswith(codecs.BOM_UTF8): encoding = 'utf-8-sig' else: encoding = 'utf-8' with open(file, encoding=encoding) as f: lines = [line.rstrip('\n') for line in f.readlines()] if not lines: raise MalformedFileError('The file is empty.') return lines
def getConfigDict(configPath):
    """Load a TOML configuration file and return it as a dict.

    A leading UTF-8 BOM is stripped before parsing.  On any I/O or
    parse error, a diagnostic is printed to stderr and the process
    exits with status 1.
    """
    try:
        # Use a context manager so the handle is closed even on error
        # (the original leaked the open file object).
        with open(configPath, "rb") as configFile:
            configRaw = configFile.read()
    except IOError:
        print("ERROR: I/O fatal error.", file = sys.stderr)
        sys.exit(1)
    if configRaw.startswith(codecs.BOM_UTF8):
        configRaw = configRaw[len(codecs.BOM_UTF8):]
    configDict = None
    try:
        # toml.loads() requires str, but the file was read in binary
        # mode; the missing decode made every parse raise TypeError.
        configDict = toml.loads(configRaw.decode("utf-8"))
    except toml.TomlDecodeError as tomlExp:
        for string in tomlExp.args:
            print("ERROR: Invalid TOML syntax. " + string, file = sys.stderr)
        sys.exit(1)
    except TypeError as typeExp:
        for string in typeExp.args:
            print("ERROR: Invalid config file. " + string, file = sys.stderr)
        sys.exit(1)
    except Exception:
        # Deliberately broad catch-all (e.g. UnicodeDecodeError), but no
        # longer a bare except that would swallow KeyboardInterrupt.
        print("ERROR: Invalid config file. Please make sure it is UTF-8 encoded and complies TOML specification.", file = sys.stderr)
        print("Please review TOML specification at: https://github.com/toml-lang/toml", file = sys.stderr)
        sys.exit(1)
    return configDict
def test_position(self):
    # Exercises the (line, column) bookkeeping of position() across
    # charsUntil(), unget() and char() calls on a BOM-prefixed stream.
    # NOTE(review): expected tuples appear to assume the BOM is consumed
    # before position tracking starts — confirm against HTMLInputStream.
    stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\nccc\nddde\nf\ngh")
    self.assertEquals(stream.position(), (1, 0))
    self.assertEquals(stream.charsUntil('c'), u"a\nbb\n")
    self.assertEquals(stream.position(), (3, 0))
    stream.unget(u"\n")
    self.assertEquals(stream.position(), (2, 2))
    self.assertEquals(stream.charsUntil('c'), u"\n")
    self.assertEquals(stream.position(), (3, 0))
    stream.unget(u"\n")
    self.assertEquals(stream.position(), (2, 2))
    self.assertEquals(stream.char(), u"\n")
    self.assertEquals(stream.position(), (3, 0))
    self.assertEquals(stream.charsUntil('e'), u"ccc\nddd")
    self.assertEquals(stream.position(), (4, 3))
    self.assertEquals(stream.charsUntil('h'), u"e\nf\ng")
    self.assertEquals(stream.position(), (6, 1))
def parse(self, data):
    """Parse *data* and return the resulting parse tree.

    Byte input is decoded first: a UTF-8 BOM selects the 'utf-8-sig'
    codec, anything else falls back to latin1.  Blank input yields an
    empty ParseResults.
    """
    if not isinstance(data, six.binary_type):
        content = data.strip()
    else:
        codec = 'utf-8-sig' if data[:3] == codecs.BOM_UTF8 else 'latin1'
        content = data.decode(codec).strip()
    if content:
        return self.script.parseString(content, parseAll=True)
    return ParseResults()
def readFile(filename, mode=u'rU', continueOnError=False, displayError=True, encoding=None):
    """Read and return the contents of *filename* ('-' means stdin).

    When *encoding* is given, the file is read via codecs.open and a
    leading BOM is stripped from the decoded text.  I/O errors either
    warn-and-return-None (continueOnError) or exit; decode errors exit
    via usageErrorExit.
    """
    try:
        if filename != u'-':
            if not encoding:
                with open(os.path.expanduser(filename), mode) as f:
                    return f.read()
            with codecs.open(os.path.expanduser(filename), mode, encoding) as f:
                content = f.read()
            # After decoding, a UTF-8 BOM is the single character
            # u'\ufeff', not the raw ef:bb:bf bytes; the original
            # compared the unicode text against codecs.BOM_UTF8 (bytes)
            # and stripped 3 characters, which never worked correctly.
            if content.startswith(u'\ufeff'):
                return content[1:]
            return content
        return text_type(sys.stdin.read())
    except IOError as e:
        if continueOnError:
            if displayError:
                stderrWarningMsg(e)
            setSysExitRC(FILE_ERROR_RC)
            return None
        systemErrorExit(FILE_ERROR_RC, e)
    except (LookupError, UnicodeDecodeError, UnicodeError) as e:
        Cmd.Backup()
        usageErrorExit(e)
def __init__(self, f, fieldnames=None, encoding=UTF8, **kwds):
    # DictReader-style CSV initializer (Python 2 era: reader.next()).
    # Non-UTF-8 input is recoded to UTF-8 through UTF8Recoder; a UTF-8
    # BOM glued to the first header field is stripped.
    self.encoding = encoding
    try:
        self.reader = csv.reader(UTF8Recoder(f, encoding) if self.encoding != UTF8 else f, dialect=csv.excel, **kwds)
        if not fieldnames:
            # First row of the file supplies the column names.
            self.fieldnames = self.reader.next()
            if len(self.fieldnames) > 0 and self.fieldnames[0].startswith(codecs.BOM_UTF8):
                # Remove only the leading BOM occurrence from field 0.
                self.fieldnames[0] = self.fieldnames[0].replace(codecs.BOM_UTF8, u'', 1)
        else:
            self.fieldnames = fieldnames
    except (csv.Error, StopIteration):
        # Empty or unreadable CSV: behave as if there were no columns.
        self.fieldnames = []
    except LookupError as e:
        # Unknown encoding name passed in.
        Cmd.Backup()
        usageErrorExit(e)
    self.numfields = len(self.fieldnames)
def line_locations(self):
    """Return cumulative character offsets of each line end in self.filename.

    Offsets are computed the way Django appears to see the file: a UTF-8
    BOM on the first line is ignored and '\r\n' counts as one character.
    The list is cached in self._line_locations; it stays None when the
    file cannot be opened, so a later call can retry.
    """
    if self._line_locations is None:
        try:
            contents = open(self.filename, 'rb')
        except OSError:
            # File missing, locked, permission denied, ...
            # (was a bare except: narrowed to OSError, which covers IOError)
            pass
        else:
            with contents:
                line_info = []
                file_len = 0
                for line in contents:
                    line_len = len(line)
                    if not line_info and line.startswith(BOM_UTF8):
                        line_len -= len(BOM_UTF8)  # Strip the BOM, Django seems to ignore this...
                    if line.endswith(to_bytes('\r\n')):
                        line_len -= 1  # Django normalizes newlines to \n
                    file_len += line_len
                    line_info.append(file_len)
            # NOTE: the with-block already closed the file; the original's
            # extra contents.close() was redundant and has been removed.
            self._line_locations = line_info
    return self._line_locations
def encodingFromContents( contents ):
    """Guess the text encoding of raw *contents* bytes.

    A UTF-8/32/16 BOM wins; otherwise fall back to the locale's
    preferred encoding ('mac-roman' is coerced to 'utf-8', and a missing
    locale yields 'iso8859-1').
    """
    if( len(contents) > len(codecs.BOM_UTF8)
    and contents[0:len(codecs.BOM_UTF8)] == codecs.BOM_UTF8 ):
        encoding = 'utf-8'
    # UTF-32 must be tested before UTF-16: BOM_UTF32_LE begins with the
    # two bytes of BOM_UTF16_LE, so the original 16-before-32 order
    # misreported UTF-32-LE data as 'utf-16'.
    elif( len(contents) > len(codecs.BOM_UTF32_LE)
    and contents[0:len(codecs.BOM_UTF32_LE)] in [codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE]):
        encoding = 'utf-32'
    elif( len(contents) > len(codecs.BOM_UTF16_LE)
    and contents[0:len(codecs.BOM_UTF16_LE)] in [codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE]):
        encoding = 'utf-16'
    else:
        encoding = locale.getdefaultlocale()[1]
        # Mac says mac-roman when utf-8 is what is required
        if encoding == 'mac-roman':
            encoding = 'utf-8'
        if encoding is None:
            encoding = 'iso8859-1'
    return encoding
def detectBOM(self):
    """Check the start of self.rawStream for a byte-order mark.

    On success, leave the stream positioned just past the BOM and return
    the looked-up encoding; otherwise rewind to the start and return None.
    """
    markers = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16le',
        codecs.BOM_UTF16_BE: 'utf-16be',
        codecs.BOM_UTF32_LE: 'utf-32le',
        codecs.BOM_UTF32_BE: 'utf-32be'
    }
    head = self.rawStream.read(4)
    assert isinstance(head, bytes)
    # Probe widths in order: UTF-8 (3 bytes) first, then UTF-32 (4)
    # before UTF-16 (2), because the UTF-32-LE BOM starts with the
    # UTF-16-LE BOM bytes.
    encoding = markers.get(head[:3])
    width = 3
    if not encoding:
        encoding = markers.get(head)
        width = 4
    if not encoding:
        encoding = markers.get(head[:2])
        width = 2
    if not encoding:
        self.rawStream.seek(0)
        return None
    self.rawStream.seek(width)
    return lookupEncoding(encoding)
def guess_json_utf(data):
    """Guess the encoding of a JSON byte string.

    :rtype: str

    JSON text starts with two ASCII characters, so the null-byte pattern
    in the first four bytes identifies the UTF family; an explicit BOM
    is honoured first.  Returns None when nothing matches.
    """
    sample = data[:4]
    # Bug fix: the original compared against codecs.BOM32_BE, which is a
    # legacy alias for the UTF-16-BE BOM, so a UTF-32-BE BOM was never
    # recognised here.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'  # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'  # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:  # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
def encode(input, errors='strict'):
    """UTF-8-encode *input* with a leading BOM; return (bytes, consumed)."""
    body, _ = codecs.utf_8_encode(input, errors)
    return codecs.BOM_UTF8 + body, len(input)
def decode(input, errors='strict'):
    """Decode UTF-8 *input*, skipping one leading BOM if present.

    Returns (text, consumed); consumed includes the 3 BOM bytes.
    """
    skipped = 3 if input[:3] == codecs.BOM_UTF8 else 0
    if skipped:
        input = input[3:]
    output, consumed = codecs.utf_8_decode(input, errors, True)
    return (output, consumed + skipped)
def encode(self, input, final=False):
    """Incrementally UTF-8-encode, emitting a BOM with the first chunk."""
    chunk = codecs.utf_8_encode(input, self.errors)[0]
    if not self.first:
        return chunk
    # First call: clear the flag and prepend the BOM.
    self.first = 0
    return codecs.BOM_UTF8 + chunk
def decode(self, input, errors='strict'):
    """First-call decode that strips a leading UTF-8 BOM, then rebinds
    self.decode to plain utf_8_decode for all subsequent calls."""
    if len(input) < 3 and codecs.BOM_UTF8.startswith(input):
        # Could still be a partial BOM -- wait for more bytes.
        return ("", 0)
    if len(input) >= 3 and input[:3] == codecs.BOM_UTF8:
        self.decode = codecs.utf_8_decode
        output, consumed = codecs.utf_8_decode(input[3:], errors)
        return (output, consumed + 3)
    # (else) no BOM present: decode as-is and skip this check from now on.
    self.decode = codecs.utf_8_decode
    return codecs.utf_8_decode(input, errors)
def guess_json_utf(data):
    """
    :rtype: str
    """
    # JSON opens with two ASCII characters, so the placement of null
    # bytes in the first four bytes pins down the UTF flavour; explicit
    # BOMs are checked first (UTF-32 before UTF-16 and UTF-8).
    sample = data[:4]
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'  # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'  # BOM included
    nulls = sample.count(_null)
    if nulls == 0:
        return 'utf-8'
    if nulls == 2:
        # Two nulls: expect two UTF-16 ASCII-range code units.
        if sample[::2] == _null2:   # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
    if nulls == 3:
        # Three nulls: a single UTF-32 ASCII-range code unit.
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
    # Nothing recognisable.
    return None
def encode(self, input, final=False):
    """Incremental UTF-8 encoder step; the very first chunk carries the BOM."""
    prefix = b""
    if self.first:
        self.first = 0
        prefix = codecs.BOM_UTF8
    return prefix + codecs.utf_8_encode(input, self.errors)[0]
def _buffer_decode(self, input, errors, final):
    # Incremental UTF-8 decode that strips a BOM from the very start of
    # the stream.  self.first stays truthy until the BOM question has
    # been settled (enough bytes seen to decide).
    if self.first:
        if len(input) < 3:
            if codecs.BOM_UTF8.startswith(input):
                # not enough data to decide if this really is a BOM
                # => try again on the next call
                return (u"", 0)
            else:
                self.first = None
        else:
            self.first = None
            if input[:3] == codecs.BOM_UTF8:
                # Skip the BOM but count its 3 bytes as consumed.
                (output, consumed) = codecs.utf_8_decode(input[3:], errors, final)
                return (output, consumed+3)
    # No (further) BOM handling needed: plain incremental UTF-8 decode.
    return codecs.utf_8_decode(input, errors, final)
def decode(self, input, errors='strict'):
    """One-shot BOM handling: strip a leading UTF-8 BOM on the first call
    and replace self.decode with the plain UTF-8 decoder afterwards."""
    bom = codecs.BOM_UTF8
    if len(input) < 3:
        if bom.startswith(input):
            # Possibly an incomplete BOM -- ask the caller for more data.
            return (u"", 0)
    elif input[:3] == bom:
        self.decode = codecs.utf_8_decode
        output, consumed = codecs.utf_8_decode(input[3:], errors)
        return (output, consumed + 3)
    # (else) no BOM present: hand off to the plain decoder permanently.
    self.decode = codecs.utf_8_decode
    return codecs.utf_8_decode(input, errors)
def test_strip_bom(self):
    # json.load / json.loads must accept (and strip) a UTF-8 BOM on byte
    # input, and still round-trip the original unicode document.
    content = u"\u3053\u3093\u306b\u3061\u308f"
    json_doc = codecs.BOM_UTF8 + b(json.dumps(content))
    self.assertEqual(json.load(BytesIO(json_doc)), content)
    # Both the raw bytes and the decoded text form must parse.
    for doc in json_doc, json_doc.decode('utf8'):
        self.assertEqual(json.loads(doc), content)
def decode_raw_stream(self, text, decode_raw, known_encoding, filename):
    """given string/unicode or bytes/string, determine encoding
       from magic encoding comment, return body as unicode
       or raw if decode_raw=False
    """
    if isinstance(text, compat.text_type):
        # Already decoded text: just report the applicable encoding.
        m = self._coding_re.match(text)
        encoding = m and m.group(1) or known_encoding or 'ascii'
        return encoding, text
    if text.startswith(codecs.BOM_UTF8):
        # A UTF-8 BOM pins the encoding; a magic comment that disagrees
        # with it is a hard error.
        text = text[len(codecs.BOM_UTF8):]
        parsed_encoding = 'utf-8'
        m = self._coding_re.match(text.decode('utf-8', 'ignore'))
        if m is not None and m.group(1) != 'utf-8':
            raise exceptions.CompileException(
                "Found utf-8 BOM in file, with conflicting "
                "magic encoding comment of '%s'" % m.group(1),
                text.decode('utf-8', 'ignore'), 0, 0, filename)
    else:
        # No BOM: trust the magic comment, then known_encoding, then ascii.
        m = self._coding_re.match(text.decode('utf-8', 'ignore'))
        if m:
            parsed_encoding = m.group(1)
        else:
            parsed_encoding = known_encoding or 'ascii'
    if decode_raw:
        try:
            text = text.decode(parsed_encoding)
        except UnicodeDecodeError:
            raise exceptions.CompileException(
                "Unicode decode operation of encoding '%s' failed" % parsed_encoding,
                text.decode('utf-8', 'ignore'), 0, 0, filename)
    return parsed_encoding, text
def detectBOM(self):
    """Inspect the first bytes of self.rawStream for a byte-order mark.

    On a match, position the stream just past the BOM and return the
    encoding name; otherwise rewind to offset 0 and return None.
    """
    bom_to_name = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16-le',
        codecs.BOM_UTF16_BE: 'utf-16-be',
        codecs.BOM_UTF32_LE: 'utf-32-le',
        codecs.BOM_UTF32_BE: 'utf-32-be'
    }
    head = self.rawStream.read(4)
    assert isinstance(head, bytes)
    # Probe widths: UTF-8 (3 bytes) first, then UTF-32 (4) before
    # UTF-16 (2), since the UTF-32-LE BOM starts with the UTF-16-LE one.
    encoding = bom_to_name.get(head[:3])
    width = 3
    if not encoding:
        encoding = bom_to_name.get(head)
        width = 4
    if not encoding:
        encoding = bom_to_name.get(head[:2])
        width = 2
    if encoding:
        self.rawStream.seek(width)
    else:
        self.rawStream.seek(0)
    return encoding
def guess_json_utf(data):
    """Guess the encoding of a JSON byte string from its first bytes.

    JSON text starts with two ASCII characters, so counting and locating
    null bytes in the first four bytes identifies the UTF family; an
    explicit BOM is honoured first.  Returns None when nothing matches.
    """
    sample = data[:4]
    # Bug fix: codecs.BOM32_BE is a legacy alias for the UTF-16-BE BOM,
    # so the original never matched a UTF-32-BE BOM here.
    if sample in (codecs.BOM_UTF32_LE, codecs.BOM_UTF32_BE):
        return 'utf-32'  # BOM included
    if sample[:3] == codecs.BOM_UTF8:
        return 'utf-8-sig'  # BOM included, MS style (discouraged)
    if sample[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
        return 'utf-16'  # BOM included
    nullcount = sample.count(_null)
    if nullcount == 0:
        return 'utf-8'
    if nullcount == 2:
        if sample[::2] == _null2:  # 1st and 3rd are null
            return 'utf-16-be'
        if sample[1::2] == _null2:  # 2nd and 4th are null
            return 'utf-16-le'
        # Did not detect 2 valid UTF-16 ascii-range characters
    if nullcount == 3:
        if sample[:3] == _null3:
            return 'utf-32-be'
        if sample[1:] == _null3:
            return 'utf-32-le'
        # Did not detect a valid UTF-32 ascii-range character
    return None
def test_bom(self):
    # A leading UTF-8 BOM must be consumed and must select utf-8 as the
    # stream's character encoding.
    bom_stream = HTMLInputStream(codecs.BOM_UTF8 + "'")
    self.assertEquals(bom_stream.charEncoding[0], 'utf-8')
    self.assertEquals(bom_stream.char(), "'")
def test_newlines(self):
    # Both \r\n and bare \r must be normalized to \n, and position()'s
    # (line, column) bookkeeping must reflect the normalized text.
    stream = HTMLInputStreamShortChunk(codecs.BOM_UTF8 + "a\nbb\r\nccc\rddddxe")
    self.assertEquals(stream.position(), (1, 0))
    self.assertEquals(stream.charsUntil('c'), u"a\nbb\n")
    self.assertEquals(stream.position(), (3, 0))
    self.assertEquals(stream.charsUntil('x'), u"ccc\ndddd")
    self.assertEquals(stream.position(), (4, 4))
    self.assertEquals(stream.charsUntil('e'), u"x")
    self.assertEquals(stream.position(), (4, 5))
def detectBOM(self):
    """Attempt to identify a BOM at the start of self.rawStream.

    Returns the matching encoding name with the stream left just past
    the BOM, or None with the stream rewound to the beginning.
    """
    markers = {
        codecs.BOM_UTF8: 'utf-8',
        codecs.BOM_UTF16_LE: 'utf-16-le',
        codecs.BOM_UTF16_BE: 'utf-16-be',
        codecs.BOM_UTF32_LE: 'utf-32-le',
        codecs.BOM_UTF32_BE: 'utf-32-be',
    }
    leader = self.rawStream.read(4)
    # Try 3-byte UTF-8 first, then the full 4 bytes for UTF-32 (which
    # must precede UTF-16: its LE BOM starts with the UTF-16-LE BOM),
    # and finally 2 bytes for UTF-16.
    encoding, width = markers.get(leader[:3]), 3
    if not encoding:
        encoding, width = markers.get(leader), 4
    if not encoding:
        encoding, width = markers.get(leader[:2]), 2
    self.rawStream.seek(width if encoding else 0)
    return encoding