我们从Python开源项目中,提取了以下11个代码示例,用于说明如何使用codecs.BOM_UTF16。
def remove_bom(filename):
    """Open *filename* in binary mode, positioned just past any leading BOM.

    The first four bytes are compared against the UTF-8, UTF-16 and UTF-32
    byte order marks in both byte orders, so a BOM is skipped regardless of
    the endianness of the machine that wrote the file.

    :param filename: path of the file to open.
    :return: an open binary file object whose read position is right after
        the BOM (or at offset 0 when there is none); ``None`` when
        *filename* is not an existing regular file.  The caller owns the
        returned handle and is responsible for closing it.
    """
    if not os.path.isfile(filename):
        return None

    # UTF-32 has to be probed before UTF-16: the UTF-32-LE mark
    # (FF FE 00 00) starts with the UTF-16-LE mark (FF FE).
    known_boms = (
        codecs.BOM_UTF32_LE,
        codecs.BOM_UTF32_BE,
        codecs.BOM_UTF8,
        codecs.BOM_UTF16_LE,
        codecs.BOM_UTF16_BE,
    )

    f = open(filename, 'rb')
    try:
        header = f.read(4)
        bom_len = next(
            (len(bom) for bom in known_boms if header.startswith(bom)), 0
        )
        # Leave the stream positioned on the first byte after the BOM.
        f.seek(bom_len)
    except Exception:
        # Do not leak the handle if the header cannot be read.
        f.close()
        raise
    return f
def export(self):
    """Serialise the rows as tab-separated text encoded in UTF-16.

    Each record is first written through ``csv.writer`` into a scratch
    buffer, then transcoded from UTF-8 to UTF-16 and appended to the final
    buffer.  When ``self.rows`` is truthy the output starts with
    ``codecs.BOM_UTF16`` followed by a header line built from
    ``self.rows.colnames``; the data lines come from ``self.represented()``.

    NOTE(review): the ``.decode`` calls on the buffer contents and on
    ``str(col)`` rely on byte-string semantics (Python 2) - confirm the
    interpreter this module targets.

    :return: ``str()`` of the accumulated output buffer.
    """
    import csv

    final = StringIO()
    out = StringIO()
    writer = csv.writer(out, delimiter='\t')

    def emit(cells):
        # Write one CSV line into the scratch buffer, transcode it, append
        # it to the final buffer, then reset the scratch buffer.
        writer.writerow(cells)
        chunk = out.getvalue().decode("utf8").encode("utf-16")
        # The utf-16 codec prefixes every chunk with a BOM; drop those two
        # bytes because the BOM is written once, up front.
        final.write(chunk[2:])
        out.truncate(0)

    if self.rows:
        import codecs
        final.write(codecs.BOM_UTF16)
        emit([to_unicode(col, "utf8") for col in self.rows.colnames])

    for row in self.represented():
        emit([str(col).decode('utf8').encode("utf-8") for col in row])

    return str(final.getvalue())
def export(self):
    """Build a UTF-16, tab-separated dump of the rows.

    Lines are produced one at a time: ``csv.writer`` writes into a scratch
    ``cStringIO`` buffer, the buffer content is re-encoded from UTF-8 to
    UTF-16 and appended to the result buffer, and the scratch buffer is
    emptied.  If ``self.rows`` is truthy, ``codecs.BOM_UTF16`` and a header
    line made from ``self.rows.colnames`` are written first.  The data lines
    are taken from ``self.represented()``.

    NOTE(review): ``unicode`` and ``cStringIO`` exist only on Python 2 -
    confirm the interpreter this module targets.

    :return: ``str()`` of the result buffer's content.
    """
    import csv

    scratch = cStringIO.StringIO()
    result = cStringIO.StringIO()
    writer = csv.writer(scratch, delimiter='\t')

    def append_line(cells):
        # csv -> utf8 text -> utf-16 bytes, minus the 2-byte BOM that the
        # utf-16 codec adds to every encoded chunk.
        writer.writerow(cells)
        text = scratch.getvalue().decode("utf8")
        payload = text.encode("utf-16")[2:]
        result.write(payload)
        scratch.truncate(0)

    if self.rows:
        import codecs
        # Single BOM at the very start of the output.
        result.write(codecs.BOM_UTF16)
        header_cells = [unicode(col).encode("utf8") for col in self.rows.colnames]
        append_line(header_cells)

    records = self.represented()
    for row in records:
        row_cells = [str(col).decode('utf8').encode("utf-8") for col in row]
        append_line(row_cells)

    return str(result.getvalue())
def _wmic_output():
    """
    Returns the output from running the built-in `wmic` command.

    Redirects the output of `wmic` to a temporary file and then reads it
    back in. This would be cleaner if done using subprocess, but attempting
    to capture `stdout` internally led to freezing under Windows XP. (This
    may have been happening because the script is not being run as a main
    process.)

    The temporary file is removed on every exit path, including when the
    command fails or the file cannot be read.

    :return: the decoded text written by `wmic logicaldisk list full /format:csv`.
    :raises Exception: if the shell command exits with a non-zero status.
    """
    # choose a unique file name (re-entrant/thread-safe/crash-safe)
    output_path = os.path.join(
        tempfile.gettempdir(), "kolibri_disks-{}.txt".format(uuid.uuid4())
    )

    # pipe output from the WMIC command to the temp file; the path is quoted
    # so that a temp directory containing spaces does not break the redirect
    cmd = 'wmic logicaldisk list full /format:csv > "{}"'.format(output_path)

    try:
        return_code = os.system(cmd)
        if return_code:
            raise Exception("Could not run command '{}'".format(cmd))

        # output from WMIC is ostensibly UTF-16
        with open(output_path, 'rb') as f:
            bin_output = f.read()
    finally:
        # clean up temp file, even on failure (the shell creates the
        # redirect target before it runs the command)
        if os.path.exists(output_path):
            os.remove(output_path)

    # The very first time WMIC is run on a windows machine, the output gets mangled.
    # The BOM is replaced by WMIC's initialization message, so we need to put it back.
    # (On all subsequent runs, these next lines do nothing.)
    init_msg = "Please wait while WMIC is being installed.".encode('ascii')  # Yes, ascii.
    bin_output = bin_output.replace(init_msg, codecs.BOM_UTF16)

    # finally, decode the well-formatted UTF-16 byte string
    return bin_output.decode('utf-16')
def decode(str, errors='strict'):
    """
    Decode a byte string to text, honouring a leading UTF-8 or UTF-16 BOM.

    Decoding strategy:

    * input shorter than three bytes that is a prefix of the UTF-8 BOM
      (including empty input) yields ``""``;
    * a UTF-8 BOM is skipped and the rest is decoded as UTF-8;
    * a UTF-16 BOM (either byte order) selects the ``utf-16`` codec, which
      consumes the BOM itself;
    * anything else is decoded as UTF-8;
    * if any of the above fails, the raw input is decoded as cp1251, and
      if that fails too an empty string is returned.

    :param str: byte string to decode (the name shadows the builtin and is
        kept only so existing keyword callers keep working)
    :param str errors: error handler passed to the UTF-8 / UTF-16 decoders
    :return: the decoded text, or ``""`` when nothing could be decoded
    """
    # Work on an alias: the parameter hides the builtin ``str``, so calling
    # ``str(...)`` in here would try to call the input itself.
    data = str
    try:
        if len(data) < 3 and codecs.BOM_UTF8.startswith(data):
            # not enough data to decide if this is a BOM
            return ""
        if data[:3] == codecs.BOM_UTF8:
            output, _ = codecs.utf_8_decode(data[3:], errors)
        elif data[:2] in (codecs.BOM_UTF16_LE, codecs.BOM_UTF16_BE):
            # The BOM is two bytes long; leave it in place so the codec
            # can pick the right byte order from it.
            output = data.decode('utf-16', errors)
        else:
            # no BOM present
            output, _ = codecs.utf_8_decode(data, errors)
        return output
    except Exception:
        # seems it is not getting text content (images, files, etc.);
        # deliberately best-effort, so never propagate an error
        try:
            return data.decode('cp1251')
        except Exception:
            return ""
def get_decoded_header(header, value):
    """Decode the first RFC 2047 chunk of a header value to text.

    Only the first ``(data, charset)`` pair returned by ``decode_header`` is
    used.  A leading byte order mark is dropped by decoding with the
    BOM-aware codecs (``utf-8-sig``, ``utf-16``, ``utf-32``); BOM-like byte
    sequences further inside the payload are left alone, since deleting
    them can splice unrelated bytes together and corrupt the text.

    :param header: header name; not used by this function.
    :param value: header value handed to ``decode_header``.
    :return: the decoded text; ``str(data)`` when the charset is ``None`` or
        is ``unknown-8bit`` and no UTF codec fits; ``None`` when the first
        chunk is not a ``bytes`` object.
    :raises LookupError: if the declared charset is not a known codec.
    :raises UnicodeDecodeError: if the data does not match the declared
        charset.
    """
    subject, encoding = decode_header(value)[0]
    subject = subject.strip()  # extra whitespace will mess up encoding

    if not isinstance(subject, bytes):
        # NOTE(review): non-bytes chunks have always produced None here;
        # confirm callers do not expect the text back instead.
        return None

    # Codecs that consume a leading BOM (either byte order) on their own.
    bom_aware_codecs = {
        'utf-8': 'utf-8-sig',
        'utf-16': 'utf-16',
        'utf-32': 'utf-32',
    }
    if encoding in bom_aware_codecs:
        return subject.decode(bom_aware_codecs[encoding])

    # Try various UTF decodings for any unknown 8bit encodings
    if encoding == 'unknown-8bit':
        # 32 before 16 so it raises errors
        for candidate in ('utf-8-sig', 'utf-32', 'utf-16'):
            try:
                return subject.decode(candidate)
            except UnicodeDecodeError:
                continue
        # If none of those encodings work, hand back str() of the raw bytes
        return str(subject)

    # Better to have the analyst decode themselves than to provide a
    # mangled string
    if encoding is None:
        return str(subject)

    return subject.decode(encoding)