几天前,我在CodeReview上看到一个关于将字节数组进行Base-36编码的问题。但是,其下的答案既没有涉及如何解码回字节数组,也无法直接复用于其他进制(基数)的编码。
链接问题的答案使用BigInteger。因此,就实现而言,可以对基数及其数字进行参数化。
不过,BigInteger的问题在于,我们将输入视为假定的整数。但是,我们的输入(字节数组)只是一系列不透明的值。
.NET程序员如何使用BigInteger来创建合理有效且与基数无关的编码器,并具有解码支持以及处理字节序的能力,并且能够“解决”丢失的结尾零字节?
编辑 [2020/01/26]:FWIW,下面的代码及其单元测试与我在Github上的开源库一起发布。
编辑 [2016/04/19]:如果您更倾向于使用异常,可以修改Decode的部分实现代码,使其抛出InvalidDataException,而不是仅仅返回null。
编辑 [2014/09/14]:我在Encode()中添加了一个“HACK”,用于处理输入的最后一个字节为负数(即转换为sbyte时最高位被置位)的情况。我目前能想到的唯一合理的解决方案是用Array.Resize将数组长度增加1。该情况已通过新增的单元测试,但我没有针对它重新运行性能测试代码。如果可以的话,请始终在传给Encode()的输入末尾附带一个值为0的填充字节,以避免额外的内存分配。
我创建了一个RadixEncoding类(见“代码”部分),该类使用三个参数初始化:作为基数各位数字的字符串(digits)、输入/输出字节的字节序(bytesEndian),以及是否对结尾的零字节进行编码(includeProceedingZeros)。
要创建一个Base-36编码器,使用little-endian输入,且不保留结尾的零字节(第三个参数为false):
// Alphabet for Base-36: a character's position in this string is its digit value.
const string k_base36_digits = "0123456789abcdefghijklmnopqrstuvwxyz";
// Little-endian byte input; the final 'false' is includeProceedingZeros,
// i.e. trailing zero bytes are not padded into the encoded string.
var base36_no_zeros = new RadixEncoding(k_base36_digits, EndianFormat.Little, false);
然后实际执行编码/解码:
const string k_input = "A test 1234";
// Turn the text into the raw bytes that will be encoded.
byte[] input_bytes = System.Text.Encoding.UTF8.GetBytes(k_input);
// Bytes -> Base-36 string.
string encoded_string = base36_no_zeros.Encode(input_bytes);
// Base-36 string -> bytes. Decode returns null when the string contains a
// character that is not one of the encoder's digits.
byte[] decoded_bytes = base36_no_zeros.Decode(encoded_string);
使用Diagnostics.Stopwatch计时,运行环境为i7 860 @ 2.80GHz。计时时直接运行EXE本身,而不是在调试器下运行。
使用与上面相同的 k_base36_digits 字符串和 EndianFormat.Little 初始化编码器,并启用结尾零字节的处理(即使这些UTF8字节本身并没有多余的结尾零字节)。
对“A test 1234”的UTF8字节编码1,000,000次需要2.6567905秒。
对“A test 1234. Made slightly larger!”的UTF8字节编码100,000次需要1.1577325秒;将同一字符串解码相同次数需要1.244326秒。
如果没有Code Contracts重写器(ccrewrite),则必须用if/throw代码重新实现这些契约。
using System;
using System.Collections.Generic;
using System.Numerics;
using Contract = System.Diagnostics.Contracts.Contract;

/// <summary>Selects the digit orientation used by <see cref="RadixEncoding"/></summary>
public enum EndianFormat
{
    /// <summary>Least significant byte first (LSB)</summary>
    /// <remarks>Right-to-Left</remarks>
    /// <see cref="BitConverter.IsLittleEndian"/>
    Little,
    /// <summary>Most significant byte first (MSB)</summary>
    /// <remarks>Left-to-Right</remarks>
    Big,
};

/// <summary>Encodes/decodes bytes to/from a string</summary>
/// <remarks>
/// The input bytes are always handed to <see cref="BigInteger"/>, which reads them least
/// significant byte first. Digits are produced least significant first; for
/// <see cref="EndianFormat.Little"/> the digit list is then reversed so the encoded string
/// starts with the most significant digit, for <see cref="EndianFormat.Big"/> it is left as is.
///
/// <p>The constructor takes an <b>includeProceedingZeros</b> parameter which acts as a work-around
/// for an edge case with BigInteger: zero bytes at the end of the buffer are the most
/// significant bytes, so they contribute nothing to the value and are lost in the encoded string.
/// If such a loss absolutely cannot occur pass true for <b>includeProceedingZeros</b>
/// and for a tiny bit of extra processing it will handle the padding of zero digits (encoding)
/// or bytes (decoding).</p>
/// <p>Note: doing this for decoding <b>may</b> add an extra byte more than what was originally
/// given to Encode.</p>
/// </remarks>
// Based on the answers from http://codereview.stackexchange.com/questions/14084/base-36-encoding-of-a-byte-array/
public class RadixEncoding
{
    const int kByteBitCount = 8;

    readonly string kDigits;
    readonly double kBitsPerDigit;
    readonly BigInteger kRadixBig;
    readonly EndianFormat kEndian;
    readonly bool kIncludeProceedingZeros;

    /// <summary>Numerical base of this encoding</summary>
    public int Radix { get { return kDigits.Length; } }
    /// <summary>Endian ordering of bytes input to Encode and output by Decode</summary>
    public EndianFormat Endian { get { return kEndian; } }
    /// <summary>True if we want ending zero bytes to be encoded</summary>
    public bool IncludeProceedingZeros { get { return kIncludeProceedingZeros; } }

    public override string ToString()
    {
        return string.Format("Base-{0} {1}", Radix.ToString(), kDigits);
    }

    /// <summary>Create a radix encoder using the given characters as the digits in the radix</summary>
    /// <param name="digits">Digits to use for the radix-encoded string; index in the string is the digit's value</param>
    /// <param name="bytesEndian">Endian ordering of bytes input to Encode and output by Decode</param>
    /// <param name="includeProceedingZeros">True if we want ending zero bytes to be encoded</param>
    /// <exception cref="ArgumentNullException"><paramref name="digits"/> is null</exception>
    /// <exception cref="ArgumentException"><paramref name="digits"/> has fewer than two characters</exception>
    public RadixEncoding(string digits,
        EndianFormat bytesEndian = EndianFormat.Little, bool includeProceedingZeros = false)
    {
        // Plain if/throw instead of Contract.Requires<T>: the latter only works when the
        // assembly is post-processed by the Code Contracts rewriter.
        if (digits == null)
            throw new ArgumentNullException("digits");
        // A radix of 0 divides by zero and a radix of 1 never reduces the dividend in Encode.
        if (digits.Length < 2)
            throw new ArgumentException("At least two digits are required", "digits");

        int radix = digits.Length;

        kDigits = digits;
        kBitsPerDigit = System.Math.Log(radix, 2);
        kRadixBig = new BigInteger(radix);
        kEndian = bytesEndian;
        kIncludeProceedingZeros = includeProceedingZeros;
    }

    // Number of characters needed for encoding the specified number of bytes
    int EncodingCharsCount(int bytesLength)
    {
        return (int)Math.Ceiling((bytesLength * kByteBitCount) / kBitsPerDigit);
    }

    // Number of bytes needed to decoding the specified number of characters
    int DecodingBytesCount(int charsCount)
    {
        return (int)Math.Ceiling((charsCount * kBitsPerDigit) / kByteBitCount);
    }

    /// <summary>Encode a byte array into a radix-encoded string</summary>
    /// <param name="bytes">byte array to encode</param>
    /// <returns>The bytes encoded into a radix-encoded string; never null</returns>
    /// <remarks>If <paramref name="bytes"/> is zero length, returns an empty string</remarks>
    /// <exception cref="ArgumentNullException"><paramref name="bytes"/> is null</exception>
    public string Encode(byte[] bytes)
    {
        if (bytes == null)
            throw new ArgumentNullException("bytes");

        // Our code would build this result (empty string) anyway,
        // but catch the condition before doing any work.
        if (bytes.Length == 0)
            return string.Empty;

        // Upper bound on the digit count; also the length we pad up to when the
        // array ends with zeros.
        int result_length = EncodingCharsCount(bytes.Length);
        // List<> has a(n in-place) Reverse method. StringBuilder doesn't. That's why.
        var result = new List<char>(result_length);

        // HACK: BigInteger uses the last byte as the 'sign' byte. If that byte's MSB is set,
        // we need to pad the input with an extra 0 (ie, make it positive).
        // Array.Resize allocates a new array, so the caller's buffer is left untouched.
        if ((bytes[bytes.Length - 1] & 0x80) == 0x80)
            Array.Resize(ref bytes, bytes.Length + 1);

        var dividend = new BigInteger(bytes);
        // IsZero's computation is less complex than evaluating "dividend > 0"
        // which invokes BigInteger.CompareTo(BigInteger)
        while (!dividend.IsZero)
        {
            BigInteger remainder;
            dividend = BigInteger.DivRem(dividend, kRadixBig, out remainder);
            // dividend is never negative (see HACK above), so neither is the remainder
            result.Add(kDigits[(int)remainder]);
        }

        if (kIncludeProceedingZeros)
        {
            // pad with the character that represents 'zero'
            for (int x = result.Count; x < result_length; x++)
                result.Add(kDigits[0]);
        }

        // digits were produced least significant first; flip them for little-endian input
        if (kEndian == EndianFormat.Little)
            result.Reverse();

        return new string(result.ToArray());
    }

    // Appends the zero bytes that 'padCount' zero digits stand for.
    // Leaves a null result (invalid input) untouched so Decode can still return null.
    void DecodeImplPadResult(ref byte[] result, int padCount)
    {
        if (result == null || padCount <= 0)
            return;

        int new_length = result.Length + DecodingBytesCount(padCount);
        Array.Resize(ref result, new_length); // new bytes will be zero, just the way we want it
    }

    #region Decode (Little Endian)
    // Reads digits front-to-back starting at startIndex; returns null on an unknown character.
    byte[] DecodeImpl(string chars, int startIndex = 0)
    {
        var bi = new BigInteger();
        for (int x = startIndex; x < chars.Length; x++)
        {
            int i = kDigits.IndexOf(chars[x]);
            if (i < 0)
                return null; // invalid character
            bi *= kRadixBig;
            bi += i;
        }

        return bi.ToByteArray();
    }

    // Counts the zero digits at the front, decodes the rest, then re-adds zero bytes for them.
    byte[] DecodeImplWithPadding(string chars)
    {
        int pad_count = 0;
        for (int x = 0; x < chars.Length; x++, pad_count++)
        {
            if (chars[x] != kDigits[0])
                break;
        }

        var result = DecodeImpl(chars, pad_count);
        DecodeImplPadResult(ref result, pad_count);

        return result;
    }
    #endregion

    #region Decode (Big Endian)
    // Reads digits back-to-front, skipping the last startIndex characters;
    // returns null on an unknown character.
    byte[] DecodeImplReversed(string chars, int startIndex = 0)
    {
        var bi = new BigInteger();
        for (int x = (chars.Length - 1) - startIndex; x >= 0; x--)
        {
            int i = kDigits.IndexOf(chars[x]);
            if (i < 0)
                return null; // invalid character
            bi *= kRadixBig;
            bi += i;
        }

        return bi.ToByteArray();
    }

    // Counts the zero digits at the back, decodes the rest, then re-adds zero bytes for them.
    byte[] DecodeImplReversedWithPadding(string chars)
    {
        int pad_count = 0;
        for (int x = chars.Length - 1; x >= 0; x--, pad_count++)
        {
            if (chars[x] != kDigits[0])
                break;
        }

        var result = DecodeImplReversed(chars, pad_count);
        DecodeImplPadResult(ref result, pad_count);

        return result;
    }
    #endregion

    /// <summary>Decode a radix-encoded string into a byte array</summary>
    /// <param name="radixChars">radix string</param>
    /// <returns>The decoded bytes, or null if an invalid character is encountered</returns>
    /// <remarks>
    /// If <paramref name="radixChars"/> is an empty string, returns a zero length array
    ///
    /// Using <see cref="IncludeProceedingZeros"/> has the potential to return a buffer with an
    /// additional zero byte that wasn't in the input. So a 4 byte buffer was encoded, this could end up
    /// returning a 5 byte buffer, with the extra byte being null.
    /// </remarks>
    /// <exception cref="ArgumentNullException"><paramref name="radixChars"/> is null</exception>
    public byte[] Decode(string radixChars)
    {
        if (radixChars == null)
            throw new ArgumentNullException("radixChars");

        // BigInteger zero serialises to a single 0 byte, so honour the documented
        // "empty in, empty out" contract explicitly.
        if (radixChars.Length == 0)
            return new byte[0];

        if (kEndian == EndianFormat.Big)
            return kIncludeProceedingZeros
                ? DecodeImplReversedWithPadding(radixChars)
                : DecodeImplReversed(radixChars);
        else
            return kIncludeProceedingZeros
                ? DecodeImplWithPadding(radixChars)
                : DecodeImpl(radixChars);
    }
};
using System;
using Microsoft.VisualStudio.TestTools.UnitTesting;

// Returns true when the first input.Length elements of 'output' equal 'input'.
// 'output' may be longer (decoding can append extra zero bytes) but not shorter,
// and a null 'output' (failed decode) counts as a mismatch.
static bool ArraysCompareN<T>(T[] input, T[] output)
    where T : IEquatable<T>
{
    if (output == null || output.Length < input.Length)
        return false;

    for (int x = 0; x < input.Length; x++)
    {
        if (!output[x].Equals(input[x]))
            return false;
    }

    return true;
}

// Encodes 'bytes', decodes the result and checks the decoded bytes start with the input.
static bool RadixEncodingTest(RadixEncoding encoding, byte[] bytes)
{
    string encoded = encoding.Encode(bytes);
    byte[] decoded = encoding.Decode(encoded);

    return ArraysCompareN(bytes, decoded);
}

// Round-trips buffers through Base-36, with and without trailing-zero padding.
[TestMethod]
public void TestRadixEncoding()
{
    const string k_base36_digits = "0123456789abcdefghijklmnopqrstuvwxyz";
    var base36 = new RadixEncoding(k_base36_digits, EndianFormat.Little, true);
    // 'false' here: this instance exercises the configuration without zero padding
    var base36_no_zeros = new RadixEncoding(k_base36_digits, EndianFormat.Little, false);

    // trailing zero bytes, preceded by a byte with / without the sign bit set
    byte[] ends_with_zero_neg = { 0xFF, 0xFF, 0x00, 0x00 };
    byte[] ends_with_zero_pos = { 0xFF, 0x7F, 0x00, 0x00 };
    byte[] text = System.Text.Encoding.ASCII.GetBytes("A test 1234");

    Assert.IsTrue(RadixEncodingTest(base36, ends_with_zero_neg));
    Assert.IsTrue(RadixEncodingTest(base36, ends_with_zero_pos));
    Assert.IsTrue(RadixEncodingTest(base36_no_zeros, text));
}