我需要处理一个大文件,大约40万行、200 MB。但有时我必须从文件末尾向前(自下而上)处理。在这里如何使用迭代器(yield return)?基本上,我不想把所有内容都加载到内存中。我知道在.NET中使用迭代器效率更高。
除非使用固定大小的编码(例如ASCII),否则向后读取文本文件确实很棘手。如果是可变大小的编码(例如UTF-8),那么在获取数据时,您必须不断检查自己是否正处于某个字符的中间。
该框架没有内置任何内容,我怀疑您必须为每种可变宽度编码分别进行硬编码。
编辑:这已经过一些测试——但这并不意味着它不再有细微的错误。它使用了MiscUtil的StreamUtil,但我在底部只附上了必要的(新)方法。哦,它还需要重构——您会看到其中有一个相当庞大的方法:
using System;
using System.Collections;
using System.Collections.Generic;
using System.IO;
using System.Text;

namespace MiscUtil.IO
{
    /// <summary>
    /// Takes an encoding (defaulting to UTF-8) and a function which produces a seekable stream
    /// (or a filename for convenience) and yields lines from the end of the stream backwards.
    /// Only single byte encodings, and UTF-8 and Unicode, are supported. The stream
    /// returned by the function must be seekable.
    /// </summary>
    public sealed class ReverseLineReader : IEnumerable<string>
    {
        /// <summary>
        /// Buffer size to use by default. Classes with internal access can specify
        /// a different buffer size - this is useful for testing.
        /// </summary>
        private const int DefaultBufferSize = 4096;

        /// <summary>
        /// Maximum number of bytes at the start of a chunk which can belong to a character
        /// that began in the preceding part of the stream. A UTF-8 sequence is at most
        /// four bytes long, so up to three continuation bytes may be carried over.
        /// </summary>
        private const int MaxLeftOverBytes = 3;

        /// <summary>
        /// Means of creating a Stream to read from.
        /// </summary>
        private readonly Func<Stream> streamSource;

        /// <summary>
        /// Encoding to use when converting bytes to text
        /// </summary>
        private readonly Encoding encoding;

        /// <summary>
        /// Size of buffer (in bytes) to read each time we read from the
        /// stream. This must be at least as big as the maximum number of
        /// bytes for a single character.
        /// </summary>
        private readonly int bufferSize;

        /// <summary>
        /// Function which, when given a position within a file and a byte, states whether
        /// or not the byte represents the start of a character.
        /// </summary>
        private readonly Func<long, byte, bool> characterStartDetector;

        /// <summary>
        /// Creates a LineReader from a stream source. The delegate is only
        /// called when the enumerator is fetched. UTF-8 is used to decode
        /// the stream into text.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        public ReverseLineReader(Func<Stream> streamSource)
            : this(streamSource, Encoding.UTF8)
        {
        }

        /// <summary>
        /// Creates a LineReader from a filename. The file is only opened
        /// (or even checked for existence) when the enumerator is fetched.
        /// UTF8 is used to decode the file into text.
        /// </summary>
        /// <param name="filename">File to read from</param>
        public ReverseLineReader(string filename)
            : this(filename, Encoding.UTF8)
        {
        }

        /// <summary>
        /// Creates a LineReader from a filename. The file is only opened
        /// (or even checked for existence) when the enumerator is fetched.
        /// </summary>
        /// <param name="filename">File to read from</param>
        /// <param name="encoding">Encoding to use to decode the file into text</param>
        public ReverseLineReader(string filename, Encoding encoding)
            : this(() => File.OpenRead(filename), encoding)
        {
        }

        /// <summary>
        /// Creates a LineReader from a stream source. The delegate is only
        /// called when the enumerator is fetched.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        /// <param name="encoding">Encoding to use to decode the stream into text</param>
        public ReverseLineReader(Func<Stream> streamSource, Encoding encoding)
            : this(streamSource, encoding, DefaultBufferSize)
        {
        }

        /// <summary>
        /// Creates a LineReader from a stream source with an explicit chunk size.
        /// </summary>
        /// <param name="streamSource">Data source</param>
        /// <param name="encoding">Encoding to use to decode the stream into text</param>
        /// <param name="bufferSize">Number of bytes to read from the stream per chunk</param>
        /// <exception cref="ArgumentNullException">streamSource or encoding is null</exception>
        /// <exception cref="ArgumentOutOfRangeException">bufferSize is not positive</exception>
        /// <exception cref="ArgumentException">The encoding is not single byte, UTF-8 or UTF-16</exception>
        internal ReverseLineReader(Func<Stream> streamSource, Encoding encoding, int bufferSize)
        {
            if (streamSource == null)
            {
                throw new ArgumentNullException("streamSource");
            }
            if (encoding == null)
            {
                throw new ArgumentNullException("encoding");
            }
            if (bufferSize < 1)
            {
                // A zero-sized chunk would never move the read position, so the
                // enumerator would loop forever.
                throw new ArgumentOutOfRangeException("bufferSize", "Buffer size must be positive");
            }

            this.streamSource = streamSource;
            this.encoding = encoding;
            this.bufferSize = bufferSize;

            if (encoding.IsSingleByte)
            {
                // For a single byte encoding, every byte is the start (and end) of a character
                characterStartDetector = (pos, data) => true;
            }
            else if (encoding is UnicodeEncoding)
            {
                // For UTF-16, even-numbered positions are the start of a character.
                // TODO: This assumes no surrogate pairs. More work required
                // to handle that.
                characterStartDetector = (pos, data) => (pos & 1) == 0;
            }
            else if (encoding is UTF8Encoding)
            {
                // For UTF-8, bytes with the top bit clear or the second bit set are the start of a character
                // See http://www.cl.cam.ac.uk/~mgk25/unicode.html
                characterStartDetector = (pos, data) => (data & 0x80) == 0 || (data & 0x40) != 0;
            }
            else
            {
                throw new ArgumentException("Only single byte, UTF-8 and Unicode encodings are permitted");
            }
        }

        /// <summary>
        /// Returns the enumerator reading strings backwards. If this method discovers that
        /// the returned stream is either unreadable or unseekable, a NotSupportedException is thrown.
        /// </summary>
        /// <returns>An enumerator yielding the lines of the stream, last line first.</returns>
        /// <exception cref="NotSupportedException">The stream cannot seek or cannot be read</exception>
        public IEnumerator<string> GetEnumerator()
        {
            Stream stream = streamSource();
            if (!stream.CanSeek)
            {
                stream.Dispose();
                throw new NotSupportedException("Unable to seek within stream");
            }
            if (!stream.CanRead)
            {
                stream.Dispose();
                throw new NotSupportedException("Unable to read within stream");
            }
            return GetEnumeratorImpl(stream);
        }

        /// <summary>
        /// Iterator which walks the stream from its end towards its start in chunks of
        /// <c>bufferSize</c> bytes, decoding each chunk and yielding complete lines.
        /// The stream is disposed when the iterator block's finally clause runs.
        /// </summary>
        /// <param name="stream">Readable, seekable stream to read from</param>
        /// <exception cref="InvalidDataException">
        /// The data cannot be split into characters for the configured encoding.
        /// </exception>
        private IEnumerator<string> GetEnumeratorImpl(Stream stream)
        {
            try
            {
                long position = stream.Length;

                if (encoding is UnicodeEncoding && (position & 1) != 0)
                {
                    throw new InvalidDataException("UTF-16 encoding provided, but stream has odd length.");
                }

                // Allow extra bytes for data from the start of the previous
                // read which didn't quite make it as full characters
                byte[] buffer = new byte[bufferSize + MaxLeftOverBytes];
                char[] charBuffer = new char[encoding.GetMaxCharCount(buffer.Length)];
                int leftOverData = 0;
                string previousEnd = null;
                // TextReader doesn't return an empty string if there's line break at the end
                // of the data. Therefore we don't return an empty string if it's our *first*
                // return.
                bool firstYield = true;

                // A line-feed at the start of the previous buffer means we need to swallow
                // the carriage-return at the end of this buffer - hence this needs declaring
                // way up here!
                bool swallowCarriageReturn = false;

                while (position > 0)
                {
                    int bytesToRead = Math.Min(position > int.MaxValue ? bufferSize : (int)position, bufferSize);

                    position -= bytesToRead;
                    stream.Position = position;
                    StreamUtil.ReadExactly(stream, buffer, bytesToRead);

                    // If we haven't read a full buffer, but we had bytes left
                    // over from before, copy them to the end of the data just read.
                    // Array.Copy is used because source and destination may overlap.
                    if (leftOverData > 0 && bytesToRead != bufferSize)
                    {
                        Array.Copy(buffer, bufferSize, buffer, bytesToRead, leftOverData);
                    }
                    // We've now *effectively* read this much data.
                    bytesToRead += leftOverData;

                    int firstCharPosition = 0;
                    while (!characterStartDetector(position + firstCharPosition, buffer[firstCharPosition]))
                    {
                        firstCharPosition++;
                        // Bad UTF-8 sequences could trigger this. A UTF-8 character has at most
                        // three continuation bytes, so a character start must appear within the
                        // first four bytes; and if this is the start of the file so we've done
                        // a short read, the character start must be somewhere in the usable buffer.
                        if (firstCharPosition > MaxLeftOverBytes || firstCharPosition == bytesToRead)
                        {
                            throw new InvalidDataException("Invalid UTF-8 data");
                        }
                    }
                    leftOverData = firstCharPosition;

                    int charsRead = encoding.GetChars(buffer, firstCharPosition, bytesToRead - firstCharPosition, charBuffer, 0);
                    int endExclusive = charsRead;

                    for (int i = charsRead - 1; i >= 0; i--)
                    {
                        char lookingAt = charBuffer[i];
                        if (swallowCarriageReturn)
                        {
                            swallowCarriageReturn = false;
                            if (lookingAt == '\r')
                            {
                                endExclusive--;
                                continue;
                            }
                        }
                        // Anything non-line-breaking, just keep looking backwards
                        if (lookingAt != '\n' && lookingAt != '\r')
                        {
                            continue;
                        }
                        // End of CRLF? Swallow the preceding CR
                        if (lookingAt == '\n')
                        {
                            swallowCarriageReturn = true;
                        }
                        int start = i + 1;
                        string bufferContents = new string(charBuffer, start, endExclusive - start);
                        endExclusive = i;
                        string stringToYield = previousEnd == null ? bufferContents : bufferContents + previousEnd;
                        if (!firstYield || stringToYield.Length != 0)
                        {
                            yield return stringToYield;
                        }
                        firstYield = false;
                        previousEnd = null;
                    }

                    // Whatever precedes the earliest line break in this chunk is the tail of a
                    // line whose beginning lies further back in the stream.
                    previousEnd = endExclusive == 0 ? null : (new string(charBuffer, 0, endExclusive) + previousEnd);

                    // If we didn't decode the start of the array, put it at the end for next time
                    if (leftOverData != 0)
                    {
                        Buffer.BlockCopy(buffer, 0, buffer, bufferSize, leftOverData);
                    }
                }
                if (leftOverData != 0)
                {
                    // At the start of the final buffer, we had the end of another character.
                    throw new InvalidDataException("Invalid UTF-8 data at start of stream");
                }
                if (firstYield && string.IsNullOrEmpty(previousEnd))
                {
                    yield break;
                }
                yield return previousEnd ?? "";
            }
            finally
            {
                stream.Dispose();
            }
        }

        /// <summary>
        /// Non-generic counterpart of <see cref="GetEnumerator"/>.
        /// </summary>
        IEnumerator IEnumerable.GetEnumerator()
        {
            return GetEnumerator();
        }
    }
}

// StreamUtil.cs:
public static class StreamUtil
{
    /// <summary>
    /// Reads exactly the requested number of bytes from the stream into the start of
    /// the buffer, calling <see cref="Stream.Read(byte[], int, int)"/> repeatedly
    /// until the request has been satisfied.
    /// </summary>
    /// <param name="input">Stream to read from</param>
    /// <param name="buffer">Buffer to fill, starting at index 0</param>
    /// <param name="bytesToRead">Number of bytes to read</param>
    /// <exception cref="EndOfStreamException">
    /// The stream ended before the requested number of bytes had been read.
    /// </exception>
    public static void ReadExactly(Stream input, byte[] buffer, int bytesToRead)
    {
        int index = 0;
        while (index < bytesToRead)
        {
            int read = input.Read(buffer, index, bytesToRead - index);
            if (read == 0)
            {
                int remaining = bytesToRead - index;
                // Singular for exactly one byte, plural otherwise.
                throw new EndOfStreamException
                    (String.Format("End of stream reached with {0} byte{1} left to read.",
                                   remaining,
                                   remaining == 1 ? "" : "s"));
            }
            index += read;
        }
    }
}
反馈非常欢迎。这很有趣:)