I have been having issues processing large files. At first I was blaming the parser, so I wrote one, but still have the same issues. If I use this code to scan a 1,000,000 record file (250 MB), it takes up 4 GB of memory while processing. I would expect under 50 MB, considering I am only taking one line at a time:
func sample(fileURL: URL) {
if let aStreamReader = StreamReader(path: fileURL.path) {
defer {
aStreamReader.close()
}
while let line = aStreamReader.nextLine() {
// insert industrious code here... (a function call)
}
}
}
(note this doesn't do anything except read the file and discard the results) **
Why is the entire file being processed rather than one line at a time?
** The files I need to process range in the many GBs. I did not write the StreamReader - I have found the same code in a number of places with minor variations -- it appears to be based on a C# class. This is the StreamReader code I am using:
// StreamReader.swift
import Foundation
class StreamReader {
let encoding: String.Encoding
let chunkSize: Int
var fileHandle: FileHandle!
let delimData: Data
var buffer: Data
var atEof: Bool
init?(path: String, delimiter: String = "\n", encoding: String.Encoding = .utf8,
chunkSize: Int = 4096) {
guard let fileHandle = FileHandle(forReadingAtPath: path),
let delimData = delimiter.data(using: encoding) else {
return nil
}
self.encoding = encoding
self.chunkSize = chunkSize
self.fileHandle = fileHandle
self.delimData = delimData
self.buffer = Data(capacity: chunkSize)
self.atEof = false
}
deinit {
self.close()
}
/// Return next line, or nil on EOF.
func nextLine() -> String? {
precondition(fileHandle != nil, "Attempt to read from closed file")
// Read data chunks from file until a line delimiter is found:
while !atEof {
if let range = buffer.range(of: delimData) {
// Convert complete line (excluding the delimiter) to a string:
let line = String(data: buffer.subdata(in: 0..<range.lowerBound), encoding: encoding)
// Remove line (and the delimiter) from the buffer:
buffer.removeSubrange(0..<range.upperBound)
return line
}
let tmpData = fileHandle.readData(ofLength: chunkSize)
if !tmpData.isEmpty {
buffer.append(tmpData)
} else {
// EOF or read error.
atEof = true
if !buffer.isEmpty {
// Buffer contains last line in file (not terminated by delimiter).
let line = String(data: buffer as Data, encoding: encoding)
buffer.count = 0
return line
}
}
}
return nil
}
/// Start reading from the beginning of file.
func rewind() {
fileHandle.seek(toFileOffset: 0)
buffer.count = 0
atEof = false
}
/// Close the underlying file. No reading must be done after calling this method.
func close() {
fileHandle?.closeFile()
fileHandle = nil
}
}
extension StreamReader : Sequence {
func makeIterator() -> AnyIterator<String> {
return AnyIterator {
return self.nextLine()
}
}
}