The bz2
module provides a standard open()
method from which one can call readline()
. However, my situation is one where I have a stream (pointing to a large amount of data) that I want to decompress lines from on the fly. My current implementation is as follows but I know there must be a more succinct way to do this.
import bz2
import csv
BZ2_BUFFER = ''
BZ2_DECOMPRESSOR = None
BZ2_FILE = None
BZ2_READ_SIZE = 100 * 1024
def bz2_csv_rows(fp):
global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
BZ2_BUFFER = ''
BZ2_DECOMPRESSOR = bz2.BZ2Decompressor()
BZ2_FILE = fp
for row in csv.reader(iter(bz2_line_reader, b'')):
yield row
def bz2_line_reader():
global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE
if BZ2_BUFFER is None:
return None
while '\n' not in BZ2_BUFFER:
bindata = BZ2_FILE.read(BZ2_READ_SIZE)
try:
data = BZ2_DECOMPRESSOR.decompress(bindata)
except EOFError:
break
except IOError:
pass
BZ2_BUFFER += data
if len(data) < BZ2_READ_SIZE:
BZ2_FILE = None
break
i = BZ2_BUFFER.find('\n')
if i is None or i < 0:
line = BZ2_BUFFER
BZ2_BUFFER = None
return line
line = BZ2_BUFFER[:i]
BZ2_BUFFER = BZ2_BUFFER[i + 1:]
return line
Thoughts?