How to read lines from arbitrary BZ2 streams for CSV?

Question

The bz2 module provides a standard open() method from which one can call readline(). However, my situation is one where I have a stream (pointing to a large amount of data) that I want to decompress lines from on the fly. My current implementation is as follows but I know there must be a more succinct way to do this.

import bz2
import csv

# Decompressed data that bz2_line_reader() has not yet returned as a line;
# bz2_line_reader() sets it to None once the last line has been handed out.
BZ2_BUFFER = ''

# Decompressor object that bz2_line_reader() feeds compressed chunks into.
BZ2_DECOMPRESSOR = None

# Stream that bz2_line_reader() reads compressed data from.
BZ2_FILE = None

# Number of bytes requested from BZ2_FILE on each read() call (100 KiB).
BZ2_READ_SIZE = 100 * 1024


def bz2_csv_rows(fp, read_size=100 * 1024, encoding='utf-8'):
    """Yield CSV rows parsed on the fly from a bz2-compressed binary stream.

    The stream is read and decompressed chunk by chunk, so the whole payload
    is never held in memory. All state is local to the generator, which
    means several streams can be consumed at the same time without
    interfering with each other.

    Args:
        fp: Object with a ``read(size)`` method returning compressed bytes
            (an open binary file, an HTTP response body, ...).
        read_size: Number of compressed bytes requested per ``read`` call.
        encoding: Text encoding of the decompressed CSV data.

    Yields:
        One list of strings per CSV record, as produced by ``csv.reader``.

    Raises:
        OSError: If the stream is not valid bz2 data.
        UnicodeDecodeError: If the decompressed data is not valid text in
            ``encoding``.
    """
    import codecs

    def decoded_lines():
        decompressor = bz2.BZ2Decompressor()
        # An incremental decoder copes with multi-byte characters that
        # straddle two decompressed chunks.
        decoder = codecs.getincrementaldecoder(encoding)()
        pending = ''
        while True:
            raw = fp.read(read_size)
            if not raw:
                break
            try:
                text = decoder.decode(decompressor.decompress(raw))
            except EOFError:
                # The bz2 stream already ended; trailing bytes are ignored.
                break
            pieces = (pending + text).split('\n')
            # The last piece has no newline yet; keep it for the next chunk.
            pending = pieces.pop()
            for piece in pieces:
                # Keep the newline so blank lines and newlines inside quoted
                # fields reach csv.reader unchanged.
                yield piece + '\n'
        pending += decoder.decode(b'', final=True)
        if pending:
            yield pending

    for row in csv.reader(decoded_lines()):
        yield row


def bz2_line_reader():
    """Return the next decompressed line from the module-level bz2 stream.

    Reads ``BZ2_READ_SIZE`` compressed bytes at a time from ``BZ2_FILE``,
    feeds them to ``BZ2_DECOMPRESSOR`` and accumulates the output in
    ``BZ2_BUFFER`` until a complete line is available.

    Returns:
        The next line with its trailing newline kept, so blank lines and
        newlines inside quoted CSV fields are not lost. On Python 3 the line
        is decoded from UTF-8 to ``str``. Once the stream is exhausted the
        function returns ``b''``, the sentinel used by
        ``iter(bz2_line_reader, b'')``.

    Raises:
        IOError: If the compressed data is invalid. The error is propagated
            rather than swallowed, because the decompressor cannot continue
            after it.
    """
    global BZ2_BUFFER, BZ2_DECOMPRESSOR, BZ2_FILE, BZ2_READ_SIZE

    if BZ2_BUFFER is None:
        return b''

    # Buffer raw bytes and decode only whole lines: a newline byte never
    # falls inside a multi-byte UTF-8 sequence, so a line boundary is always
    # a safe place to decode.
    if not isinstance(BZ2_BUFFER, bytes):
        BZ2_BUFFER = BZ2_BUFFER.encode('utf-8')

    while b'\n' not in BZ2_BUFFER and BZ2_FILE is not None:
        bindata = BZ2_FILE.read(BZ2_READ_SIZE)
        if not bindata:
            # Only an empty read means end of input; a chunk may legitimately
            # decompress to fewer bytes than were read, or to nothing at all.
            BZ2_FILE = None
            break
        try:
            data = BZ2_DECOMPRESSOR.decompress(bindata)
        except EOFError:
            # The bz2 stream already ended; trailing bytes are ignored.
            BZ2_FILE = None
            break
        BZ2_BUFFER += data

    i = BZ2_BUFFER.find(b'\n')
    if i < 0:
        # No newline left: hand out the remainder and mark exhaustion.
        line = BZ2_BUFFER
        BZ2_BUFFER = None
    else:
        line = BZ2_BUFFER[:i + 1]
        BZ2_BUFFER = BZ2_BUFFER[i + 1:]

    if not line:
        return b''
    # On Python 2 ``str`` is ``bytes`` and csv.reader expects byte strings.
    return line if str is bytes else line.decode('utf-8')

Thoughts?

IMHO, an io.TextIOWrapper over the decompressed stream is all you need, but I could not understand how you get your data... — Serge Ballesta, Dec 12 '17 at 18:15

martineau · Accepted Answer · 2019-04-03T17:56:32.050

6

Here's something that's a little more succinct, and (in my opinion) it's more readable and gets rid of all those nasty global variables your code uses:

import bz2
import csv
from functools import partial

class BZ2_CSV_LineReader(object):
    """Stream CSV rows out of a bz2-compressed file without loading it whole.

    Args:
        filename: Path of the bz2-compressed, UTF-8 encoded CSV file.
        buffer_size: Number of compressed bytes read from the file at a time.
    """

    def __init__(self, filename, buffer_size=4*1024):
        self.filename = filename
        self.buffer_size = buffer_size

    def readlines(self):
        """Yield each CSV record of the file as a list of strings."""
        with open(self.filename, 'rb') as file:
            for row in csv.reader(self._line_reader(file)):
                yield row

    def _line_reader(self, file):
        """Yield decompressed text lines (line endings kept) from ``file``.

        ``file`` must be a binary stream positioned at the start of the bz2
        data. The last line is yielded even when it has no trailing newline.
        """
        import codecs

        buffer = ''
        decompressor = bz2.BZ2Decompressor()
        # Decode incrementally so a multi-byte character split across two
        # decompressed chunks does not raise UnicodeDecodeError.
        decoder = codecs.getincrementaldecoder('utf-8')()
        reader = partial(file.read, self.buffer_size)

        for bindata in iter(reader, b''):
            buffer += decoder.decode(decompressor.decompress(bindata))
            if '\n' not in buffer:
                continue
            lines = buffer.splitlines(True)
            # A last piece without '\n' may be incomplete; hold it back until
            # more data arrives.
            buffer = '' if lines[-1].endswith('\n') else lines.pop()
            for line in lines:
                yield line

        # Flush whatever is left once the input is exhausted, so a final
        # line without a trailing newline is not silently dropped.
        buffer += decoder.decode(b'', True)
        for line in buffer.splitlines(True):
            yield line

if __name__ == '__main__':
    # Demo: print every CSV record found in the sample compressed file.
    bz2_csv_filename = 'test_csv.bz2'
    line_reader = BZ2_CSV_LineReader(bz2_csv_filename)
    for record in line_reader.readlines():
        print(record)

edited Apr 03 '19 at 17:56

answered Dec 12 '17 at 23:42

martineau

119,623
25
170
301

1

Thanks for the code - this is excellent! To make this code compatible with Python 3, change the line `block = decompressor.decompress(bindata)` to `block = decompressor.decompress(bindata).decode("utf-8")`. – Demitri Dec 24 '17 at 19:51
@Demitri: You're welcome...that's good to hear and your suggestion looks useful, however since this question is tagged "python-2.7", I won't be changing my answer with regards to that. – martineau Dec 24 '17 at 19:56
1

Oh no, I don't expect you to, just added it to help anyone else who wanted to use it with Python 3 (or future me who might find this answer again). – Demitri Dec 24 '17 at 20:15
1

@Demitri: Decided to incorporate the changes you mentioned into the answer in a way that makes it work in both Python 2 and 3 so it'll be useful to more folks. – martineau Dec 24 '17 at 23:05

Evgeny Smolin · Answer 2 · 2020-11-03T15:16:12.353

2

Maybe it'll be useful: I use Python 3 and I have a large csv.bz2 file. I handle it this way:

import bz2
import csv

def bz2_csv_rows(fp, encoding=None):
    """Yield CSV rows from a bz2-compressed file opened in text mode.

    Args:
        fp: File name, or an existing binary file object, holding the
            bz2-compressed CSV data.
        encoding: Text encoding of the decompressed data. ``None`` keeps
            the platform default, as before this parameter existed.

    Yields:
        One list of strings per CSV record.
    """
    # newline='' leaves line endings untranslated, as the csv module expects.
    with bz2.open(fp, mode='rt', encoding=encoding, newline='') as bzfp:
        yield from csv.reader(bzfp)

The key is to open the stream in text mode, using mode='rt' in the call to bz2.open(), instead of manually searching for "\n" in binary mode. However, I'm not sure whether this will work for streams that are not physical files.

edited Nov 03 '20 at 15:16

answered Nov 03 '20 at 15:09

Evgeny Smolin

46
1
2

This is much more simple than the accepted answer! – zedomel Jul 07 '22 at 19:14

How to read lines from arbitrary BZ2 streams for CSV?

2 Answers