In search of a solution similar to this one, but in Python, using gzip
or zlib.
This SO question How to inflate a partial zlib file does not work (see the first test case)
Not a duplicate of Unzipping part of a .gz file using python, that is not working (and outdated)
These two: Unzip part of a file using python gzip module and Is it possible to figure how to decompress a file, knowing its first bytes? are close to this question (though different) but unfortunately first one doesn't have a working solution and the second one doesn't have any answers at all...
I am iterating over chunked pieces of gzip bytes received from a remote server; it looks something like this:
async with aiohttp.ClientSession() as session:
async with session.get(LINK) as response:
with open(FILE, "wb") as f:
async for chunk in response.content.iter_chunked(chunk_size):
# Write the decompressed chunk
# to `f`
...
The following are the non-working solutions :
1)
decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
async with aiohttp.ClientSession() as session:
async with session.get(LINK) as response:
with open(FILE, "wb") as f:
async for chunk in response.content.iter_chunked(chunk_size):
# Write the decompressed chunk
r = decompressor.decompress(chunk, chunk_size)
# for some reason `r` is always empty
# writing to `f` is pointless
print(f"{len(chunk) = }, {r = }, {len(r) = }")
And here, the r
seems to be empty.
stdout :
len(chunk) = 64, r = b'', len(r) = 0
len(chunk) = 64, r = b'', len(r) = 0
len(chunk) = 64, r = b'', len(r) = 0
len(chunk) = 64, r = b'', len(r) = 0
len(chunk) = 64, r = b'', len(r) = 0
len(chunk) = 64, r = b'', len(r) = 0
...
2)
doing zlib.decompress(...)
doesn't seem to work either on partial data
async with aiohttp.ClientSession() as session:
async with session.get(LINK) as response:
with open(DIR, "wb") as f:
async for chunk in response.content.iter_chunked(chunk_size):
f.write(zlib.decompress(chunk))
This raises :
Traceback (most recent call last):
File "c:\Users\lumin\Desktop\rplace\get_data.py", line 54, in <module>
asyncio.run(main())
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\asyncio\runners.py", line 44, in run
return loop.run_until_complete(main)
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 641, in run_until_complete
return future.result()
File "c:\Users\lumin\Desktop\rplace\get_data.py", line 51, in main
await download_content(0)
File "c:\Users\lumin\Desktop\rplace\get_data.py", line 47, in download_content
f.write(zlib.decompress(chunk))
zlib.error: Error -3 while decompressing data: incorrect header check
3)
Passing in gzip.decompress(chunk)
like this :
with open(DIR, "wb") as f:
async for chunk in response.content.iter_chunked(chunk_size):
f.write(gzip.decompress(chunk))
Causes this :
Traceback (most recent call last):
File "c:\Users\lumin\Desktop\rplace\get_data.py", line 54, in <module>
asyncio.run(main())
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\asyncio\runners.py", line 44, in run
return loop.run_until_complete(main)
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\asyncio\base_events.py", line 641, in run_until_complete
return future.result()
File "c:\Users\lumin\Desktop\rplace\get_data.py", line 51, in main
await download_content(0)
File "c:\Users\lumin\Desktop\rplace\get_data.py", line 47, in download_content
f.write(gzip.decompress(chunk))
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\gzip.py", line 557, in decompress
return f.read()
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\gzip.py", line 301, in read
return self._buffer.read(size)
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\_compression.py", line 118, in readall
while data := self.read(sys.maxsize):
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\gzip.py", line 479, in read
self._read_eof()
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\gzip.py", line 523, in _read_eof
crc32, isize = struct.unpack("<II", self._read_exact(8))
File "C:\Users\lumin\AppData\Local\Programs\Python\Python310\lib\gzip.py", line 425, in _read_exact
raise EOFError("Compressed file ended before the "
EOFError: Compressed file ended before the end-of-stream marker was reached
The full code looks something like this :
import asyncio
import os
import zlib
from typing import Final

import aiohttp
if os.name == "nt":
    # Use the selector event loop on Windows: the default proactor loop is
    # known to print noisy "event loop is closed" errors at interpreter exit.
    asyncio.set_event_loop_policy(asyncio.WindowsSelectorEventLoopPolicy())
async def download_content(
    number: int, *, directory: str | None = None, chunk_size: int = 64
) -> None:
    """
    Download the content of an archived canvas-history file and
    extract it immediately.

    The remote file is a gzip stream; every downloaded chunk is fed to a
    streaming ``zlib`` decompressor so the archive is decompressed on the
    fly instead of being held in memory.

    Args:
        number: The number associated with the archive (0 to 77).
        directory: The directory to extract the file to, defaults to root.
        chunk_size: The size of the chunks to download and extract, defaults to 64.

    Raises:
        TypeError: Argument got invalid type.
        ValueError: number wasn't between 0 and 77.
    """
    if not isinstance(number, int):
        raise TypeError(f"'number' must be of type 'int' got {type(number)}")
    if not isinstance(directory, str) and directory is not None:
        raise TypeError(f"'directory' must be of type 'str' got {type(directory)}")
    if not isinstance(chunk_size, int):
        raise TypeError(f"'chunk_size' must be of type 'int' got {type(chunk_size)}")
    # BUG FIX: the original check `0 > number > 77` can never be true
    # (it means "number < 0 AND number > 77"), so invalid numbers
    # slipped straight through.
    if not 0 <= number <= 77:
        raise ValueError(f"'number' must be between 0 and 77 got {number}")
    LINK: Final[str] = "https://placedata.reddit.com/data/canvas-history/2022_place_canvas_history-"
    # Zero-pad to 12 digits, e.g. 0 -> "000000000000.csv.gzip".
    FILE_LOCATION: Final[str] = f"{number:012}.csv.gzip"
    DIR: Final[str] = directory if directory is not None else "./"
    # wbits = 16 + MAX_WBITS makes zlib expect a gzip header and trailer.
    decompressor = zlib.decompressobj(16 + zlib.MAX_WBITS)
    async with aiohttp.ClientSession() as session:
        async with session.get(LINK + FILE_LOCATION) as response:
            # [:-5] strips the trailing ".gzip", leaving "<id>.csv".
            with open(DIR + FILE_LOCATION[:-5], "wb") as f:
                async for chunk in response.content.iter_chunked(chunk_size):
                    # A streaming decompressor keeps partial state between
                    # calls, so early chunks may legitimately produce b"".
                    f.write(decompressor.decompress(chunk))
                # Write out anything still buffered in the decompressor.
                f.write(decompressor.flush())
async def main() -> None:
    """Entry point: download and extract archive number 0."""
    await download_content(0)


if __name__ == "__main__":
    # Guard the run so importing this module doesn't start a download.
    asyncio.run(main())
TLDR: We have received a gzip
file and are iterating over its chunks; we want to decompress this partial data as it arrives and write it to a file.