Inspired by accepted answer to this question I'm trying to wrap PyCurl with requests
-like interface. Everythig would be fine, but after following PyCURL docs describing how to read body encoding from headers, I'm experiencing the following problem. Header callback is called for every response header but only after iterator starts yielding response lines, which makes encoding/charset detection pointless.
Here's the code:
import re
import io
import urllib
import urllib.error
import http
import pycurl
class CurlHTTPStream(object):
SELECT_TIMEOUT = 10
HTTP_STANDARD_ENCODING = 'iso-8859-1'
def __init__(self, method, url, data=None, params=None, headers=None):
self.url = url
self.received_buffer = io.BytesIO()
self.curl = pycurl.Curl()
self.curl.setopt(pycurl.CUSTOMREQUEST, method)
if headers:
self.curl.setopt(
pycurl.HTTPHEADER,
[
'{}: {}'.format(key, value)
for key, value in headers.items()
]
)
if params:
query_string = '&'.join((
'{}={}'.format(key, value)
for key, value in params.items()
))
url = '{}?{}'.format(url, query_string)
self.curl.setopt(pycurl.URL, url)
self.curl.setopt(pycurl.ENCODING, 'gzip')
self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)
self.curl_multi = pycurl.CurlMulti()
self.curl_multi.add_handle(self.curl)
self.status_code = 0
self.headers = {}
def _any_data_received(self):
return self.received_buffer.tell() != 0
def _get_received_data(self):
result = self.received_buffer.getvalue()
self.received_buffer.truncate(0)
self.received_buffer.seek(0)
return result
def _check_status_code(self):
if self.status_code == 0:
self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
raise urllib.error.HTTPError(
self.url, self.status_code, None, None, None
)
def _perform_on_curl(self):
while True:
ret, num_handles = self.curl_multi.perform()
if ret != pycurl.E_CALL_MULTI_PERFORM:
break
return num_handles
def _iter_chunks(self):
while True:
remaining = self._perform_on_curl()
if self._any_data_received():
self._check_status_code()
yield self._get_received_data()
if remaining == 0:
break
self.curl_multi.select(self.SELECT_TIMEOUT)
self._check_status_code()
self._check_curl_errors()
def _check_curl_errors(self):
for f in self.curl_multi.info_read()[2]:
raise pycurl.error(*f[1:])
def iter_lines(self):
chunks = self._iter_chunks()
return self._split_lines_from_chunks(chunks)
def _split_lines_from_chunks(self, chunks):
print('foo')
print(self.headers)
charset = None
if 'content-type' in self.headers:
content_type = self.headers['content-type'].lower()
match = re.search('charset=(\S+)', content_type)
if match:
charset = match.group(1)
print('Decoding using %s' % charset)
if charset is None:
charset = self.HTTP_STANDARD_ENCODING
print('Assuming encoding is %s' % charset)
pending = None
for chunk in chunks:
if pending is not None:
chunk = pending + chunk
lines = chunk.splitlines()
if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
pending = lines.pop()
else:
pending = None
for line in lines:
yield line.decode(charset)
if pending is not None:
yield pending.decode(charset)
def header_function(self, header_line):
print('hello')
header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
if ':' not in header_line:
return
name, value = header_line.split(':', 1)
name = name.strip()
value = value.strip()
name = name.lower()
self.headers[name] = value
def request(method, url, data=None, params=None, headers=None,
stream=False):
if stream:
return CurlHTTPStream(method, url, data=data, params=params,
headers=headers)
And that's what happens in the terminal when I try to test it:
Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pycurl_requests.requests import request
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True)
>>> for l in r.iter_lines():
... print(l)
...
foo
{}
Assuming encoding is iso-8859-1
hello
hello
hello
hello
hello
hello
hello
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]}
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]}
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}
There are more lines coming from CouchDB changes feed but I truncated the output since they're not relevant.
Basically foo
in the output indicates that it enters the block where it expects headers to be in place but the next line shows that self.headers
is empty. And multiple hello
stands for every call to header_function()
. How can it be that write callback which writes the body to BytesIO
is called before header callback gets triggered?