PyCURL is processing body before headers

Question

Inspired by the accepted answer to this question, I'm trying to wrap PyCurl with a requests-like interface. Everything would be fine, but after following the PyCURL docs describing how to read the body encoding from the headers, I'm experiencing the following problem: the header callback is called for every response header, but only after the iterator starts yielding response lines, which makes encoding/charset detection pointless.

Here's the code:

import re
import io
import urllib
import urllib.error
import http

import pycurl


class CurlHTTPStream(object):
    """Streaming HTTP response read through a pycurl easy handle on a multi handle.

    The body is collected in an in-memory buffer by the write callback and
    handed out in chunks/lines by the iterator methods; response headers are
    collected by ``header_function`` into ``self.headers``.

    NOTE: in this version the charset is computed at the top of
    ``_split_lines_from_chunks``, i.e. before the first chunk has been pulled
    from ``_iter_chunks`` and therefore before any transfer has been
    performed. ``self.headers`` is still empty at that point, so the fallback
    encoding is always chosen.
    """

    # Timeout handed to CurlMulti.select() between perform rounds
    # (presumably seconds -- confirm against the pycurl docs).
    SELECT_TIMEOUT = 10
    # Encoding used to decode header lines and as the fallback body charset.
    HTTP_STANDARD_ENCODING = 'iso-8859-1'

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles for one request; nothing is sent yet.

        :param method: HTTP verb, passed to curl as the custom request method.
        :param url: target URL; ``params`` are appended to it as a query string.
        :param data: accepted but not used anywhere in this constructor.
        :param params: optional mapping joined as ``key=value`` pairs with
            ``&`` (no URL-encoding is applied).
        :param headers: optional mapping sent as ``Name: value`` request headers.
        """
        # Stored before the query string is appended, so this is the bare URL.
        self.url = url
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ]
            )
        if params:
            query_string = '&'.join((
                '{}={}'.format(key, value)
                for key, value in params.items()
            ))
            url = '{}?{}'.format(url, query_string)
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        # Header lines go to header_function; body bytes are appended to the
        # in-memory buffer.
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

        # 0 means "status not known yet"; see _check_status_code().
        self.status_code = 0
        # Filled by header_function(); keys are lower-cased header names.
        self.headers = {}

    def _any_data_received(self):
        """Return True if the buffer position is non-zero, i.e. bytes were written."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.truncate(0)
        self.received_buffer.seek(0)
        return result

    def _check_status_code(self):
        """Fetch the status code once and raise HTTPError unless it is 0 or 200.

        :raises urllib.error.HTTPError: when a non-zero status other than
            ``http.HTTPStatus.OK`` has been recorded.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        if self.status_code != 0 and self.status_code != http.HTTPStatus.OK:
            raise urllib.error.HTTPError(
                self.url, self.status_code, None, None, None
            )

    def _perform_on_curl(self):
        """Call perform() until it stops asking to be called again.

        :returns: the handle count reported by the last ``perform()`` call.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Yield body bytes as they accumulate, driving the transfer in between.

        After the loop ends the status code and curl-level errors are checked
        one final time.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            # Wait for activity (or the timeout) before performing again.
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise pycurl.error for the first entry in info_read()'s third element.

        NOTE(review): per the pycurl docs that element should be the list of
        failed transfers as (handle, errno, message) tuples -- confirm.
        """
        for f in self.curl_multi.info_read()[2]:
            raise pycurl.error(*f[1:])

    def iter_lines(self):
        """Return a generator of decoded body lines."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Yield decoded lines from an iterable of byte chunks.

        A trailing partial line is carried over and prepended to the next
        chunk; whatever is left after the last chunk is yielded as a final line.
        """
        print('foo')
        print(self.headers)
        # NOTE: this charset detection runs on the first next() call, before
        # the loop below pulls the first chunk, so self.headers has not been
        # populated yet and the fallback branch is taken.
        charset = None
        if 'content-type' in self.headers:
            content_type = self.headers['content-type'].lower()
            match = re.search('charset=(\S+)', content_type)
            if match:
                charset = match.group(1)
                print('Decoding using %s' % charset)
        if charset is None:
            charset = self.HTTP_STANDARD_ENCODING
            print('Assuming encoding is %s' % charset)
        pending = None
        for chunk in chunks:
            if pending is not None:
                chunk = pending + chunk
            lines = chunk.splitlines()
            # If the last split line ends with the chunk's last byte, the chunk
            # did not end with a line terminator: hold that line back.
            if lines and lines[-1] and chunk and lines[-1][-1] == chunk[-1]:
                pending = lines.pop()
            else:
                pending = None
            for line in lines:
                yield line.decode(charset)
        if pending is not None:
            yield pending.decode(charset)

    def header_function(self, header_line):
        """Header callback: record one ``Name: value`` line in ``self.headers``.

        Lines without a colon are ignored. Names are stripped and lower-cased;
        a repeated name overwrites the earlier value.
        """
        print('hello')
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if ':' not in header_line:
            return
        name, value = header_line.split(':', 1)
        name = name.strip()
        value = value.strip()
        name = name.lower()
        self.headers[name] = value


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Create a streaming request object when ``stream`` is truthy.

    :param method: HTTP verb forwarded to ``CurlHTTPStream``.
    :param url: target URL.
    :param data: forwarded unchanged to ``CurlHTTPStream``.
    :param params: optional query parameters mapping.
    :param headers: optional request headers mapping.
    :param stream: only the streaming mode is implemented.
    :returns: a ``CurlHTTPStream`` when ``stream`` is truthy, otherwise ``None``.
    """
    if not stream:
        # Non-streaming requests are not implemented; nothing is returned.
        return None
    options = dict(data=data, params=params, headers=headers)
    return CurlHTTPStream(method, url, **options)

And that's what happens in the terminal when I try to test it:

Python 3.5.1 (default, Dec 09 2015, 07:29:36) [GCC] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> from pycurl_requests.requests import request
>>> r = request('GET', 'http://my-couchdb-instance:5984/user-30323561366530622d336135622d343637372d386464392d613038653536663865636566/_changes', params={'feed': 'continuous'}, stream=True)
>>> for l in r.iter_lines():
...     print(l)
... 
foo
{}
Assuming encoding is iso-8859-1
hello
hello
hello
hello
hello
hello
hello
{"seq":1,"id":"account","changes":[{"rev":"1-806053b347406e04d1872e13199fd3cf"}]}
{"seq":4,"id":"identity-bd2c5007-9df3-4ece-9751-843bf5523edd","changes":[{"rev":"1-e3a98ec37776f2cb479b2dcae0266700"}]}
{"seq":5,"id":"section_phone-0342667c-ecbd-401f-acfe-7bb2a1aa3159","changes":[{"rev":"1-457342bc895c7cb6924ceabd07e1ffcf"}]}

There are more lines coming from CouchDB changes feed but I truncated the output since they're not relevant.

Basically, foo in the output indicates that it enters the block where it expects the headers to be in place, but the next line shows that self.headers is empty. Each hello stands for one call to header_function(). How can it be that the write callback, which writes the body to BytesIO, is called before the header callback gets triggered?

score 0 · Answer 1 · answered Jul 14 '16 at 12:21

I've found the solution. The problem was that _split_lines_from_chunks(self, chunks) was triggered before anything had arrived from the response, so the headers were not there yet either.

Here's the code that works. The charset is detected when the first line of the body is available, so all the headers are guaranteed to have been processed by then.

import re
import io
import urllib
import urllib.error
import http

import pycurl


class CurlHTTPStream(object):
    """Streaming HTTP response read through a pycurl easy handle on a multi handle.

    Body bytes are appended to an in-memory buffer by curl's write callback
    and handed out by ``iter_lines()``; response headers are collected by
    ``header_function`` into ``self.headers`` (lower-cased names). The body
    charset is resolved lazily, at the moment the first line is decoded, so
    the headers have already been processed by then.
    """

    SELECT_TIMEOUT = 10
    HTTP_STANDARD_ENCODING = 'iso-8859-1'
    # Charset parameter of a Content-Type value: optional quotes, and the
    # match stops at ';' so trailing parameters are not swallowed.
    _CHARSET_RE = re.compile(
        r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE
    )

    def __init__(self, method, url, data=None, params=None, headers=None):
        """Configure the curl handles for one request; nothing is sent yet.

        :param method: HTTP verb, passed to curl as the custom request method.
        :param url: target URL; ``params`` are appended as a query string.
        :param data: accepted for interface compatibility.
            NOTE(review): it is not sent with the request yet.
        :param params: optional mapping of query parameters; names and values
            are URL-encoded.
        :param headers: optional mapping sent as ``Name: value`` request headers.
        """
        # Local import: the module only imports urllib and urllib.error.
        from urllib.parse import urlencode

        # Response state is set up first so the callbacks registered below
        # always find it in place.
        self.url = url
        self.status_code = 0
        self.headers = {}
        self._charset = None
        self.received_buffer = io.BytesIO()

        self.curl = pycurl.Curl()
        self.curl.setopt(pycurl.CUSTOMREQUEST, method)
        if headers:
            self.curl.setopt(
                pycurl.HTTPHEADER,
                [
                    '{}: {}'.format(key, value)
                    for key, value in headers.items()
                ]
            )
        if params:
            # Percent-encode names/values and respect an existing query string.
            separator = '&' if '?' in url else '?'
            url = '{}{}{}'.format(url, separator, urlencode(params))
        self.curl.setopt(pycurl.URL, url)
        self.curl.setopt(pycurl.ENCODING, 'gzip')
        self.curl.setopt(pycurl.CONNECTTIMEOUT, 5)
        self.curl.setopt(pycurl.HEADERFUNCTION, self.header_function)
        self.curl.setopt(pycurl.WRITEFUNCTION, self.received_buffer.write)

        self.curl_multi = pycurl.CurlMulti()
        self.curl_multi.add_handle(self.curl)

    def _any_data_received(self):
        """Return True if body bytes are waiting in the buffer."""
        return self.received_buffer.tell() != 0

    def _get_received_data(self):
        """Return the buffered body bytes and reset the buffer to empty."""
        result = self.received_buffer.getvalue()
        self.received_buffer.seek(0)
        self.received_buffer.truncate(0)
        return result

    def _check_status_code(self):
        """Fetch the status code once and raise for non-successful responses.

        A status of 0 means no response status is known yet and is not an
        error; every 2xx status counts as success.

        :raises urllib.error.HTTPError: for any other status code; the
            message is the standard reason phrase when the code is known.
        """
        if self.status_code == 0:
            self.status_code = self.curl.getinfo(pycurl.HTTP_CODE)
        code = self.status_code
        if code == 0 or 200 <= code < 300:
            return
        try:
            reason = http.HTTPStatus(code).phrase
        except ValueError:
            # Non-standard status code: no canonical phrase available.
            reason = None
        raise urllib.error.HTTPError(self.url, code, reason, None, None)

    def _perform_on_curl(self):
        """Call perform() until it stops asking to be called again.

        :returns: the handle count reported by the last ``perform()`` call.
        """
        while True:
            ret, num_handles = self.curl_multi.perform()
            if ret != pycurl.E_CALL_MULTI_PERFORM:
                break
        return num_handles

    def _iter_chunks(self):
        """Yield body bytes as they accumulate, driving the transfer in between.

        :raises urllib.error.HTTPError: see ``_check_status_code``.
        :raises pycurl.error: see ``_check_curl_errors``.
        """
        while True:
            remaining = self._perform_on_curl()
            if self._any_data_received():
                self._check_status_code()
                yield self._get_received_data()
            if remaining == 0:
                break
            # Wait for activity (or the timeout) before performing again.
            self.curl_multi.select(self.SELECT_TIMEOUT)

        self._check_status_code()
        self._check_curl_errors()

    def _check_curl_errors(self):
        """Raise ``pycurl.error`` for the first failed transfer, if any."""
        failed = self.curl_multi.info_read()[2]
        if failed:
            # Each entry starts with the handle; the rest describes the error.
            raise pycurl.error(*failed[0][1:])

    def iter_lines(self):
        """Return a generator of decoded body lines."""
        chunks = self._iter_chunks()
        return self._split_lines_from_chunks(chunks)

    def _split_lines_from_chunks(self, chunks):
        """Yield decoded lines (without terminators) from byte chunks.

        An unterminated tail is carried over to the next chunk. A tail ending
        in a lone CR is carried over as well, because the matching LF may be
        the first byte of the next chunk; this avoids a spurious empty line
        when a CRLF pair is split across chunks.
        """
        pending = b''
        for chunk in chunks:
            pieces = (pending + chunk).splitlines(keepends=True)
            pending = b''
            if pieces and not pieces[-1].endswith(b'\n'):
                pending = pieces.pop()
            for piece in pieces:
                # self.charset is looked up here, after data has arrived.
                yield piece.rstrip(b'\r\n').decode(self.charset)
        # Flush whatever is left once the stream has ended.
        for line in pending.splitlines():
            yield line.decode(self.charset)

    @property
    def charset(self):
        """Charset for decoding the body, taken from the Content-Type header.

        Falls back to ``HTTP_STANDARD_ENCODING`` when the header carries no
        charset. The result is cached only once a Content-Type header has
        been seen, so an early access cannot pin the fallback permanently.
        """
        if self._charset is not None:
            return self._charset
        content_type = self.headers.get('content-type')
        if content_type is None:
            return self.HTTP_STANDARD_ENCODING
        match = self._CHARSET_RE.search(content_type)
        if match:
            self._charset = match.group(1).lower()
        else:
            self._charset = self.HTTP_STANDARD_ENCODING
        return self._charset

    def header_function(self, header_line):
        """Header callback: record one ``Name: value`` line in ``self.headers``.

        A status line starts a new response, so headers gathered from an
        earlier response (e.g. ``100 Continue``) are discarded. Other lines
        without a colon are ignored. Names are stripped and lower-cased; a
        repeated name overwrites the earlier value.

        :param header_line: raw header line as bytes.
        """
        header_line = header_line.decode(self.HTTP_STANDARD_ENCODING)
        if header_line.startswith('HTTP/'):
            self.headers.clear()
            self._charset = None
            return
        name, separator, value = header_line.partition(':')
        if not separator:
            return
        self.headers[name.strip().lower()] = value.strip()


def request(method, url, data=None, params=None, headers=None,
            stream=False):
    """Create a streaming request object when ``stream`` is truthy.

    :param method: HTTP verb forwarded to ``CurlHTTPStream``.
    :param url: target URL.
    :param data: forwarded unchanged to ``CurlHTTPStream``.
    :param params: optional query parameters mapping.
    :param headers: optional request headers mapping.
    :param stream: only the streaming mode is implemented.
    :returns: a ``CurlHTTPStream`` when ``stream`` is truthy, otherwise ``None``.
    """
    if not stream:
        # Non-streaming requests are not implemented; nothing is returned.
        return None
    options = dict(data=data, params=params, headers=headers)
    return CurlHTTPStream(method, url, **options)

PyCURL is processing body before headers

1 Answers1