I'm working on an OCR project with Scrapy, using the OCR API at https://ocr.space/ocrapi. I have some code that works successfully with requests:
file_string = ctypes.string_at(image_data_pointer, length.value)
payload_filename = 'my_hires_image.jpg'
# Post payload as multipart encoded image file with filename.
# requests.post(THE_URL, files={'file': (payload_filename, payload)})
payload = {'isOverlayRequired': overlay,
'apikey': api_key,
'language': language,
r = requests.post('https://api.ocr.space/parse/image',
files={payload_filename: file_string},
data=payload,
)
return r.content.decode()
I'm now trying to turn this into a Scrapy POST request. I have:
payload_filename = 'my_hires_image.jpg'
# Post payload as multipart encoded image file with filename.
# requests.post(THE_URL, files={'file': (payload_filename, payload)})
body = {'file': file_string,
'isOverlayRequired': True,
'apikey': 'mykey',
'language': 'eng',
}
files = {payload_filename: file_string}
yield FormRequest(url='https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
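Please note that file_string is a byte string; you can see it in the screenshot above. The code is giving me the following traceback (note that it comes from a version of the call that passes the dict as body=body rather than formdata=body):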
File "/\lib\site-packages\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "/\lib\site-packages\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "/\lib\site-packages\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "/\lib\site-packages\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "/\lib\site-packages\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "Emy_PROject/spiders\ocr_spider.py", line 148, in get_PDF
yield FormRequest(url='https://api.ocr.space/parse/image', headers=headers2, body=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
File "/\lib\site-packages\scrapy\http\request\form.py", line 27, in __init__
super(FormRequest, self).__init__(*args, **kwargs)
File "/\lib\site-packages\scrapy\http\request\__init__.py", line 26, in __init__
self._set_body(body)
File "/\lib\site-packages\scrapy\http\request\__init__.py", line 69, in _set_body
self._body = to_bytes(body, self.encoding)
File "/\lib\site-packages\scrapy\utils\python.py", line 117, in to_bytes
'object, got %s' % type(text).__name__)
TypeError: to_bytes must receive a unicode, str or bytes object, got dict
How can I get this working?
Edit: I also tried URL-encoding the dict first and passing the resulting string as formdata:
body = {'files':file_string,
'isOverlayRequired': True,
'apikey': '*******',
'language': 'eng',
}
body = urllib.parse.urlencode(body)
x = FormRequest('https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
This yields:
File "....\scrapy\utils\defer.py", line 102, in iter_errback
yield next(it)
File "....\scrapy\spidermiddlewares\offsite.py", line 29, in process_spider_output
for x in result:
File "....\scrapy\spidermiddlewares\referer.py", line 339, in <genexpr>
return (_set_referer(r) for r in result or ())
File "....\scrapy\spidermiddlewares\urllength.py", line 37, in <genexpr>
return (r for r in result or () if _filter(r))
File "....\scrapy\spidermiddlewares\depth.py", line 58, in <genexpr>
return (r for r in result or () if _filter(r))
File "myproject\spiders\ocr_spider.py", line 151, in get_PDF
x = FormRequest('https://api.ocr.space/parse/image', headers=headers2, formdata=body, callback=self.ocr_space, meta={'row': row, 'cookiejar': i}, dont_filter=True)
File "....\scrapy\http\request\form.py", line 31, in __init__
querystr = _urlencode(items, self.encoding)
File "....\scrapy\http\request\form.py", line 66, in _urlencode
for k, vs in seq
File "....\scrapy\http\request\form.py", line 65, in <listcomp>
values = [(to_bytes(k, enc), to_bytes(v, enc))
ValueError: not enough values to unpack (expected 2, got 1)