I am creating a web scraper to go over almost 400k records. It works like this: I have a CSV of part numbers that need to be searched on a site. The site has an exposed API, so I can skip the frontend and make direct requests after logging in. I created a function called GetPdcResults() that takes a list of parts and a start number; the start argument lets me restart the scraper at the point where it left off in the parts list if it stops for any reason. The main loop enumerates over each part in the list, builds a payload for that part, and requests the information. There is some error handling for network errors and for cookie errors, which only happen when my user's session has expired. It then calls the CleanPdcResults() function, which cleans the response returned from the site and saves the relevant information to a CSV for exporting.
To my understanding, recursion is when a function calls itself repeatedly; Python has a limit on how deep this can go, and it is more resource-intensive. Iteration is when you use a loop to repeat a set of actions.
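To make sure I have the distinction right, here is a toy comparison I put together (just an illustration, not code from my scraper):

# Recursion: each call adds a new frame to the call stack, and Python
# raises RecursionError once the depth passes sys.getrecursionlimit().
def countdown_recursive(n):
    if n == 0:
        return
    countdown_recursive(n - 1)

# Iteration: the loop reuses a single stack frame, so depth is not an issue.
def countdown_iterative(n):
    while n > 0:
        n -= 1

countdown_iterative(1_000_000)   # runs fine
countdown_recursive(1_000_000)   # RecursionError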
I think I want iteration in my app, not recursion, because I am currently getting an error I have never seen before:
RecursionError: maximum recursion depth exceeded while calling a Python object
I'm assuming this is because recursion is happening somewhere in my functions instead of iteration, but I can't seem to pinpoint it. The only place a function calls itself is when there is a cookie error and GetPdcResults() is called again, but I wouldn't expect that to happen so many times that the limit is reached.
Can someone help me find where the recursion is happening in my scraper and how I can convert it to iteration to stop this error? Any help is appreciated!
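For reference, this is the kind of iterative retry I think I need: a while loop that re-attempts the current part after a cookie error instead of calling GetPdcResults() again. It's only a sketch that relies on the url, headers, logger, and CleanPdcResults from my code below, and build_payload() and refresh_session() are placeholders I made up for the payload code and whatever re-login is needed, so I'm not sure it's the right approach:

import requests

def get_pdc_results_iterative(parts, start=0):
    for i, part in enumerate(parts[start:], start):
        payload = build_payload(part)  # placeholder for the payload code below
        while True:
            response = requests.request("POST", url, headers=headers, data=payload)
            if response.status_code == 401:
                refresh_session()  # placeholder: renew the expired cookie in headers
                continue           # retry the same part in the same stack frame
            break
        CleanPdcResults(response.json(), i, part, logger)

My current functions are below, followed by the full traceback.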
import requests
import pandas as pd
from tqdm import tqdm
from os.path import exists

def GetPdcResults(parts, start=0):
    logger = PartsLogger()
    logger.log_cookie(headers['cookie'])
    logger.log_num_parts(parts)
    for i, part in tqdm(enumerate(parts[start:], start), total=len(parts[start:])):
        # Stop at the first blank (NaN) entry in the parts list
        if pd.isna(part):
            break
        logger.log_cur_part(i, part)
        # Build the JSON request body for this part
        payload = (
            '{"catalogId":"2","locatorService":"Panda"'
            f',"partNumber":"{part}", "cacheKey":"{part}_2_en-US_7497fea0-4fb6-4b28-b0e8-62e3e4204cc5"}}'
        )
        try:
            response = requests.request("POST", url, headers=headers, data=payload)
        except requests.exceptions.RequestException as e:
            print('\n[-] Request Error')
            print(e)
            logger.log_error(str(e), part=part)
            continue  # response is undefined here, so move on to the next part
        if response.status_code == 401:
            # Session expired: restart from the current part
            # (this is the only place a function calls itself)
            logger.log_error('[-] Cookie Error', part=part)
            print('\n[-] Cookie Error')
            GetPdcResults(parts, start=i)
            break
        CleanPdcResults(response.json(), i, part, logger)
def CleanPdcResults(resp, index, part, logger):
    try:
        pdc_results = resp['d']['PdcResults']
        # Grab the first location whose name contains '92'
        pdc92 = {}
        for pdc in pdc_results:
            if '92' in pdc['LocationName']:
                pdc92.update(pdc)
                break
        if pdc92:
            foundPart = [{'': index, 'Part': part, 'Qty': pdc92['Quantity']}]
            df = pd.DataFrame(foundPart)
            if not exists('Parts.csv'):
                # First write creates the file with a header row
                df.to_csv('Parts.csv', index=False)
            else:
                # Subsequent writes append without repeating the header
                df.to_csv('Parts.csv', mode='a', index=False, header=False)
        else:
            print('\n[-] Part Not Found')
    except Exception as e:
        logger.log_error(str(e), part=part, response=resp)
And here is the full traceback:

Traceback (most recent call last):
File "c:\Users\carte\OneDrive\Documents\GrayTeck\Chad S\CleanPartsCSV.py", line 30, in run
GetPdcResults(partsList, start=startIndex)
File "c:\Users\carte\OneDrive\Documents\GrayTeck\Chad S\GetPDCRes.py", line 57, in GetPdcResults
GetPdcResults(parts, start=i)
File "c:\Users\carte\OneDrive\Documents\GrayTeck\Chad S\GetPDCRes.py", line 57, in GetPdcResults
GetPdcResults(parts, start=i)
File "c:\Users\carte\OneDrive\Documents\GrayTeck\Chad S\GetPDCRes.py", line 57, in GetPdcResults
GetPdcResults(parts, start=i)
[Previous line repeated 973 more times]
File "c:\Users\carte\OneDrive\Documents\GrayTeck\Chad S\GetPDCRes.py", line 48, in GetPdcResults
response = requests.request("POST", url, headers=headers, data=payload)
File "C:\Python310\lib\site-packages\requests\api.py", line 61, in request
return session.request(method=method, url=url, **kwargs)
File "C:\Python310\lib\site-packages\requests\sessions.py", line 529, in request
resp = self.send(prep, **send_kwargs)
File "C:\Python310\lib\site-packages\requests\sessions.py", line 645, in send
r = adapter.send(request, **kwargs)
File "C:\Python310\lib\site-packages\requests\adapters.py", line 440, in send
resp = conn.urlopen(
File "C:\Python310\lib\site-packages\urllib3\connectionpool.py", line 703, in urlopen
httplib_response = self._make_request(
File "C:\Python310\lib\site-packages\urllib3\connectionpool.py", line 449, in _make_request
six.raise_from(e, None)
File "<string>", line 3, in raise_from
File "C:\Python310\lib\site-packages\urllib3\connectionpool.py", line 444, in _make_request
httplib_response = conn.getresponse()
File "C:\Python310\lib\http\client.py", line 1374, in getresponse
response.begin()
File "C:\Python310\lib\http\client.py", line 337, in begin
self.headers = self.msg = parse_headers(self.fp)
File "C:\Python310\lib\http\client.py", line 236, in parse_headers
return email.parser.Parser(_class=_class).parsestr(hstring)
File "C:\Python310\lib\email\parser.py", line 67, in parsestr
return self.parse(StringIO(text), headersonly=headersonly)
File "C:\Python310\lib\email\parser.py", line 56, in parse
feedparser.feed(data)
File "C:\Python310\lib\email\feedparser.py", line 176, in feed
self._call_parse()
File "C:\Python310\lib\email\feedparser.py", line 180, in _call_parse
self._parse()
File "C:\Python310\lib\email\feedparser.py", line 295, in _parsegen
if self._cur.get_content_maintype() == 'message':
File "C:\Python310\lib\email\message.py", line 594, in get_content_maintype
ctype = self.get_content_type()
File "C:\Python310\lib\email\message.py", line 578, in get_content_type
value = self.get('content-type', missing)
File "C:\Python310\lib\email\message.py", line 471, in get
return self.policy.header_fetch_parse(k, v)
File "C:\Python310\lib\email\_policybase.py", line 316, in header_fetch_parse
return self._sanitize_header(name, value)
File "C:\Python310\lib\email\_policybase.py", line 287, in _sanitize_header
if _has_surrogates(value):
File "C:\Python310\lib\email\utils.py", line 57, in _has_surrogates
s.encode()
RecursionError: maximum recursion depth exceeded while calling a Python object