1

I am scraping this website: https://www.woolworths.com.au/shop/browse/pet/dog-puppy. It is really more of an API data extraction, because all the data comes from a POST request. The script works, but I have to update the cookies manually every hour, which is not sustainable, so I am looking for a workaround that either avoids the cookie expiration or refreshes the cookies automatically. Here is the code:

import httpx
import pandas as pd
import math
import re
from datetime import datetime
from datetime import date

# Wall-clock snapshot taken once, when the module is imported; `now` is the
# value used for the Scraped_Date / Scraped_Time columns of every row.
now, today = datetime.now(), date.today()

# Matches one markup tag: "<", at least one non-">" character, then ">".
TAG_RE = re.compile(r"<[^>]+>")


def remove_tags(text):
    """Return *text* with every markup tag replaced by a single space."""
    return re.sub(TAG_RE, " ", text)


class WoolsWorthScraper:

    all_info = []

    cookies = {
        "_abck": "1AF9FA9968986E01D95DE635CE5CA49A~0~YAAQxKwwF4/SWjCGAQAAKl1xbwk/ATaJRzCaF4YKQ1lcfE4ZbDlWge8dxv5TNcUiQxDjsod+ZRyl0Z22ciftnXmNrmKNDj6gG7GkdAMvW7tJNTdSA3mpw3BJad78c8gWEi7xF7gmDdPmvDZcpBvT68TE8xi5YS+Y/o7+nnwbwmxRN6sHuTFQ3Mxr08gmypK0p7UXYKX5w8wigTHaIkUKl6GBTc3eVRBz87wXz6VSvhMgu3lsAyqX+hbQwmSPmlbnKeZHYRpnmcO7mK/apgy0lLtHJrISokAIBIPMT2Ocq/yuZG6zjGrzAMzoH8D2g4JhY6TUXLz94DMH/nMDX+4JnxAL+iqxWp1T5oAb/Z4YeH6l7x9ZARUzQfFGkHR3GC+XNzLWTaYYc2ernQF1WH6jyVo+HKkBcShBkUJ4VitieA==~-1~-1~-1",
        "AMCV_4353388057AC8D357F000101%40AdobeOrg": "870038026%7CMCIDTS%7C19408%7CMCMID%7C39487458205068416855238848785487881282%7CMCOPTOUT-1676913803s%7CNONE%7CvVersion%7C5.0.0",
        "ai_user": "fL4KLTsKGOkSYGQVYJwdIm|2023-02-17T10:51:04.008Z",
        "utag_main": "v_id:01865effc7310001995bec5e16700504600370090086e",
        "AKA_A2": "A",
        "akaalb_woolworths.com.au": "~op=www_woolworths_com_au_ZoneB:PROD-ZoneB|www_woolworths_com_au_BFF_MEL_Launch:WOW-BFF-MEL|~rv=57~m=PROD-ZoneB:0|WOW-BFF-MEL:0|~os=43eb3391333cc20efbd7f812851447e6~id=590a8239185204a5780d732b4c9292bd",
        "ak_bmsc": "CD2C640DE68D42B2754BD2302BC2A37C~000000000000000000000000000000~YAAQxKwwFxGlWjCGAQAAhixsbxJUZ8zXdaHVjTY1WO1sXuLqJXIDKRZL9aAbj0FinAr1ldLs7cMxdizMJT1xbbY5bO3780nEgbXnLd15tdyhxY+RSbZK9AyRmB6wNQO6MAykh9/POXagsbq1qC6ssFfWTYpB55o8FWbh5ksJoXWiYoHrgXpVDI1P6s4Sg6vvt5N123DjwHWKp5aaP9qsRApj8F5b+YMXM5LxWxhczsj4s2IttET1qFP7EZ6qG9N/5HPL6BfrSwY8XChmWWjgUw8tZO6fSp5tDn6gAi16kikTaZfIFPmHAt9MZPBae5i/kOrosUt5DOe7EQJko31jHinK55rPmC59QBF7JjO9kp4DkpQ15sMvWW8Lwvxj7KU7dnMAgmob3WoK0PZx3I4Fa8g1AAiBMB0IKbOGjU9qmDpW/cbwwdCE0NTTsIIm/CECEK0qbutJra1ufQDH95brcwf5phlpqOT+ioED+JjMPx2MAm8X/xJKnhvR2B5eY1REZUuTokKbBYKKDY8MuSABQ+c1iDu+fg==",
        "bm_sz": "4EFADCBD9227B61CBF2058518F420B83~YAAQxKwwFzSkWjCGAQAAZx9sbxL7XiBGlPH7LjomD0zmd16wBZq+n/3CeaS3Mfp3Y8afqdHE3/DXVElgVcJY2BgO5O7wRh09eQxdGkOWsb3W+H7vTLUsBlLp7tfJos+LD7CJEzFdjznn6Me849d8nOHeo1oXWh++oS1MQvbizyhVJljH8Dk+QfXDGmn1TT8bCiI6eoQsXD/NScXqTMoHgvzHTZVYgXNGiBku8YNdqisNqFEU7OKwr43tH41OZPv8qYAoKaN4rbiChlbM+ADUhxgW7Y+DO0jqC0vxlZqtc0iKKVmdhsNkbWTi~4538673~3753012",
        "dtCookie": "v_4_srv_-2D23_sn_4S74KMN57JFMT0RRTPLB6J1VIJGF2ARD",
        "rxVisitor": "1676906602556SGGP1KLPFDLV7SQNQEJDFR2CERF411V0",
        "dtPC": "-23$506602547_710h1vCHCJHUMUHKLVGLEONNHBUCFQJHBAIIGD-0e0",
        "rxvt": "1676908402560|1676906602560",
        "INGRESSCOOKIE": "1676906603.861.44.354018|37206e05370eb151ee9f1b6a1c80a538",
        "at_check": "true",
        "mbox": "session#503c621d6b474fe78bd8b03aefb53740#1676908464",
        "w-rctx": "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJuYmYiOjE2NzY5MDY2MDIsImV4cCI6MTY3NjkxMDIwMiwiaWF0IjoxNjc2OTA2NjAyLCJpc3MiOiJXb29sd29ydGhzIiwiYXVkIjoid3d3Lndvb2x3b3J0aHMuY29tLmF1Iiwic2lkIjoiMCIsInVpZCI6IjdiMGI2NzBkLWNlNzQtNDRjOS1hNDZjLWNlNjA1NTdlOTgyYyIsIm1haWQiOiIwIiwiYXV0IjoiU2hvcHBlciIsImF1YiI6IjAiLCJhdWJhIjoiMCIsIm1mYSI6IjEifQ.PqCTLDVRHjNI6tIitPL2cMM49KKiGpTtndpKQnWdiQ9IBT-yt3R7TXRvy_mYD7Pwl1SeiIc_opXlK8Wz5X7Obiz6ZmyF4qLgCwZTrDmss8RXrEADSMdOSTrAfUh4fGvc71YOFJpXlxQDDCZJ0F69wK7ihd7gEBTC8gK3PoEJ8ZJukZ-AC27_23Y6ZsTgFqcMrObcJzxEmVOoLiRnJsgTnWe5Bn-bMF_IZ7k5cXlBZavB-nsVcu_WokOsmu3USnqiO6YhbtJSe6Xt7g7WqbY3o6-1AhdEkFwyTG_lOz1Ffu-NzIOozRp_Dmf0yXjgofRVgeMYC9bVipCUCH4MYq5G9A",
        "wow-auth-token": "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJuYmYiOjE2NzY5MDY2MDIsImV4cCI6MTY3NjkxMDIwMiwiaWF0IjoxNjc2OTA2NjAyLCJpc3MiOiJXb29sd29ydGhzIiwiYXVkIjoid3d3Lndvb2x3b3J0aHMuY29tLmF1Iiwic2lkIjoiMCIsInVpZCI6IjdiMGI2NzBkLWNlNzQtNDRjOS1hNDZjLWNlNjA1NTdlOTgyYyIsIm1haWQiOiIwIiwiYXV0IjoiU2hvcHBlciIsImF1YiI6IjAiLCJhdWJhIjoiMCIsIm1mYSI6IjEifQ.PqCTLDVRHjNI6tIitPL2cMM49KKiGpTtndpKQnWdiQ9IBT-yt3R7TXRvy_mYD7Pwl1SeiIc_opXlK8Wz5X7Obiz6ZmyF4qLgCwZTrDmss8RXrEADSMdOSTrAfUh4fGvc71YOFJpXlxQDDCZJ0F69wK7ihd7gEBTC8gK3PoEJ8ZJukZ-AC27_23Y6ZsTgFqcMrObcJzxEmVOoLiRnJsgTnWe5Bn-bMF_IZ7k5cXlBZavB-nsVcu_WokOsmu3USnqiO6YhbtJSe6Xt7g7WqbY3o6-1AhdEkFwyTG_lOz1Ffu-NzIOozRp_Dmf0yXjgofRVgeMYC9bVipCUCH4MYq5G9A",
        "prodwow-auth-token": "eyJhbGciOiJSUzI1NiIsInR5cCI6IkpXVCJ9.eyJuYmYiOjE2NzY5MDY2MDIsImV4cCI6MTY3NjkxMDIwMiwiaWF0IjoxNjc2OTA2NjAyLCJpc3MiOiJXb29sd29ydGhzIiwiYXVkIjoid3d3Lndvb2x3b3J0aHMuY29tLmF1Iiwic2lkIjoiMCIsInVpZCI6IjdiMGI2NzBkLWNlNzQtNDRjOS1hNDZjLWNlNjA1NTdlOTgyYyIsIm1haWQiOiIwIiwiYXV0IjoiU2hvcHBlciIsImF1YiI6IjAiLCJhdWJhIjoiMCIsIm1mYSI6IjEifQ.PqCTLDVRHjNI6tIitPL2cMM49KKiGpTtndpKQnWdiQ9IBT-yt3R7TXRvy_mYD7Pwl1SeiIc_opXlK8Wz5X7Obiz6ZmyF4qLgCwZTrDmss8RXrEADSMdOSTrAfUh4fGvc71YOFJpXlxQDDCZJ0F69wK7ihd7gEBTC8gK3PoEJ8ZJukZ-AC27_23Y6ZsTgFqcMrObcJzxEmVOoLiRnJsgTnWe5Bn-bMF_IZ7k5cXlBZavB-nsVcu_WokOsmu3USnqiO6YhbtJSe6Xt7g7WqbY3o6-1AhdEkFwyTG_lOz1Ffu-NzIOozRp_Dmf0yXjgofRVgeMYC9bVipCUCH4MYq5G9A",
        "bm_sv": "6D916601877CE397455B41021E7D90B0~YAAQxKwwF8fPWjCGAQAA4i9xbxJMcZZX3SRiJc8H/2OWxVd6CQKk49gEl0O0wT3mv4+D6A9wsdylfS0Y+a8L3oq1HcOqqjcvYU2Q69nMyy5p47DMz3Y4LOTax4rtpeUyPdIBnepo4hvgW7IpZflzwEtZ7wGZlcGTt07hQYIq7y3h89qLI2WJI/qlneu5K86vwFZoo/ifvURyN/omDAT7B18VdC/VY2LOPr5OMuZ9zjcwqd19kpuDWiOUbduqD3HG2qyoscaDLQ==~1",
        "bm_mi": "7BC42F9FC8843FEAE717820962D55720~YAAQxKwwF3WkWjCGAQAAHSJsbxJWLCaVDLVlZUXANvKNhiit4WdPAGQCneOh1lvEi3vhSkbYt8C8J2AvBiXarO3BLab8YBDQBlZMRXGCuBFOr1a4kFeC1MFzs2YE7u60q8MitSBSNsJ3yQDL73Jr4Mxd8K48NuB5nETVz/tJ0zeLL2aIKTnPXwBCe93mrtR0VS2gRm7hw7JxVHB1R6Pvkph9Kt1H2TkIhpsHvMiG3JpPxoUjtFb1XKWcoXy0hahc+oyqRzS4M690sU6r9U8my5d3vy7WXpJR6Kk/ypsTh8f/RZCwAraNSJpCjdmDvZm2VQgRdPeKGkAxQBjHOX2mqWWCrC9F~1",
        "AMCVS_4353388057AC8D357F000101%40AdobeOrg": "1",
        "fullstoryEnabled": "false",
        "ai_session": "8N6Low4Vw9N3j4n63nJUgM|1676906605359|1676906930898",
    }

    # Browser-like request headers sent with every POST.
    # No "Cookie" header is set here: the session cookies are supplied through
    # the separate `cookies` mapping.  (A stale, commented-out "Cookie" entry
    # used to live in this dict; it was wrapped over several lines, which left
    # uncommented text inside the literal and made the module unparseable.)
    headers = {
        "User-Agent": "Mozilla/5.0 (X11; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/109.0",
        "Accept": "application/json, text/plain, */*",
        "Accept-Language": "en-US,en;q=0.5",
        # 'Accept-Encoding': 'gzip, deflate, br',
        "Content-Type": "application/json",
        "Request-Id": "|ff65d6a2b5ef40deba161436fc928041.6fdc6ae6448243a2",
        "Request-Context": "appId=cid-v1:4601595d-64c0-46e0-be60-45622438acb3",
        "traceparent": "00-ff65d6a2b5ef40deba161436fc928041-6fdc6ae6448243a2-01",
        "Origin": "https://www.woolworths.com.au",
        "Connection": "keep-alive",
        "Referer": "https://www.woolworths.com.au/shop/browse/pet/dog-puppy?pageNumber=2",
        "Sec-Fetch-Dest": "empty",
        "Sec-Fetch-Mode": "cors",
        "Sec-Fetch-Site": "same-origin",
        "Sec-GPC": "1",
        # Requests doesn't support trailers
        # 'TE': 'trailers',
    }

    # POST body for the "Dog & Puppy" category listing.
    dog_json_data = dict(
        categoryId="1_EF205FA",
        pageNumber=1,
        pageSize=36,
        sortType="TraderRelevance",
        url="/shop/browse/pet/dog-puppy?pageNumber=1",
        location="/shop/browse/pet/dog-puppy?pageNumber=1",
        formatObject='{"name":"Dog & Puppy"}',
        isSpecial=False,
        isBundle=False,
        isMobile=True,
        filters=[],
        token="",
        enableGp=False,
        isHideUnavailableProducts=False,
    )

    # POST body for the "Cat & Kitten" category listing (captured on page 2).
    cat_json_data = dict(
        categoryId="1_1969229",
        pageNumber=2,
        pageSize=36,
        sortType="TraderRelevance",
        url="/shop/browse/pet/cat-kitten?pageNumber=2",
        location="/shop/browse/pet/cat-kitten?pageNumber=2",
        formatObject='{"name":"Cat & Kitten"}',
        isSpecial=False,
        isBundle=False,
        isMobile=True,
        filters=[],
        token="",
        enableGp=False,
        isHideUnavailableProducts=False,
    )

    # Every category payload the scraper walks through, in request order.
    list_json_data = [dog_json_data, cat_json_data]

    # Endpoint that receives the category payloads.
    base_url = "https://www.woolworths.com.au/apis/ui/browse/category"

    def return_json_data(self):
        for json_data in self.list_json_data:
            return json_data

    def fetch(self, url):
        """POST the first category payload to *url* and return the response.

        Args:
            url: Endpoint that receives the request.  (Previously this value
                was only printed and the request always went to
                ``self.base_url``; it is now actually used.)

        Returns:
            The ``httpx.Response`` for the first entry of ``list_json_data``,
            or ``None`` when ``list_json_data`` is empty.
        """
        print(f"HTTP POST request to URL: {url}", end="\n")

        # Only the first payload is ever sent; the original loop returned on
        # its first iteration, so this states the same thing explicitly.
        payload = next(iter(self.list_json_data), None)
        if payload is None:
            return None

        # Cookies are configured on the client because httpx deprecates the
        # per-request ``cookies=`` argument on Client methods.
        with httpx.Client(
            headers=self.headers,
            cookies=self.cookies,
            timeout=40,
        ) as client:
            resp = client.post(url, json=payload)

        print(f" | Status Code: {resp.status_code}")
        return resp

    @staticmethod
    def _count_pages(json_blob):
        """Return how many pages a listing spans, or 0 when it has no bundles."""
        products = json_blob["Bundles"]
        if not products:
            # Guard against the ZeroDivisionError an empty page used to cause.
            return 0
        return math.ceil(json_blob["TotalRecordCount"] / len(products))

    def pagination(self, response):
        """Request every result page of every category and hand it to ``parse``.

        Args:
            response: Initial response produced by ``fetch``, i.e. a listing
                for the first entry of ``list_json_data``.  Its
                ``TotalRecordCount`` and ``Bundles`` determine the page count
                of that first category.

        The page count of each further category is taken from that category's
        own first page instead of re-using the first category's count.

        Raises:
            KeyError: If a listing lacks ``Bundles`` or ``TotalRecordCount``.
        """
        # Evaluated up front so a non-listing reply fails before any request.
        first_category_pages = self._count_pages(response.json())

        # One client for the whole crawl; cookies live on the client because
        # httpx deprecates the per-request ``cookies=`` argument.
        with httpx.Client(
            headers=self.headers,
            cookies=self.cookies,
            timeout=40,
        ) as client:
            for index, json_data in enumerate(self.list_json_data):
                # None means "not known yet": learn it from this category's page 1.
                total_pages = first_category_pages if index == 0 else None
                page_no = 1
                while total_pages is None or page_no <= total_pages:
                    print(f"HTTP POST request page {page_no}", end="\n")
                    resp = client.post(
                        self.base_url,
                        # Copy the payload so the shared class-level dicts are
                        # not left pointing at the last page requested.
                        json={**json_data, "pageNumber": page_no},
                    )
                    if total_pages is None:
                        total_pages = self._count_pages(resp.json())
                    self.parse(resp)
                    page_no += 1

    @staticmethod
    def _unwrap_json_list(value):
        """Strip the surrounding ``[`` ``]`` and ``"`` from a JSON-list string.

        Returns ``None`` when *value* is not a string, so the field ends up as
        a missing value instead of aborting the scrape.
        """
        if not isinstance(value, str):
            return None
        return value.strip("][").strip('"')

    def parse(self, response):
        """Append one row per product bundle in *response* to ``all_info``.

        Args:
            response: Response whose JSON body has a ``Bundles`` list; the
                first entry of each bundle's ``Products`` list is recorded.

        Missing or ``None`` fields are stored as ``None`` (``to_csv`` renders
        them as "N/A") rather than raising, and bundles without products are
        skipped.

        NOTE: ``all_info`` is declared on the class, so rows are shared by
        every instance unless an instance assigns its own list.
        """
        # The module-level timestamp is formatted once instead of twice per row.
        # Formatting the time on its own also drops the leading space that
        # splitting "%m/%d/%Y, %H:%M:%S" on "," used to leave in Scraped_Time.
        scraped_date = now.strftime("%m/%d/%Y")
        scraped_time = now.strftime("%H:%M:%S")

        for prod in response.json()["Bundles"] or []:
            bundle_products = prod.get("Products") or []
            if not bundle_products:
                continue
            product = bundle_products[0]
            attrs = product.get("AdditionalAttributes") or {}

            raw_description = attrs.get("description")
            if isinstance(raw_description, str):
                description = remove_tags(
                    raw_description.replace("\r", "").replace("\n", "").strip()
                )
            else:
                # Same fallback the former bare ``except:`` produced.
                description = "N/A"

            # Key order defines the CSV column order.
            item = {
                "Scraped_Date": scraped_date,
                "Scraped_Time": scraped_time,
                "Stock_Code": product.get("Stockcode"),
                "Product_Name": product.get("Name"),
                "Product_Category": self._unwrap_json_list(
                    attrs.get("piescategorynamesjson")
                ),
                "Sub_Category": self._unwrap_json_list(
                    attrs.get("piessubcategorynamesjson")
                ),
                "Brand": product.get("Brand"),
                "Price/100g": product.get("CupPrice"),
                "Price": product.get("Price"),
                "Was_Price": product.get("WasPrice"),
                "Save": product.get("SavingsAmount"),
                "Size": product.get("PackageSize"),
                "Description": description,
                "Ingredients": attrs.get("ingredients"),
                "Availability": (
                    "InStock" if product.get("IsAvailable") else "Out of Stock"
                ),
                "Image": product.get("LargeImageFile"),
            }

            self.all_info.append(item)

    def to_csv(self):
        df = pd.DataFrame(self.all_info).fillna("N/A")

        df.to_csv(f"woolsworth.csv", index=False)

        print('Stored results to "woolsworth.csv"')

    def run(self):
        """Run the whole scrape: initial request, pagination, then CSV export."""
        # The initial response feeds pagination, which issues the page requests.
        self.pagination(self.fetch(self.base_url))
        self.to_csv()


if __name__ == "__main__":
    # Script entry point: build a scraper and run it end to end.
    WoolsWorthScraper().run()

Every hour I have to copy the request as cURL from the browser's developer tools and paste its cookies into the script to make it work. Is there a way to avoid this manual cookie copy-pasting?

X-somtheing
  • 219
  • 2
  • 10

1 Answers1

2

You could run a real browser via Playwright or Selenium to obtain fresh cookies, and then continue scraping with plain HTTP requests.

Here is an example of how to get the cookies of a given website using Playwright:

from playwright.sync_api import sync_playwright

def get_site_cookies(url: str) -> dict:
    """Load *url* in a visible Chromium window and return its cookies.

    The result maps each cookie's name to its value, as reported by the
    page's browser context after navigation.
    """
    with sync_playwright() as playwright:
        chromium = playwright.chromium.launch(headless=False)
        tab = chromium.new_page()
        tab.goto(url)
        jar = dict(
            (entry['name'], entry['value']) for entry in tab.context.cookies()
        )
        chromium.close()

    return jar

# Example: print the cookies a real browser receives for the category page.
url = "https://www.woolworths.com.au/shop/browse/pet/dog-puppy"
print(get_site_cookies(url=url))

This code will print something like this:

{'AKA_A2': 'A', 'bm_sz': 'D9250E4256DA8BEC219C6350D3972AFD~....
Alex Kosh
  • 2,206
  • 2
  • 19
  • 18
  • Thanks a lot! It worked like a charm. Thanks once again. – X-somtheing Feb 20 '23 at 17:30
  • How can I run the Playwright code in a Jupyter notebook? I tried to run the sync code, but it gives me this error: `Error: It looks like you are using Playwright Sync API inside the asyncio loop. Please use the Async API instead.` When I tried to run the async code, it gave me a timeout error. – X-somtheing Feb 20 '23 at 17:32
  • It's still the same; you just need to add `async` and `await` in some places and change the imports. Look for examples in the docs: https://playwright.dev/python/docs/library – Alex Kosh Feb 20 '23 at 17:34
  • I tried the `async` API, but in a Jupyter notebook it gives me a timeout error, whereas in a normal script the `async` API works fine. – X-somtheing Feb 20 '23 at 17:36
  • 1
    Check this answer https://stackoverflow.com/questions/71228742/how-to-use-the-playwright-library-in-a-jupyter-notebook-instead-of-using-a-regul/71702599#71702599 – Alex Kosh Feb 20 '23 at 17:39