0

My goal is to load the content of the URL 'http://www.fanatics.com/nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525' into a string variable in PHP. Something like this:

// Product page whose HTML should end up in a PHP string.
$url = 'http://www.fanatics.com/nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525';
// Download the page through PHP's HTTP stream wrapper; file_get_contents() returns false on failure.
$s = file_get_contents($url);

I have tried cURL with headers, PHP Snoopy with headers, cookies, etc. I always get a 403 Forbidden error. I am sure my IP is not blocked; I think I just need to set the headers correctly to make the server think my request comes from a browser, but I don't know how to do that. Does anyone have any idea? Please post only code that works :) Thank you very much!

Here is what I have tried:

require('Snoopy.class.php');

$snoopy = new Snoopy;

// Present the request as coming from desktop Firefox 69.
$snoopy->agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:69.0) Gecko/20100101 Firefox/69.0";

// Extra request headers mirroring what the browser sends.
// The Accept value must be the complete header text: an abbreviated copy
// containing a literal "…" is not a valid media-range list.
$snoopy->rawheaders['Accept'] = 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8';
// NOTE(review): advertising compressed encodings means the body may come back
// compressed; confirm Snoopy decodes it, otherwise drop this header.
$snoopy->rawheaders['Accept-Encoding']    = 'gzip, deflate, br';
$snoopy->rawheaders['Cache-Control']    = 'max-age=0';
$snoopy->rawheaders['Connection']   = 'keep-alive';
$snoopy->rawheaders['Host'] = 'www.fanatics.com';
$snoopy->rawheaders['Upgrade-Insecure-Requests']    = '1';
$snoopy->rawheaders['Pragma']   = 'no-cache';
// NOTE(review): 'Cache-control' is a different array key from 'Cache-Control'
// above (PHP keys are case-sensitive), so presumably both values are sent.
$snoopy->rawheaders['Cache-control']    = 'no-cache';

// Set the cookies to send with the request. The values look like they were
// copied from a real browser session (presumably short-lived; re-capture them
// if the server starts rejecting the request).
$snoopy->cookies["_4c_"] = 'dVRdT6NAFP0rhgdfdArzBTNNzKbWmrhZrdqu+2imMBS2LZBhEKvpf99LobVpXR7I3HPP/RrO5dOpE505fcwDjJmUAaWcXDoLvS6d/qcTFs37rXlVZun0ncTaouy7bl3XvVhlyqZh2QvzlZvFS3dutM7QTK1RocKFNqWrlMkzZPJoDhY68aMsXWg0VyuN/oKp1y3FzRFhHr+wiPpM8IDgiwJR7JGASBrQiw8kEZGSUyY54c6lE+aRhuaw7AW9xrYfjUU8OBZQvArtq10XDaXWs7MyWoAj0m9pqF/rNLLJNnZL79BEp/PEtkm2cGEaA051mkV5fRzWofswHzfcSZIWRZrN77VN8gjgBwAfKxMmqtTRRC31ndWrDh+vsvRFGSanurRHmDjFOP8G808xccgb5lWRZ7+heAf80mpe6bGZarXaNwa+Jt5E2tzdHBgvalnp1n5Ucz1tb7SxJjves7aVyWDkYVXafKXNYS6YFRrIbIvNTF6XW8JtanScv5/5EuAcdOf82V5nCSZ4tDG7PGVqm5KHwutQEOypY3w/fX69Hg2G44cD6e5JGJMeyYNepq07c8ty7wERuNj9OUG457ul78NKcAnKY0KwH4On6yt8HqfRFR/w2xEdDm8GciR86iF8jakYjUbeLZWBN+Tng6fRVSOEAjbIEXBY5iF8dTBg5zaXznu7eIT6FCpwD5RrYcuEz7zmAYZJo24DQYpMhjEXSMZaIkaiGImYYUSwBhVG1JO8KdHmFJJQLj2KJSR5S3c5Ik1gagk5JIsQ43GAZoz4SAbC87DCKva/cmDMcUC4CIKuLyx2bRXLLiP+IvtYQkFOd2S2H6J469jieGQmT0duhbH7Fv+P/e66ykbqbWPbH1rQ3IGPj2/Fx5tm1eOOSw5Sw/8GY3qSerP5Bw==';
$snoopy->cookies["_fbp"] = 'fb.1.1571147589917.1588467218';
$snoopy->cookies["_hjid"] = '7d7361a6-a8fa-425c-845f-cde65d28a8c2';
$snoopy->cookies["_s"] = 'www.fanatics.com';
$snoopy->cookies["ak_bmsc"] = 'FDDF6F8C06B9DDE6A81D37907A6D189702103C5F017800006DD8A55D466EFA47~pl4YTKpBC1Yx4j2ACSV7q19gRwWvM6YNG23Lhxux1aI5rRzPy+nAAgMou/OXcPiO6Xq8kt+o3X2UAlGiIOMOfefl0pGUayM+5jDZ9TwaZB6U1Il5N+4x+urio7qApuFzGAyOb/Z8MmUIKZm0bw/nZ+rjW4TjZMV6OqpKcN5+K+lQH4YDHCKgQ7i8Voy6e3DoOujrxRcZ+F3NA+jBvrDnwCS3FdcEWy01Xj0zjftoDe+1mQF706v/XiQ5BGkQE8flV4caWlcIHg5XEoiF0/R1TGO/oop7Jz856YGSzNB9Tu3QE=';
$snoopy->cookies["akacd_pr_fanatics_split"] = '3748603387~rv=82~id=08a386370acf068756a7fd9a0c8cba6c';
$snoopy->cookies["akacd_PR_Iris_Assets"] = '3748603387~rv=16~id=4a6fc3ff6fb09f15653ee0eab76dbc23';
$snoopy->cookies["AWSALB"] = 'QEURmrXOk+ZA9CUf9d9SWf3JIQMFslRH/HLkQO21g0DNaqfNvaLhwfnPlQ0CTAbSL89ssdPsTGEGY6jzT6tedXkpjqAEBDfFSOLM8lOTGua1mwBNpb2QFUYhkaG5ZT2WVYqgC5j+BZkdISBc+IDlXJj2R6Eo51kXyenIGu4AMU2yZC5mJRKHMrFrQh6h/w==';
$snoopy->cookies["bm_sv"] = '702A2103BD49D7ED3DA6DF170D621D18~s4xzqC1pj+xmvNpan7rZGH7ZUlRmsCfBbH7VLIjzNGYLA18kZQDXMITaNFeCDGUGRj+todjh/RB6EYgDRfP8QMk/8IfuFuDU5p9GZowbYswUKGduvOeK2DONsxdVTiUwtlpKzryRDTMo3dwRcEzC/SK+ZIlfD4MozpoBp1YbpYQ=';
$snoopy->cookies["civ"] = '1.1.0-rc-20191009.11859';
$snoopy->cookies["cqe"] = '["2131:A:0:1","2212:B:1:1","2224:B:1:1"]';
$snoopy->cookies["eci"] = '6ed2aa4671c6d5d5';
$snoopy->cookies["ist"] = '8545b19d-cba6-4737-b92c-d248680d8833';
$snoopy->cookies["platform1"] = 'iris';
$snoopy->cookies["pu"] = 'true';
$snoopy->cookies["RT"] = '"dm=www.fanatics.com&si=b1be4b93-820c-4e59-8176-fa6106ad7384&ss=1571147581600&sl=9&tt=103703&obo=0&sh=1571149929933=9:0:103703,1571149395897=8:0:95750,1571148930619=7:0:85524,1571148451899=6:0:77527,1571148008439=5:0:38362&bcn=//686eb51b.akstat.io/&ld=1571149929934"';
$snoopy->cookies["s_cc"] = 'true';
$snoopy->cookies["s_fid"] = '5A5FE3CCDA9E8630-1B138EEE0F3970C5';
$snoopy->cookies["s_fuid"] = '65121740551816822241951060216690340037';
$snoopy->cookies["s_loc"] = 'en-US';
$snoopy->cookies["s_sq"] = '[[B]]';
$snoopy->cookies["sa"] = 'sid=8545b19d-cba6-4737-b92c-d248680d8833';
$snoopy->cookies["sr_browser_id"] = 'c5f32661-0b0c-4d7d-9b48-f4ca42341809';
$snoopy->cookies["sr_pik_session_id"] = '4fa67621-c452-7019-9319-b0359eabd432';
$snoopy->cookies["st"] = '510005';
$snoopy->cookies["uc"] = 'USD';
$snoopy->cookies["va"] = '{"cc":0,"ct":0,"cpi":[],"nv":true,"af":null,"el":false}';
$snoopy->cookies["vid"] = '9476b980-ef58-11e9-8efe-2f6ebdc5d1cf';
$snoopy->cookies["vrc"] = 'cb979b0fcaf4794c';
$snoopy->cookies["xsrfp"] = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyZXFIb3N0Ijoid3d3LmZhbmF0aWNzLmNvbSIsInZpc2l0b3JJZCI6Ijk0NzZiOTgwLWVmNTgtMTFlOS04ZWZlLTJmNmViZGM1ZDFjZiIsImlhdCI6MTU3MTE1MDE0OSwiZXhwIjoxNTcyNzIwOTk5OTkwfQ.eVHvH7Ewg1bpjCW0Zjennh6rncR4VfemXfaQW3FkKw4';
// The token must end at the signature: a trailing double quote is not part of
// a JWT and would make the cookie value differ from what the browser sends.
$snoopy->cookies["xsrft"] = 'eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9.eyJyZXFIb3N0Ijoid3d3LmZhbmF0aWNzLmNvbSIsInZpc2l0b3JJZCI6Ijk0NzZiOTgwLWVmNTgtMTFlOS04ZWZlLTJmNmViZGM1ZDFjZiIsImlhdCI6MTU3MTE1MDE0OSwiZXhwIjoxNTcyNzIwOTk5OTkwfQ.-03GVO07cCWbRl0j8nFrL9Coh8pr0U8RnjmhSwGPqiQ';

// Fetch the page at $url as text and print the response code, the response
// headers, the fetched text and a dump of the Snoopy object; on failure print
// Snoopy's error message instead.
if($snoopy->fetchtext($url)){
    // other methods: fetch, fetchform, fetchlinks, submittext and submitlinks

    // response code:
    print "response code: ".$snoopy->response_code."<br/>\n";

    // print the headers:
    // (foreach instead of while/list/each: each() was deprecated in PHP 7.2
    // and removed in PHP 8.0, where the old loop is a fatal error)
    print "<b>Headers:</b><br/>";
    foreach($snoopy->headers as $key => $val){
        print $key.": ".$val."<br/>\n";
    }

    print "<br/>\n";

    // print the texts of the website:
    print "Text:<pre>".htmlspecialchars($snoopy->results)."</pre>\n";

    // dump the whole Snoopy object for debugging; close the <pre> so that
    // the markup stays well-formed
    echo("<pre>");
    var_dump($snoopy);
    echo("</pre>");

}
else {
    print "Snoopy: error while fetching document: ".$snoopy->error."\n";
}
Jerry
  • 3
  • 1
  • I get the same on that URL, so they're apparently blocking certain requests. They probably don't like people scraping their site. – aynber Oct 15 '19 at 16:01
  • 1
    when website is trying really hard to block your requests, you probably should not be scraping it. – Dimi Oct 15 '19 at 16:02
  • Yes, but PHP Snoopy can simulate a browser, I think you only need to set it right, but I don't know how :( – Jerry Oct 15 '19 at 16:09
  • `curl --compressed -A 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:69.0) Gecko/20100101 Firefox/69.0' -Lv http://www.fanatics.com/nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525` works. So it should be possible. They are served by Akamai, which has a lot of bot protections. – drew010 Oct 15 '19 at 18:05

2 Answers2

0

First, obtain all the headers that your real browser sends. You can do this by starting a netcat server like

nc -l 9999

then point your browser to your netcat server at http://127.0.0.1:9999/

now you will see all the headers that your real browser sent, it looks something like:

GET / HTTP/1.1
Host: 127.0.0.1:9999
User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0
Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8
Accept-Language: en-US,en;q=0.5
Accept-Encoding: gzip, deflate
Connection: keep-alive
Upgrade-Insecure-Requests: 1

Then copy all the headers, except a few special ones. First, do not copy the GET / HTTP/1.1 request line; let curl set it for you (it will). Do not copy the "Host" header either; let curl set it for you (it will). Do not copy the User-Agent header; instead set it with the CURLOPT_USERAGENT option (otherwise your real UA will be revealed if you have to follow a redirect or whatever, but if you use the proper CURLOPT_USERAGENT, your real UA won't be revealed). And do not copy the Accept-Encoding: gzip, deflate header; instead set CURLOPT_ENCODING to an empty string. This makes curl set the Accept-Encoding header for you automatically, with all the encodings that your libcurl was built to support (usually gzip and deflate, but curl can also be built with br support, for example). If you instead set the header manually to gzip, the server decides to use gzip, and your libcurl was compiled without gzip support (unlikely but possible), you would get unreadable, garbled binary data. You'll then probably be left with something like:

<?php

// Download the product page with curl, imitating the headers a desktop
// Firefox sends, and dump the resulting HTML.
$ch=curl_init();

curl_setopt_array($ch,array(
    CURLOPT_URL=>'http://www.fanatics.com/nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525',
    CURLOPT_USERAGENT=>'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0',
    // Empty string: curl sends an Accept-Encoding header listing every
    // encoding this libcurl build supports and decodes the response itself.
    CURLOPT_ENCODING => '',
    CURLOPT_HTTPHEADER=>array(
        'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language: en-US,en;q=0.5',
        'Connection: keep-alive',
        'Upgrade-Insecure-Requests: 1',
    ),
    // Return the body from curl_exec() instead of printing it directly.
    CURLOPT_RETURNTRANSFER=>true,
    // The http:// URL answers with a redirect to https://, so follow it.
    CURLOPT_FOLLOWLOCATION=>true,
));
$html=curl_exec($ch);
if($html===false){
    // curl_exec() returns false on transport-level failures (DNS, timeout,
    // TLS certificate verification, ...). Report the reason instead of
    // dumping a bare false, which is easy to mistake for an HTTP error.
    throw new RuntimeException('curl error '.curl_errno($ch).': '.curl_error($ch));
}
var_dump($html);

This does indeed work. Not all of these headers are required; to find out exactly which ones are, remove the headers one by one until the script stops working. The last header you removed was probably the one that was required.

hanshenrik
  • 19,904
  • 4
  • 43
  • 89
  • Not working for me. I have tried to use netcat, to copy all the headers and still error 403 :( – Jerry Oct 21 '19 at 11:57
  • @Jerry well that's weird. run this code: http://paste.debian.net/plainh/1af9162a what output do you get from that code? – hanshenrik Oct 21 '19 at 12:26
  • `boolean false string '* Hostname was found in DNS cache * Trying 23.4.250.74... * Connected to www.fanatics.com (23.4.250.74) port 80 (#9) > GET /nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525 HTTP/1.1 User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0 Host: www.fanatics.com Accept-Encoding: deflate, gzip Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8 Accept-Language: en-US,en;q=0.5 '... (length=2539)` – Jerry Oct 21 '19 at 13:06
  • @Jerry sigh, seems you have xdebug installed, add to the top of your code ``` – hanshenrik Oct 21 '19 at 14:00
0

@hanshenrik Too many characters for comment so the answer is here:

string '* Hostname was found in DNS cache
* Hostname in DNS cache was stale, zapped
*   Trying 23.4.250.74...
* Connected to www.fanatics.com (23.4.250.74) port 80 (#11)
> GET /nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525 HTTP/1.1

User-Agent: Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:60.0) Gecko/20100101 Firefox/60.0

Host: www.fanatics.com

Accept-Encoding: deflate, gzip

Accept: text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8

Accept-Language: en-US,en;q=0.5

Connection: keep-alive

Upgrade-Insecure-Requests: 1



< HTTP/1.1 301 Moved Permanently

* Server nginx/1.12.2 is not blacklisted
< Server: nginx/1.12.2

< Content-Type: text/html

< Location: https://www.fanatics.com/nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525

< merc-action: redirect

< merc-cache-hit: false

< merc-target-path: /nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525

< merc-source-path: /nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525

< merc-is-translated: True

< Content-Length: 0

< X-EdgeConnect-MidMile-RTT: 18

< X-EdgeConnect-Origin-MEX-Latency: 367

< Expires: Mon, 21 Oct 2019 14:03:58 GMT

< Cache-Control: max-age=0, no-cache

< Pragma: no-cache

< Date: Mon, 21 Oct 2019 14:03:58 GMT

< Connection: keep-alive

< Set-Cookie: platform1=iris; expires=Tue, 22-Oct-2019 14:03:58 GMT; path=/; domain=.fanatics.com

< Set-Cookie: ak_bmsc=EE891ED91788823FB031EECFAC57E0CF02103C0CFD2D0000CEBAAD5D41CFAF7F~pl3LyygpG6Y9HjJwCePrLH4bJHwOKQn5pnB9d63C4i8G10E6kPczUbxTRfzC6YFjC+qidBrxwEw2GIitPQzE2T6sHUjXZbtiGmxv5a4ewxxbHmxDoHEgzs0f2poLjQnnQZcyoS6gJLTFnaZzHCMItvISwSVHAyPDKe4wdNQo717j10kJ6yqh8BTl/Pi29SXhNMgDgM2934vZikjE7g3fldhEBhoPuhg43JMNkQKCls3tE=; expires=Mon, 21 Oct 2019 16:03:58 GMT; max-age=7200; path=/; domain=.fanatics.com; HttpOnly

< Set-Cookie: akacd_pr_fanatics_split=3749119437~rv=13~id=8a5aabe9608f95d1b251e4749c911d59; path=/;

< 

* Connection #11 to host www.fanatics.com left intact
* Issue another request to this URL: 'https://www.fanatics.com/nfl/green-bay-packers/aaron-rodgers-green-bay-packers-nike-game-jersey-green/o-2405+t-36485721+p-3102729373+z-9-2995349525'
* Found bundle for host www.fanatics.com: 0x212f92c8d0
* Hostname was NOT found in DNS cache
*   Trying 23.4.250.74...
* Connected to www.fanatics.com (23.4.250.74) port 443 (#12)
* SSL certificate problem: unable to get local issuer certificate
* Closing connection 12
'

Thank you for helping me

Jerry
  • 3
  • 1
  • that is *not* `403 Forbidden` - first you received a `"HTTP/1.1 301 Moved Permanently"` redirect, and followed the redirect, then curl failed to verify the TLS certificate they use - a dirty quickfix is to just set CURLOPT_SSL_VERIFYPEER to 0, and the "proper" fix is to ask your sysadmin to install a certificate chain ala https://curl.haxx.se/docs/caextract.html , or provide your own and set it via CURLOPT_CAINFO – hanshenrik Oct 21 '19 at 14:15
  • I had 403 forbidden on other computer. Now I have 403 forbidden everywhere. Even if I use a browser :-D They started to block IP addresses. Very bad for me :-D – Jerry Oct 22 '19 at 16:57