3
import requests
# Issue a single plain HTTP GET; the response body is returned as sent by the
# server (any JavaScript contained in it is not executed by requests).
r = requests.get('https://my.cigna.com')
# Print the response body decoded as text.
print(r.text)

When I run the above code on my local system, it gives me the response below (check the last line):

<!DOCTYPE html>\r\n<html><head>\r\n<meta http-equiv="Pragma" content="no-cache"/>\r\n<meta http-equiv="Expires" content="-1"/>\r\n<meta http-equiv="C
acheControl" content="no-cache"/>\r\n<meta http-equiv="Content-Type" content="text/html; charset=utf-8"/>\r\n<link rel="shortcut icon" href="data:;bas
e64,iVBORw0KGgo="/>\r\n\r\n<script>\r\n\r\n(function(){\r\nwindow["bobcmn"] = "11111110101010200000005200000005200000000251c47c46200000096300000000300
000000300000006/TSPD/300000008TSPD_101300000005https3000000b00832d38fa5ab2000bef1c6292d5d65850e93300734653e4ce2c59efe6632d24f66fbd7d1f0f1f7e1082c3139f
f0a2800873fd1b935f9eeef417fa5c01bffa575ead6603ec35dcc40233cad0a332514761bee131cffbe1575200000000200000000";\r\n\r\nwindow.Cus=!!window.Cus;try{(functi
on(){(function(){var z={decrypt:function(z){try{return JSON.parse(function(z){z=z.split("l");var s="";for(var _=0;_<z.length;++_)s+=String.fromCharCod
e(z[_]);return s}(z))}catch(_){}}};return z={configuration:z.decrypt("123l34l97l99l116l105l118l101l34l58l34l110l111l34l44l34l100l101l98l117l103l103l10
5l110l103l34l58l34l110l111l34l44l34l109l111l100l117l108l101l49l34l58l34l101l110l97l98l108l101l100l34l44l34l109l111l100l117l108l101l50l34l58l34l101l110
l97l98l108l101l100l34l44l34l109l111l100l117l108l101l51l34l58l34l101l110l97l98l108l101l100l34l44l34l109l111l100l117l108l101l52l34l58l34l101l110l97l98l1
08l101l100l34l125")}})();\nvar Sz=44;try{var Iz,Jz,Lz=Z(118)?1:0,zZ=Z(358)?1:0,ZZ=Z(706)?0:1,sZ=Z(440)?1:0,SZ=Z(844)?0:1;for(var lZ=(Z(87),0);lZ<Jz;++
lZ)Lz+=(Z(870),2),zZ+=Z(279)?2:1,ZZ+=(Z(127),2),sZ+=Z(732)?1:2,SZ+=Z(198)?3:2;Iz=Lz+zZ+ZZ+sZ+SZ;window.zz===Iz&&(window.zz=++Iz)}catch(zs){window.zz=I
z}var Ss=!0;function S(z,s){z+=s;return z.toString(36)}\nfunction _s(z){var s=93;!z||document[I(s,211,198,208,198,191,198,201,198,209,214,176,209,190,
209,194)]&&document[J(s,211,198,208,198,191,198,201,198,209,214,176,209,190,209,194)]!==S(68616527573,s)||(Ss=!1);return Ss}function Is(){}_s(window[I
s[S(1086810,Sz)]]===Is);_s(typeof ie9rgb4!==S(1242178186155,Sz));_s(RegExp("\\x3c")[S(1372161,Sz)](function(){return"\\x3c"})&!RegExp(S(42845,Sz))[S(1
372161,Sz)](function(){return"\'x3\'+\'d\';"}));\nvar js=window[J(Sz,141,160,160,141,143,148,113,162,145,154,160)]||RegExp(I(Sz,153,155,142,149,168,14
1,154,144,158,155,149,144),J(Sz,149))[S(1372161,Sz)](window["\\x6e\\x61vi\\x67a\\x74\\x6f\\x72"]["\\x75\\x73e\\x72A\\x67\\x65\\x6et"]),Js=+new Date+(Z
(476)?6E5:537525),zS,ZS,_S,IS=window[J(Sz,159,145,160,128,149,153,145,155,161,160)],jS=js?Z(845)?23770:3E4:Z(731)?6069:6E3;\ndocument[I(Sz,141,144,144
,113,162,145,154,160,120,149,159,160,145,154,145,158)]&&document[I(Sz,141,144,144,113,162,145,154,160,120,149,159,160,145,154,145,158)](I(Sz,162,149,1
59,149,142,149,152,149,160,165,143,148,141,154,147,145),function(z){var s=15;document[I(s,133,120,130,120,113,120,123,120,131,136,98,131,112,131,116)]
&&(document[I(s,133,120,130,120,113,120,123,120,131,136,98,131,112,131,116)]===S(1058781968,s)&&z[J(s,120,130,99,129,132,130,131,116,115)]?_S=!0:docum
ent[J(s,133,120,130,120,113,\n120,123,120,131,136,98,131,112,131,116)]===S(68616527651,s)&&(zS=+new Date,_S=!1,JS()))});function JS(){if(!document[J(6
7,180,184,168,181,188,150,168,175,168,166,183,178,181)])return!0;var z=+new Date;if(z>Js&&(Z(175)?6E5:391380)>z-zS)return _s(!1);var s=_s(ZS&&!_S&&zS+
jS<z);zS=z;ZS||(ZS=!0,IS(function(){ZS=!1},Z(22)?1:0));return s}JS();var LS=[Z(100)?17795081:12388952,Z(902)?2147483647:27611931586,Z(659)?2147483647:
1558153217];\nfunction I(z){var s=arguments.length,_=[];for(var L=1;L<s;++L)_.push(arguments[L]-z);return String.fromCharCode.apply(String,_)}function
 oS(z){var s=73;z=typeof z===S(1743045603,s)?z:z[J(s,189,184,156,189,187,178,183,176)](Z(681)?21:36);var _=window[z];if(!_[I(s,189,184,156,189,187,178
,183,176)])return;var L=""+_;window[z]=function(z,s){ZS=!1;return _(z,s)};window[z][J(s,189,184,156,189,187,178,183,176)]=function(){return L}}for(var
 z_=(Z(30),0);z_<LS[S(1294399161,Sz)];++z_)oS(LS[z_]);\n_s(!1!==window[J(Sz,111,161,159)]);window.OZ={ss:"080c2c1eda01800069124f1b5a83fc96fafe0a0dbc83
e2e583cf1f4bc9a341edae4a30a47d7c4b3813be508797f9bc851524c4d9baf10876ef1f1078d2e9eb2e566dd9b7b6a562958391630c37574b49bfc5623b576c20cd27c302ee516a176224
f2fbb00a43bff05b8edd78ac57568a9b648b2973acfc228913dee8be06ecf802d0499c07d98d5d"};function J(z){var s=arguments.length,_=[],L=1;while(L<s)_[L-1]=argume
nts[L++]-z;return String.fromCharCode.apply(String,_)}function s_(z){var s=+new Date,_;!document[I(99,212,216,200,213,220,182,200,207,200,198,215,210,
213,164,207,207)]||s>Js&&(Z(129)?6E5:494155)>s-zS?_=_s(!1):(_=_s(ZS&&!_S&&zS+jS<s),zS=s,ZS||(ZS=!0,IS(function(){ZS=!1},Z(173)?1:0)));return!(argument
s[z]^_)}function Z(z){return 481>z}\n(function i_(s){s&&"number"!==typeof s||("number"!==typeof s&&(s=1E3),s=Math.max(s,1),setInterval(function(){i_(s
-10)},s))})(!0);})();}catch(x){\n}finally{ie9rgb4=void(0);};function ie9rgb4(a,b){return a>>b>>0};\n\r\n})();\r\n\r\n</script>\r\n\r\n<script type="te
xt/javascript" src="/TSPD/0832d38fa5ab2000c3e644a33889f1e28c37ed3e7e11517eab4ec9c87769bb4e4892f69070fe1752?type=10"></script>\r\n<noscript>Please enab
le JavaScript to view the page content.<br/>Your support ID is: 11583418831671342532.</noscript>\r\n</head><body>\r\n</body></html>

But when I run it on my Heroku or AMI server, it works fine and returns:

<!DOCTYPE HTML PUBLIC "-//IETF//DTD HTML//EN">
<HTML>
<HEAD>
<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">
<TITLE></TITLE>
<script>

var bizrouter_uri = "/rte/public/validatetoken";

function getQueryStringValue (key) {
  return unescape(window.location.search.replace(new RegExp("^(?:.*[&\\?]" + escape(key).replace(/[\.\+\*]/g, "\\$&") + "(?:\\=([^&]*))?)?.*$",
 "i"), "$1"));
}

function setCookie(cname, cvalue, exhours) {
    var d = new Date();
    d.setTime(d.getTime() + (exhours*60*60*1000));
    var expires = "expires="+d.toGMTString();
    var cook = cname + "=" + cvalue + ";" + expires + ";path=/";
    document.cookie = cook;
    //document.cookie = cname + "=" + cvalue + "; " + expires + "; domain=" + document.domain + "; path=/";
}

function postForm() {
  var myToken = getQueryStringValue("token");

  //create the Token text cookie
  if (myToken) {
    setCookie("HW_TOKEN_TEXT", myToken, 1);
    document.webseal.action = bizrouter_uri;
    document.webseal.token.value = myToken;
  } else {
    setCookie("HW_TOKEN_TEXT", "", 0);
  }
  setCookie("HW_TOKEN_DNS", window.location.protocol  + "//" + document.domain, 1);
    document.webseal.submit();
}

</script>
</HEAD>
<body onLoad="postForm();">             
                <form name="webseal" method="post" action="/web/public/guest" >
                     <input type="hidden" name="token" value="default" />
                </form>
</HTML>

I am not able to figure out how to make it work on my local system. Any help would be appreciated.

NOTE: This question is not about "how to scrape JS content / how to scrape a website with JS enabled". It is about why the same code works on AMI and Heroku but does not work on my local system.

SHIVAM JINDAL
  • 2,844
  • 1
  • 17
  • 34
  • Did you try to use selenium instead? – Maaz Mar 15 '19 at 08:40
  • I can use selenium, but scraping from selenium is very slow. And the main point is why same code behaves diff on two machines. – SHIVAM JINDAL Mar 15 '19 at 08:43
  • Possible duplicate of [Using python Requests with javascript pages](https://stackoverflow.com/questions/26393231/using-python-requests-with-javascript-pages) –  Mar 15 '19 at 08:48
  • @reportgunner This is diff because "This is not regarding to "how to scrape JS content/how to scrape website with JS enabled". This is for if it is working on AMI and Heroku then why not working on local system" – SHIVAM JINDAL Mar 15 '19 at 09:01
  • @SHIVAMJINDAL scroll down to answer from **marvb**. It says `Good news: there is now a requests module that supports javascript` –  Mar 15 '19 at 09:04
  • getting this on that library ```>>> from requests_html import HTMLSession Traceback (most recent call last): File "", line 1, in from requests_html import HTMLSession ImportError: cannot import name HTMLSession >>> ``` The original question is why same code not working. Use of other library is an alternative not a solution – SHIVAM JINDAL Mar 15 '19 at 09:18
  • Related: [Web-scraping JavaScript page with Python](https://stackoverflow.com/questions/8049520/web-scraping-javascript-page-with-python) – Stevoisiak Jan 14 '21 at 03:23
  • Could you run in pythonanywhere.com and share the console? – Smart Manoj Jul 09 '21 at 13:37
  • @SmartManoj How does pythonanywhere shell and aws/heroku differ? I shared response of aws and local machine in question – SHIVAM JINDAL Jul 10 '21 at 18:43

2 Answers

2

First of all, the response payload indicates that there is an F5 load balancer between you and the web server 'https://my.cigna.com', because the following is a known F5 signature:

Please enable JavaScript to view the page content.<br/>Your support ID is: 
11583418831671342532.

Secondly, there is a way to bypass it by using a JavaScript-capable HTTP client, such as Selenium + Chrome, as suggested by @Maaz:

import time

from selenium import webdriver
import selenium.webdriver.chrome.service as service

# Create a ChromeDriver service from the driver executable path and start it.
# Note: this rebinds the name `service` from the imported module to the
# Service instance.
service = service.Service('/path/to/chromedriver')
service.start()
# Capabilities handed to the remote session; this entry names a custom Chrome
# binary path (placeholder value).
capabilities = {'chrome.binary': '/path/to/custom/chrome'}
# Connect to the service started above via its URL.
# NOTE(review): the positional capabilities argument matches the Selenium API
# in use when this was written; confirm against the installed Selenium version.
driver = webdriver.Remote(service.service_url, capabilities)
# NOTE(review): this loads a Google page as a placeholder; for the question
# being answered the URL would presumably be 'https://my.cigna.com'.
driver.get('http://www.google.com/xhtml');
time.sleep(5) # Let the user actually see something!
# Close the browser session.
driver.quit()

For implementation explanation, you can refer to this page: http://chromedriver.chromium.org/getting-started

sam
  • 122
  • 7
0

Some pages (possibly Ajax?) take several steps to load. This

...<noscript>Please enable JavaScript to view the page content.<br/>Your support ID is: 11583418831671342532.</noscript>...

is the initial page content, which is then transformed into the actual HTML. For my scraping endeavor, I used time.sleep(5) to let the page load before scraping. So the order would be:

# Assumes `driver` is an already-created Selenium WebDriver and `time` has been
# imported; neither is defined in this snippet.
# Navigate the browser to the page.
driver.get("https://google.com")
# Fixed five-second pause intended to let the page's scripts finish before the
# source is read.
time.sleep(5)
# Print the page source as the browser holds it after the pause.
print(driver.page_source)
shtyler
  • 71
  • 1
  • 7