0

I am trying to parse the pages from the website: "https://rapid.sap.com/bp/"

What I have tried so far, and what I observed:

  1. I tried using the requests library with BeautifulSoup to get the page source, but the body tag of the response contained only a single line: "Application is loading ......". The following is the response I got.

<!DOCTYPE html>
<html>
 <head>
  <meta http-equiv="Content-Type" content="text/html; charset=UTF-8"/>
  <meta http-equiv="X-UA-Compatible" content="IE=edge"/>
  <meta http-equiv="Content-Language" content="en-US"/>
  <!-- resolve IE Cache issue -->
<meta http-equiv="expires" content="0"/>
<meta http-equiv="cache-control" content="no-cache"/>
<meta http-equiv="pragma" content="no-cache"/>
<!-- resolve IE Cache issue -->
  <meta name="description" content="SAP Best Practices Explorer - The next generation web channel to search, browse and consume SAP and Partner Best Practices."> 
  <meta name="google-site-verification" content="ozvyxNxcsZGZb99XQ7BbC4orDKdCt3LJutqzR2vV9bM"/>
  
  <title>
   SAP Best Practices Explorer
  </title>
  



 <link rel="stylesheet" type="text/css" href="belize/css/dist.css?v=20200422040617">

 </head>
 <body class="sapUiBody belize">
  <div id='content'>Application is loading ......</div>
 </body>
 <script type="text/javascript" src="belize/js/sha1.js" async="async"></script>
 <script type='text/javascript'>
  window.appConfig = {
   sapui5Context:'https://sapui5.hana.ondemand.com/',
   sapui5Version:'1.54.3',
   isMock: false,
   isDevLandscape: false,
   isLoggedIn: false,   
   loginMode: 'saml',
   buildNumber: '20200422040617',
   csrfToken: '-9SVmJ8pSCnUKsEa-2nMgoJHJYmrWimVoOA',
   landscape: 'RBP',
   siteIdOnWarp: '2103052a-aabe-2640-92ab-de9aeb9f13a8',  
   noRedirect: '',
   sessionTimeout: 3600,
   tcVersion: '10',
   bcpCreationPath: 'https://support.wdf.sap.corp/sap/bc/dsi/rest/ii/create_zini?sap-language=EN',
   bcpReadingPath: 'https://support.wdf.sap.corp/sap/bc/dsi/rest/ii/read_zini',
   bridgeUrl: 'https://bridge.int.sap.hana.ondemand.com/bridge/api/',
   bcpUrl:'https://support.wdf.sap.corp/sap/bc/dsi/ii/create_zini?sap-language=EN&system_id=RBP&priority=5&main_impact=A&category_label=SV-RDS-BC-EXP',
   userName: '',
   jSessionId: '-Pza9Vw1kHs5Ss3CfWrFbm-rEkqicQFHYcEA_SAP'
  };

  if(!window.appConfig.locale){
   window.appConfig.locale = navigator.language.split('-')[0];
  }

  if(window.location.hostname != "localhost"){
            window["sap-ui-config"] = {
       productive: true
   };
       }   
 </script>
 



 <script src="belize/js/dist.js?v=20200422040617" type="text/javascript"></script> 

 
   
  



<script type="text/javascript" src="https://accounts.sap.com/ui/resources/javascripts/SAP_IDS.js?locale=en_GB"></script>
<script type="text/javascript">
 var idsServer = 'https://accounts.sap.com';
 global.createIdsLoginLink(); 
</script>
  
 <script type="text/javascript">
  if(!global.isAutoLoginChecked()) {
   var $iframe = $("<iframe src='autologin_check' style='width:0;height:0;visibility:hidden;display:none;' width='0' height='0'/>");
   $("body").append($iframe);
   $iframe.on('load', function(){
    global.setAutoLoginCheckedCookie();
  
    try {
     var iframeDocument = $iframe[0].contentDocument || $iframe[0].contentWindow.document;
     var $body = $($iframe.contents().find('body'));
     var text = $body.text().trim();
     
     if(text === 'ok') {
      window.location.reload();
     }
    } catch(e) {
     global.loadScript();
    }
   });
  } else {
   global.loadScript();
  }
 </script>

 
 

<script type='text/javascript' defer> 
 /**
  *  SAP web analytics tool javascript snippet
  */  
 var swa = {
  pubToken: '2103052a-aabe-2640-92ab-de9aeb9f13a8',
        baseUrl: 'https://webanalytics.cfapps.eu10.hana.ondemand.com/tracker/',
  visitorCookieTimeout: 63113852,
  dntLevel: 1,
  bannerEnabled: false
 };
 
 if(!appConfig.isMock){
  swa.owner = getCurrentUser;
  /**
   * Returns the hashed visitor fingerprint (assigned to `swa.owner` just
   * before this declaration).
   *
   * Reads the current value via getSwaFingerprint(); when that value is
   * empty/falsy or does not contain the substring 'GMT', a fresh guid() is
   * generated and written to the `swa_fingerprint` cookie. The value that
   * ends up in `_swa_fingerprint` is then passed to Sha1.hash().
   *
   * @returns {*} whatever Sha1.hash(value, true) returns — Sha1 is defined
   *   outside this snippet (presumably belize/js/sha1.js; TODO confirm).
   */
  function getCurrentUser(){
   var fingerprint;
   var _swa_fingerprint = getSwaFingerprint();
   
   // Regenerate when there is no stored value, or when the stored value has no
   // 'GMT' marker (the marker comes from the date string appended below).
   if((!_swa_fingerprint) || (_swa_fingerprint && _swa_fingerprint.indexOf('GMT') === -1)){
    _swa_fingerprint = guid();
    // The GMT date string is concatenated directly onto the cookie value; no
    // "; expires=" attribute separator is written here.
    // NOTE(review): this looks like it may have been meant as an expiry
    // attribute, but the 'GMT' check above relies on the current form — confirm
    // before changing either line.
    window.document.cookie = "swa_fingerprint=" + _swa_fingerprint + setExpiresDays().toGMTString();
   }
   // NOTE(review): on the regenerate path the bare guid is hashed, while the
   // cookie stores guid + date string — later reads presumably hash a
   // different input; verify whether that is intended.
   // The meaning of the second argument (true) is not visible here.
   fingerprint = Sha1.hash(_swa_fingerprint, true);
   
   return fingerprint;
  };
  /**
   * Builds a random identifier laid out as 8-4-4-4-12 lowercase hex digits,
   * separated by dashes. Randomness comes from Math.random().
   *
   * @returns {string} the generated identifier.
   */
  function guid() {
   // How many 4-hex-digit chunks make up each dash-separated group.
   var chunksPerGroup = [2, 1, 1, 1, 3];
   var groups = [];
   for (var g = 0; g < chunksPerGroup.length; g++) {
    var group = '';
    for (var k = 0; k < chunksPerGroup[g]; k++) {
     // A value in [0x10000, 0x20000) prints as five hex digits with a leading
     // '1'; dropping that digit leaves exactly four, zero-padded.
     var padded = Math.floor((1 + Math.random()) * 0x10000);
     group += padded.toString(16).substring(1);
    }
    groups.push(group);
   }
   return groups.join('-');
  };

  /**
   * Looks up the stored fingerprint in window.document.cookie.
   *
   * The cookie string is split on ';' and the first segment that contains
   * the text 'swa_fingerprint' is used; the result is whatever sits between
   * that segment's first and second '=' characters.
   *
   * @returns {string|undefined} the stored value, "" when no segment
   *   matches, or undefined when the matching segment has no '='.
   */
  function getSwaFingerprint(){
   var cookieParts = window.document.cookie.split(';');
   var index = 0;
   while (index < cookieParts.length) {
    var part = cookieParts[index];
    if (part.indexOf('swa_fingerprint') !== -1) {
     // Only the first matching segment is considered.
     return part.split("=")[1];
    }
    index += 1;
   }
   return "";
  };

  /**
   * Computes the moment 90 days after the current time.
   *
   * @returns {Date} a new Date set 90 days (in milliseconds) past now.
   */
  function setExpiresDays(){
   var MILLISECONDS_PER_DAY = 24 * 3600 * 1000;
   var lifetimeInDays = 90;
   var nowInMs = new Date().getTime();
   return new Date(nowInMs + lifetimeInDays * MILLISECONDS_PER_DAY);
  };
 }
 
 (function(){
  var d=document, g=d.createElement('script'), s=d.getElementsByTagName('script')[0];
  g.type='text/javascript'; g.defer=true; g.async=true; g.src=swa.baseUrl+'js/privacy.js';
  s.parentNode.insertBefore(g,s);
  })();
  
</script>
</html>
  2. Next, I tried using the Selenium driver to load the website, then loaded the page source into a soup and parsed it, but sometimes the soup doesn't contain all the data from the webpage. So this is not a foolproof solution that I can rely on. It is also slow if I have to parse many webpages and save their contents in a document.
url = "https://rapid.sap.com/bp/#/browse/categories/sap_s%254hana/areas/on-premise/packageversions/BP_OP_ENTPR/S4HANA/1909/GB/6/EN/scopeitems/BKP" 

driver.get(url)

soup = BeautifulSoup(driver.page_source, 'html5lib') 

Is there something I am missing in point 1 that would let me get the contents and parse the webpage much faster than with Selenium? Any better approach would also be appreciated.

Thanks for the help.

Humayun Ahmad Rajib
  • 1,502
  • 1
  • 10
  • 22
user7319461
  • 11
  • 1
  • 6
  • You have to use a `selenium` [wait](https://selenium-python.readthedocs.io/waits.html), where you wait for the elements to be loaded and present in the source code! This is a common question that has been answered a lot in the community; it's a dupe. – αԋɱҽԃ αмєяιcαη Apr 22 '20 at 15:40
  • Does this answer your question? [Make Selenium wait 10 seconds](https://stackoverflow.com/questions/45347675/make-selenium-wait-10-seconds) – αԋɱҽԃ αмєяιcαη Apr 22 '20 at 15:41
  • Use headless browsers like [pyppeteer](https://github.com/miyakogi/pyppeteer) or [phantomJS](https://phantomjs.org/). – Kalsi Apr 22 '20 at 16:11

0 Answers0