-1

I made a request to a server for this page using the get() function from Python's requests module. When I access the response's content I get this (sample):

b'\n\n\n\n<!DOCTYPE html>\n<html\nxmlns:og="http://ogp.me/ns#"\nxmlns:fb="http://www.facebook.com/2008/fbml">\n    <head>\n        <meta charset="utf-8">\n        <meta http-equiv="X-UA-Compatible" content="IE=edge">\n\n    <meta name="apple-itunes-app" content="app-id=342792525, app-argument=imdb:///?src=mdot">\n        \n    \n            <script type="text/javascript">var ue_t0=window.ue_t0||+new Date();</script>\n            <script type="text/javascript">\n                var ue_mid = "A1EVAM02EL8SFB"; \n                var ue_sn = "www.imdb.com";  \n                var ue_furl = "fls-na.amazon.com";\n                var ue_sid = "000-0000000-0000000";\n                var ue_id = "03N6Z2NEAF09T9H26QYE";\n                (function(e){var c=e;var a=c.ue||{};a.main_scope="mainscopecsm";a.q=[];a.t0=c.ue_t0||+new Date();a.d=g;function g(h){return +new Date()-(h?0:a.t0)}function d(h){return function(){a.q.push({n:h,a:arguments,t:a.d()})}}function b(m,l,h,j,i){var k={m:m,f:l,l:h,c:""+j,err:i,fromOnError:1,args:arguments};c.ueLogError(k);return false}b.skipTrace=1;e.onerror=b;function f(){c.uex("ld")}if(e.addEventListener){e.addEventListener("load",f,false)}else{if(e.attachEvent){e.attachEvent("onload",f)}}a.tag=d("tag");a.log=d("log");a.reset=d("rst");c.ue_csm=c;c.ue=a;c.ueLogError=d("err");c.ues=d("ues");c.uet=d("uet");c.uex=d("uex");c.uet("ue")})(window);(function(e,d){var a=e.ue||{};function c(g){if(!g){return}var f=d.head||d.getElementsByTagName("head")[0]||d.documentElement,h=d.createElement("script");h.async="async";h.src=g;f.insertBefore(h,f.firstChild)}function b(){var k=e.ue_cdn||"z-ecx.images-amazon.com",g=e.ue_cdns||"images-na.ssl-images-amazon.com",j="/images/G/01/csminstrumentation/",h=e.ue_file||"ue-full-11e51f253e8ad9d145f4ed644b40f692._V1_.js",f,i;if(h.indexOf("NSTRUMENTATION_FIL")>=0){return}if("ue_https" in 
e){f=e.ue_https}else{f=e.location&&e.location.protocol=="https:"?1:0}i=f?"https://":"http://";i+=f?g:k;i+=j;i+=h;c(i)}if(!e.ue_inline){if(a.loadUEFull){a.loadUEFull()}else{b()}}a.uels=c;e.ue=a})(window,document);\n                if (!(\'CS\' in window)) { window.CS = {}; }\n                    window.CS.loginLocation = "https://www.imdb.com/registration/signin?u=%2Fsearch%2Ftitle%3Frelease_date%3D2017%26sort%3Dnum_votes%2Cdesc%26page%3D1";\n            </script>\n \n\n        \n        <script type="text/javascript">var IMDbTimer={starttime: new Date().getTime(),pt:\'java\'};</script>\n        \n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        <title>IMDb: Most Voted Titles Released 2017-01-01 to 2017-12-31 - IMDb</title>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_title"] = new Date().getTime(); })(IMDbTimer);</script>\n        \n            <link rel="canonical" href="http://www.imdb.com/search/title" />\n            <meta property="og:url" content="http://www.imdb.com/search/title" />\n        \n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_icon"] = new Date().getTime(); })(IMDbTimer);</script>\n        <link href="http://ia.media-imdb.com/images/G/01/imdb/images/safari-favicon-517611381._CB522736552_.svg" mask rel="icon" sizes="any">\n        <link rel="icon" type="image/ico" href="http://ia.media-imdb.com/images/G/01/imdb/images/favicon-2165806970._CB522736556_.ico" />\n        <meta name="theme-color" content="#000000" />\n        <link rel="shortcut icon" type="image/x-icon" href="http://ia.media-imdb.com/images/G/01/imdb/images/desktop-favicon-2165806970._CB522736561_.ico" />\n        <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-4151659188._CB522736129_.png" rel="apple-touch-icon"> \n        <link 
href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-76x76-53536248._CB522736233_.png" rel="apple-touch-icon" sizes="76x76"> \n        <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-120x120-2442878471._CB522736253_.png" rel="apple-touch-icon" sizes="120x120"> \n        <link href="http://ia.media-imdb.com/images/G/01/imdb/images/mobile/apple-touch-icon-web-152x152-1475823641._CB522736557_.png" rel="apple-touch-icon" sizes="152x152">            \n        <link rel="search" type="application/opensearchdescription+xml" href="http://ia.media-imdb.com/images/G/01/imdb/images/imdbsearch-3349468880._CB522736605_.xml" title="IMDb" />\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_icon"] = new Date().getTime(); })(IMDbTimer);</script>\n        \n        <meta property="pageType" content="search" />\n        <meta property="subpageType" content="title" />\n\n\n        <link rel=\'image_src\' href="http://ia.media-imdb.com/images/G/01/imdb/images/logos/imdb_fb_logo-1730868325._CB522736557_.png">\n        <meta property=\'og:image\' content="http://ia.media-imdb.com/images/G/01/imdb/images/logos/imdb_fb_logo-1730868325._CB522736557_.png" />\n\n    <meta property=\'fb:app_id\' content=\'115109575169727\' />\n\n    <meta property=\'og:title\' content="IMDb: Most Voted Titles Released 2017-01-01 to 2017-12-31" />\n    <meta property=\'og:site_name\' content=\'IMDb\' />\n    <meta name="title" content="IMDb: Most Voted Titles Released 2017-01-01 to 2017-12-31 - IMDb" />\n        <meta name="description" content="IMDb\'s advanced search allows you to run extremely powerful queries over all people and titles in the database. Find exactly what you\'re looking for!" />\n        <meta property="og:description" content="IMDb\'s advanced search allows you to run extremely powerful queries over all people and titles in the database. Find exactly what you\'re looking for!" 
/>\n        <meta name="request_id" content="03N6Z2NEAF09T9H26QYE" />\n        \n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_css"] = new Date().getTime(); })(IMDbTimer);</script>\n<link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/consumersite-4100637360._CB530008524_.css" />\n<!-- h=ics-1e-c4-2xl-4b098b82.us-east-1 -->\n<link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/site/consumer-navbar-mega-238568768._CB532297092_.css" />\n<!--[if IE]><link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/ie-1170868033._CB522736261_.css" /><![endif]-->\n\n            <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/other-3780135229._CB530008515_.css" />\n            <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/starbarwidget-2454701167._CB522736579_.css" />\n            <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/collections/watchlistButton-3806422028._CB531876201_.css" />\n        <noscript>\n            <link rel="stylesheet" type="text/css" href="http://ia.media-imdb.com/images/G/01/imdb/css/wheel/nojs-2827156349._CB522739048_.css">\n        </noscript>\n  <script>(function(t){ (t.events = t.events || {})["csm_head_post_css"] = new Date().getTime(); })(IMDbTimer);</script>\n        \n  <script>(function(t){ (t.events = t.events || {})["csm_head_pre_ads"] = new Date().getTime(); })(IMDbTimer);</script>\n        \n        <script  type="text/javascript">\n            // ensures js doesn\'t die if ads service fails.  \n            // Note that we need to define the js here, since ad js is being rendered inline after this.\n            (function(f) {\n                // Fallback javascript, when the ad Service call fails.  
\n                \n                if((window.csm == null || window.generic == null || window.consoleLog == null)) {\n                    if (window.console && console.log) {\n                        console.log("one or more of window.csm, window.generic or window.consoleLog has been stubbed...");\n                    }\n                }\n                \n                window.csm = window.csm || { measure:f, record:f, duration:f, listen:f, metrics:{} };\n                window.generic = window.generic || { monitoring: { start_timing: f, stop_timing: f } };\n                window.consoleLog = window.consoleLog || f;\n            })(function() {});\n        </script>\n  <script>\n    if (\'csm\' in window) {\n      csm.measure(\'csm_head_delivery_finished\');\n    }\n  </script>\n  

What format is this, and what syntax features do you look for to recognize it quickly?

Alex
  • 3,958
  • 4
  • 17
  • 24

4 Answers

2

This is mainly HTML with some inline script, returned as a bytes object (hence the leading b). Each \n is a newline escape sequence in Python's printed representation of the bytes, not part of the markup itself; the ones at the beginning are there because of the empty lines at the start of the page.

What is the problem? What did you expect to get?

creyD
  • 1,972
  • 3
  • 26
  • 55
  • Thanks for the response! I have no experience in web development, so I got a bit confused by lines like `if (!(\'CS\' in window)) { window.CS = {}; }\n`, which didn't seem HTML to me. – Alex Apr 26 '17 at 11:07
2

You got a bytes response (b'....'); you can find more information in this answer.

In order to get the full text of the page as a string, use this example:

import requests as r

url = 'your_url_here'
content = r.get(url).text

print(content)

UPD: for parsing you can use tools such as Scrapy or Beautiful Soup.

Community
  • 1
  • 1
  • Great, I was really wondering what's with the `b'` in the beginning. That was really what got me thinking that it is some special kind of format. I was also seeing that the `type` of the content is `bytes`, so your answer explains a lot. Thanks! – Alex Apr 26 '17 at 11:23
  • Welcome ;) For `HTTP` requests, I recommend using the `requests` package. –  Apr 26 '17 at 11:30
  • Yea, that's what I'm using, and `BeautifulSoup` to parse the content. I just got confused by that b' in the beginning. – Alex Apr 26 '17 at 11:33
1

It's HTML, and I determined that from the doctype declaration: <!DOCTYPE html>. You can parse it with BeautifulSoup. Since I can't yet comment, I'd also like to explain to @Alex: the excess code is most likely from a script tag, which allows ECMAScript (JavaScript) to be executed on the fly in a page. Hope this helps, BoxTechy

BoxTechy
  • 339
  • 2
  • 6
  • 16
1

What you actually seem to be asking about is this:

if(!('CS' in window)) { window.CS = {}; }

It is JavaScript, a programming language that you can embed in HTML (if you look carefully, you will certainly find that it is between <script> and </script> tags).

And what you see in particular is the 'in' operator (https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Operators/in) and an empty object (the {} part). If the 'window' object has no 'CS' property at all, an empty object is assigned to it. In fact, many programmers would have written the following instead (nearly equivalent, except that it also replaces an existing 'CS' property whose value is falsy):

window.CS=window.CS || {};
tevemadar
  • 12,389
  • 3
  • 21
  • 49