6

Users can paste a URL in a textbox on my site. When they do, I want to get that URL via jQuery AJAX and read the opengraph metadata from it. How can I do this?

I read this post How to read Open Graph and meta tags from a webpage with a url but the link in it is broken and it's more advanced than I need and not in jQuery :)

I don't need anything else but the opengraph metadata, so no parsing of structures etc.

Here's an example of a page: http://www.ebay.com/itm/Microsoft-Surface-Pro-3-12-Tablet-256GB-SSD-Intel-Core-i7-Haswell-8GB-RAM-/281656969697

So one of the fields I'd like to extract is <meta property="og:image" content="http://i.ebayimg.com/images/i/281656969697-0-1/s-l1000.jpg" ></meta>, to be precise the value http://i.ebayimg.com/images/i/281656969697-0-1/s-l1000.jpg

What I have now is copied from here: http://icant.co.uk/articles/crossdomain-ajax-with-jquery/error-handling.html

See my comment marked @Flo where I want to extract the open graph data, but I don't know how to parse the JSON response.

<!-- The href carries an explicit http:// scheme: doAjax only takes the cross-domain (YQL) path for URLs that start with "http"; a scheme-less href is treated as a relative URL. -->
<a href="http://www.ebay.com/itm/Microsoft-Surface-Pro-3-12-Tablet-256GB-SSD-Intel-Core-i7-Haswell-8GB-RAM-/281656969697" class="ajaxtrigger">Load Ajax Content</a>
<div id="target"></div>

    <script language="javascript" type="text/javascript">
    // Once the DOM is ready, make every .ajaxtrigger link load its href into #target.
    $(function () {

        $('.ajaxtrigger').click(function () {
            var link = $(this);
            var href = link.attr('href');

            // tabIndex -1 lets the (normally unfocusable) target receive focus later
            var target = $('#target');
            target.attr('tabIndex', '-1');

            // First click only: add the status <span> and remember that it exists
            if (!link.hasClass('loaded')) {
                link.append('<span></span>');
                link.addClass('loaded');
            }
            // The status element is always the last <span> inside the link
            var status = link.find('span').last();

            doAjax(href, status, target);
            // suppress the browser's default navigation
            return false;
        });
    });


    /**
     * Loads `url` and shows the outcome in `container`, reporting progress in `msg`.
     *
     * URLs starting with "http" are fetched through a YQL JSONP query that selects
     * the page's <meta> elements; the Open Graph (`og:*`) entries of the response are
     * rendered as an escaped definition list. Any other URL is requested directly
     * with $.ajax and its response is inserted as-is.
     *
     * @param {string} url        Address to load.
     * @param {jQuery} msg        Status element; receives ' (loading...)', ' (ready.)'
     *                            or ' (error!)' and the `error` class.
     * @param {jQuery} container  Element that receives the result; `.effect()` is
     *                            called on it (presumably jQuery UI - confirm it is loaded).
     */
    function doAjax(url, msg, container) {
        // Success path shared by both branches
        function showReady(html) {
            msg.html(' (ready.)');
            container.
              html(html).
                focus().
                  effect("highlight", {}, 1000);
        }

        // Failure path shared by both branches
        function showFailure(html) {
            msg.html(' (error!)');
            msg.addClass('error');
            container.
              html(html).
                focus().
                  effect('highlight', { color: '#c00' }, 1000);
        }

        // if the URL starts with http
        if (url.match('^http')) {
            // assemble the YQL call
            // NOTE(review): the public YQL endpoint is believed to have been retired - verify it still answers.
            msg.removeClass('error');
            msg.html(' (loading...)');
            $.getJSON("//query.yahooapis.com/v1/public/yql?" +
                      "q=SELECT%20*%20FROM%20html%20WHERE%20url=%27" +
                      encodeURIComponent(url) +
                      "%27%20AND%20xpath=%27descendant-or-self::meta%27&format=json&callback=?",
              function (data) {
                  // Shape the original callback expected: a top-level `results`
                  // array whose first entry is an HTML string.
                  var legacyHtml = data && data.results && data.results[0];
                  if (legacyHtml) {
                      showReady(filterData(legacyHtml));
                      return;
                  }

                  //@Flo: the Open Graph metadata is pulled out of the JSON response here
                  var ogMarkup = renderOpenGraph(extractOpenGraph(data));
                  if (ogMarkup) {
                      showReady(ogMarkup);
                  } else {
                      showFailure('<p>Error: could not load the page.</p>');
                  }
              }
            );
        } else {
            $.ajax({
                url: url,
                timeout: 5000,
                success: function (data) {
                    showReady(data);
                },
                error: function (req, error) {
                    // jQuery reports a bare 'error' for HTTP failures; the status text says more
                    if (error === 'error') { error = req.statusText; }
                    showFailure('There was a communication error: ' + error);
                },
                beforeSend: function () {
                    msg.removeClass('error');
                    msg.html(' (loading...)');
                }
            });
        }
    }

    // Collects the `og:*` meta entries of a `{query: {results: {meta: ...}}}` response
    // into a plain { property: content } map (first occurrence of each property wins).
    function extractOpenGraph(data) {
        var tags = {};
        var results = data && data.query && data.query.results;
        var meta = results && results.meta;
        if (!meta) {
            return tags;
        }
        // assumes a lone match may arrive as a bare object instead of an array - normalise either way
        var list = Array.isArray(meta) ? meta : [meta];
        for (var i = 0; i < list.length; i++) {
            var tag = list[i];
            if (tag && typeof tag.property === 'string' &&
                tag.property.indexOf('og:') === 0 &&
                !Object.prototype.hasOwnProperty.call(tags, tag.property)) {
                tags[tag.property] = tag.content;
            }
        }
        return tags;
    }

    // Renders a { property: content } map as an HTML definition list; returns '' when the map is empty.
    function renderOpenGraph(tags) {
        var rows = [];
        for (var name in tags) {
            if (Object.prototype.hasOwnProperty.call(tags, name)) {
                var value = tags[name] == null ? '' : tags[name];
                rows.push('<dt>' + escapeHtml(name) + '</dt><dd>' + escapeHtml(value) + '</dd>');
            }
        }
        return rows.length ? '<dl class="opengraph">' + rows.join('') + '</dl>' : '';
    }

    // Escapes the HTML-significant characters so remote text cannot inject markup.
    function escapeHtml(text) {
        return String(text)
            .replace(/&/g, '&amp;')
            .replace(/</g, '&lt;')
            .replace(/>/g, '&gt;')
            .replace(/"/g, '&quot;')
            .replace(/'/g, '&#39;');
    }
    /**
     * Strips unwanted markup from an HTML string before it is injected into the page.
     *
     * Removes, in this order: opening and closing body tags, line breaks, HTML
     * comments, noscript blocks, script blocks and self-closing script tags.
     * Tag names are matched case-insensitively.
     *
     * @param {string} data  Raw HTML.
     * @returns {string} The HTML with the parts listed above removed.
     */
    function filterData(data) {
        // filter all the nasties out
        // no body tags - the optional slash sits AFTER "<" so both <body ...> and </body> match
        data = data.replace(/<\/?body[^>]*>/gi, '');
        // no linebreaks - a "|" inside a character class is a literal pipe, so it is left out
        data = data.replace(/[\r\n]+/g, '');
        // no comments - HTML comments open with "<!--"
        data = data.replace(/<!--[\S\s]*?-->/g, '');
        // no noscript blocks
        data = data.replace(/<noscript[^>]*>[\S\s]*?<\/noscript>/gi, '');
        // no script blocks
        data = data.replace(/<script[^>]*>[\S\s]*?<\/script>/gi, '');
        // no self closing scripts - [^>]* keeps the match inside one tag, g removes every occurrence
        data = data.replace(/<script[^>]*\/>/gi, '');
        // [... add as needed ...]
        return data;
    }


    </script>

The object returned by this query is:

Object {query: Object}
query: Object
count: 33
created: "2015-05-02T04:36:46Z"
lang: "en-US"
results: Object
meta: Array[33]
0: Object
name: "viewport"
__proto__: Object
1: Object
content: "main"
name: "layout"
__proto__: Object

How can I filter this response to return the og:image value?

Community
  • 1
  • 1
Adam
  • 6,041
  • 36
  • 120
  • 208

2 Answers

6

Try

var url = "http://www.ebay.com/itm/Microsoft-Surface-Pro-3-12-"
          + "Tablet-256GB-SSD-Intel-Core-i7-Haswell-8GB-RAM-/281656969697";

// Ask YQL (JSONP, via `callback=?`) for every <meta> element of the page at `url`
$.getJSON("//query.yahooapis.com/v1/public/yql?" 
          + "q=SELECT%20*%20FROM%20html%20WHERE%20url=%27" 
          + encodeURIComponent(url) 
          + "%27%20AND%20xpath=%27descendant-or-self::meta%27"
          + "&format=json&callback=?"
  , function(data) {
      // `data`:`json` returned from request
      console.log(data);
      // `query.results` may be missing when nothing matched, so guard the lookup;
      // [].concat() accepts both an array of meta objects and a lone object
      var results = data && data.query && data.query.results;
      var meta = results && results.meta ? [].concat(results.meta) : [];
      // keep only the `meta` objects whose `property` is `og:image`
      var res = $.grep(meta, function(tag) {
        return Boolean(tag) && tag.property === "og:image";
      });
      // if object having property `og:image` returned , do stuff
      if (res.length > 0) {
        console.log(res[0].property);
        // the value comes from a remote page: append it as text, never as HTML
        $("body").append(document.createTextNode(res[0].content || ""));
      } else {
        // else, log notification
        console.log("og:image not found");
      }

});
<script src="https://ajax.googleapis.com/ajax/libs/jquery/1.11.1/jquery.min.js">
</script>

JSFiddle

brasofilo
  • 25,496
  • 15
  • 91
  • 179
guest271314
  • 1
  • 15
  • 104
  • 177
  • 1
    But how do I call the URL I need to parse? – Adam May 02 '15 at 01:38
  • @Flo Can include `$.ajax()` `js` tried at Question ? – guest271314 May 02 '15 at 01:40
  • Can include what `$.ajax()` have tried at Question ? What was response from `url` when tried `$.ajax(url)` ? – guest271314 May 02 '15 at 01:45
  • I have not tried anything as I don't know how to get the page, to what variable I should assign it and how to parse that variables' value :) – Adam May 02 '15 at 01:47
  • See http://api.jquery.com/jQuery.ajax/ , http://stackoverflow.com/questions/19821753/jquery-xml-error-no-access-control-allow-origin-header-is-present-on-the-req/19821851#19821851 , http://stackoverflow.com/questions/3506208/jquery-ajax-cross-domain . `js` at answer should be able to filter `html` document `meta` elements having `property` attribute , return open graph metadata . – guest271314 May 02 '15 at 01:54
  • those links talk about calling cross-domain services that seem to return json, I just want to request an external html page...that has nothing to do with cross-domain policies right? – Adam May 02 '15 at 02:00
  • @Flo Yes. Appear to be different domain; unless try at actual page, or same domain ? Tried `$.ajax(/* /path/to/external/html/page */)` at `console` ?; where "/path/to/external/html/page" is _"Here's an example of a page:"_ link at OP ? The link returns `html` . Perhaps try same at link page . Open `console` , -> network tab -> headers ; check if `Access-Control-Allow-Origin: *` appear at `Response headers` ? – guest271314 May 02 '15 at 02:04
  • @Flo What was response from `$.ajax(/* /path/to/external/html/page */)` ? Did solution at answer resolve Question as to retrieving open graph metadata from `html` document ? – guest271314 May 02 '15 at 02:41
  • @Flo Unfortunately , utilizing approach described at updated Question , the response does not appear to return `meta` tags from within `head` element. Response appear to be `` element when appended to document displays "Hi! Sign in or register" http://jsfiddle.net/v47mc1ht/1/ – guest271314 May 02 '15 at 03:15
  • Found out how to do the call returning just the head...all that remains now is filtering out some of the open graph metadata...any help on that would be great! – Adam May 02 '15 at 04:43
  • @Flo Tried `js` at post? Can include `js` which returns `head` element at Question ? – guest271314 May 02 '15 at 04:58
  • Not sure how your code helps me to extract a property value by its name, e.g. `og:image`..could you provide an example on that? – Adam May 02 '15 at 05:00
  • Only `og:image` ? No other `og:*` properties ? – guest271314 May 02 '15 at 05:01
  • Yes,all og:* properties, but I figured if I knew how to access one, I can also access the others :) Thanks! – Adam May 02 '15 at 05:10
-1

I'm using https://jsonlink.io endpoint, just make a fetch() call to their endpoint with the URL and get back the metadata

mauriblint
  • 1,802
  • 2
  • 29
  • 46