2

I am trying to parse HTML using this script in Google Apps Script:

/**
 * Fetches the Merriam-Webster home page, parses the response with XmlService
 * and looks up the element whose id is 'xx'.
 *
 * Note: XmlService.parse is an XML parser; the error quoted in this question
 * is raised by that call when the fetched markup is not well-formed XML.
 *
 * @return {Object} whatever getElementById returns for the id 'xx'.
 */
function parse() {
  var html = UrlFetchApp.fetch('http://www.merriam-webster.com/').getContentText();
  var doc = XmlService.parse(html);
  // Separate name for the root element instead of re-declaring `html`.
  var root = doc.getRootElement();
  // The helper defined in this script is getElementById (not getElementsByID).
  var element = getElementById(root, 'xx');
  return element;
}


/**
 * Finds the first descendant of `element` whose `id` attribute equals
 * `idToFind`.
 *
 * @param {Object} element - node exposing getDescendants() (presumably an
 *     XmlService Element; confirm against the caller).
 * @param {string} idToFind - id value to look for.
 * @return {Object|null} the first matching element, or null when no
 *     descendant carries that id.
 */
function getElementById(element, idToFind) {
  var descendants = element.getDescendants();
  // Indexed loop with a declared counter: the previous `for (i in ...)`
  // created an implicit global `i` and iterated keys rather than positions.
  for (var i = 0; i < descendants.length; i++) {
    var elt = descendants[i].asElement();
    // Nodes for which asElement() gives nothing (null) are skipped.
    if (elt == null) continue;
    var id = elt.getAttribute('id');
    if (id != null && id.getValue() == idToFind) return elt;
  }
  return null;
}

But it says:

Error on line 27: Element type "scr" must be followed by either attribute specifications, ">" or "/>". (line 4, file "")

I am trying to parse the HTML and then use the getElementById function above. Any ideas?

Hossein Alipour
  • 307
  • 1
  • 5
  • 13
  • Possible duplicate of [How to parse an HTML string in Google Apps Script without using XmlService?](https://stackoverflow.com/questions/33893143/how-to-parse-an-html-string-in-google-apps-script-without-using-xmlservice) – Rubén Jun 23 '17 at 16:30
  • The best way to parse HTML is to not use `Xml.parse` or `XmlService.parse` -- https://stackoverflow.com/a/50856901/452587 – thdoan Mar 28 '21 at 19:40

2 Answers2

3

I found that the best way to parse HTML in Google Apps Script is to avoid using XmlService.parse or Xml.parse. XmlService.parse doesn't work well with the malformed HTML served by certain websites.

Here is a basic example of how you can easily parse any website without using XmlService.parse or Xml.parse. In this example, I retrieve a list of presidents from "wikipedia.org/wiki/President_of_the_United_States" with a regular JavaScript document.getElementsByTagName() call, and paste the values into my Google spreadsheet.

1- Create a new Google Sheet;

2- Click the menu Tools > Script editor... to open a new tab with the code editor window and copy the following code into your Code.gs:

/**
 * Builds the "Parse Menu" menu, containing a single "Parse" item bound to
 * the parserMenuItem function, and adds it to the spreadsheet UI.
 */
function onOpen() {
  var parseMenu = SpreadsheetApp.getUi().createMenu("Parse Menu");
  parseMenu.addItem("Parse", "parserMenuItem").addToUi();
}


/**
 * Menu handler: creates the HTML output from the project file named "test"
 * and shows it as a sidebar in the spreadsheet UI.
 */
function parserMenuItem() {
  var sidebarContent = HtmlService.createHtmlOutputFromFile("test");
  var ui = SpreadsheetApp.getUi();
  ui.showSidebar(sidebarContent);
}


/**
 * Fetches `url` with UrlFetchApp and returns the response content as text.
 *
 * @param {string} url - address to fetch.
 * @return {string} the content text of the response.
 */
function getUrlData(url) {
  var response = UrlFetchApp.fetch(url);
  return response.getContentText();
}

/**
 * Writes the entries of `data` into column A of the first sheet of the
 * active spreadsheet, one entry per row, starting at row 1.
 *
 * @param {Array} data - values to write; an empty or missing array writes
 *     nothing.
 */
function writeToSpreadSheet(data) {
  // Nothing to write; also avoids requesting a range with zero rows.
  if (!data || data.length === 0) return;

  var sheet = SpreadsheetApp.getActiveSpreadsheet().getSheets()[0];

  // setValues expects a two-dimensional array: one inner array per row.
  var rows = [];
  for (var i = 0; i < data.length; i++) {
    // Missing entries are written as an empty cell.
    rows.push([data[i] == null ? '' : data[i]]);
  }

  // One batched write instead of one spreadsheet call per cell.
  sheet.getRange(1, 1, rows.length, 1).setValues(rows);
}

3- Add an HTML file to your Apps Script project. Open the Script Editor, choose File > New > Html File, and name it 'test'. Then copy the following code into your test.html:

<!DOCTYPE html>
<html>
<head>    
</head>
<body>
<input id= "mButon" type="button" value="Click here to get list"
onclick="parse()">
<div hidden id="mOutput"></div>
</body>
<script>

// Run the client-side onOpen() once the sidebar page has finished loading.
window.onload = onOpen;

/**
 * Asks the server-side getUrlData function for the Wikipedia page and hands
 * a successful response to writeHtmlOutput, then sets the "mButon" button's
 * visibility to "visible".
 */
function onOpen() {
 var wikiUrl = "https://en.wikipedia.org/wiki/President_of_the_United_States";
 var runner = google.script.run.withSuccessHandler(writeHtmlOutput);
 runner.getUrlData(wikiUrl);
 var button = document.getElementById("mButon");
 button.style.visibility = "visible";
}

/**
 * Places the given markup inside the "mOutput" element so that it can be
 * queried afterwards through the regular DOM API.
 *
 * NOTE(review): `x` is inserted via innerHTML as-is; if the fetched page is
 * not trusted, confirm this is acceptable.
 *
 * @param {string} x - markup to insert.
 */
function writeHtmlOutput(x) {
 var container = document.getElementById('mOutput');
 container.innerHTML = x;
}

/**
 * Collects the "title" attribute of every <area> element in the document
 * and sends the resulting list to the server-side writeToSpreadSheet.
 */
function parse() {
  var areas = document.getElementsByTagName("area");
  var titles = Array.prototype.map.call(areas, function (area) {
    return area.getAttribute("title");
  });

  google.script.run.writeToSpreadSheet(titles);
}
</script> 
</html>

4- Save your gs and html files and go back to your spreadsheet. Reload the spreadsheet. Click on "Parse Menu" - "Parse". Then click on "Click here to get list" in the sidebar.

Yves R
  • 134
  • 1
  • 5
0

To parse the HTML you'll have to sanitize it first. Your page contains a script tag that is assembled in JavaScript from string fragments, more specifically:

document.write('<scr' + 'ipt src="' + src + '"></scr' + 'ipt>');})

An XML parser doesn't understand JavaScript code, obviously, so you'll have to sanitize this manually, which is not a simple thing to do. You can, however, build a simple RegEx selector, such as:

/**
 * Returns the opening tag of the first element in an HTML string whose `id`
 * attribute equals `idToFind`.
 *
 * This is a regular-expression search over the text, not an HTML parser: it
 * only sees the tag text up to (not including) the next '>' character.
 *
 * @param {string} element - HTML source text to search.
 * @param {string} idToFind - id value to look for; matched literally.
 * @return {?string} the matched opening tag, terminated with '>', or null
 *     when no tag carries that id.
 */
function getElementsByID(element, idToFind) {
  // Escape regex metacharacters so ids such as "a.b" are matched literally.
  var escapedId = String(idToFind).replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
  // \s before "id" keeps attributes such as data-id from matching, and
  // [^<>]* keeps the match from running across a tag boundary.
  var regId = new RegExp('(<[^<>]*\\sid\\s*=\\s*[\'"]' + escapedId + '[\'"][^>]*)');
  var result = regId.exec(element);
  // exec() yields null when nothing matches; report that instead of throwing.
  if (result === null) return null;
  return result[1] + '>';
}

/**
 * Fetches the Merriam-Webster home page and returns what getElementsByID
 * finds for the id 'search_box_terms'.
 *
 * Example result given by the author:
 * "<input id="search_box_terms" name="query" type="text" placeholder="I'm searching for ..." value="" />"
 */
function parse() {
  var response = UrlFetchApp.fetch('http://www.merriam-webster.com/');
  var pageSource = response.getContentText();
  return getElementsByID(pageSource, 'search_box_terms');
}

This will return, as a string, the opening tag of the element with the ID you provide. This is of course a simple RegEx and won't work in ALL cases, but it will do pretty well for most of them.

Kriggs
  • 3,731
  • 1
  • 15
  • 23