2

I am trying to get the href(s) from an HTML document (Gmail email) that has link text of "cats"

For example, I want to extract the URLs from links like the ones below, which could be anywhere in an HTML document:

<a href="https://www.google.com/search?q=cats&oq=cats" target="_blank">cats</a>
or 
<a href="https://www.google.com/search?q=cats&oq=cats" target="_blank">yay cats</a>

Note: I'm building a Gmail add-on

credizian
  • 469
  • 5
  • 20
  • @Cooper sorry for late response. Was away from my computer yesterday, but I've looked at many other options (listing two here: https://stackoverflow.com/questions/18727341/get-all-links-in-a-document# | https://www.bettercloud.com/monitor/the-academy/extract-urls-or-link-text-from-a-google-sheets-cell/). They all provide ways to extract the href but not how to get to a specific link info – credizian Mar 01 '19 at 14:29

1 Answers1

1

Finding links and other stuff in an email with a Gmail add-on using regular expressions

This is a regex tester, so it's probably more than you were expecting, but it will find the links that you're looking for as well as any others you wish to find. This one will work for your current needs: <.*q=cats.*>. The regex function currently being used is String.match; you can find it explained here. There's a URL at the bottom of the code that should point to a spreadsheet that contains a page named "Defaults", as explained later.

/**
 * Builds the add-on UI for the message described by the event object.
 *
 * Authorises access to the current message, stores its plain-text body under
 * the `message` key via setDefaults(), and returns a one-element array holding
 * the regex tester card built from the freshly read defaults.
 *
 * @param {Object} e Event object; `e.messageMetadata.accessToken` and
 *     `e.messageMetadata.messageId` are read.
 * @return {Array} Array containing the single card from buildRegexTester().
 */
function buildAddOn(e) {
  var metadata = e.messageMetadata;
  GmailApp.setCurrentMessageAccessToken(metadata.accessToken);
  var currentMessage = GmailApp.getMessageById(metadata.messageId);
  // Persist the body so later action callbacks (e.g. search) can read it back.
  setDefaults({ message: currentMessage.getPlainBody() });
  return [buildRegexTester(getDefaults())];
}

/**
 * Builds the "Regex Tester" card from a defaults object.
 *
 * The card has one section ('Email Body Search') containing, in order: the
 * message text, a 'Save Regex' button, the Regex input, a 'Search' button, the
 * Results input, a 'Clear Results' button, and three switches for the g, i and
 * m flags.
 *
 * @param {Object} dfltObj Defaults; the properties `message`, `Regex`,
 *     `results`, `g`, `i` and `m` are read.
 * @return {Object} The value returned by the card builder's build().
 */
function buildRegexTester(dfltObj){
  // Action that invokes the named callback function.
  function actionFor(handlerName){
    return CardService.newAction().setFunctionName(handlerName);
  }

  // Text button that triggers the named callback when clicked.
  function makeButton(label, handlerName){
    return CardService.newTextButton()
      .setText(label)
      .setOnClickAction(actionFor(handlerName));
  }

  // Multiline text input whose field name and title are the same string.
  function makeTextBox(name, value){
    return CardService.newTextInput()
      .setFieldName(name)
      .setTitle(name)
      .setMultiline(true)
      .setValue(value);
  }

  // Labelled switch for one regex flag; every change calls saveFlags.
  function makeFlagToggle(flag, label){
    var flagSwitch = CardService.newSwitch()
      .setSelected(dfltObj[flag])
      .setFieldName(flag)
      .setValue(flag)
      .setOnChangeAction(actionFor('saveFlags'));
    return CardService.newKeyValue()
      .setContent(label)
      .setSwitch(flagSwitch);
  }

  var builder = CardService.newCardBuilder();
  builder.setHeader(CardService.newCardHeader().setTitle('Regex Tester'));

  var bodySection = CardService.newCardSection().setHeader('Email Body Search');
  bodySection.addWidget(CardService.newTextParagraph().setText(dfltObj.message));
  bodySection.addWidget(makeButton('Save Regex', 'saveRegex'));
  bodySection.addWidget(makeTextBox('Regex', dfltObj.Regex));
  bodySection.addWidget(makeButton('Search', 'search'));
  bodySection.addWidget(makeTextBox('Results', dfltObj.results));
  bodySection.addWidget(makeButton('Clear Results', 'clearResults'));
  bodySection.addWidget(makeFlagToggle('g', 'global search'));
  bodySection.addWidget(makeFlagToggle('i', 'ignore case'));
  bodySection.addWidget(makeFlagToggle('m', 'multiline search'));

  builder.addSection(bodySection);
  return builder.build();
}

/**
 * Reads the 'Defaults' sheet of the spreadsheet at RegexTesterAddon_URL and
 * returns it as an object: column A supplies the keys, column B the values.
 * When a key appears in more than one row, the last row wins.
 *
 * @return {Object} Key/value pairs read from the sheet's data range.
 */
function getDefaults(){
  var rows = SpreadsheetApp.openByUrl(RegexTesterAddon_URL)
    .getSheetByName('Defaults')
    .getDataRange()
    .getValues();
  return rows.reduce(function (settings, row) {
    settings[row[0]] = row[1];
    return settings;
  }, {});
}

/**
 * Writes values from `dfltObj` into the 'Defaults' sheet of the spreadsheet at
 * RegexTesterAddon_URL.
 *
 * For every row whose column-A key is a defined property of `dfltObj`, column B
 * is replaced with that property's value; other rows are written back
 * unchanged. Keys in `dfltObj` that have no row in the sheet are ignored.
 *
 * @param {Object} dfltObj Key/value pairs to store.
 */
function setDefaults(dfltObj){
  var dataRange = SpreadsheetApp.openByUrl(RegexTesterAddon_URL)
    .getSheetByName('Defaults')
    .getDataRange();
  var rows = dataRange.getValues();
  rows.forEach(function (row) {
    var replacement = dfltObj[row[0]];
    if (replacement !== undefined) {
      row[1] = replacement;
    }
  });
  dataRange.setValues(rows);
}

/**
 * Stores the state of the g, i and m switches.
 *
 * A flag is saved as true when the corresponding key is present in
 * `e.formInput` and false when it is absent.
 *
 * @param {Object} e Action event object; `e.formInput` is read.
 */
function saveFlags(e){
  Logger.log('\nsaveFlags():\n%s\n',e);
  var inputs = e.formInput;
  setDefaults({
    g: inputs.g !== undefined,
    i: inputs.i !== undefined,
    m: inputs.m !== undefined
  });
}

/**
 * Stores the pattern from the Regex input under the `Regex` key.
 * Nothing is stored when the input is missing or empty.
 *
 * @param {Object} e Action event object; `e.formInput.Regex` is read.
 */
function saveRegex(e){
  Logger.log('\nsaveRegex():\n%s',e);
  var submitted = e.formInput.Regex;
  if (!submitted) {
    return;
  }
  setDefaults({ Regex: submitted });
}

/**
 * Stores search results by passing `rsltObj` straight to setDefaults().
 *
 * @param {Object} rsltObj Object to store; search() passes `{results: ...}`.
 */
function saveResults(rsltObj) {
  setDefaults(rsltObj);
}

/**
 * Builds the RegExp flag string from the stored defaults.
 *
 * @return {string} Concatenation, in the order g, i, m, of every flag whose
 *     stored value is truthy; '' when none are set.
 */
function getFlags(){
  var settings = getDefaults();
  return ['g', 'i', 'm']
    .filter(function (flag) { return settings[flag]; })
    .join('');
}

/**
 * Runs the submitted regex against the stored message and shows the outcome.
 *
 * Does nothing (returns undefined) when `e.formInput.Regex` is absent.
 * Otherwise it saves the flags and the pattern, re-reads the defaults, applies
 * String.match to the stored message, saves the HTML-escaped outcome under
 * `results`, and returns a rebuilt card.
 *
 * The saved text is one `result[i]= ...` line per match entry, "No Results"
 * when nothing matched, or an "Invalid regex: ..." message when the pattern
 * cannot be compiled.
 *
 * @param {Object} e Action event object; `e.formInput` is read.
 * @return {Object|undefined} Card from buildRegexTester(), or undefined when
 *     no Regex input was submitted.
 */
function search(e){
  Logger.log('\nSearch():\n%s',e);
  if(typeof(e.formInput.Regex)=='undefined'){
    return;
  }
  saveFlags(e);
  saveRegex(e);
  var dfltObj=getDefaults();
  var flags=getFlags();
  var pattern=dfltObj.Regex;
  Logger.log('\nflags: %s\npattern: %s',flags,pattern);
  // A spreadsheet cell may come back as a number or date rather than a string,
  // so coerce before calling match(); a missing message is treated as empty.
  var message=(dfltObj.message==null)?'':String(dfltObj.message);
  var results;
  try{
    var re=new RegExp(pattern,flags);
    var result=message.match(re);
    if(result){
      Logger.log('\nresult: %s\nmessage: %s',result,message);
      results=result.map(function(item,idx){
        return 'result[' + idx + ']= ' + item;
      }).join('\n');
      console.log('module: %s pattern: %s regex: %s flags: %s result: %s length: %s',"search()",pattern,re,flags,results,result.length);
    }else{
      results="No Results";
    }
  }catch(err){
    // The pattern is typed by the user; report a bad one in the results box
    // instead of letting the action fail. Anything else is rethrown.
    if(!(err instanceof SyntaxError)){
      throw err;
    }
    results='Invalid regex: ' + err.message;
  }
  saveResults({results:escapeHtml(results)});
  return buildRegexTester(getDefaults());
}

/**
 * Rebuilds the card with an empty Results box.
 *
 * Only the in-memory copy of the defaults is blanked; this function does not
 * write anything back with setDefaults().
 *
 * @param {Object} e Action event object (not used).
 * @return {Object} Card from buildRegexTester() with `results` set to ''.
 */
function clearResults(e){
  var settings = getDefaults();
  settings.results = '';
  return buildRegexTester(settings);
}

//Came From: @Kip https://stackoverflow.com/a/4835406/7215091
/**
 * Replaces the characters & < > " ' in `text` with their HTML entities.
 *
 * @param {string} text Text to escape.
 * @return {string} Escaped copy of `text`.
 */
function escapeHtml(text) {
  return text.replace(/[&<>"']/g, function (ch) {
    switch (ch) {
      case '&': return '&amp;';
      case '<': return '&lt;';
      case '>': return '&gt;';
      case '"': return '&quot;';
      default: return '&#039;'; // only "'" remains in the character class
    }
  });
}

// Placeholder: replace this string with the URL of the spreadsheet that holds
// the 'Defaults' sheet. getDefaults() and setDefaults() pass this value to
// SpreadsheetApp.openByUrl().
var RegexTesterAddon_URL='link to a spreadsheet that contain defaults';

Spreadsheet Must have a page named Defaults and it looks like this:

enter image description here

The page only uses columns A and B, and it must have rows for message, Regex, g, i, m and results, as shown in the image. A typical setting for g, i, m is TRUE, FALSE, FALSE, which will get you started. This regex works for your current requirements: <.*q=cats.*>

Cooper
  • 59,616
  • 6
  • 23
  • 54
  • I won't have control over the HTML. It's a Gmail add-on so would be running on whatever the email is – credizian Mar 01 '19 at 14:30
  • Sorry about that, I only tagged the question with it...I'll update the question text with this. – credizian Mar 01 '19 at 16:45
  • Hopefully this solution will help find your required link and many others also. – Cooper Mar 01 '19 at 21:24
  • Woh...there is a lot there for me to digest. Please give me some time to review and try out. BTW would you mind undoing the down vote? – credizian Mar 01 '19 at 21:58
  • :) Fair enough. Hope I can earn it back! – credizian Mar 01 '19 at 22:11
  • Okay so tried to understand your code...don't think I fully did but got the gist and then it struck me that I was underestimating regular expressions. So then I tried to build a simple regular expression (https://regexr.com/49eeh) and execute it on my add-on, but it returns null when it shouldn't. Code in apps script-> "var regex = /(href=(.*))+(>helloworld<\/a>)/gi;" – credizian Mar 02 '19 at 19:38
  • I thought you were looking for cats. It's easy to underestimate regular expressions. – Cooper Mar 02 '19 at 19:50
  • Most of the card is basic CardService stuff for Gmail Addons. I presumed that you already knew that. And the remaining Apps Script mostly just stores and retrieves the defaults to and from a spreadsheet. – Cooper Mar 02 '19 at 19:55
  • How are you supposed to use this? I created a sheet with a tab called `Defaults` and then `Extensions` -> `App scripts` - pasted the code but when I run it I got an error message `TypeError: Cannot read properties of undefined (reading 'messageMetadata')`??? How are you supposed to use this? – d-b Jul 13 '23 at 20:03
  • My use case is that I receive a specific mail many times a day. In each mail there is a link that needs to be opened to verify that I have received the mail. The simplest solution would be to perform a search in Gmail and extract that link from all the messages that are returned and then feed that list of links into curl/wget. But first of all I, obviously, need to make the link extraction work. – d-b Jul 13 '23 at 20:06