-2

Hello, I would like help creating a regex that removes all HTML tags, but inserts a space wherever a closing div is immediately followed by an opening div. For example, this input:

This <b>is</b> <div>a</div><div>test</div>

This is a test

The regex I currently have is /(<([^>]+)>)/ig, which removes all HTML tags, but I'm wondering how to also add a space whenever a closing div and an opening div are next to each other.

I tried using /(<([^>]+)>)/ig to strip the HTML, which works, but I need help with the spacing between divs when they are next to each other.

  • 2
    see: [Why it's not possible to use regex to parse HTML/XML: a formal explanation in layman's terms](https://stackoverflow.com/questions/6751105/why-its-not-possible-to-use-regex-to-parse-html-xml-a-formal-explanation-in-la) – pilchard May 10 '23 at 21:05
  • 1
    https://stackoverflow.com/a/1732454/7387397 – Sysix May 10 '23 at 21:06
  • 1
    What about replacing `(?: *<[^>]+>)+` with a space – Jerry Jeremiah May 10 '23 at 22:00
  • @pilchard Do you happen to know how to parse HTML tags with a parser ? Most parsers use regex for this. – sln May 10 '23 at 22:51

2 Answers2

1

JS has built-in support for HTML parsing. Use that instead:

/**
 * Extract the text of an HTML fragment, one piece per top-level node,
 * with the non-empty pieces separated by single spaces.
 *
 * @param {string} html - Markup to parse.
 * @returns {string} Trimmed text of each top-level node, joined with ' '.
 */
function getSpaceSeparatedText(html) {
  // A detached <div> acts as the parser: assigning innerHTML builds its child nodes.
  // NOTE(review): innerHTML parses arbitrary markup — confirm `html` is trusted
  // or sanitize it before calling this.
  const container = document.createElement('div');
  container.innerHTML = html;

  // Only direct children are visited; each contributes its whole trimmed
  // textContent, and whitespace-only pieces are dropped.
  return Array.from(container.childNodes, (child) => child.textContent.trim())
    .filter((piece) => piece !== '')
    .join(' ');
}

Try it:

// `console.config` is not a standard console method; it presumably comes from the
// stack-snippet-console script this snippet loads — verify `maximize` against that library.
console.config({ maximize: true });

// Returns the trimmed text of each top-level node of `html`, joined by single spaces.
function getSpaceSeparatedText(html) {
  // A detached <div> serves as the HTML parser.
  const container = document.createElement('div');
  container.innerHTML = html;

  // Whitespace-only pieces are skipped so they do not produce doubled spaces.
  return Array.from(container.childNodes, (child) => child.textContent.trim())
    .filter((piece) => piece !== '')
    .join(' ');
}

// Sample markup: text and a <b> element on one line, then two adjacent <div>s.
// Written with explicit escapes so the space after </b> and the surrounding
// newlines are visible.
const html = '\nThis <b>is</b> \n<div>a</div><div>test</div>\n';

// Print the extracted, space-separated text.
const extractedText = getSpaceSeparatedText(html);
console.log(extractedText);
<script src="https://gh-canon.github.io/stack-snippet-console/console.min.js"></script>
InSync
  • 4,851
  • 4
  • 8
  • 30
0

Update: Adding a new group to the top caused an offset by one to the subsequent backreferences.
This has been fixed.

This removes all HTML tags and invisible content (https://regex101.com/r/2ACiDg/1),
but you need a callback to insert a space between a closing div and an opening div.

// Matches one piece of markup to remove. Capture groups:
//   1 - a closing </div> immediately followed by an opening <div ...>; the opening
//       tag may carry attributes, including quoted values that contain '>'
//   2 - name of an element whose whole content is dropped (script, style, object, ...)
//   3 - attribute text of that element (captured in a lookahead, then consumed via \3)
// The remaining alternatives cover plain tags, tags with attributes, <? ... ?>
// processing instructions, and <! ... > declarations (DOCTYPE, CDATA, comments,
// ATTLIST, ENTITY, ELEMENT).
// The `i` flag is set because HTML tag names are case-insensitive, so </DIV><DIV>
// and <SCRIPT> ... </SCRIPT> are handled like their lowercase forms.
const TAG_PATTERN = /(<\/div\s*><div(?:\s(?:"[^"]*"|'[^']*'|[^>"'])*)?>)|<(?:(?:(?:(script|style|object|embed|applet|noframes|noscript|noembed)(?:\s+(?=((?:"[\S\s]*?"|'[\S\s]*?'|(?:(?!\/>)[^>])?)+))\3)?\s*>)[\S\s]*?<\/\2\s*(?=>))|(?:\/?[\w:]+\s*\/?)|(?:[\w:]+\s+(?:"[\S\s]*?"|'[\S\s]*?'|[^>]?)+\s*\/?)|\?[\S\s]*?\?|(?:!(?:(?:DOCTYPE[\S\s]*?)|(?:\[CDATA\[[\S\s]*?\]\])|(?:--[\S\s]*?--)|(?:ATTLIST[\S\s]*?)|(?:ENTITY[\S\s]*?)|(?:ELEMENT[\S\s]*?))))>/gi;

/**
 * Remove markup from a string, putting a single space where a closing div is
 * directly followed by an opening div so their text does not run together.
 *
 * @param {string} markup - Text containing HTML/XML tags.
 * @returns {string} The text with all matched markup removed.
 */
function stripHtmlTags(markup) {
  // Group 1 is defined only for the adjacent-div alternative; every other
  // match is simply deleted.
  return markup.replace(TAG_PATTERN, (match, adjacentDivs) =>
    adjacentDivs === undefined ? '' : ' ',
  );
}

const text = stripHtmlTags("This <b>is</b> <div>a</div><div>test</div>");

console.log(text);
sln
  • 2,071
  • 1
  • 3
  • 11