Trying to properly extract all terms from the text. Looks like when term is inside sentence and term contains ()
it's not splitted and regex couldn't find it.
I'm trying to properly split matches that contain ()
. So Instead of this:
["What is API(Application Programming Interface) and how to use it?"]
I'm trying to get this:
["What is", "API(Application Programming Interface)", "and how to use it?"]
JSON term is properly extracted and I'm getting this:
["JSON", "is a Javascript Object Notation"]
so this is exactly what I want but in case of API I'm not getting this:
["What is", "API(Application Programming Interface)", "and how to use it?"]
I'm getting this and this is not what I want:
["What is API(Application Programming Interface) and how to use it?"]
function getAllTextNodes(element) {
let node;
let nodes = [];
let walk = document.createTreeWalker(element,NodeFilter.SHOW_TEXT,null,false);
while (node = walk.nextNode()) nodes.push(node);
return nodes;
}
const allNodes = getAllTextNodes(document.getElementById("body"))
const terms = [
{id: 1, definition: 'API stands for Application programming Interface', expression: 'API(Application Programming Interface)'},
{id: 2, definition: 'JSON stands for JavaScript Object Notation.', expression: 'JSON'}
]
const termMap = new Map(
[...terms].sort((a, b) => b.expression.length - a.expression.length)
.map(term => [term.expression.toLowerCase(), term])
);
const regex = RegExp("\\b(" + Array.from(termMap.keys()).join("|") + ")\\b", "ig");
for (const node of allNodes) {
const pieces = node.textContent.split(regex).filter(Boolean);
console.log(pieces)
}
<div id="body">
<p>API(Application Programming Interface)</p>
<p>What is API(Application Programming Interface) and how to use it?</p>
<p>JSON is a Javascript Object Notation</p>
</div>