I have code that does exactly this, but it is rather undocumented and uses some code that I do not own but that is in the public domain. It is pretty easy to use: call fix_html to ensure all tags are closed so they do not affect your page, call strip_tags_attributes to limit which tags and attributes may be used, and call strip_javascript to remove JavaScript functionality of any sort. I have used this extensively, but to be honest I do not know whether this particular version is the one from production. For your second question, I guess it is best to remove styles altogether, so users can use <i>
or <b>
as they like. And please don't let anyone use underline.
/**
 * Reads the text value(s) of the direct children of $parent whose node name is $node.
 *
 * Only direct children are inspected (the loop walks $parent->childNodes once,
 * without descending).
 *
 * @param mixed  $parent Node to inspect; anything that is not a DOMElement yields NULL.
 * @param string $node   Child node name to look for (compared with ==).
 * @return string|array|null NULL when $parent is not a DOMElement or no child matches,
 *                           the nodeValue itself when exactly one child matches,
 *                           otherwise the list of nodeValues in child order.
 */
function findNodeValue($parent, $node) {
    if (!($parent instanceof DOMElement)) {
        return NULL;
    }

    // Gather the values straight away instead of keeping the nodes around.
    $values = array();
    foreach ($parent->childNodes as $candidate) {
        if ($candidate->nodeName == $node) {
            $values[] = $candidate->nodeValue;
        }
    }

    switch (count($values)) {
        case 0:
            return NULL;
        case 1:
            // A single hit is returned bare, not wrapped in an array.
            return $values[0];
        default:
            return $values;
    }
}
/**
 * Best-effort removal of script-bearing constructs from an HTML fragment.
 *
 * Passes, in order:
 *  1. href="javascript:..." is rewritten to an onclick attribute (removed in step 6).
 *  2. tags that still contain "javascript" followed by a parenthesised call are rewritten.
 *  3. ":expression(...)" fragments are deleted.
 *  4. <iframe ...> and </iframe> tags are deleted (their content is kept).
 *  5. tags containing ":expr...(...)" are rewritten.
 *  6. every attribute whose name starts with "on" is removed from every opening tag.
 *
 * Step 6 replaces an earlier single regex that required at least one character
 * between "=" and a quote plus a second matching quote; it therefore left
 * onclick="x" (and the onclick=' ... ' produced by step 1) in place and could
 * match across several tags. The replacement walks the attributes of one tag at
 * a time, honouring quoted and unquoted values.
 *
 * NOTE(review): this is regex filtering, not an HTML parser. Treat it as a
 * mitigation and prefer a vetted sanitizer for untrusted input.
 *
 * @param string $filter HTML fragment to clean.
 * @return string|null The filtered fragment. preg_replace()/preg_replace_callback()
 *                     return NULL on a PCRE error and that value is passed through.
 */
function strip_javascript($filter){
    // realign javascript href to onclick
    $filter = preg_replace("/href=(['\"]).*?javascript:(.*)?\\1/i", "onclick=' $2 '", $filter);

    // remove javascript from tags; repeated because one tag may contain several hits
    $scriptInTag = "/<(.*)?javascript.*?\(.*?((?>[^()]+)|(?R)).*?\)?\)(.*)?>/i";
    while ( preg_match($scriptInTag, $filter) ) {
        $filter = preg_replace($scriptInTag, "<$1$3$4$5>", $filter);
    }

    // dump expressions from contributed content
    $filter = preg_replace("/:expression\(.*?((?>[^(.*?)]+)|(?R)).*?\)\)/i", "", $filter);

    // drop iframe tags (opening and closing handled separately)
    $filter = preg_replace("/<iframe.*?>/", "", $filter);
    $filter = preg_replace("/<\/iframe>/", "", $filter);

    $expressionInTag = "/<(.*)?:expr.*?\(.*?((?>[^()]+)|(?R)).*?\)?\)(.*)?>/i";
    while ( preg_match($expressionInTag, $filter) ) {
        $filter = preg_replace($expressionInTag, "<$1$3$4$5>", $filter);
    }

    // remove all on* events
    //
    // The tag pattern consumes whole attributes (name, optional "=", optional
    // quoted or unquoted value) so that a ">" inside a quoted value does not end
    // the tag early. The possessive "*+" keeps matching linear on unterminated tags.
    $openingTag = '/<([a-z][^\s\/>]*)((?:[\s\/]+|[^\s\/>][^\s\/>=]*(?:\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>"\'][^\s>]*)?)?)*+)>/i';
    // Same attribute grammar as above, applied to the attribute section of one tag.
    $attribute = '/[\s\/]*([^\s\/>][^\s\/>=]*)(?:\s*=\s*(?:"[^"]*"|\'[^\']*\'|[^\s>"\'][^\s>]*)?)?/';

    $filter = preg_replace_callback(
        $openingTag,
        function ($tag) use ($attribute) {
            $kept = preg_replace_callback(
                $attribute,
                function ($attr) {
                    // $attr[1] is the attribute name; drop it together with its
                    // leading separator and value when it is an on* handler.
                    return strncasecmp($attr[1], 'on', 2) === 0 ? '' : $attr[0];
                },
                $tag[2]
            );
            return '<' . $tag[1] . $kept . '>';
        },
        $filter
    );

    return $filter;
}
/**
 * Parses an HTML fragment into a nested array structure.
 *
 * Each element of the returned list is either a trimmed, non-empty text run
 * (string) or a tag node:
 *   array('tag' => lower-cased name [, 'val' => html2a(inner html)] [, 'attr' => array(name => value)])
 * 'val' is present only for tags matched together with their closing tag;
 * tags matched on their own (unclosed or self-closing) have no 'val'.
 *
 * Side effect: pcre.backtrack_limit and pcre.recursion_limit are set to 10000
 * via ini_set() on every call (including recursive ones) and are not restored.
 *
 * @param string $html Fragment to parse.
 * @return array|string The node list, or the trimmed input itself when
 *                      preg_match_all() finds no tag (or reports failure).
 */
function html2a ( $html ) {
    ini_set('pcre.backtrack_limit', 10000);
    ini_set('pcre.recursion_limit', 10000);

    $html = trim($html);

    // Alternative 1: <tag attrs>inner</tag> (groups 1-3, recursive on nested tags).
    // Alternative 2: a lone <tag attrs> or <tag attrs/> (groups 4-5).
    $tagPattern = '@\<\s*?(\w+)((?:\b(?:\'[^\']*\'|"[^"]*"|[^\>])*)?)\>((?:(?>[^\<]*)|(?R))*)\<\/\s*?\\1(?:\b[^\>]*)?\>|\<\s*(\w+)(\b(?:\'[^\']*\'|"[^"]*"|[^\>])*)?\/?\>@uxis';
    $attrPattern = '/(\w+)\s*(?:=\s*(?:"([^"]*)"|\'([^\']*)\'|(\w+)))?/usix';

    if ( !preg_match_all( $tagPattern, $html, $m, PREG_OFFSET_CAPTURE | PREG_SET_ORDER ) ) {
        return $html;
    }

    $cursor = 0;        // byte offset just past the previously consumed match
    $nodes  = array();

    foreach ($m as $set) {
        $matchStart = $set[0][1];

        // Text sitting between the previous match and this one.
        $between = trim( substr($html, $cursor, $matchStart - $cursor) );
        if ( strlen($between) ) {
            $nodes[] = $between;
        }

        // A negative offset on group 1 means the lone-tag alternative matched.
        if ( $set[1][1] < 0 ) {
            $node = array( 'tag' => strtolower($set[4][0]) );
        } else {
            $node = array( 'tag' => strtolower($set[1][0]), 'val' => html2a($set[3][0]) );
        }

        // Attribute text lives in group 5 for lone tags, group 2 otherwise.
        if ( isset($set[5]) && $set[2][1] < 0 ) {
            $attrSource = $set[5][0];
        } else {
            $attrSource = $set[2][0];
        }

        if ( preg_match_all( $attrPattern, $attrSource, $attrs, PREG_SET_ORDER ) ) {
            foreach ($attrs as $a) {
                // The last captured group is the value; for a bare attribute
                // that is the name itself.
                $node['attr'][$a[1]] = $a[count($a)-1];
            }
        }

        $nodes[] = $node;
        $cursor = $matchStart + strlen( $set[0][0] );
    }

    // Trailing text after the final match.
    $length = strlen($html);
    if ( $cursor < $length ) {
        $tail = trim( substr( $html, $cursor, $length - $cursor ) );
        if ( strlen($tail) ) {
            $nodes[] = $tail;
        }
    }

    return $nodes;
}
/**
 * Serialises a node structure (as produced by html2a) back to an HTML string.
 *
 * Tag nodes with a 'val' key are written as "<tag attrs>\n" + content + "</tag>";
 * tag nodes without 'val' are written self-closed as "<tag attrs/>".
 * Attribute values are wrapped in double quotes, or in single quotes when the
 * value itself contains a double quote. Values are not escaped.
 *
 * Changes from the previous version:
 *  - the scalar branch used empty(), which discarded the text "0" (so
 *    <b>0</b> lost its content); only null, false and '' are now treated as empty.
 *  - "${k}" interpolation (deprecated since PHP 8.2) is replaced by concatenation.
 *
 * @param array|string|null $a  Node list, or a plain text value.
 * @param string            $in Prefix written before every text run and tag.
 * @return string
 */
function a2html ( $a, $in = "" ) {
    if ( !is_array($a) ) {
        if ( $a === null || $a === false || $a === '' ) {
            return "";
        }
        return $in.$a;
    }

    $s = "";
    foreach ($a as $t) {
        if ( !is_array($t) ) {
            // plain text run
            $s .= $in.$t;
            continue;
        }

        $attrs = "";
        if ( isset($t['attr']) ) {
            foreach ( $t['attr'] as $k => $v ) {
                // pick the quote character that does not clash with the value
                $quote = strpos( $v, '"' ) !== false ? "'" : '"';
                $attrs .= " ".$k."=".$quote.$v.$quote;
            }
        }

        if ( isset($t['val']) ) {
            $s .= $in."<".$t['tag'].$attrs.">\n".a2html( $t['val'], $in ).$in."</".$t['tag'].">";
        } else {
            $s .= $in."<".$t['tag'].$attrs."/>";
        }
    }

    return $s;
}
/**
 * Removes, in place, every tag node that has no 'val' entry unless its tag
 * name is listed in $allowunclosed; recurses into array-valued 'val' entries.
 *
 * Keys of removed entries are not renumbered. Non-array input is left untouched.
 *
 * @param mixed $a             Node list (modified by reference).
 * @param array $allowunclosed Tag names that may stay without a 'val' entry.
 * @return void
 */
function remove_unclosed(&$a, $allowunclosed) {
    if (!is_array($a)) {
        return;
    }

    foreach ($a as $key => $node) {
        // text runs are plain strings; only tag nodes are arrays
        if (!is_array($node)) {
            continue;
        }

        if (isset($node["val"])) {
            // closed tag: keep it and clean its children
            if (is_array($node["val"])) {
                remove_unclosed($a[$key]["val"], $allowunclosed);
            }
            continue;
        }

        if (!in_array($node["tag"], $allowunclosed)) {
            unset($a[$key]);
        }
    }
}
/**
 * Round-trips an HTML fragment through html2a() and a2html(), dropping tags
 * that were found without a closing tag along the way.
 *
 * @param string $html          Fragment to repair.
 * @param array  $allowunclosed Tag names allowed to stay unclosed (default: only "br").
 * @return string The re-serialised fragment.
 */
function fix_html($html, $allowunclosed=array("br")) {
    $tree = html2a($html);

    // remove_unclosed() edits $tree in place
    remove_unclosed($tree, $allowunclosed);

    return a2html($tree);
}
/**
 * strip_tags() wrapper that keeps text following a stray "<".
 *
 * The input is split on "<". A piece that contains no ">" cannot be a tag, so
 * its "<" is written back as the entity "&lt;". Pieces that do contain ">"
 * keep their "<" and are left for strip_tags() to judge.
 *
 * Previously both branches of the check re-inserted a raw "<", which made the
 * check a no-op and let strip_tags() discard the text after a lone "<"
 * (e.g. "1<2"). The check also used !strpos(), which misreads a ">" at offset 0.
 *
 * @param string $str       Text to filter.
 * @param mixed  $allowtags Allowed tags, passed unchanged to strip_tags().
 * @return string
 */
function strip_tags_ex($str,$allowtags) {
    $pieces = explode('<', (string)$str);
    $res = array_shift($pieces);

    foreach ($pieces as $piece) {
        $opener = strpos($piece, '>') === false ? '&lt;' : '<';
        $res .= $opener.$piece;
    }

    return strip_tags($res, $allowtags);
}
/**
 * Runs strip_javascript() and strip_tags_ex() on $string, then removes quoted
 * attributes that are not on the allow list.
 *
 * An attribute is removed when it appears inside a tag as
 * " name="..."" or " name='...'" (single leading space, quoted value) and its
 * name does not END with one of the allowed names (the check is a negative
 * lookbehind, so it is a suffix match, case-insensitive). Attributes with
 * unquoted values are not touched by this step. An empty allow list removes
 * every quoted attribute; NULL skips attribute filtering altogether.
 *
 * Changes from the previous version: the callback was built with
 * create_function(), which was removed in PHP 8.0 and spliced the attribute
 * names into evaluated PHP source. It is now a closure, and the names are
 * passed through preg_quote() before entering the pattern.
 *
 * NOTE(review): the defaults reference the constants `allowedtags` and
 * `allowedattributes`, which are not defined in this file section; presumably
 * they are defined elsewhere, so confirm before relying on the defaults.
 *
 * @param string            $string          HTML fragment to filter.
 * @param mixed             $allowtags       Allowed tags, forwarded to strip_tags_ex().
 * @param array|string|null $allowattributes Allowed attribute names (array or
 *                                           comma-separated string), or NULL.
 * @return string|null Filtered fragment (NULL if a PCRE call fails).
 */
function strip_tags_attributes($string,$allowtags=allowedtags,$allowattributes=allowedattributes){
    $string = strip_javascript($string);
    $string = strip_tags_ex($string, $allowtags);

    if (is_null($allowattributes)) {
        return $string;
    }

    if (!is_array($allowattributes)) {
        $allowattributes = explode(",", $allowattributes);
    }

    // One negative lookbehind per allowed name: "(?<!href)(?<!src)".
    $quotedNames = array();
    foreach ($allowattributes as $name) {
        $quotedNames[] = preg_quote($name, '/');
    }
    $lookbehinds = implode(")(?<!", $quotedNames);
    if (strlen($lookbehinds) > 0) {
        $lookbehinds = "(?<!".$lookbehinds.")";
    }

    $attributePattern = '/ [^ =]*'.$lookbehinds.'=("[^"]*"|\'[^\']*\')/i';

    // Apply the attribute filter to each tag separately so text is never altered.
    return preg_replace_callback(
        "/<[^>]*>/i",
        function ($matches) use ($attributePattern) {
            return preg_replace($attributePattern, "", $matches[0]);
        },
        $string
    );
}
I found the source for strip_javascript: http://www.php.net/manual/en/function.strip-tags.php#89453. I don't know why the attribution is not already in the code; probably because the comment has no name, no email and no identity to refer to.
` or `- ` element, because changing it to a `` or `` tag would change how it works. Finally, I would point out that the `<u>` tag is deprecated; it is recommended to use the `text-decoration:underline` style instead.
– Spudley Jul 30 '11 at 20:18