I am using the Simple HTML DOM library to get the HTML structure of a web page. I am also fetching all the external CSS files that the page uses. Here is the code:
Class MyClass {
//... Rest of irrelevant code
/**
 * Request a URL through cURL and hand back the transfer details with the body.
 *
 * The request is sent as a POST with an empty field list, follows at most
 * 10 redirects, reads and stores cookies in "cookie.txt", and allows
 * 120 seconds each for connecting and for the complete transfer.
 *
 * @param string $url Address to request.
 * @return array The curl_getinfo() data extended with three keys:
 *               'errno' (curl_errno()), 'errmsg' (curl_error()) and
 *               'content' (whatever curl_exec() returned).
 */
private function get_web_page($url)
{
    $curl_options = array(
        // Request method and (empty) request body.
        CURLOPT_CUSTOMREQUEST  => "POST",
        CURLOPT_POST           => true,
        CURLOPT_POSTFIELDS     => array(),
        // Identify as a desktop Firefox build.
        CURLOPT_USERAGENT      => 'Mozilla/5.0 (Windows NT 6.1; rv:8.0) Gecko/20100101 Firefox/8.0',
        // The same file is used to read and to persist cookies.
        CURLOPT_COOKIEFILE     => "cookie.txt",
        CURLOPT_COOKIEJAR      => "cookie.txt",
        // Return the body from curl_exec() instead of printing it, without headers.
        CURLOPT_RETURNTRANSFER => true,
        CURLOPT_BINARYTRANSFER => true,
        CURLOPT_HEADER         => false,
        // Redirect handling.
        CURLOPT_FOLLOWLOCATION => true,
        // An empty string makes cURL accept every encoding it supports.
        CURLOPT_ENCODING       => "",
        CURLOPT_AUTOREFERER    => true,
        // Time limits, in seconds.
        CURLOPT_CONNECTTIMEOUT => 120,
        CURLOPT_TIMEOUT        => 120,
        CURLOPT_MAXREDIRS      => 10,
    );

    $handle = curl_init($url);
    curl_setopt_array($handle, $curl_options);
    $body = curl_exec($handle);

    // Collect everything that needs the handle before it is closed.
    $result        = curl_getinfo($handle);
    $error_code    = curl_errno($handle);
    $error_message = curl_error($handle);
    curl_close($handle);

    $result['errno']   = $error_code;
    $result['errmsg']  = $error_message;
    $result['content'] = $body;

    return $result;
}
/**
 * Download every external stylesheet the document links to.
 *
 * Only <link> elements whose rel attribute mentions "stylesheet" are used;
 * icons, feeds, canonical links and the like are ignored. Each href is
 * treated as an absolute URL (relative and protocol-relative hrefs are not
 * resolved against $url).
 *
 * @param string $url  Address of the page the markup came from (currently unused).
 * @param object $html Parsed document exposing find(), e.g. a Simple HTML DOM object.
 * @return array Stylesheet bodies in document order; sheets that could not
 *               be fetched are left out.
 */
private function collect_css($url,$html)
{
    $css = array();
    foreach ($html->find('link') as $e) {
        // rel may hold several tokens ("alternate stylesheet") in any letter case.
        if (stripos((string) $e->rel, 'stylesheet') === false) {
            continue;
        }

        // Pass the href, not the element object, and undo entity encoding
        // such as "&amp;" that the attribute carries in the page source.
        $href = trim(html_entity_decode((string) $e->href, ENT_QUOTES, 'UTF-8'));
        if ($href === '') {
            continue;
        }

        $sheet = file_get_contents($href);
        // file_get_contents() returns false on failure; do not store that as CSS.
        if ($sheet !== false) {
            $css[] = $sheet;
        }
    }
    return $css;
}
/**
 * Collect the CSS embedded in the document's <style> elements.
 *
 * @param string $url  Address of the page the markup came from (currently unused).
 * @param object $html Parsed document exposing find(), e.g. a Simple HTML DOM object.
 * @return array One entry per <style> element, in document order, holding
 *               that element's inner text.
 */
private function collect_inlinecss($url,$html)
{
    $css = array();
    foreach ($html->find('style') as $e) {
        // Append: a plain assignment would replace the array with the last block only.
        $css[] = $e->innertext;
    }
    return $css;
}
/**
 * Reduce CSS to the declarations whose property is on a whitelist.
 *
 * Rules that keep no declaration are dropped, and so are comments,
 * at-statements such as @import / @charset, and at-rule blocks (@media, ...)
 * that end up empty. Example with the default whitelist:
 *   "#selector{ display: block; color: blue }"  becomes  "#selector{\n    display: block;\n}"
 *
 * Limitations of this regex-based approach: a "{" or "}" inside a string or
 * url(), and a ";" inside a value (e.g. a data: URI), are not understood.
 * Use a real CSS parser if the input may contain them.
 *
 * @param string|array $css     One stylesheet, or an array of stylesheets.
 * @param array        $allowed Property names to keep (matched case-insensitively).
 * @return string|array Filtered stylesheet; for array input, an array with the
 *                      same keys holding each filtered stylesheet.
 * @throws \RuntimeException When PCRE fails while processing the input.
 */
private function filter_css($css, $allowed = array('display', 'position'))
{
    // index() hands over the array built by collect_css(); filter each sheet on its own.
    if (is_array($css)) {
        $filtered = array();
        foreach ($css as $key => $sheet) {
            $filtered[$key] = $this->filter_css($sheet, $allowed);
        }
        return $filtered;
    }

    // Property names are case-insensitive in CSS, so compare them in lower case.
    $wanted = array();
    foreach ((array) $allowed as $property) {
        $wanted[strtolower(trim($property))] = true;
    }

    // preg_* functions return null when PCRE gives up; surface that instead of returning garbage.
    $checked = function ($result) {
        if ($result === null) {
            throw new \RuntimeException('CSS filtering failed, PCRE error code ' . preg_last_error());
        }
        return $result;
    };

    // 1. Remove comments so commented-out rules are not mistaken for live ones.
    $css = $checked(preg_replace('~/\*[^*]*\*+(?:[^/*][^*]*\*+)*/~', '', (string) $css));

    // 2. Rewrite every innermost "prelude { declarations }" block. Text outside
    //    those blocks (the shell of an enclosing @media, for instance) stays put.
    $css = $checked(preg_replace_callback(
        '~([^{};]++)\{([^{}]*+)\}~',
        function ($rule) use ($wanted) {
            $kept = array();
            foreach (explode(';', $rule[2]) as $declaration) {
                // Split on the first colon only: values such as url(http://...) contain more.
                $parts = explode(':', $declaration, 2);
                if (count($parts) !== 2) {
                    continue;
                }
                $name  = trim($parts[0]);
                $value = trim($parts[1]);
                if ($value !== '' && isset($wanted[strtolower($name)])) {
                    $kept[] = '    ' . $name . ': ' . $value . ';';
                }
            }
            if (!$kept) {
                return '';
            }
            return trim($rule[1]) . "{\n" . implode("\n", $kept) . "\n}\n";
        },
        $css
    ));

    // 3. Drop block-less at-statements (@import, @charset, ...). This runs after
    //    step 2 so that an "@" inside a discarded declaration value cannot be hit.
    $css = $checked(preg_replace('~(?<![^};])\s*@[^{};]++;~', '', $css));

    // 4. Remove at-rule shells that lost all their rules; loop because
    //    emptying an inner block can leave its parent empty too.
    do {
        $css = $checked(preg_replace('~@[^{};]++\{\s*\}\s*~', '', $css, -1, $removed));
    } while ($removed > 0);

    return trim($css);
}
/**
 * Fetch the demo page, gather its external and inline CSS, filter it and
 * dump the result.
 *
 * @throws \RuntimeException When the page cannot be downloaded or parsed.
 */
public function index(){
    $url = "http://www.example.com";
    $raw = $this->get_web_page($url);
    // get_web_page() reports cURL failures via 'errno'/'errmsg'; on failure 'content' is not a string.
    if ($raw['errno'] !== 0 || !is_string($raw['content'])) {
        throw new \RuntimeException('Could not fetch ' . $url . ': ' . $raw['errmsg']);
    }

    $html = str_get_html($raw['content']); //Get only HTML content using Simple HTML Dom Lib
    // str_get_html() yields false instead of a document for input it refuses (e.g. an empty body).
    if (!$html) {
        throw new \RuntimeException('Could not parse the HTML returned by ' . $url);
    }

    $css = $this->collect_css($url,$html); //Get all external CSS files of webpage
    $css_inline = $this->collect_inlinecss($url,$html); //Get inline CSS (<style>....</style>)
    // Filter external and inline CSS together; the casts keep this safe
    // whatever shape the collectors return.
    $css_filtered = $this->filter_css(array_merge((array) $css, (array) $css_inline));
    var_dump($css_filtered); //See next for how I want it to look like
}
The var_dump output must contain the stripped CSS. For the sample input CSS below, the desired output should look like this:
Input CSS (the input for the filter function):
#id{
display: block;
color: blue;
padding: 0 5px;
}
#id2{
background: Yellow;
margin: 0px;
position: relative;
}
#id3{ float: left; }
Output CSS (the expected result from var_dump):
/* I wish to strip off every style except 'display' and 'position' */
#id{
display: block;
}
#id2{
position: relative;
}
Can anyone point me in the right direction? I know that a regex could help here, but I am not good with regular expressions, nor do I know of any good libraries for this. PS: To those who would say that I did not search before asking: I have spent an hour going through questions like this, this, but could not find any decent solution. Please help.
Thanks