SCENARIO
- I have a Puppeteer script which takes a URL and a JSON object as command-line arguments.
- It is called from a PHP script embedded in an HTML file.
- The Puppeteer script goes to the URL, fetches the page content and console.log's it, so the content becomes available in the HTML page containing the above-mentioned PHP script.
PROBLEM
- On Windows it runs perfectly and gives the desired output. However, I have now moved the project to Ubuntu, and that is where the trouble begins.
- The content does not load; it ends with a blank page.
- I ran the Puppeteer script from the console and it works perfectly, logging out the page content. BUT when I call it from the PHP script using system(), it does not. Yet on Windows it works perfectly even from the PHP script.
Here's my PHP code:
<?php
if (isset($_POST['url'])) {
    $split_url = str_replace('.', '_', explode('/', $_POST['url']));
    $dir = "site_config/";
    $content = '';
    if (is_dir($dir)) {
        if ($dh = opendir($dir)) {
            while (($file = readdir($dh)) !== false) {
                $filename = str_replace('.txt', '', $file);
                if ($filename === $split_url[2]) {
                    $content = file_get_contents($dir.$split_url[2].'.txt');
                }
            }
            closedir($dh);
        }
    }
    echo '<script>document.getElementById("website-input-form").style.display = "none";</script>';
    /* to run with phantomjs */
    //system('phantomjs get_page_phantomjs.js "'.$_REQUEST['url'].'" '.$content);
    /* to run with puppeteer */
    system('node get_page_puppeteer.js "'.$_REQUEST['url'].'" '.$content);
    // system('node sample.js "'.$_REQUEST['url'].'" '.$content);
}
?>
You can see in the last (commented) line that I am also running another sample Node.js script. It executed perfectly.
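For reference, the sample script was trivial. The original isn't shown here, but a minimal stand-in (a hypothetical reconstruction, not the exact file) that just echoes the same two arguments back would look like this:

// sample.js - minimal stand-in used to confirm that node itself runs fine
// when launched from PHP via system(). (Hypothetical reconstruction; the
// real sample.js is not included in this question.)
const url = process.argv[2];  // url passed by the PHP system() call
const json = process.argv[3]; // config content passed by the PHP system() call

console.log('url: ' + url);
console.log('config: ' + json);

Because this only prints to stdout, its output shows up in the HTML exactly the way the Puppeteer script's output is supposed to.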
So I don't know; maybe something's wrong with the Puppeteer script? Here it is:
const puppeteer = require('puppeteer');

const url = process.argv[2];  //url from command line argument
const json = process.argv[3]; //config content from command line argument

/*_________________________STEP 1____________________________________*/
async function run() {
    const browser = await puppeteer.launch();
    const page = await browser.newPage();
    await page.goto(url, {
        waitUntil: 'networkidle2',
        timeout: 3000000
    });
    await page.evaluate(function (json) {
        //removing unwanted elements from html content
        Array.prototype.slice.call(document.getElementsByTagName("script")).filter(function (script) {
            return script.type != "application/ld+json";
        }).forEach(function (script) {
            script.parentNode.removeChild(script);
        });
        Array.prototype.slice.call(document.getElementsByTagName("style")).filter(function (style) {
            return style.type != "application/ld+json";
        }).forEach(function (style) {
            style.parentNode.removeChild(style);
        });
        Array.prototype.slice.call(document.getElementsByTagName("iframe")).filter(function (iframe) {
            return iframe.type != "application/ld+json";
        }).forEach(function (iframe) {
            iframe.parentNode.removeChild(iframe);
        });
        Array.prototype.slice.call(document.getElementsByTagName("video")).filter(function (video) {
            return video.type != "application/ld+json";
        }).forEach(function (video) {
            video.parentNode.removeChild(video);
        });
        Array.prototype.slice.call(document.getElementsByTagName("img")).filter(function (img) {
            img.setAttribute('style', 'max-width: 50% !important;');
            return img.src.endsWith('.svg') === true;
        }).forEach(function (img) {
            img.parentNode.removeChild(img);
        });
        //providing the site's config through an element
        var inp = document.createElement('div');
        inp.setAttribute('textcontent', json);
        inp.setAttribute('id', 'config_available');
        var XMLS = new XMLSerializer();
        var inp_xmls = XMLS.serializeToString(inp);
        document.body.insertAdjacentHTML('afterbegin', inp_xmls);
        //injecting the logic script
        inp = document.createElement('script');
        inp.setAttribute('src', './scraperJavascript.js');
        inp.setAttribute('type', 'text/javascript');
        XMLS = new XMLSerializer();
        inp_xmls = XMLS.serializeToString(inp);
        document.body.insertAdjacentHTML('afterbegin', inp_xmls);
    }, json);
    //rendering page's html
    const renderedContent = await page.evaluate(() => new XMLSerializer().serializeToString(document));
    console.log(renderedContent);
    await browser.close();
}
run();
But if there were something wrong with the script, why would it run successfully from the console (on both Ubuntu and Windows) and from the PHP script (on Windows), but not from the PHP script (on Ubuntu)?
UPDATE
I ran an exception check on the Puppeteer end (sketched below the trace). An exception does occur, and this is its message:
Error: Failed to launch chrome!
[0608/095818.625603:ERROR:icu_util.cc(133)] Invalid file descriptor to ICU data received.
[0608/095818.625662:FATAL:content_main_delegate.cc(57)] Check failed: false.
#0 0x55dc5336182c base::debug::StackTrace::StackTrace()
#1 0x55dc532e8290 logging::LogMessage::~LogMessage()
#2 0x55dc51598de3 content::ContentMainDelegate::TerminateForFatalInitializationError()
#3 0x55dc53017941 content::ContentMainRunnerImpl::Initialize()
#4 0x55dc53021c12 service_manager::Main()
#5 0x55dc53016184 content::ContentMain()
#6 0x55dc571eea39 headless::(anonymous namespace)::RunContentMain()
#7 0x55dc571eeac2 headless::HeadlessBrowserMain()
#8 0x55dc5301ef8f headless::HeadlessShellMain()
#9 0x55dc515971ac ChromeMain
#10 0x7f5204329830 __libc_start_main
#11 0x55dc5159702a _start

TROUBLESHOOTING: https://github.com/GoogleChrome/puppeteer/blob/master/docs/troubleshooting.md

    at onClose (/var/www/html/master/scraper_puppeteer/node_modules/puppeteer/lib/Launcher.js:255:14)
    at Interface.helper.addEventListener (/var/www/html/master/scraper_puppeteer/node_modules/puppeteer/lib/Launcher.js:244:50)
    at emitNone (events.js:111:20)
    at Interface.emit (events.js:208:7)
    at Interface.close (readline.js:370:8)
    at Socket.onend (readline.js:149:10)
    at emitNone (events.js:111:20)
    at Socket.emit (events.js:208:7)
    at endReadableNT (_stream_readable.js:1055:12)
    at _combinedTickCallback (internal/process/next_tick.js:138:11)
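The exception check itself was essentially just catching the rejection of run(). A minimal sketch of how it can be wired (my assumption of the wiring; the exact check isn't shown here) replaces the bare run() call at the bottom of get_page_puppeteer.js:

// Catch any launch/navigation failure instead of letting the rejection go
// unhandled, which is presumably why the page came back blank from PHP.
run().catch(function (err) {
    // When invoked via PHP's system(), this message ends up in the generated
    // HTML, which is one way to capture a trace like the one above.
    console.log('Puppeteer error: ' + err.message);
});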