8

I am using PHP 7.1.33 and "fabpot/goutte": "^3.2". My composer file looks like the following:

{
    "name": "ubuntu/workspace",
    "require": {
        "fabpot/goutte": "^3.2"
    },
    "authors": [
        {
            "name": "admin",
            "email": "admin@admin.com"
        }
    ]
}

I am trying to get details by a time range from a webpage but struggle how to pass the $crawler-values to my final result array $res1Array.

I tried the following:

<?php
require 'vendor/autoload.php';

use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Crawls Detail Calender
 * Does NOT also include wanted Date in the final result set
 * @param $wantedDate
 * @return array
 */
function updateCalendarDetailsData($wantedDate)
{
    try {
        $client = new Client();

        /*
        $x = 1;
        $LIMIT = 3;
        global $x;
        global $LIMIT;
        $x++;
        */
        $res1Array = array();

        $ffUrlArr = ["https://www.forexfactory.com/calendar.php?month=Jan2020"];
        foreach ($ffUrlArr as $key => $v) {

            try {
                $crawler = $client->request('GET', $ffUrlArr[$key]);
            } catch (\Exception $ex) {
                error_log($ex);
            }

            $TEMP = array();

            // $count = $crawler->filter('.calendar_row')->count();
            // $i = 1; // count starts at 1
            $nodeDate = date('Y-m-d');
            $crawler->filter('.calendar_row')->each(function ($node) use (&$res1Array, $wantedDate, $nodeDate) { // $count, $i,
                $EVENT = array();

                // check date for month
                $dayMonth = str_split(explode(" ", trim($node->getNode(0)->nodeValue))[0], 3);
                $day = explode(" ", trim($node->getNode(0)->nodeValue))[1];
                if (is_numeric($day)) {
                    $nodeDate = date("Y-m-d H:i:s", strtotime($dayMonth[0] . " " . $dayMonth[1] . " " . $day));
                }

                // return if wanted date is reached
                if (date("Y-m-d", strtotime($nodeDate)) == date("Y-m-d", strtotime($wantedDate))) {
                    return $res1Array;
                }

                $EVENTID = $node->attr('data-eventid');

                $API_RESPONSE = file_get_contents('https://www.forexfactory.com/flex.php?do=ajax&contentType=Content&flex=calendar_mainCal&details=' . $EVENTID);

                $API_RESPONSE = str_replace("<![CDATA[", "", $API_RESPONSE);
                $API_RESPONSE = str_replace("]]>", "", $API_RESPONSE);

                $html = <<<HTML
<!DOCTYPE html>
<html>
    <body>
       $API_RESPONSE
    </body>
</html>
HTML;

                $subcrawler = new Crawler($html);

                $subcrawler->filter('.calendarspecs__spec')->each(function ($LEFT_TD) use (&$res1Array, &$TEMP, &$EVENT) {

                    $LEFT_TD_INNER_TEXT = trim($LEFT_TD->text());

                    if ($LEFT_TD_INNER_TEXT == "Source") {

                        $TEMP = array();
                        $LEFT_TD->nextAll()->filter('a')->each(function ($LINK) use (&$TEMP) {
                            array_push($TEMP, $LINK->text(), $LINK->attr('href'));
                        });

                        $EVENT['sourceTEXT'] = $TEMP[0];
                        $EVENT['sourceURL'] = $TEMP[1];
                        $EVENT['latestURL'] = $TEMP[3];
                    }

                    if ($LEFT_TD_INNER_TEXT == "Measures") {
                        $EVENT['measures'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Usual Effect") {
                        $EVENT['usual_effect'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Frequency") {
                        $EVENT['frequency'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Why Traders") {
                        $EVENT['why_traders_care'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Derived Via") {
                        $EVENT['derived_via'] = $LEFT_TD->nextAll()->text();
                        // array_push($res1Array, $EVENT); // <---- HERE I GET THE ERROR!
                    }
                });
                /*
                $i++;
                if ($i > $count) {
                    echo "<pre>";
                    var_dump($res1Array);
                    print_r($res1Array);
                    echo "</pre>";
                    exit;
                }
                */
            });
        }
    } catch (\Exception $ex) {
        error_log($ex);
    }
    return $res1Array;
}

var_dump(updateCalendarDetailsData(date("2020-01-02")));

As you can see I am trying to create an $EVENT and push all wanted values as key value pairs inside. When I am finished I want to push it to the $resArray getting the following structure (values in this array() are just for structural purpose):

[
    sourceTEXT => "test", 
    sourceURL => "test",
    latestURL => "test", 
    measures => "test",
    usual_effect => "test",
    derived_via => "test",
    why_traders_care => "test",
    frequency => "test"
],
[
    sourceTEXT => "test1", 
    sourceURL => "test1",
    latestURL => "test1", 
    measures => "test1",
    usual_effect => "test1",
    derived_via => "test1",
    why_traders_care => "test1",
    frequency => "test1"
],
[
    sourceTEXT => "test2", 
    sourceURL => "test2",
    latestURL => "test2", 
    measures => "test2",
    usual_effect => "test2",
    derived_via => "test2",
    why_traders_care => "test2",
    frequency => "test2"
], 
// ... 

I currently get nothing back in my $res1Array.

I highly appreciate your replies!

UPDATE

I ran the script from @tftd with "fabpot/goutte": "^4.0" however I got this:

array(94) {
  [0] =>
  array(10) {
    'eventId' =>
    string(6) "114340"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
  [1] =>
  array(10) {
    'eventId' =>
    string(6) "114341"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
  [2] =>
  array(10) {
    'eventId' =>
    string(6) "114342"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
  [3] =>
  array(10) {
    'eventId' =>
    string(6) "114343"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
  [4] =>
  array(10) {
    'eventId' =>
    string(6) "114328"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
  [5] =>
  array(10) {
    'eventId' =>
    string(6) "113632"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
  [6] =>
  array(10) {
    'eventId' =>
    string(6) "114308"
    'date' =>
    string(10) "2020-01-01"
    'sourceTEXT' =>
    NULL
    'sourceURL' =>
    NULL
    'latestURL' =>
    NULL
    'measures' =>
    NULL
    'usual_effect' =>
    NULL
    'derived_via' =>
    NULL
    'why_traders_care' =>
    NULL
    'frequency' =>
    NULL
  }
// ...

Any suggestions why I get all this null values?

Carol.Kar
  • 4,581
  • 36
  • 131
  • 264
  • 1
    From what I understand, you want to parse the table rows for a specific date i.e. `2020-01-02` into an array which contains the row data. Is that correct? – tftd Jan 10 '20 at 13:16
  • @tftd I want to parse the table rows from today until a specific date in the future `now() - 2020/01-18`(which is not part of the above example, as it starts in the beginning on goes until a certain date, however, I could jus skip unwanted rows). My big problem is that I get an emtpy array back. Please provide a fully working example. – Carol.Kar Jan 10 '20 at 14:53
  • @Anna.Klee, regarding you "UPDATE" to the question: are you sure the links you are using are real APIs, not some test endpoints? Real API endpoints usually require some credential. And not only a lot of fields from "forexfactory.com/flex.php..." are empty, but also what you can see in a browser when you visit "https://www.forexfactory.com/calendar.php?month=Jan2020" differs from what you can get with a `file_get_contents()` or `$client->request()`. E.g. try to find event id 113606 – x00 Jan 15 '20 at 02:08

3 Answers3

2

I am still working through the code you provided, but one of the first things I'm noticing is right before you are setting $API_RESPONSE, you have the following lines of code...

// return if wanted date is reached
if (date("Y-m-d", strtotime($nodeDate)) == date("Y-m-d", strtotime($wantedDate))) {
  return $res1Array;
}

At that point in the function, you have yet to pushed any data to $res1Array, so it would return just an empty array. It isn't until the $subcrawler (and the second attempt to return $res1Array) in which you are actually pushing information into the array.

Note: I will update my answer once I work through the rest of the code, in hopes to provide you a more complete resolution to your issue.

morsecodemedia
  • 317
  • 2
  • 8
  • BTW, this is a closure function. It won't actually return anything and won't `break` the `foreach` cycle. It will only "skip" the execution of the rest of the function code. – tftd Jan 09 '20 at 16:21
  • Thx for your answer! Please include a fully workable example. – Carol.Kar Jan 09 '20 at 17:10
2

I took the liberty of rewriting your code a bit using OOP instead of leaving it functional because it's much easier to focus on smaller bits of the code. It should be easy to convert it to functional coding, should you need it.

This class takes a date which is formatted into Jan2020 to be able to get the calendar.

 $parser = new CalendarParser(date_create());

To get the events for a date range within the calendar records - you need to call $parser->getEventsBetweenDates() with a startDate and an endDate. The hours are not taken into account while parsing, but you can add it if you need it. Here's an example:

$parser->getEventsBetweenDates(
   date_create_from_format('Y-m-d H:i:s', '2020-01-01 00:00:00'),
   date_create_from_format('Y-m-d H:i:s', '2020-01-02 23:59:59')
)

The result of the above code is:

<!-- language: lang-none -->

array(22) { 
  [0] => array(10) {
    'eventId' => string(6) "114340"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [1] => array(10) {
    'eventId' => string(6) "114341"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [2] => array(10) {
    'eventId' => string(6) "114342"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [3] => array(10) {
    'eventId' => string(6) "114343"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [4] => array(10) {
    'eventId' => string(6) "114328"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [5] => array(10) {
    'eventId' => string(6) "113632"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [6] => array(10) {
    'eventId' => string(6) "114308"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [7] => array(10) {
    'eventId' => string(6) "113607"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [8] => array(10) {
    'eventId' => string(6) "113816"
    'date' => string(10) "2020-01-01"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [9] => array(10) {
    'eventId' => string(6) "114718"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(25) "Reserve Bank of Australia"
    'sourceURL' => string(21) "http://www.rba.gov.au"
    'latestURL' => string(65) "http://www.rba.gov.au/statistics/frequency/commodity-prices/2019/"
    'measures' => string(52) "Change in the selling price of exported commodities;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(120) "The average selling price of the nation's main commodity exports are sampled and then compared to the previous sampling;"
    'why_traders_care' => string(128) "It's a leading indicator of the nation's trade balance with other countries because rising commodity prices boost export income;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [10] => array(10) {
    'eventId' => string(6) "114344"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
  [11] => array(10) {
    'eventId' => string(6) "111383"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 400 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [12] => array(10) {
    'eventId' => string(6) "111382"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 450 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [13] => array(10) {
    'eventId' => string(6) "111379"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 750 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [14] => array(10) {
    'eventId' => string(6) "111380"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 800 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [15] => array(10) {
    'eventId' => string(6) "111381"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(205) "Survey of about 5000 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [16] => array(10) {
    'eventId' => string(6) "111397"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 650 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [17] => array(10) {
    'eventId' => string(6) "111102"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(34) "Challenger, Gray & Christmas, Inc."
    'sourceURL' => string(30) "http://www.challengergray.com/"
    'latestURL' => string(50) "http://www.challengergray.com/press/press-releases"
    'measures' => string(56) "Change in the number of job cuts announced by employers;"
    'usual_effect' => string(51) "'Actual' less than 'Forecast' is good for currency;"
    'derived_via' => NULL
    'why_traders_care' => NULL
    'frequency' => string(52) "Released monthly, about 3 days after the month ends;"
  }
  [18] => array(10) {
    'eventId' => string(6) "110766"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(19) "Department of Labor"
    'sourceURL' => string(18) "http://www.dol.gov"
    'latestURL' => string(20) "https://www.dol.gov/"
    'measures' => string(103) "The number of individuals who filed for unemployment insurance for the first time during the past week;"
    'usual_effect' => string(51) "'Actual' less than 'Forecast' is good for currency;"
    'derived_via' => NULL
    'why_traders_care' => string(306) "Although it's generally viewed as a lagging indicator, the number of unemployed people is an important signal of overall economic health because consumer spending is highly correlated with labor-market conditions. Unemployment is also a major consideration for those steering the country's monetary policy;"
    'frequency' => string(44) "Released weekly, 5 days after the week ends;"
  }
  [19] => array(10) {
    'eventId' => string(6) "113642"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 400 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [20] => array(10) {
    'eventId' => string(6) "111392"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => string(6) "Markit"
    'sourceURL' => string(30) "http://www.markiteconomics.com"
    'latestURL' => string(72) "https://www.markiteconomics.com/Public/Release/PressReleases?language=en"
    'measures' => string(95) "Level of a diffusion index based on surveyed purchasing managers in the manufacturing industry;"
    'usual_effect' => string(54) "'Actual' greater than 'Forecast' is good for currency;"
    'derived_via' => string(204) "Survey of about 800 purchasing managers which asks respondents to rate the relative level of business conditions including employment, production, new orders, prices, supplier deliveries, and inventories;"
    'why_traders_care' => string(213) "It's a leading indicator of economic health - businesses react quickly to market conditions, and their purchasing managers hold perhaps the most current and relevant insight into the company's view of the economy;"
    'frequency' => string(65) "Released monthly, on the first business day after the month ends;"
  }
  [21] => array(10) {
    'eventId' => string(6) "113817"
    'date' => string(10) "2020-01-02"
    'sourceTEXT' => NULL
    'sourceURL' => NULL
    'latestURL' => NULL
    'measures' => NULL
    'usual_effect' => NULL
    'derived_via' => NULL
    'why_traders_care' => string(230) "Banks facilitate the majority of foreign exchange volume. When they are closed the market is less liquid and speculators become a more dominant market influence. This can lead to both abnormally low and abnormally high volatility;"
    'frequency' => NULL
  }
}

Here's the full code:

<?php

require 'vendor/autoload.php';

use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Thinking OOP is easier for me.
 * You can easily restructure this into a `functional` code if that's what you need.
 */
class CalendarParser
{

    const BASE_URL = 'https://www.forexfactory.com/calendar.php?month=%s';
    const EVENT_URL = 'https://www.forexfactory.com/flex.php?do=ajax&contentType=Content&flex=calendar_mainCal&details=%d';

    /**
     * @var
     */
    private $client;

    /**
     * @var DateTime
     */
    private $calendarMonth;

    /**
     * @var Crawler
     */
    private $page;

    /**
     * @var Crawler
     */
    private $table;

    /**
     * @var array
     */
    private $dateIndexes;

    /**
     * CalendarParser constructor.
     *
     * @param DateTime $calendarMonth
     * @throws Exception
     */
    public function __construct(DateTime $calendarMonth)
    {
        $this->client = new Client();
        $this->calendarMonth = $calendarMonth;

        // Fetch page and table data and store it so we can iterate over it.
        $this->page = $this->client->request('GET', sprintf(self::BASE_URL, $this->calendarMonth->format('MY')));
        $this->table = $this->page->filter('.calendar_row');

        // Get date indexes
        $this->generateDateIndexes();
    }

    /**
     * The table uses a class called `newday` at each new date which can be used to create an index of
     * where the date records begin which makes parsing easier.
     */
    private function generateDateIndexes()
    {
        $dateIndexes = [];

        $previousDate = null;
        $this->table
            /**
             * NOTE: This is a closure function which will be called until the foreach completes.
             *       You cannot break out of it like when you do `foreach() { break; }`.
             *       If you do `return` - it will simply skip executing the rest of the function but won't break the cycle.
             */
            ->each(function (Crawler $node, $index) use (&$dateIndexes, &$previousDate) {
                $isNewDateSeparator = strpos($node->getNode(0)->getAttribute('class'), 'newday') !== false;

                if ($isNewDateSeparator) {
                    // Convert the date to `Jan-1-STARTING_YEAR` to be easier to search in the array.
                    $dateColumnNode = $node->filter('.date > span > span');
                    $stringDate = str_replace(' ', '-', $dateColumnNode->text()) . '-' . $this->calendarMonth->format('Y');
                    $date = date_create_from_format('M-d-Y', $stringDate);
                    $formattedDate = $date->format('Y-m-d');

                    $dateIndexes[$formattedDate] = [
                        'start' => $index,
                        'end'   => null
                    ];

                    if ($previousDate) {
                        $dateIndexes[$previousDate]['end'] = ($index - 1);
                    }

                    $previousDate = $formattedDate;
                }
            });

        $this->dateIndexes = $dateIndexes;
    }

    /**
     * @param Crawler $row
     * @return array
     */
    private function processEvent(DateTime $date, Crawler $row)
    {
        $eventId = $row->attr('data-eventid');

        $event = [
            'eventId'          => $eventId,
            'date'             => $date->format('Y-m-d'),
            'sourceTEXT'       => null,
            'sourceURL'        => null,
            'latestURL'        => null,
            'measures'         => null,
            'usual_effect'     => null,
            'derived_via'      => null,
            'why_traders_care' => null,
            'frequency'        => null
        ];

        $content = $this->client->request('GET', sprintf(self::EVENT_URL, $eventId))->html();
        $crawler = new Crawler($content, null, null);

        $table = $crawler->filter('.calendarspecs__spec')->first()->closest('table');

        $table->filter('tr')
              ->each(function (Crawler $tr) use (&$event) {
                  $label = $tr->filter('.calendarspecs__spec')->text();

                  $description = $tr->filter('.calendarspecs__specdescription');

                  if ($label === 'Source') {
                      $TEMP = [];
                      $description->filter(' a')
                                  ->each(function ($link) use (&$TEMP) {
                                      array_push($TEMP, $link->text(), $link->attr('href'));
                                  });

                      $event['sourceTEXT'] = $TEMP[0];
                      $event['sourceURL'] = $TEMP[1];
                      $event['latestURL'] = $TEMP[3];
                  }

                  if ($label == "Measures") {
                      $event['measures'] = $description->text();
                  }

                  if ($label == "Usual Effect") {
                      $event['usual_effect'] = $description->text();
                  }

                  if ($label == "Frequency") {
                      $event['frequency'] = $description->text();
                  }

                  // this is how it's returned.
                  if ($label == "Why TradersCare") {
                      $event['why_traders_care'] = $description->text();
                  }

                  if ($label == "Derived Via") {
                      $event['derived_via'] = $description->text();
                  }

              });

        return $event;
    }

    /**
     * Get the events between a start and end date.
     * If no endDate is defined - then it will get all events since $startDate.
     *
     * @param DateTime $startDate
     * @param DateTime|null $endDate
     *
     * @return array
     */
    public function getEventsBetweenDates(DateTime $startDate, DateTime $endDate = null)
    {
        $events = [];

        $totalCalendarRows = $this->table->count();
        foreach ($this->dateIndexes as $stringDate => $range) {
            $date = date_create_from_format('Y-m-d', $stringDate);

            // Process only the range from the start date
            if ($date >= $startDate) {
                // and break early when we reach the end.
                if ($endDate && $date > $endDate) {
                    break;
                }

                // collect and process events for the current date
                $start = $range['start'];
                $end = $range['end'] !== null ? $range['end'] : $totalCalendarRows;
                for ($i = $start; $i < $end; $i++) {
                    $events[] = $this->processEvent($date, new Crawler($this->table->getNode($i)));
                }
            }
        }

        return $events;
    }

}

$parser = new CalendarParser(date_create());

var_dump(
    $parser->getEventsBetweenDates(
        date_create_from_format('Y-m-d H:i:s', '2020-01-01 00:00:00'),
        date_create_from_format('Y-m-d H:i:s', '2020-01-02 23:59:59')
    )
);
tftd
  • 16,203
  • 11
  • 62
  • 106
  • 1
    Hey, I didn't downvote your answer! I think its top! However, when running the class I get `Uncaught Error: Call to undefined method Symfony\Component\DomCrawler\Crawler::closest()`. What versions of the libs are you running?(composer.json) Please add a fix and I am goind to accept your answer! – Carol.Kar Jan 11 '20 at 20:02
  • I am using `php 7.1` – Carol.Kar Jan 11 '20 at 20:11
  • 3
    My comment was for whoever downvoted - not you specifically. IMHO it's lame to downvote without pointing out what you think is wrong. I'm currently using `php 7.3` but that shouldn't matter. I think you might have an older version of `fabpot/goutte` - mine is `v4.0.0`. EDIT: I just noticed in your question you have pointed out you're using `^3.2`. Would it be possible to update to `4.0`? – tftd Jan 12 '20 at 16:42
  • Totally agree! I updated my version to `"fabpot/goutte": "^4.0"`, however, when running your `class` I get in the result values only `null`. Pls see my update above. – Carol.Kar Jan 12 '20 at 21:00
  • 1
    I've noticed a small typo in my class (check the revision or just copy/paste). If you search for event `111392` you'll see all fields are populated. The `null` value in other records is simply because there is no label from which you can get the data. – tftd Jan 12 '20 at 21:28
  • One final question: Is there an alternative to using in this line `$table = $crawler->filter('.calendarspecs__spec')->first()->closest('table');` the `closest()`-function, as this requires the `v4.4`. Would appreciate your updated answer! – Carol.Kar Jan 14 '20 at 19:00
  • 1
    The [fabpot/goutte@3.3.0](https://packagist.org/packages/fabpot/goutte#v3.3.0) package requires `symfony/dom-crawler` (which is where the `Crawler` class comes from) with versions either [`^4.4`](https://github.com/symfony/dom-crawler/blob/v4.4.2/Crawler.php#L447) or [`^5.0`](https://github.com/symfony/dom-crawler/blob/v5.0.2/Crawler.php#L434). This function exists in both releases (check the links). I suspect something might be wrong on your end - I have tried it with both versions and it works. Maybe check with `composer show -i` what's actually installed? – tftd Jan 15 '20 at 00:38
  • Thx for your reply! I still get an error at this line, event though `composer show -i` shows that I have installed `fabpot/goutte v4.0.0 A simple PHP Web Scraper ` installed. Is there an alternative for the `closest()` method? – Carol.Kar Jan 15 '20 at 05:31
  • What you're saying does not make sense. If you had `fabpot/goutte v4.0.0` it should have installed `symfony/dom-crawler 4.4 or 5.0` unless there is another dependency which specifically requires an even lower version of `symfony/dom-crawler`. If that's the case I cannot give you a reliable alternative in time, because i cannot know what version of the `dom-crawler` you have installed. Can you please post the entire `copmser show -i` in pastebin ? – tftd Jan 15 '20 at 15:28
  • Thanks for awarding the bounty! If you paste the `composer` output I'll update my answer tomorrow using the exact versions you have :) – tftd Jan 15 '20 at 19:34
2

I recommend you to stick to your code. It is smaller, simpler and more familiar to you.

I made a review of your code. You can find my comments marked by "***".
Also you can save this code and compare it to your original version in some diff tool.

Actually, you had only 4 small bugs.

<?php
require 'vendor/autoload.php';

// use Goutte\Client;
use Symfony\Component\DomCrawler\Crawler;

/**
 * Crawls Detail Calender
 * Does NOT also include wanted Date in the final result set
 * @param $wantedDate
 * @return array
 */
function updateCalendarDetailsData($wantedDate)
{
    // *** small optimizations
    $Year = $wantedDate->format("Y");
    $wantedDateStr = $wantedDate->format("Y M j");

    try {
        // $client = new Client(); // *** I don't see any need in this package

        $res1Array = array();

        $ffUrlArr = ["https://www.forexfactory.com/calendar.php?month=Jan2020"];
        foreach ($ffUrlArr as $key => $v) {
        // *** There one link in ffUrlArr, it's better to get rid off foreach().
        // *** But for now - let it be

            try {
                $crawler = new Crawler(file_get_contents($ffUrlArr[$key]));
                // $crawler = $client->request('GET', $ffUrlArr[$key]);
                // *** It's the only place where Goutte was used
            } catch (\Exception $ex) {
                error_log($ex);
            }

            // $TEMP = array();
            // *** No need to define it here, it's used only inside $subcrawler,
            // *** And it's redefined there

            // $nodeDate = date('Y-m-d');
            // *** no need for date('Y-m-d')
            $nodeDate = "";
            // $crawler->filter('.calendar_row')->each(function ($node) use (&$res1Array, $wantedDate, $nodeDate) {
            // *** BUG 1: here your forgot to put "&" before $nodeDate

            // *** Also, because you need to return on $wantedDate,
            // *** but you can not break from the each()
            // *** it is better to use foreach(), and in my opinion it
            // *** looks simpler. And it is less error prone,
            // *** as we can see.

            // *** By using '[data-eventid][data-touchable]' instead
            // *** of '.calendar_row' we can get rid of multiple requests
            // *** to forexfactory API with same $EVENTID
            foreach($crawler->filter('[data-eventid][data-touchable]') as $DOM_el) {
                $node = new Crawler($DOM_el);

                // $EVENT = array();
                // *** it's almost always better to define variable
                // *** near the place they are used. Moved it

                // check date for month
                // $dayMonth = str_split(explode(" ", trim($node->getNode(0)->nodeValue))[0], 3);
                // $day = explode(" ", trim($node->getNode(0)->nodeValue))[1];
                // if (is_numeric($day)) {
                //     $nodeDate = date("Y-m-d H:i:s", strtotime($dayMonth[0] . " " . $dayMonth[1] . " " . $day));
                // }
                // *** This is a cleaner and a simpler way to retrive
                // *** a date from this html. Getting nodeDate in the
                // *** form of "Y M j" (e.g. "2020 Jan 1")
                $date_node = $node->filter('.date > span > span');
                if( $date_node->count() != 0 ) {
                    $nodeDate = $Year . " " . $date_node->text();
                }

                // return if wanted date is reached
                // if (date("Y-m-d", strtotime($nodeDate)) == date("Y-m-d", strtotime($wantedDate))) {
                // *** There is no need for so many convertions.
                // *** Strings' comparison is good enough

                // *** BUG 2: Not critical, but "havy".
                // *** Because you can not break from ->each()
                // *** checking dates with "==" led to skiping only
                // *** $wantedDate, all dates after $wantedDate
                // *** were still iterated over
                if ($nodeDate == $wantedDateStr) {
                    // return $res1Array;
                    // *** Now, when we use foreach() instead of
                    // *** ->each() we can return from here.
                    // *** But still, I think it's better to use break.
                    // *** In case you would like to add some extra logic
                    // *** at the end, and for other vague reasons :)
                    break;
                }

                $EVENTID = $node->attr('data-eventid');

                $API_RESPONSE = file_get_contents('https://www.forexfactory.com/flex.php?do=ajax&contentType=Content&flex=calendar_mainCal&details=' . $EVENTID);

                $API_RESPONSE = str_replace("<![CDATA[", "", $API_RESPONSE);
                $API_RESPONSE = str_replace("]]>", "", $API_RESPONSE);

                $html = <<<HTML
<!DOCTYPE html>
<html>
    <body>
       $API_RESPONSE
    </body>
</html>
HTML;

                $subcrawler = new Crawler($html);

                // *** Took this part from tftd's answer
                // *** It's a good practice to define all possible fields
                $EVENT = [
                    'id'               => $EVENTID,
                    'date'             => $nodeDate,
                    'sourceTEXT'       => null,
                    'sourceURL'        => null,
                    'latestURL'        => null,
                    'measures'         => null,
                    'usual_effect'     => null,
                    'derived_via'      => null,
                    'why_traders_care' => null,
                    'frequency'        => null
                ];
                // $EVENT = array(); // *** But you can always switch back for this simple definition
                // $subcrawler->filter('.calendarspecs__spec')->each(function ($LEFT_TD) use (&$res1Array, &$TEMP, &$EVENT) {
                // *** once again switching from ->each() to foreach(),
                // *** just for the consistency
                foreach($subcrawler->filter('.calendarspecs__spec') as $DOM_el) {
                    $LEFT_TD = new Crawler($DOM_el);

                    $LEFT_TD_INNER_TEXT = trim($LEFT_TD->text());

                    if ($LEFT_TD_INNER_TEXT == "Source") {

                        $TEMP = array();
                        $LEFT_TD->nextAll()->filter('a')->each(function ($LINK) use (&$TEMP) {
                            array_push($TEMP, $LINK->text(), $LINK->attr('href'));
                        });

                        $EVENT['sourceTEXT'] = $TEMP[0];
                        $EVENT['sourceURL'] = $TEMP[1];
                        $EVENT['latestURL'] = $TEMP[3];
                    }

                    if ($LEFT_TD_INNER_TEXT == "Measures") {
                        $EVENT['measures'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Usual Effect") {
                        $EVENT['usual_effect'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Frequency") {
                        $EVENT['frequency'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Why TradersCare") {
                        // *** BUG 3: As tftd noticed - you had an issue
                        // *** with name of this field
                        $EVENT['why_traders_care'] = $LEFT_TD->nextAll()->text();
                    }

                    if ($LEFT_TD_INNER_TEXT == "Derived Via") {
                        $EVENT['derived_via'] = $LEFT_TD->nextAll()->text();
                        // array_push($res1Array, $EVENT); // <---- HERE I GET THE ERROR!
                        // *** BUG 4: And this was the main complication
                        // *** 1) Being here array_push() wasn't called if event
                        // ***    had no "Derived Via" field
                        // *** 2) but even more than that... it was somehow put
                        // ***    in the comments... and of course this led to
                        // ***    $res1Array never been populated
                    }
                }
                array_push($res1Array, $EVENT);
                // *** this command should be here
            }
        }
    } catch (Exception $ex) {
        error_log($ex);
    }
    return $res1Array;
}
// *** You'd better use DateTime, so its fields could be manipulated
// *** and retrieved more easily than in the case of a string representation
// var_dump(updateCalendarDetailsData(date("2020-01-02")));
var_dump(updateCalendarDetailsData(new DateTime("2020-01-02")));

?> 
x00
  • 13,643
  • 3
  • 16
  • 40