-3

I'm trying to extract some information from a website with the code below:

I think I'm not using the regex correctly?

// Download the raw HTML of the page to scrape.
// NOTE(review): file_get_contents() returns false on failure; that case is not checked here.
$data = file_get_contents('http://domain.topdomain');

// Try to capture every post block on the page. With PREG_SET_ORDER each entry of
// $posts holds one full match followed by its four captures, in pattern order:
// link href, image src, site name, title.
//
// Why this is unlikely to match anything:
//  - The pattern has only the `i` modifier, so every space and line break inside the
//    quoted pattern is literal and must appear in the page exactly as typed here.
//  - Each (.*) is greedy and, without the `s` modifier, cannot cross a line break.
//  - Any extra attribute or different attribute order in the real markup breaks the match.
preg_match_all(
        '/<a class="link" href="(.*)">
              <div class="image">
                <img src="(.*)">
                <span class="name">(.*)<\/span>
              <\/div>
              <div class="box">
                <h3 class="title">(.*)<\/h3>
              <\/div>
         <\/a>/i',
    $data,
    $posts,
    PREG_SET_ORDER
);
// Report how many post blocks were matched.
echo "<p>" . count($posts) . " posts found</p>";
Joel
  • 61
  • 3
  • 11

1 Answer

0

As stated in the comments, DOMDocument and DOMXPath are the way to go. I'm not a pro, so this was a new area for me.

I ended up with this code which works the way I want:

include('../assets/db_conn.php');

// Site to scrape; the post links found on the listing page are appended to this.
$baseUrl = 'http://domain.topdomain';

/**
 * Downloads a page and returns a DOMXPath over its parsed HTML.
 *
 * @param string $url Absolute URL of the page to fetch.
 * @return DOMXPath|null Null when the download fails or the body is empty.
 */
function loadXpath($url)
{
    $html = file_get_contents($url);
    if ($html === false || $html === '') {
        return null;
    }

    // Turn non-ASCII characters into entities so they survive loadHTML() even when
    // the page carries no charset hint.
    // NOTE: the 'HTML-ENTITIES' target is deprecated as of PHP 8.2.
    $html = mb_convert_encoding($html, 'HTML-ENTITIES', 'UTF-8');

    $doc = new DOMDocument();

    // Real-world markup is rarely valid: collect parser warnings silently, drop
    // them, then restore whatever error mode the caller had before.
    $previous = libxml_use_internal_errors(true);
    $doc->loadHTML($html);
    libxml_clear_errors();
    libxml_use_internal_errors($previous);

    return new DOMXPath($doc);
}

/**
 * Returns the text of the first node matched by an XPath expression.
 *
 * Works for both attribute nodes (their value) and elements (their text content).
 *
 * @param DOMXPath $xpath      XPath evaluator for the document.
 * @param string   $expression Expression evaluated relative to $context.
 * @param DOMNode  $context    Node the expression starts from.
 * @return string|null Null when the expression matches nothing.
 */
function firstNodeText(DOMXPath $xpath, $expression, DOMNode $context)
{
    $nodes = $xpath->query($expression, $context);
    if ($nodes === false || $nodes->length === 0) {
        return null;
    }

    return $nodes->item(0)->textContent;
}

/**
 * Extracts the video id from an embed URL of the form
 * "//www.youtube.com/embed/<id>?<query>".
 *
 * @param string $src Value of the iframe's src attribute.
 * @return string|null Null when $src is not a YouTube embed URL or has no id.
 */
function youtubeIdFromEmbedSrc($src)
{
    $parts = explode('//www.youtube.com/embed/', $src);
    if (count($parts) < 2) {
        return null;
    }

    $idAndQuery = explode('?', $parts[1]);

    return $idAndQuery[0] === '' ? null : $idAndQuery[0];
}

// ---------------------------------------------------------------------------
// 1. Scrape the listing page, then each post's detail page for its video id.
// ---------------------------------------------------------------------------
$results = [];
$listing = loadXpath($baseUrl);
$articles = $listing === null
    ? []
    : $listing->query("//div[@class='posts']/div[@class='post']");

foreach ($articles as $article) {
    // Build every record from scratch so no field (in particular youtube_href)
    // leaks over from the previously processed post.
    $result = [
        'href'         => firstNodeText($listing, 'a/attribute::href', $article),
        'image_src'    => firstNodeText($listing, "a/div[@class='post-image']/img/attribute::src", $article),
        'site_name'    => firstNodeText($listing, "a/div[@class='post-image']/span[@class='post-site-name']", $article),
        'title'        => firstNodeText($listing, "a/div[@class='post-box']/h3[@class='post-title']", $article),
        'youtube_href' => null,
    ];

    // Without a link there is no detail page to visit and no slug to store.
    if ($result['href'] === null) {
        continue;
    }

    $detail = loadXpath($baseUrl . $result['href']);
    if ($detail === null) {
        continue;
    }

    foreach ($detail->query("//div[@class='video-container']") as $container) {
        $src = firstNodeText($detail, "iframe[@class='youtube-player']/attribute::src", $container);
        $videoId = $src === null ? null : youtubeIdFromEmbedSrc($src);

        // When a page has several players, the last usable one wins.
        if ($videoId !== null) {
            $result['youtube_href'] = $videoId;
        }
    }

    // Keep only posts that actually carry a video; 'random' is a value the
    // original script excluded as well.
    if ($result['youtube_href'] !== null && $result['youtube_href'] !== 'random') {
        $results[] = $result;
    }
}

// ---------------------------------------------------------------------------
// 2. Insert the posts whose video is not stored yet.
// ---------------------------------------------------------------------------
// $dbh comes from db_conn.php and is used as a PDO connection.
$stmt = $dbh->prepare('SELECT yt_url FROM article');
$stmt->execute();
$yt_urls = $stmt->fetchAll(PDO::FETCH_COLUMN); // flat list of the yt_url values
print_r($yt_urls);

$stmt = $dbh->prepare("INSERT INTO article (title, fb_title, slug, yt_url, img_url) VALUES (:title, :fb_title, :slug, :yt_url, :img_url)");

foreach ($results as $value) {
    if (in_array($value['youtube_href'], $yt_urls, true)) {
        continue;
    }

    // The id comes from a third-party page, so escape it before printing.
    echo htmlspecialchars($value['youtube_href'], ENT_QUOTES, 'UTF-8') . '<br>';

    $stmt->execute([
        ':title'    => $value['title'],
        ':fb_title' => $value['title'],
        ':slug'     => ltrim($value['href'], '/'),
        ':yt_url'   => $value['youtube_href'],
        ':img_url'  => $value['image_src'],
    ]);

    // Remember the id so the same video scraped twice in this run is stored once.
    $yt_urls[] = $value['youtube_href'];
}
Joel
  • 61
  • 3
  • 11