2

I have a problem fetching URLs that contain umlauts (e.g. 'ü').

For example "http://www.ebay.de/bhp/kühlschrank":

My Script:

/**
 * Fetch a URL with cURL and return the raw response.
 *
 * As a debugging aid, the curl_getinfo() array for the transfer is printed
 * after an opening "<pre>" tag. The URL is handed to cURL unchanged; no
 * encoding of any kind is applied to it here.
 *
 * @param string $url URL to request.
 * @return string|bool Response headers followed by the body, or false when
 *                     curl_exec() fails.
 */
function getUrlContent($url)
{
    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): this turns off TLS certificate verification. It is kept
    // for backward compatibility, but should be enabled for real HTTPS use.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);        // give up after 30 seconds
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($ch, CURLOPT_HEADER, 1);          // include the response headers in the result
    // CURLOPT_ENCODING controls the Accept-Encoding request header (content
    // codings such as gzip/deflate), not a character set. "UTF-8" is not a
    // valid value; the empty string advertises every coding cURL supports and
    // lets it decode the response transparently.
    curl_setopt($ch, CURLOPT_ENCODING, "");

    $response = curl_exec($ch);
    $info = curl_getinfo($ch);

    echo "<pre>";
    print_r($info);

    return $response;
}

// The path contains a literal, non-ASCII "ü"; the string is passed to
// getUrlContent() as written, with no percent-encoding applied.
$url="http://www.ebay.de/bhp/kühlschrank";
$response = getUrlContent($url);

It always results in a 404.

Any Ideas?

Álvaro González
  • 142,137
  • 41
  • 261
  • 360
user3074602
  • 21
  • 1
  • 2
  • Try to encode the URL before passing it to getUrlContent(): http://stackoverflow.com/questions/996139/php-urlencode-vs-rawurlencode – wasishincar Feb 12 '14 at 10:04

3 Answers

2

You need to utf8_decode() the $url parameter and add one more cURL option, CURLOPT_FOLLOWLOCATION:

<?php
/**
 * Fetch a URL with cURL, following redirects, and return the raw response.
 *
 * As a debugging aid, the curl_getinfo() array for the transfer is printed
 * after an opening "<pre>" tag. The URL is handed to cURL unchanged.
 *
 * @param string $url URL to request.
 * @return string|bool Response headers followed by the body, or false when
 *                     curl_exec() fails.
 */
function getUrlContent($url)
{
    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): this turns off TLS certificate verification. It is kept
    // for backward compatibility, but should be enabled for real HTTPS use.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);        // give up after 30 seconds
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($ch, CURLOPT_HEADER, 1);          // include the response headers in the result
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);  // <------- added here: follow "Location:" redirects
    // CURLOPT_ENCODING controls the Accept-Encoding request header (content
    // codings such as gzip/deflate), not a character set. "UTF-8" is not a
    // valid value; the empty string advertises every coding cURL supports and
    // lets it decode the response transparently.
    curl_setopt($ch, CURLOPT_ENCODING, "");

    $response = curl_exec($ch);
    $info = curl_getinfo($ch);

    echo "<pre>";
    print_r($info);

    return $response;
}

$url="http://www.ebay.de/bhp/kühlschrank";
// Convert the URL from UTF-8 to ISO-8859-1 before requesting it (assumes this
// source file is saved as UTF-8). This is the conversion utf8_decode()
// performs; utf8_decode() is deprecated as of PHP 8.2, so the explicit
// mbstring call is used instead.
$response = getUrlContent(mb_convert_encoding($url, 'ISO-8859-1', 'UTF-8'));

OUTPUT :

Array
(
    [url] => http://pages.ebay.com/messages/DE_page_not_responding.html?RlogId=t6awipp%60c%7Fs%3F%3Ctof2e34e*%3B4c3-144258a5d5d-0x1f0
    [content_type] => text/html;charset=UTF-8
    [http_code] => 200
    [header_size] => 1233
    [request_size] => 264
    [filetime] => -1
    [ssl_verify_result] => 0
    [redirect_count] => 1
    [total_time] => 1.235
    [namelookup_time] => 0.266
    [connect_time] => 0.485
    [pretransfer_time] => 0.485
    [size_upload] => 0
    [size_download] => 4611
    [speed_download] => 3733
    [speed_upload] => 0
    [download_content_length] => 4611
    [upload_content_length] => 0
    [starttransfer_time] => 0.735
    [redirect_time] => 0.469
    [certinfo] => Array
        (
        )

    [primary_ip] => 66.135.205.14
    [primary_port] => 80
    [local_ip] => 192.168.1.9
    [local_port] => 61581
    [redirect_url] => 
)
Shankar Narayana Damodaran
  • 68,075
  • 43
  • 96
  • 126
  • Certainly, I've just tested and the page expects ISO-8859-1 in the URL. For some reason, I had assumed that all modern browsers defaulted to UTF-8... – Álvaro González Feb 12 '14 at 10:03
  • @ÁlvaroG.Vicario, I think Google Chrome `Version 32.0.1700.107 m` is not. – Shankar Narayana Damodaran Feb 12 '14 at 10:19
  • With this version, the HTML in the response is different from what a browser gets. (http://pages.ebay.com/messages/DE_page_not_responding.html?RlogId=t6awipp%60c%7Fs%3F%3Ctof2e34e*%3B4c3-144258a5d5d-0x1f0) – user3074602 Feb 12 '14 at 10:35
1

A poor man's alternative solution is to feed cURL a plain US-ASCII version of the URL:

http://www.ebay.de/bhp/k%C3%BChlschrank

I got mine by fetching the page with Firefox and copying the URL from the location bar but you can also do it from PHP:

// The string literal is assumed to be UTF-8; only the path segment is
// percent-encoded, then appended to the fixed part of the URL.
$url = "http://www.ebay.de/bhp/";
$url .= rawurlencode("kühlschrank");
Álvaro González
  • 142,137
  • 41
  • 261
  • 360
0

cURL doesn't encode the URL for you; you have to do it yourself. That is, the URL has to be percent-encoded (URL-encoded). Strictly speaking, you should not just decode the UTF-8, as that is not the same thing. You should use rawurlencode():

    <?php
/**
 * Fetch a URL with cURL and return the raw response.
 *
 * As a debugging aid, the curl_getinfo() array for the transfer is printed
 * after an opening "<pre>" tag. The URL is handed to cURL unchanged; no
 * encoding of any kind is applied to it here.
 *
 * @param string $url URL to request.
 * @return string|bool Response headers followed by the body, or false when
 *                     curl_exec() fails.
 */
function getUrlContent($url)
{
    $ch = curl_init();

    curl_setopt($ch, CURLOPT_URL, $url);
    // NOTE(review): this turns off TLS certificate verification. It is kept
    // for backward compatibility, but should be enabled for real HTTPS use.
    curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, false);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);        // give up after 30 seconds
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);  // return the response instead of printing it
    curl_setopt($ch, CURLOPT_HEADER, 1);          // include the response headers in the result
    // CURLOPT_ENCODING controls the Accept-Encoding request header (content
    // codings such as gzip/deflate), not a character set. "UTF-8" is not a
    // valid value; the empty string advertises every coding cURL supports and
    // lets it decode the response transparently.
    curl_setopt($ch, CURLOPT_ENCODING, "");

    $response = curl_exec($ch);
    $info = curl_getinfo($ch);

    echo "<pre>";
    print_r($info);

    return $response;
}

// Percent-encode only the path segment that holds the umlaut, append it to
// the fixed part of the URL, then fetch the result.
$url = "http://www.ebay.de/bhp/";
$url .= rawurlencode("kühlschrank");
$response = getUrlContent($url);
?>

OUTPUT:

    Array
(
    [url] => http://www.ebay.de/bhp/k%C3%BChlschrank
    [content_type] => text/html;charset=utf-8
    [http_code] => 200
    [header_size] => 1007
    [request_size] => 94
    [filetime] => -1
    [ssl_verify_result] => 0
    [redirect_count] => 0
    [total_time] => 1.669702
    [namelookup_time] => 0.606492
    [connect_time] => 0.744441
    [pretransfer_time] => 0.744556
    [size_upload] => 0
    [size_download] => 44032
    [speed_download] => 26371
    [speed_upload] => 0
    [download_content_length] => -1
    [upload_content_length] => 0
    [starttransfer_time] => 1.386599
    [redirect_time] => 0
    [certinfo] => Array
        (
        )

    [primary_ip] => 23.2.16.17
    [primary_port] => 80
    [local_ip] => 10.1.1.2
    [local_port] => 56592
    [redirect_url] => 
)
spinkus
  • 7,694
  • 4
  • 38
  • 62