4

I have this piece of code that fetches a page's HTML from a URL; however, the response content looks encoded.

Code:

    // Build the request for the watch page and send browser-like headers.
    HttpWebRequest xhr = (HttpWebRequest) WebRequest.Create(new Uri("https://www.youtube.com/watch?v=_Ewh75YGIGQ"));
        // Only gzip and deflate bodies are decompressed automatically by the request.
        xhr.AutomaticDecompression = DecompressionMethods.GZip | DecompressionMethods.Deflate;
        //xhr.CookieContainer = request.Account.CookieContainer;
        xhr.Accept = "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8";
        // NOTE: this also advertises "br", which is not among the decompression methods enabled above;
        // the response headers shown below report Content-Encoding: br.
        xhr.Headers["Accept-Encoding"] = "gzip, deflate, br";
        xhr.Headers["Accept-Language"] = "en-US,en;q=0.5";
        xhr.Headers["Upgrade-Insecure-Requests"] = "1";
        xhr.KeepAlive = true;
        xhr.UserAgent = "Mozilla/4.0 (compatible; MSIE 9.0; Windows NT 6.1)";
        xhr.Host = "www.youtube.com";
        xhr.Referer = "https://www.youtube.com/watch?v=6aCpYxzRkf4";
        // Send the request and read the entire response body as text.
        var response = xhr.GetResponse();
        string html;
        using (StreamReader reader = new StreamReader(response.GetResponseStream()))
        {
            html = reader.ReadToEnd();
        }

These are the response headers:

    X-XSS-Protection: 1; mode=block; report=https://www.google.com/appserve/security-bugs/log/youtube
    X-Content-Type-Options: nosniff
    X-Frame-Options: SAMEORIGIN
    Strict-Transport-Security: max-age=31536000
    Content-Encoding: br
    Transfer-Encoding: chunked
    Alt-Svc: quic=":443"; ma=2592000; v="44,43,39,35"
    Cache-Control: no-cache
    Content-Type: text/html; charset=utf-8
    Date: Sat, 24 Nov 2018 11:30:38 GMT
    Expires: Tue, 27 Apr 1971 19:44:06 EST
    P3P: CP="This is not a P3P policy! See http://support.google.com/accounts/answer/151657?hl=it for more info."
    Set-Cookie: PREF=f1=50000000&al=it; path=/; domain=.youtube.com; expires=Thu, 25-Jul-2019 23:23:38 GMT
    Server: YouTube Frontend Proxy

And the response string read with StreamReader.ReadToEnd() looks like this:

garry man
  • 445
  • 4
  • 14

3 Answers3

7

Yes, the above answer is correct. The response generated by the server uses br (Brotli) encoding, so you need to decode it. Support for br encoding is not included in the default system compression packages, so you will have to install the Brotli.NET NuGet package.

Add this to your code to cover the three main encoding types: gzip, br and deflate.

            // Send the request and wrap the body in the decoder named by the Content-Encoding header.
            HttpWebResponse response = (HttpWebResponse)webRequest.GetResponse();
            Stream responseStream = response.GetResponseStream();

            // Read the header once and match it ordinally, ignoring case: ToLower() is
            // culture-sensitive (e.g. the Turkish dotless i) and can make "GZIP" fail to match "gzip".
            string contentEncoding = response.ContentEncoding ?? string.Empty;

            if (contentEncoding.IndexOf("gzip", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new GZipStream(responseStream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("deflate", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                responseStream = new DeflateStream(responseStream, CompressionMode.Decompress);
            }
            else if (contentEncoding.IndexOf("br", StringComparison.OrdinalIgnoreCase) >= 0)
            {
                // BrotliStream comes from the Brotli package mentioned above.
                responseStream = new BrotliStream(responseStream, CompressionMode.Decompress);
            }
Vibhor Roy
  • 163
  • 2
  • 6
4

The answer is in the response header: Content-Encoding: br -> This means Brotli compression.

There is a .NET implementation (NuGet package) for it:

Install it in your project, add "using Brotli;", and replace the "using (StreamReader....." block with this code:

       // Decode the Brotli-compressed body into memory, then read the decoded bytes as text.
       using (var brotli = new BrotliStream(response.GetResponseStream(), System.IO.Compression.CompressionMode.Decompress))
       using (var decoded = new System.IO.MemoryStream())
       {
           brotli.CopyTo(decoded);

           // Rewind so the reader starts at the first decoded byte.
           decoded.Position = 0;

           using (var textReader = new StreamReader(decoded))
           {
               html = textReader.ReadToEnd();
           }
       }
H.G. Sandhagen
  • 772
  • 6
  • 13
0
/// <summary>
/// Helpers for recognising compressed payloads by their leading signature bytes, buffering
/// streams into memory, inflating gzip data and building URL query strings.
/// </summary>
public class ZipFileUtilities
{
    // ZIP local file header signature "PK\x03\x04". The byte that follows it in a real archive is
    // the "version needed to extract" field (0x0a for stored entries, 0x14 for deflated ones),
    // so it must not be part of the signature or most zip files go undetected.
    private static readonly byte[] ZipBytes1 = { 0x50, 0x4b, 0x03, 0x04 };
    // gzip member header (ID1, ID2).
    private static readonly byte[] GzipBytes = { 0x1f, 0x8b };
    // NOTE(review): 1F 9D is commonly listed as the Unix "compress" (LZW, e.g. .tar.Z) signature - confirm intent.
    private static readonly byte[] TarBytes = { 0x1f, 0x9d };
    // NOTE(review): 1F A0 is commonly listed as the "compress" LZH variant signature - confirm intent.
    private static readonly byte[] LzhBytes = { 0x1f, 0xa0 };
    // ASCII "BZh".
    private static readonly byte[] Bzip2Bytes = { 0x42, 0x5a, 0x68 };
    // ASCII "LZIP".
    private static readonly byte[] LzipBytes = { 0x4c, 0x5a, 0x49, 0x50 };
    // "PK\x05\x06": ZIP end-of-central-directory record (first bytes of an empty archive).
    private static readonly byte[] ZipBytes2 = { 0x50, 0x4b, 0x05, 0x06 };
    // "PK\x07\x08": ZIP spanned-archive / data-descriptor marker.
    private static readonly byte[] ZipBytes3 = { 0x50, 0x4b, 0x07, 0x08 };

    // Every signature IsCompressedData checks, built once instead of on each call.
    // Declared after the individual signatures so they are initialised first.
    private static readonly byte[][] KnownSignatures =
    {
        ZipBytes1, ZipBytes2, ZipBytes3, GzipBytes, TarBytes, LzhBytes, Bzip2Bytes, LzipBytes
    };

    /// <summary>
    /// Reads up to <paramref name="length"/> bytes from the start of a file.
    /// </summary>
    /// <param name="filepath">Path of the file to read.</param>
    /// <param name="length">Number of leading bytes wanted.</param>
    /// <returns>
    /// An array of exactly <paramref name="length"/> bytes; positions beyond the end of a
    /// shorter file are left as zero.
    /// </returns>
    public static byte[] GetFirstBytes(string filepath, int length)
    {
        // Open the file as raw bytes; a text reader is not needed to inspect a binary header.
        using (var stream = File.OpenRead(filepath))
        {
            var bytes = new byte[length];
            var total = 0;

            // Stream.Read may return fewer bytes than requested, so keep reading until the
            // buffer is full or the file ends.
            while (total < length)
            {
                var read = stream.Read(bytes, total, length - total);
                if (read == 0)
                {
                    break;
                }

                total += read;
            }

            return bytes;
        }
    }

    /// <summary>
    /// Checks whether a file starts with one of the known compressed-format signatures.
    /// Despite the name, this is not limited to ZIP: it matches every signature that
    /// <see cref="IsCompressedData"/> recognises.
    /// </summary>
    /// <param name="filepath">Path of the file to inspect.</param>
    /// <returns><c>true</c> when the leading bytes match a known signature.</returns>
    public static bool IsZipFile(string filepath)
    {
        return IsCompressedData(GetFirstBytes(filepath, 5));
    }

    /// <summary>
    /// Checks whether a buffer starts with one of the known compressed-format signatures.
    /// </summary>
    /// <param name="data">Buffer to inspect; it may be shorter than the signatures.</param>
    /// <returns><c>true</c> when the buffer begins with a known signature; otherwise <c>false</c>.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="data"/> is <c>null</c>.</exception>
    public static bool IsCompressedData(byte[] data)
    {
        if (data == null)
        {
            throw new ArgumentNullException(nameof(data));
        }

        foreach (var headerBytes in KnownSignatures)
        {
            if (HeaderBytesMatch(headerBytes, data))
            {
                return true;
            }
        }

        return false;
    }

    // Returns true when dataBytes begins with headerBytes. Data shorter than the signature
    // cannot match, so it yields false instead of throwing; this keeps short or empty
    // payloads (for example a "{}" body) from crashing the caller.
    private static bool HeaderBytesMatch(byte[] headerBytes, byte[] dataBytes)
    {
        if (dataBytes.Length < headerBytes.Length)
        {
            return false;
        }

        for (var i = 0; i < headerBytes.Length; i++)
        {
            if (headerBytes[i] != dataBytes[i])
            {
                return false;
            }
        }

        return true;
    }

    /// <summary>
    /// Reads a stream from its current position to the end and returns the bytes.
    /// </summary>
    /// <param name="input">Readable stream to drain; it is not closed by this method.</param>
    /// <returns>All bytes read; an empty array when the stream has no more data.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="input"/> is <c>null</c>.</exception>
    public static byte[] ReadFully(Stream input)
    {
        if (input == null)
        {
            throw new ArgumentNullException(nameof(input));
        }

        using (var ms = new MemoryStream())
        {
            input.CopyTo(ms);
            return ms.ToArray();
        }
    }

    /// <summary>
    /// Inflates gzip-compressed data. Only gzip is handled here, although
    /// <see cref="IsCompressedData"/> also recognises other formats.
    /// </summary>
    /// <param name="data">The gzip-compressed bytes.</param>
    /// <returns>The decompressed bytes.</returns>
    public static byte[] Decompress(byte[] data)
    {
        using (var compressedStream = new MemoryStream(data))
        using (var zipStream = new GZipStream(compressedStream, CompressionMode.Decompress))
        using (var resultStream = new MemoryStream())
        {
            zipStream.CopyTo(resultStream);
            return resultStream.ToArray();
        }
    }

    /// <summary>
    /// Builds a URL query string ("k1=v1&amp;k2=v2") from a collection, escaping keys and values.
    /// </summary>
    /// <param name="nvc">Pairs to encode; may be <c>null</c>.</param>
    /// <returns>
    /// The encoded pairs joined with '&amp;' and without a leading '?'; an empty string when
    /// <paramref name="nvc"/> is <c>null</c> or has no usable entries.
    /// </returns>
    public static string ToQueryString(NameValueCollection nvc)
    {
        if (nvc == null)
        {
            return string.Empty;
        }

        var sb = new StringBuilder();

        foreach (string key in nvc.Keys)
        {
            // Entries without a usable key are skipped.
            if (string.IsNullOrWhiteSpace(key))
            {
                continue;
            }

            string[] values = nvc.GetValues(key);
            if (values == null)
            {
                continue;
            }

            // A key may carry several values; each one becomes its own key=value pair.
            foreach (string value in values)
            {
                if (sb.Length > 0)
                {
                    sb.Append('&');
                }

                sb.Append(Uri.EscapeDataString(key))
                  .Append('=')
                  .Append(Uri.EscapeDataString(value));
            }
        }

        return sb.ToString();
    }
}

Usage:

                // Only successful responses are parsed.
                if (response.StatusCode == HttpStatusCode.OK)
                {
                    using (var responseStream = response.GetResponseStream())
                    {
                        // Buffer the whole body so its leading bytes can be inspected.
                        // ReadFully always returns an array, so no null check is needed.
                        var payload = ZipFileUtilities.ReadFully(responseStream);

                        // Inflate the body when it starts with a known compression signature.
                        // Decompress only handles gzip.
                        if (ZipFileUtilities.IsCompressedData(payload))
                        {
                            payload = ZipFileUtilities.Decompress(payload);
                        }

                        // Deserialize the (now plain) JSON body into the model.
                        using (var ms = new MemoryStream(payload))
                        using (var streamReader = new StreamReader(ms))
                        using (var jsonReader = new JsonTextReader(streamReader))
                        {
                            var serializer = new JsonSerializer();
                            modil = serializer.Deserialize<Model>(jsonReader);
                        }
                    }
                }