0

I need to create a C++ console application that queries a site in order to get its HTML page.

The site is static (when I open the URL in a browser I see the HTML file directly), so I use this code:

// Sends a hand-written HTTP/1.1 GET request on Socket, then prints what recv() delivers.
// NOTE(review): the text passed to send() contains "Host: worldrecipes.expo2015.org/" (with a
// trailing '/'), but the text passed to strlen() does not. The length argument is therefore one
// byte shorter than the request text, so the final '\n' of the request is never transmitted.
send(Socket, "GET /it/ricette/q-torte_forno_statico.html HTTP/1.1\r\nHost: worldrecipes.expo2015.org/\r\nConnection: close\r\n\r\n", strlen("GET /it/ricette/q-torte_forno_statico.html HTTP/1.1\r\nHost: worldrecipes.expo2015.org\r\nConnection: close\r\n\r\n"), 0);
    // Receive buffer, overwritten by every recv() call.
    char buffer[1000000];
    // Number of bytes delivered by the most recent recv() call.
    int nDataLength;
    // Keep reading until recv() returns 0 or a negative value.
    while ((nDataLength = recv(Socket, buffer, 1000000, 0)) > 0) {
        int i = 0;
        // NOTE(review): this loop never compares i with nDataLength. It stops only at the first
        // byte below 32 that is not '\n' or '\r' (a tab, for example), so it can stop before the
        // end of the received data, or run past it into stale or uninitialized bytes.
        while (buffer[i] >= 32 || buffer[i] == '\n' || buffer[i] == '\r') {         
                cout << buffer[i];
                i += 1;
            }
        }

It doesn't give me any errors, but it doesn't show me the whole HTML page, and every time I send the request I get a different answer... why?

Silvia B
  • 45
  • 3
  • 10
  • 1
    Read the HTTP specification ([RFC 2616](https://tools.ietf.org/html/rfc2616), and newer RFCs 7230-7235). What you have shown is not even close to being a valid implementation. You have to read from the socket until you encounter an `\r\n\r\n` sequence denoting the end of the response headers, then you have to parse the headers you received to determine the format of the response body, if any, reading that body according to its transfer encoding. You have to take the `Content-Length`, `Transfer-Encoding`, and `Content-Type` headers into account. See RFC 2616 Section 4.4 for details. – Remy Lebeau Apr 01 '16 at 18:05
  • 1
    HTTP is not trivial to implement by hand, it has a lot of rules and semantics to it. A better way to handle this is to not implement HTTP manually at all. Use a pre-existing HTTP library instead, such as [libcurl](https://curl.haxx.se/libcurl/), let it do the hard work for you. – Remy Lebeau Apr 01 '16 at 18:06
  • @RemyLebeau I know that libcurl may be better than this code, but I need to use this code. I have now solved that problem, but do you know how to solve the problem with the size of the *buffer*? The page is returned to me, but the buffer isn't big enough to contain all of the HTML code. So my problem is: when I increase the size, Visual Studio gives me an error about the initialization of the socket. – Silvia B Apr 02 '16 at 16:14
  • you can't receive the entire response into a single fixed length buffer. You can use a smaller fixed buffer to read from the socket on each loop iteration, but you need to copy that data to a dynamically growing buffer or a file on each iteration. For instance, copy read data to a `std::string` or even a `std::vector` until you reach the end of the headers, then copy the remaining data to a file instead until the end of the response is reached. – Remy Lebeau Apr 02 '16 at 16:27
  • @RemyLebeau I tried it this way: `while (buffer[i] >= 32 || buffer[i] == '\n' || buffer[i] == '\r') { //cout << buffer[i]; appo = buffer[i]; html = html+appo; i += 1; }` but I still don't get the whole HTML file – Silvia B Apr 02 '16 at 16:33
  • You are still not managing the buffer data correctly. See my previous answer on this topic: http://stackoverflow.com/a/16247097/65863 – Remy Lebeau Apr 02 '16 at 17:09
  • @RemyLebeau Sorry, but I don't understand the pseudo-code... I'm a beginner in this area of programming – Silvia B Apr 03 '16 at 13:05

1 Answer

0

The code below seems to work only on the index page of worldrecipes.expo2015.org and not on the subpages. You might want to look at the more advanced WebBrowser controls available in Visual Studio for parsing and processing HTML.

Like here : http://www.codeproject.com/Articles/3365/Embed-an-HTML-control-in-your-own-window-using-pla here : https://msdn.microsoft.com/en-us/library/aa752046(v=vs.85).aspx and here : http://www.codeproject.com/Articles/3919/Using-the-WebBrowser-control-simplified

Example code:

#include <windows.h>
#include <string>
#include <stdio.h>

using std::string;

#pragma comment(lib,"ws2_32.lib")


// Not referenced by any code visible in this file.
HINSTANCE hInst;
// Winsock startup data, filled in by the WSAStartup() call in main().
WSADATA wsaData;
// Splits a URL into host name, request path (starting at the first '/'), and the text after the last '/'.
void mParseUrl(char *mUrl, string &serverName, string &filepath, string &filename);
// Opens a TCP connection to the named host (or dotted IPv4 address) on the given port.
SOCKET connectToServer(char *szServerName, WORD portNum);
// Returns the offset just past the header terminator found in content, or -1 when none is found.
int getHeaderLength(char *content);
// Fetches szUrl with an HTTP GET; returns the body and reports its length and a copy of the headers.
char *readUrl2(char *szUrl, long &bytesReturnedOut, char **headerOut);


/**
 * Downloads one fixed URL, echoes the body to stdout and saves it to
 * "downloaded.file" in the current directory.
 *
 * @return 0 on success, -1 if Winsock cannot be started, 1 if the output
 *         file cannot be created or fully written.
 */
int main()
{
    // Writable array: readUrl2() takes a non-const char*, and a string
    // literal must not be bound to one.
    char szUrl[] = "http://worldrecipes.expo2015.org/it/ricette/q-torte_forno_statico.html";
    long fileSize = 0;
    char *memBuffer = NULL;
    char *headerBuffer = NULL;
    int exitCode = 0;

    if ( WSAStartup(0x101, &wsaData) != 0)
        return -1;

    memBuffer = readUrl2(szUrl, fileSize, &headerBuffer);
    printf("returned from readUrl\n");
    // Passing a null pointer to %s is undefined, so substitute an empty string.
    printf("data returned:\n%s", memBuffer != NULL ? memBuffer : "");
    if (memBuffer != NULL && fileSize > 0)
    {
        printf("Got some data\n");
        FILE *fp = fopen("downloaded.file", "wb");
        if (fp == NULL)
        {
            perror("downloaded.file");
            exitCode = 1;
        }
        else
        {
            const size_t wanted = static_cast<size_t>(fileSize);
            if (fwrite(memBuffer, 1, wanted, fp) != wanted)
            {
                fprintf(stderr, "short write to downloaded.file\n");
                exitCode = 1;
            }
            fclose(fp);
        }
//        SetDlgItemText(hwndDlg, IDC_EDIT4, headerBuffer);
//        SetDlgItemText(hwndDlg, IDC_EDIT5, memBuffer);
    }

    // Both buffers come from new char[...] in readUrl2(), so they need the
    // array form of delete. They are released even when no data arrived;
    // delete[] on a null pointer is a no-op.
    delete[] memBuffer;
    delete[] headerBuffer;

    WSACleanup();
    return exitCode;
}


/**
 * Splits a URL into the pieces needed to issue an HTTP request.
 *
 * A leading "http://" and/or "https://" is removed. Everything up to the
 * first '/' becomes the server name; the remainder (including the '/') is
 * the request path. When there is no '/', the path defaults to "/".
 *
 * @param mUrl        NUL-terminated URL; NULL is treated as an empty URL.
 * @param serverName  Receives the host part (may include ":port" if present).
 * @param filepath    Receives the request target, query string included.
 * @param filename    Receives the last path segment, ignoring any query
 *                    string or fragment; empty when the path ends in '/'.
 */
void mParseUrl(char *mUrl, std::string &serverName, std::string &filepath, std::string &filename)
{
    // Constructing std::string from a null pointer is undefined behaviour.
    std::string url = mUrl ? mUrl : "";

    // compare() checks the prefix without building a temporary substring.
    if (url.compare(0, 7, "http://") == 0)
        url.erase(0, 7);

    if (url.compare(0, 8, "https://") == 0)
        url.erase(0, 8);

    const std::string::size_type firstSlash = url.find('/');
    if (firstSlash == std::string::npos)
    {
        serverName = url;
        filepath = "/";
        filename = "";
        return;
    }

    serverName = url.substr(0, firstSlash);
    filepath = url.substr(firstSlash);

    // Only the path component names the file: a '/' inside "?query" or
    // "#fragment" must not be mistaken for a path separator.
    const std::string pathOnly = filepath.substr(0, filepath.find_first_of("?#"));
    // pathOnly always starts with '/', so rfind() cannot return npos here.
    filename = pathOnly.substr(pathOnly.rfind('/') + 1);
}

/**
 * Opens a TCP/IPv4 connection to a host.
 *
 * @param szServerName  Host name or dotted-quad IPv4 address; NULL fails.
 * @param portNum       TCP port in host byte order.
 * @return The connected socket, or 0 on any failure (bad name, socket
 *         creation failure, connect failure). The 0 sentinel, rather than
 *         INVALID_SOCKET, is what this function has always returned and is
 *         kept so existing callers keep working.
 */
SOCKET connectToServer(char *szServerName, WORD portNum)
{
    if (szServerName == NULL)
        return 0;

    // Zero-initialise so sin_zero and any padding are well defined.
    struct sockaddr_in server = {};
    server.sin_family = AF_INET;
    server.sin_port = htons(portNum);

    const unsigned long literalAddr = inet_addr(szServerName);
    if (literalAddr != INADDR_NONE)
    {
        // A numeric address is used directly. A reverse lookup with
        // gethostbyaddr() would fail for valid hosts without a PTR record.
        server.sin_addr.s_addr = literalAddr;
    }
    else
    {
        struct hostent *hp = gethostbyname(szServerName);
        if (hp == NULL || hp->h_addrtype != AF_INET || hp->h_addr_list[0] == NULL)
            return 0;
        memcpy(&server.sin_addr, hp->h_addr_list[0], sizeof(server.sin_addr));
    }

    // The socket is created only after resolution succeeds, so the failure
    // paths above have nothing to clean up.
    SOCKET conn = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);
    if (conn == INVALID_SOCKET)
        return 0;

    if (connect(conn, reinterpret_cast<struct sockaddr*>(&server), sizeof(server)) != 0)
    {
        closesocket(conn);
        return 0;
    }
    return conn;
}

/**
 * Finds where the HTTP header section ends in a NUL-terminated response.
 *
 * @param content  NUL-terminated response text; may be NULL.
 * @return Offset of the first byte after the header terminator (that is,
 *         the header length including the terminator), or -1 when content
 *         is NULL or contains no terminator.
 */
int getHeaderLength(char *content)
{
    // Tried in priority order. "\r\n\r\n" is the standard terminator.
    // "\n\r\n\r" is the fallback this function has always accepted.
    // "\n\n" covers servers that end lines with a bare LF; it is tried last
    // so inputs that already matched keep returning the same offset.
    static const char *const terminators[] = { "\r\n\r\n", "\n\r\n\r", "\n\n" };
    const size_t terminatorCount = sizeof(terminators) / sizeof(terminators[0]);

    if (content == NULL)
        return -1;

    for (size_t i = 0; i < terminatorCount; ++i)
    {
        const char *hit = strstr(content, terminators[i]);
        if (hit != NULL)
            return static_cast<int>((hit - content) + strlen(terminators[i]));
    }
    return -1;
}

char *readUrl2(char *szUrl, long &bytesReturnedOut, char **headerOut)
{
    const int bufSize = 512;
    char readBuffer[bufSize], sendBuffer[bufSize], tmpBuffer[bufSize];
    char *tmpResult=NULL, *result;
    SOCKET conn;
    string server, filepath, filename;
    long totalBytesRead, thisReadSize, headerLen;

    mParseUrl(szUrl, server, filepath, filename);

    ///////////// step 1, connect //////////////////////
    conn = connectToServer((char*)server.c_str(), 80);

    ///////////// step 2, send GET request /////////////
    sprintf(tmpBuffer, "GET %s HTTP/1.0", filepath.c_str());
    strcpy(sendBuffer, tmpBuffer);
    strcat(sendBuffer, "\r\n");
    sprintf(tmpBuffer, "Host: %s", server.c_str());
    strcat(sendBuffer, tmpBuffer);
    strcat(sendBuffer, "\r\n");
    strcat(sendBuffer, "\r\n");
    send(conn, sendBuffer, strlen(sendBuffer), 0);

//    SetWindowText(edit3Hwnd, sendBuffer);
    printf("Buffer being sent:\n%s", sendBuffer);

    ///////////// step 3 - get received bytes ////////////////
    // Receive until the peer closes the connection
    totalBytesRead = 0;
    while(1)
    {
        memset(readBuffer, 0, bufSize);
        thisReadSize = recv (conn, readBuffer, bufSize, 0);

        if ( thisReadSize <= 0 )
            break;

        tmpResult = (char*)realloc(tmpResult, thisReadSize+totalBytesRead);

        memcpy(tmpResult+totalBytesRead, readBuffer, thisReadSize);
        totalBytesRead += thisReadSize;
    }

    headerLen = getHeaderLength(tmpResult);
    long contenLen = totalBytesRead-headerLen;
    result = new char[contenLen+1];
    memcpy(result, tmpResult+headerLen, contenLen);
    result[contenLen] = 0x0;
    char *myTmp;

    myTmp = new char[headerLen+1];
    strncpy(myTmp, tmpResult, headerLen);
    myTmp[headerLen] = NULL;
    delete(tmpResult);
    *headerOut = myTmp;

    bytesReturnedOut = contenLen;
    closesocket(conn);
    return(result);
}

enter image description here

Software_Designer
  • 8,490
  • 3
  • 24
  • 28
  • Subpages work just fine, just replace `GET /` with `GET /desiredpage` as needed. 301 is an HTTP redirect. You have to extract the `Location` value and send a new request to the specified URL. – Remy Lebeau Apr 03 '16 at 14:45
  • Also, this code suffers from the same buffer overflow error as the OP's code. The `while` loop needs an `i < nDataLength` condition. And the code is not doing any error handling on most of the functions it is calling. – Remy Lebeau Apr 03 '16 at 14:47