4

This issue have bothered me for weeks and I could not find any solution on the web. So I have to create a new question to you gurus.

I was trying to read/write on massive number of sockets, please see test code below. It behave normally when the sockets number is below 1500. When the number of sockets is beyond 1500, the program will crash unexpectedly. I know that I should use command ulimit -n 32768 to increase the open files number limit. But the program still can not behave correctly.

#include <unistd.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/socket.h>
#include <arpa/inet.h>
#include <stdint.h>
#include <netdb.h>
#include <errno.h>
#include <malloc.h>
#include <string.h>

int main(int argc, char* argv[])
{
    if (argc!=2)
    {
        printf("usage: test <number of sockets>\n");
        return -1;
    }

    int socketsNum=atoi(argv[1]);
    if (socketsNum<=0)
    {
        printf("error: invalid sockets number\n");
        return -1;
    }

    int *socketHandles=(int*)malloc(sizeof(int)*socketsNum);
    if (socketHandles==NULL)
    {
        printf("error: failed to alloc socket handle memory\n");
        return -1;
    }

    for (int i=0;i<socketsNum;i++)
    {
        socketHandles[i]=-1;
    }


    printf("creating %d sockets ...\n",socketsNum);
    int createdSocketsNum=0;
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socket(AF_INET,SOCK_DGRAM,IPPROTO_UDP);
        if (socketHandle==-1)
        {
            int lastError=errno;
            printf("warning: socket() failed: index: %d, error: %d\n",i+1,lastError);
            continue;
        }

        sockaddr_in sockAddr; // 0.0.0.0:0
        memset(&sockAddr,0,sizeof(sockAddr));
        sockAddr.sin_family = AF_INET;
        sockAddr.sin_addr.s_addr = htonl(INADDR_ANY);
        sockAddr.sin_port = htons(0);

        if (bind( socketHandle, (sockaddr*) &sockAddr, sizeof(sockAddr)) == -1)
        {
            int lastError=errno;
            printf("warning: bind() failed: index: %d, error: %d\n",i+1,lastError);
            close(socketHandle);
            continue;
        }
        socketHandles[i]=socketHandle;
        createdSocketsNum++;
    }

    printf("created %d sockets.\n",createdSocketsNum);

    //test reading;
    printf("testing reading ...\n");
    int readableNumber=0;
    int unreadableNumber=0;
    int readingSkippedNumber=0;
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socketHandles[i];
        if (socketHandle==-1)
        {
            readingSkippedNumber++;
            continue;
        }

        fd_set rset;
        FD_ZERO(&rset);
        FD_SET(socketHandle, &rset);
        struct timeval timeout = {0, 0};
        int retCode=select(socketHandle + 1, &rset, NULL, NULL, &timeout);
        if (retCode==-1)
        {
            int lastError=errno;
            printf("warning: select() failed: index: %d, error: %d\n",i+1,lastError);
        }
        else if (retCode==0)
        {
            unreadableNumber++;
        }
        else
        {
            readableNumber++;
        }
    }
    printf("readable: %d, unreadable: %d, skipped: %d, total: %d\n",readableNumber,unreadableNumber,readingSkippedNumber,socketsNum);

    //test writing
    printf("testing writing ...\n");
    int writableNumber=0;
    int unwritableNumber=0;
    int writingSkippedNumber=0;
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socketHandles[i];
        if (socketHandle==-1)
        {
            writingSkippedNumber++;
            continue;
        }
        fd_set wset;
        FD_ZERO(&wset);
        FD_SET(socketHandle, &wset);
        struct timeval timeout = {0, 0};
        int retCode=select(socketHandle + 1, NULL, &wset, NULL, &timeout);
        if (retCode==-1)
        {
            int lastError=errno;
            printf("warning: select() failed: index: %d, error: %d\n",i+1,lastError);
        }
        else if (retCode==0)
        {
            unwritableNumber++;
        }
        else
        {
            writableNumber++;
        }
    }
    printf("writable: %d, unwritable: %d, skipped: %d, total: %d\n",writableNumber,unwritableNumber,writingSkippedNumber,socketsNum);

    printf("closing ...\n");
    for (int i=0;i<socketsNum;i++)
    {
        int socketHandle=socketHandles[i];
        if (socketHandle==-1)
        {
            continue;
        }
        close(socketHandle);
    }
    free(socketHandles);
    printf("completed!\n");
    return 0;
}

Compile:

g++ TestSockets.cpp -ldl -g -ggdb -o TestSockets

Config:

ulimit -n 32768

Some typical results:

  1. Good result of ./TestSockets 1500:

    creating 1500 sockets ...
    created 1500 sockets.
    testing reading ...
    readable: 0, unreadable: 1500, skipped: 0, total: 1500
    testing writing ...
    writable: 1372, unwritable: 128, skipped: 0, total: 1500
    closing ...
    completed!
    
  2. Bad result of ./TestSockets 1900:

    creating 1900 sockets ...
    created 1900 sockets.
    testing reading ...
    warning: select() failed: index: 1797, error: 9
    ...(more lines trimmed)
    warning: select() failed: index: 1820, error: 9
    warning: select() failed: index: 1821, error: 22
    readable: 0, unreadable: 1878, skipped: 0, total: 1900
    testing writing ...
    warning: select() failed: index: 1641, error: 9
    ...(more lines trimmed)
    warning: select() failed: index: 1660, error: 9
    warning: select() failed: index: 1661, error: 22
    writable: 1751, unwritable: 128, skipped: 0, total: 1900
    closing ...
    completed!
    

    Comment: because 1900>1751+128, it seems that the stack was damaged.

  3. Bad result of ./TestSockets 2000:

    creating 2000 sockets ...
    created 2000 sockets.
    testing reading ...
    Segmentation fault
    

More Investigation:

According to gdb information. It seems that the stack memory was damaged during running:

    creating 2000 sockets ...
    created 2000 sockets.
    testing reading ...

    Program received signal SIGSEGV, Segmentation fault.
    0x08048b79 in main (argc=2, argv=0xffffd3b4) at TestSockets.cpp:78
    78          int socketHandle=socketHandles[i];
    (gdb) print socketHandles
    $1 = (int *) 0x0
    (gdb) info local
    socketHandle = 0
    rset = {fds_bits = {0 <repeats 32 times>}}
    timeout = {tv_sec = 0, tv_usec = 0}
    retCode = 0
    i = 1601
    socketsNum = 2000
    unreadableNumber = 1601
    unwritableNumber = 134514249
    socketHandles = 0x0
    createdSocketsNum = 2000
    readableNumber = 0
    readingSkippedNumber = 0
    writableNumber = -136436764
    writingSkippedNumber = 0
    (gdb) info stack
    #0  0x08048b79 in main (argc=2, argv=0xffffd3b4) at TestSockets.cpp:78
Tony
  • 43
  • 4
  • That looks like C but you're compiling it like C++. Could you pick one? – Biffen Jan 26 '16 at 07:58
  • 1
    Have you tried to run in a debugger to catch the crash in action? This will help you locate where it happens (in your code), as well as let you examine values of involved variables to make sure they are okay. – Some programmer dude Jan 26 '16 at 07:59
  • I tried but I didn't find any help information. (gdb) set args 2000 (gdb) run Starting program: ./TestSockets 2000 warning: Could not load shared library symbols for linux-gate.so.1. Do you need "set solib-search-path" or "set sysroot"? creating 2000 sockets ... created 2000 sockets. testing reading ... Program received signal SIGSEGV, Segmentation fault. 0x08048b79 in main () (gdb) info stack #0 0x08048b79 in main () (gdb) info local No symbol table info available. – Tony Jan 26 '16 at 08:02
  • 1
    Also, try using a memory-degger tool such as [Valgrind](http://valgrind.org/), it will help you find buffer overruns and such. – Some programmer dude Jan 26 '16 at 08:03
  • 2
    Regarding your debugger use, try building with *debug info*, add the `-g` flag when building. Then get a *stack trace* when the crash happens (the `bt` command) to see where in *your* code the crash happens (if the debugger doesn't stop there already). Having debug-info is crucial for Valgrind to be able to report the exact location of problems it finds. And please, *edit your question* to add crucial information, like stack-traces or location of the crash. – Some programmer dude Jan 26 '16 at 08:05
  • You clearly have something wrong, as GDB reports `socketHandles` being a null pointer. You should *really* try Valgrind. – Some programmer dude Jan 26 '16 at 08:33
  • Thank you Joachim. I added some debug information. But I am still confused why the stack seems damaged because the socketHandles was changed to NULL when the Segmentation Fault occured. – Tony Jan 26 '16 at 08:35
  • 1
    I was browsing other similar questions and came across a comment that I think is directly relevant to you: http://stackoverflow.com/questions/7976388/increasing-limit-of-fd-setsize-and-select#comment30822679_14929259 – jas Jan 26 '16 at 08:41
  • I know that, but I don't know why. I will try Valgrind as you recommended. – Tony Jan 26 '16 at 08:41
  • @jas, I was calling select() on each socket one by one. So changing FD_SETSIZE has no effect. I tried that way. – Tony Jan 26 '16 at 08:49
  • 1
    http://www.delorie.com/gnu/docs/glibc/libc_248.html states that "if you get a file descriptor with a value as high as FD_SETSIZE, you cannot put that descriptor into an fd_set." It's a bit frustrating that it crashes on select() with a mem corruption without any decent error state. – Oncaphillis Jan 26 '16 at 08:54

2 Answers2

4

An fd_set is limited by the maximum value of the file descriptor (not the number of file descriptors set at the same time). Usually it's 1024.

Thus, if your socket value is greater than 1023, you cannot use select on it at all.

Redefining FD_SETSIZE is not supported on operating systems I know. You might be able to successfully redefine fd_set in your program, but select will only work up to FD_SETSIZE.

n. m. could be an AI
  • 112,515
  • 14
  • 128
  • 243
  • I can confirm! On my FreeBSD box I got a seg fault at 1500, but after defining FS_SETSIZE at 2048 I could run successfully at 2000. – Serge Ballesta Jan 26 '16 at 09:08
  • What would a workaround be for an application that does need to use more sockets than that? – M.M Jan 27 '16 at 01:19
  • @M.M, please see my reply. It is a workaround example. – Tony Jan 27 '16 at 01:43
  • 1
    @Tony I think your answer is not guaranteed to work. (Maybe it worked for you on your system but it may not work for others or on other systems) – M.M Jan 27 '16 at 01:56
0

I have solved this headache problem. The fd_set on windows and Linux are totally different. On Linux if socket handle VALUE is bigger than FD_SETSIZE, there will be overrun issue on Linux version FD_SET macro. I make a workaround to alloc enough buffer for fd_set on Linux. such as,

char rsetBuffer[10240];
memset(rsetBuffer,0,10240);

fd_set& rset=(fd_set&)rsetBuffer;
FD_ZERO(&rset);
FD_SET(socketHandle, &rset);

p.s. Definition of fd_set struct and FD_SET macro on windows and Linux:

on windows:

typedef struct fd_set {
    u_int fd_count;               /* how many are SET? */
    SOCKET  fd_array[FD_SETSIZE];   /* an array of SOCKETs */
} fd_set;


#define FD_SET(fd, set) do { \
    u_int __i; \
    for (__i = 0; __i < ((fd_set FAR *)(set))->fd_count; __i++) { \
    if (((fd_set FAR *)(set))->fd_array[__i] == (fd)) { \
        break; \
    } \
    } \
    if (__i == ((fd_set FAR *)(set))->fd_count) { \
    if (((fd_set FAR *)(set))->fd_count < FD_SETSIZE) { \
        ((fd_set FAR *)(set))->fd_array[__i] = (fd); \
        ((fd_set FAR *)(set))->fd_count++; \
    } \
    } \
} while(0)

on Linux:

/* fd_set for select and pselect.  */
typedef struct
  {
    /* XPG4.2 requires this member name.  Otherwise avoid the name
       from the global namespace.  */
#ifdef __USE_XOPEN
    __fd_mask fds_bits[__FD_SETSIZE / __NFDBITS];
# define __FDS_BITS(set) ((set)->fds_bits)
#else
    __fd_mask __fds_bits[__FD_SETSIZE / __NFDBITS];
# define __FDS_BITS(set) ((set)->__fds_bits)
#endif
  } fd_set;

#define __FD_SET(d, set) \
  ((void) (__FDS_BITS (set)[__FD_ELT (d)] |= __FD_MASK (d)))
#define __FD_CLR(d, set) \
  ((void) (__FDS_BITS (set)[__FD_ELT (d)] &= ~__FD_MASK (d)))
#define __FD_ISSET(d, set) \
  ((__FDS_BITS (set)[__FD_ELT (d)] & __FD_MASK (d)) != 0)
Tony
  • 43
  • 4
  • 1
    This is not a proper solution. It will not crash but it will not work either. I have wrote why in my answer. I have learned it the hard way. If you want to learn it the hard way too, you are welcome. – n. m. could be an AI Jan 27 '16 at 05:01