
Is there a limit to how far I can oversubscribe the cores in MPI?

Here is my previous post: Does the number of processes in MPI have a limit?

Today I ran another program from the book "Using MPI". It works with 52 processes but hangs with 53 processes.

My laptop has 4 cores with hyperthreading and 8 GB of RAM. The MPI version is Open MPI 1.4.3.

Thanks.

Here is the code:

#include <stdio.h>
#include <mpi.h>

#define ICTAG 0
#define SERVER_RANK 0
typedef enum { REQUEST, VALUE, GOAWAY } nxtval_msgtype;

/*
int MPI_Recv(void *buf, int count, MPI_Datatype datatype, int source, int tag, MPI_Comm comm, MPI_Status *status)
{
    int ret;
//  printf("Before MPI_Recv. tag=%d\n", tag);
    ret=PMPI_Recv(buf, count, datatype, source, tag, comm, status);
    printf("After  MPI_Recv. count=%d tag=%d source=%d\n", count, status->MPI_TAG, status->MPI_SOURCE);
    fflush(stdout);
    return ret;
}
*/
int MPI_Intercomm_create(MPI_Comm local_comm, int local_leader, MPI_Comm peer_comm, int remote_leader, int tag, MPI_Comm *newintercomm)
{
    int ret;
    printf("Before MPI_Intercomm_create\n");
    fflush(stdout);
    ret=PMPI_Intercomm_create(local_comm, local_leader, peer_comm, remote_leader, tag, newintercomm);
    printf("After  MPI_Intercomm_create\n");
    fflush(stdout);
    return ret;
}

int MPE_Counter_create_ic(MPI_Comm oldcomm, MPI_Comm *smaller_comm, MPI_Comm *counter_comm)
{
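    /* The highest rank in oldcomm acts as the counter server; the other
       ranks become workers.  Workers get their own communicator back in
       *smaller_comm, and *counter_comm is an intercommunicator that links
       the workers to the server. */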
    int counter=0, message, done=0, myid, numprocs, server;
    int color, remote_leader_rank;
    MPI_Status status;
    MPI_Comm oldcommdup, splitcomm;

    MPI_Comm_dup(oldcomm, &oldcommdup);
    MPI_Comm_size(oldcommdup, &numprocs);
    MPI_Comm_rank(oldcommdup, &myid);
    server=numprocs-1;
    color=(myid==server);
    MPI_Comm_split(oldcomm, color, myid, &splitcomm);
    if(!color)
    {
        remote_leader_rank=server;
        *smaller_comm=splitcomm;
    }
    else
        remote_leader_rank=0;
    MPI_Intercomm_create(splitcomm, 0, oldcommdup, remote_leader_rank, ICTAG, counter_comm);
    MPI_Comm_free(&oldcommdup);

    if(myid==server)
    {
        while(!done)
        {
            MPI_Recv(NULL, 0, MPI_INT, MPI_ANY_SOURCE, MPI_ANY_TAG, *counter_comm, &status);
            if(status.MPI_TAG==REQUEST)
            {
                MPI_Send(&counter, 1, MPI_INT, status.MPI_SOURCE, VALUE, *counter_comm);
                ++counter;
            }
            else if(status.MPI_TAG==GOAWAY)
                done=1;
            else
            {
                fprintf(stderr, "bad tag %d sent to MPE counter\n", status.MPI_TAG);
                MPI_Abort(*counter_comm, 1);
            }
        }
    }
    return 0;
}

int MPE_Counter_nxtval_ic(MPI_Comm counter_comm, int *value)
{
    MPI_Status status;
    MPI_Send(NULL, 0, MPI_INT, SERVER_RANK, REQUEST, counter_comm);
    MPI_Recv(value, 1, MPI_INT, SERVER_RANK, VALUE, counter_comm, &status);
    return 0;
}

int MPE_Counter_free_ic(MPI_Comm *smaller_comm, MPI_Comm *counter_comm)
{
    int myid;
    MPI_Comm_rank(*smaller_comm, &myid);
    MPI_Barrier(*smaller_comm);
    if(myid==0)
        MPI_Send(NULL, 0, MPI_INT, SERVER_RANK, GOAWAY, *counter_comm);
    MPI_Comm_free(counter_comm);
    MPI_Comm_free(smaller_comm);
    return 0;
}

int main(int argc, char **argv) 
{
    int size, myid;
    MPI_Comm counter_comm, worker_comm;
    MPI_Init( &argc, &argv );
    MPE_Counter_create_ic( MPI_COMM_WORLD, &worker_comm, &counter_comm );
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &myid);
    if(myid!=size-1) {
        /* I am one of the workers; the server doesn't exit from
        MPE_Counter_create_ic until MPE_Counter_free_ic is called */
        int value, rank;
        MPI_Comm_rank( counter_comm, &rank );
        MPE_Counter_nxtval_ic( counter_comm, &value );
        printf( "[%d] received value %d\n", rank, value );
        MPE_Counter_free_ic( &worker_comm, &counter_comm );
    }
    MPI_Finalize();
    return 0;
}
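
I build it with the MPI compiler wrapper, roughly like this (the source file name nxtval_ic.c is an assumption):

# nxtval_ic.c is an assumed file name
mpicc -o nxtval_ic nxtval_ic.c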

Here is a case where it works:

shuang@phoebe:~/usingMPI$ mpiexec -n 8 ./nxtval_ic
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[0] received value 3
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[1] received value 1
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[2] received value 2
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[3] received value 5
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[4] received value 6
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[5] received value 0
Before MPI_Intercomm_create
After  MPI_Intercomm_create
[6] received value 4
Before MPI_Intercomm_create
After  MPI_Intercomm_create

Here is a case where it hangs:

shuang@phoebe:~/usingMPI$ mpiexec -n 100 ./nxtval_ic
Before MPI_Intercomm_create
Before MPI_Intercomm_create
Before MPI_Intercomm_create
Before MPI_Intercomm_create
[... "Before MPI_Intercomm_create" is printed 100 times in total, once per process, but no "After  MPI_Intercomm_create" line ever appears ...]
^Cmpiexec: killing job...

--------------------------------------------------------------------------
mpiexec was unable to cleanly terminate the daemons on the nodes shown
below. Additional manual cleanup may be required - please refer to
the "orte-clean" tool for assistance.
--------------------------------------------------------------------------
    It wouldn't surprise me to learn that an MPI implementation failed to cope with this level of oversubscription; it is well outside what most of us generally do. It's possible that having so many processes in flight leads to timeouts on message transfers, it's possible that internal buffers are overwhelmed, it's possible that a routine kicks in which says *get outa here, 53 processes on 4 cores, hyper-threading be damned*; lots of things are possible. I'm struggling to see this as a problem. – High Performance Mark Aug 13 '14 at 10:18
    Be careful running in oversubscribed mode - in older (particularly very old, like 1.4.x) OpenMPI, you'll want to add the option `-mca mpi_yield_when_idle 1` as per [this FAQ entry](http://www.open-mpi.org/faq/?category=running#oversubscribing). Normally that would just cause a performance problem, but with all those barriers I could easily see lockups occurring. – Jonathan Dursi Aug 13 '14 at 12:01
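
For reference, enabling that option on an oversubscribed run would look something like this (illustrative command, following the FAQ entry linked in the comment above):

# illustrative: pass the MCA parameter suggested in the FAQ for oversubscribed runs
mpiexec -mca mpi_yield_when_idle 1 -n 100 ./nxtval_ic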

1 Answer

There is no limit to the number of MPI processes that can exist from the standpoint of the MPI standard.

Your MPI implementation may have limitations, and if your implementation chooses to map MPI processes to OS processes (as is common and done by e.g. MPICH and OpenMPI), then you may also run into an upper limit on the number of (OS) processes your OS can support.

See Maximum number of processes in linux for how you might determine if OS processes are an issue.
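
For example, one quick way to inspect the relevant limits on Linux (a sketch, not taken from the linked question) is:

# per-user process limit and kernel-wide limits (illustrative commands)
ulimit -u
cat /proc/sys/kernel/pid_max
cat /proc/sys/kernel/threads-max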

In practice, I find that running more than ~50 MPI processes on my dual-core, four-hardware-thread laptop is untenable, but I have never bothered to figure out what limits this.

– Jeff Hammond