
I am new to Trilinos. I wrote a program that reads data from a file and constructs a sparse matrix from it. The global IDs can be very large, exceeding the 32-bit integer range. When I change the IDs to smaller values, everything works. If I run with one process, i.e. mpiexec -np 1 ./myprogram, it is also OK. However, with multiple processes it crashes.

The data is laid out like this: each line in the file represents one row of the matrix. Within a line, the first value is the row ID and the second value is the number of entries in that row. After that come the (column index, value) pairs. Example file (small IDs):

   2000791 3 2000791 0.5 1000791 0.5 3000791 1.0
   1000791 2 1000791 0.5 2000791 0.5
   3000791 2 3000791 0.5 1000791 0.5
   3000792 2 3000791 0.5 1000791 0.5

Example file (large IDs):

   2000000000000791 3 2000000000000791 0.5 1000000000000791 0.5 3000000000000791 1.0
   1000000000000791 2 1000000000000791 0.5 2000000000000791 0.5
   3000000000000791 2 3000000000000791 0.5 1000000000000791 0.5
   3000000000000792 2 3000000000000791 0.5 1000000000000791 0.5

From gdb's output and Trilinos's source code, the error appears to come from Epetra_BlockMap::ConstructAutoUniform: "Error. Not enough space for elements on each processor."

The debug backtrace and my program's source code are attached below.

#0  0x00007ffff58b55c9 in raise () from /lib64/libc.so.6
#1  0x00007ffff58b6cd8 in abort () from /lib64/libc.so.6
#2  0x00007ffff61b99d5 in __gnu_cxx::__verbose_terminate_handler() ()
   from /lib64/libstdc++.so.6
#3  0x00007ffff61b7946 in ?? () from /lib64/libstdc++.so.6
#4  0x00007ffff61b7973 in std::terminate() () from /lib64/libstdc++.so.6
#5  0x00007ffff61b7b9f in __cxa_throw () from /lib64/libstdc++.so.6
#6  0x00000000004c6d2a in Epetra_BlockMap::ConstructAutoUniform (
    this=this@entry=0x85cf00, 
    NumGlobal_Elements=NumGlobal_Elements@entry=2000000000000002, 
    Element_Size=Element_Size@entry=1, 
    Index_Base=Index_Base@entry=1000000000000791, comm=..., 
    IsLongLong=IsLongLong@entry=true)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:81
#7  0x00000000004c708e in Epetra_BlockMap::Epetra_BlockMap (this=0x85cf00, 
    NumGlobal_Elements=2000000000000002, Element_Size=1, 
    Index_Base=1000000000000791, comm=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:124
#8  0x0000000000497de9 in Epetra_Map::Epetra_Map (this=0x85cf00, 
    numGlobalElements=<optimized out>, indexBase=<optimized out>, comm=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_Map.cpp:68
#9  0x00000000004c008f in Epetra_BasicDirectory::Generate<long long> (
    this=0x85cea0, Map=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:276
#10 0x00000000004bf4c5 in Epetra_BasicDirectory::Epetra_BasicDirectory (
    this=0x85cea0, Map=..., __in_chrg=<optimized out>, 
    __vtt_parm=<optimized out>)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BasicDirectory.cpp:121
#11 0x00000000004b1ea1 in Epetra_MpiComm::CreateDirectory (
    this=<optimized out>, map=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_MpiComm.cpp:243
#12 0x00000000004c5fcc in Epetra_BlockMap::RemoteIDList (
    this=this@entry=0x7fffffffddd0, NumIDs=NumIDs@entry=0, GIDList=0x0, 
    PIDList=0x0, LIDList=LIDList@entry=0x0, SizeList=0x0)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_BlockMap.cpp:1336
#13 0x00000000004d741b in Epetra_CrsGraph::MakeColMap_LL (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1664
#14 0x00000000004d81c9 in Epetra_CrsGraph::MakeColMap (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1764
#15 0x00000000004d83e7 in Epetra_CrsGraph::MakeIndicesLocal (
    this=this@entry=0x7fffffffddc0, domainMap=..., rangeMap=...)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsGraph.cpp:1784
#16 0x0000000000462bcb in Epetra_CrsMatrix::FillComplete (
    this=this@entry=0x7fffffffdd50, domain_map=..., range_map=..., 
    OptimizeDataStorage=OptimizeDataStorage@entry=true)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1159
#17 0x0000000000462e81 in Epetra_CrsMatrix::FillComplete (
    this=this@entry=0x7fffffffdd50, 
    OptimizeDataStorage=OptimizeDataStorage@entry=true)
    at /home/myusername/mpi/trilinos-12.0.1-Source/packages/epetra/src/Epetra_CrsMatrix.cpp:1142
#18 0x000000000045a02c in read_and_construct (part=2, total_parts=4)
    at /home/myusername/mpi/myprogram/main.cpp:99
#19 0x0000000000458905 in main (argc=1, argv=0x7fffffffdfe8)

Program source code:

#include <Epetra_config.h>

#ifdef HAVE_MPI
#  include <mpi.h>
#  include <Epetra_MpiComm.h>
#else
#  include <Epetra_SerialComm.h>
#endif // HAVE_MPI

#include <Epetra_Map.h>
#include <Epetra_CrsMatrix.h>
#include <Epetra_Vector.h>
#include <Epetra_Version.h>
#include <Epetra_DistObject.h>
#include <Epetra_Export.h>
#include <Epetra_Util.h>
#include <unistd.h>

#include <stdexcept>


//std libs
#include <cstdio>
#include <vector>


using namespace std;

typedef long long global_ordinal_type;



int pid;
int np;

const char *path = "/home/tianxiaochen01/matrix_small.txt";
typedef long long LL;
typedef long long *  T_LLP;


#ifdef HAVE_MPI
    Epetra_MpiComm * comm;
#else
    Epetra_SerialComm* comm;
#endif

// C style
void read_and_construct(int part,int total_parts){
    FILE * matrixfile;
    matrixfile = fopen(path,"r");
    int len = 0;
    long long src;
    vector< T_LLP > arrdst;
    vector< double * > arrvalue;
    vector< LL > myids;
    vector< int > lens;

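    // Read one matrix row per line: row GID, entry count, then (column GID, value) pairs.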
    while (fscanf(matrixfile, "%lld %d", &src, &len) == 2) {
        T_LLP dsts = new LL [ len ];
        double * values = new double [ len ];
        long long dst;
        double value;
        for (int i=0;i<len;i++){
            fscanf(matrixfile,"%lld %lf",&dst,&value);
            dsts[i] = dst;
            values[i] = value;
        }
        if ( src  % (LL)total_parts == (LL)part  ) {//is my part
            myids.push_back(src);
            lens.push_back(len);
            arrdst.push_back(dsts );
            arrvalue.push_back(values);
        }
        else {
            delete [] dsts;
            delete [] values;
        }
    }

    fclose(matrixfile);

    T_LLP arrmap = new LL [ myids.size() ];
    for (size_t i = 0; i < myids.size(); i++) {
        arrmap[i] = myids[i];
    }
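    // Build a row map from the locally owned GIDs; numGlobalElements = -1 asks Epetra to compute the global count.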
    Epetra_Map map((LL)-1, (int)myids.size(), arrmap ,(LL)0, *comm );

    Epetra_Vector v1(map);
    int avg_col_size = 1000;


    Epetra_CrsMatrix M(Copy,map,avg_col_size);
    //insert values into matrix
    for (size_t i = 0; i < myids.size(); i++) {
        // (long long GlobalRow, int NumEntries, const double *Values, const long long *Indices)
        int e = M.InsertGlobalValues(myids[i],lens[i],arrvalue[i],arrdst[i]);
    }

    try
    {
        M.FillComplete();
    } catch (const Epetra_Object& ex) {
        cout<<"ERROR"<<endl;
        cout<<ex<<endl;
    }
    cout<<M<<endl;

}

void init(const Epetra_Comm& comm){
    pid = comm.MyPID();
    np = comm.NumProc();

}


int
main (int argc, char *argv[])
{
  using std::cout;
  using std::endl;

#ifdef HAVE_MPI
  MPI_Init (&argc, &argv);
  comm = new Epetra_MpiComm (MPI_COMM_WORLD);
  init(*comm);
#else
  comm = new Epetra_SerialComm;
  pid = 0;
  np = 1;
#endif // HAVE_MPI

    read_and_construct(pid,np);

#ifdef HAVE_MPI
  (void) MPI_Finalize ();
#endif // HAVE_MPI

  return 0;
}

Trilinos version: 12.0, MPI: MPICH

worldterminator

1 Answer


It sounds like two things. One, MPI_INT is a regular C int, and on nearly every platform that's 32 bits. So if giant identifiers are allowed, Trilinos is going to have to send them around with the newer (MPI-2, section 10.2.5) MPI_INT64_T type.
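As a rough sketch (this is generic MPI, not Trilinos' actual communication code, and send_gids is a made-up helper name), the point is the datatype choice:

    #include <mpi.h>
    #include <cstdint>

    // Hypothetical helper: ship 64-bit global IDs with MPI_INT64_T rather than MPI_INT.
    void send_gids(const int64_t *gids, int count, int dest, MPI_Comm comm) {
        // MPI_INT would silently truncate on platforms where int is 32 bits;
        // MPI_INT64_T carries the full 64-bit range.
        MPI_Send(gids, count, MPI_INT64_T, dest, /* tag = */ 0, comm);
    }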

Second, maybe Trilinos is sending these around OK, but when you see "Error. Not enough space for elements on each processor.", that suggests Trilinos is allocating a dense array to hold these large values, and your processes are running out of memory. The single-processor case probably works because there are no intermediate nodes to worry about.
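A quick sanity check against the backtrace above (just arithmetic on the posted numbers, not a claim about Trilinos internals): the GID span of the large-ID file matches NumGlobal_Elements in frame #6 exactly, which fits the picture of something being sized by the GID range rather than by the number of rows.

    #include <cstdio>

    int main() {
        // Smallest and largest GIDs in the large-ID example file.
        long long min_gid = 1000000000000791LL;
        long long max_gid = 3000000000000792LL;
        // Prints 2000000000000002, the NumGlobal_Elements seen in frame #6.
        std::printf("%lld\n", max_gid - min_gid + 1);
        return 0;
    }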

Rob Latham