
I am using libzmq 4.2.5 on CentOS 7 and seeing very high latency when messages are sent from a DEALER to a ROUTER, and even from a ROUTER to a DEALER. For comparison, I wrote a simple client-server program using raw TCP and sent messages between the two. Raw TCP turns out to be much faster.

Sending a single byte from DEALER to ROUTER, ZeroMQ takes 900 microseconds.

Sending a single byte from client to server, raw TCP takes 150 microseconds.

What am I doing wrong? I thought ZeroMQ would be at least as fast as TCP. Is there any tuning I can do to make ZeroMQ faster?

Update

router.cpp

    #include <zmq.hpp>
    #include <cstdio>
    #include <cstring>
    #include <cerrno>
    #include <ctime>

    struct data
    {
        char one[21];
        unsigned long two;
    };

    data *pdata;
    std::size_t counter = 0;

    int main()
    {
        zmq::context_t context(1);
        zmq::socket_t Device(context, ZMQ_ROUTER);

        int iHighWaterMark = 0;

        Device.setsockopt(ZMQ_SNDHWM, &iHighWaterMark, sizeof(int));
        Device.setsockopt(ZMQ_RCVHWM, &iHighWaterMark, sizeof(int));

        Device.bind("tcp://0.0.0.0:5555");

        pdata = new data[10000];

        struct timespec ts_dtime;

        zmq::message_t message;

        zmq::pollitem_t arrPollItems[] = { { Device, 0, ZMQ_POLLIN, 0 },
                                           { NULL,   0, ZMQ_POLLIN, 0 } };

        while (counter < 10000)
        {
            try
            {
                int iAssert = zmq::poll(arrPollItems, 1, -1);
                if (iAssert <= 0)
                {
                    if (-1 == iAssert)
                    {
                        printf("zmq_poll failed errno: %d error:%s", errno,
                               zmq_strerror(errno));
                    }
                    continue;
                }

                if (arrPollItems[0].revents == ZMQ_POLLIN)
                {
                    while (true)
                    {
                        // ROUTER delivers [identity][payload]; the first recv()
                        // fetches the identity frame (non-blocking)
                        if (!Device.recv(&message, ZMQ_DONTWAIT))
                            break;

                        // the second recv() fetches the payload frame
                        Device.recv(&message);

                        strncpy(pdata[counter].one,
                                (char *)message.data(), message.size());
                        clock_gettime(CLOCK_REALTIME, &ts_dtime);
                        pdata[counter].two = (ts_dtime.tv_sec * 1e9)
                                             + ts_dtime.tv_nsec;
                        ++counter;
                    }
                }
            }
            catch (...)
            {
            }
        }

        for (std::size_t i = 0; i < counter; ++i)
            printf("%zu %s %lu\n", i + 1, pdata[i].one, pdata[i].two);

        return 0;
    }

dealer.cpp

    #include <zmq.hpp>
    #include <cstdio>
    #include <ctime>
    #include <unistd.h>

    int main()
    {
        zmq::context_t context(1);
        zmq::socket_t Device(context, ZMQ_DEALER);

        int iHighWaterMark = 0;

        Device.setsockopt(ZMQ_SNDHWM, &iHighWaterMark, sizeof(int));
        Device.setsockopt(ZMQ_RCVHWM, &iHighWaterMark, sizeof(int));
        Device.setsockopt(ZMQ_IDENTITY, "TEST", 4);

        Device.connect("tcp://0.0.0.0:5555");

        usleep(100000);    // give the connection time to establish

        struct timespec ts_dtime;
        unsigned long sec;

        for (std::size_t i = 0; i < 10000; ++i)
        {
            clock_gettime(CLOCK_REALTIME, &ts_dtime);
            sec = (ts_dtime.tv_sec * 1e9) + ts_dtime.tv_nsec;
            zmq::message_t message(21);
            sprintf((char *)message.data(), "%lu", sec);
            Device.send(message);
            usleep(500);    // pace the sender: one message every 500 us
        }

        return 0;
    }

Update 2:

router.cpp

    #include <zmq.hpp>
    #include <stdio.h>
    #include <stdlib.h>

    int main (int argc, char *argv[])
    {
        const char *bind_to;
        int roundtrip_count;
        size_t message_size;
        int i;

        if (argc != 4) {
            printf ("usage: local_lat <bind-to> <message-size> "
                    "<roundtrip-count>\n");
            return 1;
        }
        bind_to = argv[1];
        message_size = atoi (argv[2]);
        roundtrip_count = atoi (argv[3]);
        (void) message_size;    // the sender fixes the payload size itself

        zmq::context_t ctx(1);
        zmq::socket_t s(ctx, ZMQ_ROUTER);

        zmq::message_t msg, id;

        int iHighWaterMark = 0;
        s.setsockopt(ZMQ_SNDHWM, &iHighWaterMark, sizeof (int));
        s.setsockopt(ZMQ_RCVHWM, &iHighWaterMark, sizeof (int));
        s.bind(bind_to);

        struct timespec ts_dtime;
        unsigned long sec;

        for (i = 0; i != roundtrip_count; i++) {
            // first frame: the DEALER's identity, prepended by the ROUTER socket
            if (!s.recv(&id)) {
                printf ("error in zmq_recvmsg: %s\n", zmq_strerror (errno));
                return -1;
            }

            // second frame: the payload carrying the sender's timestamp
            if (!s.recv(&msg, 0)) {
                printf ("error in zmq_recvmsg: %s\n", zmq_strerror (errno));
                return -1;
            }

            clock_gettime(CLOCK_REALTIME, &ts_dtime);
            sec = (ts_dtime.tv_sec * 1e9) + ts_dtime.tv_nsec;
            printf("%.*s %lu\n", 20, (char *)msg.data(), sec);
        }

        s.close();

        return 0;
    }

dealer.cpp

    #include <zmq.hpp>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    int main (int argc, char *argv[])
    {
        const char *connect_to;
        int roundtrip_count;
        size_t message_size;
        int i;

        struct timespec ts_dtime;
        unsigned long sec;

        if (argc != 4) {
            printf ("usage: remote_lat <connect-to> <message-size> "
                    "<roundtrip-count>\n");
            return 1;
        }
        connect_to = argv[1];
        message_size = atoi (argv[2]);
        roundtrip_count = atoi (argv[3]);

        zmq::context_t ctx(1);
        zmq::socket_t s(ctx, ZMQ_DEALER);

        int iHighWaterMark = 0;
        s.setsockopt(ZMQ_SNDHWM, &iHighWaterMark, sizeof (int));
        s.setsockopt(ZMQ_RCVHWM, &iHighWaterMark, sizeof (int));

        s.connect(connect_to);

        for (i = 0; i != roundtrip_count; i++) {
            zmq::message_t msg(message_size + 20);
            clock_gettime(CLOCK_REALTIME, &ts_dtime);
            sec = (ts_dtime.tv_sec * 1e9) + ts_dtime.tv_nsec;
            sprintf((char *)msg.data(), "%lu", sec);
            if (!s.send(msg)) {
                printf ("error in zmq_sendmsg: %s\n", zmq_strerror (errno));
                return -1;
            }

            sleep(1);    // one message per second
        }

        s.close();

        return 0;
    }

Output:

1562125527489432576 1562125527489773568
1562125528489582848 1562125528489961472
1562125529489740032 1562125529490124032
1562125530489944832 1562125530490288896
1562125531490101760 1562125531490439424
1562125532490261248 1562125532490631680
1562125533490422272 1562125533490798080
1562125534490555648 1562125534490980096
1562125535490745856 1562125535491161856
1562125536490894848 1562125536491245824
1562125537491039232 1562125537491416320
1562125538491229184 1562125538491601152
1562125539491375872 1562125539491764736
1562125540491517184 1562125540491908352
1562125541491657984 1562125541492027392
1562125542491816704 1562125542492193536
1562125543491963136 1562125543492338944
1562125544492103680 1562125544492564992
1562125545492248832 1562125545492675328
1562125546492397312 1562125546492783616
1562125547492543744 1562125547492926720
1562125564495211008 1562125564495629824
1562125565495372032 1562125565495783168
1562125566495515904 1562125566495924224
1562125567495660800 1562125567496006144
1562125568495806464 1562125568496160000
1562125569495896064 1562125569496235520
1562125570496080128 1562125570496547584
1562125571496235008 1562125571496666624
1562125572496391424 1562125572496803584
1562125573496532224 1562125573496935680
1562125574496652800 1562125574497053952
1562125575496843776 1562125575497277184
1562125576496997120 1562125576497417216
1562125577497182208 1562125577497726976
1562125578497336832 1562125578497726464
1562125579497549312 1562125579497928704
1562125580497696512 1562125580498115328
1562125581497847808 1562125581498198528
1562125582497998336 1562125582498340096
1562125583498140160 1562125583498622464
1562125584498295296 1562125584498680832
1562125585498445312 1562125585498842624
1562125586498627328 1562125586499025920

All of them are in the range of 350-450 µs.
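
For reference, the 350-450 µs figure can be recomputed directly from the timestamp pairs above; a minimal sketch (not part of the test itself), assuming the output is fed on stdin as one "sent_ns recv_ns" pair per line:

    // latency_from_log.cpp -- print per-message latency from "sent_ns recv_ns" pairs
    // build: g++ -O2 -o latency_from_log latency_from_log.cpp
    // run:   ./latency_from_log < router_output.txt
    #include <cstdio>

    int main()
    {
        unsigned long sent_ns, recv_ns;
        while (std::scanf("%lu %lu", &sent_ns, &recv_ns) == 2)
            std::printf("%.1f us\n", (recv_ns - sent_ns) / 1000.0);   // ns -> us
        return 0;
    }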

1 Answer


Q1: What am I doing wrong?
I thought ZeroMQ would be at least as fast as TCP.

Code-wise, nothing.

Performance-wise, ZeroMQ is fantastic, and it has many features that raw TCP does not and will not provide out of the box.

Test-setup "Sending single byte..." seems to step right into the left edge of the high-performance / low-latency messaging service:

Let's first understand the latency and where it comes from:

The observed latency figures are the sum of several components:

- resources usage: resource allocations, resource-pool management operations, and data manipulations;
- processing efforts: everything we ask to be done with the data, including the time our task spends waiting in the run queue while the operating system schedules and executes work units that are not part of our test workload, according to the fair-scheduling policy and the actual process-priority settings;
- communication-channel transport delays: the end-to-end transport latency of the wire.
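
One component of that sum which is easy to isolate is the time the caller spends inside the .send() call itself (allocations plus enqueueing into the Context's I/O machinery; the actual TCP write happens later on an I/O thread). A minimal sketch of timing just that call, assuming a ROUTER is already listening on a hypothetical local endpoint:

    // sketch only: isolate the local cost of zmq::socket_t::send() from the wire latency
    // build: g++ -O2 send_cost.cpp -lzmq
    #include <zmq.hpp>
    #include <ctime>
    #include <cstdio>

    static unsigned long now_ns()
    {
        timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);          // monotonic clock: immune to NTP steps
        return ts.tv_sec * 1000000000UL + ts.tv_nsec;
    }

    int main()
    {
        zmq::context_t ctx(1);
        zmq::socket_t s(ctx, ZMQ_DEALER);
        s.connect("tcp://127.0.0.1:5555");            // assumes a ROUTER is listening here

        zmq::message_t msg(21);
        unsigned long t0 = now_ns();
        s.send(msg);                                   // enqueue only: the I/O thread writes to TCP later
        unsigned long t1 = now_ns();
        std::printf("send() call took %lu ns\n", t1 - t0);
        return 0;
    }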

Let's next understand what we are trying to compare it with:

The difference between the Transmission Control Protocol (raw TCP) and the ZeroMQ framework of smart Scalable Formal Communication Archetypes, with its rich set of high-level, distributed behaviours, is a few galaxies wide.

ZeroMQ was designed as a signalling and messaging infrastructure, built from feature-rich behaviours that match together and are often described by a human-like behaviour archetype:

One PUSH-es, any number of joined counterparties PULL.

One REQ-uests, someone from a group on the other end of the line REP-lies.

One agent (potentially one of a larger group of agents) PUB-lishes, and any number of already-subscribed subscribers receive such a SUB-scribed message.
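
As an illustration only (not taken from the question's code), a minimal single-process PUB/SUB sketch in the same cppzmq style, using a hypothetical inproc:// endpoint name:

    // sketch only: one PUB, one SUB, topic filtering done by ZeroMQ itself
    // build: g++ -O2 pubsub_demo.cpp -lzmq
    #include <zmq.hpp>
    #include <cstdio>
    #include <cstring>
    #include <unistd.h>

    int main()
    {
        zmq::context_t ctx(1);

        zmq::socket_t pub(ctx, ZMQ_PUB);
        pub.bind("inproc://demo");                     // inproc requires bind before connect

        zmq::socket_t sub(ctx, ZMQ_SUB);
        sub.connect("inproc://demo");
        sub.setsockopt(ZMQ_SUBSCRIBE, "", 0);          // subscribe to everything

        usleep(100000);                                 // let the subscription propagate

        zmq::message_t out(5);
        memcpy(out.data(), "hello", 5);
        pub.send(out);

        zmq::message_t in;
        sub.recv(&in);                                  // only subscribed topics arrive here
        printf("%.*s\n", (int)in.size(), (char *)in.data());
        return 0;
    }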

For details, one may kindly read a brief overview of the main conceptual differences in the [ ZeroMQ hierarchy in less than a five seconds ] section.

This is nothing the TCP protocol will ever provide on its own.

This is a comfort one happily pays for with a small amount of latency. Negligible? Yes, negligible once compared to the many man-years of ultimate software craftsmanship anyone would have to spend to design another, at least similarly smart, messaging framework able to compete with ZeroMQ.

Q2: Is there any tuning I can do to make ZeroMQ faster?

Maybe yes, maybe not.

Update:
- try avoiding the Identity management (raw TCP has no such thing, so the measured RTTs are otherwise less comparable and less meaningful);
- try avoiding the blocking manner of the zero-HWM configuration (raw TCP has no such thing either);
- try measuring the same over a non-tcp transport (a PAIR/PAIR Formal Scalable Communication Archetype, best over the least complex data-pump, inproc://, or over ipc:// in case your SandBox test bed still needs to keep distributed non-local copies); this isolates the ZeroMQ Context-instance's internal overheads spent in the .send() resp. .recv() methods (a sketch follows this list);
- try allowing for a slight increase in performance by giving the Context instance more I/O threads;
  (other performance-demasking tricks depend on the nature of the real-world usage: robustness to dropped messages, the feasibility of a conflated mode of operation, better buffer alignment with the O/S, zero-copy tricks. All of these are of some interest here, yet they have to keep the smart ZeroMQ infrastructure of distributed behaviours operational, which is a far more complex task than a trivial serial sequence of otherwise blind and isolated tcp-socket byte-level operations. So comparing the times is possible, but comparing an individual dragster-class vehicle with a globally operating infrastructure of distributed behaviour (like Taxify or Uber, named here only to use a trivialised (dis-)similarity of roughly the same order of magnitude) yields numbers about phenomena that do not provide the same comfort, scalability of use-cases, almost linear performance scaling and real-world robustness);
- may add more scheduling determinism by hard-wiring the Context-instance's respective I/O threads onto dedicated CPU core(s), so that the overall I/O performance never gets evicted from the CPU schedule and remains deterministically mapped, or even pre-locked, onto exclusively, administratively dedicated CPU core(s); whether to attempt this ultimate performance hack depends on the level of need and on the administrative policies.
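
A minimal sketch of the inproc:// PAIR/PAIR measurement mentioned above (hypothetical endpoint name; a single process with two threads; the Context's I/O-thread count is shown only because it matters for the tcp:// case, as inproc:// bypasses the I/O threads entirely):

    // sketch only: PAIR/PAIR one-way latency over inproc:// (no identities, no TCP stack)
    // build: g++ -std=c++11 -O2 pair_inproc_lat.cpp -lzmq -pthread
    #include <zmq.hpp>
    #include <cstdio>
    #include <cstring>
    #include <ctime>
    #include <thread>

    static unsigned long now_ns()
    {
        timespec ts;
        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ts.tv_sec * 1000000000UL + ts.tv_nsec;
    }

    int main()
    {
        zmq::context_t ctx(2);                          // 2 I/O threads; unused by inproc://

        zmq::socket_t rx(ctx, ZMQ_PAIR);
        rx.bind("inproc://lat");                        // inproc requires bind before connect

        std::thread sender([&ctx]() {
            zmq::socket_t tx(ctx, ZMQ_PAIR);
            tx.connect("inproc://lat");
            for (int i = 0; i < 10000; ++i) {
                zmq::message_t msg(sizeof(unsigned long));
                unsigned long t = now_ns();
                memcpy(msg.data(), &t, sizeof t);       // payload = send timestamp
                tx.send(msg);
            }
        });

        unsigned long worst = 0;
        for (int i = 0; i < 10000; ++i) {
            zmq::message_t msg;
            rx.recv(&msg);
            unsigned long sent;
            memcpy(&sent, msg.data(), sizeof sent);
            unsigned long lat = now_ns() - sent;        // one-way, same clock, same host
            if (lat > worst) worst = lat;
        }
        std::printf("worst one-way latency: %lu ns\n", worst);

        sender.join();
        return 0;
    }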

For any performance-related tweaking, one would need to post an MCVE plus a fully described benchmark test suite. The ZeroMQ latency-test results report states:

Conclusion

In a controlled environment RDTSC instruction can be used to measure time rapidly. This allows us to measure latency/density for individual messages instead of computing averages for the whole test.

We've used this approach to get performance figures of ØMQ lightweight messaging kernel (version 0.1) and we've got following results:

-- In low-volume case the latency is almost the same as the latency of the underlying transport (TCP): 50 microseconds.
-- The average jitter of latency is minimal: 0.225 microsecond.
-- The throughput on sender side is 4.8 million messages a second.
-- The density on sender side is mostly about 0.140 microsecond, however, with occasional peaks the mean density is 0.208 microsecond.
-- The throughput on receiver side is 2.7 million messages a second.
-- The density on receiver side is mostly about 0.3 microsecond. Approximately each 100 messages new batch is received causing density to grow to 3-6 microseconds. The mean density is 0.367 microsecond.
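
For reference, per-message timestamps of that granularity can be taken with the TSC; a minimal sketch, assuming an x86-64 CPU with an invariant TSC (the ticks-to-nanoseconds conversion factor is machine-specific and left out here):

    // sketch only: raw TSC reads for sub-microsecond interval measurement
    // build: g++ -O2 tsc_demo.cpp
    #include <x86intrin.h>
    #include <cstdio>

    int main()
    {
        unsigned long long t0 = __rdtsc();      // read the time-stamp counter
        // ... the operation being measured would go here ...
        unsigned long long t1 = __rdtsc();
        // ticks per nanosecond must be calibrated per machine (e.g. against clock_gettime)
        std::printf("elapsed: %llu ticks\n", t1 - t0);
        return 0;
    }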

If in ultimate need of latency shaving, one may try nanomsg, ZeroMQ's younger sister, originated by Martin SUSTRIK, the co-father of ZeroMQ (now maintained, afaik, by someone else).
