I have to send a large amount of data (enough to saturate a gigabit link) out over a UDP socket from an embedded system running (peta)Linux to arbitrary devices. Performance in terms of speed (especially time spent in socket system calls and copying data) and efficiency (CPU time/percentage use) is critical here, so I have been trying different methods of achieving this, each of which I would expect to be faster than the last according to my research. When I benchmark them, the differences between methods seem to be almost negligible for the most part, so I wonder if I am missing anything obvious (like caching playing a part in my benchmark results?) or if I am expecting too much improvement between methods.
Method 1: Copy header and data into a buffer, send the buffer with sendto().
I had expected this to be the slowest by a margin because of the overhead involved with copying all of the data before each send.
Method 2: Gather buffers with sendmsg() to avoid copying
This is where I expected to see improvements start as copying (in my userspace application at least) had been basically eliminated.
Method 3: Use sendmmsg to avoid overhead of many calls to sendmsg()
The man page for this call implies that cutting back on system calls ((~1500 * ~5 * 30)/sec for previous 2 methods) can have performance benefits.
Method 4: Any of the previous + using connect() on the socket beforehand
I had seen some suggestions (another) that this could improve performance, again prefaced with a 'maybe':
Connected sockets can save route lookup on each packet by employing a clever optimization — Linux can save a route lookup result on a connection struct. Depending on the specifics of the setup this might save some CPU cycles.
Indeed, this seemed to yield a small benefit on my laptop when writing the MVE, EXCLUSIVELY when used in conjunction with sendmmsg() (25ms average to send a 3.5MB 'image' vs ~28.15ms avg ± 0.1ms for all previous methods, including connect() with sendto() or sendmsg()). This improvement does not seem to have transferred over to the embedded system, however (perhaps this connection struct is not implemented in that version of the kernel? Something I will look into after posting this, although kernel socket code is not something I pull apart often).
I am hoping anyone can do any of the following:
- Answer my main question as to why I do not see much improvement like I would expect (from eliminating userspace copying and system call/kernel overhead)
- Run the MVE a few ways and let me know if they get similar results/spot any issues with it
- Maybe even point me to a more efficient method compatible with my MVE if one exists, as I think connect() with sendmmsg() is my best attempt and I am still not super satisfied. I have seen the MSG_ZEROCOPY flag and options regarding blocking, but am not sure if they will make other parts of this more difficult, and do not want to spend time implementing and testing them before understanding my current issue.
What follows is a 'table' of results from my benchmarking as well as the MVE
sysCall | connected | Time (µs)
sendto() | no | 28119
sendmsg() | no | 28340
sendmmsg() | no | 28367
sendto() | yes | 28109
sendmsg() | yes | 28341
sendmmsg() | yes | 25021
C++ (I apologize in advance, hacked together from a much larger experiment, but should compile and run with some quick tweaks to inet addrs) I use -O3:
#include <netinet/in.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

#include <cerrno>
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <iostream>
#include <random>
#include <string>
using namespace std;
// Refills the simulated frames with pseudo-random bytes; defined after main().
void randomize_sim_buf();

// Staging buffer the sendto() path assembles each datagram (header + payload) in.
uint8_t send_buf[9000];

// Ten simulated frames of 1920*1200*3/2 bytes each, used as the payload source.
uint8_t sim_buf[10][1920*1200*3/2];

// Fixed 20-byte header placed in front of every payload chunk.
uint8_t sim_header[20] = {
     0,  1,  2,  3,  4,  5,  6,  7,  8,  9,
    10, 11, 12, 13, 14, 15, 16, 17, 18, 19
};
namespace {

// Which socket call the benchmark exercises for every datagram.
enum class SendMode { kSendTo, kSendMsg, kSendMMsg };

// Hands the first `count` entries of `batch` to sendmmsg(), re-issuing the
// call when the kernel accepts only part of the batch. A message the kernel
// rejects is skipped so one bad datagram cannot stall the rest.
// Returns the number of messages that could not be sent.
unsigned int flush_batch(int fd, mmsghdr *batch, unsigned int count)
{
    unsigned int failed = 0;
    unsigned int done = 0;
    while (done < count) {
        const int rc = sendmmsg(fd, batch + done, count - done, 0);
        if (rc > 0) {
            done += static_cast<unsigned int>(rc);
            continue;
        }
        if (rc < 0 && errno == EINTR)
            continue;  // interrupted before anything was sent: retry as-is
        ++failed;
        ++done;
    }
    return failed;
}

}  // namespace

// Benchmark driver: sends 1000 simulated frames over UDP, split into
// header + payload datagrams, using the send call chosen on stdin
// (sendto / sendmsg / sendmmsg), optionally on a connect()ed socket, and
// prints the average time in microseconds spent sending one frame.
//
// Returns 0 on success, -1 on invalid input or if the socket cannot be created.
int main(int argc, char const *argv[])
{
    (void)argc;
    (void)argv;

    constexpr std::size_t kHeaderBytes = sizeof(sim_header);
    constexpr std::size_t kFrameBytes = sizeof(sim_buf[0]);
    constexpr std::size_t kFrameSlots = sizeof(sim_buf) / sizeof(sim_buf[0]);
    constexpr std::size_t kMaxUdpPayload = 65507;  // 65535 - IPv4 header (20) - UDP header (8)
    constexpr unsigned int kBatchMax = 1024;
    constexpr int kFrames = 1000;

    const int socket_fd = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP);
    if (socket_fd < 0) {
        const int err = errno;
        std::cerr << "socket() failed: " << strerror(err) << std::endl;
        return -1;
    }

    // Value-initialise so sin_zero padding is not left as stack garbage.
    sockaddr_in socket_bind{};
    sockaddr_in socket_destination{};
    socket_bind.sin_family = socket_destination.sin_family = AF_INET;
    socket_bind.sin_addr.s_addr = htonl(0x0A42AB01);
    socket_destination.sin_addr.s_addr = htonl(0x0A42AB15);
    socket_bind.sin_port = htons(0xF3D4);
    socket_destination.sin_port = htons(0xF4D4);

    // A failed bind is reported but not fatal: sending is still attempted on
    // the unbound socket, as the original code did.
    if (bind(socket_fd, reinterpret_cast<sockaddr *>(&socket_bind), sizeof(socket_bind)) != 0) {
        const int err = errno;
        std::cerr << "bind() failed: " << strerror(err) << "; continuing unbound\n";
    }

    bool connected = false;
    std::cout << "Connect socket?\n";
    std::cin >> connected;
    if (connected) {
        const int rc = connect(socket_fd, reinterpret_cast<sockaddr *>(&socket_destination),
                               sizeof(socket_destination));
        const int err = errno;
        std::cout << "Socket connect() returned " << rc << std::endl;
        if (rc != 0) {
            std::cerr << "connect() failed: " << strerror(err)
                      << "; falling back to unconnected sends\n";
            connected = false;
        }
    }

    // On a connected socket no per-call destination is passed.
    // NOTE(review): Linux's UDP send path appears to use the socket's cached
    // destination/route only when no address is supplied, so passing one would
    // hide the effect connect() is meant to measure - confirm against the
    // target kernel.
    sockaddr *const dest =
        connected ? nullptr : reinterpret_cast<sockaddr *>(&socket_destination);
    const socklen_t dest_len =
        connected ? socklen_t{0} : static_cast<socklen_t>(sizeof(socket_destination));

    std::string send_type;
    int packet_size = 1500;
    std::cout << "{sendto|sendmsg|sendmmsg} [packet size]?\n";
    std::cin >> send_type;
    // The packet size is optional: a failed extraction would store 0 in the
    // target, so read into a scratch variable and keep the default on failure.
    int requested_size = 0;
    if (std::cin >> requested_size)
        packet_size = requested_size;

    // Resolve the mode once so no string comparison runs inside the timed loop.
    SendMode mode;
    if (send_type == "sendto") {
        mode = SendMode::kSendTo;
    } else if (send_type == "sendmsg") {
        mode = SendMode::kSendMsg;
    } else if (send_type == "sendmmsg") {
        mode = SendMode::kSendMMsg;
    } else {
        std::cout << "Invalid send type supplied\n";
        close(socket_fd);
        return -1;
    }

    // A non-positive size would never advance the chunk loop; an oversized one
    // would overflow send_buf (sendto) or exceed what a UDP datagram can carry.
    const std::size_t max_payload =
        (mode == SendMode::kSendTo ? sizeof(send_buf) : kMaxUdpPayload) - kHeaderBytes;
    if (packet_size <= 0 || static_cast<std::size_t>(packet_size) > max_payload) {
        std::cerr << "Packet size must be between 1 and " << max_payload << " bytes\n";
        close(socket_fd);
        return -1;
    }
    const std::size_t chunk_max = static_cast<std::size_t>(packet_size);

    // sendmsg(): only the payload iovec changes per datagram.
    iovec iov[2] = {};
    iov[0].iov_base = sim_header;
    iov[0].iov_len = kHeaderBytes;
    msghdr message_hdr{};
    message_hdr.msg_name = dest;
    message_hdr.msg_namelen = dest_len;
    message_hdr.msg_iov = iov;
    message_hdr.msg_iovlen = 2;

    // sendmmsg(): pre-wire every batch slot; only the payload iovec changes.
    iovec iovs[kBatchMax][2] = {};
    mmsghdr mmsg[kBatchMax] = {};
    for (unsigned int k = 0; k < kBatchMax; ++k) {
        iovs[k][0].iov_base = sim_header;
        iovs[k][0].iov_len = kHeaderBytes;
        mmsg[k].msg_hdr.msg_name = dest;
        mmsg[k].msg_hdr.msg_namelen = dest_len;
        mmsg[k].msg_hdr.msg_iov = iovs[k];
        mmsg[k].msg_hdr.msg_iovlen = 2;
    }

    uint64_t duration_cnt = 0;
    unsigned long long failed_sends = 0;
    for (int i = 0; i < kFrames; i++) {
        uint8_t *const frame = sim_buf[i % kFrameSlots];
        unsigned int msg_idx = 0;

        const auto start = std::chrono::steady_clock::now();
        for (std::size_t offset = 0; offset < kFrameBytes; offset += chunk_max) {
            const std::size_t remaining = kFrameBytes - offset;
            const std::size_t datalen = remaining < chunk_max ? remaining : chunk_max;

            switch (mode) {
            case SendMode::kSendTo:
                // The copies are deliberately inside the timed region: they
                // are the cost this method is meant to expose.
                memcpy(send_buf, sim_header, kHeaderBytes);
                memcpy(send_buf + kHeaderBytes, frame + offset, datalen);
                // Length covers header AND payload (was `datalen`, which cut
                // the last kHeaderBytes of payload off every datagram).
                if (sendto(socket_fd, send_buf, kHeaderBytes + datalen, 0, dest, dest_len) < 0)
                    ++failed_sends;
                break;
            case SendMode::kSendMsg:
                iov[1].iov_base = frame + offset;
                iov[1].iov_len = datalen;
                if (sendmsg(socket_fd, &message_hdr, 0) < 0)
                    ++failed_sends;
                break;
            case SendMode::kSendMMsg:
                iovs[msg_idx][1].iov_base = frame + offset;
                iovs[msg_idx][1].iov_len = datalen;
                if (++msg_idx == kBatchMax) {
                    failed_sends += flush_batch(socket_fd, mmsg, msg_idx);
                    msg_idx = 0;
                }
                break;
            }
        }
        // Send whatever is left in a partially filled batch.
        if (msg_idx > 0)
            failed_sends += flush_batch(socket_fd, mmsg, msg_idx);
        const auto stop = std::chrono::steady_clock::now();

        duration_cnt += std::chrono::duration_cast<std::chrono::microseconds>(stop - start).count();
        randomize_sim_buf();
        usleep(15000);
    }

    std::cout << "Average duration to send a buffer: " << (duration_cnt / kFrames) << std::endl;
    if (failed_sends > 0)
        std::cerr << failed_sends << " datagram(s) could not be sent\n";

    close(socket_fd);
    return 0;
}
// Overwrites every byte of every simulated frame in sim_buf with a fresh
// pseudo-random value from rand(), in frame order then byte order.
// Values fall in 0..254 because of the modulo by 255.
void randomize_sim_buf(){
    for (auto &frame : sim_buf){
        for (uint8_t &byte : frame){
            byte = rand() % 255;
        }
    }
}