This is a follow-up to "Why is TCP write latency worse when work is interleaved?"
In that question we found that when CPU intensive work is inserted between write
calls to a TCP socket, the write
latency increases by a factor of 5+. This is because in the absence of the CPU intensive work, the outgoing bytes get batched before being committed as TCP packets to the device. The CPU intensive work allows the send buffer to flush, so that each new write
triggers a full packet construction which involves overhead. (As a side question, what exactly does this packet construction entail? The TCP header is <20 bytes, so I'm not sure where most of the overhead actually comes from.)
In light of this, I am looking for a way to "prepare" the next packet. This would be useful in a latency sensitive environment when you know that at some point in the future you'll need to send a packet, so you want to get done with the packet construction overhead early.
My first idea was to set the low water mark SO_SNDLOWAT
to 2, and then prepare the packet without sending it with a write
call of only one byte. In theory, SO_SNDLOWAT
should prevent this packet from actually hitting the device, so that when I measure the latency of the subsequent write
carrying actual data it should be fast. But this doesn't reduce latency at all (I'm somewhat skeptical that SO_SNDLOWAT
is doing what I expect).
Here's my server code:
// Server side C/C++ program to demonstrate Socket programming
// #include <iostream>
#include <boost/timer.hpp>
#include <ctime>
#include <sched.h>
#include <unistd.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
// Function to count clock cycles
// Reads the CPU's timestamp counter. The leading CPUID is a serializing
// instruction, so earlier in-flight instructions must retire before the
// RDTSC executes (CPUID clobbers eax/ebx/ecx/edx, hence the clobber list
// and the xor that zeroes its leaf argument). RDTSC itself returns the
// 64-bit counter split across edx:eax, which we stitch back together.
// NOTE(review): the value is raw cycles — not adjusted for frequency
// scaling or core migration; pin the thread for stable measurements.
__inline__ uint64_t rdtsc(void)
{
uint32_t lo, hi;
__asm__ __volatile__ (
"xorl %%eax,%%eax \n cpuid"
::: "%rax", "%rbx", "%rcx", "%rdx");
__asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
return (uint64_t)hi << 32 | lo;
}
// Set up some blocking work: trial-division primality test.
// The O(n) divisor bound (rather than sqrt(n)) is intentional — this
// function exists to burn CPU time between socket writes.
bool isPrime(int n) {
    if (n < 2) return false;
    int divisor = 2;
    while (divisor < n) {
        if (n % divisor == 0) return false;
        ++divisor;
    }
    return true;
}
// Compute the nth prime (1-based: getPrime(1) == 2). Takes ~1 sec for
// n = 10,000 because of the deliberately slow isPrime above.
int getPrime(int n) {
    int found = 0;
    for (int candidate = 0; ; ++candidate) {
        if (!isPrime(candidate)) {
            continue;
        }
        if (++found >= n) {
            return candidate;
        }
    }
}
// Server: accepts one client, then alternates timed 1-byte payload
// writes with CPU-burning prime computations and 'X' "prep" writes.
int main(int argc, char const *argv[])
{
    int server_fd, new_socket;
    struct sockaddr_in address;
    int opt = 1;
    // Low water mark for socket
    int lowat = 2;
    int lowat2 = 0;
    // BUG FIX: optlen must be initialized to the size of the result
    // buffer before calling getsockopt(); it is an in/out parameter,
    // and passing it uninitialized is undefined behavior.
    socklen_t optlen = sizeof(lowat2);
    int addrlen = sizeof(address);

    // Create socket for TCP server
    server_fd = socket(AF_INET, SOCK_STREAM, 0);
    if (server_fd < 0) {
        perror("socket");
        return 1;
    }
    // Allow quick restarts while the old binding sits in TIME_WAIT.
    setsockopt(server_fd, SOL_SOCKET, SO_REUSEADDR, &opt, sizeof(opt));
    // BUG SURFACE: Linux does not support *setting* SO_SNDLOWAT and
    // fails with ENOPROTOOPT (see socket(7)). Checking the return value
    // makes that visible — it is likely why the experiment shows no effect.
    if (setsockopt(server_fd, SOL_SOCKET, SO_SNDLOWAT, &lowat, sizeof(lowat)) < 0) {
        perror("setsockopt(SO_SNDLOWAT, listener)");
    }
    address.sin_family = AF_INET;
    address.sin_addr.s_addr = INADDR_ANY;
    address.sin_port = htons(8080);
    if (bind(server_fd, (struct sockaddr *)&address, sizeof(address)) < 0) {
        perror("bind");
        return 1;
    }
    if (listen(server_fd, 3) < 0) {
        perror("listen");
        return 1;
    }
    // Accept one client connection
    new_socket = accept(server_fd, (struct sockaddr *)&address, (socklen_t*)&addrlen);
    if (new_socket < 0) {
        perror("accept");
        return 1;
    }
    if (setsockopt(new_socket, SOL_SOCKET, SO_SNDLOWAT, &lowat, sizeof(lowat)) < 0) {
        perror("setsockopt(SO_SNDLOWAT, accepted)");
    }
    // Check that SO_SNDLOWAT was updated
    if (getsockopt(new_socket, SOL_SOCKET, SO_SNDLOWAT, &lowat2, &optlen) < 0) {
        perror("getsockopt(SO_SNDLOWAT)");
    }
    printf("New lowat value: %d\n", lowat2);

    char sendBuffer[1] = {0};
    int primes[20] = {0};
    int N = 10;
    for (int i = 0; i < N; i++) {
        sendBuffer[0] = 97 + i;  // 'a' + i
        boost::timer t;
        auto start = rdtsc();
        write(new_socket, sendBuffer, 1);
        auto end = rdtsc();
        // BUG FIX: %llu expects unsigned long long; uint64_t may be
        // unsigned long on LP64, so cast explicitly to match the format.
        printf("%d mics (%llu cycles) to write\n", int(1e6 * t.elapsed()),
               (unsigned long long)(end - start));
        // Inserting blocking work here slows down the `write` calls by a
        // factor of 5.
        primes[i] = getPrime(10000 + i);
        // Attempt to prep the next packet without sending it, by writing 'X'.
        sendBuffer[0] = 88;
        write(new_socket, sendBuffer, 1);
        primes[i] = getPrime(1000 + i);
    }
    // Prevent the compiler from optimizing away the prime computation.
    printf("prime: %d\n", primes[8]);
    // Release the descriptors we opened (RAII is unavailable for raw fds here).
    close(new_socket);
    close(server_fd);
    return 0;
}
And client code:
// Client side C/C++ program to demonstrate Socket programming
// #include <iostream>
#include <unistd.h>
#include <stdio.h>
#include <sys/socket.h>
#include <stdlib.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <string.h>
#include <unistd.h>
// Client: connects to the server on port 8080 and prints the received
// bytes grouped by the read() call that returned them (groups separated
// by spaces), so packet boundaries are visible.
int main(int argc, char const *argv[])
{
    int sock;
    struct sockaddr_in address;
    int opt = 1;
    int addrlen = sizeof(address);
    // Receive buffer for the single-byte payloads the server sends.
    unsigned char recv_buffer[1024] = {0};

    // Create socket for TCP client
    sock = socket(AF_INET, SOCK_STREAM, 0);
    if (sock < 0) {
        perror("socket");
        return 1;
    }
    // Set TCP_NODELAY so that writes won't be batched.
    // BUG FIX: TCP_NODELAY is an IPPROTO_TCP-level option, not
    // SOL_SOCKET; the original call failed (ENOPROTOOPT) and left
    // Nagle batching enabled.
    if (setsockopt(sock, IPPROTO_TCP, TCP_NODELAY, &opt, sizeof(opt)) < 0) {
        perror("setsockopt(TCP_NODELAY)");
    }
    address.sin_family = AF_INET;
    // BUG FIX: INADDR_ANY (0.0.0.0) is a wildcard *bind* address; as a
    // connect destination it only happens to work on Linux. Target the
    // loopback interface explicitly.
    address.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
    address.sin_port = htons(8080);
    // Connect to the server
    if (connect(sock, (struct sockaddr *)&address, (socklen_t)addrlen) != 0) {
        // Replaces an uncaught `throw` of a char* (which would just
        // call std::terminate) with a diagnostic and clean exit.
        perror("connect");
        return 1;
    }
    int N = 10;
    int loc[N + 1];      // cumulative byte count after each read()
    int nloc, curloc;
    for (nloc = curloc = 0; curloc < N; nloc++) {
        int n = read(sock, recv_buffer + curloc, sizeof recv_buffer - curloc);
        if (n <= 0) {
            break;
        }
        curloc += n;
        loc[nloc] = curloc;
        // usleep(100000);
    }
    // Print each read()'s bytes as one space-separated group.
    int last = 0;
    for (int i = 0; i < nloc; i++) {
        printf("%*.*s ", loc[i] - last, loc[i] - last, recv_buffer + last);
        last = loc[i];
    }
    printf("\n");
    close(sock);
    return 0;
}
Output:
New lowat value: 2
14 mics (31252 cycles) to write
25 mics (49088 cycles) to write
26 mics (55558 cycles) to write
26 mics (53618 cycles) to write
26 mics (54468 cycles) to write
28 mics (58382 cycles) to write
Removing the prime computations altogether reduces the write
latency to ~5,000 cycles (a factor of 10 or so faster).
I'm wondering if I have something wrong with my SO_SNDLOWAT
implementation, or alternately if there's a cleaner way to prep the packet.
The client's output (where spaces denote separate read
calls) suggests that SO_SNDLOWAT
is failing: a X b X c X d X e X
.
Update: Per Gil's suggestion, I tried using the MSG_MORE
flag when I send the X
packets as a signal to hold off on the actual device write. This seems to work (after making the second blocking work take <200ms) in that the client's output becomes a Xb Xc Xd Xe Xf
. But counterintuitively, the payload write
s actually become slower (100,000 cycles vs 50,000 cycles without MSG_MORE
vs 5,000 cycles without blocking work). MSG_MORE
code:
// Attempt to prep the next packet without sending it, by writing 'X'.
// MSG_MORE tells the kernel the caller has more data coming, so it
// holds this byte in the send buffer instead of emitting a packet
// immediately (per send(2), the per-call analogue of TCP_CORK); the
// 'X' then rides along with the next payload write.
sendBuffer[0] = 88;
send(new_socket, sendBuffer, 1, MSG_MORE);
primes[i] = getPrime(1000 + i + 1);