NVMe SSD read speed drops after multiple GBs read

Question

I have a Seagate FireCuda 530 4TB SSD (M.2 NVMe), whose specs say its sequential read is up to 7.3 GB/s, and its IOPS is 1M. I'm trying to write a program that achieves anything close to the claimed speed. The OS is Ubuntu 22.04, AMD CPU (Ryzen Threadripper 3990X), Gigabyte TRX40 Designare motherboard, 256GB of RAM DDR4 2667 MHz.

Here's the program I have so far (I also have an mmap()-based solution, but even in theory it shouldn't be faster - see https://github.com/srogatch/nvme-max-read for the mmap-based version):

#include <sys/mman.h>
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <sys/uio.h>

#include <string>
#include <iostream>
#include <filesystem>
#include <memory>
#include <cstring>
#include <chrono>
#include <vector>
#include <thread>
#include <cstdint>
#include <sstream>
namespace fs = std::filesystem;

// Directory scanned by main() for model files to read.
const std::string gc_src_dir{"/scratch/LLMs/models/bloom"};
// Alignment granularity (in bytes) used for buffers and read sizes.
constexpr size_t gc_page_size{4096};
// Low-bit mask for gc_page_size; `x & ~gc_page_mask` rounds x down to a page boundary.
constexpr uintptr_t gc_page_mask{gc_page_size - 1};
// Number of OpenMP threads issuing reads concurrently.
constexpr size_t gc_n_workers{64};

// Returns true when `text` finishes with `suffix`.
// An empty suffix matches every string, including the empty one.
bool EndsWith(const std::string& text, const std::string& suffix) {
    const size_t suffix_len = suffix.size();
    if(suffix_len > text.size()) {
        // The suffix cannot fit inside a shorter string.
        return false;
    }
    // Compare only the trailing `suffix_len` characters of `text`.
    const size_t tail_start = text.size() - suffix_len;
    return text.compare(tail_start, suffix_len, suffix) == 0;
}

// Rounds `ptr` up to the nearest multiple of gc_page_size.
// A pointer that is already aligned is returned unchanged.
const uint8_t* AlignPageUp(const void* ptr) {
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    // Adding the mask and then clearing the low bits rounds upwards.
    const uintptr_t rounded = (address + gc_page_mask) & ~gc_page_mask;
    return reinterpret_cast<const uint8_t*>(rounded);
}

// Mutable-pointer overload: delegates to the const overload of AlignPageUp
// and strips the constness again, since the caller handed in writable memory.
uint8_t* AlignPageUp(void* ptr) {
    const void* const read_only_view = ptr;
    const uint8_t* const aligned = AlignPageUp(read_only_view);
    return const_cast<uint8_t*>(aligned);
}

// Rounds `ptr` down to the nearest multiple of gc_page_size.
// A pointer that is already aligned is returned unchanged.
const uint8_t* AlignPageDown(const void* ptr) {
    const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
    // Clearing the low bits drops the offset within the page.
    const uintptr_t floored = address & ~gc_page_mask;
    return reinterpret_cast<const uint8_t*>(floored);
}

// Reads `n_bytes` from the start of `fd` into a temporary page-aligned buffer
// using vectored reads, then frees the buffer (the data itself is discarded).
//
// The file is split into up to gc_n_workers * 1024 page-multiple buffers; each
// OpenMP worker issues a single preadv2() covering its own run of consecutive
// buffers at the matching file offset.
//
// fd       Open file descriptor to read from.
// n_bytes  Number of bytes to read; 0 is a no-op.
//
// Errors (allocation failure, read failure, short read) are reported on stderr
// and do not abort the program.
void Scattered(const int fd, const size_t n_bytes) {
    // Nothing to read. Also avoids dividing by a zero per-buffer size below.
    if(n_bytes == 0) {
        return;
    }
    // See https://stackoverflow.com/questions/27271801/c-the-permitted-maximum-of-the-iovcnt-argument-in-writev
    // Number of iovec entries handed to one preadv2() call.
    constexpr size_t buffers_per_worker = 1024;
    constexpr size_t n_buffers = gc_n_workers * buffers_per_worker;
    // File size rounded up to a whole page, so the last buffer keeps a page-multiple length.
    const size_t even_page_bytes = (n_bytes + gc_page_mask) & (~gc_page_mask);
    size_t bytes_per_buffer = (n_bytes + n_buffers - 1) / n_buffers;
    bytes_per_buffer = (bytes_per_buffer + gc_page_mask) & (~gc_page_mask);
    const size_t used_buffers = (n_bytes + bytes_per_buffer - 1) / bytes_per_buffer;
    // Over-allocate by gc_page_mask so the start can be rounded up to a page boundary.
    void* raw_storage = malloc(bytes_per_buffer * used_buffers + gc_page_mask);
    if(raw_storage == nullptr) {
        perror("Failed to allocate read buffer");
        return;
    }
    uint8_t* storage = AlignPageUp(raw_storage);
    // Heap-allocated descriptor table; only the entries actually used are created.
    std::vector<iovec> buffers(used_buffers);
    iovec* const iov = buffers.data();

#pragma omp parallel for
    for(size_t i=0; i<used_buffers; i++) {
        const size_t first_byte = bytes_per_buffer * i;
        const size_t limit_byte = std::min(even_page_bytes, bytes_per_buffer * (i+1));
        iov[i].iov_len = limit_byte - first_byte;
        iov[i].iov_base = storage + first_byte;
    }

#pragma omp parallel for num_threads(gc_n_workers)
    for(size_t i_worker = 0; i_worker < gc_n_workers; i_worker++) {
        const size_t first_buffer = i_worker * buffers_per_worker;
        const size_t limit_buffer = std::min((i_worker+1) * buffers_per_worker, used_buffers);
        if(first_buffer >= limit_buffer) {
            continue;
        }
        // Each worker must start at the file offset of its first buffer;
        // otherwise every worker would re-read the beginning of the file.
        const size_t first_byte = first_buffer * bytes_per_buffer;
        // Bytes of real file content this worker is responsible for.
        const size_t expected_bytes = std::min(n_bytes, limit_buffer * bytes_per_buffer) - first_byte;
        const ssize_t n_read = preadv2(fd, iov + first_buffer,
            static_cast<int>(limit_buffer - first_buffer),
            static_cast<off_t>(first_byte), RWF_HIPRI);
        if(n_read == -1) {
            perror("Failed to read file");
        } else if(static_cast<size_t>(n_read) < expected_bytes) {
            // A short read means less data was transferred than the caller will account for.
            std::cerr << "Short read at offset " << first_byte << ": got " << n_read
                << " of " << expected_bytes << " bytes\n";
        }
    }
    free(raw_storage);
}

// Reads `n_bytes` from the start of `fd` into a temporary page-aligned buffer
// with one large pread() per OpenMP worker, then frees the buffer (the data
// itself is discarded).
//
// fd       Open file descriptor to read from.
// n_bytes  Number of bytes to read.
//
// Allocation and read failures are reported on stderr and do not abort the program.
void MultiLargeRead(const int fd, const size_t n_bytes) {
    // Extra gc_page_size covers the last read being rounded up to a whole page;
    // extra gc_page_mask covers aligning the start of the buffer.
    void* raw_storage = malloc(n_bytes + gc_page_size + gc_page_mask);
    if(raw_storage == nullptr) {
        perror("Failed to allocate read buffer");
        return;
    }
    uint8_t *storage = AlignPageUp(raw_storage);
    // Per-worker slice, rounded up so every slice starts on a page boundary.
    size_t bytes_per_worker = (n_bytes + gc_n_workers - 1) / gc_n_workers;
    bytes_per_worker = (bytes_per_worker + gc_page_mask) & (~gc_page_mask);
#pragma omp parallel for num_threads(gc_n_workers)
    for(size_t i=0; i<gc_n_workers; i++) {
        const size_t first_byte = i * bytes_per_worker;
        const size_t limit_byte = std::min(n_bytes, (i+1)*bytes_per_worker);
        if(first_byte >= limit_byte) {
            // Trailing workers have nothing to do when the file is small.
            continue;
        }
        // The requested length is rounded up to a page multiple.
        const ssize_t n_read = pread(fd, storage + first_byte,
            (limit_byte - first_byte + gc_page_mask) & (~gc_page_mask),
            first_byte);
        if(n_read == -1) {
            perror("Failed to read file");
        }
    }
    free(raw_storage);
}

// Benchmark driver: for every *.safetensors / *.bin file in gc_src_dir, opens it
// with O_DIRECT, times one Scattered() pass over the whole file and prints the
// resulting throughput. Files that cannot be opened or stat'ed are skipped.
int main() {
    for (const auto & entry : fs::directory_iterator(gc_src_dir)) {
        if(!EndsWith(entry.path(), ".safetensors") && !EndsWith(entry.path(), ".bin")) {
            continue;
        }
        std::cout << entry.path() << std::endl;

        const int fd = open(entry.path().c_str(), O_RDONLY | O_LARGEFILE | O_DIRECT);
        if(fd == -1) {
            perror("Failed to open");
            continue;
        }
        struct stat file_stat;
        if (fstat(fd, &file_stat) == -1) {
            perror("Failed to stat");
            // Release the descriptor before skipping the file so it is not leaked.
            if(close(fd) == -1) {
                perror("Failed to close");
            }
            continue;
        }
        //posix_fadvise(fd, 0, file_stat.st_size, POSIX_FADV_RANDOM);

        // Time only the read itself, not open/stat/close.
        std::chrono::steady_clock::time_point tmLast = std::chrono::steady_clock::now();
        Scattered(fd, file_stat.st_size);
        std::chrono::steady_clock::time_point tmNow = std::chrono::steady_clock::now();
        const double nSec = std::chrono::duration_cast<std::chrono::nanoseconds>(tmNow - tmLast).count() / 1e9;
        // Decimal gigabytes (1e9 bytes) per second.
        const double GBperSec = (file_stat.st_size / nSec) / 1e9;
        std::cout << file_stat.st_size << " bytes in " << nSec << " seconds: "
            << GBperSec << " billion bytes per second." << std::endl;

        // Ask the kernel to drop any cached pages of this file before moving on.
        posix_fadvise(fd, 0, file_stat.st_size, POSIX_FADV_DONTNEED);
        if(close(fd) == -1) {
            perror("Failed to close");
        }
    }
    return 0;
}

/scratch/LLMs/models/bloom contains the Bloom large language model downloaded from https://huggingface.co/bigscience/bloom/tree/main .

I run the above program with the following commands:

g++ -fopenmp -O3 nvme-read-fileio.cpp -o nvme-read-fileio
echo 3 | sudo tee /proc/sys/vm/drop_caches
./nvme-read-fileio

The program starts with reading at about 4.5 GB/s, but then the performance drops down to 700-800 MB/s. The temperature of the SSD doesn't go above 74 degrees Celsius. I thought the SLC cache only plays a role when writing to the SSD. But apparently there is some caching (not in the OS, as the OS cache stays low after posix_fadvise()).

Can someone explain what's going on and whether these are the expected numbers for sustained read of an NVMe SSD?

The performance log for my SSD is:

./run-fileio.sh 
[sudo] password for serge: 
3
"/scratch/LLMs/models/bloom/model_00046-of-00072.safetensors"
4932875563 bytes in 1.06306 seconds: 4.64028 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00050-of-00072.bin"
4932877665 bytes in 1.05543 seconds: 4.67381 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00012-of-00072.safetensors"
4932875573 bytes in 1.09619 seconds: 4.50001 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00020-of-00072.safetensors"
4932875563 bytes in 1.08084 seconds: 4.56391 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00066-of-00072.safetensors"
4932875565 bytes in 1.11794 seconds: 4.41245 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00072-of-00072.safetensors"
57530 bytes in 0.0143936 seconds: 0.00399693 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00036-of-00072.bin"
4932877665 bytes in 1.07298 seconds: 4.59736 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00055-of-00072.safetensors"
4932875563 bytes in 1.0579 seconds: 4.66288 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00002-of-00072.safetensors"
4932875549 bytes in 1.08532 seconds: 4.54509 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00041-of-00072.bin"
4932877665 bytes in 1.07056 seconds: 4.60776 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00017-of-00072.safetensors"
4932875563 bytes in 1.11623 seconds: 4.41921 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00069-of-00072.safetensors"
4932875573 bytes in 1.24681 seconds: 3.95639 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00037-of-00072.safetensors"
4932875573 bytes in 1.5898 seconds: 3.10283 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00053-of-00072.bin"
4932877665 bytes in 1.08551 seconds: 4.54429 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00071-of-00072.bin"
4932877665 bytes in 2.83951 seconds: 1.73723 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00040-of-00072.bin"
4932877665 bytes in 1.09829 seconds: 4.4914 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00055-of-00072.bin"
4932877665 bytes in 1.13006 seconds: 4.36515 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00036-of-00072.safetensors"
4932875563 bytes in 1.11194 seconds: 4.43627 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00064-of-00072.bin"
4932877665 bytes in 2.82379 seconds: 1.7469 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00071-of-00072.safetensors"
4932875557 bytes in 1.09051 seconds: 4.52346 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00023-of-00072.bin"
4932877665 bytes in 2.83307 seconds: 1.74117 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00018-of-00072.safetensors"
4932875573 bytes in 1.10318 seconds: 4.47151 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00033-of-00072.safetensors"
4932875573 bytes in 2.85241 seconds: 1.72937 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00059-of-00072.safetensors"
4932875563 bytes in 1.11315 seconds: 4.43147 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00070-of-00072.bin"
4932877665 bytes in 1.05519 seconds: 4.67487 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00044-of-00072.bin"
4932877665 bytes in 3.30498 seconds: 1.49256 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00056-of-00072.safetensors"
4932875565 bytes in 1.49834 seconds: 3.29222 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00003-of-00072.bin"
4932877601 bytes in 1.09202 seconds: 4.51722 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00011-of-00072.safetensors"
4932875551 bytes in 3.22613 seconds: 1.52904 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00024-of-00072.safetensors"
4932875561 bytes in 2.73117 seconds: 1.80614 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00015-of-00072.bin"
4932877665 bytes in 2.78437 seconds: 1.77163 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00047-of-00072.bin"
4932877665 bytes in 1.08964 seconds: 4.52709 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00050-of-00072.safetensors"
4932875551 bytes in 1.05356 seconds: 4.68211 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00022-of-00072.bin"
4932877665 bytes in 1.12053 seconds: 4.40229 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00008-of-00072.bin"
4932877601 bytes in 3.23984 seconds: 1.52257 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00011-of-00072.bin"
4932877601 bytes in 2.7694 seconds: 1.78121 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00025-of-00072.bin"
4932877665 bytes in 2.35111 seconds: 2.0981 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00005-of-00072.bin"
4932877601 bytes in 2.94068 seconds: 1.67746 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00001-of-00072.safetensors"
7193289031 bytes in 1.91779 seconds: 3.75082 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00051-of-00072.safetensors"
4932875573 bytes in 3.97354 seconds: 1.24143 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00035-of-00072.bin"
4932877665 bytes in 2.97325 seconds: 1.65908 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00064-of-00072.safetensors"
4932875537 bytes in 2.3653 seconds: 2.08552 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00030-of-00072.safetensors"
4932875531 bytes in 1.07454 seconds: 4.5907 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00004-of-00072.safetensors"
4932875557 bytes in 4.56415 seconds: 1.08079 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00015-of-00072.safetensors"
4932875531 bytes in 3.24319 seconds: 1.52099 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00048-of-00072.safetensors"
4932875555 bytes in 2.37022 seconds: 2.08119 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00027-of-00072.safetensors"
4932875563 bytes in 1.072 seconds: 4.60155 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00060-of-00072.bin"
4932877665 bytes in 1.07123 seconds: 4.60486 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00045-of-00072.safetensors"
4932875569 bytes in 6.21179 seconds: 0.794116 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00043-of-00072.bin"
4932877665 bytes in 3.3426 seconds: 1.47576 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00044-of-00072.safetensors"
4932875555 bytes in 1.14831 seconds: 4.29577 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00032-of-00072.bin"
4932877665 bytes in 1.08985 seconds: 4.52621 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00008-of-00072.safetensors"
4932875519 bytes in 1.09357 seconds: 4.51081 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00052-of-00072.bin"
4932877665 bytes in 6.71973 seconds: 0.734089 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00068-of-00072.safetensors"
4932875563 bytes in 1.05955 seconds: 4.65564 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00013-of-00072.safetensors"
4932875573 bytes in 1.08236 seconds: 4.55753 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00029-of-00072.safetensors"
4932875563 bytes in 6.20449 seconds: 0.795049 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00039-of-00072.bin"
4932877665 bytes in 1.21287 seconds: 4.0671 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00016-of-00072.bin"
4932877665 bytes in 5.95278 seconds: 0.828668 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00056-of-00072.bin"
4932877665 bytes in 1.52038 seconds: 3.2445 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00033-of-00072.bin"
4932877665 bytes in 1.09718 seconds: 4.49596 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00041-of-00072.safetensors"
4932875573 bytes in 6.2845 seconds: 0.784927 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00063-of-00072.bin"
4932877665 bytes in 5.43954 seconds: 0.906855 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00045-of-00072.bin"
4932877665 bytes in 3.228 seconds: 1.52815 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00052-of-00072.safetensors"
4932875549 bytes in 1.97378 seconds: 2.4992 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00022-of-00072.safetensors"
4932875573 bytes in 1.10796 seconds: 4.45222 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00009-of-00072.safetensors"
4932875551 bytes in 5.92352 seconds: 0.832761 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00072-of-00072.bin"
58279 bytes in 0.0159825 seconds: 0.00364643 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00002-of-00072.bin"
4932877601 bytes in 1.0347 seconds: 4.76744 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00007-of-00072.bin"
4932877601 bytes in 6.40498 seconds: 0.770163 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00018-of-00072.bin"
4932877665 bytes in 6.25208 seconds: 0.788998 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00038-of-00072.bin"
4932877665 bytes in 1.50503 seconds: 3.27758 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00032-of-00072.safetensors"
4932875573 bytes in 1.03922 seconds: 4.74671 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00061-of-00072.safetensors"
4932875565 bytes in 6.46773 seconds: 0.76269 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00054-of-00072.bin"
4932877665 bytes in 6.37987 seconds: 0.773194 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00049-of-00072.bin"
4932877665 bytes in 1.06745 seconds: 4.62118 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00069-of-00072.bin"
4932877665 bytes in 1.04855 seconds: 4.70447 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00025-of-00072.safetensors"
4932875553 bytes in 5.97924 seconds: 0.825 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00003-of-00072.safetensors"
4932875551 bytes in 6.97898 seconds: 0.706819 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00031-of-00072.safetensors"
4932875555 bytes in 1.04063 seconds: 4.74029 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00014-of-00072.safetensors"
4932875551 bytes in 1.09654 seconds: 4.49859 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00028-of-00072.safetensors"
4932875573 bytes in 6.06694 seconds: 0.813074 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00068-of-00072.bin"
4932877665 bytes in 6.16835 seconds: 0.799708 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00047-of-00072.safetensors"
4932875573 bytes in 6.5344 seconds: 0.754909 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00031-of-00072.bin"
4932877665 bytes in 1.02919 seconds: 4.79295 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00029-of-00072.bin"
4932877665 bytes in 5.87793 seconds: 0.83922 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00035-of-00072.safetensors"
4932875573 bytes in 1.06242 seconds: 4.64306 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00065-of-00072.safetensors"
4932875571 bytes in 6.52125 seconds: 0.756431 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00065-of-00072.bin"
4932877665 bytes in 6.0777 seconds: 0.811636 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00026-of-00072.safetensors"
4932875563 bytes in 5.9842 seconds: 0.824316 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00042-of-00072.bin"
4932877665 bytes in 5.9264 seconds: 0.832357 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00019-of-00072.bin"
4932877665 bytes in 5.893 seconds: 0.837075 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00048-of-00072.bin"
4932877665 bytes in 4.13364 seconds: 1.19335 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00051-of-00072.bin"
4932877665 bytes in 3.35901 seconds: 1.46855 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00034-of-00072.bin"
4932877665 bytes in 6.14335 seconds: 0.802962 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00020-of-00072.bin"
4932877665 bytes in 1.04687 seconds: 4.71204 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00049-of-00072.safetensors"
4932875541 bytes in 6.37837 seconds: 0.773376 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00053-of-00072.safetensors"
4932875527 bytes in 6.32886 seconds: 0.779426 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00034-of-00072.safetensors"
4932875573 bytes in 6.32055 seconds: 0.78045 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00021-of-00072.safetensors"
4932875553 bytes in 6.25532 seconds: 0.788589 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00058-of-00072.safetensors"
4932875573 bytes in 5.91806 seconds: 0.83353 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00057-of-00072.safetensors"
4932875573 bytes in 5.93299 seconds: 0.831431 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00010-of-00072.safetensors"
4932875541 bytes in 6.10677 seconds: 0.807772 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00014-of-00072.bin"
4932877665 bytes in 6.45824 seconds: 0.763812 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00046-of-00072.bin"
4932877665 bytes in 6.69341 seconds: 0.736975 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00038-of-00072.safetensors"
4932875573 bytes in 1.47864 seconds: 3.3361 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00024-of-00072.bin"
4932877665 bytes in 1.05901 seconds: 4.65803 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00027-of-00072.bin"
4932877665 bytes in 6.40909 seconds: 0.769669 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00006-of-00072.bin"
4932877601 bytes in 5.52455 seconds: 0.892901 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00017-of-00072.bin"
4932877665 bytes in 5.88582 seconds: 0.838095 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00058-of-00072.bin"
4932877665 bytes in 6.09686 seconds: 0.809085 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00009-of-00072.bin"
4932877601 bytes in 6.5395 seconds: 0.75432 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00039-of-00072.safetensors"
4932875555 bytes in 8.07509 seconds: 0.610876 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00042-of-00072.safetensors"
4932875521 bytes in 6.39566 seconds: 0.771284 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00067-of-00072.bin"
4932877665 bytes in 1.92706 seconds: 2.55979 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00016-of-00072.safetensors"
4932875573 bytes in 1.08779 seconds: 4.53477 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00021-of-00072.bin"
4932877665 bytes in 6.08484 seconds: 0.810684 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00005-of-00072.safetensors"
4932875509 bytes in 6.18826 seconds: 0.797134 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00037-of-00072.bin"
4932877665 bytes in 1.07592 seconds: 4.58478 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00001-of-00072.bin"
7193290147 bytes in 8.89842 seconds: 0.808378 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00061-of-00072.bin"
4932877665 bytes in 6.05211 seconds: 0.815067 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00067-of-00072.safetensors"
4932875539 bytes in 1.92372 seconds: 2.56423 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00062-of-00072.safetensors"
4932875573 bytes in 6.68542 seconds: 0.737856 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00007-of-00072.safetensors"
4932875551 bytes in 1.09001 seconds: 4.52553 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00028-of-00072.bin"
4932877665 bytes in 5.91756 seconds: 0.8336 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00060-of-00072.safetensors"
4932875527 bytes in 6.2479 seconds: 0.789526 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00063-of-00072.safetensors"
4932875565 bytes in 6.11895 seconds: 0.806164 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00004-of-00072.bin"
4932877601 bytes in 1.91632 seconds: 2.57415 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00040-of-00072.safetensors"
4932875533 bytes in 1.1723 seconds: 4.20785 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00023-of-00072.safetensors"
4932875573 bytes in 6.29939 seconds: 0.783072 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00010-of-00072.bin"
4932877601 bytes in 6.46308 seconds: 0.763239 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00019-of-00072.safetensors"
4932875555 bytes in 6.28934 seconds: 0.784323 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00054-of-00072.safetensors"
4932875573 bytes in 6.13669 seconds: 0.803833 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00059-of-00072.bin"
4932877665 bytes in 6.26267 seconds: 0.787664 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00006-of-00072.safetensors"
4932875553 bytes in 6.55845 seconds: 0.752141 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00070-of-00072.safetensors"
4932875553 bytes in 1.0652 seconds: 4.63095 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00062-of-00072.bin"
4932877665 bytes in 6.25757 seconds: 0.788305 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00012-of-00072.bin"
4932877665 bytes in 5.91722 seconds: 0.833648 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00057-of-00072.bin"
4932877665 bytes in 6.42177 seconds: 0.76815 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00013-of-00072.bin"
4932877665 bytes in 6.10804 seconds: 0.807604 billion bytes per second.
"/scratch/LLMs/models/bloom/model_00043-of-00072.safetensors"
4932875573 bytes in 6.22971 seconds: 0.791831 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00066-of-00072.bin"
4932877665 bytes in 6.08366 seconds: 0.81084 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00030-of-00072.bin"
4932877665 bytes in 6.34713 seconds: 0.777183 billion bytes per second.
"/scratch/LLMs/models/bloom/pytorch_model_00026-of-00072.bin"
4932877665 bytes in 6.26312 seconds: 0.787607 billion bytes per second.

I'm no expert, but I've seen more than one youtube video where this is described. It seems to be a limitation of the design/system. Also, I would say, believe your evidence (-; . Hoping a real expert will weigh in. Good luck. — shellter, Mar 18 '23 at 16:00
Try reading larger chunks of data at a time - at least a few MiB at a time. — Jesper Juhl, Mar 18 '23 at 16:16
The OS caches the files in memory when you read them. I just wondered if you monitored the swap memory allocation during your tests? — Simon Goater, Mar 18 '23 at 17:42
@SimonGoater, yes, I explicitly clear the OS cache with `echo 3 | sudo tee /proc/sys/vm/drop_caches` before running the program, as well as `posix_fadvise(fd, 0, file_stat.st_size, POSIX_FADV_DONTNEED);` after reading a file in the program. The OS cache stays low. — Serge Rogatch, Mar 18 '23 at 21:44
You could maybe try reordering the files in your test to see if there's a change to the read performance of each file. This might shed some light on whether the slow-down is due to something on the system or maybe the physical location on the drive where the file's data is stored. — Simon Goater, Mar 19 '23 at 18:22
See what you can get from the hardware directly, then you at least know the best you can hope for in your code. `fio --name=seqread --filename=/dev/nvme0n1 --size=16Gb --rw=read --bs=1M --direct=1 --numjobs=1 --ioengine=libaio --iodepth=256 --group_reporting --runtime=30` Then play around a bit with fio, i.e. use a file instead of direct device access, try different iodepths, etc. — TrentP, Mar 19 '23 at 19:14
I think you'll get better performance if you have one thread, which uses aio to allow for multiple outstanding requests at once. Read sequentially. As an aio request is completed, add another. — TrentP, Mar 19 '23 at 19:22

score 2 · Answer 1 · answered Mar 19 '23 at 18:45

This is a partial answer showing a similar (smaller) effect and several hints to track the problem.

I was not able to fully reproduce this issue on my NVMe SSD, a Samsung 980 Pro (1 TB), but I discovered an interesting, similar effect that might explain the behavior of your SSD. Here are the results on my machine for the first ~30 GiB of files:

"/tmp/bigfiles/model_00005-of-00072.safetensors"
4932875509 bytes in 1.4701 seconds: 3.35546 billion bytes per second.
"/tmp/bigfiles/model_00002-of-00072.safetensors"
4932875549 bytes in 1.52471 seconds: 3.23528 billion bytes per second.
"/tmp/bigfiles/model_00004-of-00072.safetensors"
4932875557 bytes in 1.47729 seconds: 3.33913 billion bytes per second.
"/tmp/bigfiles/model_00003-of-00072.safetensors"
4932875551 bytes in 1.66287 seconds: 2.96649 billion bytes per second.
"/tmp/bigfiles/model_00001-of-00072.safetensors"
7193289031 bytes in 2.61768 seconds: 2.74796 billion bytes per second.
"/tmp/bigfiles/model_00006-of-00072.safetensors"
4932875553 bytes in 1.84153 seconds: 2.67868 billion bytes per second.

What we can see is that the speed seems to decrease over time. This effect is consistent over multiple runs. However, the speed of the first file is still ~3.3 GB/s even just after another run. This means my SSD is likely not responsible for the decrease in performance (otherwise, the speed is expected to decrease over subsequent runs). Note that data is not cached (check with iotop and lower-level profiling tools).

I did not download more data because I was running out of space (~4 GiB left) on this partition, and this is actually an important point: the slowest files are the ones I downloaded last! My first hypothesis was that the file system becomes fragmented when there is not much remaining space, so the last file was split into more, smaller chunks than the first. If so, this causes the operating system (OS) to issue more IO requests for small blocks than for the first files, resulting in a lower throughput.

To check that, I just removed the last file to make some space and copied the second file (model_00002-of-00072.safetensors) and here is the result:

"/tmp/bigfiles/model_00005-of-00072.safetensors"
4932875509 bytes in 1.46369 seconds: 3.37017 billion bytes per second.
"/tmp/bigfiles/model_00002-of-00072.safetensors"
4932875549 bytes in 1.8347 seconds: 2.68866 billion bytes per second.   <----------
"/tmp/bigfiles/model_00004-of-00072.safetensors"
4932875557 bytes in 1.47645 seconds: 3.34104 billion bytes per second.
"/tmp/bigfiles/model_00003-of-00072.safetensors"
4932875551 bytes in 1.69516 seconds: 2.90997 billion bytes per second.
"/tmp/bigfiles/model_00001-of-00072.safetensors"
7193289031 bytes in 2.64308 seconds: 2.72155 billion bytes per second.

[missing: "/tmp/bigfiles/model_00006-of-00072.safetensors"]

As we can see, the speed is consistent with the previous run for all the files except for the copied file. Note that the file system on the partition is Ext4. Other file systems can result in different behavior, especially FAT which tends to get quickly fragmented.

That being said, I tried to use the tool e4defrag to defragment the files and it did not significantly improve the situation. In fact, all files except the first (fastest) have been reported as files that do "not need defragmentation". A report can be shown using e4defrag -c /your/directory. Copying files is also a simple way to automatically defragment them.

I then tested the same program again on the same Linux system on a larger NTFS partition (800 GiB) and did not reproduce the effect. I downloaded more files and made many copies of the last file so as not to wait a long time for downloads. The resulting directory took 229 GiB. Results on the NTFS partition are actually more stable and, surprisingly, even better than on the Ext4 partition. I ran this 3 times to be sure this is reproducible (and it was). Here are the results of the last run:

"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00007-of-00072.safetensors"
4932875551 bytes in 1.31361 seconds: 3.75521 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00008-of-00072.safetensors"
4932875519 bytes in 1.29188 seconds: 3.81837 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00009-of-00072.safetensors"
4932875551 bytes in 1.28907 seconds: 3.82669 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00010-of-00072.safetensors"
4932875541 bytes in 1.29115 seconds: 3.82053 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00011-of-00072.safetensors"
4932875551 bytes in 1.29246 seconds: 3.81666 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00012-of-00072.safetensors"
4932875573 bytes in 1.29473 seconds: 3.80995 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00013-of-00072.safetensors"
4932875573 bytes in 1.28939 seconds: 3.82575 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00014-of-00072.safetensors"
4932875551 bytes in 1.28572 seconds: 3.83668 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00015-of-00072.safetensors"
4932875531 bytes in 1.28118 seconds: 3.85025 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy1.safetensors"
4932875573 bytes in 1.2539 seconds: 3.93404 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy10.safetensors"
4932875573 bytes in 1.26357 seconds: 3.90393 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy11.safetensors"
4932875573 bytes in 1.25435 seconds: 3.93262 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy30.safetensors"
4932875573 bytes in 1.24447 seconds: 3.96383 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy4.safetensors"
4932875573 bytes in 1.26411 seconds: 3.90225 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy5.safetensors"
4932875573 bytes in 1.24267 seconds: 3.96959 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy6.safetensors"
4932875573 bytes in 1.24377 seconds: 3.96607 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy7.safetensors"
4932875573 bytes in 1.24523 seconds: 3.96141 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy8.safetensors"
4932875573 bytes in 1.24923 seconds: 3.94875 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy9.safetensors"
4932875573 bytes in 1.24158 seconds: 3.97305 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072.safetensors"
4932875573 bytes in 1.24082 seconds: 3.9755 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy13.safetensors"
4932875573 bytes in 1.24269 seconds: 3.96953 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy14.safetensors"
4932875573 bytes in 1.23559 seconds: 3.99231 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy15.safetensors"
4932875573 bytes in 1.23835 seconds: 3.98343 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy16.safetensors"
4932875573 bytes in 1.25322 seconds: 3.93617 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy17.safetensors"
4932875573 bytes in 1.23546 seconds: 3.99275 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy18.safetensors"
4932875573 bytes in 1.24692 seconds: 3.95606 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy19.safetensors"
4932875573 bytes in 1.23862 seconds: 3.98255 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy2.safetensors"
4932875573 bytes in 1.23723 seconds: 3.98703 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy20.safetensors"
4932875573 bytes in 1.243 seconds: 3.96852 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy21.safetensors"
4932875573 bytes in 1.2451 seconds: 3.96182 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy22.safetensors"
4932875573 bytes in 1.25585 seconds: 3.92793 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy23.safetensors"
4932875573 bytes in 1.26094 seconds: 3.91205 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy24.safetensors"
4932875573 bytes in 1.23445 seconds: 3.99602 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy25.safetensors"
4932875573 bytes in 1.2411 seconds: 3.97459 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy26.safetensors"
4932875573 bytes in 1.24181 seconds: 3.97233 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy27.safetensors"
4932875573 bytes in 1.23395 seconds: 3.99762 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy28.safetensors"
4932875573 bytes in 1.23269 seconds: 4.00172 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy29.safetensors"
4932875573 bytes in 1.25099 seconds: 3.94318 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00006-of-00072.safetensors"
4932875553 bytes in 1.27636 seconds: 3.86481 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy12.safetensors"
4932875573 bytes in 1.23839 seconds: 3.9833 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00016-of-00072-copy3.safetensors"
4932875573 bytes in 1.23959 seconds: 3.97945 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00001-of-00072.safetensors"
7193289031 bytes in 1.86529 seconds: 3.8564 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00002-of-00072.safetensors"
4932875549 bytes in 1.27457 seconds: 3.87021 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00003-of-00072.safetensors"
4932875551 bytes in 1.29629 seconds: 3.80538 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00004-of-00072.safetensors"
4932875557 bytes in 1.2737 seconds: 3.87288 billion bytes per second.
"/media/richard/24523FDD3AC5C016/Documents and Settings/zephyr/Downloads/bigfiles/model_00005-of-00072.safetensors"
4932875509 bytes in 1.28479 seconds: 3.83945 billion bytes per second.

I advise you to try storing the files on a different partition with another file system so as to check whether the performance gap is coming from that. I expect the culprit to be the SSD driver, the firmware or the hardware itself.

So far, the SLC cache of my SSD has not impacted the read timings, only writes, but it is not clear whether reads can be impacted by the SLC cache. The behaviour can be pretty different from one SSD to another. On my SSD, the firmware stores data in the SLC cache and does not move the data directly to the TLC blocks. I guess this is to improve the life of the SSD by avoiding systematically storing data to TLC when it could be removed shortly afterwards (TLC cells have a limited number of writes, which tends to be lower than that of SLC cells, even with a dynamic pseudo-SLC cache like on your SSD, which uses TLC cells to make an SLC cache). When the cache is close to being saturated, it apparently starts to move most of the SLC cache content to TLC blocks. I guess this is to be able to better sustain the next write bursts and delay the throughput switch due to the SLC->TLC block copy.

Note that a good firmware should keep regularly modified file blocks in the SLC cache so as to reduce the wear of the SSD. This means a way to check the speed of the SLC vs TLC blocks is to hammer-write a file and then measure the time to read it compared to a file written once. However, this strategy is dangerous since it can significantly reduce the life of the SSD if the firmware does not actually store the hammered file's content in the SLC cache. Because of this, I did not try this on my SSD.

Note that once a set of SLC/TLC pages belonging to a file is written, there is no reason for the read speed to change (except if the requests are done less contiguously -- which is unexpected for a file left untouched with no other writes done on the target SSD -- or if the firmware surprisingly decides to move the read pages -- which is possible when other writes are done meanwhile, for example due to wear levelling). Consequently, it would be interesting to read the files in a different order so as to check whether the bad performance results are tied to specific files or just dependent on the time/heat. To check whether the heat is a problem, you can try to intensively use the SSD before running the program and check if this impacts the performance of the benchmark. Note that 75°C is pretty high for this SSD (it is advised not to exceed 70°C). Besides this (and the impact of the FS), I am running out of ideas. I hope this helps.

score 1 · Answer 2 · answered Mar 22 '23 at 21:06

So this may not be a complete answer, but one additional thing to consider is something called "read disturb". What this means is that repeatedly reading the same data location on the physical media can have a weak programming effect on the cells in that area. The SSD internals do account for this to keep your data safe, but this can trigger things like internal garbage collection and data refresh cycles, which may influence your application's read speed because the SSD is now frantically moving things around.

NVMe SSD read speed drops after multiple GBs read

2 Answers