1

I have a problem with C++ and memory. Here's the pseudocode:

main.cpp
#include <iostream>
#include "seq.h"

// Entry point: builds one SnpSite for a hard-coded gzip-compressed alignment
// file and calls test() twice in a row on the same object.
int main(int argc, char *argv[]) {
    SnpSite snp_site("/mnt/c/Users/manht/Downloads/s_typhi_wong_holt.aln.gz");
    for (int pass = 0; pass < 2; ++pass) {
        snp_site.test(); // pass 0 is the first run, pass 1 the second
    }
    return 0;
}
seq.h
#include "file_handler.h"
#include <stdio.h>

// Pairs an input file path with the FileHandler used to read that file.
class SnpSite {
private:
    // Path given to the constructor; test() passes it to fh.open().
    string inputfile;
    // Reader that test() uses to open the file, fetch one sample and close it.
    FileHandler fh;
public:
    // Remembers _inputfile for later use. The file is not opened here.
    SnpSite(char* _inputfile);

    // NOTE(review): no definition of this member is visible in this file;
    // presumably it reports whether `base` is an unknown base - confirm.
    int is_unknown(char base);
    // Opens inputfile, reads the next sample (name and sequence) into local
    // strings, then closes the file. The strings are discarded on return.
    void test();
};
seq.cpp
#include "seq.h"

// Stores the path of the file to read; the file itself is opened later by
// test().
//
// _inputfile: NUL-terminated path. A null pointer is treated as an empty path
//             rather than being handed to std::string (which would be
//             undefined behavior).
//
// Members are set up in the initializer list so that `fh` is constructed once.
// Assigning from a temporary FileHandler would copy whatever that temporary
// left uninitialized.
SnpSite::SnpSite(char* _inputfile)
    : inputfile(_inputfile != nullptr ? _inputfile : ""),
      fh() {
}

void SnpSite::test() {
    string sample_name, seq;
    this->fh.open(this->inputfile.c_str());
    this->fh.assign_next_sample_to(&sample_name, &seq);
    this->fh.close();
}
file_handler.h
#ifndef SEQ_H_
#include <zlib.h>
#include <utility>
#include <ctype.h>
#include "my_string.h"
#include <string>

using namespace std;

#define SEQ_H_

// Pointer to a predicate that decides whether character `c` matches
// `delimiter`; FileHandler::get_until() selects one of these per call.
typedef bool (*match_func)(int c, int delimiter);

// Buffered reader on top of a zlib gzFile. It hands out single characters
// (next_char) and whole sample records (assign_next_sample_to).
class FileHandler {
private:
    // zlib stream handle: assigned by open(), passed to gzclose() by close().
    gzFile file;
    // Fixed-size read buffer that gzread() fills (a plain member array, so no
    // heap allocation is needed for it).
    char buffer[2048]; // Static allocation for better performance.
    // buffer_start: read cursor (an index into buffer).
    // buffer_end:   value returned by the most recent gzread().
    int buffer_start, buffer_end;
    // Set when gzread() reports that no more data is available; cleared by open().
    bool eof;
    // Appends characters to *s until one matches `delimiter`, which is either
    // one of the SEP_* codes or a literal character value above SEP_MAX.
    void get_until(int delimiter, string *s);
public:
    // Puts both cursors at -1 and clears the eof flag.
    FileHandler();
    // NOTE(review): no definition of this overload is visible in this file and
    // the buffer size is fixed at 2048 - confirm it is implemented before use.
    FileHandler(int _buffer_size);

    // Opens `filename` with gzopen() in read mode.
    void open(const char* filename);
    // Closes the stream with gzclose().
    void close();
    // Clears *name and *seq, then fills them with the next sample's name and
    // sequence.
    void assign_next_sample_to(string *name, string *seq);
    // Returns the next character, refilling the buffer from the file as needed.
    int next_char();
    // Returns the eof flag.
    bool is_eof();
};

#endif
file_handler.cpp
#include "file_handler.h"

// Creates a handler with no stream attached.
//
// `file` is set to null explicitly so the object never carries an
// indeterminate handle (for example when it is copied or closed before open()
// has been called). Both cursors start at -1, which forces the first read to
// refill the buffer. `buffer` itself is left unfilled because it is always
// written by gzread() before being read.
FileHandler::FileHandler()
    : file(nullptr),
      buffer_start(-1),
      buffer_end(-1),
      eof(false) {
}

void FileHandler::open(const char* filename) {
    file = gzopen(filename, "r");
    eof = false;
}

// Closes the current stream, if any.
//
// The handle is cleared after gzclose() so that calling close() again does
// nothing instead of closing an already released stream a second time.
void FileHandler::close() {
    if (file != nullptr) {
        gzclose(file);
        file = nullptr;
    }
}

// Returns the next character of the stream and advances the cursor by one.
//
// buffer_start holds the index of the character consumed last, so the next one
// sits at buffer_start + 1. The buffer is refilled as soon as that index falls
// outside the valid range [0, buffer_end). Testing buffer_start alone would
// let the read run one element past the data (and past the array when the
// buffer is full).
//
// Returns the character as a non-negative value (unsigned char range), or -1
// once no more data can be read. In that case the eof flag is set; a gzread()
// error is treated the same way as end of file.
int FileHandler::next_char() {
    if (buffer_start + 1 >= buffer_end) {
        buffer_end = gzread(file, buffer, static_cast<unsigned>(sizeof(buffer)));
        if (buffer_end <= 0) {
            // Nothing more to read: leave an empty, consistent buffer state.
            buffer_start = 0;
            buffer_end = 0;
            eof = true;
            return -1;
        }
        buffer_start = -1;
    }
    // Convert through unsigned char so bytes >= 0x80 cannot collide with -1.
    return static_cast<unsigned char>(buffer[++buffer_start]);
}

bool FileHandler::is_eof() {
    return eof;
}

// Delimiter codes understood by FileHandler::get_until(). A delimiter value
// greater than SEP_MAX is compared as a literal character instead.
#define SEP_SPACE 0 // any character for which isspace() is true
#define SEP_TAB   1 // isspace() is true and the character is not ' '
#define SEP_LINE  2 // line separator: matched on '\n' only; for "\r\n" input the '\r' is not treated as part of the separator
#define SEP_MAX   2 // largest reserved code

// list of function to compare c and delimiter, need exactly 2 arguments.
// Predicate for SEP_SPACE: true when `c` is whitespace according to isspace().
// `delimiter` is ignored; it exists only to fit the match_func signature.
// `c` is converted to unsigned char first because passing a negative char
// value (a byte >= 0x80 on platforms where char is signed) to isspace() is
// undefined behavior.
bool match_space(int c, int delimiter) {
    (void)delimiter;
    return isspace(static_cast<unsigned char>(c)) != 0;
}
// Predicate for SEP_TAB: true when `c` is whitespace other than a plain space.
// `delimiter` is ignored; it exists only to fit the match_func signature.
// `c` is converted to unsigned char first because passing a negative char
// value to isspace() is undefined behavior.
bool match_tab(int c, int delimiter) {
    (void)delimiter;
    if (c == ' ') {
        return false;
    }
    return isspace(static_cast<unsigned char>(c)) != 0;
}
// Predicate for SEP_LINE: true only for the line-feed character '\n'.
// `delimiter` is ignored; it exists only to fit the match_func signature.
bool match_newline(int c, int delimiter) {
    (void)delimiter;
    const bool is_line_feed = ('\n' == c);
    return is_line_feed;
}
// Predicate for literal delimiters: true when `c` equals `delimiter`.
bool match_char(int c, int delimiter) {
    const bool same = (delimiter == c);
    return same;
}
// Predicate that never matches: returns false for every input, so a scan
// using it only stops when the data runs out.
bool no_match(int c, int delimiter) {
    (void)c;
    (void)delimiter;
    return false;
}
// end list.

// Appends characters to *s, starting with the one at the cursor, until a
// character matching `delimiter` is found or the input is exhausted.
//
// delimiter: SEP_SPACE, SEP_TAB or SEP_LINE to use the corresponding
//            predicate; a value greater than SEP_MAX is compared as a literal
//            character; any other value never matches (reads to end of input).
// s:         destination string; matched data is appended, never replaced.
//
// On return the cursor (buffer_start) rests on the delimiter, which is not
// appended. If the input ends first, the eof flag is set.
//
// Every buffer access is bounds-checked before the character is inspected:
// the cursor may legitimately be -1 (nothing consumed yet) or equal to
// buffer_end (buffer used up), and neither index may be dereferenced. A
// gzread() error is treated like end of file so the loop always terminates.
void FileHandler::get_until(int delimiter, string *s) {
    match_func match = no_match; // predicate deciding whether a char is the delimiter
    switch (delimiter) {
        case SEP_SPACE:
            match = match_space;
            break;
        case SEP_TAB:
            match = match_tab;
            break;
        case SEP_LINE:
            match = match_newline;
            break;
        default:
            if (delimiter > SEP_MAX) match = match_char;
            break;
    }

    // A negative cursor means no character of the buffer has been consumed.
    if (buffer_start < 0) buffer_start = 0;
    int i = buffer_start;

    for (;;) {
        if (i >= buffer_end) {
            // Buffer used up: fetch the next chunk and restart at its beginning.
            buffer_end = gzread(file, buffer, static_cast<unsigned>(sizeof(buffer)));
            buffer_start = 0;
            i = 0;
            if (buffer_end <= 0) {
                buffer_end = 0;
                eof = true;
                break;
            }
        }
        // Bound first, then inspect: never look at buffer[buffer_end].
        while (i < buffer_end && !match(buffer[i], delimiter)) i++;
        s->append(buffer + buffer_start, i - buffer_start);
        buffer_start = i;
        if (i < buffer_end) break; // stopped on the delimiter
    }
}

/*
    Reads the next sample from the stream.

    name: receives the sample name; cleared first.
    seq:  receives the sequence with line feeds removed; cleared first.

    A record starts at a '>' or '@' character. The name runs up to the first
    whitespace character. The sequence is collected line by line until the
    next '>', '@' or '+' character, or until the end of the input.
    (Note: this function does not read the quality scores of a QUAL file.)

    NOTE(review): get_until() begins copying at the cursor, which at that point
    is on the record marker, so the marker appears to end up in *name - confirm
    that this is intended.
    NOTE(review): text following the first whitespace on the header line is not
    skipped and seems to be collected as sequence data - confirm.
*/
void FileHandler::assign_next_sample_to(string *name, string *seq) {
    name->clear();
    seq->clear();

    // Skip forward to the marker that starts the next record.
    while (!eof) {
        const int ch = next_char();
        if (ch == '>' || ch == '@') break;
    }
    get_until(SEP_SPACE, name); // sample name

    // Collect sequence lines until the next marker or the end of the input.
    while (!eof) {
        const int ch = next_char();
        if (ch == '>' || ch == '@' || ch == '+') break;
        if (ch != '\n') {
            get_until(SEP_LINE, seq); // one line of sequence
        }
    }

    // Step back one position so the next call sees the marker again.
    --buffer_start;
}

I don't use any dynamic allocation, and when I traced memory usage by PID in htop, I found something that I can't explain:

  • The first time I call test():
    • At the beginning of the function, my process uses 6168 KBytes.
    • At the end of the function, my process uses 13998 Kbytes.
  • The second time I call test():
    • At the beginning of the function, my process uses 6304 Kbytes.
    • At the end of the function, my process uses 21664 Kbytes.

The length of the seq variable is 4809037 and the length of sample_name is 11 in both cases. I don't understand why the memory usage differs so much between the two calls. I hope someone can figure it out and explain it to me; it would help me a lot. Thanks

Cosmin
  • 21,216
  • 5
  • 45
  • 60
manhtr76
  • 93
  • 1
  • 6
  • You really should not care about that. The amount of allocated memory is only an implementation detail and only matters if you are developing a compiler and its support library. For *real world* programs, what matters is not leaking memory. Said differently, if a tool like valgrind says that your code has memory leaks, you should fix them, even if your *memory measurements* do not detect them; and if valgrind says that everything is correct, memory measurements only make sense for learning the requirements of your program. – Serge Ballesta Jan 02 '23 at 15:07
  • 1
    _"...I don't use any dynamic allocation,..."_ `string inputfile;` and `string sample_name,` the contents probably are on the heap (SSO), there might be other cases as well. Heap space is granted by the OS in pages and used by the C++ memory sub-allocator as needed. Heap pages are not usually returned to the OS until the program finishes. – Richard Critten Jan 02 '23 at 15:10
  • as a side remark [don't use ```using namespace std```](https://stackoverflow.com/questions/1452721/why-is-using-namespace-std-considered-bad-practice) – francesco Jan 02 '23 at 15:13
  • 1
    Not directly related, but FileHandler's destructor should call gzclose() if the file is still open, which means that FileHandler::close() should reset FileHandler::file to a known empty value, which means that the constructor should **initialize** FileHandler::file. So yes, there is a resource leak in your code, but you haven't stumbled on it yet. – Michaël Roy Jan 02 '23 at 15:37
  • I ran the program with valgrind and there is no memory leak. – manhtr76 Jan 02 '23 at 16:45
  • @RichardCritten I don't really understand, at the beginning of each function call, my program always uses an amount of 6MB memory. So I still think it freed memory. – manhtr76 Jan 02 '23 at 17:08

1 Answers1

0

This happens because of this line:

s->append((char*)(buffer + buffer_start), i - buffer_start);

Strings are dynamically allocated and every time the initial size is exceeded a new larger memory block is allocated. You can read more about this here: Chapter 4. Optimize String Use: A Case Study.

Cosmin
  • 21,216
  • 5
  • 45
  • 60
  • I know but I don't understand why the memory allocated is different between two function calls, despite the same conditions. – manhtr76 Jan 04 '23 at 03:04
  • @manhtr76 Because the block size is not always the same. – Cosmin Jan 04 '23 at 07:39
  • There is a big difference between the first two calls, but the third time I call the test() function, the memory usage is almost the same as the second time. – manhtr76 Jan 04 '23 at 15:05