I have a problem with memory usage in a C++ program. Here is a simplified version of the code:
main.cpp
#include <iostream>
#include "seq.h"
// Entry point: builds one SnpSite for the alignment file and calls test() on it
// twice in a row so that the two runs can be compared.
int main(int argc, char *argv[]) {
    (void)argc;
    (void)argv;
    // SnpSite's constructor takes a non-const char*. A string literal cannot be
    // converted to char* in C++11 and later, so the path is kept in a mutable array.
    char input_path[] = "/mnt/c/Users/manht/Downloads/s_typhi_wong_holt.aln.gz";
    SnpSite snp_site(input_path);
    snp_site.test(); // run the first time
    snp_site.test(); // run the second time
    return 0;
}
seq.h
#include "file_handler.h"
#include <stdio.h>
// Pairs the path of an input file with the FileHandler used to read it.
class SnpSite {
private:
// Path received by the constructor; test() passes it to FileHandler::open().
string inputfile;
// Reader that test() opens, reads one sample from, and closes.
FileHandler fh;
public:
// Stores _inputfile as the path used by later calls to test().
SnpSite(char* _inputfile);
// NOTE(review): declared here but no definition is visible in this file;
// presumably classifies a base character — confirm against the implementation.
int is_unknown(char base);
// Opens inputfile, reads the next sample through
// FileHandler::assign_next_sample_to(), then closes the file.
void test();
};
seq.cpp
#include "seq.h"
// Stores the input path and default-constructs the reader.
//
// _inputfile: path of the file that test() will open. A null pointer is
//             stored as an empty path instead of being handed to std::string,
//             which would be undefined behaviour.
//
// The members are set up in the initialiser list (in declaration order) so the
// FileHandler is constructed exactly once; the previous `fh = FileHandler()`
// built a temporary and copy-assigned it, buffer included, over an already
// constructed member.
SnpSite::SnpSite(char* _inputfile)
    : inputfile(_inputfile != 0 ? _inputfile : ""),
      fh() {
}
void SnpSite::test() {
string sample_name, seq;
this->fh.open(this->inputfile.c_str());
this->fh.assign_next_sample_to(&sample_name, &seq);
this->fh.close();
}
file_handler.h
#ifndef SEQ_H_
#include <zlib.h>
#include <utility>
#include <ctype.h>
#include "my_string.h"
#include <string>
using namespace std;
#define SEQ_H_
// Signature shared by the delimiter predicates in file_handler.cpp:
// returns true when character c counts as a match for delimiter.
typedef bool (*match_func)(int c, int delimiter);
// Buffered reader on top of zlib's gzFile API (gzopen / gzread / gzclose).
class FileHandler {
private:
// zlib handle: assigned by open(), read through gzread(), released by close().
gzFile file;
// Fixed-size read buffer stored inside the object (no heap allocation);
// refilled by gzread() 2048 bytes at a time.
char buffer[2048]; // Static allocation for better performance.
// buffer_start is the cursor into buffer; buffer_end holds the value returned
// by the most recent gzread(). The default constructor sets both to -1.
int buffer_start, buffer_end;
// Set to true when gzread() returns 0; reset to false by open().
bool eof;
// Appends bytes to *s until a byte matching delimiter is found
// (SEP_SPACE / SEP_TAB / SEP_LINE select a predicate; larger values are
// compared as a literal character).
void get_until(int delimiter, string *s);
public:
// Starts with an empty buffer (cursor and end at -1) and eof == false.
FileHandler();
// NOTE(review): declared but no definition is visible in this file, and the
// buffer size is fixed at 2048 above — confirm whether this overload is used.
FileHandler(int _buffer_size);
// Opens filename with gzopen() in read mode and clears the eof flag.
void open(const char* filename);
// Closes the current handle with gzclose().
void close();
// Clears *name and *seq, then fills them with the next sample's name and
// sequence read from the file.
void assign_next_sample_to(string *name, string *seq);
// Returns the next byte from the buffer, refilling it from the file when needed.
int next_char();
// Returns the eof flag.
bool is_eof();
};
#endif
file_handler.cpp
#include "file_handler.h"
// Creates a reader with no file attached.
//
// Every member gets a defined value: the handle starts as Z_NULL (so it is
// never an indeterminate pointer when passed to zlib), the buffer is
// zero-filled (so copying a freshly constructed FileHandler never reads
// indeterminate bytes), the cursor and end index start at -1 so that the first
// read triggers a refill, and eof starts out false.
FileHandler::FileHandler()
    : file(Z_NULL),
      buffer(),
      buffer_start(-1),
      buffer_end(-1),
      eof(false) {
}
void FileHandler::open(const char* filename) {
file = gzopen(filename, "r");
eof = false;
}
void FileHandler::close() {
gzclose(file);
}
// Advances the cursor by one and returns the byte it now points at.
//
// Returns the byte as stored in the char buffer (converted to int), or -1 when
// no more input is available. Because a stored byte may itself convert to -1
// where char is signed, callers should use is_eof() to detect the end of input.
int FileHandler::next_char() {
    // buffer_start indexes the byte consumed most recently, so the byte about
    // to be returned is at buffer_start + 1. Refill when that index is past the
    // valid data; testing buffer_start alone would return buffer[buffer_end],
    // which is stale data or one past the array when the buffer is full.
    if (buffer_start + 1 >= buffer_end) {
        buffer_end = gzread(file, buffer, static_cast<unsigned>(sizeof(buffer)));
        buffer_start = -1;
        if (buffer_end <= 0) {
            // gzread() returns 0 at end of file and -1 on error; either way
            // there is nothing valid in the buffer to hand out.
            buffer_start = 0;
            buffer_end = 0;
            eof = true;
            return -1;
        }
    }
    return buffer[++buffer_start];
}
bool FileHandler::is_eof() {
return eof;
}
#define SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
#define SEP_TAB 1 // isspace() && !' '
#define SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
#define SEP_MAX 2
// list of function to compare c and delimiter, need exactly 2 arguments.
// Predicate for SEP_SPACE: true when c is any whitespace byte per isspace().
// c is a byte value as stored in a char, so it may be negative where char is
// signed; it is converted to unsigned char first because isspace() is only
// defined for values representable as unsigned char (or EOF).
// delimter is unused; it exists so the function fits the match_func signature.
bool match_space(int c, int delimter) {
    (void)delimter;
    return isspace(static_cast<unsigned char>(c)) != 0;
}
// Predicate for SEP_TAB: true when c is whitespace per isspace() other than
// the plain space character.
// c is a byte value as stored in a char, so it may be negative where char is
// signed; it is converted to unsigned char first because isspace() is only
// defined for values representable as unsigned char (or EOF).
// delimter is unused; it exists so the function fits the match_func signature.
bool match_tab(int c, int delimter) {
    (void)delimter;
    return c != ' ' && isspace(static_cast<unsigned char>(c)) != 0;
}
// Predicate for SEP_LINE: true only for the line-feed character.
// delimter is ignored; it is present to satisfy the match_func signature.
bool match_newline(int c, int delimter) {
    (void)delimter;
    const bool is_line_feed = ('\n' == c);
    return is_line_feed;
}
// Predicate for literal delimiters: true when c is exactly the delimiter value.
bool match_char(int c, int delimter) {
    if (delimter == c) {
        return true;
    }
    return false;
}
// Predicate that rejects every character, whatever the delimiter is.
// Both arguments are ignored; they exist to satisfy the match_func signature.
bool no_match(int c, int delimiter) {
    (void)c;
    (void)delimiter;
    const bool never = false;
    return never;
}
// end list.
// Appends bytes to *s, starting with the byte under the cursor, until a byte
// matching `delimiter` is reached or the input ends.
//
// delimiter: SEP_SPACE, SEP_TAB or SEP_LINE select the corresponding predicate
//            (match_space / match_tab / match_newline); any value greater than
//            SEP_MAX is compared as a literal character; any other value
//            matches nothing, so the read runs to the end of the input.
// s:         string that receives the bytes; existing content is kept.
//
// On return the cursor (buffer_start) is on the matching byte, which is not
// appended, or the eof flag is set because the input ran out first.
void FileHandler::get_until(int delimiter, string *s) {
    match_func match; // function to check if a char match delimiter
    switch (delimiter) {
    case SEP_SPACE:
        match = match_space;
        break;
    case SEP_TAB:
        match = match_tab;
        break;
    case SEP_LINE:
        match = match_newline;
        break;
    default:
        if (delimiter > SEP_MAX) match = match_char;
        else match = no_match;
    }

    // A cursor of -1 means "before the first byte"; start at index 0 so that
    // buffer[-1] is never examined.
    if (buffer_start < 0) buffer_start = 0;

    int i = buffer_start;
    // Only look at buffer[i] while i is inside the valid data. A byte at or
    // past buffer_end is stale (or out of bounds when the buffer is full) and
    // must not be able to end the read.
    while (!(i < buffer_end && match(buffer[i], delimiter))) {
        if (buffer_start >= buffer_end) {
            buffer_end = gzread(file, buffer, static_cast<unsigned>(sizeof(buffer)));
            buffer_start = 0;
            i = 0;
            if (buffer_end <= 0) {
                // 0 is end of file, -1 is a read error. Both end the input;
                // retrying after an error would loop forever.
                buffer_end = 0;
                eof = true;
                break;
            }
        }
        // Bounds check first, then the byte test.
        while (i < buffer_end && !match(buffer[i], delimiter)) i++;
        s->append(buffer + buffer_start, i - buffer_start);
        buffer_start = i;
    }
}
/*
Reads the next sample from the file: *name receives the sample name and *seq
the sequence. Both strings are emptied before anything is read.
(Note: this function does not read the quality scores of a QUAL file.)
*/
void FileHandler::assign_next_sample_to(string *name, string *seq) {
    name->clear();
    seq->clear();

    int ch;

    // Consume bytes until a record marker ('>' or '@') shows up or input ends.
    while (!eof) {
        ch = next_char();
        if (ch == '>' || ch == '@') break;
    }

    // The sample name runs up to the first whitespace byte.
    get_until(SEP_SPACE, name);

    // Gather the sequence line by line. A new record marker ('>' or '@') or a
    // '+' ends the sequence; bare newlines between lines are skipped.
    while (!eof) {
        ch = next_char();
        if (ch == '>' || ch == '@' || ch == '+') break;
        if (ch == '\n') continue;
        get_until(SEP_LINE, seq);
    }

    // Move the cursor back by one so the byte that ended the sequence is
    // returned again by the next call to next_char().
    --buffer_start;
}
I don't use any explicit dynamic allocation, and when I traced the memory usage of the process (by PID) in htop, I found something that I can't explain:
- The first time I call test():
  - At the beginning of the function, my process uses 6168 KBytes.
  - At the end of the function, my process uses 13998 KBytes.
- The second time I call test():
  - At the beginning of the function, my process uses 6304 KBytes.
  - At the end of the function, my process uses 21664 KBytes.
The length of the `seq` variable is 4809037 and the length of `sample_name` is 11 in both cases. I don't understand why the memory usage differs so much between the two calls. I hope someone can find the cause and explain it to me; it would help me a lot. Thanks.