While the OP claimed it takes 12 seconds to process 100 files, I got a much faster measurement.
TL;DR:
Time for the OP's method: 0.006472 seconds
Faster method: 0.005364 seconds
First, let's define what we measure: we ignore the time spent on file I/O, since it cannot be avoided at the level of implementation we are focusing on.
We define the data processing function with this interface:
int testcase (uint8_t* des, size_t deslen, const uint8_t* src, size_t srclen);
Then we measure the time spent over N iterations; for simplicity I use N=1 here:
T0 = chrono::system_clock::now();
test_base(des1, testdata_size, src, testdata_size);
T1 = chrono::system_clock::now();
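If you want a less noisy number, run the call N times and divide the total time by N, along these lines (just a sketch; N is whatever you pick):

const int N = 100;
T0 = chrono::system_clock::now();
for (int n = 0; n < N; ++n) {
    test_base(des1, testdata_size, src, testdata_size);
}
T1 = chrono::system_clock::now();
double seconds_per_call =
    chrono::duration_cast<chrono::nanoseconds>(T1 - T0).count() / (1.0E9 * N);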
With a measurement like this we have an objective idea of what is faster.
However, I should point out that there is no generic algorithm that is fastest in all cases; that's why different algorithms exist.
For this particular test, we should consider the input sample space and design the processing so that it is statistically optimised for that input.
I assume the input is plain English text, so most characters are lower-case ASCII letters, with relatively few tabs, spaces and line feeds; if this is not the case, my method will be slower.
Now, with that assumption, I keep track of the characters already scanned and copy each run to the output with a single memcpy; this yields a slightly faster measurement in this particular case.
The full source (compile with clang -lstdc++ -O2 a.cpp -o a):
#include <chrono>
#include <cstdint>
#include <cstdlib>
#include <cstring>
#include <iostream>
using namespace std;
int test_base(uint8_t* des, size_t deslen, const uint8_t* src, size_t srclen) {
    size_t i, j;
    if ( deslen < srclen ) return -1;
    for (i = 0, j = 0; i < srclen; ++i) {
        switch (src[i]) {
            case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
            case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
            case 'o': case 'p': case 'q': case 'r': case 's': case 't': case 'u':
            case 'v': case 'w': case 'x': case 'y': case 'z':
                // Lower-case letter: convert to upper case
                des[j] = src[i] - 32;
                ++j;
                break;
            case ' ':
            case '\t':
            case '\r':
            case '\n':
                // Whitespace: drop it
                break;
            default:
                // Anything else is copied through unchanged
                des[j] = src[i];
                ++j;
        }
    }
    return 0;
}
int testcase_1(uint8_t* des, size_t deslen, const uint8_t* src, size_t srclen) {
    const uint8_t* end = src + srclen;
    const uint8_t* head = src;   // start of the run not yet copied to des
    if ( deslen < srclen ) return -1;
    for (; src < end; src++) {
        uint8_t value = *src;
        if ( value >= 'a' && value <= 'z' ) {
            // Flush the pending run, then append the upper-cased letter
            size_t len = (size_t)(src - head);
            if ( len > 0 ) memcpy(des, head, len);
            des[len] = value - 32;
            des += len + 1;
            head = src + 1;
        } else if ( value == ' ' || value == '\t' || value == '\r' || value == '\n' ) {
            // Flush the pending run and skip the whitespace character
            size_t len = (size_t)(src - head);
            if ( len > 0 ) memcpy(des, head, len);
            des += len;
            head = src + 1;
        }
        // Any other character just extends the pending run
    }
    // Flush whatever is left after the last letter/whitespace
    size_t tail = (size_t)(end - head);
    if ( tail > 0 ) memcpy(des, head, tail);
    return 0;
}
int main(int argc, char* argv[]) {
    chrono::time_point<chrono::system_clock> T0, T1;
    uint64_t duration;

    // Create test data
    uint8_t *src, *des1, *des2;
    const size_t testdata_size = 1024 * 1024;   // 1MB
    src  = new uint8_t[testdata_size];
    des1 = new uint8_t[testdata_size];
    des2 = new uint8_t[testdata_size];
    // TODO: seed
    for ( size_t i = 0; i < testdata_size; i++ ) {
        src[i] = (uint8_t)rand();
    }
    // Put things in cache and realize the memory
    memset(des1, 0, testdata_size);
    memset(des2, 0, testdata_size);

    T0 = chrono::system_clock::now();
    test_base(des1, testdata_size, src, testdata_size);
    T1 = chrono::system_clock::now();
    duration = chrono::duration_cast<chrono::nanoseconds>(T1 - T0).count();
    cout << "Duration: " << (duration / 1.0E9) << endl;

    T0 = chrono::system_clock::now();
    testcase_1(des2, testdata_size, src, testdata_size);
    T1 = chrono::system_clock::now();
    duration = chrono::duration_cast<chrono::nanoseconds>(T1 - T0).count();
    cout << "Duration: " << (duration / 1.0E9) << endl;

    if ( memcmp(des1, des2, testdata_size) == 0 ) {
        cout << "Meaningless compare to prevent optimize!" << endl;
    }

    // delete[] must match new[]
    delete[] des2;
    delete[] des1;
    delete[] src;
    return 0;
}
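One caveat: the rand() fill above produces arbitrary bytes rather than the plain-English mix my assumption is based on. If you want the benchmark to reflect that assumption, you could fill src with something text-like instead; a minimal sketch (the character set and seed are my own choice):

const char sample_chars[] = "abcdefghijklmnopqrstuvwxyz      \t\n.,;ABC";
srand(12345);   // fixed seed so runs are comparable
for ( size_t i = 0; i < testdata_size; i++ ) {
    src[i] = (uint8_t)sample_chars[rand() % (sizeof(sample_chars) - 1)];
}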