You could just estimate what you need in blocks and grow the input buffer as needed...
This is untested, but gives the flavour of what should work.
This version attempts to load the entire file before investigating its content.
/* Load an entire file into a heap buffer that grows one block at a time.
 * On any failure every resource acquired so far is released before returning -1. */
FILE *fp = fopen( "/sys/file", "rb" );
if( fp == NULL )
    return -1;
#define BLK_SIZE 1024
char *buf = malloc( BLK_SIZE );
if( buf == NULL ) {
    fclose( fp ); // do not leak the stream when the first allocation fails
    return -1;
}
char *readTo = buf; // where the next block is stored
size_t bufCnt = 0;  // number of valid bytes accumulated in buf
for( ;; ) {
    size_t inCnt = fread( readTo, sizeof *readTo, BLK_SIZE, fp );
    bufCnt += inCnt;
    if( inCnt < BLK_SIZE )
        break; // short read: either EOF or an error (told apart below)
    // The buffer is exactly full: make room for one more block.
    char *tmp = realloc( buf, bufCnt + BLK_SIZE );
    if( tmp == NULL ) {
        free( buf ); // realloc failure leaves the old block valid, so release it
        fclose( fp );
        return -1;
    }
    buf = tmp;
    readTo = buf + bufCnt; // buf may have moved, so recompute the write position
}
if( ferror( fp ) ) {
    // The short read was caused by an I/O error, not by reaching end of file.
    free( buf );
    fclose( fp );
    return -1;
}
fclose( fp );
printf( "Got %zu valid bytes in buffer\n", bufCnt ); // %zu is the size_t conversion
/* do stuff with *buf */
free( buf );
Hopefully the final EDIT of version 2:
I am grateful to @Andreas Wenzel for his cheerful and meticulous testing and comments that turned earlier (incorrect!) versions of my attempts into this prototype.
The objective is to find a string of bytes in a file.
In this prototype, single "buffer loads" are examined sequentially until the first instance of the target is found or EOF reached. This seems to cope with cases when the target bytes are split across two buffer loads. This uses a ridiculously small 'file' and small buffer that would, of course, be scaled up in the real world.
Making this more efficient is left as an exercise for the reader.
#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>
#include <string.h>
// Simulated file with text
char inBytes[] = "The cute brown fox jumps over the dogs and bababanana and stuff.";
// Read cursor and count of unread bytes. Both start out describing the whole
// text, so a myRead() issued before myOpen() never copies from a null pointer.
char *pFrom = inBytes;
size_t nLeft = sizeof inBytes - 1; // exclude the terminating NUL

// Simulated 'fopen()': rewinds the cursor to the start of the text.
// Always succeeds and returns true.
bool myOpen( void ) {
    pFrom = inBytes;
    nLeft = strlen( inBytes );
    return true;
}

// Simulated 'fread()'. (only 1 "file pointer in use")
// Copies up to cnt unread bytes into buf and advances the cursor.
// Returns the number of bytes copied; 0 means EOF (or cnt was 0).
size_t myRead( char *buf, size_t cnt ) {
    size_t give = nLeft < cnt ? nLeft : cnt;
    if( give == 0 )
        return 0; // nothing to deliver: leave buf untouched
    memcpy( buf, pFrom, give );
    pFrom += give;
    nLeft -= give;
    return give;
}
// Look for string using different buffer sizes to prove target split functions.
//
// Searches the simulated file (myOpen()/myRead()) for the string srchfor,
// loading it in chunks of at most bufSize bytes. When a chunk ends in the
// middle of a possible match, the matching tail is moved to the front of the
// buffer and the next chunk is read in right behind it, so a target that
// straddles two reads is still found.
//
// Prints the buffer contents and a short verdict for every chunk examined.
// Returns true if srchfor was found, false otherwise (also false when bufSize
// is not positive, since no data can be read then).
// Terminates the program if the working buffer cannot be allocated.
bool foobar( char srchfor[], int bufSize ) {
    if( bufSize <= 0 )
        return false; // a non-positive chunk size would corrupt the size arithmetic below

    bool found = false;
    size_t chunk = (size_t)bufSize;
    size_t lenWant = strlen( srchfor ); // # of chars to match

    // RAM buffer includes room for "wrapping": at most lenWant-1 carried bytes
    // plus one full chunk.
    char *iblk = malloc( lenWant + chunk );
    if( iblk == NULL ) {
        fprintf( stderr, "Malloc failed!!!\n" );
        exit( 1 );
    }

    // simulate loading sequential blocks into a fixed size buffer.
    myOpen();
    size_t inBuf = 0; // bytes currently held at the front of iblk (carried tail)
    for( ;; ) {
        // Always read directly behind the carried bytes. Deriving the position
        // from inBuf (instead of accumulating a pointer) keeps the buffer
        // contiguous even after several partial matches in a row.
        size_t got = myRead( iblk + inBuf, chunk );
        if( got == 0 )
            break; // EOF: a carried partial match can never be completed
        inBuf += got;

        printf( "'%.*s' ", (int)inBuf, iblk ); // Show what's in buffer

        // The mill where matching is carried out
        size_t matched = 0;
        for( size_t i = 0; i < inBuf && matched < lenWant; ) {
            if( srchfor[ matched ] == iblk[i] ) {
                matched++;
                i++;
            } else if( matched ) {
                i -= matched - 1; // rewind to one past the failed start and try again
                matched = 0;
            } else {
                i++;
            }
        }

        // Lucky?
        if( matched == lenWant ) {
            printf( "Ahha!\n" );
            found = true;
            break;
        }

        if( matched == 0 ) {
            inBuf = 0; // reset things
            printf( "nothing\n" );
        } else {
            // preserve what did match; the next read lands right after it
            printf( "got something\n" );
            memmove( iblk, iblk + inBuf - matched, matched );
            inBuf = matched;
        }
    }

    free( iblk );
    return found;
}
// Driver: search the simulated file for one target using several chunk sizes,
// so that the target lands on different sides of a chunk boundary.
int main() {
    char *target = "babanana";

    // Test with different buffer sizes (to split target across successive reads )
    int sz = 20;
    while( sz < 27 ) {
        // Run the search first so its per-chunk trace precedes the summary line.
        const char *verdict = foobar( target, sz ) ? "Found!" : "Not Found.";
        printf( "bufSize = %d ... %s\n\n", sz, verdict );
        sz += 2;
    }

    return 0;
}
Output:
'The cute brown fox j' nothing
'umps over the dogs a' nothing
'nd bababanana and st' Ahha!
bufSize = 20 ... Found!
'The cute brown fox jum' nothing
'ps over the dogs and b' got something
'bababanana and stuff.' Ahha!
bufSize = 22 ... Found!
'The cute brown fox jumps' nothing
' over the dogs and babab' got something
'babanana and stuff.' Ahha!
bufSize = 24 ... Found!
'The cute brown fox jumps o' nothing
'ver the dogs and bababanan' got something
'babanana and stuff.' Ahha!
bufSize = 26 ... Found!
EDIT3: That memmove() and the buffer size have been an annoyance for some time now.
Here's a version that takes one character of input at a time (fgetc() compatible), uses a heap buffer that is the same size as the target, uses uint8_t to allow a search for binary targets, implements a circular buffer and has a lot of fiddly index manipulation. It's not Knuth, but neither am I...
/*
 * Find the first occurrence of the nT-byte sequence targ[] in the nS bytes of
 * srch[]. The input is consumed strictly one byte at a time (fgetc()-style);
 * the bytes of the match currently in progress are kept in a circular heap
 * buffer the same size as the target.
 *
 * Returns the offset of the first match, or nS (an impossible offset) when the
 * target does not occur. An empty target matches at offset 0.
 * Terminates the program if the buffer cannot be allocated.
 */
size_t srch( uint8_t srch[], size_t nS, uint8_t targ[], size_t nT ) {
    if( nT == 0 )
        return 0; // nothing to match; also avoids depending on malloc(0)

    uint8_t *q = malloc( nT );
    if( q == NULL ) {
        fprintf( stderr, "Malloc failed!!!\n" );
        exit( 1 );
    }

    // Invariant: the ti bytes q[head], q[head+1], ... (indices mod nT) are the
    // most recent input bytes and are equal to targ[0 .. ti-1].
    size_t head = 0, ti = 0, i = 0;
    while( ti < nT && i < nS ) {
        uint8_t c = srch[i++]; // getc()

        // Mismatch with a match in progress: slide the candidate start forward
        // until the bytes still queued are again a prefix of the target (or the
        // queue is empty), then test the same input byte against the new state.
        while( ti > 0 && c != targ[ti] ) {
            size_t k;
            do {
                head = ( head + 1 ) % nT; // discard the oldest queued byte
                ti--;
                // Compare only the ti bytes actually queued.
                for( k = 0; k < ti && q[ ( head + k ) % nT ] == targ[k]; k++ )
                    ;
            } while( k < ti );
        }

        if( c == targ[ti] ) {
            q[ ( head + ti ) % nT ] = c; // append at the tail of the queue
            ti++;
        }
    }

    free( q );
    return ti == nT ? i - nT : nS; // found ? offset : impossible offset
}
// Driver: look up a list of sample targets in a fixed text and, for each one,
// print either up to 15 characters of the text starting at the reported offset
// or a "not found" line.
int main() {
    char *in =
        "The cute brown fox jumps "
        "over the dogs babababananana stuff";
    char *targets[] = {
        "The", "the", "ff",
        "babanana", "banana",
        "jumps", " cute",
        "orange",
    };

    size_t inSize = strlen( in );
    int nTargs = sizeof targets / sizeof *targets;

    int k = 0;
    while( k < nTargs ) {
        char *want = targets[k++];
        size_t at = srch( (uint8_t*)in, inSize, (uint8_t*)want, strlen( want ) );

        // srch() signals "no match" by returning the text length.
        if( at != inSize )
            printf( "%s ... %.15s\n", want, in + at );
        else
            printf( "%s ... not found\n", want );
    }

    return 0;
}
Output
The ... The cute brown
the ... the dogs bababa
ff ... ff
babanana ... babananana stuf
banana ... bananana stuff
jumps ... jumps over the
cute ... cute brown fox
orange ... not found