0

I want to read a file which has some characters in it and check the percentage of printable characters as well as the percentage of white spaces. This is my Python code which generates the input file:

import string
import random
# Pool of characters the test file is drawn from: everything in
# string.printable (which already contains the whitespace characters) ...
array = list(string.printable)
print(array)
# ... plus a few extra characters.  '\0', '\a', '\b' and ESC are control
# characters outside string.printable; '\v' and '\f' are whitespace that is
# already in the pool, so listing them again only makes them more likely.
# ESC must be spelled '\x1b': Python has no '\e' escape, so '\e' would be the
# two-character string backslash + 'e' instead of a single control character.
external = ['\0', '\a', '\b', '\v', '\f', '\x1b']
array = array + external

# newline="" stops the text layer from translating '\n', so the file holds
# exactly one byte per character written (1000 bytes in total).
with open("in.txt", "w", encoding="ascii", newline="") as out_file:
    for _ in range(1000):
        # random.choice can pick every element, including the last one;
        # randrange(0, len(array) - 1) silently excluded it.
        out_file.write(random.choice(array))

I want my file to have both printable characters and whitespaces and other characters (which are not in these two groups). I do this in two ways:

  1. Read the file, chunk by chunk, with the read system call in C:

    #include <stdio.h>
    #include <stdio.h>
    #include <ctype.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <errno.h>
    /*
     * Chunked-read variant: reads the file named by argv[2] with read(), in
     * blocks of atoi(argv[3]) bytes, and prints how many of the bytes were
     * counted as printable and as whitespace, with their percentages.
     *
     * NOTE(review): the argument count is never checked, and the results of
     * malloc() and open() are used without testing for failure.
     */
    int main(int arg, char *argv[])
    {
        char c ;
        char *data;
        int numOfWs = 0 ;
        int numOfPr = 0 ;
        /* NOTE(review): declared without an initializer; its first use is the
           `+=` inside the read loop, so the starting value is indeterminate. */
        int numberOfCharacters;
        int sizeOfBlock ;
        int nread ;
        int i=0;
        /* NOTE(review): k is never used in this function. */
        int k;
        float wsP = 0;
        float prP = 0;
        /* Block size comes from the third argument, the path from the second. */
        sizeOfBlock = atoi(argv[3]) ;
        data = malloc(sizeOfBlock*sizeof(char));
        int fd = open(argv[2], O_RDONLY);
        /* read() returns the number of bytes placed in data; the loop ends on
           the first return value that is not positive. */
        while((nread = read(fd, data, sizeOfBlock)) > 0)
        {
            numberOfCharacters += nread ;
            for (i = 0; i < nread; ++i)
            {
                c = data[i] ;
                /* isprint() is tested first and is true for ' ', so a space
                   is counted as printable and never reaches the isspace()
                   branch; only non-printable whitespace such as '\t' or '\n'
                   ends up in numOfWs. */
                if(isprint(c))
                    numOfPr ++ ;
                else if(isspace (c))
                    numOfWs ++ ;
            }
        }
        /* Percentages relative to the running byte total. */
        wsP = (numOfWs / (float)numberOfCharacters)*100;
        prP = (numOfPr / (float)numberOfCharacters)*100 ;
        printf("%d printable characters out of %d bytes, %.2f%c\n", numOfPr,numberOfCharacters,prP,'%');
        printf("%d whitespace characters out of %d bytes, %.2f%c\n", numOfWs,numberOfCharacters,wsP, '%');
        /* The process exits here without free(data) or close(fd). */
        exit(0);
    }
    
  2. Copy the whole file into memory using mmap() and then start to read it from memory:

    #include <stdio.h>
    #include <stdlib.h>
    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/types.h>
    #include <sys/mman.h>
    #include <sys/stat.h>
    #include <errno.h>
    #include <ctype.h>
    #include <string.h>
    /*
     * mmap() variant: maps a whole file into memory and reports how many of
     * its bytes are whitespace and how many are printable.
     *
     * Usage: prog [file]      (the path defaults to "x.txt")
     *
     * A byte for which isspace() is true counts as whitespace; any other byte
     * for which isprint() is true counts as printable.  isspace() is tested
     * first because ' ' satisfies both predicates.
     *
     * Returns EXIT_SUCCESS after printing both lines, or EXIT_FAILURE (with a
     * message on stderr) if the file cannot be opened, inspected or mapped.
     */
    int main(int arg, char *argv[])
    {
        const char *path = (arg > 1) ? argv[1] : "x.txt";
        size_t i, size, numOfWs = 0, numOfPr = 0;
        void *map = NULL;
        const unsigned char *data = NULL;
        float wsP = 0;
        float prP = 0;
        struct stat s;
        int fd;

        /* The mapping is only read, so read-only access is sufficient. */
        fd = open(path, O_RDONLY);
        if (fd == -1) {
            perror(path);
            return EXIT_FAILURE;
        }
        if (fstat(fd, &s) == -1) {
            perror("fstat");
            close(fd);
            return EXIT_FAILURE;
        }
        size = (size_t)s.st_size;

        /* mmap() rejects a zero length, so an empty file is left unmapped. */
        if (size > 0) {
            map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
            if (map == MAP_FAILED) {
                perror("mmap");
                close(fd);
                return EXIT_FAILURE;
            }
            data = map;
        }
        /* The mapping stays valid after the descriptor is closed. */
        close(fd);

        /* The <ctype.h> predicates need an unsigned char value (or EOF);
           reading the bytes as unsigned char keeps values >= 0x80 defined. */
        for (i = 0; i < size; i++) {
            if (isspace(data[i]))
                numOfWs++;
            else if (isprint(data[i]))
                numOfPr++;
        }

        /* Leave both percentages at 0 for an empty file instead of 0/0. */
        if (size > 0) {
            wsP = ((float)numOfWs / (float)size) * 100;
            prP = ((float)numOfPr / (float)size) * 100;
            munmap(map, size);
        }

        printf("%zu printable characters out of %zu bytes, %.2f%%\n", numOfPr, size, prP);
        printf("%zu whitespace characters out of %zu bytes, %.2f%%\n", numOfWs, size, wsP);
        return EXIT_SUCCESS;
    }
    

When I run these two programs on the same file (created by the Python code), I get two different answers for the percentages, even though the number of characters read is the same in both cases (1000).

This is my file generated from the Python code (it contains lots of control characters, so it may not have survived being copied here intact):

el*2mlz_XyjP@%?Sw}Qo~.."tJ^~6,eN8+kq l)*N-1oupE
)coFKoA0\=|W'{Oezx~^p(B5ZJe!AdYb{Gflv&wwCf8}>3"v*>9\pW8PIs;qpX7RSk<9}&8B$u kNaq(mJK$N-!38?E%8-T,I1zC~0O=}FH*
x9x6Q%GT_C0j>7:@EG{N ?Eh$?18;Ncy[3 $'ikKs%:A].?e;i4`x"k!VD]}*pw
?\wE~Vix7^H~[26lsN?_GO$vz3M464S`+h=A(5@]q<&<+ hjmehAb-_3*3F8&#iM3p)6T`S9Q\yZwm$U`OHG}02{A)WcVzBR1h}H?qhF:P^-j5AQ1<7FD60j#B#}9Z=}2QReaYy|{Wv<^!yOC/7P}n*ZEPV2@8cU),=*5]]d a:3J;Y(?D?31$pcrquc#&PB;A[9lV+gJ%WZ6K~A|%^E_\3dM/?"y)BtUtG"3hf}W4,3DrXxTyl\UbWwCbMufqCNWx |hiJ\>43S6tCCS)rEo0.cz5PjgK0_AKN|8'g]byLp9AlrZDuK1OX,Csa}nu&i_p,#
Wyc{Q
LA\4:!WSq"ln|Pv.B;+N'h%O;tu(CgIh~OYIXCl+6~nSxBuybP nH:j;t'\vk&p}
,;3Ny#`Ug!rVbqExY|  %BVCD^D~z:L(j8L!    @   X4a!KBNCQ4z&3^9[O<fkM-qrOq5F/M*]yyU+-VLdZRtUu
a"=a b%c~GI|tcC/
P'/`t|hZ/2iHd94l"%;4-{)VUw%%3"e%IQ{RAX]NeMcsh&@LziT0)_T"2XADH&NYqa<6,$wdSp@LIMGA&,Gx1mj|t't?7=YtT77r<qi8;|tzi kOAi'dq%+g2   5hY?XTj{F)18.Vd!!$Q{D)}$7XxO)Vi%29*,P"cXkC,M|&brd&-DGF>V4 %N)a"VM+TQ$FI;YiL-0 YSxXgC@i~,o6/a7U2c"eGr\N7^B:'dytlOOS(iy\lhC7vnW,f o;vKUNa
Hg#u}W4N wUM

this is the result from mmap():

961 printable characters out of 1000 bytes, 96.10%
39 whitespace characters out of 1000 bytes, 3.90%

and this is the result from chunk reading:

974 printable characters out of 1000 bytes, 97.40%
26 whitespace characters out of 1000 bytes, 2.60%

Why is the number of printable characters different, but the file is the same in the two methods?

I think that in chunked mode isspace() doesn't accept a space (' ') as whitespace, so it is counted as a printable character instead.

psmears
  • 26,070
  • 4
  • 40
  • 48
EmadSmart
  • 81
  • 1
  • 13
  • You don't need to pass the `'%'` character as a parameter; just use `%%` in the format string. I don't understand your question, _I get two different answer ..._ What do you mean? – Iharob Al Asimi Jun 28 '15 at 15:38
  • I think that the chunk reading does not treat a space ` ` as whitespace when calling `isspace()` – EmadSmart Jun 28 '15 at 15:41
  • @iharob this is [my file](http://pastebin.com/NvsPNQwU) generated from python.the answers of each method is [here](http://pastebin.com/EYxMjBYm) you'll see the differences between answer for a same file – EmadSmart Jun 28 '15 at 15:46
  • عماد BTW, don't use `fp` as a file **d**escriptor identifier use `fd`, it doesn't matter but it's confusing, when I read `mmap((caddr_t)0, size, PROT_READ, MAP_SHARED, fp, 0);` I immediately thought "_Oh no, he passed a `FILE *` to `mmap()`_, also be consistent with your coding style. – Iharob Al Asimi Jun 28 '15 at 15:53
  • @iharob tnx , i edited – EmadSmart Jun 28 '15 at 15:56
  • A little style notice for the Python code: use [`with`](http://stackoverflow.com/q/1369526/562769) when you open files. – Martin Thoma Jun 28 '15 at 16:06

2 Answers2

1

You didn't initialize numberOfCharacters to 0 in the first program.

That means that the value of numberOfCharacters is undetermined before

numberOfCharacters += nread ;

is executed. This is a good reason to initialize variables where they are declared instead of assigning them later.

Iharob Al Asimi
  • 52,653
  • 6
  • 59
  • 97
1

The problem was that spaces are printable according to isprint(). I changed the order of the isspace() and isprint() tests, keeping the if() ... else if() structure, and now the result is correct.

EmadSmart
  • 81
  • 1
  • 13
  • You should accept your own answer, and of course spaces are printable, I just read that part from the first program, and missed it in the second program. – Iharob Al Asimi Jun 28 '15 at 19:29