1

I have an input file named as datafile.data, which looks something like below:

1,2,1,1,0
1,3,1,1,0
1,1,2,2,1
2,1,2,2,1
2,3,2,3,1
1,1,2,3,2
3,1,1,4,2
2,1,3,2,2
3,3,3,1,2
2,2,3,4,2

Here the first 4 columns stand for 4 attribute values, say A1, A2, A3 and A4, and the final column stands for the class value. This particular sample file has 4 attributes, but other files can have any number 'n' of attributes; in every file, however, the last column gives the class value.

Now I want to convert this file to another file named as : outputfile.exp

Where the output file's 1st row looks something like below:

<Number of rows in the .data file> <Number of attributes> <Max value of A1> <Max value of A2> <Max value of A3> <Max value of A4> <(Max value of last column)+1>

The remaining rows of the output file will be the same as the rows of the data file, with just one change: each value in the last column will be incremented by 1.

For an example the output file for the above example will look like:

10 4 3 3 3 4 3
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3

Where the 1st row's 10 is the number of rows, 4 is the number of attributes present, (3,3,3,4) these 4 are the maximum values of attributes A1,A2,A3 and A4 and last 3 stands for the highest class value +1. And the last column's every value has been incremented by 1 as well.

Below I am attaching my try:

#include <stdio.h>
#include <string.h>
#define MAX_FILE_NAME 100
  
// Print the number of lines in datafile.data, then echo the file's contents
// to standard output.
//
// Returns 0 on success and 1 if the file cannot be opened.
int main()
{
    const char *filename = "datafile.data";
    char dataToBeRead[50];
    int count = 0;    // Line counter (result)
    int c;            // int, not char: getc() reports EOF as an out-of-band int value
    int last = '\n';  // Most recent character read; starts as '\n' so an empty file has 0 lines

    // Open the file
    FILE *fp = fopen(filename, "r");

    // Check if file exists
    if (fp == NULL)
    {
        fprintf(stderr, "Could not open file %s", filename);
        return 1;
    }

    // First pass: count the newline characters
    while ((c = getc(fp)) != EOF)
    {
        if (c == '\n')
            count = count + 1;
        last = c;
    }

    // A final line that is not newline-terminated is still a line
    if (last != '\n')
        count = count + 1;

    printf("%d\n", count);

    // Second pass: return to the start of the same stream and echo every line
    rewind(fp);
    while (fgets(dataToBeRead, sizeof dataToBeRead, fp) != NULL)
    {
        printf("%s", dataToBeRead);
    }

    fclose(fp);
    return 0;
}

And I am getting the below output:

10
1,2,1,1,1
1,3,1,1,1
1,1,2,2,2
2,1,2,2,2
2,3,2,3,2
1,1,2,3,3
3,1,1,4,3
2,1,3,2,3
3,3,3,1,3
2,2,3,4,3

Now I am unable to proceed further, as I am very new to C, please help me out.

Edit 1 : The output format of the example will be:

10 4 3 3 3 4 3
1 2 1 1 1
1 3 1 1 1
1 1 2 2 2
2 1 2 2 2
2 3 2 3 2
1 1 2 3 3
3 1 1 4 3
2 1 3 2 3
3 3 3 1 3
2 2 3 4 3
Dev
  • 576
  • 3
  • 14
  • It would be a lot easier if you can place the meta data at the end of the file. If you want to write it at the start, you'll need to either read the data twice or store it all. – William Pursell Nov 01 '21 at 19:49
  • I need to write it at the start to match a particular pattern. So I can't place that at the end. – Dev Nov 01 '21 at 19:55
  • Do you just need the data, or are you looking for the learning exercise of doing this in C? You might want to seriously consider writing the metadata at the end of the file and then writing another program that moves that line to the beginning. Make things as simple as possible. – William Pursell Nov 01 '21 at 20:01
  • Honestly saying I need this as I am working with rule based classifiers, so I need to take this input to a common output format. Where the format looks exactly similar to what I have stated in the question. So I need those data at the top of the output file. I just want to give the .data file as command line argument which will generate the output file in that format. – Dev Nov 01 '21 at 20:07

3 Answers

1

You really don't want to do this, since rewinding an input stream is an anti-pattern. But you can do something like:

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);

/*
 * Parse one line of comma-separated integers and fold it into the running
 * per-column maxima.
 *
 *   buf          - the line to parse; fields are separated by ',' and the
 *                  last field must be followed by '\n' or by the end of the
 *                  string (a final line without a trailing newline is valid)
 *   max          - array of at least column_count ints; max[i] is raised to
 *                  the value found in column i when that value is larger
 *   column_count - number of fields the line must contain
 *
 * On malformed input (empty or non-numeric field, wrong separator, wrong
 * number of fields, value outside the range of int) a diagnostic is written
 * to stderr and the process exits with status 1.
 */
void
parse_line(const char *buf, int *max, int column_count)
{
    const char *line = buf;  /* kept so the diagnostic can show the whole line */

    for( int i = 0; i < column_count; i++ ){
        char *end;
        long t = strtol(buf, &end, 10);
        int last = (i == column_count - 1);
        int terminated = last ? (*end == '\n' || *end == '\0')
                              : (*end == ',');

        /* end == buf means strtol consumed nothing, i.e. the field is empty
         * or not a number; without this check it would be read as 0. */
        if( end == buf || !terminated || t < INT_MIN || t > INT_MAX ){
            fprintf(stderr, "invalid input '%c' in %s", *end, line);
            exit(1);
        }
        if( t > max[i] ){
            max[i] = (int)t;
        }
        if( !last ){
            buf = end + 1;  /* step over the separating comma */
        }
    }
}


/*
 * Read comma-separated integer rows from the file named by argv[1] (or from
 * stdin when no argument is given) and write the converted form to stdout:
 * a header line "<rows> <attributes> <max of each attribute> <max class + 1>"
 * followed by every input row with its last field incremented by one.
 *
 * The input is read twice, so it must be seekable.  Returns 1 on an empty
 * input, on a seek failure, or on a row without a comma.
 */
int
main(int argc, char **argv)
{
    const char *path = "stdin";
    FILE *in = stdin;
    char line[1024];
    int ncols = 1;
    int nrows = 0;
    int *col_max;

    if( argc > 1 ){
        path = argv[1];
        in = xfopen(path, "r");
    }

    /* The first line fixes the column count: one more than its commas. */
    if( fgets(line, sizeof line, in) == NULL ){
        fputs("Input error\n", stderr);
        return 1;
    }
    for( const char *p = strchr(line, ','); p != NULL; p = strchr(p + 1, ',') ){
        ncols += 1;
    }

    col_max = xmalloc(ncols * sizeof *col_max);
    for( int i = 0; i < ncols; i++ ){
        col_max[i] = INT_MIN;
    }

    /* First pass: count the rows and collect the per-column maxima. */
    do {
        nrows += 1;
        parse_line(line, col_max, ncols);
    } while( fgets(line, sizeof line, in) != NULL );

    /* Back to the start for the second pass. */
    if( fseek(in, 0L, SEEK_SET) ){
        perror(path);
        return 1;
    }

    /* Header line; the class column reports its maximum plus one. */
    printf("%d %d ", nrows, ncols - 1);
    for( int i = 0; i + 1 < ncols; i++ ){
        printf("%d ", col_max[i]);
    }
    printf("%d\n", col_max[ncols - 1] + 1);

    /* Second pass: copy every row with its last field incremented. */
    while( fgets(line, sizeof line, in) != NULL ){
        char *sep = strrchr(line, ',');
        if( sep == NULL ){
            fprintf(stderr, "Invalid input\n");
            return 1;
        }
        int cls = strtol(sep + 1, NULL, 10);
        *sep = '\0';  /* cut the line just before the class field */
        printf("%s,%d\n", line, cls + 1);
    }
}

/*
 * fopen() wrapper that never returns NULL.
 *
 * A path of exactly "-" selects a standard stream instead of a file: stdin
 * when mode starts with 'r', stdout otherwise.  If the stream cannot be
 * obtained, the error is reported with perror(path) and the process exits
 * with EXIT_FAILURE.
 */
FILE *
xfopen(const char *path, const char *mode)
{
    FILE *stream;

    if( path[0] == '-' && path[1] == '\0' ){
        stream = (mode[0] == 'r') ? stdin : stdout;
    } else {
        stream = fopen(path, mode);
    }

    if( stream != NULL ){
        return stream;
    }
    perror(path);
    exit(EXIT_FAILURE);
}

/*
 * malloc() wrapper that never returns NULL: when malloc() yields NULL the
 * failure is reported with perror("malloc") and the process exits with
 * EXIT_FAILURE.  The caller owns the returned block and releases it with
 * free().
 */
void *
xmalloc(size_t s)
{
    void *block = malloc(s);

    if( block != NULL ){
        return block;
    }
    perror("malloc");
    exit(EXIT_FAILURE);
}

You can execute this as ./a.out < datafile.data > outputfile.exp or ./a.out datafile.data > outputfile.exp, but this will not work if you try to read from a pipe (the seek will fail). The seek failure and the inability to run this as a filter make this a suboptimal approach, but storing the entire file in memory also has drawbacks.

William Pursell
  • 204,365
  • 48
  • 270
  • 300
  • Thank you so much. This is working absolutely fine. I just need a small modification, actually in that output file, the attribute values and class values will not be comma separated, it will be just space separated instead. I mean according to my above example, the second row of the output file will look something like : 1 2 1 1 1 – Dev Nov 02 '21 at 02:56
  • @Dev The sample output provided shows spaces as separators in the first line. Details like that should be included in the question. – William Pursell Nov 02 '21 at 12:49
  • No no 1st line is completely fine @William, and your code delivers exactly what I asked in the question. Actually it was a little mistake from my end. Should I update the question a little? And I have tried to modify the code of yours accordingly and succeeded to some extent. Should I edit the question and give the sample output once again as well as my try? I think a small modification is needed. – Dev Nov 02 '21 at 15:35
  • There's probably no need to update the question. Making replacing the commas with a space in the output is a pretty trivial modification. – William Pursell Nov 02 '21 at 15:41
  • I have tried in several ways; in one of them I modified the last printf statement inside main of your code to: "printf("%s %d\n", buf, k + 1);" . Using this I am getting the output file's 2nd row as: 1,2,1,1 1 ... But I am unable to find out how to remove all the commas and replace them with a single space. I am updating the output file format in the question. Please help me. – Dev Nov 02 '21 at 20:32
  • You just need to go through `buf` and replace `,` with space. eg `for(char *p = buf; *p; p++) { if( *p == ',' ) *p = ' '; }` – William Pursell Nov 02 '21 at 20:51
  • Thank you so much. One more query just out of curiosity, if I want to give the stem filename only as the first command line argument, then is it possible like that? For example: I want to give it like : `./a.out datafile > outputfile.exp` . Where datafile is the name of the stem file which has say 2 files, one is datafile.data and another is datafile.names , inside the code it will read the datafile.data only. – Dev Nov 02 '21 at 21:10
  • You just need to build the desired string. eg, if `argv[1]` points to the string `datafile` (as in your call line), you could do `char path[256]; snprintf(path, sizeof path, "%s.data", argv[1]);` – William Pursell Nov 03 '21 at 09:53
  • Got it. Thank you so much for your help. – Dev Nov 03 '21 at 18:48
  • I just want to add one more thing to this code. I first want to read `datafile.names` and if the last line of that file contains a zero then I want to generate the `outputfile` else the code will terminate saying noisy data. I have tried to do like : https://stackoverflow.com/questions/69866277/problem-while-reading-from-the-stem-file-in-c can you please say where I am doing wrong? As while trying to execute that, it is showing `datafile.names.data: No such file or directory` . Please help me. – Dev Nov 07 '21 at 04:09
  • can you please help me out? I am trying all possible thing to make it work but till now nothing seemed to work correctly. As I am using C for the very first time, I am becoming a little puzzled to solve this issue. Please help me by just 1st reading the `.names` files in that code of yours and if there is a `zero` in the last line of the names file then the code will be executed to give the `correct result`, else it will just say `noisy data` and come out of it. Below I am attaching my own answer where I am giving the modified code of yours. Please help me. – Dev Nov 08 '21 at 03:09
  • @Dev I'll post some code at the other question. – William Pursell Nov 08 '21 at 14:13
  • Okay! Thanks a ton. – Dev Nov 08 '21 at 14:27
0

As William Pursell has provided a superb answer in C, here is an awk alternative, although awk is not tagged.

# Convert datafile.data to the .exp layout on standard output.
# The file is named twice on the command line so that awk reads it twice:
# the first read collects the row count, field count and per-column maxima,
# the second read prints the header line followed by every row with its
# last field incremented.
awk -F, -v OFS="," '                            # set both the input and the output field separator to a comma
    NR==FNR {                                   # true only while the first copy of the input file is being read
        for (i = 1; i <= NF; i++) {             # loop over the fields
            if (max[i] == "" || max[i] < $i) max[i] = $i
                                                # update the max value of column i (unset entries compare equal to "")
        }
        nr = NR; nf = NF                        # store #records and #fields
        next                                    # skip the following blocks during the first read
    }
    FNR==1 {                                    # runs once, on the 1st line of the 2nd read of the input file
        printf("%d %d ", nr, nf - 1)            # print #records and #fields - 1
        max[nf]++                               # increment the max value of the last field
        for (i = 1; i <= nf; i++) {             # print max values, space separated, ending the header line
            printf("%d%s", max[i], i==nf ? "\n" : " ");
        }
    }
    {                                           # runs for every line of the 2nd read
        $nf++                                   # increment the value of the last field
        print                                   # print fields as csv
    }
' datafile.data datafile.data                   # read the input file twice
tshiono
  • 21,248
  • 2
  • 14
  • 22
0

Below is the modified code. I want it to read the .names file first and produce the output only if the last line of that .names file contains a zero.

#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

FILE * xfopen(const char *path, const char *mode);
void * xmalloc(size_t s);

/*
 * Parse one line of comma-separated integers and update the running
 * per-column maxima.
 *
 *   buf          - the line to parse; fields are separated by ',' and the
 *                  last field must be followed by '\n' or by the end of the
 *                  string (a final line without a trailing newline is valid)
 *   max          - array of at least column_count ints; max[i] is raised to
 *                  the value found in column i when that value is larger
 *   column_count - number of fields the line must contain
 *
 * On malformed input (empty or non-numeric field, wrong separator, wrong
 * number of fields, value outside the range of int) a diagnostic is written
 * to stderr and the process exits with status 1.
 */
void parse_line(const char *buf, int *max, int column_count)
{
    const char *line = buf;  /* start of the line, for the diagnostic */

    for(int i = 0; i < column_count; i++ ){
        char *end;
        long t = strtol(buf, &end, 10);
        int is_last = (i == column_count - 1);

        /* A field must contain at least one digit; strtol leaves end == buf
         * when it converted nothing, which would otherwise be read as 0. */
        int ok = (end != buf) && (t >= INT_MIN) && (t <= INT_MAX);

        /* Inner fields end at a comma; the last one ends the line. */
        if( is_last ){
            ok = ok && (*end == '\n' || *end == '\0');
        } else {
            ok = ok && (*end == ',');
        }

        if( !ok ){
            fprintf(stderr, "invalid input '%c' in %s", *end, line);
            exit(1);
        }

        if( t > max[i] ){
            max[i] = (int)t;
        }

        if( !is_last ){
            buf = end + 1;  /* continue after the comma */
        }
    }
}

/*
 * Convert "<stem>.data" to the .exp layout on standard output, where <stem>
 * is argv[1].  With no argument the data is read from stdin instead.
 *
 * Output: a header line
 *   <rows> <attributes> <max of each attribute> <max class value + 1>
 * followed by every input row with its fields separated by spaces and its
 * last (class) field incremented by one.
 *
 * The input is read twice, so it must be seekable.  Returns 0 on success and
 * 1 on an over-long file name, an empty input, a seek failure, or a row that
 * contains no comma.
 */
int main(int argc, char **argv)
{
    /* The file name is built in a buffer of its own.  Writing "%s.data" into
     * argv[1] (or into a string literal) overruns memory the program does
     * not own and has undefined behaviour. */
    char path[1024] = "stdin";
    FILE *in = stdin;

    char buf[1024];
    int column_count = 1;
    int row_count = 1;
    int *max;

    if( argc > 1 ){
        int n = snprintf(path, sizeof path, "%s.data", argv[1]);
        if( n < 0 || (size_t)n >= sizeof path ){
            fprintf(stderr, "%s: file name too long\n", argv[1]);
            return 1;
        }
        in = xfopen(path, "r");
    }

    /* Read first line to determine number of columns */
    if( fgets(buf, sizeof buf, in) == NULL ){
        fputs("Input error\n", stderr);
        return 1;
    }

    for( const char *p = buf; *p; p++ ){
        if( *p == ',' ){
            column_count += 1;
        }
    }

    max = xmalloc(column_count * sizeof *max);

    for( int i = 0; i < column_count; i++ ){
        max[i] = INT_MIN;
    }

    /* First pass: count the rows and collect the per-column maxima. */
    parse_line(buf, max, column_count);
    while( fgets(buf, sizeof buf, in) != NULL ){
        row_count += 1;
        parse_line(buf, max, column_count);
    }

    /* Go back to the start of the input for the second pass. */
    if( fseek(in, 0L, SEEK_SET) ){
        perror(path);
        return 1;
    }

    /* Header line; the class column reports its maximum plus one. */
    printf("%d %d ", row_count, column_count - 1);

    for( int i = 0; i < column_count - 1; i += 1 ){
        printf("%d ", max[i]);
    }

    printf("%d\n", max[column_count - 1] + 1);

    /* Second pass: print every row space-separated, class value + 1. */
    while( fgets(buf, sizeof buf, in) != NULL ){
        char *comma = strrchr(buf, ',');
        if( comma == NULL ){
            fprintf(stderr, "Invalid input\n");
            return 1;
        }

        *comma = '\0';  /* cut the line just before the class field */
        int k = strtol(comma + 1, NULL, 10);
        for(char *p = buf; *p;  p++){
            if( *p == ',' ) *p = ' ';
        }
        printf("%s %d\n", buf, k + 1);
    }

    free(max);
    if( in != stdin ){
        fclose(in);
    }
    return 0;
}

/*
 * Open path with the given fopen() mode, terminating the program on failure.
 *
 * The single-character path "-" is not opened as a file: it maps to stdin
 * when mode begins with 'r' and to stdout for any other mode.  When no
 * stream can be obtained, perror(path) reports the error and the process
 * exits with EXIT_FAILURE, so the return value is never NULL.
 */
FILE *
xfopen(const char *path, const char *mode)
{
    int is_dash = (path[0] == '-' && path[1] == '\0');
    FILE *result;

    if( !is_dash ){
        result = fopen(path, mode);
    } else if( *mode == 'r' ){
        result = stdin;
    } else {
        result = stdout;
    }

    if( result == NULL ){
        perror(path);
        exit(EXIT_FAILURE);
    }

    return result;
}

/*
 * Allocate s bytes with malloc(), terminating the program on failure.
 *
 * If malloc() returns NULL, perror("malloc") reports the error and the
 * process exits with EXIT_FAILURE; otherwise the block is returned and the
 * caller is responsible for passing it to free().
 */
void *
xmalloc(size_t s)
{
    void *mem = malloc(s);

    if( !mem ){
        perror("malloc");
        exit(EXIT_FAILURE);
    }

    return mem;
}
Dev
  • 576
  • 3
  • 14