7

I am trying to read UTF-8 text from a text file and then print some of it to another file. I am using Linux and the gcc compiler. This is the code I am using:

#include <stdio.h>
#include <stdlib.h>

/*
 * Reads in.txt one byte at a time with fgetc(), echoes every byte to
 * stdout, and writes every byte followed by a space to out.txt.
 * The return values of fopen() are not checked.
 */
int main(){
    FILE *fin;
    FILE *fout;
    int character; /* int, not char, so the value can be compared against EOF */
    fin=fopen("in.txt", "r");
    fout=fopen("out.txt","w");
    while((character=fgetc(fin))!=EOF){
        putchar(character); // It displays the right character (UTF8) in the terminal
        /* "%c " emits the byte and then a space, so a space follows every byte. */
        fprintf(fout,"%c ",character); // It displays weird characters in the file
    }
    fclose(fin);
    fclose(fout);
    printf("\nFile has been created...\n");
    return 0;
}

It works for English characters for now.

user2768374
  • 199
  • 1
  • 2
  • 8
  • 5
    You do realize that UTF-8 is a multibyte encoding, right? Inserting spaces between each byte (`fprintf` statement) will probably break that encoding in your output file. – ldav1s Feb 12 '14 at 19:49
  • I usually use wchar_t and wstring with fwprintf (wide characters) for persian characters. – Behnam Safari Feb 12 '14 at 20:01
  • if fprintf breaks the encoding, what should i use instead of fprintf do you suggest? – user2768374 Feb 12 '14 at 20:05
  • i tried to use fwprintf but the result was even worse. Can fgetc() read utf8? – user2768374 Feb 12 '14 at 20:21
  • 3
    "wide characters" are 99% broken in C (different compilers have different ideas about what a wide character is). Best is either load it as "raw uint8_t bytes" and do it yourself (if it's a simple thing), or use a decent internationalisation library (if it's more than a simple thing). – Brendan Feb 12 '14 at 20:58
  • Have you tried removing the extra space in your fprintf, as suggested by @ldav1s? – Markku K. Feb 12 '14 at 23:51
  • 2
    `fprintf` is not breaking the encoding. The _format string_ of `fprintf` in your code is breaking the encoding. If you are wishing to insert a space between each [Unicode code point](http://en.wikipedia.org/wiki/Code_point) read (which is what is "working" for English), your code _must_ become UTF-8 aware. A way to do this is to buffer up the bytes you are reading in until you have a code point then `fprintf(fout, "%s ", utf8cp);` where `utf8cp` is `char utf8cp[5];` It's 5 bytes long since UTF-8 characters are 1-4 bytes + terminating `'\0'`. – ldav1s Feb 13 '14 at 03:19
  • @user2768374, did either of the answers below help you? If one of them did and answered your question, please accept it. Otherwise, let us know what additional help you need or if you are still experiencing problems. Thanks. – Josh Durham Feb 21 '14 at 21:09
  • Thanks @jpdurham. i wanted to hold the value in a single variable so that i can pass it to different functions. – user2768374 Feb 22 '14 at 22:55
  • @Idav1s - How do we get all the bytes of a unicode multibyte character in 'utf8cp'? Do we use scanf? I'm not able to iterate through it. – KamyFC Jun 23 '23 at 11:30

4 Answers4

20

Instead of

fprintf(fout,"%c ",character);

use

fprintf(fout,"%c",character);

The second fprintf() does not contain a space after %c which is what was causing out.txt to display weird characters. The reason is that fgetc() is retrieving a single byte (the same thing as an ASCII character), not a UTF-8 character. Since UTF-8 is also ASCII compatible, it will write English characters to the file just fine.

putchar(character) output the bytes sequentially without the extra space between every byte so the original UTF-8 sequence remained intact. To see what I'm talking about, try

/*
 * Demonstration loop: prints a space after every byte on stdout as well,
 * so the terminal output gets the same byte-plus-space pattern that is
 * written to fout.
 */
while((character=fgetc(fin))!=EOF){
    putchar(character);
    printf(" "); // This mimics what you are doing when you write to out.txt
    fprintf(fout,"%c ",character);
}

If you want to write UTF-8 characters with the space between them to out.txt, you would need to handle the variable length encoding of a UTF-8 character.

#include <stdio.h>
#include <stdlib.h>

/* Return the length in bytes (1-4) of the UTF-8 sequence that starts
 * with the lead byte `val`.
 *
 * The high bits of the first byte of a UTF-8 character encode how many
 * bytes the character occupies:
 *   0xxxxxxx -> 1 byte (ASCII)
 *   110xxxxx -> 2 bytes
 *   1110xxxx -> 3 bytes
 *   11110xxx -> 4 bytes
 *
 * Bytes that cannot start a sequence -- continuation bytes (10xxxxxx)
 * and 11111xxx -- yield 1, so a caller that consumes the reported
 * number of bytes passes the stray byte through and resynchronises on
 * the next byte instead of swallowing part of the following character.
 */
int numberOfBytesInChar(unsigned char val) {
    if ((val & 0x80) == 0x00) {
        return 1;
    }
    if ((val & 0xE0) == 0xC0) {
        return 2;
    }
    if ((val & 0xF0) == 0xE0) {
        return 3;
    }
    if ((val & 0xF8) == 0xF0) {
        return 4;
    }
    /* Continuation byte or invalid lead byte: treat as a single unit. */
    return 1;
}

/*
 * Copy in.txt to out.txt, echoing to stdout, and insert one space after
 * every UTF-8 character (not after every byte).
 *
 * Returns 0 on success, EXIT_FAILURE if a file cannot be opened or the
 * output file cannot be flushed on close.
 */
int main(){
    FILE *fin;
    FILE *fout;
    int character;

    fin = fopen("in.txt", "r");
    if (fin == NULL) {
        perror("in.txt");
        return EXIT_FAILURE;
    }
    fout = fopen("out.txt","w");
    if (fout == NULL) {
        perror("out.txt");
        fclose(fin);
        return EXIT_FAILURE;
    }

    while ((character = fgetc(fin)) != EOF) {
        /* Only the lead byte encodes the sequence length, so compute the
         * number of trailing bytes once, before any of them is read. */
        int trailing = numberOfBytesInChar((unsigned char)character) - 1;

        putchar(character);
        fputc(character, fout);

        /* Copy the trailing bytes; stop early if the file ends in the
         * middle of a sequence so EOF is never written out as a byte. */
        while (trailing-- > 0 && (character = fgetc(fin)) != EOF) {
            putchar(character);
            fputc(character, fout);
        }

        /* Separator after the complete character. */
        putchar(' ');
        fputc(' ', fout);
    }

    fclose(fin);
    /* fclose flushes buffered output, so a failure here means lost data. */
    if (fclose(fout) != 0) {
        perror("out.txt");
        return EXIT_FAILURE;
    }
    printf("\nFile has been created...\n");
    return 0;
}
Josh Durham
  • 1,632
  • 1
  • 17
  • 28
  • Thanks for the solution. I got an idea on how to check if a character contains more than one byte by using your 'numberOfBytesInChar' function. However this " fprintf(fout, "%c", character); " will output both the bytes. I want to output this multi byte character in hex format. What do you suggest? Like for example - 1) for this character à = E0 is the hex 2) ê = EA 3) ạ = 1EA1 - I tried using " fprintf(fout, "%X", character); " but it does not write the correct hex value. – KamyFC Jun 23 '23 at 10:32
6

This code worked for me:

/* fgetwc example */
#include <stdio.h>
#include <wchar.h>
#include <stdlib.h>
#include <locale.h>
/*
 * Skeleton for processing in.txt one wide character at a time with
 * fgetwc(); out.txt is opened for the caller's processing code to use.
 *
 * Returns 0 on success, EXIT_FAILURE if the locale cannot be set, a file
 * cannot be opened, or reading stops because of an error.
 */
int main ()
{
  FILE * fin;
  FILE * fout;
  wint_t wc;
  int status = 0;

  /* fgetwc() decodes bytes according to the current locale, so the
     UTF-8 locale must actually be in effect before reading. */
  if (setlocale(LC_ALL, "en_US.UTF-8") == NULL) {
    fprintf(stderr, "Could not set locale en_US.UTF-8\n");
    return EXIT_FAILURE;
  }

  fin=fopen ("in.txt","r");
  if (fin == NULL) {
    perror("in.txt");
    return EXIT_FAILURE;
  }
  fout=fopen("out.txt","w");
  if (fout == NULL) {
    perror("out.txt");
    fclose(fin);
    return EXIT_FAILURE;
  }

  while((wc=fgetwc(fin))!=WEOF){
        // work with: "wc"
  }
  /* WEOF is returned both at end of file and on a read/decoding error;
     ferror() tells the two apart. */
  if (ferror(fin)) {
    perror("in.txt");
    status = EXIT_FAILURE;
  }

  fclose(fin);
  if (fclose(fout) != 0) {
    perror("out.txt");
    status = EXIT_FAILURE;
  }
  if (status == 0) {
    printf("File has been created...\n");
  }
  return status;
}
user2768374
  • 199
  • 1
  • 2
  • 8
1

If you do not wish to use the wide options, experiment with the following:

Read and write bytes, not characters. Also known as, use binary, not text.

fgetc reads one byte from the file and returns it as an int (the byte value as an unsigned char, or EOF), so store the result in an int rather than a char; if you keep it in a plain char, bytes greater than 127 can turn negative or be confused with EOF. fputc likewise takes an int, so it will work if you pass it the int you got from fgetc rather than a char.

Also, in the open mode, try using binary, so try rb & wb rather than r & w

Kev Youren
  • 91
  • 1
  • 1
0

The C-style solution is very insightful, but if you'd consider using C++ the task becomes much more high level and it does not require you to have so much knowledge about utf-8 encoding. Consider the following:

#include<iostream>
#include<fstream>
#include<locale>

// Copy in.txt to out.txt one wide character at a time, echoing each
// character to standard output. All three streams are imbued with a
// UTF-8 locale so the conversion between UTF-8 bytes and wchar_t is
// done by the stream library.
// Returns 0 on success, 1 if either file cannot be opened.
int main(){
  std::wifstream input { "in.txt" };
  std::wofstream output { "out.txt" };
  if (!input || !output) {
    std::cerr << "Could not open in.txt or out.txt\n";
    return 1;
  }

  // Look out - this part is not portable to windows
  // (std::locale throws std::runtime_error if the named locale is not installed.)
  std::locale utf8 {"en_US.UTF-8"};

  input.imbue(utf8);
  output.imbue(utf8);
  std::wcout.imbue(utf8);

  wchar_t c;

  // get() is an unformatted read, so whitespace is kept, as with noskipws.
  while(input.get(c)) {
    std::wcout << c;
    output << c;
  }

  return 0;
}
Renra
  • 5,561
  • 3
  • 15
  • 17