2

What is the fastest way to count lines and words in a text file in pure ANSI C?

A word is terminated by a space or period. Line is terminated by '\n'.

This seems to be in C++.

Community
  • 1
  • 1
user366312
  • 16,949
  • 65
  • 235
  • 452
  • The solution in C++ that you link to should translate directly into C if you just change to an ANSI C-way to read from the file, shouldn't it? Hence, as far as I can see, this makes the question essentially "how to read from a file in ANSI C?" – gspr Oct 01 '10 at 09:11
  • 1
    Define "word" and "line" **VERY WELL**. Is `"Mc'Donalds"` a word? What about `"RS232C"` or `"transli-\nneated"`? Did a new line just begin in the middle of the previous example for word? – pmg Oct 01 '10 at 09:31
  • 1
    [This](http://git.savannah.gnu.org/cgit/coreutils.git/tree/src/wc.c). It's about 800 lines of code, but it's efficient and well-tested. – imgx64 Oct 01 '10 at 09:47
  • According to your definition of "word" and "line", in a file containing 3 spaces, a newline, 3 periods (for a total of 7 characters), how many words and lines are there? – pmg Oct 01 '10 at 20:06

3 Answers

4
  • Read the file in
  • Iterate over the characters, incrementing the character counter for each one
  • If the character is a space or an end of line, increment the word counter
  • Repeat second and third steps until EOF
Alan Haggai Alavi
  • 72,802
  • 19
  • 102
  • 127
  • That kind of approach is widely available on the net. – user366312 Oct 01 '10 at 09:19
  • 1
    JMSA: Yes, but it is also (almost, except for buffering) the approach that is used in the C++ example you link to, so it sounds like what you want. – gspr Oct 01 '10 at 09:22
3

Maybe take a look at the source code of the GNU wc utility as this utility does exactly what you want.

#include <ctype.h>
#include <errno.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>

/* Unsigned integral type shared by every counter in this program. */
typedef unsigned long count_t;

/* Per-file counters (characters, words, lines).  As file-scope objects
   without an initializer they start out as zero. */
count_t ccount, wcount, lcount;

/* Running totals over all processed files (characters, words, lines). */
count_t total_ccount = 0, total_wcount = 0, total_lcount = 0;

/* Print the message described by FMT and AP to stderr and exit with
   status 1.  If PERR is not 0, the message is followed by the textual
   description of the errno value that was current when this function
   was entered; otherwise it is followed by a newline.

   This function never returns. */
static void
error_print (int perr, const char *fmt, va_list ap)
{
  /* Capture errno first: vfprintf is allowed to modify it, which would
     make perror describe the wrong error. */
  int saved_errno = errno;

  vfprintf (stderr, fmt, ap);
  if (perr)
    {
      errno = saved_errno;
      perror (" ");
    }
  else
    fprintf (stderr, "\n");
  exit (1);
}

/* Report a fatal error.  FMT is a printf-style format consumed together
   with the variadic arguments that follow it; the formatted message is
   handed to error_print with PERR == 0, so no errno text is appended. */
static void
errf (char *fmt, ...)
{
  va_list args;

  va_start (args, fmt);
  error_print (0, fmt, args);
  /* error_print calls exit, so control does not actually get here; the
     va_end is kept so the va_start is visibly paired. */
  va_end (args);
}

/* Report a fatal error caused by a failed library call.  FMT is a
   printf-style format consumed together with the variadic arguments
   that follow it; the formatted message is handed to error_print with
   PERR == 1, so the errno description is appended before exiting. */
static void
perrf (char *fmt, ...)
{
  va_list args;

  va_start (args, fmt);
  error_print (1, fmt, args);
  /* error_print calls exit, so control does not actually get here; the
     va_end is kept so the va_start is visibly paired. */
  va_end (args);
}

/* Write one result line to standard output: the line, word and
   character counts (in that order, each at least six columns wide)
   followed by the label FILE. */
void
report (char *file, count_t ccount, count_t wcount, count_t lcount)
{
  fprintf (stdout, "%6lu %6lu %6lu %s\n", lcount, wcount, ccount, file);
}

/* Return 1 if C is a valid word constituent, 0 otherwise.

   A word constituent is any character that isalpha accepts, so digits,
   punctuation and white space all terminate a word.  isalpha is
   declared in <ctype.h>, which must be included by this file.  Taking
   the argument as unsigned char keeps the value within the range
   isalpha is defined for. */
static int
isword (unsigned char c)
{
  return isalpha (c) != 0;
}

/* Increase the character counter and, if C is a newline, the line
   counter.  Operates on the file-scope counters ccount and lcount.

   The body is wrapped in do { ... } while (0) so that `COUNT (c);'
   behaves as a single statement, e.g. as the unbraced body of an
   if/else.  C is evaluated exactly once. */
#define COUNT(c)            \
  do                        \
    {                       \
      ccount++;             \
      if ((c) == '\n')      \
        lcount++;           \
    }                       \
  while (0)

/* Read the next word from the stream FP, updating the file-scope
   counters ccount, wcount and lcount as characters are consumed.

   Return 0 once end of file (or a read error, which getc also reports
   as EOF) has been reached, and 1 otherwise. */
int
getword (FILE *fp)
{
  int c;

  if (feof (fp))
    return 0;

  /* Phase 1: skip and count everything that precedes the word.  The
     first word constituent bumps wcount but is deliberately left
     uncounted as a character here; phase 2 counts it. */
  while ((c = getc (fp)) != EOF)
    {
      if (isword (c))
        {
          wcount++;
          break;
        }
      COUNT (c);
    }

  /* Phase 2: count the characters of the word, plus the single
     non-word character that terminates it.  Skipped entirely when
     phase 1 ended on EOF. */
  for (; c != EOF; c = getc (fp))
    {
      COUNT (c);
      if (!isword (c))
        break;
    }

  return c != EOF;
}

/* Count the characters, words and lines of the file named FILE, print
   them with report, and add them to the running totals.  If the file
   cannot be opened, perrf prints a diagnostic and exits. */
void
counter (char *file)
{
  FILE *stream;

  stream = fopen (file, "r");
  if (stream == NULL)
    perrf ("cannot open file `%s'", file);

  /* Start every file from zero. */
  ccount = 0;
  wcount = 0;
  lcount = 0;

  /* getword does all the counting as a side effect; keep calling it
     until it signals end of input. */
  for (;;)
    if (!getword (stream))
      break;
  fclose (stream);

  report (file, ccount, wcount, lcount);

  total_lcount += lcount;
  total_wcount += wcount;
  total_ccount += ccount;
}

/* Entry point: process every file named on the command line and, when
   more than one file was given, print a final "total" line.  Without
   any file argument a usage message is printed via errf, which exits. */
int
main (int argc, char **argv)
{
  char **arg;

  if (argc < 2)
    errf ("usage: wc FILE [FILE...]");

  /* argv[0] is skipped; each remaining argument is a file name. */
  for (arg = argv + 1; arg < argv + argc; arg++)
    counter (*arg);

  if (argc > 2)
    report ("total", total_ccount, total_wcount, total_lcount);
  return 0;
}

Found at: http://www.gnu.org/software/cflow/manual/html_node/Source-of-wc-command.html

Gary Willoughby
  • 50,926
  • 41
  • 133
  • 199
  • that's a rather dated version that runs very, very slowly (compared to the wc binary that ships with coreutils); see http://git.savannah.gnu.org/cgit/coreutils.git – Sebastian Mach Jul 15 '11 at 11:24
  • This version suffers from the fact that a file with one word and no line separators will register 0 lines rather than 1. See: http://stackoverflow.com/questions/843154/fastest-way-to-find-the-number-of-lines-in-a-text-c/843484#843484 – Adrian McCarthy Jun 01 '13 at 13:55
2

Here is an explicit answer that counts the number of lines (extension to the number of words is trivial à la the C++ version linked to in OP). This version is buffered. Another answer suggests reading the entire file in first, which is simpler, but the below is more in line with what your C++ example does.

#include <stdio.h>
#include <string.h>

/* Size of the read buffer handed to fgets. */
#define BUFSIZE 1024

/* Count the lines of the file named by the single command-line
   argument and print the result.

   Returns 0 on success and 1 if the argument count is wrong, the file
   cannot be opened, or a read error occurs. */
int main(int argc, char** argv)
{
  /* unsigned long instead of int: overflowing a signed counter on a
     very large file would be undefined behavior. */
  unsigned long newlines = 0;
  char buf[BUFSIZE];
  FILE* file;

  if (argc != 2)
    return 1;

  file = fopen(argv[1], "r");
  if (file == NULL)
  {
    /* Passing a null stream to fgets is undefined behavior. */
    perror(argv[1]);
    return 1;
  }

  while (fgets(buf, BUFSIZE, file))
  {
    /* A chunk that fills the buffer without ending in '\n' is only
       part of a longer line, so it is not counted.  Every other chunk
       ends a line; this includes a final line that lacks a trailing
       newline, unless its last chunk happens to fill the buffer
       exactly. */
    if (!(strlen(buf) == BUFSIZE-1 && buf[BUFSIZE-2] != '\n'))
      newlines++;
  }

  if (ferror(file))
  {
    perror(argv[1]);
    fclose(file);
    return 1;
  }
  fclose(file);

  printf("Number of lines in %s: %lu\n", argv[1], newlines);

  return 0;
}

The BUFSIZE macro can be tweaked to maximize performance (since you say you want the fastest way). 1024 is simply a guess. Another possibility is probably to memory-map the file, but I didn't try that since mmap is not ANSI C.

gspr
  • 11,144
  • 3
  • 41
  • 74
  • This doesn't handle the case where the last line is not terminated with a line separator. For that you need a state machine. It also doesn't readily extend to counting words, since that also requires a state machine. – Adrian McCarthy Jun 01 '13 at 14:00