1

I am confused which of the following should I use? (actually as of now all of them give me errors):

> beef = read.csv("beef.txt", header = TRUE)
Error in read.table(file = file, header = header, sep = sep, quote = quote,  : 
  more columns than column names
> beef = scan("beef.txt")
Error in scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  : 
  scan() expected 'a real', got '%'
> beef=read.table("beef.txt", header = FALSE, sep = " ")
Error in scan(file, what, nmax, sep, dec, quote, skip, nlines, na.strings,  : 
  line 1 did not have 8 elements
> beef=read.table("beef.txt", header = TRUE, sep = " ")
Error in read.table("beef.txt", header = TRUE, sep = " ") : 
  more columns than column names

Here's the top portion of beef.txt file and the rest is very similar.

% http://lib.stat.cmu.edu/DASL/Datafiles/agecondat.html
% 
% Datafile Name: Agricultural Economics Studies
% Datafile Subjects: Agriculture , Economics , Consumer
% Story Names: Agricultural Economics Studies
% Reference: F.B. Waugh, Graphic Analysis in Agricultural Economics,
%   Agricultural Handbook No. 128, U.S. Department of Agriculture, 1957.
% Authorization: free use
% Description: Price and consumption per capita of beef and pork
%   annually from 1925 to 1941 together with other variables relevant to
%   an economic analysis of price and/or consumption of beef and pork
%   over the period.
% Number of cases: 17
% Variable Names:
% 
%   PBE = Price of beef (cents/lb)
%   CBE = Consumption of beef per capita (lbs)
%   PPO = Price of pork (cents/lb)
%   CPO = Consumption of pork per capita (lbs)
%   PFO = Retail food price index (1947-1949 = 100)
%   DINC = Disposable income per capita index (1947-1949 = 100)
%   CFO = Food consumption per capita index (1947-1949 = 100)
%   RDINC = Index of real disposable income per capita (1947-1949 = 100)
%   RFP = Retail food price index adjusted by the CPI (1947-1949 = 100)
% 
% The Data:
YEAR    PBE CBE PPO CPO PFO DINC    CFO RDINC   RFP
1925    59.7    58.6    60.5    65.8    65.8    51.4    90.9    68.5    877
1926    59.7    59.4    63.3    63.3    68  52.6    92.1    69.6    899
1927    63  53.7    59.9    66.8    65.5    52.1    90.9    70.2    883
1928    71  48.1    56.3    69.9    64.8    52.7    90.9    71.9    884
1929    71  49  55  68.7    65.6    55.1    91.1    75.2    895

When I used fread the data is saved very strangely as in following, any idea how can it be formatted as expected?

> library(data.table)
> beef=fread("beef.txt", header = T, sep = " ")
> beef
    YEAR V2 V3 V4
 1: 1925 NA NA NA
 2: 1926 NA NA NA
 3: 1927 NA NA NA
 4: 1928 NA NA NA
 5: 1929 NA NA NA
 6: 1930 NA NA NA
 7: 1931 NA NA NA
 8: 1932 NA NA NA
 9: 1933 NA NA NA
10: 1934 NA NA NA
11: 1935 NA NA NA
12: 1936 NA NA NA
13: 1937 NA NA NA
14: 1938 NA NA NA
15: 1939 NA NA NA
16: 1940 NA NA NA
17: 1941 NA NA NA
         PBE\tCBE\tPPO\tCPO\tPFO\tDINC\tCFO\tRDINC\tRFP
 1: 59.7\t58.6\t60.5\t65.8\t65.8\t51.4\t90.9\t68.5\t877
 2:   59.7\t59.4\t63.3\t63.3\t68\t52.6\t92.1\t69.6\t899
 3:   63\t53.7\t59.9\t66.8\t65.5\t52.1\t90.9\t70.2\t883
 4:   71\t48.1\t56.3\t69.9\t64.8\t52.7\t90.9\t71.9\t884
 5:       71\t49\t55\t68.7\t65.6\t55.1\t91.1\t75.2\t895
 6: 74.2\t48.2\t59.6\t66.1\t62.4\t48.8\t90.7\t68.3\t874
 7:       72.1\t47.9\t57\t67.4\t51.4\t41.5\t90\t64\t791
 8:     79\t46\t49.5\t69.7\t42.8\t31.4\t87.8\t53.9\t733
 9:   73.1\t50.8\t47.3\t68.7\t41.6\t29.4\t88\t53.2\t752
10:   70.2\t55.2\t56.6\t62.2\t46.4\t33.2\t89.1\t58\t811
11:   82.2\t52.2\t73.9\t47.7\t49.7\t37\t87.3\t63.2\t847
12: 68.4\t57.3\t64.4\t54.4\t50.1\t41.8\t90.5\t70.5\t845
13:     73\t54.4\t62.2\t55\t52.1\t44.5\t90.4\t72.5\t849
14: 70.2\t53.6\t59.9\t57.4\t48.4\t40.8\t90.6\t67.8\t803
15:   67.8\t53.9\t51\t63.9\t47.1\t43.5\t93.8\t73.2\t793
16: 63.4\t54.2\t41.5\t72.4\t47.8\t46.5\t95.5\t77.6\t798
17:     56\t60\t43.9\t67.4\t52.2\t56.3\t97.5\t89.5\t830

And when I read.table as told in the comments I receive weird output (I don't read it as neatly as expected):

> beef=read.table("beef.txt", header = TRUE, sep = " ", comment.char="%")
> beef
   YEAR  X X.1 X.2
1  1925 NA  NA  NA
2  1926 NA  NA  NA
3  1927 NA  NA  NA
4  1928 NA  NA  NA
5  1929 NA  NA  NA
6  1930 NA  NA  NA
7  1931 NA  NA  NA
8  1932 NA  NA  NA
9  1933 NA  NA  NA
10 1934 NA  NA  NA
11 1935 NA  NA  NA
12 1936 NA  NA  NA
13 1937 NA  NA  NA
14 1938 NA  NA  NA
15 1939 NA  NA  NA
16 1940 NA  NA  NA
17 1941 NA  NA  NA
                PBE.CBE.PPO.CPO.PFO.DINC.CFO.RDINC.RFP
1  59.7\t58.6\t60.5\t65.8\t65.8\t51.4\t90.9\t68.5\t877
2    59.7\t59.4\t63.3\t63.3\t68\t52.6\t92.1\t69.6\t899
3    63\t53.7\t59.9\t66.8\t65.5\t52.1\t90.9\t70.2\t883
4    71\t48.1\t56.3\t69.9\t64.8\t52.7\t90.9\t71.9\t884
5        71\t49\t55\t68.7\t65.6\t55.1\t91.1\t75.2\t895
6  74.2\t48.2\t59.6\t66.1\t62.4\t48.8\t90.7\t68.3\t874
7        72.1\t47.9\t57\t67.4\t51.4\t41.5\t90\t64\t791
8      79\t46\t49.5\t69.7\t42.8\t31.4\t87.8\t53.9\t733
9    73.1\t50.8\t47.3\t68.7\t41.6\t29.4\t88\t53.2\t752
10   70.2\t55.2\t56.6\t62.2\t46.4\t33.2\t89.1\t58\t811
11   82.2\t52.2\t73.9\t47.7\t49.7\t37\t87.3\t63.2\t847
12 68.4\t57.3\t64.4\t54.4\t50.1\t41.8\t90.5\t70.5\t845
13     73\t54.4\t62.2\t55\t52.1\t44.5\t90.4\t72.5\t849
14 70.2\t53.6\t59.9\t57.4\t48.4\t40.8\t90.6\t67.8\t803
15   67.8\t53.9\t51\t63.9\t47.1\t43.5\t93.8\t73.2\t793
16 63.4\t54.2\t41.5\t72.4\t47.8\t46.5\t95.5\t77.6\t798
17     56\t60\t43.9\t67.4\t52.2\t56.3\t97.5\t89.5\t830

So thanks to comments turns out the separated wasn't a space but a tab. Here's what's the correct answer:

> beef=read.table("beef.txt", header = TRUE, sep = "\t", comment.char="%")
> beef
    YEAR....PBE  CBE  PPO  CPO  PFO DINC  CFO RDINC RFP
1  1925    59.7 58.6 60.5 65.8 65.8 51.4 90.9  68.5 877
2  1926    59.7 59.4 63.3 63.3 68.0 52.6 92.1  69.6 899
3    1927    63 53.7 59.9 66.8 65.5 52.1 90.9  70.2 883
4    1928    71 48.1 56.3 69.9 64.8 52.7 90.9  71.9 884
5    1929    71 49.0 55.0 68.7 65.6 55.1 91.1  75.2 895
6  1930    74.2 48.2 59.6 66.1 62.4 48.8 90.7  68.3 874
7  1931    72.1 47.9 57.0 67.4 51.4 41.5 90.0  64.0 791
8    1932    79 46.0 49.5 69.7 42.8 31.4 87.8  53.9 733
9  1933    73.1 50.8 47.3 68.7 41.6 29.4 88.0  53.2 752
10 1934    70.2 55.2 56.6 62.2 46.4 33.2 89.1  58.0 811
11 1935    82.2 52.2 73.9 47.7 49.7 37.0 87.3  63.2 847
12 1936    68.4 57.3 64.4 54.4 50.1 41.8 90.5  70.5 845
13   1937    73 54.4 62.2 55.0 52.1 44.5 90.4  72.5 849
14 1938    70.2 53.6 59.9 57.4 48.4 40.8 90.6  67.8 803
15 1939    67.8 53.9 51.0 63.9 47.1 43.5 93.8  73.2 793
16 1940    63.4 54.2 41.5 72.4 47.8 46.5 95.5  77.6 798
17   1941    56 60.0 43.9 67.4 52.2 56.3 97.5  89.5 830
Mona Jalal
  • 34,860
  • 64
  • 239
  • 408
  • 1
    Look for the `comment.char` parameter of `read.table`. (`read.csv` is just a wrapper around `read.table`; `scan` is similar but you will have to specify field types yourself.) – krlmlr Feb 15 '14 at 18:43
  • 1
    `read.table` is "just" a wrapper for `scan`. – Roland Feb 15 '14 at 18:48
  • 1
    If you have a well-formatted `.csv` dont' think: go with `read.csv()`. If you have a `fwf` file (fixed width format, very common in meteorology) use `read.fwf()`. Otherwise, try `read.table()` or `scan()` with the correct parameters. – Fernando Feb 15 '14 at 18:58

4 Answers4

3
beef=read.table("beef.txt", header = TRUE, sep = " ", comment.char="%")

beef=read.table("beef.txt", header = TRUE, sep = "\t", comment.char="%") #after update
Ananta
  • 3,671
  • 3
  • 22
  • 26
  • Please take a look at the updated question. I used your method but the read file is not neat! – Mona Jalal Feb 15 '14 at 18:56
  • so when we read a file using read.table is the final result a data.frame? If not how should I change it to data.frame? – Mona Jalal Feb 15 '14 at 19:07
  • @MonaJalal Read `help("read.table")`. The documentation is there to answer these questions. – Roland Feb 15 '14 at 19:10
  • 1
    it is data.frame, you can check by `class(beef)`, One honest suggestion though, seems recently you are exploring R a lot and for few, i have answered, SO is great way to get answers, but most of these questions are very basic, I think you should read some beginner R book like (http://cran.r-project.org/doc/contrib/Paradis-rdebuts_en.pdf) if you want explore R better. – Ananta Feb 15 '14 at 19:11
  • @Ananta I think there's a problem with reading the file as it is making PBE null : `> m3=lm(PBE ~ CBE + PBO + CPO + PFO +DINC + CFO+RDINC+RFP+YEAR, data = beef) Error in eval(expr, envir, enclos) : object 'PBE' not found > beef$PBE NULL` – Mona Jalal Feb 15 '14 at 20:07
  • @MonaJalal Do you have four dots connecting two column names in your data file, specifically `YEAR....PBE`? If so, perhaps remove those four dots by hand and then see whether this code works. Alternatively, perhaps there is no tab between `YEAR` and `PBE`? In that case try inserting a tab between those two column names. – Mark Miller Feb 15 '14 at 21:34
  • I got it figure out. Sep should be "" not tab! – Mona Jalal Feb 15 '14 at 21:51
1

There's a recent blog post about it showing that fread is the fastest, the rest is the same. The link: http://statcompute.wordpress.com/2014/02/11/efficiency-of-importing-large-csv-files-in-r/

In your case it doesn't really matter, use the one you find most comfortable.

An example using fread is following (assuming TAB separators):

library(data.table)
a = fread("data.csv", skip=26)
a
   YEAR  PBE  CBE  PPO  CPO  PFO DINC  CFO RDINC RFP
1: 1925 59.7 58.6 60.5 65.8 65.8 51.4 90.9  68.5 877
2: 1926 59.7 59.4 63.3 63.3 68.0 52.6 92.1  69.6 899
3: 1927 63.0 53.7 59.9 66.8 65.5 52.1 90.9  70.2 883
4: 1928 71.0 48.1 56.3 69.9 64.8 52.7 90.9  71.9 884
5: 1929 71.0 49.0 55.0 68.7 65.6 55.1 91.1  75.2 895
LauriK
  • 1,899
  • 15
  • 20
  • I used `fread` but the read data is saved strangely!! any hint? – Mona Jalal Feb 15 '14 at 18:54
  • 1
    I didn't down-vote, but I guess someone thinks this is too much of a link-only answer and the blog is a poor version of [our faq](http://stackoverflow.com/q/1727772/1412059). You should improve your answer and show how to handle such a file with `fread`. – Roland Feb 15 '14 at 18:55
  • 1
    I actually vote up because it's good to know there are other faster methods out there (even though in my case time to read would be calculated as zero) – Mona Jalal Feb 15 '14 at 18:57
  • @Roland: sorry, good point, I'll update my answer with an example – LauriK Feb 15 '14 at 18:59
  • @LauriK Thanks for the example. But in your case it's the same : > a$PBE NULL – Mona Jalal Feb 15 '14 at 20:08
1

Here is an alternative in base using readLines. This approach is much more complex, but returns numeric data ready for analysis. However, you must manually count the columns in the original data file and later reassign column names.

EDIT

At the bottom I added a generalized version that does not require manually counting the columns or manually adding column names.

Note that either version works regardless of whether the data are delimited with spaces or tabs.

Here is the code for the original version:

my.data <- readLines('c:/users/mmiller21/simple R programs/beef.txt')

ncols <- 10

header.info <- ifelse(substr(my.data, 1, 1) == '%', 1, 0)

my.data2 <- my.data[header.info==0]

my.data3 <- data.frame(matrix(unlist(strsplit(my.data2[-1], "[^0-9,.]+")), ncol=ncols, byrow=TRUE), stringsAsFactors = FALSE)

my.data4 <- as.data.frame(apply(my.data3, 2, function(x) as.numeric(x)))

colnames(my.data4) <- c('YEAR', 'PBE', 'CBE', 'PPO', 'CPO', 'PFO', 'DINC', 'CFO', 'RDINC', 'RFP')

> my.data4
     YEAR  PBE  CBE  PPO  CPO  PFO DINC  CFO RDINC RFP
[1,] 1925 59.7 58.6 60.5 65.8 65.8 51.4 90.9  68.5 877
[2,] 1926 59.7 59.4 63.3 63.3 68.0 52.6 92.1  69.6 899
[3,] 1927 63.0 53.7 59.9 66.8 65.5 52.1 90.9  70.2 883
[4,] 1928 71.0 48.1 56.3 69.9 64.8 52.7 90.9  71.9 884
[5,] 1929 71.0 49.0 55.0 68.7 65.6 55.1 91.1  75.2 895

Here are the contents of the original data file:

% http://lib.stat.cmu.edu/DASL/Datafiles/agecondat.html
% 
% Datafile Name: Agricultural Economics Studies
% Datafile Subjects: Agriculture , Economics , Consumer
% Story Names: Agricultural Economics Studies
% Reference: F.B. Waugh, Graphic Analysis in Agricultural Economics,
%   Agricultural Handbook No. 128, U.S. Department of Agriculture, 1957.
% Authorization: free use
% Description: Price and consumption per capita of beef and pork
%   annually from 1925 to 1941 together with other variables relevant to
%   an economic analysis of price and/or consumption of beef and pork
%   over the period.
% Number of cases: 17
% Variable Names:
% 
%   PBE = Price of beef (cents/lb)
%   CBE = Consumption of beef per capita (lbs)
%   PPO = Price of pork (cents/lb)
%   CPO = Consumption of pork per capita (lbs)
%   PFO = Retail food price index (1947-1949 = 100)
%   DINC = Disposable income per capita index (1947-1949 = 100)
%   CFO = Food consumption per capita index (1947-1949 = 100)
%   RDINC = Index of real disposable income per capita (1947-1949 = 100)
%   RFP = Retail food price index adjusted by the CPI (1947-1949 = 100)
% 
% The Data:
YEAR    PBE CBE PPO CPO PFO DINC    CFO RDINC   RFP
1925    59.7    58.6    60.5    65.8    65.8    51.4    90.9    68.5    877
1926    59.7    59.4    63.3    63.3    68  52.6    92.1    69.6    899
1927    63  53.7    59.9    66.8    65.5    52.1    90.9    70.2    883
1928    71  48.1    56.3    69.9    64.8    52.7    90.9    71.9    884
1929    71  49  55  68.7    65.6    55.1    91.1    75.2    895

Here is the code for the generalized version:

my.data <- readLines('c:/users/mmiller21/simple R programs/beef.txt')

header.info <- ifelse(substr(my.data, 1, 1) == '%', 1, 0)

my.data2 <- my.data[header.info==0]

ncols <- length(read.table(textConnection(my.data2[1])))

my.data3 <- data.frame(matrix(unlist(strsplit(my.data2[-1], "[^0-9,.]+")), ncol=ncols, byrow=TRUE), stringsAsFactors = FALSE)

my.data4 <- as.data.frame(apply(my.data3, 2, function(x) as.numeric(x)))

#colnames(my.data4) <- c('YEAR', 'PBE', 'CBE', 'PPO', 'CPO', 'PFO', 'DINC', 'CFO', 'RDINC', 'RFP')
#my.data4

colnames(my.data4) <- read.table(textConnection(my.data2[1]), colClasses = c('character'))
my.data4

colSums(my.data4)

sum(my.data4$PPO)
Mark Miller
  • 12,483
  • 23
  • 78
  • 132
1

The correct answer is as follows:

beef=read.table("beef.txt", header = TRUE, sep = "", comment.char="%")
Mona Jalal
  • 34,860
  • 64
  • 239
  • 408