Unstructured txt file with similar pattern for all rows in R

Question

I am currently working with a .txt file and have used the read_table2() function to read it, resulting in the following structure.

  X1          X2 X3         X4    X5    X6 X7    X8    X9    X10   X11   X12   X13   X14   X15   X16   X17   X18   X19   X20  
  <chr>    <dbl> <chr>   <dbl> <dbl> <dbl> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr> <chr>
1 FVP110~ 2.08e6 1101~ 1.10e 3     6     0 0     0     6     01101 6     0     0     0     6     01101 6     0     0     0    
2 FVP110~ 2.06e4 8     9.3 e 1     2    93 0     0     0     0     0     093   0     0     0     0     0     093   0     0    
3 FVP110~ 2.10e6 6     9.3 e 1     2    93 0     0     0     0     0     093   0     0     0     0     0     093   0     0    
4 FVP110~ 2.10e6 6     3.11e18  3111     8 0     0     0     8     03111 8     0     0     0     8     03111 8     0     0    
5 FVP110~ 2.08e6 94    2   e 0    94     0 0     0     0     0     094   0     0     0     0     0     094   0     0     0    
6 FVP110~ 2.06e4 6     9.2 e 1     2    92 0     0     0     0     0     092   0     0     0     0     0     092   0     0    
# ... with 31 more variables: X21 <chr>, X22 <chr>, X23 <chr>, X24 <chr>, X25 <chr>, X26 <chr>, X27 <chr>, X28 <chr>,
#   X29 <chr>, X30 <chr>, X31 <chr>, X32 <chr>, X33 <chr>, X34 <chr>, X35 <chr>, X36 <chr>, X37 <chr>, X38 <chr>, X39 <chr>,
#   X40 <chr>, X41 <chr>, X42 <chr>, X43 <chr>, X44 <chr>, X45 <chr>, X46 <chr>, X47 <chr>, X48 <chr>, X49 <chr>, X50 <chr>,
#   X51 <dbl>

I know that my first column, instead of being FVP1104Q1V110121011010110110527421101011165 is always a 4 chr 3 dbl 2chr 2chr 1dbl 2dbl etc. In total, there are 51 columns but if parsed correctly they will become a total of 129.

These are the first 10 rows and 10 columns of my data set.

structure(list(X1 = c("FVP1104Q1V110121011010110110527421101011165", 
"FVP1104Q1V110121011010110110527421101022262", "FVP1104Q1V110121011010110110527421101033231", 
"FVP1104Q1V110121011010110110527421101044134", "FVP1104Q1V110121011010110110527421102011165", 
"FVP1104Q1V110121011010110110527421102022260", "FVP1104Q1V110121011010110110527421102033138", 
"FVP1104Q1V110121011010110110527421102044232", "FVP1104Q1V11012101101011011052742110205616", 
"FVP1104Q1V110121011010110110527421102063142"), X2 = c(2080110, 
20601, 2100112, 2100112, 2080110, 20601, 2120115, 2100112, 10501, 
40701), X3 = c("11011116112", "8", "6", "6", "94", "6", "6", 
"6", "124", "8"), X4 = c(1101, 93, 93, 3111045932226084352, 2, 
92, 3185102331226052608, 93, 91, 6), X5 = c(6, 2, 2, 3111, 94, 
2, 3185, 2, 2, 11011216112), X6 = c(0, 93, 93, 8, 0, 92, 8, 93, 
91, 1101), X7 = c("0", "0", "0", "0", "0", "0", "0", "0", "0", 
"6"), X8 = c("0", "0", "0", "0", "0", "0", "0", "0", "0", "0"
), X9 = c("6", "0", "0", "0", "0", "0", "0", "0", "0", "0"), 
    X10 = c("01101", "0", "0", "8", "0", "0", "8", "0", "0", 
    "0"), X11 = c("6", "0", "0", "03111", "094", "0", "03185", 
    "0", "0", "6"), X12 = c("0", "093", "093", "8", "0", "092", 
    "8", "093", "091", "01101"), X13 = c("0", "0", "0", "0", 
    "0", "0", "0", "0", "0", "6"), X14 = c("0", "0", "0", "0", 
    "0", "0", "0", "0", "0", "0"), X15 = c("6", "0", "0", "0", 
    "0", "0", "0", "0", "0", "0")), row.names = c(NA, 10L), class = "data.frame")

And I want to get

structure(list(fileid = structure(c("FVP1", "FVP1", "FVP1", "FVP1", 
"FVP1", "FVP1", "FVP1", "FVP1", "FVP1", "FVP1"), label = "File Identification", format.stata = "%9s"), 
    schedule = structure(c(104, 104, 104, 104, 104, 104, 104, 
    104, 104, 104), label = "Schedule", format.stata = "%8.0g"), 
    quarter = structure(c("Q3", "Q3", "Q3", "Q3", "Q3", "Q3", 
    "Q3", "Q3", "Q3", "Q3"), label = "Quarter", format.stata = "%9s"), 
    visit = structure(c("V1", "V1", "V1", "V1", "V1", "V1", "V1", 
    "V1", "V1", "V1"), label = "Visit", format.stata = "%9s"), 
    sector = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Sector", format.stata = "%8.0g"), 
    state = structure(c(36, 36, 36, 36, 36, 36, 36, 36, 36, 36
    ), label = "State/Ut Code", format.stata = "%8.0g"), district = structure(c(10, 
    10, 10, 10, 10, 10, 10, 10, 10, 10), label = "District Code", format.stata = "%8.0g"), 
    region = structure(c(362, 362, 362, 362, 362, 362, 362, 362, 
    362, 362), label = "NSS-Region", format.stata = "%8.0g"), 
    stratum = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Stratum", format.stata = "%8.0g"), 
    substratum = structure(c(8, 8, 8, 8, 8, 8, 8, 8, 8, 8), label = "Sub-Stratum", format.stata = "%8.0g"), 
    subsample = structure(c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2), label = "Sub-Sample", format.stata = "%8.0g"), 
    subregion = structure(c(3613, 3613, 3613, 3613, 3613, 3613, 
    3613, 3613, 3613, 3613), label = "Fod Sub-Region", format.stata = "%8.0g"), 
    fsu = structure(c(50030, 50030, 50030, 50030, 50030, 50030, 
    50030, 50030, 50030, 50030), label = "FSU", format.stata = "%10.0g"), 
    sbno = structure(c(1, 1, 1, 1, 1, 1, 1, 1, 1, 1), label = "Sample Sg/Sb No.", format.stata = "%8.0g"), 
    sss = structure(c(1, 1, 1, 1, 1, 1, 2, 2, 2, 2), label = "Second Stage Stratum No.", format.stata = "%8.0g")), row.names = c(NA, 
-10L), class = c("tbl_df", "tbl", "data.frame"))

I'm trying to replicate reshaping of a .txt data using a dictionary .dct in Stata, but I don't find a clear way to do that in R.

My data also includes NA's

You need to show how the text looks like. eg the first line of txt file — Onyambu, Feb 15 '23 at 20:31
So you have one column that's fixed width? But the rest are delimited? You could probably use `tidyr::separate` after you've read the data in to split the column I don't see How you're going from 51 columns to 129 though. It's easier to help you if you include a simple [reproducible example](https://stackoverflow.com/questions/5963269/how-to-make-a-great-r-reproducible-example) with sample input and desired output that can be used to test and verify possible solutions. I don't understand why you can't share data with `dput`. Perhaps you can share the first few lines of the raw text file. — MrFlick, Feb 15 '23 at 20:31

score 0 · Answer 1 · answered Feb 15 '23 at 22:05

As per MrFlick's suggestion, we can use tidyr::separate to break apart your first column into multiple columns by position:

library(tidyr)
data.frame(X1 = "FVP1104Q1V110121011010110110527421101011165") %>%
   separate(
     X1, 
     sep = c(4, 7, 9, 11, 12),
     into = paste0("X1_", 1:6)
  )
#   X1_1 X1_2 X1_3 X1_4 X1_5                            X1_6
# 1 FVP1  104   Q1   V1    1 0121011010110110527421101011165

Unstructured txt file with similar pattern for all rows in R

1 Answers1