
Below is the code I am trying to implement. I want to extract each set of 10 consecutive row values and turn them into corresponding columns.

This is how the data looks: https://drive.google.com/file/d/0B7huoyuu0wrfeUs4d2p0eGpZSFU/view?usp=sharing

I have been trying, but temp1 and temp2 come out empty. Please help.

library(Hmisc)  # for the inc() increment function

myData <- read.csv("Clothing_&_Accessories.csv", header=FALSE, sep=",", fill=TRUE)  # reading the csv file

extract <- myData$V2  # extracting the desired column

x <- 1
y <- 1

temp1 <- NULL        # initialisation
temp2 <- NULL        # initialisation
data.sorted <- NULL  # initialisation

limit <- nrow(myData)  # calculating no of rows

while (x != limit) {
  count <- 1
  for (count in 11) {
    if (count > 10) {
      inc(x) <- 1
      break  # gets out of the for loop
    } else {
      temp1[y] <- data_mat[x]  # extracting by every row element
    }
    inc(x) <- 1  # increment x
    inc(y) <- 1  # increment y
  }
  temp2 <- temp1
  data.sorted <- rbind(data.sorted, temp2)  # turn rows into columns
}

2 Answers


Your code is too complex. You can do this with a single for loop and no external packages, like this:

myData <- as.data.frame(matrix(c(rep("a", 10), "", rep("b", 10)), ncol=1), stringsAsFactors = FALSE)

newData <- data.frame(row.names=1:10)
for (i in 1:((nrow(myData)+1)/11)) {
  start <- 11*i - 10
  newData[[paste0("col", i)]] <- myData$V1[start:(start+9)]
}
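
With the toy `myData` built above (ten "a" rows, a blank separator, ten "b" rows), each pass of the loop fills one column, so the result should be:

newData
#    col1 col2
# 1     a    b
# 2     a    b
# ...
# 10    a    b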

You don't actually need all this, though. You can simply remove the empty lines, split the vector into chunks of size 10 (as explained here), and then turn the list into a data frame.

vec <- myData$V1[nchar(myData$V1)>0]

as.data.frame(split(vec, ceiling(seq_along(vec)/10)))

#    X1 X2
# 1   a  b
# 2   a  b
# 3   a  b
# 4   a  b
# 5   a  b
# 6   a  b
# 7   a  b
# 8   a  b
# 9   a  b
# 10  a  b
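
To see how the chunking index works: seq_along(vec) numbers the 20 non-empty values 1 to 20, and dividing by 10 and taking the ceiling maps positions 1-10 to chunk 1 and positions 11-20 to chunk 2:

ceiling(seq_along(vec)/10)
#  [1] 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2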
Molx

We could create a numeric index based on the '' values in the 'V2' column, split the dataset, and use Reduce/merge to get the columns in wide format.

indx <- cumsum(myData$V2=='')+1
res <- Reduce(function(...) merge(..., by= 'V1'), split(myData, indx))
res1 <- res[order(factor(res$V1, levels=myData[1:10, 1])),]
colnames(res1)[-1] <- paste0('Col', 1:3)
head(res1,3)
#            V1       Col1       Col2       Col3
#2     ProductId B000179R3I B0000C3XXN B0000C3XX9
#4 product_title Amazon.com Amazon.com Amazon.com
#3 product_price    unknown    unknown    unknown
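
For reference, this is what the index looks like with the example data in the data section at the end of this answer (the blank separator row falls into the following block, but the merge on 'V1' drops it):

head(indx, 12)
# [1] 1 1 1 1 1 1 1 1 1 1 2 2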

From the p1.png, the 'V1' column can also serve as the column names for the values in 'V2'. If that is the case, we can 'transpose' res1 (except its first column) and set the column names of the output from the first column of res1 (the setNames(...) call below).

res2 <- setNames(as.data.frame(t(res1[-1]), stringsAsFactors=FALSE), 
                       res1[,1]) 
row.names(res2) <- NULL
res2[] <- lapply(res2, type.convert)
head(res2)
#   ProductId product_title product_price         userid
#1 B000179R3I    Amazon.com       unknown A3Q0VJTU04EZ56
#2 B0000C3XXN    Amazon.com       unknown A34JM8F992M9N1
#3 B0000C3XX9    Amazon.com       unknown A34JM8F993MN91
#                  profileName helpfulness reviewscore review_time
#1 Jeanmarie Kabala "JP Kabala"         7/7           4  1182816000
#2                   M. Shapiro         6/6           5  1205107200
#3                     J. Cruze         8/8           5   120571929
#              review_summary
#1 Periwinkle Dartmouth Blazer
#2        great classic jacket
#3                 Good jacket
#                                            review_text
#1 I own the Austin Reed dartmouth blazer in every color
#2          This is the second time I bought this jacket
#3           This is the third time I bought this jacket
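
The type.convert step turns numeric-looking character columns into actual numbers (recent R versions may warn unless as.is is specified); a quick check:

str(res2[c("reviewscore", "review_time")])
# 'data.frame': 3 obs. of  2 variables:
#  $ reviewscore: int  4 5 5
#  $ review_time: int  1182816000 1205107200 120571929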

I guess this is just a reshaping issue. In that case, we can use dcast from data.table to convert from long to wide format:

library(data.table)
DT <- dcast(setDT(myData)[V1 != ''][, N := paste0('Col', 1:.N), V1],
            V1 ~ N, value.var = 'V2')
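
To see what the helper column 'N' holds before casting, here is a small sketch (using the `myData` from the data section below): within each 'V1' group, 1:.N labels the occurrences Col1, Col2, Col3.

library(data.table)
tmp <- setDT(copy(myData))[V1 != ''][, N := paste0('Col', 1:.N), V1]
tmp[V1 == 'ProductId']
#           V1         V2    N
# 1: ProductId B000179R3I Col1
# 2: ProductId B0000C3XXN Col2
# 3: ProductId B0000C3XX9 Col3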

data

 myData <- structure(list(V1 = c("ProductId", "product_title",
 "product_price", 
 "userid", "profileName", "helpfulness", "reviewscore", "review_time", 
 "review_summary", "review_text", "", "ProductId", "product_title", 
 "product_price", "userid", "profileName", "helpfulness", 
 "reviewscore", 
 "review_time", "review_summary", "review_text", "", "ProductId", 
 "product_title", "product_price", "userid", "profileName",
 "helpfulness",  
 "reviewscore", "review_time", "review_summary", "review_text"
 ), V2 = c("B000179R3I", "Amazon.com", "unknown", "A3Q0VJTU04EZ56", 
 "Jeanmarie Kabala \"JP Kabala\"", "7/7", "4", "1182816000", 
 "Periwinkle Dartmouth Blazer", 
 "I own the Austin Reed dartmouth blazer in every color", "", 
 "B0000C3XXN", "Amazon.com", "unknown", "A34JM8F992M9N1",
 "M. Shapiro", 
 "6/6", "5", "1205107200", "great classic jacket",
 "This is the second time I bought this jacket", 
 "", "B0000C3XX9", "Amazon.com", "unknown", "A34JM8F993MN91", 
 "J. Cruze", "8/8", "5", "120571929", "Good jacket",
 "This is the third time I bought this jacket"
 )), .Names = c("V1", "V2"), row.names = c(NA, 32L),
 class = "data.frame")
akrun
  • What to do when the data set is very large (like 1048576 entries)? It is taking too long. – Varun Khambra Apr 12 '15 at 09:22
  • The reshaping thing solved it, thank you. I actually want to run sentiment analysis on the reviews extracted from this data set, so I wanted to rearrange and clean it first. – Varun Khambra Apr 12 '15 at 09:49
  • @VarunKhambra Yes, the `merge/Reduce` would be slower for a large dataset. The `data.table` `dcast` should be fast. – akrun Apr 12 '15 at 10:16