0

I have item score data in a long format, where one row represents one person's score on a specific item, as shown below:

# test_id  item_id  item_type item_grade
# 1 ebb9c7d4 fef31170  dictation  0.3994113
# 2 ebb9c7d4 c1a682fe  dictation  0.9938895
# 3 ebb9c7d4 a39c6769      ctest  0.6585366
# 4 ebb9c7d4 c23d8b15  dictation  0.0000001
# 5 ebb9c7d4 5e92d30a text_vocab  0.9146377

I want to reshape the data so that one row contains all of the item scores per person, i.e.,

test_id  fef31170  c1a682fe  a39c6769 ...
ebb9c7d4 0.399411  0.993890  0.658537 ...

I have tried the suggestions from several questions on this topic, but I get either errors or data frames that are missing observations.

Below is my MWE:

library(reshape2)
library(tidyverse)
###############################################################################
##### read responses from csv file                                        #####
###############################################################################
# Read the long-format responses: one row per (test_id, item_id) pair.
# stringsAsFactors=TRUE is spelled out because R >= 4.0.0 no longer converts
# strings to factors by default, and the nlevels() calls below need factors.
resps.long <- read.csv('take-home-task.csv', header=TRUE, stringsAsFactors=TRUE)
# NOTE(review): attach() is kept only because the later snippets refer to bare
# column names; prefer resps.long$col or with(resps.long, ...) in new code.
attach(resps.long)
nlevels(test_id) #  2000 persons
nlevels(item_id) # 16082 items

# Show the first rows of the data as read (the data frame is resps.long).
resps.long[1:5,]
# test_id  item_id  item_type item_grade
# 1 ebb9c7d4 fef31170  dictation  0.3994113
# 2 ebb9c7d4 c1a682fe  dictation  0.9938895
# 3 ebb9c7d4 a39c6769      ctest  0.6585366
# 4 ebb9c7d4 c23d8b15  dictation  0.0000001
# 5 ebb9c7d4 5e92d30a text_vocab  0.9146377

# Sort rows by person, then item type, then item. with() looks the columns up
# in resps.long itself rather than relying on the attached copies.
index <- with(resps.long, order(test_id, item_type, item_id))
resps.long.ordered <- resps.long[index,]

resps.long.ordered[1:5,]
# test_id  item_id   item_type item_grade
# 45542 000ed7fb 2623f3c5 audio_vocab  0.2062909
# 45544 000ed7fb ac0f9793 audio_vocab  0.0100000
# 45545 000ed7fb beba8cef audio_vocab  0.4446947
# 45543 000ed7fb f1f46e2f audio_vocab  0.4446947
# 45554 000ed7fb f7e0a5d8 audio_vocab  0.6149746


This is as far as I can get.

I have tried various suggested approaches, as shown below:

#
# Try different attempts to reshape data
#

# Attempt 1: base reshape() keyed on the score column.
# NOTE(review): timevar is the column whose values label the new wide columns;
# here it is item_grade (the score) rather than item_id, which is consistent
# with the "multiple rows match for item_grade=..." warnings recorded below.
resps.wide <- reshape(resps.long.ordered, timevar="item_grade", idvar="test_id", direction="wide")
#
# There were 50 or more warnings (use warnings() to see the first 50)
# > warnings()
# Warning messages:
#   1: In reshapeWide(data, idvar = idvar, timevar = timevar,  ... :
#       multiple rows match for item_grade=0.206290927956814: first taken
#   2: In reshapeWide(data, idvar = idvar, timevar = timevar,  ... :
#       multiple rows match for item_grade=0.01: first taken
#   3: In reshapeWide(data, idvar = idvar, timevar = timevar,  ... :
#     multiple rows match for item_grade=0.444694658188075: first taken
#

# Attempt 2: reshape2::melt() with unquoted column names.
# NOTE(review): melt() goes from wide to long, the opposite of the goal here.
# The arguments are also bare symbols, so with resps.long attached they
# evaluate to the column *values* instead of column names -- consistent with
# the error below listing ids such as ebb9c7d4.
resps.wide <- melt(data = resps.long.ordered, 
                   id.vars = test_id, 
                   variable.name = item_type,
                   measure.vars = item_grade)
#
# Error: id variables not found in data: ebb9c7d4, b8eacb47, ...
#


# Attempt 3: tidyr::spread() followed by base reshape().
# NOTE(review): mutate(item_id) appears to be a no-op. spread(test_id,
# item_grade) uses test_id as the key and item_grade as the value, so the
# result has one column per test_id and neither original column remains;
# the following reshape() then asks for columns that no longer exist, which
# matches the "subscript out of bounds" error below.
resps.wide <- resps.long.ordered %>%
  group_by(test_id) %>%
  mutate(item_id) %>%
  ungroup() %>%
  spread(test_id, item_grade)
resps.wide <- reshape(resps.wide, idvar="test_id", timevar="item_grade", direction="wide")
#
# Error in data[, timevar] : subscript out of bounds
#


# Attempt 4: reshape2::dcast() with the formula roles swapped.
# NOTE(review): item_grade ~ test_id gives one row per distinct score and one
# column per test_id, with item_id as the cell value; cells with several
# matches are aggregated with length(), as the message below reports. For one
# row per person the roles would be test_id ~ item_id with
# value.var="item_grade".
resps.wide <- dcast(resps.long.ordered, item_grade~test_id, value.var="item_id")
#
# some data are lost:
#
# Aggregation function missing: defaulting to length
# > dim(resps.wide)
# [1] 11394  2001
#

  • Perhaps `resps.long.ordered %>% spread(test_id, item_grade, -test_id)`? – Jon Spring Nov 07 '19 at 19:22
  • If you convert it to a data.table with `library(data.table); setDT(df)` you can do it this way: `dcast(df, test_id ~ item_id, value.var = 'item_grade')`. This is a standard [convert to wide](https://stackoverflow.com/questions/5890584/how-to-reshape-data-from-long-to-wide-format) (this is SymbolixAU's answer there) – IceCreamToucan Nov 07 '19 at 19:27

0 Answers0