I have item score data in a long format, where one row represents one person's score on a specific item, as shown below:
# test_id item_id item_type item_grade
# 1 ebb9c7d4 fef31170 dictation 0.3994113
# 2 ebb9c7d4 c1a682fe dictation 0.9938895
# 3 ebb9c7d4 a39c6769 ctest 0.6585366
# 4 ebb9c7d4 c23d8b15 dictation 0.0000001
# 5 ebb9c7d4 5e92d30a text_vocab 0.9146377
I want to reshape the data so that one row contains all of the item scores per person, i.e.,
test_id fef31170 c1a682fe a39c6769 ...
ebb9c7d4 0.399411 0.993890 0.658537 ...
I have tried the suggestions for several questions on this topic, but get either errors or data frames that are missing observations.
Below is my MWE:
library(reshape2)
library(tidyverse)
###############################################################################
##### read responses from csv file #####
###############################################################################
resps.long <- read.csv('take-home-task.csv', header=TRUE)
attach(resps.long)
nlevels(test_id) # 2000 persons
nlevels(item_id) # 16082 items
resps[1:5,]
# test_id item_id item_type item_grade
# 1 ebb9c7d4 fef31170 dictation 0.3994113
# 2 ebb9c7d4 c1a682fe dictation 0.9938895
# 3 ebb9c7d4 a39c6769 ctest 0.6585366
# 4 ebb9c7d4 c23d8b15 dictation 0.0000001
# 5 ebb9c7d4 5e92d30a text_vocab 0.9146377
index <- order(test_id, item_type, item_id)
resps.long.ordered <- resps.long[index,]
resps.long.ordered[1:5,]
# test_id item_id item_type item_grade
# 45542 000ed7fb 2623f3c5 audio_vocab 0.2062909
# 45544 000ed7fb ac0f9793 audio_vocab 0.0100000
# 45545 000ed7fb beba8cef audio_vocab 0.4446947
# 45543 000ed7fb f1f46e2f audio_vocab 0.4446947
# 45554 000ed7fb f7e0a5d8 audio_vocab 0.6149746
This is as far as I can get.
I have tried various suggested approaches, as shown below:
#
# Try different attempts to reshape data
#
resps.wide <- reshape(resps.long.ordered, timevar="item_grade", idvar="test_id", direction="wide")
#
# There were 50 or more warnings (use warnings() to see the first 50)
# > warnings()
# Warning messages:
# 1: In reshapeWide(data, idvar = idvar, timevar = timevar, ... :
# multiple rows match for item_grade=0.206290927956814: first taken
# 2: In reshapeWide(data, idvar = idvar, timevar = timevar, ... :
# multiple rows match for item_grade=0.01: first taken
# 3: In reshapeWide(data, idvar = idvar, timevar = timevar, ... :
# multiple rows match for item_grade=0.444694658188075: first taken
#
resps.wide <- melt(data = resps.long.ordered,
id.vars = test_id,
variable.name = item_type,
measure.vars = item_grade)
#
# Error: id variables not found in data: ebb9c7d4, b8eacb47, ...
#
resps.wide <- resps.long.ordered %>%
group_by(test_id) %>%
mutate(item_id) %>%
ungroup() %>%
spread(test_id, item_grade)
resps.wide <- reshape(resps.wide, idvar="test_id", timevar="item_grade", direction="wide")
#
# Error in data[, timevar] : subscript out of bounds
#
resps.wide <- dcast(resps.long.ordered, item_grade~test_id, value.var="item_id")
#
# some data are lost:
#
# Aggregation function missing: defaulting to length
# > dim(resps.wide)
# [1] 11394 2001
#