Here is a `stringr` alternative. It uses `sapply` with `str_extract_all` to extract the individual characters of `df$digits` for each row and then calculates their mean.
library(stringr)
# Split each entry of df$digits into single characters (one vector per row),
# then average the numeric value of those characters row by row.
df$mean_digits <- sapply(
  str_extract_all(df$digits, ".{1}"),
  function(chars) mean(as.numeric(chars))
)
df
country gdp digits mean_digits
1 US 100 2657 5
2 AUS 50 123 2
3 NZ 40 11 1
Or, if you really wanted to, you could do it by using the matrix output from `str_extract_all` together with `rowMeans`. Note: for `str_extract_all`, `simplify = FALSE` is the default, so `simplify = TRUE` has to be set explicitly to get a matrix.
# One character per matrix cell, one row per entry of df$digits.
extracted_mat <- str_extract_all(df$digits, ".{1}", simplify = TRUE)
# Coerce the character matrix to numeric; cells that are not digits
# (presumably the padding of shorter rows -- see the stringr docs) become NA.
class(extracted_mat) <- "numeric"
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned; na.rm drops the NA cells before averaging each row.
df$mean_digits <- rowMeans(extracted_mat, na.rm = TRUE)
EDIT: running benchmarks on a larger scale (i.e., using @Gregor's sample suggestion).
# Packages
library(stringr)
library(gsubfn)
# Functions
# Mean of the decimal digits of every element of `x`.
#
# Args:
#   x: Vector of non-negative whole numbers (integer, double or character).
# Returns:
#   A numeric vector with one mean per element of `x` (numeric(0) for empty
#   input). Elements that are NA or contain non-digit characters yield NA.
mean_digits <- function(x) {
  digit_strings <- as.character(x)
  if (is.double(x)) {
    # as.character() switches to scientific notation for some doubles
    # (1e5 becomes "1e+05"), which would split into non-digit characters.
    # Re-render the non-missing values in fixed notation instead.
    present <- !is.na(x)
    digit_strings[present] <- format(
      x[present],
      scientific = FALSE,
      trim = TRUE
    )
  }
  # vapply() guarantees a numeric vector even when `x` is empty, where
  # sapply() would return an empty list.
  vapply(
    strsplit(digit_strings, split = "", fixed = TRUE),
    function(chars) mean(as.integer(chars)),
    numeric(1)
  )
}
# Mean of the decimal digits of a single number, computed arithmetically.
#
# Args:
#   x: A single non-negative whole number (integer or double).
# Returns:
#   The mean of the decimal digits of `x` as a double.
mnDigit <- function(x) {
  # Count digits from the fixed-notation rendering. nchar(x) alone coerces
  # with as.character(), which gives "1e+05" (5 characters) for 1e5 and
  # therefore the wrong digit count.
  n <- nchar(format(x, scientific = FALSE, trim = TRUE))
  # Powers of ten from 10^n down to 10^0.
  sq <- 10^(n:0)
  # Digit k is (x mod 10^(k + 1)) %/% 10^k; pair every power with the next
  # smaller one to pull out all digits at once.
  mean((x %% sq[-length(sq)]) %/% sq[-1])
}
# Mean of the decimal digits of a single number, using log10 to find how many
# digits there are.
#
# Args:
#   a: A single non-negative whole number.
# Returns:
#   The mean of the decimal digits of `a` as a double.
mnDigit2 <- function(a) {
  # Number of decimal digits of `a`.
  n_digits <- ceiling(log10(a + 1))
  # 10^n_digits, ..., 10^1 and the same sequence shifted one power down.
  upper <- 10^(n_digits:1)
  lower <- upper / 10
  # Remainder by the upper power, integer-divided by the lower power, isolates
  # one digit per position.
  digit_values <- (a %% upper) %/% lower
  mean(digit_values)
}
# Creating x
set.seed(1)
x <- sample(1:1e7, size = 5e5)
# Time every approach on the same vector `x`, 10 repetitions each.
microbenchmark::microbenchmark(
  mnDigit2 = sapply(x, mnDigit2),
  mnDigit = sapply(x, mnDigit),
  stringr = sapply(str_extract_all(x, ".{1}"), function(x) mean(as.numeric(x))),
  stringr_matrix = {
    extracted_mat <- str_extract_all(x, ".{1}", simplify = TRUE)
    class(extracted_mat) <- "numeric"
    rowMeans(extracted_mat, na.rm = TRUE)
  },
  strsplit = mean_digits(x),
  rowMeans = rowMeans(read.table(text = gsub("\\b", " ", x), fill = NA), na.rm = TRUE),
  # strapply = sapply(strapply(x, ".", as.numeric, simplify = TRUE), mean),
  times = 10
)
Unit: milliseconds
expr min lq mean median uq max neval cld
mnDigit2 3154.4249 3226.633 3461.847 3445.867 3612.690 3840.691 10 c
mnDigit 6403.7460 6613.345 6876.223 6736.304 6965.453 7634.197 10 d
stringr 3277.0188 3628.581 3765.786 3711.022 3808.547 4347.229 10 c
stringr_matrix 944.5599 1029.527 1136.334 1090.186 1169.633 1540.976 10 a
strsplit 3087.6628 3259.925 3500.780 3416.607 3585.573 4249.027 10 c
rowMeans 1354.5196 1449.871 1604.305 1594.297 1745.088 1828.070 10 b
# Both arithmetic implementations must agree exactly on every element of x.
identical(
  sapply(x, mnDigit2),
  sapply(x, mnDigit)
)
[1] TRUE
# The stringr/sapply version must match the arithmetic reference exactly.
identical(
  sapply(x, mnDigit2),
  sapply(str_extract_all(x, ".{1}"), function(chars) mean(as.numeric(chars)))
)
[1] TRUE
# The matrix/rowMeans version must match the arithmetic reference exactly.
identical(sapply(x, mnDigit2), {
  extracted_mat <- str_extract_all(x, ".{1}", simplify = TRUE)
  class(extracted_mat) <- "numeric"
  # TRUE rather than the reassignable alias `T`.
  rowMeans(extracted_mat, na.rm = TRUE)
})
[1] TRUE
# The strsplit-based mean_digits() must match the arithmetic reference exactly.
identical(
  sapply(x, mnDigit2),
  mean_digits(x)
)
[1] TRUE
# The read.table/rowMeans version must match the arithmetic reference exactly.
identical(
  sapply(x, mnDigit2),
  rowMeans(
    read.table(text = gsub("\\b", " ", x), fill = NA),
    na.rm = TRUE
  )
)
[1] TRUE