tl;dr
This answer contains a fast, non-regex approach and also some benchmarking. The non-regex approach is slightly faster but the regex approach uses less memory. `parse_date_time()`
is considerably slower (roughly 5–13 times in the benchmarks below), taking over 3 minutes for the largest data frame, where the regex `fasttime::fastDate()`
approach took 33 seconds and the non-regex version 21 seconds.
The facet labels in the plots below are the number of times the sample vector is repeated, so e.g. 10000 means a table of 30000 rows.

A non-regex approach
My suspicion is that a non-regex approach is always faster so I tried a non-regex fasttime::fastDate()
method:
# Parse a character vector `dates` that mixes "mm/dd/yyyy", "mm-dd-yyyy" and
# "yyyy-mm-dd" strings without using regular expressions. The third character
# identifies the layout: it is the separator in the two month-first layouts
# and a digit of the year in the year-first layout.
# NOTE(review): this assumes two-digit months in the month-first layouts;
# a string such as "1/2/2020" has a digit in position three -- confirm the
# input never contains those.
DT <- as.data.table(dates)
DT[, third_char := substr(dates, 3, 3)]
DT[, `:=`(
  actualDate = fcase(
    third_char == "/", as.Date(dates, format = "%m/%d/%Y"),
    third_char == "-", as.Date(dates, format = "%m-%d-%Y"),
    # Match every digit, including "0": the third character of years such as
    # 2005 or 1903 is "0", and leaving it out would silently yield NA.
    third_char %in% as.character(0:9), fastDate(dates)
  ),
  # Drop the helper column in the same call that creates the result.
  third_char = NULL
)]
Benchmarking
I decided to benchmark the three approaches here:
- Wimpel's regular expression and
fasttime::fastDate()
method.
- My approach which also uses
fasttime::fastDate()
but avoids regex.
- The
lubridate
approach suggested by @user2974951 and modified by TarJae in comments. I have also converted the output into Date
format rather than POSIXct
so all output is identical.
Results
As well as the plot, I include the output below. Note there are very few iterations (sometimes only one) as the data gets larger, as each one takes a long time. Possibly the timing would change if more iterations were run. If anyone wants to spend some time replicating this, I include the code to do so below.
# A tibble: 12 x 14
# expression size min median `itr/sec` mem_alloc `gc/sec` n_itr n_gc total_time
# <bch:expr> <dbl> <bch:tm> <bch:tm> <dbl> <bch:byt> <dbl> <int> <dbl> <bch:tm>
# 1 regex 10 31ms 33.05ms 29.9 1.19MB 2.30 13 1 434.47ms
# 2 no_regex 10 12.78ms 13.8ms 69.1 6.58MB 9.53 29 4 419.91ms
# 3 parse_date_time 10 165.29ms 182.68ms 5.47 17.04MB 2.74 2 1 365.36ms
# 4 regex 100 364.25ms 366.44ms 2.73 10.74MB 0 2 0 732.89ms
# 5 no_regex 100 142.79ms 145.15ms 5.32 65.28MB 10.6 3 6 564.26ms
# 6 parse_date_time 100 1.95s 1.95s 0.514 195.77MB 2.06 1 4 1.95s
# 7 regex 1000 3.4s 3.4s 0.294 106.88MB 0 1 0 3.4s
# 8 no_regex 1000 1.62s 1.62s 0.619 652.36MB 0.619 1 1 1.62s
# 9 parse_date_time 1000 19.02s 19.02s 0.0526 1.93GB 0.473 1 9 19.02s
# 10 regex 10000 33.94s 33.94s 0.0295 1.04GB 0 1 0 33.94s
# 11 no_regex 10000 21.91s 21.91s 0.0456 6.37GB 0.137 1 3 21.91s
# 12 parse_date_time 10000 3.28m 3.28m 0.00508 15.67GB 0.0305 1 6 3.28m
Code to run the benchmark
# Benchmark the three parsing strategies for every value in `sizes`.
# `sizes` and the sample vector `dates` must already be defined; each run
# repeats the sample vector `size` times before timing.
results <- bench::press(
  size = sizes,
  {
    # Keep the name `dates`: the data.table expressions below refer to it.
    # NOTE(review): this shadows the outer `dates`; assumes bench::press
    # evaluates each parameter combination in its own scope so the
    # repetition does not compound across sizes -- confirm.
    dates <- rep(dates, size)
    # One table per expression so the approaches do not modify each other's
    # data by reference.
    DT1 <- as.data.table(dates)
    DT2 <- as.data.table(dates)
    DT3 <- as.data.table(dates)
    bench::mark(
      min_iterations = 1,
      max_iterations = 100,
      # Outputs are not compared between expressions.
      check = FALSE,
      regex = {
        # replace all "/" characters with "-"
        DT1[, dates2 := gsub("/", "-", dates)][]
        # reorder strings in the mm-dd-yyyy format to yyyy-mm-dd
        DT1[
          grepl("^[0-9]{1,2}.[0-9]{1,2}.[0-9]{4}$", dates2),
          dates2 := gsub("^([0-9]{1,2}).([0-9]{1,2}).([0-9]{4})$", "\\3-\\1-\\2", dates2)
        ][]
        # use fasttime::fastDate to convert, dropping the helper column
        DT1[, `:=`(
          dates2 = NULL,
          actualDate = fasttime::fastDate(dates2)
        )][]
      },
      no_regex = {
        # The third character is the separator in the month-first layouts
        # and a digit of the year in the year-first layout.
        DT2[, third_char := substr(dates, 3, 3)]
        DT2[, `:=`(
          actualDate = fcase(
            third_char == "/", as.Date(dates, format = "%m/%d/%Y"),
            third_char == "-", as.Date(dates, format = "%m-%d-%Y"),
            # Match every digit, including "0": the third character of
            # years such as 2005 is "0" and would otherwise become NA.
            third_char %in% as.character(0:9), fastDate(dates)
          ),
          third_char = NULL
        )]
      },
      parse_date_time = {
        # Wrapped in fastDate() so this approach also stores a Date column.
        DT3[, actualDate :=
          fastDate(parse_date_time(
            dates,
            orders = c("ymd", "mdy")
          ))]
      }
    )
  }
)
Code to generate the plot
library(ggplot2)
library(dplyr)
# Plot the benchmark `results`: one point range per expression (median with
# min/max of the recorded times), one facet per input size, coloured by a
# garbage-collection label derived from each row's `gc` entry.
results |>
  rowwise() |>
  # Slowest single iteration for this row.
  mutate(max = max(unlist(time))) |>
  ungroup() |>
  transmute(
    expression = attr(expression, "description"),
    size,
    median,
    min,
    max,
    # Label each row with the name of the last `gc` column that has a
    # non-zero total, or "none" when every column sums to zero. The explicit
    # empty check avoids the warning max() raises on a zero-length index,
    # and vapply() guarantees a character result.
    gc = vapply(
      gc,
      \(x) {
        triggered <- which(colSums(x) > 0)
        if (length(triggered) == 0L) "none" else names(x)[max(triggered)]
      },
      character(1)
    )
  ) |>
  ggplot(aes(group = expression)) +
  geom_pointrange(
    aes(
      x = expression,
      y = median,
      ymin = min,
      ymax = max,
      color = gc
    )
  ) +
  facet_wrap(
    vars(size),
    scales = "free_y"
  ) +
  theme_bw(base_size = 16) +
  # Always include zero so bar heights are comparable within a facet.
  expand_limits(y = 0)