UPDATE 6/16:
Added the train-test split function that calls the validating function. Added a parameter to set alpha separately for your target variable(s) in case you want a higher standard. Added library calls.
Here's a GitHub repo about it: https://github.com/KalebCoberly/train_test_split_R
END UPDATE
I'd like to do something similar to this post about Pandas dataframes but in R, and ideally regardless of data types (e.g. a data frame with both factors and numeric columns).
I want to get a random sample of an R data frame in which each variable is relatively representative of the population.
I have seen ways to create a stratified sample based on a single variable, but I want to ensure representation across multiple columns, and not just factors.
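For reference, the single-variable approach I've seen looks roughly like this (a dplyr-based sketch with made-up data and a made-up column name, not part of my actual problem):
library(dplyr)
# Made-up data with a single factor column to stratify on.
df_example = data.frame(grp = factor(rep(c('a', 'b', 'c'), each = 100)),
                        x = rnorm(300))
# 30% test sample, stratified on the one factor column 'grp'.
test_set = df_example %>%
  group_by(grp) %>%
  slice_sample(prop = 0.3) %>%
  ungroup()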
I wrote a simple algorithm to approach this across numeric variables, using the Wilcoxon rank-sum test on each variable. If all numeric columns in the sample (test set) seem to come from the same population as the numeric columns in the remaining set (train set), you've got a fairly representative sample. You take a random sample, validate it with the function below, and resample and validate until you get a sample that meets your minimum representativity (as measured by alpha) across all variables.
In this case, alpha represents the risk of incorrectly rejecting the null hypothesis (H0: the samples do not come from significantly different populations, i.e. they are representative of the same population). Because we want to fail to reject the null hypothesis, we want a p-value greater than alpha rather than less than it, and we want as high an alpha as we can muster.
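In other words, the per-column acceptance rule boils down to something like this (a minimal sketch with made-up vectors and a made-up alpha; the functions below apply it to every numeric column and resample until every column passes):
# Minimal sketch of the per-column check (train_col, test_col, alpha are made up).
set.seed(42)
train_col = rnorm(700)
test_col = rnorm(300)
alpha = .5
p_val = wilcox.test(x = train_col, y = test_col)$p.value
representative = p_val > alpha # keep the split only if we fail to reject H0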
library(tidyverse)
train_test_split = function(df, y_cols, id_cols, feats_lst, test_size = .3,
alpha = .5, target_alpha = .9, validate = TRUE) {
# Splits df into train/test sets and input/target (X/y) sets.
# (Must have id_col, but can be "dummy" since it's discarded for index.)
# Parameters:
# df: (data.frame) Full data set, including target variable(s).
# y_cols: (c(character)) Target column(s).
# id_cols: (c(character)) Id column(s) to drop, because df maintains index.
# feats_lst: (c(character)) Feature column(s) to validate; only numeric
# features are actually tested.
# test_size: (numeric) Proportion of rows to use for test set.
# (Does not validate.)
# alpha: (numeric) Probability of incorrectly rejecting the null hypothesis.
# H0 = feature n of train and of test do not represent different sets.
# (i.e. representative split)
# H1 = feature n of train and of test represent different supersets.
# target_alpha: (numeric) Alpha to use if feature is target feature (i.e.
# if feature is in y_cols).
# validate: (bool) Should set split be validated?
# Return:
# split_lst: (list(data.frame)) (train_X, train_y, test_X, test_y)
# train_X (data.frame) Input features in training subset.
# train_y (data.frame) Target variable in training subset.
# test_X (data.frame) Input features in testing subset.
# test_y (data.frame) Target variable in testing subset.
split_lst = list(
'train_X' = data.frame(),
'train_y' = data.frame(),
'test_X' = data.frame(),
'test_y' = data.frame()
)
full_set_len = nrow(df)
test_set_len = as.integer(test_size * full_set_len)
###
### TO DO: Add a parameter and logic to choose whether to track this. ###
###
# To track average p-values of features:
feats_p_av_lst = vector(mode = 'list', length = length(feats_lst))
names(feats_p_av_lst) = feats_lst
# Split and validate until valid.
valid_split = FALSE
while (!valid_split) {
# Split randomly.
test_idx = sample(x = full_set_len, size = test_set_len)
split_lst$train_X = select(df[-test_idx, ], -all_of(y_cols))
split_lst$train_y = select(df[-test_idx, ], all_of(y_cols))
split_lst$train_y[id_cols] = split_lst$train_X[id_cols]
split_lst$test_X = select(df[test_idx, ], -all_of(y_cols))
split_lst$test_y = select(df[test_idx, ], all_of(y_cols))
split_lst$test_y[id_cols] = split_lst$test_X[id_cols]
# Validate the split.
if (validate) {
# Randomize test order to "cost-average" compute.
feats_lst = sample(feats_lst)
# Test X and y separately to avoid the join compute and data copies.
X_validation_results = validate_split(
train = split_lst$train_X,
test = split_lst$test_X,
feats_lst = feats_lst,
y_cols = y_cols,
feats_p_val_lst = feats_p_av_lst,
alpha = alpha,
target_alpha = target_alpha
)
feats_p_av_lst = X_validation_results$p_vals
if (X_validation_results$valid){
y_validation_results = validate_split(
train = split_lst$train_y,
test = split_lst$test_y,
feats_lst = feats_lst,
y_cols = y_cols,
feats_p_val_lst = feats_p_av_lst,
alpha = alpha,
target_alpha = target_alpha
)
feats_p_av_lst = y_validation_results$p_vals
if (y_validation_results$valid) {
valid_split = TRUE
} # else { print("Invalid y split. Resampling.") }
} # else { print("Invalid X split. Resampling.") }
} else {valid_split = TRUE}
}
if (validate) {
for(feat in names(feats_p_av_lst)) {
feats_p_av_lst[[feat]] = mean(feats_p_av_lst[[feat]])
}
print('Average p-values:')
print(feats_p_av_lst)
}
return(split_lst)
}
validate_split = function(train, test, feats_lst, y_cols, feats_p_val_lst,
alpha = .5, target_alpha = .9) {
# Conducts a Wilcoxon rank-sum test column by column to test whether train and
# test represent the same superset (i.e., is the split stratified on every
# feature?). Both train and test should have the same features. There should
# be at least one numeric (i.e. continuous) feature, as the test will only
# be performed on these columns -- this does limit the test.
# Parameters:
# train: (data.frame) A subset of original set to compare to the other
# subset, test.
# test: (data.frame) A subset of original set to compare to the other
# subset, train.
# feats_lst: (list(character)) List of features to test.
# y_cols: (c(character)) Vector of target features.
# feats_p_val_lst: (list(character:list(double))) Named list of p-values used
# to track which features are hardest to stratify.
# alpha: (numeric) Probability of incorrectly rejecting the null hypothesis.
# H0 = feature n of train and test does not represent different sets.
# (i.e. representative split)
# H1 = feature n of train and test represents a different superset.
# target_alpha: (numeric) Alpha to use if feature is target feature (i.e.
# if feature is in y_cols).
# Return:
# list(valid: (bool), p_vals: (list(character:list(double))))
# valid: (bool) Are the sets representative of the same superset?
# p_vals: (list(character:list(double))) feats_p_val_lst, updated.
valid = TRUE
for (feat in feats_lst) {
if (valid & feat %in% colnames(train) & feat %in% colnames(test)) {
this_alpha = alpha
if (feat %in% y_cols) {
this_alpha = target_alpha
}
results = wilcox.test(
x = as.double(train[[feat]]),
y = as.double(test[[feat]])
)
if (!(results$p.value > this_alpha)) {
# print("Reject null hypothesis that split is not unrepresentative:")
valid = FALSE
}
# print(feat)
# print(results$p.value)
feats_p_val_lst[[feat]] = c(feats_p_val_lst[[feat]], results$p.value)
}
}
return(list('valid' = valid, 'p_vals' = feats_p_val_lst))
}
Test it on dummy data:
sample_df = data.frame(
list(
'Id' = c(1:1000),
'y' = as.double(sample(1:1000, size = 1000)),
'a' = as.double(sample(1:2000, size = 1000)),
'b' = as.double(sample(1:3000, size = 1000))
)
)
y_cols = c('y')
id_cols = c('Id')
feats_lst = colnames(select(sample_df, where(is.double)))
split_lst = train_test_split(
df = sample_df,
y_cols = y_cols,
id_cols = id_cols,
feats_lst = feats_lst
)
# > names(split_lst)
# [1] "train_X" "train_y" "test_X" "test_y"
# You can call validate_split again on your found split to
# get your final p-values for each feature.
feats_p_val_lst = vector(mode = 'list', length = length(feats_lst))
names(feats_p_val_lst) = feats_lst
validate_split_lst = validate_split(
train = split_lst$train_X,
test = split_lst$test_X,
feats_lst = feats_lst,
y_cols = y_cols,
feats_p_val_lst = feats_p_val_lst
)
validate_split_lst = validate_split(
train = split_lst$train_y,
test = split_lst$test_y,
feats_lst = feats_lst,
y_cols = y_cols,
feats_p_val_lst = validate_split_lst$p_vals
)
# > validate_split_lst$p_vals
# A named list of all your features with their collected p-values.
# > validate_split_lst$valid
# [1] TRUE
Again, this leaves out factors and integers entirely, unless you cast them as doubles, but that would violate the Wilcoxon assumption that the data is continuous.
Given that my current dataset contains about 80 variables, almost half of which are doubles, this suffices because the factors are probably pretty representative if all the doubles are.
But it takes forever to run and reach even p > .5 (i.e., to fail to reject the null hypothesis that these data sets are not from different populations, i.e. that they are not unrepresentative). And what about a data set with all or most of its variables as factors or integers?
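Maybe something like a chi-squared test on the level counts could play the same role for factor columns, but I haven't implemented that above and I'm not sure it's statistically sound. A rough sketch with made-up factors:
# Sketch: compare the distribution of one factor column across train and test.
# train_fct and test_fct are made-up stand-ins for a factor column's two halves.
set.seed(42)
lvls = c('a', 'b', 'c')
train_fct = factor(sample(lvls, size = 700, replace = TRUE), levels = lvls)
test_fct = factor(sample(lvls, size = 300, replace = TRUE), levels = lvls)
counts = rbind(train = table(train_fct), test = table(test_fct))
p_val = chisq.test(counts)$p.value # same logic: keep the split if p_val > alpha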
Is there a better way, either from a mathematical/statistical perspective or from an R/programming perspective (or both)? Also, is this somehow problematic for machine learning? I'd like to think it would improve the generalizability of the trained/tuned model and reduce the chance of overfitting, but does it somehow create leakage or some other problem?