I tried to use `train_test_split` according to the docs and couldn't get
the `stratify` parameter to work.
Setup
from sklearn.cross_validation import train_test_split
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so the sample data below is reproducible.
np.random.seed([3,1415])
# Category probabilities: [1, 2, 3, 4] / 10 -> 0.1, 0.2, 0.3, 0.4.
p = np.arange(1, 5.) / np.arange(1, 5.).sum()
# 1000 rows: 'category' drawn with probabilities p, 'x' from np.random.rand,
# 'y' chosen from range(2).
df = pd.DataFrame({'category': np.random.choice(('cat1', 'cat2', 'cat3', 'cat4'), (1000,), p=p),
'x': np.random.rand(1000), 'y': np.random.choice(range(2), (1000,))})
def get_freq(s):
    """Return each distinct value's count in ``s`` divided by ``len(s)``."""
    counts = s.value_counts()
    total = len(s)
    return counts / total
print get_freq(df.category)
cat4 0.400
cat3 0.284
cat2 0.208
cat1 0.108
Name: category, dtype: float64
If I try any of the following:
# The same split requested three ways; only the object passed as `stratify` differs.
# 1) the DataFrame column itself
train, test = train_test_split(df, stratify=df.category, test_size=.5)
# 2) the column's `.values`
train, test = train_test_split(df, stratify=df.category.values, test_size=.5)
# 3) the column converted to a plain Python list
train, test = train_test_split(df, stratify=df.category.values.tolist(), test_size=.5)
All of them raised:
TypeError: Invalid parameters passed:
The docs say:
stratify : array-like or None (default is None)
I can't think why this wouldn't work.
I decided to build a workaround:
def stratify_train_test(df, stratifyby, *args, **kwargs):
    """Split ``df`` into train and test parts separately within each group.

    ``df`` is grouped by ``stratifyby`` and ``train_test_split`` is applied
    to every group on its own; the per-group pieces are then stacked back
    into one train frame and one test frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    stratifyby : any key accepted by ``DataFrame.groupby``
        Grouping key, e.g. a column name or a Series aligned with ``df``.
    *args, **kwargs
        Forwarded unchanged to ``train_test_split`` for every group
        (for example ``test_size=.5``).

    Returns
    -------
    tuple
        ``(train, test)``. Both are empty DataFrames when the grouping
        yields no groups.
    """
    train_parts, test_parts = [], []
    # Iterate the groupby directly instead of looking each key up again
    # with get_group(); groups are visited in the groupby's own order.
    for _, group in df.groupby(stratifyby):
        train_part, test_part = train_test_split(group, *args, **kwargs)
        train_parts.append(train_part)
        test_parts.append(test_part)

    if not train_parts:
        # pd.concat raises on an empty list; keep returning empty frames
        # when there is nothing to split.
        return pd.DataFrame(), pd.DataFrame()

    # Concatenate once at the end: concatenating inside the loop re-copies
    # everything accumulated so far on every iteration.
    return pd.concat(train_parts), pd.concat(test_parts)
train, test = stratify_train_test(df, 'category', test_size=.5)
# this also works
# train, test = stratify_train_test(df, df.category, test_size=.5)
print get_freq(train.category)
print len(train)
Name: category, dtype: float64
cat4 0.400
cat3 0.284
cat2 0.208
cat1 0.108
Name: category, dtype: float64
500
print get_freq(test.category)
print len(test)
cat4 0.400
cat3 0.284
cat2 0.208
cat1 0.108
Name: category, dtype: float64
500