I am trying to save a large trained sklearn pipeline object using joblib. I would be happy to use any method for saving the model that works, but my research so far hasn't turned up any valid options. The code runs end to end and joblib saves the model perfectly on small datasets, but when I scale it up to use the full dataset, it fails with the following error:
File "/usr/local/lib/python3.7/pickle.py", line 732, in save_bytes
self._write_large_bytes(BINBYTES + pack("<I", n), obj)
struct.error: 'I' format requires 0 <= number <= 4294967295
This has been difficult to test because the model takes 2 days to train so I created a synthetic object using the following code, all joblib dumps work fine. This code creates a 25gb object in memory and using vanilla joblib dumps a 6gb file.
import sys
sys.stdout.flush()
import joblib
import pickle
import time
test = ['A'*1024 for _ in range(0, 1024*1024*1024*3)]
print(sys.getsizeof(test))
try:
start = time.time()
joblib.dump(test,'joblib_test1.joblib')
print("joblib: " + str(time.time()-start))
except:
pass
try:
start = time.time()
joblib.dump(test,'joblib_test2.joblib',protocol=4,compress=3)
print("joblib protocol 4 compress 3: " + str(time.time()-start))
except:
pass
try:
start = time.time()
joblib.dump(test,'joblib_test3.joblib',protocol=4,compress=9)
print("joblib protocol 3 compress 9: " + str(time.time()-start))
except:
pass
try:
start = time.time()
joblib.dump(test, 'joblib_test4.joblib.xz') # xz
print("joblib xz: " + str(time.time()-start))
except:
pass
Here is a snippet of the model training code for reference:
pipeline = Pipeline([
# Use FeatureUnion to combine the features from subject and body
('union', FeatureUnion( #Feature union merges text, numeric, and categorical data for model ingestion
transformer_list=[
('categorical',Pipeline([ # creates one hot encoded categorical variables
('selector', ItemSelector(key=['region'])),
('onehotencoder',encoder)
])),
('numerical',Pipeline([ # selects numeric features
('selector', ItemSelector(key=['m_num','a_num','c_num']))
])),
('bow', Pipeline([ # creates bag of words
('selector', ItemSelector(key='message')),
('clean',CleanText()),
('tfidf', tfidf_vectorizer)
])),
# Pipeline for pulling ad hoc features from text
('text_stats', Pipeline([ # calculates stats from text
('selector', ItemSelector(key='message')),
('stats', TextStats()), # returns a list of dicts
('vect', DictVectorizer()) # list of dicts -> feature matrix
]))
]
)),
('oversampler',smt), # Smote: oversample under-represented classes
('svd',svd), # Truncated SVD. Used for dimensionality reduction
('model', classifier) # Modeling step
],verbose=1)
pipeline.fit(X_train,np.ravel(y_train))
joblib.dump(pipeline,filename,compress=9)
Full stack trace for reference:
Traceback (most recent call last):
File "training/3_Train_Multiclass.py", line 427, in <module>
joblib.dump(pipeline,filename,compress=9)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 502, in dump
NumpyPickler(f, protocol=protocol).dump(value)
File "/usr/local/lib/python3.7/pickle.py", line 437, in dump
self.save(obj)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 549, in save
self.save_reduce(obj=obj, *rv)
File "/usr/local/lib/python3.7/pickle.py", line 662, in save_reduce
save(state)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 859, in save_dict
self._batch_setitems(obj.items())
File "/usr/local/lib/python3.7/pickle.py", line 885, in _batch_setitems
save(v)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 819, in save_list
self._batch_appends(obj)
File "/usr/local/lib/python3.7/pickle.py", line 843, in _batch_appends
save(x)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 774, in save_tuple
save(element)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 549, in save
self.save_reduce(obj=obj, *rv)
File "/usr/local/lib/python3.7/pickle.py", line 662, in save_reduce
save(state)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 859, in save_dict
self._batch_setitems(obj.items())
File "/usr/local/lib/python3.7/pickle.py", line 885, in _batch_setitems
save(v)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 549, in save
self.save_reduce(obj=obj, *rv)
File "/usr/local/lib/python3.7/pickle.py", line 662, in save_reduce
save(state)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 859, in save_dict
self._batch_setitems(obj.items())
File "/usr/local/lib/python3.7/pickle.py", line 885, in _batch_setitems
save(v)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 549, in save
self.save_reduce(obj=obj, *rv)
File "/usr/local/lib/python3.7/pickle.py", line 638, in save_reduce
save(args)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 774, in save_tuple
save(element)
File "/usr/local/lib/python3.7/site-packages/joblib/numpy_pickle.py", line 295, in save
return Pickler.save(self, obj)
File "/usr/local/lib/python3.7/pickle.py", line 504, in save
f(self, obj) # Call unbound method with explicit self
File "/usr/local/lib/python3.7/pickle.py", line 732, in save_bytes
self._write_large_bytes(BINBYTES + pack("<I", n), obj)
struct.error: 'I' format requires 0 <= number <= 4294967295