I'm submitting a training job via the REST API. The process is able to train, but when it gets to the saving part it fails with the error: "The replica master 0 exited with a non-zero status of 1."
error. I've checked my IAM Permissions for the service account, and it has the following permissions:
- Logs Writer
- ML Engine Admin
- Storage Admin
- Storage Object Admin
Here's a more in-depth traceback of the actual error.
Traceback (most recent call last):
File "/usr/lib/python3.5/runpy.py", line 184, in _run_module_as_main "__main__", mod_spec)
File "/usr/lib/python3.5/runpy.py", line 85, in _run_code exec(code, run_globals)
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 223, in <module> dispatch(**parse_args.__dict__)
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 133, in dispatch callbacks=callbacks)
File "/root/.local/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 88, in wrapper return func(*args, **kwargs)
File "/root/.local/lib/python3.5/site-packages/keras/models.py", line 1110, in fit_generator initial_epoch=initial_epoch)
File "/root/.local/lib/python3.5/site-packages/keras/legacy/interfaces.py", line 88, in wrapper return func(*args, **kwargs)
File "/root/.local/lib/python3.5/site-packages/keras/engine/training.py", line 1849, in fit_generator callbacks.on_epoch_begin(epoch)
File "/root/.local/lib/python3.5/site-packages/keras/callbacks.py", line 63, in on_epoch_begin callback.on_epoch_begin(epoch, logs)
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 74, in on_epoch_begin copy_file_to_gcs(self.job_dir, checkpoints[-1])
File "/root/.local/lib/python3.5/site-packages/trainer/task.py", line 150, in copy_file_to_gcs output_f.write(input_f.read())
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/lib/io/file_io.py", line 126, in read pywrap_tensorflow.ReadFromStream(self._read_buf, length, status)) File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/lib/io/file_io.py", line 94, in _prepare_value return compat.as_str_any(val)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 106, in as_str_any return as_str(value)
File "/usr/local/lib/python3.5/dist-packages/tensorflow/python/util/compat.py", line 84, in as_text return bytes_or_text.decode(encoding) UnicodeDecodeError: 'utf-8' codec can't decode byte 0x89 in position 0: invalid start byte
I'm not entirely sure why this is happening. The code is taken from the example projects on Google's GitHub page. Nothing has been changed. Here's my REST call:
{
"jobId": "training_20",
"trainingInput": {
"scaleTier": "BASIC",
"packageUris": ["gs://MY_BUCKET/census.tar.gz"],
"pythonModule": "trainer.task",
"args": [
"--train-files",
"gs://MY_BUCKET/adult.data.csv",
"--eval-files",
"gs://MY_BUCKET/adult.test.csv",
"--job-dir",
"gs://MY_BUCKET/models",
"--train-steps",
"100",
"--eval-steps",
"10"],
"region": "europe-west1",
"jobDir": "gs://MY_BUCKET/models",
"runtimeVersion": "1.4",
"pythonVersion": "3.5"
}
}
This is the saving code part:
# Unhappy hack to work around h5py not being able to write to GCS.
# Force snapshots and saves to local filesystem, then copy them over to GCS.
if job_dir.startswith("gs://"):
    # GCS destination: h5py cannot write to gs:// paths, so save the model
    # to the local working directory first, then upload it to job_dir.
    census_model.save(CENSUS_MODEL)
    copy_file_to_gcs(job_dir, CENSUS_MODEL)
else:
    # Local destination: h5py can write there directly.
    census_model.save(os.path.join(job_dir, CENSUS_MODEL))

# Convert the Keras model to a TensorFlow SavedModel under <job_dir>/export.
# NOTE(review): `model` here appears to be the trainer's model module (as in
# the census sample), not the Keras model object — confirm against imports.
model.to_savedmodel(census_model, os.path.join(job_dir, 'export'))
# h5py workaround: copy local models over to GCS if the job_dir is GCS.
def copy_file_to_gcs(job_dir, file_path):
with file_io.FileIO(file_path, mode='r') as input_f:
with file_io.FileIO(os.path.join(job_dir, file_path), mode='w+') as output_f:
output_f.write(input_f.read())