To reproduce
I am running the MAML meta-learning algorithm (via the higher library) with a ResNet, and my script fails with the error pasted below. Is Adafactor not supposed to work with ResNets or other non-Transformer models?
Steps to reproduce the behavior:
- Run this code: https://github.com/brando90/higher/blob/master/examples/maml-omniglot.py (it already uses Adafactor).
- If that works, uncomment the resnet12 line and ping me, please. A minimal standalone repro is also sketched below.
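For convenience, here is a minimal standalone sketch of what I believe triggers the same error (an assumption on my part, not the linked script itself): a single 4-D conv weight handed to transformers' Adafactor.

```python
# Minimal repro sketch (assumption: any transformers version whose Adafactor
# still recombines the factored second moment with torch.mm). A lone Conv2d
# gives a 4-D weight, which seems to be all it takes.
import torch
import torch.nn as nn
from transformers.optimization import Adafactor

model = nn.Conv2d(3, 64, kernel_size=3, padding=1, bias=False)
opt = Adafactor(model.parameters(), lr=None, relative_step=True,
                scale_parameter=True, warmup_init=True)

loss = model(torch.randn(2, 3, 32, 32)).sum()
loss.backward()
opt.step()  # RuntimeError: mat1 must be a matrix, got 4-D tensor
```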
Expected behavior
I expect training to proceed smoothly, but instead I get:
--------------------- META-TRAIN ------------------------
Starting training!
Traceback (most recent call last):
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 441, in <module>
main_resume_from_checkpoint(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 403, in main_resume_from_checkpoint
run_training(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/experiments/meta_learning/main_metalearning.py", line 413, in run_training
meta_train_fixed_iterations(args)
File "/home/miranda9/automl-meta-learning/automl-proj-src/meta_learning/training/meta_training.py", line 233, in meta_train_fixed_iterations
args.outer_opt.step()
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/torch/optim/optimizer.py", line 88, in wrapper
return func(*args, **kwargs)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/transformers/optimization.py", line 577, in step
update = self._approx_sq_grad(exp_avg_sq_row, exp_avg_sq_col)
File "/home/miranda9/miniconda3/envs/metalearning_gpu/lib/python3.9/site-packages/transformers/optimization.py", line 508, in _approx_sq_grad
return torch.mm(r_factor.unsqueeze(-1), c_factor.unsqueeze(0))
RuntimeError: mat1 must be a matrix, got 4-D tensor
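As far as I can tell, the failing call is Adafactor's factored second-moment reconstruction: it keeps a row statistic and a column statistic of the squared gradients and recombines them with torch.mm, which assumes the parameter is a 2-D matrix. Conv kernels are 4-D, so the unsqueezed factors are no longer matrices. A broadcast-based recombination along the lines of what the linked transformers issue discusses (a sketch, not the exact library code) would handle n-D tensors:

```python
import torch

def approx_sq_grad(exp_avg_sq_row: torch.Tensor, exp_avg_sq_col: torch.Tensor) -> torch.Tensor:
    # Normalize the row statistic, take reciprocal square roots, and recombine
    # the factors by broadcasting instead of torch.mm, so the result keeps the
    # parameter's original (possibly >2-D) shape.
    r_factor = (exp_avg_sq_row / exp_avg_sq_row.mean(dim=-1, keepdim=True)).rsqrt().unsqueeze(-1)
    c_factor = exp_avg_sq_col.unsqueeze(-2).rsqrt()
    return torch.mul(r_factor, c_factor)
```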
Full error output:
('PID', '25721')
('always_use_deterministic_algorithms', False)
('args_hardcoded_in_script', False)
('base_model_mode', 'resnet12_rsf')
('best_val_loss', inf)
('condor_jobid', -1)
('copy_initial_weights', False)
('current_logs_path', '/home/miranda9/data/logs/logs_Nov05_15-44-03_jobid_668')
('current_time', 'Nov30_08-42-53')
('data_path', 'miniimagenet')
('debug', False)
('debug_test', False)
('device', device(type='cuda'))
('epoch_num', -1)
('eval_iters', 2)
('experiment_name', 'debug')
('fo', False)
('force_log', True)
('githash', '9af491c')
('githash_long', '9af491ccd13fa88f4d07287f54305488ba4967fc')
('githash_short', '9af491c')
('gpu_name', 'NVIDIA GeForce GTX TITAN X')
('grad_clip_mode', None)
('grad_clip_rate', None)
('hostname', 'vision-02.cs.illinois.edu')
('inner_debug_eval', False)
('inner_debug_train', False)
('inner_lr', 0.1)
('it', 0)
('jobid', 10340)
('k_eval', 15)
('k_shots', 5)
('log_root', PosixPath('/home/miranda9/data/logs/logs_Nov30_08-42-53_jobid_10340'))
('log_to_wandb', True)
('log_train_freq', 200)
('log_val_freq', 200)
('logger', <uutils.logger.Logger object at 0x2b832f5eff70>)
('logging', True)
('mail_user', 'brando.science@gmail.com')
('master_port', '37126')
('meta_batch_size_eval', 2)
('meta_batch_size_train', 2)
('meta_learner', 'maml_fixed_inner_lr')
('metrics_as_dist', False)
('my_stdout_filepath', '/home/miranda9/data/logs/logs_Nov05_15-44-03_jobid_668/my_stdout.log')
('n_classes', 5)
('nb_inner_train_steps', 4)
('nccl', 2708)
('num_epochs', -1)
('num_its', 3)
('num_workers', 4)
('outer_debug', False)
('outer_lr', 0.001)
('path_to_checkpoint', PosixPath('/home/miranda9/data_folder_fall2020_spring2021/logs/nov_all_mini_imagenet_expts/logs_Nov05_15-44-03_jobid_668'))
('pin_memory', False)
('pw_path', '/home/miranda9/pw_app.config.json')
('rank', -1)
('run_name', 'debug (Adafactor) : args.jobid=10340')
('save_ckpt', True)
('seed', None)
('serial', False)
('show_layerwise_sims', False)
('sim_compute_parallel', False)
('slurm_array_task_id', -1)
('slurm_jobid', 10340)
('split', 'train')
('tb', True)
('track_higher_grads', True)
('train_iters', 500000)
('trainin_with_epochs', False)
('training_mode', 'iterations')
('wandb_entity', 'brando')
('wandb_group', 'experiment_debug')
('wandb_project', 'sl_vs_ml_iclr_workshop_paper')
------- Main Resume from Checkpoint --------
args.base_model=ResNet(
(layer1): Sequential(
(0): BasicBlock(
(conv1): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(3, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer2): Sequential(
(0): BasicBlock(
(conv1): Conv2d(64, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(160, 160, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(64, 160, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(160, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer3): Sequential(
(0): BasicBlock(
(conv1): Conv2d(160, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(160, 320, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(320, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(layer4): Sequential(
(0): BasicBlock(
(conv1): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn1): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(relu): LeakyReLU(negative_slope=0.1)
(conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn2): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(conv3): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
(bn3): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
(maxpool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
(downsample): Sequential(
(0): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1), bias=False)
(1): BatchNorm2d(640, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
)
(DropBlock): DropBlock()
)
)
(avgpool): AdaptiveAvgPool2d(output_size=1)
(dropout): Dropout(p=0.0, inplace=False)
(classifier): Linear(in_features=640, out_features=5, bias=True)
)
args.outer_opt=Adafactor (
Parameter Group 0
beta1: None
clip_threshold: 1.0
decay_rate: -0.8
eps: (1e-30, 0.001)
lr: None
relative_step: True
scale_parameter: True
warmup_init: True
weight_decay: 0.0
)
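For reference, this parameter group corresponds to constructing the outer optimizer roughly as follows (a sketch; `base_model` stands in for the script's actual model, and with `relative_step=True` and `lr=None` Adafactor derives its step size internally):

```python
from transformers.optimization import Adafactor

# Sketch of the construction implied by the printout above; base_model is a
# placeholder. warmup_init=True makes the internally derived step size start small.
outer_opt = Adafactor(
    base_model.parameters(),
    lr=None,
    relative_step=True,
    scale_parameter=True,
    warmup_init=True,
    clip_threshold=1.0,
    decay_rate=-0.8,
    beta1=None,
    weight_decay=0.0,
    eps=(1e-30, 1e-3),
)
```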
args.meta_learner=MAMLMetaLearner(
(base_model): ResNet(
... (identical to the args.base_model ResNet printout above) ...
)
)
args.scheduler=None
--------------------- META-TRAIN ------------------------
Starting training!
(traceback identical to the one shown under Expected behavior above, ending in: RuntimeError: mat1 must be a matrix, got 4-D tensor)
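Until this is resolved, the only workaround I can think of (untested here) is to swap the outer optimizer for an elementwise one such as Adam, whose second-moment estimate is shape-agnostic:

```python
import torch

# Hypothetical workaround, not what the script currently does: Adam keeps an
# elementwise second-moment estimate, so 4-D conv weights pose no problem.
outer_opt = torch.optim.Adam(base_model.parameters(), lr=1e-3)
```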
It also doesn't work with a ViT, though the failure is different (it happens inside higher's inner-loop step rather than in Adafactor):
Traceback (most recent call last):
File "/lfs/ampere3/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_maml_torchmeta.py", line 509, in <module>
main()
File "/lfs/ampere3/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_maml_torchmeta.py", line 443, in main
train(rank=-1, args=args)
File "/lfs/ampere3/0/brando9/diversity-for-predictive-success-of-meta-learning/div_src/diversity_src/experiment_mains/main_maml_torchmeta.py", line 485, in train
meta_train_fixed_iterations(args, args.agent, args.dataloaders, args.opt, args.scheduler)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/training/meta_training.py", line 104, in meta_train_fixed_iterations
log_zeroth_step(args, meta_learner)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/logging_uu/wandb_logging/supervised_learning.py", line 170, in log_zeroth_step
train_loss, train_acc = model(batch, training=training)
File "/lfs/ampere3/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
return forward_call(*input, **kwargs)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/maml_meta_learner.py", line 66, in forward
meta_loss, meta_loss_ci, meta_acc, meta_acc_ci = meta_learner_forward_adapt_batch_of_tasks(self, spt_x, spt_y,
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/maml_differentiable_optimizer.py", line 473, in meta_learner_forward_adapt_batch_of_tasks
meta_losses, meta_accs = get_lists_losses_accs_meta_learner_forward_adapt_batch_of_tasks(meta_learner,
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/maml_differentiable_optimizer.py", line 511, in get_lists_losses_accs_meta_learner_forward_adapt_batch_of_tasks
fmodel: FuncModel = get_maml_adapted_model_with_higher_one_task(meta_learner.base_model,
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/maml_differentiable_optimizer.py", line 195, in get_maml_adapted_model_with_higher_one_task
diffopt.step(inner_loss, grad_callback=lambda grads: [g.detach() for g in grads])
File "/lfs/ampere3/0/brando9/miniconda/envs/mds_env_gpu/lib/python3.9/site-packages/higher/optim.py", line 237, in step
all_grads = grad_callback(all_grads)
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/maml_differentiable_optimizer.py", line 195, in <lambda>
diffopt.step(inner_loss, grad_callback=lambda grads: [g.detach() for g in grads])
File "/afs/cs.stanford.edu/u/brando9/ultimate-utils/ultimate-utils-proj-src/uutils/torch_uu/meta_learners/maml_differentiable_optimizer.py", line 195, in <listcomp>
diffopt.step(inner_loss, grad_callback=lambda grads: [g.detach() for g in grads])
AttributeError: 'NoneType' object has no attribute 'detach'
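This ViT failure looks different from the Adafactor one: higher hands the grad_callback a None entry for every parameter that received no gradient from the inner loss, and my lambda calls .detach() unconditionally. A defensive version of the callback (an assumption on my part that downstream code tolerates None entries; I haven't verified higher's internals):

```python
# Defensive sketch: skip None entries instead of calling .detach() on them.
# The None gradients presumably come from ViT parameters that don't
# participate in the inner loss.
diffopt.step(
    inner_loss,
    grad_callback=lambda grads: [g.detach() if g is not None else None for g in grads],
)
```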
Related:
- https://github.com/huggingface/transformers/issues/14574
- https://github.com/facebookresearch/higher/issues/124
- Does Adafactor from Hugging Face transformers only work with Transformer models, i.e., not with ResNets or with MAML via higher?
- https://www.reddit.com/r/pytorch/comments/r5p2pk/adafactor_from_transformers_hugging_face_only/