How can I run a "-m some-module" command directly from Python code?
Recently I learned that a command like the one below can run certain torch.distributed Python modules:
"python -m torch.distributed.launch --nproc_per_node=2 SOME_TRAINING_SCRIPT.py"
========
If I want SOME_TRAINING_SCRIPT.py to be run through torch.distributed.launch by simply executing `python SOME_TRAINING_SCRIPT.py` (instead of `python -m torch.distributed.launch --nproc_per_node=2 SOME_TRAINING_SCRIPT.py`), what should be added to SOME_TRAINING_SCRIPT.py to achieve this?
The Python file looks like this:
"""Minimal DistributedDataParallel demo.

Runs either under a launcher (``python -m torch.distributed.launch`` /
``torchrun``) or directly as ``python SOME_TRAINING_SCRIPT.py`` — in the
direct case it falls back to a single-process group (rank 0, world size 1).
"""
import os

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler

# Hyper-parameters (module-level constants, as in the original script).
input_size = 5
output_size = 2
batch_size = 30
data_size = 90


class RandomDataset(Dataset):
    """Dataset of ``length`` random rows with ``size`` features each.

    ``device`` defaults to ``'cuda'`` (the original behavior); pass the
    per-rank device so each process keeps its data where it trains.
    """

    def __init__(self, size, length, device="cuda"):
        self.len = length
        # Data lives on the training device, so __getitem__ is copy-free.
        self.data = torch.randn(length, size).to(device)

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return self.len


class Model(nn.Module):
    """One linear layer; prints tensor sizes so each rank's shard is visible."""

    def __init__(self, input_size, output_size):
        super().__init__()
        self.fc = nn.Linear(input_size, output_size)

    def forward(self, input):
        output = self.fc(input)
        print(" In Model: input size", input.size(), "output size", output.size())
        return output


def _init_distributed():
    """Initialise the process group for both launch modes.

    ``env://`` rendezvous (the default) reads MASTER_ADDR, MASTER_PORT,
    RANK and WORLD_SIZE from the environment. A launcher exports all four;
    a direct ``python SOME_TRAINING_SCRIPT.py`` run exports none. Using
    setdefault() keeps any launcher-provided values while supplying sane
    single-process defaults (rank 0 of world size 1) for a direct run —
    this is the piece the original script was missing.
    """
    os.environ.setdefault('MASTER_ADDR', '127.0.0.1')
    os.environ.setdefault('MASTER_PORT', '29500')
    os.environ.setdefault("RANK", "0")
    os.environ.setdefault("WORLD_SIZE", "1")
    torch.distributed.init_process_group(backend="gloo")


def main():
    # Must be set before any CUDA context is created (torch's CUDA init is lazy).
    os.environ["CUDA_VISIBLE_DEVICES"] = "1,2"
    _init_distributed()

    local_rank = torch.distributed.get_rank()
    torch.cuda.set_device(local_rank)
    device = torch.device("cuda", local_rank)
    print("local_rank = ", local_rank)

    # FIX: the original moved the data to the default 'cuda' device; each
    # rank now keeps its shard on its own device.
    dataset = RandomDataset(input_size, data_size, device=device)
    rand_loader = DataLoader(dataset=dataset, batch_size=batch_size,
                             sampler=DistributedSampler(dataset))

    model = Model(input_size, output_size)
    model.to(device)
    if torch.cuda.device_count() > 1:
        print("Let's use", torch.cuda.device_count(), "GPUs!")
        model = torch.nn.parallel.DistributedDataParallel(
            model, device_ids=[local_rank], output_device=local_rank)

    for data in rand_loader:
        # The original if/else on torch.cuda.is_available() assigned `data`
        # identically in both branches; the batch already lives on `device`,
        # so no transfer is needed.
        input_var = data
        output = model(input_var)
        print("Outside: input size", input_var.size(),
              "output_size", output.size())


if __name__ == "__main__":
    main()