0

I have 4 Tesla K80 GPUs in my system. I would like to automatically allocate free GPUs based on an integer input in the code. I am aware of tf.config.experimental.set_visible_devices() for assigning specific GPUs, but I currently do not know how to identify which of the GPUs are in use (except by manually running nvidia-smi). I am currently changing the code below for every run.

import tensorflow as tf
# Ask TensorFlow for the list of physical GPU devices.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to the GPUs from index 2 onward (GPUs 2 and 3 on a
    # 4-GPU machine). This slice is the part that is edited by hand per run.
    try:
        tf.config.experimental.set_visible_devices(gpus[2:], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

The above code lets me set the GPUs I want to allocate (GPUs 2 and 3 in the above example) for the run. Is there any way to obtain a list of free (unused) devices to automate the allocation process, instead of manually having to identify which of the devices should be set? I am currently using TensorFlow version 1.15.

DVK
  • 475
  • 2
  • 6
  • 27
  • Looks like https://stackoverflow.com/questions/38559755/how-to-get-current-available-gpus-in-tensorflow – instinct71 Jan 13 '20 at 22:41
  • @instinct71 Unfortunately all the answers in the post are to obtain the total number of GPUs available. That is not the question I asked though. I wanted to know how to obtain the GPUs that are currently not being used. – DVK Jan 13 '20 at 23:07
  • 1
    Sorry about that. I remember seeing this question before. I think https://stackoverflow.com/questions/41634674/tensorflow-on-shared-gpus-how-to-automatically-select-the-one-that-is-unused should be it. – instinct71 Jan 14 '20 at 00:14
  • 1
    https://stackoverflow.com/questions/40069883/how-to-set-specific-gpu-in-tensorflow/47998168#47998168 – y.selivonchyk Jan 14 '20 at 05:00

1 Answers1

0
import subprocess, re
import os
import utils


# Nvidia-smi GPU memory parsing.
# Tested on nvidia-smi 370.23
# TF1.15


def run_command(cmd):
    """Run a shell command and return its standard output as text.

    Args:
        cmd: Complete command line as a single string; it is executed
            through the shell.

    Returns:
        Everything the command wrote to stdout, decoded as UTF-8. Bytes that
        are not valid UTF-8 are replaced instead of raising, so unexpected
        characters in the output cannot crash the caller.
    """
    # shell=True is kept because callers pass one command-line string;
    # only trusted, hard-coded commands should be passed in.
    completed = subprocess.run(cmd, stdout=subprocess.PIPE, shell=True)
    return completed.stdout.decode("utf-8", errors="replace")


def list_available_gpus():
    """Return the ids of every GPU listed by ``nvidia-smi -L``.

    Returns:
        List of integer GPU ids, in the order nvidia-smi prints them.

    Raises:
        AssertionError: If any line of the listing does not start with
            ``GPU <id>:``. This includes the case where the command
            produced no output at all.
    """
    output = run_command("nvidia-smi -L")
    # lines of the form GPU 0: TITAN X
    gpu_regex = re.compile(r"GPU (?P<gpu_id>\d+):")
    result = []
    for line in output.strip().split("\n"):
        m = gpu_regex.match(line)
        if m is None:
            # Raised explicitly instead of via `assert` so the check still
            # runs under `python -O`; the exception type and message are
            # unchanged for existing callers.
            raise AssertionError("Couldnt parse "+line)
        result.append(int(m.group("gpu_id")))
    return result


def gpu_memory_map():
    """Return a map of GPU id to the memory allocated on that GPU, in MiB.

    The per-process table printed by ``nvidia-smi`` is parsed and the memory
    of all processes on the same GPU is summed. Every GPU returned by
    ``list_available_gpus()`` is present in the result; GPUs with no process
    rows map to 0.

    Returns:
        Dict mapping integer GPU id to allocated memory in MiB.

    Raises:
        KeyError: If a process row names a GPU id that
            ``list_available_gpus()`` did not return.
    """
    output = run_command("nvidia-smi")
    result = {gpu_id: 0 for gpu_id in list_available_gpus()}

    # The process table starts at its "GPU Memory" column header; everything
    # before it is skipped so that only process rows are scanned.
    table_start = output.find("GPU Memory")
    if table_start < 0:
        # No process table in the output: nothing is allocated as far as we
        # can tell.
        return result

    # lines of the form
    # |    0      8734    C   python                                       11705MiB |
    memory_regex = re.compile(r"[|]\s+?(?P<gpu_id>\d+)\D+?(?P<pid>\d+).+[ ](?P<gpu_memory>\d+)MiB")
    for row in output[table_start:].split("\n"):
        m = memory_regex.search(row)
        if not m:
            continue
        gpu_id = int(m.group("gpu_id"))
        gpu_memory = int(m.group("gpu_memory"))
        result[gpu_id] += gpu_memory
    return result


def pick_gpu_lowest_memory():
    """Pick the id of the GPU that currently has the least memory allocated."""
    usage_by_gpu = gpu_memory_map()
    # Order by (memory, gpu_id) so that equal usage resolves to the lowest id.
    ranked = sorted((used, gpu) for gpu, used in usage_by_gpu.items())
    _, chosen_gpu = ranked[0]
    return chosen_gpu


def pick_free_gpus(num_gpus=1):
    """Return the ids of ``num_gpus`` GPUs that have no memory allocated.

    Args:
        num_gpus: How many free GPUs are required.

    Returns:
        Comma-separated string of GPU ids in ascending order (e.g. ``"2,3"``).

    Raises:
        SystemExit: With status 1, after printing a message, when fewer than
            ``num_gpus`` GPUs are free. This also covers asking for more
            GPUs than the machine has.
    """
    free_gpus = sorted(
        gpu_id for gpu_id, memory in gpu_memory_map().items() if memory == 0
    )
    if len(free_gpus) < num_gpus:
        print(f'Currently fewer than {num_gpus} GPUs are free right now, choose {len(free_gpus)} or fewer GPUs')
        # Exit with a non-zero status so calling scripts can detect the
        # failure; SystemExit is used because the `exit()` helper is only
        # injected by the `site` module.
        raise SystemExit(1)
    # max() keeps a negative request from slicing from the end of the list.
    return ','.join(map(str, free_gpus[:max(num_gpus, 0)]))

# Number of GPUs this run needs; pick_free_gpus() exits the program when that
# many GPUs are not free.
num_gpus = 2
# CUDA_VISIBLE_DEVICES is set before TensorFlow is imported below --
# presumably so that only the chosen GPUs are exposed to it.
os.environ["CUDA_VISIBLE_DEVICES"] = pick_free_gpus(num_gpus)
import tensorflow as tf
tf.config.optimizer.set_jit(True)  # Enable XLA.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    # Mark every GPU that TensorFlow listed as visible (the list is passed
    # unsliced), then report the physical and logical GPU counts.
    try:
        tf.config.experimental.set_visible_devices(gpus, 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)
DVK
  • 475
  • 2
  • 6
  • 27