I made some improvements to sharchaea's script for portability and speed. The main changes: it only checks the ports that notebooks are actually running on, handles different hostname options, tightens the kernel process check, and matches both ipython and jupyter processes.
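For context, the script discovers servers and their ports by parsing the output of jupyter notebook list, so only ports with a running notebook server are ever queried. Below is a minimal, self-contained sketch of that parsing step; the sample line, host, port, and token are made up for illustration.

import re

notebook_regex = re.compile(r"(https?://([^:/]+):?(\d+)?)/?(\?token=([a-z0-9]+))?")

# A made-up line in the style of "jupyter notebook list" output; real output
# typically also lists the notebook directory after the URL.
sample = "http://localhost:8888/?token=0a1b2c3d :: /home/user/notebooks"

match = re.match(notebook_regex, sample)
if match:
    base_url, host, port, _, token = match.groups()
    print(base_url, host, port, token)  # http://localhost:8888 localhost 8888 0a1b2c3d

The full script: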
import argparse
import re
import subprocess
import pandas as pd
import psutil
import requests
import tabulate
kernel_regex = re.compile(r".+kernel-(.+)\.json")
notebook_regex = re.compile(r"(https?://([^:/]+):?(\d+)?)/?(\?token=([a-z0-9]+))?")

def get_proc_info():
    pids = psutil.pids()

    # memory info from psutil.Process
    df_mem = []

    for pid in pids:
        try:
            proc = psutil.Process(pid)
            cmd = " ".join(proc.cmdline())
        # skip processes that have exited or that we are not allowed to inspect
        except (psutil.NoSuchProcess, psutil.AccessDenied):
            continue

        if len(cmd) > 0 and ("jupyter" in cmd or "ipython" in cmd) and "kernel" in cmd:
            # kernel
            kernel_ID = re.sub(kernel_regex, r"\1", cmd)

            # memory
            mem = proc.memory_info()[0] / float(1e9)

            uname = proc.username()

            # user, pid, memory, kernel_ID
            df_mem.append([uname, pid, mem, kernel_ID])

    # name the columns in the constructor so an empty result still has the right schema
    df_mem = pd.DataFrame(df_mem, columns=["user", "pid", "memory_GB", "kernel_ID"])
    return df_mem


def get_running_notebooks():
    notebooks = []

    for n in subprocess.Popen(
        ["jupyter", "notebook", "list"], stdout=subprocess.PIPE
    ).stdout.readlines()[1:]:
        match = re.match(notebook_regex, n.decode())
        if match:
            base_url, host, port, _, token = match.groups()
            notebooks.append({"base_url": base_url, "token": token})
        else:
            print("Unknown format: {}".format(n.decode()))

    return notebooks


def get_session_info(password=None):
    df_nb = []
    kernels = []

    for notebook in get_running_notebooks():
        s = requests.Session()
        if notebook["token"] is not None:
            s.get(notebook["base_url"] + "/?token=" + notebook["token"])
        else:
            # do a get to the base url to get the session cookies
            s.get(notebook["base_url"])
        if password is not None:
            # Seems jupyter auth process has changed, need to first get a cookie,
            # then add that cookie to the data being sent over with the password
            data = {"password": password}
            data.update(s.cookies)
            s.post(notebook["base_url"] + "/login", data=data)

        res = s.get(notebook["base_url"] + "/api/sessions")

        if res.status_code != 200:
            raise Exception(res.json())

        for sess in res.json():
            kernel_ID = sess["kernel"]["id"]
            if kernel_ID not in kernels:
                kernel = {
                    "kernel_ID": kernel_ID,
                    "kernel_name": sess["kernel"]["name"],
                    "kernel_state": sess["kernel"]["execution_state"],
                    "kernel_connections": sess["kernel"]["connections"],
                    # "notebook_url": notebook["base_url"] + "/notebook/" + sess["id"],
                    "notebook_path": sess["path"],
                }
                kernel.update(notebook)
                df_nb.append(kernel)
                kernels.append(kernel_ID)

    df_nb = pd.DataFrame(df_nb)
    del df_nb["token"]
    return df_nb


def parse_args():
    parser = argparse.ArgumentParser(description="Find memory usage.")
    parser.add_argument("--password", help="password (only needed if pass-protected)")
    return parser.parse_args()


def main(password=None, print_ascii=False):
    df_mem = get_proc_info()
    df_nb = get_session_info(password)

    # joining tables
    df = pd.merge(df_nb, df_mem, on=["kernel_ID"], how="inner")
    df = df.sort_values("memory_GB", ascending=False).reset_index(drop=True)

    if print_ascii:
        print(tabulate.tabulate(df, headers=(df.columns.tolist())))

    return df


if __name__ == "__main__":
    args = vars(parse_args())
    main(args["password"], print_ascii=True)
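
From the command line the script just prints an ASCII table (pass --password only if the server is password-protected). It can also be imported so the merged table comes back as a DataFrame; here is a minimal usage sketch, assuming the script above was saved as jupyter_mem.py (that filename is just an assumption for illustration).

# Hypothetical usage sketch; assumes the script above was saved as jupyter_mem.py.
from jupyter_mem import main

# password is only needed if the notebook server is password-protected
df = main(password=None, print_ascii=False)

# one row per kernel, with columns such as user, memory_GB and notebook_path
print(df[["user", "memory_GB", "notebook_path"]])
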
I'll probably continue to make updates to this at this gist.
Edit: the code has been updated to work with newer versions of Jupyter that use token authentication, to rely only on psutil (making it Windows compatible), and to work on Python 3.