
I'm trying to figure out how I can load thousands of numpy files using an iterator. To do this without overflowing my memory, I need to be able to open and close the files.

I have a list called train_data that holds the numpy filenames. I can load one numpy array using

np.load(train_data[0])

but if I try to load the files inside an iterator using a context manager (which, according to this answer, should help with memory, because the file is closed automatically once it's done with it), it throws an error.

class AbstractIterator(object):
    def __iter__(self):
        for abstract_filename in train_data:
            with np.load(abstract_filename) as a:
                print(abstract_filename)

iterator = AbstractIterator()
for i in iterator:
    print(i)

Error message:

Traceback (most recent call last):
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-53-12ff2d39b102>", line 8, in <module>
    for i in AbstractIterator():
  File "<ipython-input-53-12ff2d39b102>", line 5, in __iter__
    with np.load(abstract_filename) as a:
AttributeError: __enter__

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2044, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'AttributeError' object has no attribute '_render_traceback_'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/IPython/core/ultratb.py", line 1151, in get_records
    return _fixed_getinnerframes(etb, number_of_lines_of_context, tb_offset)
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/IPython/core/ultratb.py", line 319, in wrapped
    return f(*args, **kwargs)
  File "/Users/briennakh/anaconda3/lib/python3.6/site-packages/IPython/core/ultratb.py", line 353, in _fixed_getinnerframes
    records = fix_frame_records_filenames(inspect.getinnerframes(etb, context))
  File "/Users/briennakh/anaconda3/lib/python3.6/inspect.py", line 1483, in getinnerframes
    frameinfo = (tb.tb_frame,) + getframeinfo(tb, context)
  File "/Users/briennakh/anaconda3/lib/python3.6/inspect.py", line 1441, in getframeinfo
    filename = getsourcefile(frame) or getfile(frame)
  File "/Users/briennakh/anaconda3/lib/python3.6/inspect.py", line 696, in getsourcefile
    if getattr(getmodule(object, filename), '__loader__', None) is not None:
  File "/Users/briennakh/anaconda3/lib/python3.6/inspect.py", line 725, in getmodule
    file = getabsfile(object, _filename)
  File "/Users/briennakh/anaconda3/lib/python3.6/inspect.py", line 709, in getabsfile
    return os.path.normcase(os.path.abspath(_filename))
  File "/Users/briennakh/anaconda3/lib/python3.6/posixpath.py", line 376, in abspath
    cwd = os.getcwd()
FileNotFoundError: [Errno 2] No such file or directory

How can I load and properly close the file after I'm done working with it, in an iterator?

EDIT: I need an iterator, not a workaround. I'm using this with gensim, which takes an iterator.

I have tried to incorporate the answer into my code.

import os
import numpy as np
from gensim.models.doc2vec import TaggedDocument

class DocumentIterator(object):
    def load_documents(self):
        for filename in self.filenames: 
            loaded_file = np.load(filename)
            tag = os.path.splitext(os.path.basename(filename))[0]
            abstract = TaggedDocument(words=loaded_file, tags=[tag])
            try:
                yield abstract
            except Exception as e:
                print('Error!' + str(e))
            finally:
                del loaded_file
                print('Closed ' + tag)

    def __init__(self, filenames):
        self.filenames = filenames
        self.generator = self.load_documents()

    def __iter__(self):
        self.generator = self.load_documents() # Reset the iterator
        return self

    def __next__(self):
        abstract = next(self.generator)
        return abstract

This works.
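For reference, a minimal sketch of how an iterator like this can be fed to gensim's Doc2Vec (assuming the gensim 3.x/4.x API; vector_size and epochs are placeholder values, not from my real setup):

from gensim.models import Doc2Vec

documents = DocumentIterator(train_data)  # train_data is the list of .npy paths
model = Doc2Vec(vector_size=100, epochs=10)  # placeholder hyperparameters
model.build_vocab(documents)
model.train(documents, total_examples=model.corpus_count, epochs=model.epochs)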

  • I’m glad you found the answer. By the way, ``os.path`` is an older usage. If your Python >= 3.4, I suggest you use ``pathlib.Path``, it can do almost everything ``os.path`` can do, and it is very comfortable to use! [ref1](https://stackabuse.com/introduction-to-the-python-pathlib-module/) [ref2: document](https://docs.python.org/3/library/pathlib.html) – Carson Jun 09 '20 at 05:58
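To illustrate the comment, the tag extraction line from the iterator above could be written with pathlib instead of os.path (a small sketch, not part of the original code):

from pathlib import Path

# equivalent of os.path.splitext(os.path.basename(filename))[0]
tag = Path(filename).stem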

1 Answer


I'm not sure you need to implement __iter__ just to make sure the files are closed. (The AttributeError in your example comes from the fact that np.load on a .npy file returns a plain ndarray, which does not support the context manager protocol; only the NpzFile returned for .npz archives does.)

You can use contextlib.contextmanager to run cleanup code after leaving the with block. That way you don't have to implement __iter__ at all.

import numpy as np
from pathlib import Path
from contextlib import contextmanager


def generate_test_files(numbers_of_files=3):
    test_data = np.arange(10)
    rtn_list = []
    for i in range(numbers_of_files):
        file_name = f'test_{i}.npy'
        if not Path(file_name).exists():
            np.save(file_name, test_data)
        rtn_list.append(file_name)
    return rtn_list


@contextmanager
def block_numpy_load(file_name):
    obj = np.load(file_name)  # load the array for this file
    try:
        yield obj  # hand the array to the with block
    except:
        ...
    finally:
        del obj  # drop the reference so the array can be garbage collected
        print('release')


train_file_list = generate_test_files()
for f in train_file_list:
    with block_numpy_load(f) as data:
        print(data)

[0 1 2 3 4 5 6 7 8 9]
release
[0 1 2 3 4 5 6 7 8 9]
release
[0 1 2 3 4 5 6 7 8 9]
release
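As an aside, if the data were stored as .npz archives rather than plain .npy files, np.load already returns an NpzFile that supports the context manager protocol, so the original with statement works there without a helper (a sketch; test_0.npz is a made-up file name):

import numpy as np

np.savez('test_0.npz', data=np.arange(10))  # hypothetical archive for illustration
with np.load('test_0.npz') as archive:      # NpzFile implements __enter__/__exit__
    print(archive['data'])                  # -> [0 1 2 3 4 5 6 7 8 9]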

OOP

If you want an object-oriented approach, you may reference the following.

import numpy as np
from pathlib import Path
from contextlib import contextmanager
from typing import Iterator
import abc


def generate_test_files(numbers_of_files=3):
    test_data = np.arange(10)
    rtn_list = []
    for i in range(numbers_of_files):
        file_name = f'test_{i}.npy'
        if not Path(file_name).exists():
            np.save(file_name, test_data)
        rtn_list.append(Path(file_name))
    return rtn_list


@contextmanager
def block_numpy_load(file_name):
    obj = np.load(file_name)
    try:
        yield obj
    except:
        ...
    finally:
        # del obj
        print('release')


class NpLoadMixin(abc.ABC):
    def __iter__(self) -> Iterator[Path]:
        # iterate over the file paths supplied by the subclass
        for file in self.train_data:
            yield file

    @abc.abstractmethod
    def start_iter(self):
        # subclasses decide what to do with each loaded array
        for file in self:
            with block_numpy_load(file) as a:
                ...


class MyAI(NpLoadMixin):
    def __init__(self, train_data):
        self.train_data = train_data

    def start_iter(self):
        for file in self:
            with block_numpy_load(file) as np_array:
                print(np_array)


train_file_list = generate_test_files()
obj = MyAI(train_file_list)
obj.start_iter()