
I am getting a weird error when I try to import my .mat file, which is available at this link: https://drive.google.com/drive/folders/19GmXlWGh4-u_GxntNvmlC1YD6XiIaXqn?usp=sharing. The .mat file has 40000 columns and 764 rows and contains random values between 0 and 255. Every time I run my code on this data set it raises a MemoryError. Can anyone guide me on how to deal with this error? My machine has 16 GB of RAM, a 2.1 GHz processor, and a 500 GB SSD. Below is my code, which I took from the ASU scikit-feature repository that has all the feature selection methods: https://github.com/jundongl/scikit-feature/blob/master/skfeature/example/test_trace_ratio.py
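For reference, here is a rough check of the raw data footprint (a minimal sketch; it assumes scipy.io.loadmat exposes the matrix under the key 'X', as in my code below, and that the cast to float gives float64). By this estimate the array itself is only about 0.23 GB, which is why the MemoryError surprises me:

import os
import scipy.io as sio

mat = sio.loadmat(os.path.join('./', '200by200.mat'))
X = mat['X'].astype(float)  # 764 x 40000, float64 after the cast

# 764 rows * 40000 columns * 8 bytes per float64 value ~= 0.23 GB
print(X.shape, X.dtype)
print('raw X footprint: %.2f GB' % (X.nbytes / 1024 ** 3))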

import os

import scipy.io as sio
from sklearn import svm
from sklearn.metrics import accuracy_score
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.KFold.html#sklearn.model_selection.KFold
from sklearn.model_selection import KFold
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.model_selection import StratifiedKFold

# pip install skfeature-chappers
# from skfeature.function.similarity_based import fisher_score
# from skfeature.function.similarity_based import trace_ratio
import trace_ratio

def main():
    # load data
    source_dir = './'
    mat = sio.loadmat(os.path.join(source_dir, '200by200.mat'))
    # mat = scipy.io.loadmat('../data/COIL20.mat')
    X = mat['X']  # data
    X = X.astype(float)
    y = mat['Y']  # label
    y = y[:, 0]
    n_samples, n_features = X.shape  # number of samples and number of features

    # split the data into 5 folds
    ss = KFold(n_splits=5)
    #ss = StratifiedKFold(n_splits=5, shuffle=True, random_state=2)

    # ss= RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=0)
    # perform evaluation on classification task
    num_fea = 50  # number of selected features
    # clf = svm.LinearSVC()    # linear SVM
    clf = svm.SVC(kernel='rbf')

    correct = 0
    for train, test in ss.split(X,y):
        # obtain the index of selected features
        idx, feature_score, subset_score = trace_ratio.trace_ratio(X[train], y[train], num_fea, style='fisher')
        # obtain the dataset on the selected features
        selected_features = X[:, idx[0:num_fea]]

        # train a classification model with the selected features on the training dataset
        clf.fit(selected_features[train], y[train])
        # print(selected_features[train])
        # predict the class labels of test data
        y_predict = clf.predict(selected_features[test])

        # obtain the classification accuracy on the test data
        acc = accuracy_score(y[test], y_predict)
        correct = correct + acc

    # output the average classification accuracy over all 5 folds
    print('Accuracy:', float(correct) / 5)


if __name__ == '__main__':
    main()
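One workaround I was considering (a minimal, untested sketch; it assumes the error comes from extra float64 copies made during feature selection and training rather than from loadmat itself, and I am not sure whether scikit-learn or skfeature keep float32 internally or cast back to float64) is to load the data in a smaller dtype, since the values are only between 0 and 255 anyway. Would something like this actually reduce the memory pressure, or does it just move the problem?

import os
import numpy as np
import scipy.io as sio

mat = sio.loadmat(os.path.join('./', '200by200.mat'))

# the values are in [0, 255], so float32 loses nothing here and halves the
# in-memory size compared to the default float64 cast
X = mat['X'].astype(np.float32)
y = mat['Y'][:, 0]
print('float32 X footprint: %.2f GB' % (X.nbytes / 1024 ** 3))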