I'm trying to multiplicate 2 big matrices with memory limit using hdf5 (pytables) but function numpy.dot seems to give me error:
Valueerror: array is too big
I need to do matrix multiplication by myself maybe blockwise or there is some another python function similar to numpy.dot?
import numpy as np
import time
import tables
import cProfile
import numexpr as ne
n_row=10000
n_col=100
n_batch=10
rows = n_row
cols = n_col
batches = n_batch
atom = tables.UInt8Atom() #?
filters = tables.Filters(complevel=9, complib='blosc') # tune parameters
fileName_a = 'C:\carray_a.h5'
shape_a = (rows*batches, cols) # predefined size
h5f_a = tables.open_file(fileName_a, 'w')
ca_a = h5f_a.create_carray(h5f_a.root, 'carray', atom, shape_a, filters=filters)
for i in range(batches):
data = np.random.rand(rows,cols)
ca_a[i*rows:(i+1)*rows]= data[:]
#h5f_0.close()
rows = n_col
cols = n_row
batches = n_batch
fileName_b = 'C:\carray_b.h5'
shape_b = (rows, cols*batches) # predefined size
h5f_b = tables.open_file(fileName_b, 'w')
ca_b = h5f_b.create_carray(h5f_b.root, 'carray', atom, shape_b, filters=filters)
#need to batch by cols
sz= rows/batches
for i in range(batches):
data = np.random.rand(sz, cols*batches)
ca_b[i*sz:(i+1)*sz]= data[:]
#h5f_1.close()
rows = n_batch*n_row
cols = n_batch*n_row
fileName_c = 'C:\carray_c.h5'
shape_c = (rows, cols) # predefined size
h5f_c = tables.open_file(fileName_c, 'w')
ca_c = h5f_c.create_carray(h5f_c.root, 'carray', atom, shape_c, filters=filters)
a= h5f_a.root.carray#[:]
b= h5f_b.root.carray#[:]
c= h5f_c.root.carray
t0= time.time()
c= np.dot(a,b) #error if aray is big
print (time.time()-t0)
Update: so here is the code.It's interesting but using hdf5 it works even faster.
import numpy as np
import tables
import time
sz= 100 #chunk size
n_row=10000 #m
n_col=1000 #n
#for arbitrary size
A=np.random.rand(n_row,n_col)
B=np.random.rand(n_col,n_row)
# A=np.random.randint(5, size=(n_row,n_col))
# B=np.random.randint(5, size=(n_col,n_row))
#using numpy array
#C= np.zeros((n_row,n_row))
#using hdf5
fileName_C = 'CArray_C.h5'
atom = tables.Float32Atom()
shape = (A.shape[0], B.shape[1])
Nchunk = 128 # ?
chunkshape = (Nchunk, Nchunk)
chunk_multiple = 1
block_size = chunk_multiple * Nchunk
h5f_C = tables.open_file(fileName_C, 'w')
C = h5f_C.create_carray(h5f_C.root, 'CArray', atom, shape, chunkshape=chunkshape)
sz= block_size
t0= time.time()
for i in range(0, A.shape[0], sz):
for j in range(0, B.shape[1], sz):
for k in range(0, A.shape[1], sz):
C[i:i+sz,j:j+sz] += np.dot(A[i:i+sz,k:k+sz],B[k:k+sz,j:j+sz])
print (time.time()-t0)
t0= time.time()
res= np.dot(A,B)
print (time.time()-t0)
print (C== res)
h5f_C.close()