I would like to speed up the following code related to spherical modes. It is a simplification of my actual code (I didn't want to oversimplify it because it can lead to solutions that are not valid for my actual problem):
import numpy as np
import time
import math
def function_call(npp,nmax):
matrix_a = np.random.rand(npp)
matrix_b = np.random.rand(npp)
a=np.random.rand()
F = np.zeros((2*npp, 2*nmax*(nmax+2)),dtype=np.complex_)
npa=np.arange(npp)
for n in range(1,nmax+1,1):
a_n = np.sqrt(1 / (2 * np.pi * n * (n + 1)))
for m in range(-n,n+1,1):
b_m = (-1)**((np.abs(m) + m) / 2)
p_mn = int(1 / 2 * (np.abs(m) + n + 1 / 2 * (1 - (-1)**(np.abs(m) + n))))
alpha_mn = np.sqrt(((2 * n + 1) / 2) * math.factorial(n - np.abs(m)) / math.factorial(n + np.abs(m)))
A_mn = np.zeros(npp)
B_mn = np.zeros(npp)
for p in range(p_mn,n+1,1):
Cai_pmn = math.factorial(n) * ((-1)**(n + p)) / (math.factorial(p) * math.factorial(n - p)) * math.factorial(2 * p)/math.factorial(2 * p - np.abs(m) - n)
A_mn = A_mn + Cai_pmn * (np.cos(matrix_a))**(2 * p - np.abs(m) - n)
B_mn = B_mn + (2 * p - np.abs(m) - n) * Cai_pmn * (np.cos(matrix_a))**(np.abs(2 * p - np.abs(m) - n - 1))
A_mn = A_mn / (2**n * math.factorial(n))
B_mn = B_mn / (2**n * math.factorial(n))
S_mn = alpha_mn * m * A_mn * np.sin(matrix_a)**np.abs(np.abs(m) - 1)
D_mn = alpha_mn * (np.abs(m) * A_mn * np.cos(matrix_a) * (np.sin(matrix_a))**(np.abs(np.abs(m) - 1)) - B_mn * (np.sin(matrix_a))**(np.abs(m) + 1))
h1 = 1j**(n+1)*np.exp(-1j*a)/(a)
h2 = 1j**(n)*np.exp(-1j*a)/(a)
F_s1_theta = 1j * a_n * b_m * h1 * (S_mn * np.exp(1j * m * matrix_b))
F_s1_phi = -a_n * b_m * h1 * (D_mn * np.exp(1j * m * matrix_b))
F_s2_theta = a_n * b_m * h2 * (D_mn * np.exp(1j * m * matrix_b))
F_s2_phi = 1j * a_n * b_m * h2 * (S_mn * np.exp(1j * m * matrix_b))
j = 2 * (n * (n + 1) + m - 1)
F[2 * npa, j] = F_s1_theta[npa]
F[2 * npa+1 , j] = F_s1_phi[npa]
j = 2 * (n * (n + 1) + m - 1) + 1
F[2 * npa, j] = F_s2_theta[npa]
F[2 * npa+1, j] = F_s2_phi[npa]
prev_time_ep =time.time()
npp=500
nmax=80
function_call(npp,nmax)
print(" --- %s seconds ---" % (time.time() - prev_time_ep))
The first option that I have tried is to vectorize it (it took me some time because it is not obvious). However, memory consumption grows rapidly and it is not efficient.
I have also tried with Numba and I actually succeed to reduce by 4 the previous time, but I was looking for larger improvements if it is possible.
I have read also that maybe multiprocessing or Cython could be good options. Maybe there is a way of vectorizing it without growing rapidly in memory usage?