I have code that looks like the following, where I want to minimize a function cost with respect to the parameters w.
However, when I run it, it is very slow (roughly 30 times slower) compared to the same optimization implemented without TensorFlow, i.e. by explicitly defining a function that gives the gradient of the cost.
Am I doing something wrong in the following example code? (Maybe I am unnecessarily re-computing the gradient graph each time?)
I am using Python 3 and TensorFlow 2.0.0.
In the following code, I am using a simple dummy cost function just as an example to show the big difference in runtime: cost(w) = sum(Z @ w - P), whose gradient with respect to w is simply the column sums of Z (this is what the explicit gradient in the NumPy version below returns).
Code with TensorFlow:
import numpy as np
import tensorflow as tf
import time

class ExampleTF:
    def __init__(self, n=100, m=10):
        Z = np.random.randn(n, m)
        self.Z = tf.convert_to_tensor(Z, dtype=tf.float32)
        self.w = tf.Variable(np.ones((m, 1)), dtype=tf.float32)
    # =====================================
    def cost(self, P):
        # This is a simple dummy cost function just as an example
        return tf.reduce_sum((self.Z @ self.w) - P)
    # =====================================
    def optimize_w(self, cost_func, parameters, lr=0.01, iterations=2000):
        optimizer = tf.optimizers.Adam(lr)
        for _ in range(iterations):
            optimizer.minimize(cost_func, var_list=parameters)
    # =====================================
    def update(self, P):
        P = tf.convert_to_tensor(P, dtype=tf.float32)
        self.optimize_w(
            cost_func=lambda: self.cost(P),
            parameters=[self.w]
        )
        #print("===> cost:", self.cost(P).numpy())
        #print("w:", self.w.numpy().reshape(-1)[:10])
# =====================================
n, m = 10000, 100
ex_tf = ExampleTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))
    start = time.time()
    ex_tf.update(P)
    elapsed = time.time() - start
    print("elapsed time:", elapsed)
Code without TensorFlow (just NumPy):
import numpy as np
import tensorflow as tf
import time

class ExampleNonTF:
    def __init__(self, n=100, m=10):
        self.Z = np.random.randn(n, m)
        self.w = np.ones((m, 1))
    # =====================================
    def cost(self, P):
        # This is a simple dummy cost function just as an example
        return np.sum(self.Z @ self.w - P)
    # =====================================
    def gradient_cost(self, P):
        # This is the gradient of the dummy cost function with respect to self.w
        return np.sum(self.Z, axis=0).reshape(self.w.shape)
    # =====================================
    def optimize_w(self, P, lr=0.01, iterations=2000):  # This is the ADAM optimizer
        avg_grad1 = 0; avg_grad2 = 0
        beta1 = 0.9; beta2 = 0.999; eps = 1e-07
        for itr in range(iterations):
            grad = self.gradient_cost(P)
            avg_grad1 = beta1 * avg_grad1 + (1 - beta1) * grad
            avg_grad2 = beta2 * avg_grad2 + (1 - beta2) * (grad ** 2)
            avg_grad1_corr = avg_grad1 / (1 - beta1 ** (itr + 1))
            avg_grad2_corr = avg_grad2 / (1 - beta2 ** (itr + 1))
            self.w = self.w - lr * (avg_grad1_corr / (np.sqrt(avg_grad2_corr) + eps))
    # =====================================
    def update(self, P):
        self.optimize_w(P)
        #print("===> cost:", self.cost(P))
        #print("w:", self.w.reshape(-1)[:10])
# =====================================
n, m = 10000, 100
ex_nontf = ExampleNonTF(n, m)
for _ in range(50):
    P = np.random.uniform(size=n).reshape((-1, 1))
    start = time.time()
    ex_nontf.update(P)
    elapsed = time.time() - start
    print("elapsed time:", elapsed)