I have created two QAT models with the AllValuesQuantizer, one with per-tensor and one with per-channel quantization. When inspecting their respective QuantizeWrapper layers I note that both have scalar values for the variables kernel_min and kernel_max.
Here is an example of a per-tensor quantized model
Here is an example of a per-channel quantized model
As I understand from this paper, the min/max values of the kernel are what define the scale and zero-point quantization parameters. For per-tensor quantization it is reasonable that the model only has a single min and max value, as the whole tensor shares the same scale and zero-point. HOWEVER, for per-channel quantization (where each channel has its own scale and zero-point) I believe that kernel_min and kernel_max should be vectors. Why aren't they?
In this GitHub issue someone mentions that QAT automatically uses per-tensor quantization (as of March 2020), but that this is subject to change. To me it looks like QAT still only uses per-tensor quantization. If that's the case, why is there a parameter that I can set to enable per-channel quantization (see AllValuesQuantizer's per_axis boolean)?
To further illustrate my point, I also noted in the source code for the AllValuesQuantizer that self.per_axis is never passed on to the next function, so what is that variable even used for? Note that the other quantizers, LastValue and MovingAverage, do pass this variable on.
So, does TF's QAT even perform per-channel quantization? It doesn't seem like it to me. How can I use per-channel quantization with the AllValuesQuantizer?
GitHub issue: https://github.com/tensorflow/tensorflow/issues/47858
Code to replicate my two models:
import tensorflow as tf
from tensorflow import keras
import tensorflow_model_optimization as tfmot
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
# Open a TF1-compat interactive session with GPU memory growth enabled, so
# TensorFlow allocates GPU memory as needed instead of claiming it all at
# start-up. The flag has to be set on the config before the session is created.
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
# Weight-quantizer classes that can be passed as the `quantizer` argument of
# quantization_aware_training.
_quantizers = tfmot.quantization.keras.quantizers
QAT_ALL_VALUES = _quantizers.AllValuesQuantizer
QAT_LAST_VALUE = _quantizers.LastValueQuantizer
QAT_MA = _quantizers.MovingAverageQuantizer
def _quantized_model_name(model_name, w_bits, a_bits, symmetric, narrow_range, per_axis, quantizer):
    """Return the name stem that encodes the quantization settings.

    The result has the form
    ``<model_name>_<w>wbits_<a>abits_<sym|asym>_<narr|full>_<perch|perten>_<tag>``
    where ``<tag>`` is ``AV``, ``LV`` or ``MA`` for the three known quantizer
    classes and empty for any other quantizer.
    """
    quantizer_tags = {
        QAT_ALL_VALUES: 'AV',
        QAT_LAST_VALUE: 'LV',
        QAT_MA: 'MA',
    }
    parts = [
        model_name + '_',
        str(w_bits) + 'wbits_',
        str(a_bits) + 'abits_',
        'sym_' if symmetric else 'asym_',
        'narr_' if narrow_range else 'full_',
        'perch_' if per_axis else 'perten_',
        quantizer_tags.get(quantizer, ''),
    ]
    return ''.join(parts)


def quantization_aware_training(model, save, w_bits, a_bits, symmetric, per_axis, narrow_range, quantizer, batch_size=64, epochs=2):
    """Annotate, quantize and train a Keras model with custom quantizers.

    Every ``Conv2D`` / ``DepthwiseConv2D`` layer of ``model`` is annotated
    with a custom ``QuantizeConfig``; all other layers are left untouched.
    The quantize-aware model is then trained on the CIFAR-10 training split.

    Args:
        model: Keras model to clone and annotate.
        save: If true, save the trained model to ``models/temp/<name>.h5``.
        w_bits: Number of bits for the weight quantizers.
        a_bits: Number of bits for the activation quantizers.
        symmetric: ``symmetric`` flag forwarded to the weight quantizer.
        per_axis: ``per_axis`` flag forwarded to the weight quantizer.
        narrow_range: ``narrow_range`` flag forwarded to the weight quantizer.
        quantizer: Quantizer class used for weights (e.g. ``QAT_ALL_VALUES``).
        batch_size: Training batch size.
        epochs: Number of training epochs.

    Returns:
        The trained quantize-aware model.
    """
    import os  # local import: only needed for the optional save step

    name = _quantized_model_name(
        model.name, w_bits, a_bits, symmetric, narrow_range, per_axis, quantizer)

    # Quantization
    # *****
    quantize_apply = tfmot.quantization.keras.quantize_apply
    quantize_annotate_layer = tfmot.quantization.keras.quantize_annotate_layer
    clone_model = tf.keras.models.clone_model
    quantize_scope = tfmot.quantization.keras.quantize_scope
    MovingAverageQuantizer = tfmot.quantization.keras.quantizers.MovingAverageQuantizer
    LastValueQuantizer = tfmot.quantization.keras.quantizers.LastValueQuantizer

    def make_weight_quantizer():
        # Weight quantizer built from the caller's settings.
        return quantizer(num_bits=w_bits, symmetric=symmetric, narrow_range=narrow_range, per_axis=per_axis)

    def make_activation_quantizer():
        # Activations always use an asymmetric, full-range, per-tensor
        # moving-average quantizer; only the bit width is configurable.
        return MovingAverageQuantizer(num_bits=a_bits, symmetric=False, narrow_range=False, per_axis=False)

    class Quantizer(tfmot.quantization.keras.QuantizeConfig):
        """Base config with fixed 8-bit defaults; subclasses override the getters."""

        # Configure how to quantize weights.
        def get_weights_and_quantizers(self, layer):
            return [(layer.kernel, LastValueQuantizer(num_bits=8, symmetric=True, narrow_range=False, per_axis=False))]

        # Configure how to quantize activations.
        def get_activations_and_quantizers(self, layer):
            return [(layer.activation, MovingAverageQuantizer(num_bits=8, symmetric=False, narrow_range=False, per_axis=False))]

        def set_quantize_weights(self, layer, quantize_weights):
            # One assignment per item returned by `get_weights_and_quantizers`,
            # in the same order.
            layer.kernel = quantize_weights[0]

        def set_quantize_activations(self, layer, quantize_activations):
            # One assignment per item returned by
            # `get_activations_and_quantizers`, in the same order.
            layer.activation = quantize_activations[0]

        # Configure how to quantize outputs (may be equivalent to activations).
        def get_output_quantizers(self, layer):
            return []

        def get_config(self):
            # Nothing to serialize: the settings come from the enclosing
            # function's arguments.
            return {}

    class ConvQuantizer(Quantizer):
        """Quantizes `layer.kernel` with the caller-selected settings."""

        def get_weights_and_quantizers(self, layer):
            return [(layer.kernel, make_weight_quantizer())]

        def get_activations_and_quantizers(self, layer):
            return [(layer.activation, make_activation_quantizer())]

    class DepthwiseQuantizer(Quantizer):
        """Quantizes `layer.depthwise_kernel` with the caller-selected settings."""

        def get_weights_and_quantizers(self, layer):
            return [(layer.depthwise_kernel, make_weight_quantizer())]

        def get_activations_and_quantizers(self, layer):
            return [(layer.activation, make_activation_quantizer())]

    # Instead of simply using quantize_annotate_model or quantize_model we must
    # use quantize_annotate_layer since it's the only one with a
    # quantize_config argument.
    def quantize_all_layers(layer):
        # NOTE(review): DepthwiseConv2D is tested first, presumably because it
        # can also satisfy the Conv2D isinstance check in some Keras versions
        # -- confirm before reordering.
        if isinstance(layer, tf.keras.layers.DepthwiseConv2D):
            return quantize_annotate_layer(layer, quantize_config=DepthwiseQuantizer())
        if isinstance(layer, tf.keras.layers.Conv2D):
            return quantize_annotate_layer(layer, quantize_config=ConvQuantizer())
        return layer

    annotated_model = clone_model(
        model,
        clone_function=quantize_all_layers,
    )

    # The custom configs must be in scope so quantize_apply can resolve them
    # by class name.
    custom_configs = {
        'Quantizer': Quantizer,
        'ConvQuantizer': ConvQuantizer,
        'DepthwiseQuantizer': DepthwiseQuantizer,
    }
    with quantize_scope(custom_configs):
        q_aware_model = quantize_apply(annotated_model)
    # *****

    # Compile and train model
    optimizer = keras.optimizers.Adam(learning_rate=0.001)
    q_aware_model.compile(
        # from_logits=True: the model's final layer is expected to output raw
        # logits (no softmax).
        loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        optimizer=optimizer,
        metrics=['sparse_categorical_accuracy'],
    )

    (train_images, train_labels), _ = keras.datasets.cifar10.load_data()
    q_aware_model.fit(
        train_images,
        train_labels,
        batch_size=batch_size,
        epochs=epochs,
        verbose=1,
        validation_split=0.1,
    )

    if save:
        save_dir = 'models/temp/'
        # Make sure the target directory exists; not every Keras/h5py version
        # creates missing parent directories when writing an .h5 file.
        os.makedirs(save_dir, exist_ok=True)
        q_aware_model.save(save_dir + name + '.h5')

    return q_aware_model
def temp_net():
    """Build a minimal Conv2D -> BatchNorm -> ReLU -> Flatten -> Dense model.

    The model takes 32x32x3 inputs and has 10 outputs. The final Dense layer
    deliberately has no softmax: quantization_aware_training compiles its loss
    with ``from_logits=True``, so the network must emit raw logits. A softmax
    here would be applied a second time inside the loss.

    Returns:
        An uncompiled ``keras.Sequential`` model named ``"temp_net"``.
    """
    return keras.Sequential(
        [
            keras.layers.Conv2D(32, (3, 3), padding='same', input_shape=(32, 32, 3)),
            keras.layers.BatchNormalization(),
            keras.layers.Activation('relu'),
            keras.layers.Flatten(),
            keras.layers.Dense(10),  # raw logits, see docstring
        ],
        name="temp_net",
    )
if __name__ == "__main__":
    # Train the small test network for one epoch with 8-bit, asymmetric,
    # full-range, per-tensor AllValues weight quantization and save the result.
    q_model = quantization_aware_training(
        model=temp_net(),
        save=True,
        w_bits=8,
        a_bits=8,
        symmetric=False,
        narrow_range=False,
        per_axis=False,
        quantizer=QAT_ALL_VALUES,
        batch_size=64,
        epochs=1,
    )