## TensorFlow cookbook

This section includes implementations of a set of common operations in TensorFlow.

1. Get shape
2. Batch gather
3. Beam search
4. Merge
5. Entropy
6. KL-Divergence
7. Make parallel
8. Leaky Relu
9. Batch normalization

### Get shape

```python
import tensorflow as tf


def get_shape(tensor):
  """Returns static shape if available and dynamic shape otherwise."""
  static_shape = tensor.shape.as_list()
  dynamic_shape = tf.unstack(tf.shape(tensor))
  dims = [s[1] if s[0] is None else s[0]
          for s in zip(static_shape, dynamic_shape)]
  return dims
```

### Batch Gather

```python
def batch_gather(tensor, indices):
  """Gather in batch from a tensor of arbitrary size.

  In pseudocode this module will produce the following:
    output[i] = tf.gather(tensor[i], indices[i])

  Args:
    tensor: Tensor of arbitrary size.
    indices: Vector of indices.
  Returns:
    output: A tensor of gathered values.
  """
  shape = get_shape(tensor)
  # Flatten the first two dimensions so a single gather can index into
  # every example of the batch.
  flat_first = tf.reshape(tensor, [shape[0] * shape[1]] + shape[2:])
  indices = tf.convert_to_tensor(indices)
  offset_shape = [shape[0]] + [1] * (indices.shape.ndims - 1)
  offset = tf.reshape(tf.range(shape[0]) * shape[1], offset_shape)
  output = tf.gather(flat_first, indices + offset)
  return output
```

### Beam Search

```python
import tensorflow as tf


def rnn_beam_search(update_fn, initial_state, sequence_length, beam_width,
                    begin_token_id, end_token_id, name="rnn"):
  """Beam-search decoder for recurrent models.

  Args:
    update_fn: Function to compute the next state and logits given the current
               state and ids.
    initial_state: Recurrent model states.
    sequence_length: Length of the generated sequence.
    beam_width: Beam width.
    begin_token_id: Begin token id.
    end_token_id: End token id.
    name: Scope of the variables.
  Returns:
    ids: Output indices.
    logprobs: Output log probabilities.
  """
  batch_size = initial_state.shape.as_list()[0]

  state = tf.tile(tf.expand_dims(initial_state, axis=1), [1, beam_width, 1])

  sel_sum_logprobs = tf.log([[1.] + [0.] * (beam_width - 1)])

  ids = tf.tile([[begin_token_id]], [batch_size, beam_width])
  sel_ids = tf.zeros([batch_size, beam_width, 0], dtype=ids.dtype)

  mask = tf.ones([batch_size, beam_width], dtype=tf.float32)

  for i in range(sequence_length):
    with tf.variable_scope(name, reuse=True if i > 0 else None):

      state, logits = update_fn(state, ids)
      logits = tf.nn.log_softmax(logits)

      # Accumulate log-probabilities; finished beams contribute nothing
      # thanks to the mask.
      sum_logprobs = (
          tf.expand_dims(sel_sum_logprobs, axis=2) +
          (logits * tf.expand_dims(mask, axis=2)))

      num_classes = logits.shape.as_list()[-1]

      # Select the top beam_width candidates over all beams and classes.
      sel_sum_logprobs, indices = tf.nn.top_k(
          tf.reshape(sum_logprobs, [batch_size, num_classes * beam_width]),
          k=beam_width)

      # Recover token ids and the beams they originated from.
      ids = indices % num_classes
      beam_ids = indices // num_classes

      # Reorder states and partial sequences to follow the selected beams.
      state = batch_gather(state, beam_ids)
      sel_ids = tf.concat([batch_gather(sel_ids, beam_ids),
                           tf.expand_dims(ids, axis=2)], axis=2)

      # Mark beams that emitted the end token as finished.
      mask = (batch_gather(mask, beam_ids) *
              tf.to_float(tf.not_equal(ids, end_token_id)))

  return sel_ids, sel_sum_logprobs
```
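The decoder above leaves the transition function to the caller. Below is a minimal sketch of what `update_fn` might look like when wired to a GRU cell: it folds the beam dimension into the batch, advances the cell one step, and reshapes the logits back to `[batch, beam_width, vocab_size]`. The vocabulary size, state size, embedding table, `output_proj` layer, and token ids here are illustrative assumptions, not part of the recipe.

```python
import tensorflow as tf

vocab_size = 1000   # hypothetical vocabulary size
state_dim = 256     # hypothetical recurrent state size

cell = tf.nn.rnn_cell.GRUCell(state_dim)
embedding = tf.get_variable("embedding", [vocab_size, state_dim])
output_proj = tf.layers.Dense(vocab_size, name="output_proj")

def update_fn(state, ids):
  """Advances every beam one step and returns (new_state, logits)."""
  batch, beams = get_shape(ids)
  # Fold the beam dimension into the batch so the cell sees a plain batch.
  flat_state = tf.reshape(state, [batch * beams, state_dim])
  flat_inputs = tf.nn.embedding_lookup(embedding, tf.reshape(ids, [-1]))
  flat_outputs, flat_state = cell(flat_inputs, flat_state)
  logits = tf.reshape(output_proj(flat_outputs), [batch, beams, vocab_size])
  new_state = tf.reshape(flat_state, [batch, beams, state_dim])
  return new_state, logits

initial_state = tf.zeros([8, state_dim])  # e.g. an encoder's final state
sel_ids, logprobs = rnn_beam_search(
    update_fn, initial_state, sequence_length=20, beam_width=4,
    begin_token_id=1, end_token_id=2)
```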
### Merge

```python
import tensorflow as tf


def merge(tensors, units, activation=tf.nn.relu, name=None, **kwargs):
  """Merge features with broadcasting support.

  This operation concatenates multiple features of varying length and applies
  non-linear transformation to the outcome.

  Example:
    a = tf.zeros([m, 1, d1])
    b = tf.zeros([1, n, d2])
    c = merge([a, b], d3)  # shape of c would be [m, n, d3].

  Args:
    tensors: A list of tensors with the same rank.
    units: Number of units in the projection function.
  """
  with tf.variable_scope(name, default_name="merge"):
    # Apply linear projection to input tensors.
    projs = []
    for i, tensor in enumerate(tensors):
      proj = tf.layers.dense(
          tensor, units, name="proj_%d" % i, **kwargs)
      projs.append(proj)

    # Compute sum of tensors.
    result = projs.pop()
    for proj in projs:
      result = result + proj

    # Apply nonlinearity.
    if activation:
      result = activation(result)
  return result
```

### Entropy

```python
import tensorflow as tf


def softmax_entropy(logits, dim=-1):
  """Computes entropy over the specified dimension."""
  plogp = tf.nn.softmax(logits, dim) * tf.nn.log_softmax(logits, dim)
  return -tf.reduce_sum(plogp, dim)
```

### KL-Divergence

```python
def gaussian_kl(q, p=(0., 0.)):
  """Computes KL divergence between two isotropic Gaussian distributions.

  To ensure numerical stability, this op uses mu, log(sigma^2) to represent
  the distribution. If p is not provided, it's assumed to be a unit Gaussian.

  Args:
    q: A tuple (mu, log(sigma^2)) representing a multivariate Gaussian.
    p: A tuple (mu, log(sigma^2)) representing a multivariate Gaussian.
  Returns:
    A tensor representing KL(q, p).
  """
  mu1, log_sigma1_sq = q
  mu2, log_sigma2_sq = p
  return tf.reduce_sum(
      0.5 * (log_sigma2_sq - log_sigma1_sq +
             tf.exp(log_sigma1_sq - log_sigma2_sq) +
             tf.square(mu1 - mu2) / tf.exp(log_sigma2_sq) -
             1), axis=-1)
```

### Make parallel

```python
def make_parallel(fn, num_gpus, **kwargs):
  """Parallelize given model on multiple gpu devices.

  Args:
    fn: Arbitrary function that takes a set of input tensors and outputs a
        single tensor. First dimension of inputs and output tensor are
        assumed to be batch dimension.
    num_gpus: Number of GPU devices.
    **kwargs: Keyword arguments to be passed to the model.
  Returns:
    A tensor corresponding to the model output.
  """
  # Split every input tensor along the batch dimension, one slice per GPU.
  in_splits = {}
  for k, v in kwargs.items():
    in_splits[k] = tf.split(v, num_gpus)

  out_split = []
  for i in range(num_gpus):
    with tf.device(tf.DeviceSpec(device_type="GPU", device_index=i)):
      with tf.variable_scope(tf.get_variable_scope(), reuse=tf.AUTO_REUSE):
        out_split.append(fn(**{k: v[i] for k, v in in_splits.items()}))

  return tf.concat(out_split, axis=0)
```

### Leaky relu

```python
def leaky_relu(tensor, alpha=0.1):
  """Computes the leaky rectified linear activation."""
  return tf.maximum(tensor, alpha * tensor)
```
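As a usage sketch for `make_parallel` (not part of the original recipe): assuming a hypothetical per-example loss function `model` and placeholder inputs whose batch size is divisible by the number of GPUs, each device processes one slice of the batch and the per-example losses are averaged on the host.

```python
import tensorflow as tf

def model(x, y):
  """Hypothetical model returning a per-example loss (batch is dim 0)."""
  hidden = tf.layers.dense(x, 128, activation=tf.nn.relu, name="hidden")
  logits = tf.layers.dense(hidden, 10, name="logits")
  return tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=logits)

x = tf.placeholder(tf.float32, [None, 784])
y = tf.placeholder(tf.int64, [None])

# Each GPU receives one slice of the batch; variables are shared via AUTO_REUSE.
losses = make_parallel(model, num_gpus=2, x=x, y=y)
loss = tf.reduce_mean(losses)
train_op = tf.train.AdamOptimizer(1e-3).minimize(loss)
```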
### Batch normalization

```python
def batch_normalization(tensor, training=False, epsilon=0.001, momentum=0.9,
                        fused_batch_norm=False, name=None):
  """Performs batch normalization on given 4-D tensor.

  The features are assumed to be in NHWC format. Note that you need to run
  UPDATE_OPS in order for this function to perform correctly, e.g.:

    with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
      train_op = optimizer.minimize(loss)

  Based on: https://arxiv.org/abs/1502.03167
  """
  with tf.variable_scope(name, default_name="batch_normalization"):
    channels = tensor.shape.as_list()[-1]
    axes = list(range(tensor.shape.ndims - 1))

    beta = tf.get_variable(
        'beta', channels, initializer=tf.zeros_initializer())
    gamma = tf.get_variable(
        'gamma', channels, initializer=tf.ones_initializer())

    avg_mean = tf.get_variable(
        "avg_mean", channels, initializer=tf.zeros_initializer(),
        trainable=False)
    avg_variance = tf.get_variable(
        "avg_variance", channels, initializer=tf.ones_initializer(),
        trainable=False)

    if training:
      if fused_batch_norm:
        mean, variance = None, None
      else:
        mean, variance = tf.nn.moments(tensor, axes=axes)
    else:
      mean, variance = avg_mean, avg_variance

    if fused_batch_norm:
      tensor, mean, variance = tf.nn.fused_batch_norm(
          tensor, scale=gamma, offset=beta, mean=mean, variance=variance,
          epsilon=epsilon, is_training=training)
    else:
      tensor = tf.nn.batch_normalization(
          tensor, mean, variance, beta, gamma, epsilon)

    if training:
      # Track the moving averages used at inference time.
      update_mean = tf.assign(
          avg_mean, avg_mean * momentum + mean * (1.0 - momentum))
      update_variance = tf.assign(
          avg_variance, avg_variance * momentum + variance * (1.0 - momentum))

      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mean)
      tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_variance)

  return tensor
```
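To close, here is a minimal sketch of the recipe in use inside a small convolutional block, together with the UPDATE_OPS wiring mentioned in the docstring. The layer sizes, optimizer, and placeholders are illustrative assumptions, not part of the recipe.

```python
import tensorflow as tf

images = tf.placeholder(tf.float32, [None, 32, 32, 3])
labels = tf.placeholder(tf.int64, [None])

# One conv block: convolution (no bias), batch norm, then leaky ReLU.
net = tf.layers.conv2d(images, 64, 3, padding="same", use_bias=False,
                       name="conv1")
net = batch_normalization(net, training=True, name="bn1")
net = leaky_relu(net)

net = tf.reduce_mean(net, axis=[1, 2])  # global average pooling
logits = tf.layers.dense(net, 10, name="logits")
loss = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=labels,
                                                   logits=logits))

# Run the moving-average updates registered in UPDATE_OPS with the train op.
with tf.control_dependencies(tf.get_collection(tf.GraphKeys.UPDATE_OPS)):
  train_op = tf.train.MomentumOptimizer(0.1, 0.9).minimize(loss)
```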