def load_language(app, tokenizer_service, tag, model_dir):
    """Load the trained model for one language and register it with ``app``.

    Args:
        app: object exposing ``add_language(tag, context)``.
        tokenizer_service: passed through to ``Tokenizer``.
        tag: language tag; also selects the ``./default.<tag>.conf`` file.
        model_dir: directory containing ``model.conf`` and the ``best``
            checkpoint.
    """
    config_paths = [
        './default.conf',
        './default.' + tag + '.conf',
        os.path.join(model_dir, 'model.conf'),
    ]
    config = Config.load(config_paths)
    model = create_model(config)

    # A fresh graph and a session bound to it are created on every call.
    graph = tf.Graph()
    session = tf.Session(graph=graph)
    with graph.as_default():
        # Everything is pinned to the CPU: inference runs on single inputs,
        # so there is little to gain from going through the GPU.
        with tf.device('/cpu:0'):
            model.build()
            loader = tf.train.Saver()

        with session.as_default():
            checkpoint_path = os.path.join(model_dir, 'best')
            loader.restore(session, checkpoint_path)

    tokenizer = Tokenizer(tokenizer_service, tag)
    context = LanguageContext(tag, tokenizer, session, config, model)
    app.add_language(tag, context)
    print('Loaded language ' + tag)
def __init__(self, config=None, device="/gpu:0"):
    """Set up dtype, initializer and bookkeeping state from ``config``.

    Args:
        config: mapping of options, wrapped in ``hc.Config``. The keys read
            here are ``dtype``, ``initializer``, ``orthogonal_gain``,
            ``random_stddev`` and ``device``. ``None`` (the default) is
            treated as an empty mapping.
        device: NOTE(review): accepted but never read; ``self.device`` is
            taken from ``config.device`` instead. Left unchanged so existing
            placement behavior is preserved -- confirm whether this should
            act as a fallback.
    """
    # A fresh dict per call avoids sharing one mutable default between
    # every instance constructed without an explicit config.
    config = hc.Config({} if config is None else config)

    # The ``or`` fallbacks assume hc.Config yields a falsy value for
    # missing keys -- TODO confirm.
    dtype = config.dtype or "float32"
    initializer = config.initializer or 'orthogonal'
    orthogonal_gain = config.orthogonal_gain or 1.0
    random_stddev = config.random_stddev or 0.02

    self.dtype = self.parse_dtype(dtype)
    self.scope_count = 0
    self.description = ''
    self.weights = []
    self.biases = []
    self.device = config.device
    self.initialized = False
    self._reuse = False

    # Any initializer name other than 'orthogonal' selects the random one.
    if initializer == 'orthogonal':
        self.initializer = self.orthogonal_initializer(orthogonal_gain)
    else:
        self.initializer = self.random_initializer(random_stddev)
def variable_on_worker_level(name, shape, initializer):
    r'''
    Create (or fetch) a variable through ``tf.get_variable`` under a device
    scope chosen at worker level.

    Without parameter-server hosts the module-level ``worker_device`` is used
    directly; otherwise placement is delegated to a replica device setter
    built from ``worker_device`` and ``cluster``.
    '''
    if len(FLAGS.ps_hosts) > 0:
        placement = tf.train.replica_device_setter(
            worker_device=worker_device, cluster=cluster)
    else:
        placement = worker_device

    with tf.device(placement):
        # Returns the existing variable when one with this name is reusable.
        return tf.get_variable(
            name=name, shape=shape, initializer=initializer)
def create_optimizer():
    """Build the Adam optimizer configured from the command-line ``FLAGS``."""
    adam_settings = dict(
        learning_rate=FLAGS.learning_rate,
        beta1=FLAGS.beta1,
        beta2=FLAGS.beta2,
        epsilon=FLAGS.epsilon,
    )
    return tf.train.AdamOptimizer(**adam_settings)


# Towers
# ======

# In order to properly make use of multiple GPUs, one must introduce new
# abstractions, not present when using a single GPU, that facilitate the
# multi-GPU use case.

# In particular, one must introduce a means to isolate the inference and
# gradient calculations on the various GPUs.
# The abstraction we introduce for this purpose is called a 'tower'.
# A tower is specified by two properties:

# * **Scope** - A scope, as provided by `tf.name_scope()`,
# is a means to isolate the operations within a tower.
# For example, all operations within 'tower 0' could have their name
# prefixed with `tower_0/`.
# * **Device** - A hardware device, as provided by `tf.device()`,
# on which all operations within the tower execute.
# For example, all operations of 'tower 0' could execute on the first GPU
# `tf.device('/gpu:0')`.
def trainable_initial_state(self, batch_size):
    """Build a learnable initial (c, h) state for the LSTM cell.

    One ``[1, num_units]`` variable is created per state component and tiled
    along the first axis to ``batch_size`` rows.

    :param batch_size: number of samples per batch
    :return: LSTMStateTuple
    """
    def _tiled_state_variable(batch_size, state_size, trainable=True,
                              initializer=tf.random_normal_initializer()):
        # The variable is created under a CPU device scope; the enclosing
        # variable scope keeps the two 'initial_state' variables apart.
        with tf.device('/cpu:0'):
            base = tf.get_variable('initial_state',
                                   shape=[1, state_size],
                                   dtype=tf.float32,
                                   trainable=trainable,
                                   initializer=initializer)
            multiples = tf.stack([batch_size] + [1])
            return tf.tile(base, multiples)

    components = []
    for scope_name in ('initial_c', 'initial_h'):
        with tf.variable_scope(scope_name):
            components.append(
                _tiled_state_variable(batch_size, self._num_units))

    return tf.contrib.rnn.LSTMStateTuple(*components)
def assign_sub(self, delta, name=None):
    """Mimic ``assign_sub`` on the variable, routing the delta via staging.

    Args:
        delta: value pushed into a staging buffer; it is pumped later.
        name: currently ignored; the StagingArea only supports setting the
            shared name, not the names of the individual ops it uses.

    Returns:
        The ``assign_sub`` op on the real variable, fed from the staging
        buffer. The colocation constraint is reapplied on it.
    """
    del name  # Deliberately unused, see docstring.

    staged_var = self.var_stage_get
    # colocate_with(None, True) clears the colocation constraints so the
    # staging ops can be placed on the staged variable's device.
    with ops.colocate_with(None, True):
        with tf.device(staged_var.device):
            staging_area = data_flow_ops.StagingArea(
                [staged_var.dtype], shapes=[staged_var.shape])
            put_op = staging_area.put([delta])
            self.variable_mgr.staging_delta_ops.append(put_op)
            staged_delta = staging_area.get()[0]

    # Built outside the scopes above so the constraint is reapplied.
    return self.real_var.assign_sub(staged_delta)
def build_all_reduce_device_prefixes(job_name, num_tasks):
    """Build list of device prefix names for all_reduce.

    Args:
        job_name: 'worker', 'ps' or 'localhost'.
        num_tasks: number of jobs across which device names should be
            generated.

    Returns:
        A list of device name prefix strings. Each element spells out the
        full host name without adding the device,
        e.g. '/job:worker/task:0'.

    Raises:
        AssertionError: if ``job_name`` is 'localhost' and ``num_tasks`` is
            not 1.
    """
    if job_name == 'localhost':
        # Raised explicitly (not via ``assert``) so the check still runs
        # under ``python -O``; the exception type is kept for callers.
        if num_tasks != 1:
            raise AssertionError(
                'localhost supports exactly one task, got %r' % (num_tasks,))
        return ['/job:%s' % job_name]
    return ['/job:%s/task:%d' % (job_name, d) for d in range(num_tasks)]
def unpack_grad_tuple(gv, gpt):
    """Split a packed gradient back into its per-variable pieces.

    Args:
        gv: A (grad, var) pair to be unpacked.
        gpt: A GradPackTuple describing the packing operation that produced
            gv; its ``shapes`` and ``vars`` are read here.

    Returns:
        A list of (grad, var) pairs corresponding to the values that were
        originally packed into gv, maybe following subsequent operations
        like reduction.
    """
    packed_grad = gv[0]
    widths = [shape.num_elements() for shape in gpt.shapes]
    # NOTE(review): the device is read from ``gv[0][0]`` (an indexed element
    # of the packed gradient), exactly as before.
    with tf.device(gv[0][0].device):
        with tf.name_scope('unpack'):
            pieces = tf.split(packed_grad, widths)
            return [
                (tf.reshape(piece, gpt.shapes[idx]), gpt.vars[idx])
                for idx, piece in enumerate(pieces)
            ]
def preprocess_device_grads(self, device_grads):
    """Preprocess the device gradients prior to applying them.

    This base implementation only signals that subclasses must override it.

    Args:
        device_grads: List of lists of (gradient, variable) tuples.
            device_grads[t][g] = (gradient, variable), where t is the index
            of the tower and g is the index of the gradient-variable pair.

    Returns:
        a tuple of (apply_gradients_devices, gradient_state).
        gradient_state is an opaque structure that should be passed to
        get_gradients_to_apply() and append_apply_gradients_ops() (in that
        order). apply_gradients_devices is a list of devices where the
        gradients will be applied with get_gradients_to_apply() and
        append_apply_gradients_ops().

    Raises:
        AssertionError: always, in this base implementation.
    """
    del device_grads  # unused by this implementation
    # Raised explicitly: ``assert False`` disappears under ``python -O`` and
    # the method would then silently return None.
    raise AssertionError('Must be implemented in subclass')
def trainable_variables_on_device(self, rel_device_num, abs_device_num,
                                  writable=False):
    """Return the trainable variables that belong to one device.

    Args:
        rel_device_num: local worker device index (unused here).
        abs_device_num: global graph device index; selects variables whose
            name starts with ``'v<abs_device_num>/'``.
        writable: whether to get a reference to the underlying variable
            (unused here).

    Returns:
        A list of trainable variables: the per-tower subset when each tower
        has its own variables, otherwise all trainable variables.
    """
    del rel_device_num, writable
    if not self.each_tower_has_variables():
        return tf.trainable_variables()

    tower_prefix = 'v%s/' % abs_device_num
    return [
        var for var in tf.trainable_variables()
        if var.name.startswith(tower_prefix)
    ]
def append_apply_gradients_ops(self, gradient_state, opt, grads,
                               training_ops, loss_scale_params):
    """Append ops that apply ``grads`` and copy results back to each tower.

    Args:
        gradient_state: the second result of preprocess_device_grads, used
            here as ``device_grads[tower][i] = (grad, var)``.
        opt: optimizer providing ``apply_gradients``.
        grads: iterable of (gradient, variable) pairs to apply.
        training_ops: list handed to the loss-scale helper to be extended.
        loss_scale_params: forwarded unchanged to the loss-scale helper.
    """
    device_grads = gradient_state

    def get_apply_gradients_ops_func():
        """Returns a list of ops for updating gradients."""
        copy_back_ops = []
        num_towers = len(self.benchmark_cnn.devices)
        # For each variable, apply the combined gradients for this server on
        # the parameter server, and then wait for all other servers to do
        # this before reading the value back.
        for var_idx, (grad, var) in enumerate(grads):
            apply_op = opt.apply_gradients([(grad, var)])
            barrier = self.benchmark_cnn.add_sync_queues_and_barrier(
                'replicate_variable_%s' % var_idx, [apply_op])
            with tf.control_dependencies([barrier]):
                with tf.device(self.benchmark_cnn.cpu_device):
                    fresh_value = var.read_value()
                    for tower_idx in range(num_towers):
                        tower_var = device_grads[tower_idx][var_idx][1]
                        copy_back_ops.append(tower_var.assign(fresh_value))
        return copy_back_ops

    variable_mgr_util.append_gradients_with_loss_scale(
        training_ops, get_apply_gradients_ops_func, loss_scale_params,
        self.grad_has_inf_nan)
def fix_variables(self, sess, pretrained_model):
    """Restore three pretrained VGG16 weights and assign them into the model.

    The checkpoint tensors are first restored into temporary non-trainable
    variables, then assigned into ``self._variables_to_fix``: fc6/fc7 are
    reshaped to the target variable's shape and the first conv kernel is
    reversed along axis 2 (RGB to BGR, per the original comment).

    Args:
        sess: session used for restoring and running the assignments.
        pretrained_model: checkpoint path handed to ``Saver.restore``.
    """
    print('Fix VGG16 layers..')
    with tf.variable_scope('Fix_VGG16'):
        with tf.device("/cpu:0"):
            # fix the vgg16 issue from conv weights to fc weights
            # fix RGB to BGR
            fc6_conv = tf.get_variable(
                "fc6_conv", [7, 7, 512, 4096], trainable=False)
            fc7_conv = tf.get_variable(
                "fc7_conv", [1, 1, 4096, 4096], trainable=False)
            conv1_rgb = tf.get_variable(
                "conv1_rgb", [3, 3, 3, 64], trainable=False)

            checkpoint_map = {
                self._scope + "/fc6/weights": fc6_conv,
                self._scope + "/fc7/weights": fc7_conv,
                self._scope + "/conv1/conv1_1/weights": conv1_rgb,
            }
            restorer_fc = tf.train.Saver(checkpoint_map)
            restorer_fc.restore(sess, pretrained_model)

            fc6_target = self._variables_to_fix[self._scope + '/fc6/weights:0']
            sess.run(tf.assign(
                fc6_target, tf.reshape(fc6_conv, fc6_target.get_shape())))

            fc7_target = self._variables_to_fix[self._scope + '/fc7/weights:0']
            sess.run(tf.assign(
                fc7_target, tf.reshape(fc7_conv, fc7_target.get_shape())))

            conv1_target = self._variables_to_fix[
                self._scope + '/conv1/conv1_1/weights:0']
            sess.run(tf.assign(conv1_target, tf.reverse(conv1_rgb, [2])))
def run_bilateral_slice_apply(self, dev, grid_data, guide_data, input_data,
                              has_offset=False):
    """Evaluate ``ops.bilateral_slice_apply`` on ``dev`` and return the result.

    Args:
        dev: device string for ``tf.device``.
        grid_data, guide_data, input_data: values converted to float32
            tensors named 'grid', 'guide' and 'input'.
        has_offset: forwarded to the op.

    Returns:
        The evaluated output of the op.
    """
    with tf.device(dev):
        grid = tf.convert_to_tensor(grid_data, name='grid', dtype=tf.float32)
        guide = tf.convert_to_tensor(
            guide_data, name='guide', dtype=tf.float32)
        image = tf.convert_to_tensor(
            input_data, name='input', dtype=tf.float32)
        sliced = ops.bilateral_slice_apply(
            grid, guide, image, has_offset=has_offset)

    with self.test_session() as sess:
        return sess.run(sliced)
def __init__(self, num_parameter_servers=0, ps_device='/job:ps',
             placement='CPU:0'):
    """Initialize VariableDeviceChooser.

    Args:
        num_parameter_servers: number of parameter servers.
        ps_device: string representing the parameter server device.
        placement: string representing the placement of the variable,
            either CPU:0 or GPU:0. Ignored (forced to CPU:0) whenever
            parameter servers are used.
    """
    self._num_ps = num_parameter_servers
    self._ps_device = ps_device
    if num_parameter_servers == 0:
        self._placement = placement
    else:
        # With parameter servers the variables always live on their CPU.
        self._placement = 'CPU:0'
    self._next_task_id = 0
def global_step(device=''):
    """Return the global step variable, creating it on first use.

    Args:
        device: Optional device to place the variable. It can be a string
            or a function that is called to get the device for the variable.

    Returns:
        the tensor representing the global step variable.
    """
    existing = tf.get_collection(tf.GraphKeys.GLOBAL_STEP)
    if existing:
        return existing[0]

    # Not created yet: register it in every collection it has to appear in.
    step_collections = [
        VARIABLES_TO_RESTORE,
        tf.GraphKeys.VARIABLES,
        tf.GraphKeys.GLOBAL_STEP,
    ]
    with tf.device(variable_device(device, 'global_step')):
        return tf.get_variable('global_step',
                               shape=[],
                               dtype=tf.int64,
                               initializer=tf.zeros_initializer,
                               trainable=False,
                               collections=step_collections)
def build_all(self, param_avg=False):
    """Build input, model, loss and optimizer nodes, exactly once.

    Args:
        param_avg: when true, the averaging op from ``get_average_var`` is
            chained with the optimizer step into one 'train_step' no-op.

    Returns:
        ``self``.

    Raises:
        Exception: if the graph was already built.
    """
    if self._has_built_all:
        raise Exception('Only call build_all or build_eval once.')
    self._has_built_all = True

    with tf.device(self.get_device_fn()), tf.variable_scope(self.name):
        inputs = self.build_input()
        outputs = self.build(inputs)
        loss = self.build_loss(inputs, outputs)
        train_step = self.build_optim(loss)
        if param_avg:
            ema_op, self._avg_var = self.get_average_var()
            # The registered step only completes after both have run.
            with tf.control_dependencies([train_step, ema_op]):
                train_step = tf.no_op(name='train_step')
        self.register_var('train_step', train_step)
    return self
def load_new_model(sess, restore_path, nlayers, device='/cpu:0'):
    """Build a ``ResNetImageNetModel`` on ``device`` and restore its weights.

    The model is built twice on the same input, so two output nodes are
    returned.

    Args:
        sess: session used to restore the checkpoint.
        restore_path: checkpoint path for ``Saver.restore``.
        nlayers: passed to ``get_layers`` to pick the per-stage layer counts.
        device: device scope for graph construction.

    Returns:
        Tuple ``(resnet, inp_var, out_var, out_var2)``.
    """
    from resnet_imagenet_model import ResNetImageNetModel

    with tf.device(device):
        logger = tfplus.utils.logger.get()
        with logger.verbose_level(2):
            model = ResNetImageNetModel()
            options = {
                'inp_depth': 3,
                'layers': get_layers(nlayers),
                'strides': [1, 2, 2, 2],
                'channels': [64, 256, 512, 1024, 2048],
                'bottleneck': True,
                'shortcut': 'projection',
                'compatible': True,
                'weight_decay': 1e-4,
                'subtract_mean': True,
                'trainable': False
            }
            resnet = model.set_all_options(options)
            inp_var = resnet.build_input()
            out_var = resnet.build(inp_var)
            out_var2 = resnet.build(inp_var)
        saver = tf.train.Saver(resnet.get_save_var_dict())
        saver.restore(sess, restore_path)
    return resnet, inp_var, out_var, out_var2
def load_wrapper_model(sess, restore_path, nlayers, device='/cpu:0'):
    """Build a ``ResNetImageNetModelWrapper`` and restore the wrapped net.

    Args:
        sess: session used to restore the checkpoint.
        restore_path: checkpoint path for ``Saver.restore``.
        nlayers: passed to ``get_layers`` to pick the per-stage layer counts.
        device: device scope for graph construction.

    Returns:
        Tuple ``(wrapper.res_net, inp_var, out_var['y_out'])``.
    """
    from resnet_imagenet_model_wrapper import ResNetImageNetModelWrapper

    with tf.device(device):
        logger = tfplus.utils.logger.get()
        with logger.verbose_level(2):
            wrapper = ResNetImageNetModelWrapper()
            options = {
                'inp_depth': 3,
                'layers': get_layers(nlayers),
                'strides': [1, 2, 2, 2],
                'channels': [64, 256, 512, 1024, 2048],
                'bottleneck': True,
                'shortcut': 'projection',
                'compatible': True,
                'wd': 1e-4,
                'subtract_mean': True,
                'trainable': False
            }
            resnet = wrapper.set_all_options(options)
            inp_var = resnet.build_input()
            out_var = resnet.build(inp_var)
        # Only the wrapped network's variables are restored.
        saver = tf.train.Saver(resnet.res_net.get_save_var_dict())
        saver.restore(sess, restore_path)
    return resnet.res_net, inp_var, out_var['y_out']
def build_input(self):
    """Create the shared and per-replica input nodes.

    One ``x_<i>`` / ``y_gt_<i>`` pair is added per replica under the name
    scope ``replica_<i>`` and device ``/gpu:<i>``; a single ``phase_train``
    input is shared. The concatenation of the mean-restored, 255-scaled
    inputs is registered as ``orig_x``.

    Returns:
        dict mapping input names to the created input nodes.
    """
    results = {}
    results['phase_train'] = self.add_input_var('phase_train', None, 'bool')
    inp_depth = self.get_option('inp_depth')
    orig_x = []
    # ``range`` (not ``xrange``) so this runs on Python 3 without a shim.
    for ii in range(self.num_replica):
        with tf.name_scope('%s_%d' % ('replica', ii)):
            with tf.device('/gpu:{}'.format(ii)):
                x_name = 'x_{}'.format(ii)
                x_ = self.add_input_var(
                    x_name, [None, None, None, inp_depth], 'float')
                results[x_name] = x_

                y_name = 'y_gt_{}'.format(ii)
                results[y_name] = self.add_input_var(
                    y_name, [None, NUM_CLS], 'float')

                # Every replica uses the image mean of the first sub-model.
                orig_x.append(
                    (x_ + self.sub_models[0].res_net._img_mean) / 255.0)
    # NOTE(review): ``tf.concat(axis, values)`` is the pre-1.0 argument
    # order; left untouched to match the TensorFlow version in use.
    self.register_var('orig_x', tf.concat(0, orig_x))
    return results
def build(self, inp):
    """Build every replica's sub-model and concatenate their outputs.

    Args:
        inp: dict from ``build_input`` holding ``x_<i>``, ``y_gt_<i>`` and
            ``phase_train``.

    Returns:
        dict with ``'y_out'``: the replicas' ``y_out`` concatenated on
        axis 0. ``score_out`` is registered as a variable as well.
    """
    # Divide input equally.
    self.lazy_init_var()
    replica_inputs = []
    replica_outputs = []
    # ``range`` (not ``xrange``) so this runs on Python 3 without a shim.
    for ii in range(self.num_replica):
        with tf.name_scope('%s_%d' % ('replica', ii)):
            with tf.device('/gpu:{}'.format(ii)):
                # Reuse is switched on before each sub-model is built.
                tf.get_variable_scope().reuse_variables()
                inp_ = {
                    'x': inp['x_{}'.format(ii)],
                    'y_gt': inp['y_gt_{}'.format(ii)],
                    'phase_train': inp['phase_train']
                }
                replica_outputs.append(self.sub_models[ii].build(inp_))
                replica_inputs.append(inp_)
    self.output_list = replica_outputs
    self.input_list = replica_inputs

    # NOTE(review): ``tf.concat(axis, values)`` is the pre-1.0 argument
    # order; left untouched to match the TensorFlow version in use.
    y_out = tf.concat(0, [oo['y_out'] for oo in replica_outputs])
    self.register_var('y_out', y_out)
    score_out = tf.concat(
        0, [mm.get_var('score_out') for mm in self.sub_models])
    self.register_var('score_out', score_out)
    return {'y_out': y_out}
def build_optim(self, loss):
    """Build the training op from the averaged tower gradients.

    Args:
        loss: not read here; the gradients come from ``self.tower_grads``.

    Returns:
        An op grouping the gradient application and the update of the
        moving averages (decay 0.999) of all trainable variables.
    """
    step = self.global_step
    learn_rate = self.learn_rate  # read as before; not used below

    # Averaging the per-tower gradients is the synchronization point
    # across all towers.
    mean_grads = self.average_gradients(self.tower_grads)

    # Apply the gradients to adjust the shared variables.
    apply_op = self.opt.apply_gradients(mean_grads, global_step=step)

    # Track the moving averages of all trainable variables.
    averager = tf.train.ExponentialMovingAverage(0.999, step)
    averages_op = averager.apply(tf.trainable_variables())

    # Group all updates into a single train op.
    return tf.group(apply_op, averages_op)
def __init__(self, hps, gpu_mode=True, reuse=False):
    """Initializer for the SketchRNN model.

    Args:
        hps: a HParams object containing model hyperparameters.
        gpu_mode: a boolean; when False the model is built under a
            '/cpu:0' device scope.
        reuse: a boolean that when true, attempts to reuse variables.
    """
    self.hps = hps
    with tf.variable_scope('vector_rnn', reuse=reuse):
        if gpu_mode:
            tf.logging.info('Model using gpu.')
            self.build_model(hps)
        else:
            with tf.device('/cpu:0'):
                tf.logging.info('Model using cpu.')
                self.build_model(hps)
def get_device_setter(num_parameter_servers, num_workers):
    """Get a device setter given number of servers in the cluster.

    Given the numbers of parameter servers and workers, construct a device
    setter object using ClusterSpec. Host addresses are parsed from
    ``FLAGS.ps_hosts`` and ``FLAGS.worker_hosts``.

    Args:
        num_parameter_servers: Number of parameter servers
        num_workers: Number of workers

    Returns:
        Device setter object.

    Raises:
        AssertionError: if either count differs from the number of hosts
            found in the corresponding flag.
    """
    ps_hosts = re.findall(r'[\w\.:]+', FLAGS.ps_hosts)  # split address
    worker_hosts = re.findall(r'[\w\.:]+', FLAGS.worker_hosts)  # split address

    # Raised explicitly (not via ``assert``) so the checks still run under
    # ``python -O``; the exception type is kept for callers.
    if num_parameter_servers != len(ps_hosts):
        raise AssertionError(
            'expected %r parameter server hosts, found %d'
            % (num_parameter_servers, len(ps_hosts)))
    if num_workers != len(worker_hosts):
        raise AssertionError(
            'expected %r worker hosts, found %d'
            % (num_workers, len(worker_hosts)))

    cluster_spec = tf.train.ClusterSpec(
        {"ps": ps_hosts, "worker": worker_hosts})

    # Get device setter from the cluster spec
    return tf.train.replica_device_setter(cluster=cluster_spec)
def main():
    """Train a two-variable toy model on the CPU under a Supervisor.

    The session is configured with an explicit GPU option set (visible
    device taken from ``FLAGS.gpu_id``) and device placement logging.
    """
    # Graph
    with tf.device('/cpu:0'):
        a = tf.Variable(tf.truncated_normal(shape=[2]), dtype=tf.float32)
        b = tf.Variable(tf.truncated_normal(shape=[2]), dtype=tf.float32)
        c = a + b

        target = tf.constant(100., shape=[2], dtype=tf.float32)
        loss = tf.reduce_mean(tf.square(c - target))

        opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss)

    # Session
    sv = tf.train.Supervisor(logdir='/tmp/mydir')
    gpu_options = tf.GPUOptions(
        allow_growth=True,
        allocator_type="BFC",
        visible_device_list="%d" % FLAGS.gpu_id)
    config = tf.ConfigProto(
        gpu_options=gpu_options,
        allow_soft_placement=False,
        device_count={'GPU': 1},
        log_device_placement=True)
    sess = sv.prepare_or_wait_for_session(config=config)

    for step in range(1000):
        sess.run(opt)
        # Report the current sum every tenth step.
        if step % 10 == 0:
            print(sess.run(c))
        time.sleep(.1)
def main():
    """Train a two-variable toy model on the CPU under a default Supervisor."""
    # Graph
    with tf.device('/cpu:0'):
        a = tf.Variable(tf.truncated_normal(shape=[2]), dtype=tf.float32)
        b = tf.Variable(tf.truncated_normal(shape=[2]), dtype=tf.float32)
        c = a + b

        target = tf.constant(100., shape=[2], dtype=tf.float32)
        loss = tf.reduce_mean(tf.square(c - target))

        opt = tf.train.GradientDescentOptimizer(.0001).minimize(loss)

    # Session
    sv = tf.train.Supervisor()
    sess = sv.prepare_or_wait_for_session()

    for step in range(1000):
        sess.run(opt)
        # Report the current sum every tenth step.
        if step % 10 == 0:
            print(sess.run(c))
        time.sleep(.1)
def fused_birnn(fused_rnn, inputs, sequence_length, initial_state=(None, None),
                dtype=None, scope=None, time_major=False,
                backward_device=None):
    """Run ``fused_rnn`` forward and backward over ``inputs``.

    Args:
        fused_rnn: callable RNN used for both directions.
        inputs: input tensor; transposed with ``[1, 0, 2]`` before use when
            ``time_major`` is false.
        sequence_length: cast to int32 and passed to both directions.
        initial_state: pair ``(forward_state, backward_state)``.
        dtype: forwarded to both directions.
        scope: variable scope name, defaulting to "BiRNN".
        time_major: whether ``inputs`` already has time as the first axis.
        backward_device: optional device scope for the backward pass.

    Returns:
        ``((outputs_fw, outputs_bw), (state_fw, state_bw))``; outputs are
        transposed back when ``time_major`` is false.
    """
    with tf.variable_scope(scope or "BiRNN"):
        sequence_length = tf.cast(sequence_length, tf.int32)
        if not time_major:
            inputs = tf.transpose(inputs, [1, 0, 2])

        outputs_fw, state_fw = fused_rnn(
            inputs, sequence_length=sequence_length,
            initial_state=initial_state[0], dtype=dtype, scope="FW")

        def _run_backward():
            # Same call for both placements; only the device scope differs.
            return fused_rnn_backward(
                fused_rnn, inputs, sequence_length, initial_state[1], dtype,
                scope="BW")

        if backward_device is None:
            outputs_bw, state_bw = _run_backward()
        else:
            with tf.device(backward_device):
                outputs_bw, state_bw = _run_backward()

        if not time_major:
            outputs_fw = tf.transpose(outputs_fw, [1, 0, 2])
            outputs_bw = tf.transpose(outputs_bw, [1, 0, 2])
    return (outputs_fw, outputs_bw), (state_fw, state_bw)
def testPS(self):
    """One clone with one PS task: clone on the worker, variables on the PS."""
    deploy_config = model_deploy.DeploymentConfig(num_clones=1,
                                                  num_ps_tasks=1)

    self.assertDeviceEqual(deploy_config.clone_device(0), '/job:worker')
    self.assertEqual(deploy_config.clone_scope(0), '')
    worker_cpu = '/job:worker/device:CPU:0'
    self.assertDeviceEqual(deploy_config.optimizer_device(), worker_cpu)
    self.assertDeviceEqual(deploy_config.inputs_device(), worker_cpu)

    with tf.device(deploy_config.variables_device()):
        a = tf.Variable(0)
        b = tf.Variable(0)
        c = tf.no_op()
        d = slim.variable('a', [],
                          caching_device=deploy_config.caching_device())

    ps_cpu = '/job:ps/task:0/device:CPU:0'
    for var in (a, b):
        self.assertDeviceEqual(var.device, ps_cpu)
        self.assertDeviceEqual(var.device, var.value().device)
    # The non-variable op gets no device.
    self.assertDeviceEqual(c.device, '')
    self.assertDeviceEqual(d.device, ps_cpu)
    self.assertDeviceEqual(d.value().device, '')
def testVariablesPS(self):
    """With two PS tasks, consecutive variables land on tasks 0 and 1."""
    deploy_config = model_deploy.DeploymentConfig(num_ps_tasks=2)

    with tf.device(deploy_config.variables_device()):
        a = tf.Variable(0)
        b = tf.Variable(0)
        c = tf.no_op()
        d = slim.variable('a', [],
                          caching_device=deploy_config.caching_device())

    expected_placement = (
        (a, '/job:ps/task:0/device:CPU:0'),
        (b, '/job:ps/task:1/device:CPU:0'),
    )
    for var, device in expected_placement:
        self.assertDeviceEqual(var.device, device)
        self.assertDeviceEqual(var.device, var.value().device)
    # The non-variable op gets no device.
    self.assertDeviceEqual(c.device, '')
    self.assertDeviceEqual(d.device, '/job:ps/task:0/device:CPU:0')
    self.assertDeviceEqual(d.value().device, '')
def testCreateSingleclone(self):
    """A single clone without PS: 5 variables on CPU:0, unscoped clone."""
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(0)
        tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
        tf_labels = tf.constant(self._labels, dtype=tf.float32)

        deploy_config = model_deploy.DeploymentConfig(num_clones=1)

        self.assertEqual(slim.get_variables(), [])
        clones = model_deploy.create_clones(
            deploy_config, BatchNormClassifier, (tf_inputs, tf_labels))
        clone = clones[0]

        self.assertEqual(len(slim.get_variables()), 5)
        for var in slim.get_variables():
            self.assertDeviceEqual(var.device, 'CPU:0')
            self.assertDeviceEqual(var.value().device, 'CPU:0')

        self.assertEqual(clone.outputs.op.name,
                         'BatchNormClassifier/fully_connected/Sigmoid')
        self.assertEqual(clone.scope, '')
        self.assertDeviceEqual(clone.device, '')
        self.assertEqual(len(slim.losses.get_losses()), 1)
        self.assertEqual(len(tf.get_collection(tf.GraphKeys.UPDATE_OPS)), 2)
def testCreateOnecloneWithPS(self):
    """One clone with one PS task: clone on the worker, variables on PS."""
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(0)
        tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
        tf_labels = tf.constant(self._labels, dtype=tf.float32)

        deploy_config = model_deploy.DeploymentConfig(num_clones=1,
                                                      num_ps_tasks=1)

        self.assertEqual(slim.get_variables(), [])
        clones = model_deploy.create_clones(
            deploy_config, BatchNormClassifier, (tf_inputs, tf_labels))
        self.assertEqual(len(clones), 1)

        clone = clones[0]
        self.assertEqual(clone.outputs.op.name,
                         'BatchNormClassifier/fully_connected/Sigmoid')
        self.assertDeviceEqual(clone.device, '/job:worker')
        self.assertEqual(clone.scope, '')

        self.assertEqual(len(slim.get_variables()), 5)
        for var in slim.get_variables():
            self.assertDeviceEqual(var.device, '/job:ps/task:0/CPU:0')
            self.assertDeviceEqual(var.device, var.value().device)
def testCreateLogisticClassifier(self):
    """Optimizing one LogisticClassifier clone yields a grad per variable."""
    graph = tf.Graph()
    with graph.as_default():
        tf.set_random_seed(0)
        tf_inputs = tf.constant(self._inputs, dtype=tf.float32)
        tf_labels = tf.constant(self._labels, dtype=tf.float32)

        deploy_config = model_deploy.DeploymentConfig(num_clones=1)

        self.assertEqual(slim.get_variables(), [])
        clones = model_deploy.create_clones(
            deploy_config, LogisticClassifier, (tf_inputs, tf_labels))
        self.assertEqual(len(slim.get_variables()), 2)
        self.assertEqual(tf.get_collection(tf.GraphKeys.UPDATE_OPS), [])

        optimizer = tf.train.GradientDescentOptimizer(learning_rate=1.0)
        total_loss, grads_and_vars = model_deploy.optimize_clones(
            clones, optimizer)

        self.assertEqual(len(grads_and_vars), len(tf.trainable_variables()))
        self.assertEqual(total_loss.op.name, 'total_loss')
        # Gradients carry no device; variables stay on CPU:0.
        for grad, var in grads_and_vars:
            self.assertDeviceEqual(grad.device, '')
            self.assertDeviceEqual(var.device, 'CPU:0')
def testNoSummariesOnGPU(self):
    """Every input of the deployed model's summary op is on the CPU."""
    with tf.Graph().as_default():
        deploy_config = model_deploy.DeploymentConfig(num_clones=2)

        def ModelFn():
            # A fully_connected layer with a regularizer loss.
            inputs = tf.constant(1.0, shape=(10, 20), dtype=tf.float32)
            regularizer = tf.contrib.layers.l2_regularizer(0.001)
            tf.contrib.layers.fully_connected(
                inputs, 30, weights_regularizer=regularizer)

        model = model_deploy.deploy(
            deploy_config, ModelFn,
            optimizer=tf.train.GradientDescentOptimizer(1.0))

        # The summary op must have inputs, and each must be on the CPU.
        summary_inputs = model.summary_op.op.inputs
        self.assertTrue(summary_inputs)
        for summary_input in summary_inputs:
            self.assertEqual('/device:CPU:0', summary_input.device)
def testNoSummariesOnGPUForEvals(self):
    """Without an optimizer (eval), summary inputs are still on the CPU."""
    with tf.Graph().as_default():
        deploy_config = model_deploy.DeploymentConfig(num_clones=2)

        def ModelFn():
            # A fully_connected layer with a regularizer loss.
            inputs = tf.constant(1.0, shape=(10, 20), dtype=tf.float32)
            regularizer = tf.contrib.layers.l2_regularizer(0.001)
            tf.contrib.layers.fully_connected(
                inputs, 30, weights_regularizer=regularizer)

        # No optimizer here, it's an eval.
        model = model_deploy.deploy(deploy_config, ModelFn)

        # The summary op must have inputs, and each must be on the CPU.
        summary_inputs = model.summary_op.op.inputs
        self.assertTrue(summary_inputs)
        for summary_input in summary_inputs:
            self.assertEqual('/device:CPU:0', summary_input.device)
def _optimize_clone(optimizer, clone, num_clones, regularization_losses,
                    **kwargs):
    """Compute losses and gradients for a single clone.

    Args:
        optimizer: A tf.Optimizer object.
        clone: A Clone namedtuple.
        num_clones: The number of clones being deployed.
        regularization_losses: Possibly empty list of regularization_losses
            to add to the clone losses.
        **kwargs: Dict of kwarg to pass to compute_gradients().

    Returns:
        A tuple (clone_loss, clone_grads_and_vars).
            - clone_loss: A tensor for the total loss for the clone, or None.
            - clone_grads_and_vars: the result of compute_gradients() on
              that loss, or None when there is no loss.
    """
    sum_loss = _gather_clone_loss(clone, num_clones, regularization_losses)
    if sum_loss is None:
        return sum_loss, None

    # Gradients are computed on the clone's own device.
    with tf.device(clone.device):
        clone_grad = optimizer.compute_gradients(sum_loss, **kwargs)
    return sum_loss, clone_grad
def clone_device(self, clone_index):
    """Device used to create the clone and all the ops inside the clone.

    The result is the worker device (only when PS tasks are used) followed
    by '/device:CPU:0' for CPU clones, or '/device:GPU:<clone_index>' when
    there is more than one clone.

    Args:
        clone_index: Int, representing the clone_index.

    Returns:
        A value suitable for `tf.device()`.

    Raises:
        ValueError: if `clone_index` is greater or equal to the number of
            clones.
    """
    if clone_index >= self._num_clones:
        raise ValueError('clone_index must be less than num_clones')

    parts = []
    if self._num_ps_tasks > 0:
        parts.append(self._worker_device)
    if self._clone_on_cpu:
        parts.append('/device:CPU:0')
    elif self._num_clones > 1:
        parts.append('/device:GPU:%d' % clone_index)
    return ''.join(parts)
def extract_batch(dataset, config):
    """Build the CPU-side input pipeline and return one shuffled batch op.

    Args:
        dataset: dataset handed to the slim ``DatasetDataProvider``.
        config: passed to ``PriorBoxGrid`` and ``data_augmentation``.

    Returns:
        The result of ``tf.train.shuffle_batch`` over
        ``[im, inds, refine, cats, seg]`` with ``args.batch_size``.
    """
    with tf.device("/cpu:0"):
        bboxer = PriorBoxGrid(config)
        data_provider = slim.dataset_data_provider.DatasetDataProvider(
            dataset, num_readers=2, common_queue_capacity=512,
            common_queue_min=32)

        if not args.segment:
            im, bbox, gt = data_provider.get(
                ['image', 'object/bbox', 'object/label'])
            # No segmentation requested: substitute an all-zero map built
            # from the first two dimensions of the image shape.
            seg = tf.expand_dims(tf.zeros(tf.shape(im)[:2]), 2)
        else:
            im, bbox, gt, seg = data_provider.get(
                ['image', 'object/bbox', 'object/label',
                 'image/segmentation'])

        im = tf.to_float(im)/255
        clipped_bbox = tf.clip_by_value(bbox, 0.0, 1.0)
        bbox = yxyx_to_xywh(clipped_bbox)
        im, bbox, gt, seg = data_augmentation(im, bbox, gt, seg, config)
        inds, cats, refine = bboxer.encode_gt_tf(bbox, gt)

        batch_tensors = [im, inds, refine, cats, seg]
        return tf.train.shuffle_batch(
            batch_tensors, args.batch_size, 2048, 64, num_threads=4)
def getStatsEigen(self, stats=None):
    """Return the eigen-decomposition variables, creating them on first call.

    For every distinct entry found under ``fprop_concat_stats`` and
    ``bprop_concat_stats`` a vector ``e`` (ones) and a matrix ``Q``
    (diagonal of ones) are created as non-trainable variables under a CPU
    device scope, sized by the entry's second shape dimension.

    Args:
        stats: optional stats mapping; defaults to ``self.stats``.

    Returns:
        ``self.stats_eigen``: dict mapping each stats entry to
        ``{'e': ..., 'Q': ...}``.
    """
    if len(self.stats_eigen) != 0:
        return self.stats_eigen

    if stats is None:
        stats = self.stats

    eigen_vars = {}
    with tf.device('/cpu:0'):
        for var in stats:
            for key in ('fprop_concat_stats', 'bprop_concat_stats'):
                for stats_var in stats[var][key]:
                    # An entry shared by several variables is created once.
                    if stats_var in eigen_vars:
                        continue
                    stats_dim = stats_var.get_shape()[1].value
                    name_prefix = 'KFAC_FAC/' + stats_var.name.split(':')[0]
                    e = tf.Variable(tf.ones([stats_dim]),
                                    name=name_prefix + '/e',
                                    trainable=False)
                    Q = tf.Variable(tf.diag(tf.ones([stats_dim])),
                                    name=name_prefix + '/Q',
                                    trainable=False)
                    eigen_vars[stats_var] = {'e': e, 'Q': Q}

    self.stats_eigen = eigen_vars
    return self.stats_eigen
def run():
    """Evaluate a trained model on a test set given on the command line.

    Expects ``sys.argv[1]`` to be the model directory and ``sys.argv[2]``
    the test set; prints a usage line and exits with status 1 otherwise.
    """
    if len(sys.argv) < 3:
        print("** Usage: python3 " + sys.argv[0] + " <<Model Directory>> <<Test Set>>")
        sys.exit(1)

    np.random.seed(42)
    model_dir = sys.argv[1]
    config_paths = ['./default.conf', os.path.join(model_dir, 'model.conf')]
    config = Config.load(config_paths)
    model = create_model(config)
    test_data = load_data(sys.argv[2], config.dictionary, config.grammar,
                          config.max_length)
    print("unknown", unknown_tokens)

    with tf.Graph().as_default():
        tf.set_random_seed(1234)
        with tf.device('/cpu:0'):
            model.build()

            test_eval = Seq2SeqEvaluator(model, config.grammar, test_data,
                                         'test', config.reverse_dictionary,
                                         beam_size=config.beam_size,
                                         batch_size=config.batch_size)
            loader = tf.train.Saver()

            with tf.Session() as sess:
                checkpoint_path = os.path.join(model_dir, 'best')
                loader.restore(sess, checkpoint_path)
                test_eval.eval(sess, save_to_file=True)
def collect_results(results_tuple, returns):
    r'''
    Splice the results of one ``session.run`` call into the running totals
    used for the WER reports.

    ``results_tuple`` holds four lists: original labels, corresponding
    decodings, corresponding distances and corresponding losses. ``returns``
    has the same layout but contains the unprocessed results of one batch,
    indexed per available device. Labels and decodings are converted to text
    before being appended; for decodings only the first available path is
    used for now.
    '''
    # One batch per available device is appended to each list.
    for device_idx, _ in enumerate(available_devices):
        batch_labels = returns[0][device_idx]
        # Only the first decoding path is used at the moment.
        first_decoding = returns[1][device_idx][0]

        results_tuple[0].extend(sparse_tensor_value_to_texts(batch_labels))
        results_tuple[1].extend(sparse_tensor_value_to_texts(first_decoding))
        results_tuple[2].extend(returns[2][device_idx])
        results_tuple[3].extend(returns[3][device_idx])


# For reporting we also need a standard way to do time measurements.
def __init__(self, mc, gpu_id):
    """Build the whole model graph under the device scope of one GPU.

    Args:
        mc: model configuration passed to ``ModelSkeleton.__init__``.
        gpu_id: index formatted into the ``/gpu:<id>`` device string.
    """
    device_name = '/gpu:{}'.format(gpu_id)
    with tf.device(device_name):
        ModelSkeleton.__init__(self, mc)

        # The sub-graphs are added in this fixed order.
        graph_builders = (
            self._add_forward_graph,
            self._add_interpretation_graph,
            self._add_loss_graph,
            self._add_train_graph,
            self._add_viz_graph,
        )
        for add_graph in graph_builders:
            add_graph()
def get_variable(name, shape, initializer=None, dtype=tf.float32, device=None):
    """
    Helper to create a Variable pinned to a device (CPU by default).

    Args:
        name: name of the variable
        shape: list of ints
        initializer: initializer for Variable; only forwarded when given
        dtype: data type, defaults to tf.float32
        device: device to which the variable will be pinned, '/cpu:0' when
            omitted
    Returns:
        Variable Tensor
    """
    target_device = '/cpu:0' if device is None else device

    # The initializer keyword is left out entirely when none was supplied.
    extra_kwargs = {'dtype': dtype}
    if initializer is not None:
        extra_kwargs['initializer'] = initializer

    with tf.device(target_device):
        return tf.get_variable(name, shape, **extra_kwargs)