The following code examples, extracted from open-source Python projects, illustrate how to use `tensorflow.python.framework.dtypes.string`.
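Before the project-level examples, here is a minimal sketch of what `dtypes.string` is and how it is typically used; it assumes a TF 1.x graph session, and the tensor contents are purely illustrative:

```python
# A minimal sketch (assumes TensorFlow 1.x). dtypes.string is a DType object,
# not a function; it is normally passed as the `dtype` argument of ops.
import tensorflow as tf
from tensorflow.python.framework import constant_op
from tensorflow.python.framework import dtypes

names = constant_op.constant(["brain", "salad", "surgery"], dtypes.string)
print(names.dtype == dtypes.string)  # True; same object as tf.string

with tf.Session() as sess:
  print(sess.run(names))  # [b'brain' b'salad' b'surgery']
```
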
def next_key(self):
  """The key names of the next (in iteration) truncated unrolled examples.

  The format of the key is:

  ```python
  "%05d_of_%05d:%s" % (sequence + 1, sequence_count, original_key)
  ```

  if `sequence + 1 < sequence_count`, otherwise:

  ```python
  "STOP:%s" % original_key
  ```

  where `original_key` is the unique key read in by the prefetcher.

  Returns:
    A string vector of length `batch_size`, the keys.
  """
  return self._state_saver._received_next_key

def _store_index_maps(self, sequences, context, states):
  """Prepares the internal dictionaries _name_to_index and _index_to_name.

  These dictionaries are used to keep track of indices into the barrier.

  Args:
    sequences: `OrderedDict` of string, `Tensor` pairs.
    context: `OrderedDict` of string, `Tensor` pairs.
    states: `OrderedDict` of string, `Tensor` pairs.
  """
  assert isinstance(sequences, dict)
  assert isinstance(context, dict)
  assert isinstance(states, dict)
  self._name_to_index = dict(
      (name, ix) for (ix, name) in enumerate(
          ["__length", "__total_length", "__next_key", "__sequence",
           "__sequence_count"]
          + ["__sequence__%s" % k for k in sequences.keys()]
          + ["__context__%s" % k for k in context.keys()]
          + ["__state__%s" % k for k in states.keys()]))
  self._index_to_name = [
      name for (name, _) in sorted(
          self._name_to_index.items(), key=lambda n_ix: n_ix[1])]

def _make_test_csv_sparse():
  f = tempfile.NamedTemporaryFile(
      dir=tf.test.get_temp_dir(), delete=False, mode="w")
  w = csv.writer(f)
  w.writerow(["int", "float", "bool", "string"])
  for _ in range(100):
    # leave columns empty; these will be read as default value (e.g. 0 or NaN)
    intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
    floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
    boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
    stringvalue = (("S: %.4f" % np.random.rand())
                   if np.random.rand() > 0.5 else "")

    row = [intvalue, floatvalue, boolvalue, stringvalue]
    w.writerow(row)
  f.close()
  return f.name

def __new__(cls, column_name, hash_bucket_size, combiner="sum",
            dtype=dtypes.string):
  if dtype != dtypes.string and not dtype.is_integer:
    raise ValueError("dtype must be string or integer. "
                     "dtype: {}, column_name: {}".format(dtype, column_name))

  return super(_SparseColumnHashed, cls).__new__(
      cls,
      column_name,
      bucket_size=hash_bucket_size,
      combiner=combiner,
      dtype=dtype)

def CamVidInputs(image_filenames, label_filenames, batch_size):
  images = ops.convert_to_tensor(image_filenames, dtype=dtypes.string)
  labels = ops.convert_to_tensor(label_filenames, dtype=dtypes.string)

  filename_queue = tf.train.slice_input_producer([images, labels],
                                                 shuffle=True)
  image, label = CamVid_reader(filename_queue)
  reshaped_image = tf.cast(image, tf.float32)

  min_fraction_of_examples_in_queue = 0.4
  min_queue_examples = int(NUM_EXAMPLES_PER_EPOCH_FOR_TRAIN *
                           min_fraction_of_examples_in_queue)
  print('Filling queue with %d CamVid images before starting to train. '
        'This will take a few minutes.' % min_queue_examples)

  # Generate a batch of images and labels by building up a queue of examples.
  return _generate_image_and_label_batch(reshaped_image, label,
                                         min_queue_examples, batch_size,
                                         shuffle=True)

def dataset_reader(filename_queue):  # prev name: CamVid_reader
  image_filename = filename_queue[0]  # tensor of type string
  label_filename = filename_queue[1]  # tensor of type string

  # get png encoded image
  imageValue = tf.read_file(image_filename)
  labelValue = tf.read_file(label_filename)

  # decodes a png image into a uint8 or uint16 tensor
  # returns a tensor of type dtype with shape [height, width, depth]
  image_bytes = tf.image.decode_png(imageValue)
  label_bytes = tf.image.decode_png(labelValue)  # Labels are png, not jpeg

  image = tf.reshape(image_bytes,
                     (FLAGS.image_h, FLAGS.image_w, FLAGS.image_c))
  label = tf.reshape(label_bytes, (FLAGS.image_h, FLAGS.image_w, 1))

  return image, label

def dataset_inputs(image_filenames, label_filenames, batch_size,
                   running_train_set=True):
  images = ops.convert_to_tensor(image_filenames, dtype=dtypes.string)
  labels = ops.convert_to_tensor(label_filenames, dtype=dtypes.string)

  filename_queue = tf.train.slice_input_producer([images, labels],
                                                 shuffle=True)
  image, label = dataset_reader(filename_queue)
  reshaped_image = tf.cast(image, tf.float32)

  min_fraction_of_examples_in_queue = FLAGS.fraction_of_examples_in_queue
  min_queue_examples = int(FLAGS.num_examples_epoch_train *
                           min_fraction_of_examples_in_queue)
  print('Filling queue with %d input images before starting to train. '
        'This may take some time.' % min_queue_examples)

  # Generate a batch of images and labels by building up a queue of examples.
  return _generate_image_and_label_batch(reshaped_image, label,
                                         min_queue_examples, batch_size,
                                         shuffle=True)

def decode_example(self, serialized_example, item_handler, image_format):
  """Decodes the given serialized example with the specified item handler.

  Args:
    serialized_example: a serialized TF example string.
    item_handler: the item handler used to decode the image.
    image_format: the image format being decoded.

  Returns:
    the decoded image found in the serialized Example.
  """
  serialized_example = array_ops.reshape(serialized_example, shape=[])
  decoder = TFExampleDecoder(
      keys_to_features={
          'image/encoded':
              tf.FixedLenFeature((), dtypes.string, default_value=''),
          'image/format':
              tf.FixedLenFeature((), dtypes.string,
                                 default_value=image_format),
      },
      items_to_handlers={'image': item_handler})
  [tf_image] = decoder.decode(serialized_example, ['image'])
  return tf_image

def test_decode_example_with_string_tensor(self):
  tensor_shape = (2, 3, 1)
  np_array = np.array([[['ab'], ['cd'], ['ef']],
                       [['ghi'], ['jkl'], ['mnop']]])

  example = example_pb2.Example(features=feature_pb2.Features(feature={
      'labels': self._bytes_feature(np_array),
  }))

  serialized_example = example.SerializeToString()

  with self.test_session():
    serialized_example = array_ops.reshape(serialized_example, shape=[])
    keys_to_features = {
        'labels': parsing_ops.FixedLenFeature(
            tensor_shape,
            dtypes.string,
            default_value=constant_op.constant(
                '', shape=tensor_shape, dtype=dtypes.string))
    }
    items_to_handlers = {'labels': tfexample_decoder.Tensor('labels')}
    decoder = TFExampleDecoder(keys_to_features, items_to_handlers)
    [tf_labels] = decoder.decode(serialized_example, ['labels'])
    labels = tf_labels.eval()

    labels = labels.astype(np_array.dtype)
    self.assertTrue(np.array_equal(np_array, labels))

def _create_tfrecord_dataset(tmpdir):
  if not gfile.Exists(tmpdir):
    gfile.MakeDirs(tmpdir)

  data_sources = test_utils.create_tfrecord_files(tmpdir, num_files=1)

  keys_to_features = {
      'image/encoded':
          tf.FixedLenFeature(shape=(), dtype=dtypes.string, default_value=''),
      'image/format':
          tf.FixedLenFeature(shape=(), dtype=dtypes.string,
                             default_value='jpeg'),
      'image/class/label':
          tf.FixedLenFeature(
              shape=[1],
              dtype=dtypes.int64,
              default_value=array_ops.zeros([1], dtype=dtypes.int64))
  }

  items_to_handlers = {
      'image': tfslim.tfexample_decoder.Image(),
      'label': tfslim.tfexample_decoder.Tensor('image/class/label'),
  }

  decoder = TFExampleDecoder(keys_to_features, items_to_handlers)

  return Dataset(
      data_sources=data_sources,
      reader=tf.TFRecordReader,
      decoder=decoder,
      num_samples=100)

def testMutableHashTableDuplicateInsert(self):
  with self.test_session():
    default_val = -1
    keys = constant_op.constant(["brain", "salad", "surgery", "brain"])
    values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
    table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)
    self.assertAllEqual(0, table.size().eval())

    table.insert(keys, values).run()
    self.assertAllEqual(3, table.size().eval())

    input_string = constant_op.constant(["brain", "salad", "tank"])
    output = table.lookup(input_string)

    result = output.eval()
    self.assertAllEqual([3, 1, -1], result)

def testMutableHashTableFindHighRank(self):
  with self.test_session():
    default_val = -1
    keys = constant_op.constant(["brain", "salad", "surgery"])
    values = constant_op.constant([0, 1, 2], dtypes.int64)
    table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)

    table.insert(keys, values).run()
    self.assertAllEqual(3, table.size().eval())

    input_string = constant_op.constant([["brain", "salad"],
                                         ["tank", "tarkus"]])
    output = table.lookup(input_string)
    self.assertAllEqual([2, 2], output.get_shape())

    result = output.eval()
    self.assertAllEqual([[0, 1], [-1, -1]], result)

def testMutableHashTableOfTensorsFindHighRank(self):
  with self.test_session():
    default_val = constant_op.constant([-1, -1, -1], dtypes.int64)
    keys = constant_op.constant(["brain", "salad", "surgery"])
    values = constant_op.constant([[0, 1, 2], [2, 3, 4], [4, 5, 6]],
                                  dtypes.int64)
    table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)

    table.insert(keys, values).run()
    self.assertAllEqual(3, table.size().eval())

    input_string = constant_op.constant([["brain", "salad"],
                                         ["tank", "tarkus"]])
    output = table.lookup(input_string)
    self.assertAllEqual([2, 2, 3], output.get_shape())

    result = output.eval()
    self.assertAllEqual(
        [[[0, 1, 2], [2, 3, 4]], [[-1, -1, -1], [-1, -1, -1]]], result)

def testMutableHashTableWithTensorDefault(self):
  with self.test_session():
    default_val = constant_op.constant(-1, dtypes.int64)
    keys = constant_op.constant(["brain", "salad", "surgery"])
    values = constant_op.constant([0, 1, 2], dtypes.int64)
    table = lookup.MutableHashTable(dtypes.string, dtypes.int64, default_val)

    table.insert(keys, values).run()
    self.assertAllEqual(3, table.size().eval())

    input_string = constant_op.constant(["brain", "salad", "tank"])
    output = table.lookup(input_string)

    result = output.eval()
    self.assertAllEqual([0, 1, -1], result)

def testMutableHashTableStringFloat(self):
  with self.test_session():
    default_val = -1.5
    keys = constant_op.constant(["brain", "salad", "surgery"])
    values = constant_op.constant([0, 1.1, 2.2], dtypes.float32)
    table = lookup.MutableHashTable(dtypes.string, dtypes.float32,
                                    default_val)
    self.assertAllEqual(0, table.size().eval())

    table.insert(keys, values).run()
    self.assertAllEqual(3, table.size().eval())

    input_string = constant_op.constant(["brain", "salad", "tank"])
    output = table.lookup(input_string)

    result = output.eval()
    self.assertAllClose([0, 1.1, -1.5], result)

def testMapStringToFloat(self):
  with self.test_session():
    keys = constant_op.constant(["a", "b", "c"], dtypes.string)
    values = constant_op.constant([0.0, 1.1, 2.2], dtypes.float32)
    default_value = constant_op.constant(-1.5, dtypes.float32)
    table = lookup.MutableDenseHashTable(
        dtypes.string,
        dtypes.float32,
        default_value=default_value,
        empty_key="")
    self.assertAllEqual(0, table.size().eval())

    table.insert(keys, values).run()
    self.assertAllEqual(3, table.size().eval())

    input_string = constant_op.constant(["a", "b", "d"], dtypes.string)
    output = table.lookup(input_string)
    self.assertAllEqual([3], output.get_shape())

    result = output.eval()
    self.assertAllClose([0, 1.1, -1.5], result)

def testInitializeTable(self):
  vocabulary_file = self._createVocabFile("one_column_1.txt")

  with self.test_session():
    default_value = -1
    table = lookup.HashTable(
        lookup.TextFileInitializer(vocabulary_file, dtypes.string,
                                   lookup.TextFileIndex.WHOLE_LINE,
                                   dtypes.int64,
                                   lookup.TextFileIndex.LINE_NUMBER),
        default_value)
    table.init.run()

    input_string = constant_op.constant(["brain", "salad", "tank"])
    output = table.lookup(input_string)

    result = output.eval()
    self.assertAllEqual([0, 1, -1], result)

def testInitializeIndexTable(self):
  vocabulary_file = self._createVocabFile("one_column_2.txt")

  with self.test_session():
    default_value = "UNK"
    key_index = lookup.TextFileIndex.LINE_NUMBER
    value_index = lookup.TextFileIndex.WHOLE_LINE
    table = lookup.HashTable(
        lookup.TextFileInitializer(vocabulary_file, dtypes.int64, key_index,
                                   dtypes.string, value_index), default_value)
    table.init.run()

    input_values = constant_op.constant([0, 1, 2, 3], dtypes.int64)
    output = table.lookup(input_values)

    result = output.eval()
    self.assertAllEqual([b"brain", b"salad", b"surgery", b"UNK"], result)

def testMultiColumn(self):
  vocabulary_file = os.path.join(self.get_temp_dir(), "three_columns.txt")
  with open(vocabulary_file, "w") as f:
    f.write("\n".join(["0\tbrain\t1", "1\tsalad\t5", "2\tsurgery\t6"]) + "\n")

  with self.test_session():
    default_value = -1
    key_index = 1
    value_index = 2

    table = lookup.HashTable(
        lookup.TextFileInitializer(vocabulary_file, dtypes.string, key_index,
                                   dtypes.int64, value_index), default_value)
    table.init.run()

    input_string = constant_op.constant(["brain", "salad", "surgery"])
    output = table.lookup(input_string)

    result = output.eval()
    self.assertAllEqual([1, 5, 6], result)

def _make_test_csv_sparse():
  f = tempfile.NamedTemporaryFile(
      dir=test.get_temp_dir(), delete=False, mode="w")
  w = csv.writer(f)
  w.writerow(["int", "float", "bool", "string"])
  for _ in range(100):
    # leave columns empty; these will be read as default value (e.g. 0 or NaN)
    intvalue = np.random.randint(-10, 10) if np.random.rand() > 0.5 else ""
    floatvalue = np.random.rand() if np.random.rand() > 0.5 else ""
    boolvalue = int(np.random.rand() > 0.3) if np.random.rand() > 0.5 else ""
    stringvalue = (("S: %.4f" % np.random.rand())
                   if np.random.rand() > 0.5 else "")

    row = [intvalue, floatvalue, boolvalue, stringvalue]
    w.writerow(row)
  f.close()
  return f.name

def testParse(self):
  parser = csv_parser.CSVParser(
      column_names=["col0", "col1", "col2"], default_values=["", "", 1.4])
  csv_lines = ["one,two,2.5", "four,five,6.0"]
  csv_input = constant_op.constant(
      csv_lines, dtype=dtypes.string, shape=[len(csv_lines)])
  csv_column = mocks.MockSeries("csv", csv_input)
  expected_output = [
      np.array([b"one", b"four"]),
      np.array([b"two", b"five"]),
      np.array([2.5, 6.0])
  ]
  output_columns = parser(csv_column)
  self.assertEqual(3, len(output_columns))
  cache = {}
  output_tensors = [o.build(cache) for o in output_columns]
  self.assertEqual(3, len(output_tensors))
  with self.test_session() as sess:
    output = sess.run(output_tensors)
    for expected, actual in zip(expected_output, output):
      np.testing.assert_array_equal(actual, expected)

def make_parsing_export_strategy(feature_columns, exports_to_keep=5):
  """Create an ExportStrategy for use with Experiment, using `FeatureColumn`s.

  Creates a SavedModel export that expects to be fed with a single string
  Tensor containing serialized tf.Examples.  At serving time, incoming
  tf.Examples will be parsed according to the provided `FeatureColumn`s.

  Args:
    feature_columns: An iterable of `FeatureColumn`s representing the features
      that must be provided at serving time (excluding labels!).
    exports_to_keep: Number of exports to keep.  Older exports will be
      garbage-collected.  Defaults to 5.  Set to None to disable garbage
      collection.

  Returns:
    An ExportStrategy that can be passed to the Experiment constructor.
  """
  feature_spec = feature_column.create_feature_spec_for_parsing(feature_columns)
  serving_input_fn = input_fn_utils.build_parsing_serving_input_fn(feature_spec)
  return make_export_strategy(serving_input_fn, exports_to_keep=exports_to_keep)

def testExportMonitorInputFeatureKeyNoFeatures(self):
  random.seed(42)
  input_feature_key = 'my_example_key'

  def _serving_input_fn():
    return {
        input_feature_key:
            array_ops.placeholder(dtype=dtypes.string, shape=(1,))
    }, None

  monitor = learn.monitors.ExportMonitor(
      every_n_steps=1,
      export_dir=tempfile.mkdtemp() + 'export/',
      input_fn=_serving_input_fn,
      input_feature_key=input_feature_key,
      exports_to_keep=2,
      signature_fn=export.generic_signature_fn)
  regressor = learn.LinearRegressor(feature_columns=[_X_COLUMN])
  with self.assertRaisesRegexp(KeyError, _X_KEY):
    regressor.fit(input_fn=_training_input_fn, steps=10, monitors=[monitor])

def testExportMonitorInputFeature(self):
  random.seed(42)
  input_feature_key = 'my_example_key'

  def _serving_input_fn():
    return {
        input_feature_key:
            array_ops.placeholder(dtype=dtypes.string, shape=(1,)),
        _X_KEY:
            random_ops.random_uniform(shape=(1,), minval=0.0, maxval=1000.0)
    }, None

  export_dir = tempfile.mkdtemp() + 'export/'
  monitor = learn.monitors.ExportMonitor(
      every_n_steps=1,
      export_dir=export_dir,
      input_fn=_serving_input_fn,
      input_feature_key=input_feature_key,
      exports_to_keep=2,
      signature_fn=export.generic_signature_fn)
  regressor = learn.LinearRegressor(feature_columns=[_X_COLUMN])
  regressor.fit(input_fn=_training_input_fn, steps=10, monitors=[monitor])
  self._assert_export(monitor, export_dir, 'generic_signature')

def test_dense(self):
  """Tests only dense inputs."""
  op = sparse_feature_cross_op.sparse_feature_cross([
      constant_op.constant([['batch1-FC1-F1', 'batch1-FC1-F2'],
                            ['batch2-FC1-F1', 'batch2-FC1-F2']],
                           dtypes.string),
      constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
                            ['batch2-FC2-F1', 'batch2-FC2-F2']],
                           dtypes.string),
  ])
  expected_out = self._sparse_tensor([[
      'batch1-FC1-F1_X_batch1-FC2-F1', 'batch1-FC1-F1_X_batch1-FC2-F2',
      'batch1-FC1-F2_X_batch1-FC2-F1', 'batch1-FC1-F2_X_batch1-FC2-F2'
  ], [
      'batch2-FC1-F1_X_batch2-FC2-F1', 'batch2-FC1-F1_X_batch2-FC2-F2',
      'batch2-FC1-F2_X_batch2-FC2-F1', 'batch2-FC1-F2_X_batch2-FC2-F2'
  ]])
  with self.test_session() as sess:
    self._assert_sparse_tensor_equals(expected_out, sess.run(op))

def test_integer_mixed_string_dense(self):
  """Tests mixed dense inputs."""
  op = sparse_feature_cross_op.sparse_feature_cross([
      constant_op.constant([[11, 333], [55555, 999999]], dtypes.int64),
      constant_op.constant([['batch1-FC2-F1', 'batch1-FC2-F2'],
                            ['batch2-FC2-F1', 'batch2-FC2-F2']],
                           dtypes.string),
  ])
  expected_out = self._sparse_tensor([[
      '11_X_batch1-FC2-F1', '11_X_batch1-FC2-F2', '333_X_batch1-FC2-F1',
      '333_X_batch1-FC2-F2'
  ], [
      '55555_X_batch2-FC2-F1', '55555_X_batch2-FC2-F2',
      '999999_X_batch2-FC2-F1', '999999_X_batch2-FC2-F2'
  ]])
  with self.test_session() as sess:
    self._assert_sparse_tensor_equals(expected_out, sess.run(op))

def __new__(cls,
            column_name,
            vocabulary_file,
            num_oov_buckets=0,
            vocab_size=None,
            default_value=-1,
            combiner="sum",
            dtype=dtypes.string):
  if dtype != dtypes.string and not dtype.is_integer:
    raise ValueError("dtype must be string or integer. "
                     "dtype: {}, column_name: {}".format(dtype, column_name))

  return super(_SparseColumnVocabulary, cls).__new__(
      cls,
      column_name,
      combiner=combiner,
      lookup_config=_SparseIdLookupConfig(
          vocabulary_file=vocabulary_file,
          num_oov_buckets=num_oov_buckets,
          vocab_size=vocab_size,
          default_value=default_value),
      dtype=dtype)

def testWeightedSparseColumnDtypes(self):
  ids = fc.sparse_column_with_keys("ids", ["marlo", "omar", "stringer"])
  weighted_ids = fc.weighted_sparse_column(ids, "weights")
  self.assertDictEqual({
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.float32)
  }, weighted_ids.config)

  weighted_ids = fc.weighted_sparse_column(ids, "weights", dtype=dtypes.int32)
  self.assertDictEqual({
      "ids": parsing_ops.VarLenFeature(dtypes.string),
      "weights": parsing_ops.VarLenFeature(dtypes.int32)
  }, weighted_ids.config)

  with self.assertRaisesRegexp(ValueError,
                               "dtype is not convertible to float"):
    weighted_ids = fc.weighted_sparse_column(
        ids, "weights", dtype=dtypes.string)

def tf_ops(self, capacity=32):
  images = ops.convert_to_tensor(self._image_fn_list, dtype=dtypes.string)
  labels = ops.convert_to_tensor(self._label_list, dtype=dtypes.int32)

  # Makes an input queue
  im_fn_q, labl_q = tf.train.slice_input_producer(
      [images, labels], capacity=capacity, shuffle=True)

  file_contents_q = tf.read_file(im_fn_q)
  im_q = self._decoder(file_contents_q, channels=3)

  return im_q, labl_q

def _shard_indices(self, keys):
  if self._key_dtype == dtypes.string:
    indices = string_ops.string_to_hash_bucket_fast(keys, self._num_shards)
  else:
    indices = math_ops.mod(keys, self._num_shards)
  return math_ops.cast(indices, dtypes.int32)

def __init__(self, length, key, sequences, context):
  length = ops.convert_to_tensor(length, name="length")
  key = ops.convert_to_tensor(key, name="key")
  if not isinstance(sequences, dict):
    raise TypeError("sequences must be a dict")
  if not isinstance(context, dict):
    raise TypeError("context must be a dict")
  if not sequences:
    raise ValueError("must have at least one sequence tensor")
  for k in sequences.keys():
    if not isinstance(k, six.string_types):
      raise TypeError("sequence key must be string: %s" % k)
    if ":" in k:
      raise ValueError("sequence key may not have a colon: '%s'" % k)
  for k in context.keys():
    if not isinstance(k, six.string_types):
      raise TypeError("context key must be string: %s" % k)
    if ":" in k:
      raise ValueError("context key may not have a colon: '%s'" % k)
  sequences = dict(
      (k, ops.convert_to_tensor(v, name="sequence_%s" % k))
      for k, v in sequences.items())
  context = dict(
      (k, ops.convert_to_tensor(v, name="context_%s" % k))
      for k, v in context.items())
  self._length = length
  self._key = key
  self._sequences = sequences
  self._context = context

def key(self):
  """The key names of the given truncated unrolled examples.

  The format of the key is:

  ```python
  "%05d_of_%05d:%s" % (sequence, sequence_count, original_key)
  ```

  where `original_key` is the unique key read in by the prefetcher.

  Returns:
    A string vector of length `batch_size`, the keys.
  """
  return self._state_saver._received_keys

def _create_barrier(self):
  """Create the barrier.

  This method initializes the Barrier object with the right types and shapes.
  """
  # Create the barrier
  sequence_dtypes = [v.dtype for k, v in self._sorted_sequences.items()]
  context_dtypes = [v.dtype for k, v in self._sorted_context.items()]
  state_dtypes = [v.dtype for k, v in self._sorted_states.items()]
  types = ([dtypes.int32,   # length
            dtypes.int32,   # total_length
            dtypes.string,  # next_keys
            dtypes.int32,   # sequence
            dtypes.int32]   # expanded_sequence_count
           + sequence_dtypes + context_dtypes + state_dtypes)
  sequence_shapes = [
      [self._num_unroll] + self._sorted_sequences[k].get_shape().as_list()[1:]
      for k in self._sorted_sequences.keys()]
  context_shapes = [
      self._sorted_context[k].get_shape().as_list()
      for k in self._sorted_context.keys()]
  state_shapes = [
      self._sorted_states[k].get_shape().as_list()
      for k in self._sorted_states.keys()]
  shapes = ([(),  # length
             (),  # total_length
             (),  # next_keys
             (),  # sequence
             ()]  # expanded_sequence_count
            + sequence_shapes + context_shapes + state_shapes)

  self._barrier = data_flow_ops.Barrier(types=types, shapes=shapes)

def initialize(self, table):
  """Initializes the table from a text file.

  Args:
    table: The table to be initialized.

  Returns:
    The operation that initializes the table.

  Raises:
    TypeError: when the keys and values data types do not match the table
    key and value data types.
  """
  # pylint: disable=protected-access
  table._check_table_dtypes(self.key_dtype, self.value_dtype)
  with ops.name_scope(self._name, "text_file_init", [table]) as scope:
    filename = ops.convert_to_tensor(
        self._filename, dtypes.string, name="asset_filepath")
    init_op = gen_data_flow_ops._initialize_table_from_text_file(
        table.table_ref,
        filename,
        self._key_index,
        self._value_index,
        -1 if self._vocab_size is None else self._vocab_size,
        self._delimiter,
        name=scope)
  # pylint: enable=protected-access
  ops.add_to_collection(ops.GraphKeys.TABLE_INITIALIZERS, init_op)
  ops.add_to_collection(ops.GraphKeys.ASSET_FILEPATHS, filename)
  return init_op

def _file_path_value(self, path_tensor):
  """Returns the filepath value stored in constant `path_tensor`."""
  if not isinstance(path_tensor, ops.Tensor):
    raise TypeError("tensor is not a Tensor")
  if path_tensor.op.type != "Const":
    raise TypeError("Only constants tensor are supported")
  if path_tensor.dtype != dtypes.string:
    raise TypeError("File paths should be string")
  str_value = path_tensor.op.get_attr("value").string_val
  if len(str_value) != 1:
    raise TypeError("Only scalar tensors are supported")
  return str_value[0]

def _make_test_csv():
  f = tempfile.NamedTemporaryFile(
      dir=tf.test.get_temp_dir(), delete=False, mode="w")
  w = csv.writer(f)
  w.writerow(["int", "float", "bool", "string"])
  for _ in range(100):
    intvalue = np.random.randint(-10, 10)
    floatvalue = np.random.rand()
    boolvalue = int(np.random.rand() > 0.3)
    stringvalue = "S: %.4f" % np.random.rand()

    row = [intvalue, floatvalue, boolvalue, stringvalue]
    w.writerow(row)
  f.close()
  return f.name

def testFromCSVWithFeatureSpec(self):
  if not HAS_PANDAS:
    return
  num_batches = 100
  batch_size = 8

  data_path = _make_test_csv_sparse()
  feature_spec = {
      "int": tf.FixedLenFeature(None, dtypes.int16, np.nan),
      "float": tf.VarLenFeature(dtypes.float16),
      "bool": tf.VarLenFeature(dtypes.bool),
      "string": tf.FixedLenFeature(None, dtypes.string, "")
  }

  pandas_df = pd.read_csv(data_path, dtype={"string": object})
  # Pandas insanely uses NaN for empty cells in a string column.
  # And, we can't use Pandas replace() to fix them because nan != nan
  s = pandas_df["string"]
  for i in range(0, len(s)):
    if isinstance(s[i], float) and math.isnan(s[i]):
      pandas_df.set_value(i, "string", "")
  tensorflow_df = df.TensorFlowDataFrame.from_csv_with_feature_spec(
      [data_path],
      batch_size=batch_size,
      shuffle=False,
      feature_spec=feature_spec)

  # These columns were sparse; re-densify them for comparison
  tensorflow_df["float"] = densify.Densify(np.nan)(tensorflow_df["float"])
  tensorflow_df["bool"] = densify.Densify(np.nan)(tensorflow_df["bool"])

  self._assert_pandas_equals_tensorflow(
      pandas_df,
      tensorflow_df,
      num_batches=num_batches,
      batch_size=batch_size)

def _dtype_to_nan(dtype):
  if dtype is dtypes.string:
    return b""
  elif dtype.is_integer:
    return np.nan
  elif dtype.is_floating:
    return np.nan
  elif dtype is dtypes.bool:
    return np.nan
  else:
    raise ValueError("Can't parse type without NaN into sparse tensor: %s" %
                     dtype)

def split(self, index_series, proportion, batch_size=None):
  """Deterministically split a `DataFrame` into two `DataFrame`s.

  Note this split is only as deterministic as the underlying hash function;
  see `tf.string_to_hash_bucket_fast`.  The hash function is deterministic
  for a given binary, but may change occasionally.  The only way to achieve
  an absolute guarantee that the split `DataFrame`s do not change across runs
  is to materialize them.

  Note too that the allocation of a row to one partition or the other is
  evaluated independently for each row, so the exact number of rows in each
  partition is binomially distributed.

  Args:
    index_series: a `Series` of unique strings, whose hash will determine the
      partitioning; or the name in this `DataFrame` of such a `Series`.
      (This `Series` must contain strings because TensorFlow provides hash
      ops only for strings, and there are no number-to-string converter ops.)
    proportion: The proportion of the rows to select for the 'left'
      partition; the remaining (1 - proportion) rows form the 'right'
      partition.
    batch_size: the batch size to use when rebatching the left and right
      `DataFrame`s.  If None (default), the `DataFrame`s are not rebatched;
      thus their batches will have variable sizes, according to which rows
      are selected from each batch of the original `DataFrame`.

  Returns:
    Two `DataFrame`s containing the partitioned rows.
  """
  if isinstance(index_series, str):
    index_series = self[index_series]
  left_mask, = split_mask.SplitMask(proportion)(index_series)
  right_mask = ~left_mask
  left_rows = self.select_rows(left_mask)
  right_rows = self.select_rows(right_mask)

  if batch_size:
    left_rows = left_rows.batch(batch_size=batch_size, shuffle=False)
    right_rows = right_rows.batch(batch_size=batch_size, shuffle=False)

  return left_rows, right_rows

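The hash-based masking that `split` relies on can be illustrated directly with `tf.string_to_hash_bucket_fast`. This is a minimal sketch of the idea (TF 1.x graph mode; the row keys, bucket count, and proportion are illustrative), not the `SplitMask` implementation itself:

```python
# Hash each string key into one of `num_buckets` buckets and keep roughly
# `proportion` of the buckets on the "left" side of the split.
import tensorflow as tf

row_keys = tf.constant(["row-0", "row-1", "row-2", "row-3"])  # illustrative
num_buckets = 1000
proportion = 0.75

buckets = tf.string_to_hash_bucket_fast(row_keys, num_buckets)
left_mask = buckets < int(proportion * num_buckets)
right_mask = tf.logical_not(left_mask)

with tf.Session() as sess:
  print(sess.run([left_mask, right_mask]))
```

Because each key is hashed independently, the left/right row counts are only approximately 75/25, which is exactly the binomial behavior the docstring describes.
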
def key(self):
  """Returns a string which will be used as a key when we do sorting."""
  pass

def insert_transformed_feature(self, columns_to_tensors):
  """Apply transformation and inserts it into columns_to_tensors.

  Args:
    columns_to_tensors: A mapping from feature columns to tensors. 'string'
      key means a base feature (not-transformed). It can have _FeatureColumn
      as a key too. That means that _FeatureColumn is already transformed.
  """
  raise NotImplementedError("Transform is not implemented for {}.".format(
      self))

def key(self):
  """Returns a string which will be used as a key when we do sorting."""
  return "{}".format(self)

def sparse_column_with_integerized_feature(column_name,
                                           bucket_size,
                                           combiner=None,
                                           dtype=dtypes.int64):
  """Creates an integerized _SparseColumn.

  Use this when your features are already pre-integerized into int64 IDs.
  output_id = input_feature

  Args:
    column_name: A string defining sparse column name.
    bucket_size: An int that is > 1. The number of buckets. It should be
      bigger than maximum feature. In other words features in this column
      should be an int64 in range [0, bucket_size)
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: Type of features. It should be an integer type. Default value is
      dtypes.int64.

  Returns:
    An integerized _SparseColumn definition.

  Raises:
    ValueError: bucket_size is not greater than 1.
    ValueError: dtype is not integer.
  """
  if combiner is None:
    logging.warn("The default value of combiner will change from \"sum\" "
                 "to \"sqrtn\" after 2016/11/01.")
    combiner = "sum"
  return _SparseColumnIntegerized(
      column_name, bucket_size, combiner=combiner, dtype=dtype)

def sparse_column_with_hash_bucket(column_name,
                                   hash_bucket_size,
                                   combiner=None,
                                   dtype=dtypes.string):
  """Creates a _SparseColumn with hashed bucket configuration.

  Use this when your sparse features are in string or integer format, but you
  don't have a vocab file that maps each value to an integer ID.
  output_id = Hash(input_feature_string) % bucket_size

  Args:
    column_name: A string defining sparse column name.
    hash_bucket_size: An int that is > 1. The number of buckets.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.
    dtype: The type of features. Only string and integer types are supported.

  Returns:
    A _SparseColumn with hashed bucket configuration

  Raises:
    ValueError: hash_bucket_size is not greater than 2.
    ValueError: dtype is neither string nor integer.
  """
  if combiner is None:
    logging.warn("The default value of combiner will change from \"sum\" "
                 "to \"sqrtn\" after 2016/11/01.")
    combiner = "sum"
  return _SparseColumnHashed(column_name, hash_bucket_size, combiner, dtype)

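For context, here is a hedged usage sketch of the hashed column above via the TF 1.x `tf.contrib.layers` API; the column name, bucket count, and embedding dimension are illustrative assumptions, not values from the source:

```python
# Illustrative only: build a hashed sparse column for string features and
# wrap it in an embedding column so it can feed a DNN-style estimator.
import tensorflow as tf

words = tf.contrib.layers.sparse_column_with_hash_bucket(
    "words", hash_bucket_size=1000)  # ids come from Hash(word) % 1000
word_embedding = tf.contrib.layers.embedding_column(words, dimension=8)
```
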
def __new__(cls, column_name, keys, default_value=-1, combiner="sum"):
  return super(_SparseColumnKeys, cls).__new__(
      cls,
      column_name,
      combiner=combiner,
      lookup_config=_SparseIdLookupConfig(
          keys=keys, vocab_size=len(keys), default_value=default_value),
      dtype=dtypes.string)

def sparse_column_with_keys(column_name, keys, default_value=-1,
                            combiner=None):
  """Creates a _SparseColumn with keys.

  Look up logic is as follows:
  lookup_id = index_of_feature_in_keys if feature in keys else default_value

  Args:
    column_name: A string defining sparse column name.
    keys: a string list defining vocabulary.
    default_value: The value to use for out-of-vocabulary feature values.
      Default is -1.
    combiner: A string specifying how to reduce if the sparse column is
      multivalent. Currently "mean", "sqrtn" and "sum" are supported, with
      "sum" the default:
        * "sum": do not normalize features in the column
        * "mean": do l1 normalization on features in the column
        * "sqrtn": do l2 normalization on features in the column
      For more information: `tf.embedding_lookup_sparse`.

  Returns:
    A _SparseColumnKeys with keys configuration.
  """
  if combiner is None:
    logging.warn("The default value of combiner will change from \"sum\" "
                 "to \"sqrtn\" after 2016/11/01.")
    combiner = "sum"
  return _SparseColumnKeys(
      column_name, tuple(keys), default_value=default_value,
      combiner=combiner)

def weighted_sparse_column(sparse_id_column,
                           weight_column_name,
                           dtype=dtypes.float32):
  """Creates a _SparseColumn by combining sparse_id_column with a weight column.

  Args:
    sparse_id_column: A `_SparseColumn` which is created by
      `sparse_column_with_*` functions.
    weight_column_name: A string defining a sparse column name which
      represents weight or value of the corresponding sparse id feature.
    dtype: Type of weights, such as `tf.float32`

  Returns:
    A _WeightedSparseColumn composed of two sparse features: one represents id,
    the other represents weight (value) of the id feature in that example.

  Raises:
    ValueError: if dtype is not convertible to float.

  An example usage:
    ```python
    words = sparse_column_with_hash_bucket("words", 1000)
    tfidf_weighted_words = weighted_sparse_column(words, "tfidf_score")
    ```

    This configuration assumes that input dictionary of model contains the
    following two items:
      * (key="words", value=word_tensor) where word_tensor is a SparseTensor.
      * (key="tfidf_score", value=tfidf_score_tensor) where tfidf_score_tensor
        is a SparseTensor.
    Following are assumed to be true:
      * word_tensor.indices = tfidf_score_tensor.indices
      * word_tensor.shape = tfidf_score_tensor.shape
  """
  if not (dtype.is_integer or dtype.is_floating):
    raise ValueError("dtype is not convertible to float. Given {}".format(
        dtype))

  return _WeightedSparseColumn(sparse_id_column, weight_column_name, dtype)