def testNGramsWithSpaceSeparator(self):
    """Checks unigrams and bigrams of word tokens joined by a single space."""
    sentences = tf.constant(['One was Johnny', 'Two was a rat'])
    words = tf.string_split(sentences, delimiter=' ')
    ngrams_op = mappers.ngrams(
        tokens=words, ngram_range=(1, 2), separator=' ')

    # The first sentence is expected to yield five n-grams, the second seven.
    expected_indices = (
        [[0, col] for col in range(5)] + [[1, col] for col in range(7)])
    expected_values = [
        'One', 'One was', 'was', 'was Johnny', 'Johnny',
        'Two', 'Two was', 'was', 'was a', 'a', 'a rat', 'rat']
    expected_dense_shape = [2, 7]

    with tf.Session():
        output = ngrams_op.eval()
        self.assertAllEqual(output.indices, expected_indices)
        self.assertAllEqual(output.values, expected_values)
        self.assertAllEqual(output.dense_shape, expected_dense_shape)
def full_onehot_process_line_as_2d_input(the_str, num_samples=-1):
    """Convert a string of board symbols into one-hot encoded rows.

    The string is split into single characters, each character is looked up
    in a 16-symbol vocabulary, and the matching row of a 16x16 identity
    matrix is gathered for it.

    Args:
        the_str: string (or scalar string tensor) of board symbols.
        num_samples: leading dimension of the result; the default -1 lets
            `tf.reshape` infer it.

    Returns:
        A float32 tensor reshaped to [num_samples, 64, 16].
    """
    with tf.name_scope("process_data_2d"):
        symbols = ["0", "1", "K", "Q", "R", "B", "N", "P", "C",
                   "k", "q", "r", "b", "n", "p", "c"]
        depth = len(symbols)  # 16 one-hot channels, one per symbol

        # Vocabulary tensor backing the symbol -> index lookup table.
        mapping_strings = tf.constant(symbols)

        # Identity matrix: row i is the one-hot vector for symbol i.
        identity_rows = [[1 if col == row else 0 for col in range(depth)]
                         for row in range(depth)]
        the_values = tf.constant(identity_rows, dtype=tf.float32)

        the_table = tf.contrib.lookup.index_table_from_tensor(
            mapping=mapping_strings, name="index_lookup_table")

        # An empty delimiter splits the string into individual characters.
        characters = tf.string_split([the_str], delimiter="").values
        symbol_ids = the_table.lookup(characters)
        one_hot_rows = tf.gather(the_values, symbol_ids)

        # Original author's note: this should really be
        # [3x8x8, num_mapping_strings].
        return tf.reshape(one_hot_rows, [num_samples, 64, depth])
def decode(self, data, items):
    """Split `data` into tokens and return the requested features.

    Args:
        data: string (or scalar string tensor) to tokenize on
            `self.delimiter`.
        items: iterable of feature names; each must be either
            `self.length_feature_name` or `self.tokens_feature_name`.

    Returns:
        A list of tensors, one per entry of `items`, in the same order.
    """
    pieces = tf.string_split([data], delimiter=self.delimiter).values

    # Special tokens are optional on both ends of the sequence.
    if self.prepend_token is not None:
        pieces = tf.concat([[self.prepend_token], pieces], 0)
    if self.append_token is not None:
        pieces = tf.concat([pieces, [self.append_token]], 0)

    # The length is taken after the special tokens have been added.
    features_by_name = {}
    features_by_name[self.length_feature_name] = tf.size(pieces)
    features_by_name[self.tokens_feature_name] = pieces
    return [features_by_name[name] for name in items]
def image_reading(path: str, resized_size: Tuple[int, int]=None, data_augmentation: bool=False,
                  padding: bool=False) -> Tuple[tf.Tensor, tf.Tensor]:
    """Read an image file, decode it to one channel, then pad or resize it.

    Args:
        path: path of the image file. Files whose extension is exactly
            'jpg' are decoded as JPEG; everything else is decoded as PNG.
        resized_size: target size handed to the padding helper or to
            `tf.image.resize_images`.
        data_augmentation: when True, the decoded image goes through
            `augment_data`.
        padding: when True, the image is padded with `padding_inputs_width`;
            otherwise it is resized to `resized_size`.

    Returns:
        A tuple `(image, img_width)`.
    """
    # Read image
    image_content = tf.read_file(path, name='image_reader')

    # Use the LAST dot-separated piece as the extension. Indexing with [1]
    # picked the wrong piece for any path containing more than one dot
    # (e.g. 'scan.v2.jpg'), sending JPEG data to the PNG decoder.
    extension = tf.string_split([path], '.').values[-1]
    image = tf.cond(tf.equal(extension, tf.constant('jpg', dtype=tf.string)),
                    true_fn=lambda: tf.image.decode_jpeg(image_content, channels=1,
                                                         try_recover_truncated=True),  # TODO channels = 3 ?
                    false_fn=lambda: tf.image.decode_png(image_content, channels=1),
                    name='image_decoding')

    # Data augmentation
    if data_augmentation:
        image = augment_data(image)

    # Padding
    if padding:
        with tf.name_scope('padding'):
            image, img_width = padding_inputs_width(image, resized_size,
                                                    increment=CONST.DIMENSION_REDUCTION_W_POOLING)
    # Resize
    else:
        image = tf.image.resize_images(image, size=resized_size)
        img_width = tf.shape(image)[1]

    # NOTE(review): no new op is created inside this context, so the
    # assertion is presumably never attached to the returned tensors - confirm
    # whether it is meant to be enforced.
    with tf.control_dependencies([tf.assert_equal(image.shape[:2], resized_size)]):
        return image, img_width
def testTFIDFNoData(self):
    """An input holding a single empty document yields empty tf-idf output."""

    def preprocessing_fn(inputs):
        tokens = tf.string_split(inputs['a'])
        token_ids = tft.string_to_int(tokens)
        vocab_index, weights = tft.tfidf(token_ids, 6)
        return {'tf_idf': weights, 'index': vocab_index}

    raw_text_schema = sch.ColumnSchema(
        tf.string, [], sch.FixedColumnRepresentation())
    input_schema = dataset_metadata.DatasetMetadata({'a': raw_text_schema})
    input_data = [{'a': ''}]

    expected_transformed_data = [{'tf_idf': [], 'index': []}]
    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(
            tf.float32, [None], sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(
            tf.int64, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn,
        expected_transformed_data, expected_transformed_schema)
def testUniquesAnalyzerWithTokenization(self):
    """Vocabulary ids are assigned to whitespace-separated tokens."""

    def preprocessing_fn(inputs):
        tokens = tf.string_split(inputs['a'])
        return {'index': tft.string_to_int(tokens)}

    input_data = [{'a': 'hello hello world'}, {'a': 'hello goodbye world'}]
    raw_text_schema = sch.ColumnSchema(
        tf.string, [], sch.FixedColumnRepresentation())
    input_metadata = dataset_metadata.DatasetMetadata({'a': raw_text_schema})

    expected_data = [{'index': [0, 0, 1]}, {'index': [0, 2, 1]}]
    index_domain = sch.IntDomain(
        tf.int64, -1, 2, True, 'vocab_string_to_int_uniques')
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index': sch.ColumnSchema(
            index_domain, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_data, expected_metadata)
def make_preprocessing_fn(frequency_threshold):
    """Build the preprocessing function for the reddit columns.

    Args:
        frequency_threshold: passed as `frequency_threshold` to every
            `tft.string_to_int` call that builds a vocabulary.

    Returns:
        A function mapping the input column dict to the transformed dict.
    """
    text_columns = ('author', 'comment_body', 'comment_parent_body')

    def preprocessing_fn(inputs):
        """Transform the reddit input columns.

        Args:
            inputs: dictionary of input `tensorflow_transform.Column`.

        Returns:
            A dictionary with 'score' and 'toplevel' passed through,
            'subreddit_id' as a vocabulary id, and one '<name>_bow' entry
            per text column.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        result = {'score': inputs['score'], 'toplevel': inputs['toplevel']}

        result['subreddit_id'] = tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold)

        for column_name in text_columns:
            tokens = tf.string_split(inputs[column_name])
            # TODO(b/33467613) Translate these to bag-of-words style sparse
            # features.
            result[column_name + '_bow'] = tft.string_to_int(
                tokens, frequency_threshold=frequency_threshold)

        return result

    return preprocessing_fn
def __init__(self, config, batch_size, one_hot=False):
    """Build a shuffled batch queue of fixed-width lines of "chargan.txt".

    Each line is padded with spaces, cut to 64 characters, split into
    characters and mapped to vocabulary ids.

    Args:
        config: not used by this constructor.
        batch_size: batch size handed to `tf.train.shuffle_batch`.
        one_hot: when True, ids are one-hot encoded; otherwise they are
            shifted and scaled by half the vocabulary size.
    """
    line_width = 64

    self.lookup = None

    reader = tf.TextLineReader()
    filename_queue = tf.train.string_input_producer(["chargan.txt"])
    key, line = reader.read(filename_queue)

    vocabulary = self.get_vocabulary()
    table = tf.contrib.lookup.string_to_index_table_from_tensor(
        mapping=vocabulary, default_value=0)

    # Right-pad with spaces so every line has at least `line_width`
    # characters before it is truncated.
    padded = tf.string_join([line, tf.constant(" " * line_width)])
    clipped = tf.substr(padded, [0], [line_width])
    chars = tf.string_split(clipped, delimiter='')
    chars = tf.sparse_tensor_to_dense(chars, default_value=' ')
    chars = tf.reshape(chars, [line_width])
    x = table.lookup(chars)

    self.one_hot = one_hot
    if one_hot:
        x = tf.one_hot(x, len(vocabulary))
        x = tf.cast(x, dtype=tf.float32)
        rows = int(x.get_shape()[0])
        cols = int(x.get_shape()[1])
        x = tf.reshape(x, [1, rows, cols, 1])
    else:
        x = tf.cast(x, dtype=tf.float32)
        # Shift and scale ids by half the vocabulary size.
        x -= len(vocabulary) / 2.0
        x /= len(vocabulary) / 2.0
        x = tf.reshape(x, [1, 1, line_width, 1])

    num_preprocess_threads = 8
    x = tf.train.shuffle_batch(
        [x],
        batch_size=batch_size,
        num_threads=num_preprocess_threads,
        capacity=5000,
        min_after_dequeue=500,
        enqueue_many=True)

    self.x = x
    self.table = table
def testStringToTFIDFEmptyDoc(self):
    """tf-idf over four documents, one of which is the empty string."""

    def preprocessing_fn(inputs):
        tokens = tf.string_split(inputs['a'])
        token_ids = tft.string_to_int(tokens)
        vocab_index, weights = tft.tfidf(token_ids, 6)
        return {'tf_idf': weights, 'index': vocab_index}

    input_data = [
        {'a': 'hello hello world'},
        {'a': ''},
        {'a': 'hello goodbye hello world'},
        {'a': 'I like pie pie pie'},
    ]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # Numerically these are 1 + ln(5/2) and 1 + ln(5/3).
    log_5_over_2 = 1.91629073187
    log_5_over_3 = 1.51082562376

    first_doc = {
        'tf_idf': [(2/3)*log_5_over_3, (1/3)*log_5_over_3],
        'index': [0, 2]
    }
    empty_doc = {'tf_idf': [], 'index': []}
    third_doc = {
        'tf_idf': [(2/4)*log_5_over_3, (1/4)*log_5_over_3,
                   (1/4)*log_5_over_2],
        'index': [0, 2, 4]
    }
    fourth_doc = {
        'tf_idf': [(3/5)*log_5_over_2, (1/5)*log_5_over_2,
                   (1/5)*log_5_over_2],
        'index': [1, 3, 5]
    }
    expected_transformed_data = [first_doc, empty_doc, third_doc, fourth_doc]

    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(
            tf.float32, [None], sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(
            tf.int64, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn,
        expected_transformed_data, expected_transformed_schema)
def testUniquesAnalyzerWithHighFrequencyThresholdAndOOVBuckets(self):
    """Vocabulary truncated with top_k=1, with three OOV buckets."""

    def preprocessing_fn(inputs):
        tokens = tf.string_split(inputs['a'])
        token_ids = tft.string_to_int(
            tokens, default_value=-99, top_k=1, num_oov_buckets=3)
        return {'index1': token_ids}

    input_data = [
        {'a': 'hello hello world world'},
        {'a': 'hello tarkus toccata'},
        {'a': 'hello goodbye foo'},
    ]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # Generated vocab (ordered by frequency, then value) should be:
    # ["hello", "world", "goodbye", "foo", "tarkus", "toccata"]. top_k=1
    # reduces it to ["hello"] plus three OOV buckets.
    # The OOV ids below depend on the hash of the words, so this test will
    # break if the hash changes.
    expected_data = [
        {'index1': [0, 0, 2, 2]},
        {'index1': [0, 3, 1]},
        {'index1': [0, 2, 1]},
    ]
    index_domain = sch.IntDomain(
        tf.int64, 0, 3, True, 'vocab_string_to_int_uniques')
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index1': sch.ColumnSchema(
            index_domain, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_data, expected_metadata)
def testNGramsEmpty(self):
    """A single empty string produces an empty sparse n-gram tensor."""
    no_tokens = tf.string_split(tf.constant(['']))
    ngrams_op = mappers.ngrams(no_tokens, (1, 5), '')

    with tf.Session():
        output = ngrams_op.eval()
        # No entries at all: zero index rows and zero values.
        self.assertEqual((0, 2), output.indices.shape)
        self.assertAllEqual([1, 0], output.dense_shape)
        self.assertEqual(0, len(output.values))
def testNGrams(self):
    """Character n-grams of length 1..5 with an empty separator."""
    strings = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    characters = tf.string_split(strings, delimiter='')
    ngrams_op = mappers.ngrams(
        tokens=characters, ngram_range=(1, 5), separator='')

    # Expected n-grams per input row; the last input (the empty string)
    # contributes no entries.
    ngrams_per_row = [
        ['a', 'ab', 'abc', 'b', 'bc', 'c'],
        ['d', 'de', 'def', 'e', 'ef', 'f'],
        ['f', 'fg', 'fgh', 'fghi', 'fghij',
         'g', 'gh', 'ghi', 'ghij', 'ghijk',
         'h', 'hi', 'hij', 'hijk', 'hijkl',
         'i', 'ij', 'ijk', 'ijkl', 'ijklm',
         'j', 'jk', 'jkl', 'jklm',
         'k', 'kl', 'klm',
         'l', 'lm',
         'm'],
        ['z'],
    ]
    expected_indices = [
        [row, col]
        for row, grams in enumerate(ngrams_per_row)
        for col in range(len(grams))
    ]
    expected_values = [gram for grams in ngrams_per_row for gram in grams]

    self.assertSparseOutput(
        expected_indices=expected_indices,
        expected_values=expected_values,
        expected_shape=[5, 30],
        actual_sparse_tensor=ngrams_op,
        close_values=False)
def testNGramsBadSizes(self):
    """Invalid n-gram ranges are rejected with a ValueError."""
    strings = tf.constant(['abc', 'def', 'fghijklm', 'z', ''])
    characters = tf.string_split(strings, delimiter='')

    # First a zero lower bound, then a lower bound above the upper bound.
    for bad_range in ((0, 5), (6, 5)):
        with self.assertRaisesRegexp(ValueError, 'Invalid ngram_range'):
            mappers.ngrams(characters, bad_range, separator='')
def get_inference_input(inputs, params):
    """Build the feature dictionary used for inference.

    Args:
        inputs: source sentences, convertible with `tf.constant`.
        params: object providing `num_threads`, `eos`, `pad`, `unk`,
            `decode_batch_size`, `vocabulary` and `mapping`.

    Returns:
        A dict with "source" (vocabulary ids) and "source_length".
    """

    def _tokenize(sentence):
        return tf.string_split([sentence]).values

    def _append_eos(tokens):
        return tf.concat([tokens, [tf.constant(params.eos)]], axis=0)

    def _as_features(tokens):
        return {"source": tokens, "source_length": tf.shape(tokens)[0]}

    dataset = tf.data.Dataset.from_tensor_slices(tf.constant(inputs))

    # Split string, append <eos>, then convert to a dictionary.
    for transform in (_tokenize, _append_eos, _as_features):
        dataset = dataset.map(transform,
                              num_parallel_calls=params.num_threads)

    padded_shapes = {"source": [tf.Dimension(None)], "source_length": []}
    padding_values = {"source": params.pad, "source_length": 0}
    dataset = dataset.padded_batch(
        params.decode_batch_size, padded_shapes, padding_values)

    features = dataset.make_one_shot_iterator().get_next()

    # Tokens missing from the vocabulary map to the id of params.unk.
    src_table = tf.contrib.lookup.index_table_from_tensor(
        tf.constant(params.vocabulary["source"]),
        default_value=params.mapping["source"][params.unk]
    )
    features["source"] = src_table.lookup(features["source"])

    return features
def get_infer_iterator(
        src_dataset, src_vocab_table, batch_size,
        source_reverse, eos, src_max_len=None):
    """Create an initializable, padded batch iterator for inference.

    Args:
        src_dataset: dataset of source lines.
        src_vocab_table: lookup table mapping word strings to ids.
        batch_size: number of sequences per batch.
        source_reverse: when truthy, each id sequence is reversed.
        eos: end-of-sentence token; its id is used as padding value.
        src_max_len: optional maximum number of tokens kept per line.

    Returns:
        A `BatchedInput` with only the source fields populated.
    """
    src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(eos)), tf.int32)

    def _tokenize(line):
        return tf.string_split([line]).values

    def _to_ids(words):
        # Convert the word strings to ids.
        return tf.cast(src_vocab_table.lookup(words), tf.int32)

    def _with_length(ids):
        # Add in the word counts.
        return ids, tf.size(ids)

    src_dataset = src_dataset.map(_tokenize)
    if src_max_len:
        src_dataset = src_dataset.map(lambda words: words[:src_max_len])
    src_dataset = src_dataset.map(_to_ids)
    if source_reverse:
        src_dataset = src_dataset.map(lambda ids: tf.reverse(ids, axis=[0]))
    src_dataset = src_dataset.map(_with_length)

    # The first component is an unknown-length id vector, the second a
    # scalar length. Sequences are padded with the eos id (later masking
    # generally makes the pad value irrelevant); the length pad is unused.
    batched_dataset = src_dataset.padded_batch(
        batch_size,
        padded_shapes=(tf.TensorShape([None]), tf.TensorShape([])),
        padding_values=(src_eos_id, 0))

    batched_iter = batched_dataset.make_initializable_iterator()
    src_ids, src_seq_len = batched_iter.get_next()

    return BatchedInput(
        initializer=batched_iter.initializer,
        source=src_ids,
        target_input=None,
        target_output=None,
        source_sequence_length=src_seq_len,
        target_sequence_length=None)
def get_input_fn(batch_size, num_epochs, context_filename, answer_filename, max_sequence_len):
    """Return an input function reading two aligned files of integer ids.

    Args:
        batch_size: number of examples per padded batch.
        num_epochs: value passed to `Dataset.repeat`.
        context_filename: text file whose lines hold the source ids.
        answer_filename: text file whose lines hold the target ids.
        max_sequence_len: sequences are cut to, and padded to, this length.

    Returns:
        A zero-argument function returning `(next_element, None)`, where
        `next_element` is `((source_ids, source_len), (target_ids, target_len))`.
    """

    def input_fn():
        def _split_line(line):
            return tf.string_split([line]).values

        def _parse_ids(tokens):
            return tf.string_to_number(tokens, tf.int64)

        def _attach_size(ids):
            return ids, tf.size(ids)

        def _clip(ids, size):
            return ids[:max_sequence_len], tf.minimum(size, max_sequence_len)

        def map_dataset(dataset):
            for step in (_split_line, _parse_ids, _attach_size, _clip):
                dataset = dataset.map(step)
            return dataset

        source_dataset = tf.contrib.data.TextLineDataset(context_filename)
        target_dataset = tf.contrib.data.TextLineDataset(answer_filename)
        source_dataset = map_dataset(source_dataset)
        target_dataset = map_dataset(target_dataset)

        dataset = tf.contrib.data.Dataset.zip((source_dataset, target_dataset))
        dataset = dataset.repeat(num_epochs)

        # Both sides share the same (ids, length) padded shapes.
        side_shapes = (tf.TensorShape([max_sequence_len]), tf.TensorShape([]))
        dataset = dataset.padded_batch(
            batch_size, padded_shapes=(side_shapes, side_shapes))

        next_element = dataset.make_one_shot_iterator().get_next()
        return next_element, None

    return input_fn
def get_test_iterator(src_dataset, src_vocab_table, batch_size, config):
    """Create an initializable, padded batch iterator over test sources.

    Args:
        src_dataset: dataset of source lines.
        src_vocab_table: lookup table mapping word strings to ids.
        batch_size: number of sequences per batch.
        config: object providing `eos`, `src_max_len` and `reverse_src`.

    Returns:
        A `BatchedInput` with only the source fields populated.
    """
    src_eos_id = tf.cast(src_vocab_table.lookup(tf.constant(config.eos)), tf.int32)

    src_dataset = src_dataset.map(lambda src: tf.string_split([src]).values)
    src_dataset = src_dataset.map(lambda src: src[:config.src_max_len])
    # Convert the word strings to ids.
    src_dataset = src_dataset.map(
        lambda src: tf.cast(src_vocab_table.lookup(src), tf.int32))

    if config.reverse_src:
        src_dataset = src_dataset.map(lambda src: tf.reverse(src, axis=[0]))

    # Pair every id sequence with its length.
    src_dataset = src_dataset.map(lambda src: (src, tf.size(src)))

    def batching_func(x):
        # Honour the explicit `batch_size` argument; it used to be ignored
        # in favour of config.batch_size.
        return x.padded_batch(
            batch_size,
            padded_shapes=(tf.TensorShape([None]),  # src
                           tf.TensorShape([])),     # src_len
            padding_values=(src_eos_id,  # src, padded with the eos id
                            0))          # src_len -- unused

    batched_dataset = batching_func(src_dataset)
    batched_iter = batched_dataset.make_initializable_iterator()
    src_ids, src_seq_len = batched_iter.get_next()

    return BatchedInput(
        initializer=batched_iter.initializer,
        source=src_ids,
        target_input=None,
        target_output=None,
        source_sequence_length=src_seq_len,
        target_sequence_length=None)
def _read_id_file(path) -> Dataset:
    """Read a text file of whitespace-separated integer ids, one row per line.

    Lines that contain no tokens (empty or whitespace-only) are dropped.

    Args:
        path: path of the text file to read.

    Returns:
        A dataset whose elements are 1-D int32 tensors of ids.
    """
    def _parse_line(line):
        splits = tf.string_split(tf.reshape(line, (-1,))).values
        return tf.string_to_number(splits, out_type=tf.int32)

    # Filter on the number of parsed ids. `tf.size(line)` on the raw scalar
    # string counts tensor elements (always 1), not characters, so the
    # previous filter never removed blank lines.
    return TextLineDataset(path) \
        .map(_parse_line) \
        .filter(lambda ids: tf.size(ids) > 0)
def testStringToTFIDF(self):
    """tf-idf over three whitespace-tokenized documents."""

    def preprocessing_fn(inputs):
        tokens = tf.string_split(inputs['a'])
        token_ids = tft.string_to_int(tokens)
        vocab_index, weights = tft.tfidf(token_ids, 6)
        return {'tf_idf': weights, 'index': vocab_index}

    input_data = [
        {'a': 'hello hello world'},
        {'a': 'hello goodbye hello world'},
        {'a': 'I like pie pie pie'},
    ]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # IDFs
    # hello = log(4/3) = 0.28768
    # world = log(4/3)
    # goodbye = log(4/2) = 0.69314
    # I = log(4/2)
    # like = log(4/2)
    # pie = log(4/2)
    # The constants below are those logarithms plus one.
    log_4_over_2 = 1.69314718056
    log_4_over_3 = 1.28768207245

    first_doc = {
        'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
        'index': [0, 2]
    }
    second_doc = {
        'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3,
                   (1/4)*log_4_over_2],
        'index': [0, 2, 4]
    }
    third_doc = {
        'tf_idf': [(3/5)*log_4_over_2, (1/5)*log_4_over_2,
                   (1/5)*log_4_over_2],
        'index': [1, 3, 5]
    }
    expected_transformed_data = [first_doc, second_doc, third_doc]

    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(
            tf.float32, [None], sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(
            tf.int64, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn,
        expected_transformed_data, expected_transformed_schema)
def testTFIDFWithOOV(self):
    """tf-idf when the vocabulary is limited to the top three tokens."""
    test_vocab_size = 3

    def preprocessing_fn(inputs):
        tokens = tf.string_split(inputs['a'])
        token_ids = tft.string_to_int(tokens, top_k=test_vocab_size)
        # tfidf is given one slot more than the vocabulary size.
        vocab_index, weights = tft.tfidf(token_ids, test_vocab_size+1)
        return {'tf_idf': weights, 'index': vocab_index}

    input_data = [
        {'a': 'hello hello world'},
        {'a': 'hello goodbye hello world'},
        {'a': 'I like pie pie pie'},
    ]
    input_schema = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # Original notes on the IDFs:
    # hello = log(3/3) = 0
    # pie = log(3/2) = 0.4054651081
    # world = log(3/3) = 0
    # OOV - goodbye, I, like = log(3/3)
    # NOTE(review): the constants below are 1 + ln(4/2) and 1 + ln(4/3),
    # which does not obviously match the notes above - confirm.
    log_4_over_2 = 1.69314718056
    log_4_over_3 = 1.28768207245

    first_doc = {
        'tf_idf': [(2/3)*log_4_over_3, (1/3)*log_4_over_3],
        'index': [0, 2]
    }
    second_doc = {
        'tf_idf': [(2/4)*log_4_over_3, (1/4)*log_4_over_3,
                   (1/4)*log_4_over_3],
        'index': [0, 2, 3]
    }
    third_doc = {
        'tf_idf': [(3/5)*log_4_over_2, (2/5)*log_4_over_3],
        'index': [1, 3]
    }
    expected_transformed_data = [first_doc, second_doc, third_doc]

    expected_transformed_schema = dataset_metadata.DatasetMetadata({
        'tf_idf': sch.ColumnSchema(
            tf.float32, [None], sch.ListColumnRepresentation()),
        'index': sch.ColumnSchema(
            tf.int64, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_schema, preprocessing_fn,
        expected_transformed_data, expected_transformed_schema)
def testUniquesAnalyzerWithFrequencyThreshold(self):
    """Tokens seen fewer than two times fall back to the default value."""

    def preprocessing_fn(inputs):
        with_int_threshold = tft.string_to_int(
            tf.string_split(inputs['a']),
            default_value=-99, frequency_threshold=2)
        # Same as above, but the threshold is given as a string and the
        # default value differs so the two outputs can be told apart.
        with_str_threshold = tft.string_to_int(
            tf.string_split(inputs['a']),
            default_value=-9, frequency_threshold='2')
        return {'index1': with_int_threshold, 'index2': with_str_threshold}

    input_data = [
        {'a': 'hello hello world'},
        {'a': 'hello goodbye world'},
        {'a': 'hello goodbye foo'},
    ]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # Generated vocab (ordered by frequency, then value) should be:
    # ["hello", "world", "goodbye", "foo"]. After applying
    # frequency_threshold=2 this becomes ["hello", "world", "goodbye"].
    expected_data = [
        {'index1': [0, 0, 1], 'index2': [0, 0, 1]},
        {'index1': [0, 2, 1], 'index2': [0, 2, 1]},
        {'index1': [0, 2, -99], 'index2': [0, 2, -9]},
    ]

    index1_domain = sch.IntDomain(
        tf.int64, -99, 2, True, 'vocab_string_to_int_uniques')
    index2_domain = sch.IntDomain(
        tf.int64, -9, 2, True, 'vocab_string_to_int_1_uniques')
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index1': sch.ColumnSchema(
            index1_domain, [None], sch.ListColumnRepresentation()),
        'index2': sch.ColumnSchema(
            index2_domain, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_data, expected_metadata)
def testUniquesAnalyzerWithFrequencyThresholdTooHigh(self):
    """A threshold no token reaches maps every token to the default value."""

    def preprocessing_fn(inputs):
        with_int_threshold = tft.string_to_int(
            tf.string_split(inputs['a']),
            default_value=-99, frequency_threshold=77)
        # Same as above, but the threshold is given as a string and the
        # default value differs so the two outputs can be told apart.
        with_str_threshold = tft.string_to_int(
            tf.string_split(inputs['a']),
            default_value=-9, frequency_threshold='77')
        return {'index1': with_int_threshold, 'index2': with_str_threshold}

    input_data = [
        {'a': 'hello hello world'},
        {'a': 'hello goodbye world'},
        {'a': 'hello goodbye foo'},
    ]
    input_metadata = dataset_metadata.DatasetMetadata({
        'a': sch.ColumnSchema(tf.string, [], sch.FixedColumnRepresentation())
    })

    # Generated vocab (ordered by frequency, then value) should be:
    # ["hello", "world", "goodbye", "foo"]. With a frequency threshold of 77
    # nothing is kept, so every token is expected to get the default value.
    all_default = {'index1': [-99, -99, -99], 'index2': [-9, -9, -9]}
    expected_data = [dict(all_default), dict(all_default), dict(all_default)]

    # The expected domains run from the default value up to 0.
    index1_domain = sch.IntDomain(
        tf.int64, -99, 0, True, 'vocab_string_to_int_uniques')
    index2_domain = sch.IntDomain(
        tf.int64, -9, 0, True, 'vocab_string_to_int_1_uniques')
    expected_metadata = dataset_metadata.DatasetMetadata({
        'index1': sch.ColumnSchema(
            index1_domain, [None], sch.ListColumnRepresentation()),
        'index2': sch.ColumnSchema(
            index2_domain, [None], sch.ListColumnRepresentation()),
    })

    self.assertAnalyzeAndTransformResults(
        input_data, input_metadata, preprocessing_fn,
        expected_data, expected_metadata)
def get_evaluation_input(inputs, params):
    """Build the feature dictionary used for evaluation.

    Args:
        inputs: sequence of sentence collections; the first holds the
            sources, the remaining ones hold the references.
        params: object providing `num_threads`, `eos`, `pad`, `unk`,
            `eval_batch_size`, `vocabulary` and `mapping`.

    Returns:
        A dict with "source" and "references" as vocabulary ids plus
        "source_length".
    """
    num_references = len(inputs) - 1

    def _tokenize(sentence):
        return tf.string_split([sentence]).values

    def _append_eos(tokens):
        return tf.concat([tokens, [tf.constant(params.eos)]], axis=0)

    def _as_features(*columns):
        return {
            "source": columns[0],
            "source_length": tf.shape(columns[0])[0],
            "references": columns[1:]
        }

    with tf.device("/cpu:0"):
        # One tokenized, <eos>-terminated dataset per input collection.
        datasets = []
        for data in inputs:
            dataset = tf.data.Dataset.from_tensor_slices(data)
            dataset = dataset.map(_tokenize,
                                  num_parallel_calls=params.num_threads)
            dataset = dataset.map(_append_eos,
                                  num_parallel_calls=params.num_threads)
            datasets.append(dataset)

        dataset = tf.data.Dataset.zip(tuple(datasets))

        # Convert the zipped tuple to a dictionary.
        dataset = dataset.map(_as_features,
                              num_parallel_calls=params.num_threads)

        padded_shapes = {
            "source": [tf.Dimension(None)],
            "source_length": [],
            "references": (tf.Dimension(None),) * num_references
        }
        padding_values = {
            "source": params.pad,
            "source_length": 0,
            "references": (params.pad,) * num_references
        }
        dataset = dataset.padded_batch(
            params.eval_batch_size, padded_shapes, padding_values)

        features = dataset.make_one_shot_iterator().get_next()

        # Tokens missing from a vocabulary map to the id of params.unk.
        src_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["source"]),
            default_value=params.mapping["source"][params.unk]
        )
        tgt_table = tf.contrib.lookup.index_table_from_tensor(
            tf.constant(params.vocabulary["target"]),
            default_value=params.mapping["target"][params.unk]
        )

        features["source"] = src_table.lookup(features["source"])
        features["references"] = tuple(
            tgt_table.lookup(reference)
            for reference in features["references"]
        )

    return features
def read_images(data_dir):
    """Build a shuffled batch queue of augmented PNG images and their labels.

    The label of each image is taken from the first character of its file
    name, parsed as an integer and shifted down by one before one-hot
    encoding.

    Args:
        data_dir: directory containing the '*.png' files.

    Returns:
        A tuple `(example_batch, label_batch)` from `tf.train.shuffle_batch`.
    """
    pattern = os.path.join(data_dir, '*.png')
    filenames = tf.train.match_filenames_once(pattern, name='list_files')

    queue = tf.train.string_input_producer(
        filenames,
        num_epochs=NUM_EPOCHS,
        shuffle=True,
        name='queue')

    reader = tf.WholeFileReader()
    filename, content = reader.read(queue, name='read_image')
    filename = tf.Print(
        filename,
        data=[filename],
        message='loading: ')

    # Take the LAST '/'-separated component (the file name). Indexing with
    # [1] only hit the file name when data_dir was a single relative
    # component; nested or absolute directories yielded a directory name.
    filename_split = tf.string_split([filename], delimiter='/')
    basename = filename_split.values[-1]
    label_id = tf.string_to_number(tf.substr(basename, 0, 1),
                                   out_type=tf.int32)
    label = tf.one_hot(
        label_id-1,
        5,
        on_value=1.0,
        off_value=0.0,
        dtype=tf.float32)

    img_tensor = tf.image.decode_png(
        content,
        dtype=tf.uint8,
        channels=3,
        name='img_decode')

    # Preprocess the image with random transformations.
    # Random flip
    img_tensor_flip = tf.image.random_flip_left_right(img_tensor)
    # Random brightness
    img_tensor_bri = tf.image.random_brightness(img_tensor_flip, max_delta=0.2)
    # Per-image scaling
    img_tensor_std = tf.image.per_image_standardization(img_tensor_bri)

    min_after_dequeue = 1000
    capacity = min_after_dequeue + 3 * BATCH_SIZE

    # NOTE(review): the one-hot depth above is the literal 5 while the label
    # shape here uses NUM_CLASS - presumably both are 5; confirm.
    example_batch, label_batch = tf.train.shuffle_batch(
        [img_tensor_std, label],
        batch_size=BATCH_SIZE,
        shapes=[(IMAGE_HEIGHT, IMAGE_WIDTH, NUM_CHANNELS), (NUM_CLASS)],
        capacity=capacity,
        min_after_dequeue=min_after_dequeue,
        name='train_shuffle')

    return example_batch, label_batch

# `images` is a 4-D tensor with the shape:
# [n_batch, img_height, img_width, n_channel]
def get_infer_iterator(src_dataset, src_vocab_table, batch_size, eos, src_max_len=None):
    """Create an initializable, padded batch iterator for inference.

    Args:
        src_dataset: dataset of source lines.
        src_vocab_table: lookup table mapping word strings to ids.
        batch_size: number of sequences per batch.
        eos: end-of-sentence token; its id is used as padding value.
        src_max_len: optional maximum number of tokens kept per line.

    Returns:
        A `BatchedInput` with only the source fields populated.
    """
    eos_token = tf.constant(eos)
    src_eos_id = tf.cast(src_vocab_table.lookup(eos_token), tf.int32)

    def split_words(line):
        return tf.string_split([line]).values

    def lookup_ids(words):
        return tf.cast(src_vocab_table.lookup(words), tf.int32)

    def attach_length(ids):
        return ids, tf.size(ids)

    tokenized = src_dataset.map(split_words)
    if src_max_len:
        tokenized = tokenized.map(lambda words: words[:src_max_len])

    # Convert the word strings to ids, then add in the word counts.
    with_lengths = tokenized.map(lookup_ids).map(attach_length)

    # Source rows are unknown-length vectors; the row size is a scalar.
    unknown_length_vector = tf.TensorShape([None])
    scalar = tf.TensorShape([])
    # Source sequences are padded with the eos id (masking past the true
    # length generally makes this irrelevant); the length pad is unused.
    batched_dataset = with_lengths.padded_batch(
        batch_size,
        padded_shapes=(unknown_length_vector, scalar),
        padding_values=(src_eos_id, 0))

    batched_iter = batched_dataset.make_initializable_iterator()
    (src_ids, src_seq_len) = batched_iter.get_next()

    return BatchedInput(
        initializer=batched_iter.initializer,
        source=src_ids,
        target_input=None,
        target_output=None,
        source_sequence_length=src_seq_len,
        target_sequence_length=None)
def process_line_as_2d_input_with_ep(the_str):
    """Encode a string of board symbols as signed 8-channel rows.

    NOTES:
    1) The original author noted this is unlikely to be used, preferring the
       one-hot implementation.

    Upper-case symbols set +1 in their channel, lower-case symbols set -1 in
    the same channel, "1" sets channel 6 and "0" is all zeros.

    Args:
        the_str: string (or scalar string tensor) of board symbols.

    Returns:
        A float32 tensor reshaped to [3, 64, 8].
    """
    num_channels = 8

    # (symbol, channel, value); a channel of None leaves the row all zeros.
    encoding = (
        ("0", None, 0),
        ("1", 6, 1),
        ("K", 0, 1), ("Q", 1, 1), ("R", 2, 1), ("B", 3, 1),
        ("N", 4, 1), ("P", 5, 1), ("C", 7, 1),
        ("k", 0, -1), ("q", 1, -1), ("r", 2, -1), ("b", 3, -1),
        ("n", 4, -1), ("p", 5, -1), ("c", 7, -1),
    )

    value_rows = []
    for _, channel, value in encoding:
        row = [0] * num_channels
        if channel is not None:
            row[channel] = value
        value_rows.append(row)

    with tf.name_scope("process_data_2d"):
        # Vocabulary tensor backing the symbol -> row index lookup table.
        mapping_strings = tf.constant([symbol for symbol, _, _ in encoding])
        the_values = tf.constant(value_rows, dtype=tf.float32)

        the_table = tf.contrib.lookup.index_table_from_tensor(
            mapping=mapping_strings, name="index_lookup_table")

        # An empty delimiter splits the string into individual characters.
        characters = tf.string_split([the_str], delimiter="").values
        row_ids = the_table.lookup(characters)
        encoded_rows = tf.gather(the_values, row_ids)

        return tf.reshape(encoded_rows, [3, 64, num_channels])
def __init__(self, data_path, filenames_file, params, dataset, mode):
    """Build the left/right image batches from a file of image paths.

    Each line of `filenames_file` holds whitespace-separated relative paths;
    the first is the left image and the second the right image.

    Args:
        data_path: prefix joined in front of every relative path.
        filenames_file: text file listing the image paths.
        params: object providing `do_stereo`, `batch_size` and
            `num_threads`.
        dataset: stored on the instance as `self.dataset`.
        mode: 'train' builds shuffled, augmented batches; 'test' builds a
            batch holding each image and its horizontal flip.
    """
    self.data_path = data_path
    self.params = params
    self.dataset = dataset
    self.mode = mode

    self.left_image_batch = None
    self.right_image_batch = None

    input_queue = tf.train.string_input_producer([filenames_file], shuffle=False)
    line_reader = tf.TextLineReader()
    _, line = line_reader.read(input_queue)
    split_line = tf.string_split([line]).values

    # Only the left image is loaded for test, unless a stereo model was
    # trained.
    left_only = mode == 'test' and not self.params.do_stereo

    left_image_path = tf.string_join([self.data_path, split_line[0]])
    if not left_only:
        right_image_path = tf.string_join([self.data_path, split_line[1]])
    left_image_o = self.read_image(left_image_path)
    if not left_only:
        right_image_o = self.read_image(right_image_path)

    if mode == 'train':
        # Random flip: the two views are mirrored AND swapped.
        do_flip = tf.random_uniform([], 0, 1)
        left_image = tf.cond(
            do_flip > 0.5,
            lambda: tf.image.flip_left_right(right_image_o),
            lambda: left_image_o)
        right_image = tf.cond(
            do_flip > 0.5,
            lambda: tf.image.flip_left_right(left_image_o),
            lambda: right_image_o)

        # Random augmentation, applied to both views together.
        do_augment = tf.random_uniform([], 0, 1)
        left_image, right_image = tf.cond(
            do_augment > 0.5,
            lambda: self.augment_image_pair(left_image, right_image),
            lambda: (left_image, right_image))

        left_image.set_shape([None, None, 3])
        right_image.set_shape([None, None, 3])

        # capacity = min_after_dequeue + (num_threads + a small safety margin) * batch_size
        min_after_dequeue = 2048
        capacity = min_after_dequeue + 4 * params.batch_size
        batches = tf.train.shuffle_batch(
            [left_image, right_image],
            batch_size=params.batch_size,
            capacity=capacity,
            min_after_dequeue=min_after_dequeue,
            num_threads=params.num_threads)
        self.left_image_batch, self.right_image_batch = batches

    elif mode == 'test':
        # Batch of two: the image and its horizontal flip.
        self.left_image_batch = tf.stack(
            [left_image_o, tf.image.flip_left_right(left_image_o)], 0)
        self.left_image_batch.set_shape([2, None, None, 3])

        if self.params.do_stereo:
            self.right_image_batch = tf.stack(
                [right_image_o, tf.image.flip_left_right(right_image_o)], 0)
            self.right_image_batch.set_shape([2, None, None, 3])
def make_preprocessing_fn(frequency_threshold):
    """Build the preprocessing function for the score/subreddit/text columns.

    Args:
        frequency_threshold: passed as `frequency_threshold` to every
            `tft.string_to_int` call that builds a vocabulary.

    Returns:
        A function mapping the input column dict to the transformed dict.
    """
    text_columns = ('author', 'comment_body', 'comment_parent_body')

    def _add_trailing_dim(x):
        return tf.expand_dims(x, -1)

    def preprocessing_fn(inputs):
        """Transform the input columns.

        Args:
            inputs: dictionary of input `tensorflow_transform.Column`.

        Returns:
            A dictionary of `tensorflow_transform.Column` holding the dense
            features with a trailing dimension added, plus one '<name>_bow'
            entry per text column.
        """
        # TODO(b/35001605) Make this "passthrough" more DRY.
        dense = {'score': inputs['score'], 'toplevel': inputs['toplevel']}
        dense['subreddit_id'] = tft.string_to_int(
            inputs['subreddit'], frequency_threshold=frequency_threshold)

        # TODO(b/35318962): Obviate the need for this workaround on Dense
        # features. FeatureColumns expect shape (batch_size, 1), not just
        # (batch_size). Everything collected so far is dense and needs the
        # workaround; the features added afterwards are sparse.
        result = {}
        for key, column in dense.items():
            result[key] = tft.map(_add_trailing_dim, column)

        for column_name in text_columns:
            tokens = tft.map(tf.string_split, inputs[column_name])
            # TODO(b/33467613) Translate these to bag-of-words style sparse
            # features.
            result[column_name + '_bow'] = tft.string_to_int(
                tokens, frequency_threshold=frequency_threshold)

        return result

    return preprocessing_fn