我们从Python开源项目中,提取了以下46个代码示例,用于说明如何使用data_utils.PAD_ID。
def sample2vec(self, sample_arr):
    """Convert a list of samples into position-major id arrays and labels.

    Each sample is indexed as ``sample[0]`` (a list of token ids) and
    ``sample[1]`` (a sequence of label ids).

    Args:
        sample_arr: sequence of samples as described above.

    Returns:
        ``(ut_arr, labels)``. ``ut_arr`` is a list with ``self.max_ut_size``
        entries; entry ``p`` is an array holding position ``p`` of every
        padded-then-reversed token list. ``labels`` is a float array of shape
        ``(len(sample_arr), self.label_size)`` with 1.0 at every label id
        that is smaller than ``self.label_size``.
    """
    pad_id = data_utils.PAD_ID
    n_samples = len(sample_arr)
    labels = np.zeros((n_samples, self.label_size))

    reversed_rows = []
    for row, sample in enumerate(sample_arr):
        token_ids, label_ids = sample[0], sample[1]
        # Pad on the right up to max_ut_size, then reverse the whole row so
        # the padding ends up in front.
        padding = [pad_id] * (self.max_ut_size - len(token_ids))
        reversed_rows.append((token_ids + padding)[::-1])
        for raw_label in label_ids:
            label = int(raw_label)
            # Label ids at or beyond label_size are ignored.
            if label < self.label_size:
                labels[row][label] = 1.0

    # Re-index from one row per sample to one array per position.
    ut_arr = [
        np.array([seq[pos] for seq in reversed_rows])
        for pos in range(self.max_ut_size)
    ]
    return ut_arr, labels
#######################################################
def get_batch(self,data_set,batch_size,random=True):
    """Build one batch of padded encoder/decoder arrays from ``data_set``.

    Args:
        data_set: sequence of token-id sequences.
        batch_size: number of rows in the returned arrays.
        random: if true, rows are drawn uniformly with replacement from
            ``data_set``; otherwise the first ``batch_size`` entries are used.
            (This parameter shadows the ``random`` module inside the
            function; only ``np.random`` is used here.)

    Returns:
        ``(encoder_inputs, decoder_inputs, encoder_lengths, decoder_weights)``
        where the three 2-D arrays are transposed to
        ``(time, batch_size)``:

        * encoder_inputs: each sequence reversed, then right-padded with
          ``PAD_ID`` to ``self.max_seq_length``.
        * decoder_inputs: ``GO_ID + sequence + EOS_ID`` right-padded with
          ``PAD_ID`` to ``self.max_seq_length + 2``.
        * encoder_lengths: float array with the unpadded length of each row.
        * decoder_weights: 1.0 for the first ``len(seq) + 1`` decoder
          positions of each row, 0.0 elsewhere.

    Raises:
        ValueError: if a sequence is longer than ``self.max_seq_length``
            (the row assignment cannot broadcast), or if ``random`` is true
            and ``data_set`` is empty.
    """
    if random:
        # np.random.choice only accepts 1-D input; passing the sequences
        # themselves fails for equal-length (2-D) or ragged data.  Drawing
        # row indices keeps the same sampling-with-replacement behavior.
        picks = np.random.choice(len(data_set), size=batch_size)
        seqs = [data_set[int(pick)] for pick in picks]
    else:
        seqs = data_set[0:batch_size]

    width = self.max_seq_length
    encoder_inputs = np.zeros((batch_size, width), dtype=int)
    decoder_inputs = np.zeros((batch_size, width + 2), dtype=int)
    encoder_lengths = np.zeros(batch_size)
    decoder_weights = np.zeros((batch_size, width + 2), dtype=float)

    for i, seq in enumerate(seqs):
        seq = list(seq)  # accept arrays/tuples as well as lists
        pad = [data_utils.PAD_ID] * (width - len(seq))
        encoder_inputs[i] = np.array(seq[::-1] + pad)
        decoder_inputs[i] = np.array(
            [data_utils.GO_ID] + seq + [data_utils.EOS_ID] + pad)
        encoder_lengths[i] = len(seq)
        # Weight every real token plus the EOS position.
        decoder_weights[i, 0:(len(seq) + 1)] = 1.0

    return (np.transpose(encoder_inputs), np.transpose(decoder_inputs),
            encoder_lengths, np.transpose(decoder_weights))
def demo2vec(self,sentece):
    """Convert samples into position-major arrays of padded, reversed ids.

    The original body read an undefined ``sample_arr`` and appended to a
    misspelled ``vec_cahce``/``vec_cache`` pair, so it could never run.  The
    only input available is ``sentece``, so it is used as the sample list.

    Args:
        sentece: sequence of samples; ``sample[0]`` of each entry must be a
            list of token ids.
            NOTE(review): the parameter name suggests a single sentence, but
            the body indexes it as a list of samples -- confirm with callers.

    Returns:
        A list with ``self.max_ut_size`` entries; entry ``p`` is an array
        holding position ``p`` of every padded-then-reversed id list.
    """
    sample_arr = sentece
    vec_cache = []
    for sample in sample_arr:
        token_ids = sample[0]
        # Pad on the right up to max_ut_size, then reverse the whole row.
        pad = [data_utils.PAD_ID] * (self.max_ut_size - len(token_ids))
        vec_cache.append(list(reversed(token_ids + pad)))

    # Re-index from one row per sample to one array per position.
    ut_arr = [
        np.array([row[pos] for row in vec_cache])
        for pos in range(self.max_ut_size)
    ]
    return ut_arr
#######################################################
def get_batch(self, features, sentences, lengths):
    """Assemble time-major inputs for a list of ``(vid, sentence)`` pairs.

    Args:
        features: indexed by ``vid``; each entry is a list of feature vectors.
        sentences: sequence of ``(vid, sen)`` pairs, ``sen`` being a list of
            token ids.
        lengths: indexed by ``vid``; the value is copied into the result.

    Returns:
        ``(batch_encoder_inputs, batch_encoder_lengths, batch_decoder_inputs,
        batch_weights)``.  The encoder/decoder/weight results are lists with
        one array per time step, each array covering the whole batch.
    """
    n_rows = len(sentences)
    enc_steps = self.encoder_max_sequence_length
    dec_steps = self.decoder_max_sentence_length
    pad_id = data_utils.PAD_ID
    zero_feature = np.array([0.0] * self.feature_size)

    enc_rows, enc_lengths, dec_rows = [], [], []
    for vid, sen in sentences:
        frames = features[vid]
        enc_lengths.append(lengths[vid])
        # Too many frames: keep a random subset of the allowed size.
        if len(frames) > enc_steps:
            frames = random.sample(frames, enc_steps)
        enc_rows.append(frames + [zero_feature] * (enc_steps - len(frames)))
        # GO and EOS take two of the decoder slots.
        dec_rows.append([data_utils.GO_ID] + sen + [data_utils.EOS_ID]
                        + [pad_id] * (dec_steps - len(sen) - 2))

    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(n_rows)]
        batch_encoder_inputs.append(np.array(column, dtype=np.float32))

    batch_encoder_lengths = np.array(enc_lengths)

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the next decoder token; padding targets
        # and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return (batch_encoder_inputs, batch_encoder_lengths,
            batch_decoder_inputs, batch_weights)
def get_batch(self, features, sentences):
    """Assemble time-major inputs for a list of ``(vid, sentence)`` pairs.

    Args:
        features: indexed by ``vid``; each entry is a list of feature vectors.
        sentences: sequence of ``(vid, sen)`` pairs, ``sen`` being a list of
            token ids.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``, each
        a list with one array per time step covering the whole batch.
    """
    n_rows = len(sentences)
    enc_steps = self.encoder_max_sequence_length
    dec_steps = self.decoder_max_sentence_length
    pad_id = data_utils.PAD_ID
    zero_feature = np.array([0.0] * self.feature_size)

    enc_rows, dec_rows = [], []
    for vid, sen in sentences:
        frames = features[vid]
        # Too many frames: keep a random subset of the allowed size.
        if len(frames) > enc_steps:
            frames = random.sample(frames, enc_steps)
        enc_rows.append(frames + [zero_feature] * (enc_steps - len(frames)))
        # GO and EOS take two of the decoder slots.
        dec_rows.append([data_utils.GO_ID] + sen + [data_utils.EOS_ID]
                        + [pad_id] * (dec_steps - len(sen) - 2))

    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(n_rows)]
        batch_encoder_inputs.append(np.array(column, dtype=np.float32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the next decoder token; padding targets
        # and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, data, bucket_id):
    """Draw a random batch from one bucket and lay it out one array per step.

    ``self.batch_size`` pairs are picked with ``random.choice`` from
    ``data[bucket_id]``.  Encoder inputs are padded with ``PAD_ID`` to the
    bucket's encoder size and then reversed; decoder inputs get a leading
    ``GO_ID`` and are padded to the bucket's decoder size.

    Args:
        data: indexed by bucket id; each bucket is a sequence of
            ``(encoder_input, decoder_input)`` pairs of token-id lists.
        bucket_id: which entry of ``self.buckets`` / ``data`` to use.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``: lists
        with one ``batch_size``-long array per time step.
    """
    enc_steps, dec_steps = self.buckets[bucket_id]
    n_rows = self.batch_size
    pad_id = data_utils.PAD_ID

    enc_rows, dec_rows = [], []
    for _ in xrange(n_rows):
        source, target = random.choice(data[bucket_id])
        padded_source = source + [pad_id] * (enc_steps - len(source))
        enc_rows.append(padded_source[::-1])
        # One decoder slot is taken by the GO symbol.
        dec_rows.append([data_utils.GO_ID] + target
                        + [pad_id] * (dec_steps - len(target) - 1))

    # Re-index from one row per example to one array per time step.
    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(n_rows)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the decoder input shifted forward by one;
        # padding targets and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, bucket_dbs, bucket_id, data):
    """Convert the given sentence pairs into one array per time step.

    Every pair in ``data`` is mapped through ``data_utils.sentence_indice``.
    Encoder inputs are padded with ``PAD_ID`` to the bucket's encoder size
    and reversed; decoder inputs are wrapped in ``GO_ID`` ... ``EOS_ID`` and
    padded to the bucket's decoder size.

    Args:
        bucket_dbs: unused by this method.
        bucket_id: index into ``self.buckets`` giving the two sizes.
        data: iterable of ``(encoder_input, decoder_input)`` pairs.
            The re-indexing below reads exactly ``self.batch_size`` rows, so
            ``data`` needs at least that many pairs.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``: lists
        with one ``self.batch_size``-long array per time step.
    """
    enc_steps, dec_steps = self.buckets[bucket_id]
    pad_id = data_utils.PAD_ID

    enc_rows, dec_rows = [], []
    for source, target in data:
        source = data_utils.sentence_indice(source)
        target = data_utils.sentence_indice(target)
        padded_source = source + [pad_id] * (enc_steps - len(source))
        enc_rows.append(list(reversed(padded_source)))
        # GO and EOS take two of the decoder slots.
        dec_rows.append([data_utils.GO_ID] + target + [data_utils.EOS_ID]
                        + [pad_id] * (dec_steps - len(target) - 2))

    batch_encoder_inputs = []
    for step in range(enc_steps):
        column = [enc_rows[row][step] for row in range(self.batch_size)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in range(dec_steps):
        column = [dec_rows[row][step] for row in range(self.batch_size)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # Weight 0 where the next decoder token is padding, and on the
        # final step.
        weights = np.ones(self.batch_size, dtype=np.float32)
        for row in range(self.batch_size):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def create_batches(data):
    """Pre-compute every batch of every bucket, in shuffled order.

    For each bucket the examples are visited in a random permutation and cut
    into ``ceil(len(bucket) / FLAGS.batch_size)`` batches; the last batch
    wraps around to the start of the permutation so it is always full.

    Args:
        data: indexed by bucket id; each bucket is a sequence of
            ``(encoder_input, decoder_input)`` pairs of token-id lists.

    Returns:
        A list with one entry per bucket in ``_buckets``; each entry is a list
        of ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``
        tuples, where every element is a list of per-time-step arrays.
    """
    print("generating batches...")
    batch_size = FLAGS.batch_size
    pad_id = data_utils.PAD_ID
    batches = [[] for _ in _buckets]

    for bucket_id in xrange(len(_buckets)):
        data_bucket = data[bucket_id]
        encoder_size, decoder_size = _buckets[bucket_id]

        # shuffle the data
        data_permute = np.random.permutation(len(data_bucket))

        # Integer ceiling division.  math.ceil(len / batch_size) floors first
        # under Python 2 integer division (dropping the last partial batch)
        # or returns a float that xrange() rejects.
        num_batches = (len(data_bucket) + batch_size - 1) // batch_size

        for b_idx in xrange(num_batches):
            encoder_inputs, decoder_inputs = [], []
            for i in xrange(batch_size):
                # Wrap around so the final batch is filled up.
                data_idx = data_permute[
                    (b_idx * batch_size + i) % len(data_bucket)]
                encoder_input, decoder_input = data_bucket[data_idx]

                # Encoder inputs are padded and then reversed.
                encoder_pad = [pad_id] * (encoder_size - len(encoder_input))
                encoder_inputs.append(
                    list(reversed(encoder_input + encoder_pad)))

                # Decoder inputs get an extra "GO" symbol, then padding.
                decoder_pad_size = decoder_size - len(decoder_input) - 1
                decoder_inputs.append([data_utils.GO_ID] + decoder_input
                                      + [pad_id] * decoder_pad_size)

            # Re-index from one row per example to one array per time step.
            batch_encoder_inputs, batch_decoder_inputs, batch_weights = [], [], []
            for length_idx in xrange(encoder_size):
                batch_encoder_inputs.append(np.array(
                    [encoder_inputs[batch_idx][length_idx]
                     for batch_idx in xrange(batch_size)], dtype=np.int32))

            for length_idx in xrange(decoder_size):
                batch_decoder_inputs.append(np.array(
                    [decoder_inputs[batch_idx][length_idx]
                     for batch_idx in xrange(batch_size)], dtype=np.int32))

                # The target is the decoder input shifted forward by one;
                # padding targets and the final step get weight 0.
                batch_weight = np.ones(batch_size, dtype=np.float32)
                for batch_idx in xrange(batch_size):
                    if (length_idx == decoder_size - 1 or
                            decoder_inputs[batch_idx][length_idx + 1] == pad_id):
                        batch_weight[batch_idx] = 0.0
                batch_weights.append(batch_weight)

            batches[bucket_id].append(
                (batch_encoder_inputs, batch_decoder_inputs, batch_weights))

    return batches
#-----------------------------------------------------
# main training function
#-----------------------------------------------------
def get_decode_batch(self, data, bucket_id):
    """Turn every example of one bucket, in order, into per-step arrays.

    Unlike the random batch builders, the encoder input is reversed first
    and the ``PAD_ID`` padding is appended afterwards.

    Args:
        data: indexed by bucket id; each bucket is a sequence of
            ``(encoder_input, decoder_input)`` pairs of token-id lists.
        bucket_id: which entry of ``self.buckets`` / ``data`` to use.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights,
        seq_len)``.  The first three are lists with one array per time step;
        ``seq_len`` is an int64 array with the unpadded encoder length of
        each example.
    """
    enc_steps, dec_steps = self.buckets[bucket_id]
    n_rows = len(data[bucket_id])
    pad_id = data_utils.PAD_ID

    enc_rows, dec_rows, source_lengths = [], [], []
    for source, target in data[bucket_id]:
        source_lengths.append(len(source))
        # Only the real tokens are reversed; padding stays at the end.
        enc_rows.append(source[::-1] + [pad_id] * (enc_steps - len(source)))
        # One decoder slot is taken by the GO symbol.
        dec_rows.append([data_utils.GO_ID] + target
                        + [pad_id] * (dec_steps - len(target) - 1))

    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(n_rows)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the decoder input shifted forward by one;
        # padding targets and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    seq_len = np.asarray(source_lengths, dtype=np.int64)
    return batch_encoder_inputs, batch_decoder_inputs, batch_weights, seq_len
def get_batch(self, data, bucket_id):
    """Turn every example of one bucket into text, speech and decoder inputs.

    Args:
        data: indexed by bucket id; each bucket is a sequence of
            ``(text_encoder_input, decoder_input, speech_encoder_input)``
            triples.  The text and decoder inputs are token-id lists; the
            speech input is passed to ``np.fliplr`` and transposed.
        bucket_id: which entry of ``self.buckets`` / ``data`` to use.

    Returns:
        ``(batch_text_encoder_inputs, batch_speech_encoder_inputs,
        batch_decoder_inputs, batch_weights, seq_len)``.  The text, decoder
        and weight results are lists with one array per time step.  The
        speech result has ``encoder_size * spscale`` entries, each a plain
        list with one row slice per example.  ``seq_len`` is an int64 array
        of unpadded text lengths.
    """
    n_rows = len(data[bucket_id])
    enc_steps, dec_steps = self.buckets[bucket_id]
    pad_id = data_utils.PAD_ID

    text_rows, speech_rows, dec_rows, text_lengths = [], [], [], []
    for text, target, speech in data[bucket_id]:
        text_lengths.append(len(text))
        # Only the real tokens are reversed; padding stays at the end.
        text_rows.append(text[::-1] + [pad_id] * (enc_steps - len(text)))
        # Flip the speech input left-to-right, then transpose it so it can
        # be indexed by its first axis below.
        speech_rows.append(np.fliplr(speech).T)
        # One decoder slot is taken by the GO symbol.
        dec_rows.append([data_utils.GO_ID] + target
                        + [pad_id] * (dec_steps - len(target) - 1))

    batch_text_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [text_rows[row][step] for row in xrange(n_rows)]
        batch_text_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_speech_encoder_inputs = []
    for step in xrange(enc_steps * spscale):
        batch_speech_encoder_inputs.append(
            [speech_rows[row][step, :] for row in xrange(n_rows)])

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the decoder input shifted forward by one;
        # padding targets and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    seq_len = np.asarray(text_lengths, dtype=np.int64)
    return (batch_text_encoder_inputs, batch_speech_encoder_inputs,
            batch_decoder_inputs, batch_weights, seq_len)
def get_decode_batch(self, data, bucket_id):
    """Turn every example of one bucket, in order, into per-step arrays.

    Encoder inputs are padded with ``PAD_ID`` to the bucket's encoder size
    and then reversed; decoder inputs get a leading ``GO_ID`` and are padded
    to the bucket's decoder size.

    Args:
        data: indexed by bucket id; each bucket is a sequence of
            ``(encoder_input, decoder_input)`` pairs of token-id lists.
        bucket_id: which entry of ``self.buckets`` / ``data`` to use.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``: lists
        with one array per time step, each ``len(data[bucket_id])`` long.
    """
    enc_steps, dec_steps = self.buckets[bucket_id]
    n_rows = len(data[bucket_id])
    pad_id = data_utils.PAD_ID

    enc_rows, dec_rows = [], []
    for source, target in data[bucket_id]:
        padded_source = source + [pad_id] * (enc_steps - len(source))
        enc_rows.append(padded_source[::-1])
        # One decoder slot is taken by the GO symbol.
        dec_rows.append([data_utils.GO_ID] + target
                        + [pad_id] * (dec_steps - len(target) - 1))

    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(n_rows)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the decoder input shifted forward by one;
        # padding targets and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_mix_batch(self, bucketed_data, bucket_id, this_batch_size):
    """Draw a random batch from already-selected bucket data.

    ``this_batch_size`` pairs are picked with ``random.choice`` directly from
    ``bucketed_data``; ``bucket_id`` is only used to look up the padding
    sizes in ``self.buckets``.

    Args:
        bucketed_data: sequence of ``(encoder_input, decoder_input)`` pairs
            of token-id lists.
        bucket_id: index into ``self.buckets`` giving the encoder and decoder
            sizes.
        this_batch_size: number of examples to draw.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``: lists
        with one ``this_batch_size``-long array per time step.
    """
    enc_steps, dec_steps = self.buckets[bucket_id]
    pad_id = data_utils.PAD_ID

    enc_rows, dec_rows = [], []
    for _ in xrange(this_batch_size):
        source, target = random.choice(bucketed_data)
        # Pad first, then reverse the padded encoder input.
        padded_source = source + [pad_id] * (enc_steps - len(source))
        enc_rows.append(padded_source[::-1])
        # One decoder slot is taken by the GO symbol.
        dec_rows.append([data_utils.GO_ID] + target
                        + [pad_id] * (dec_steps - len(target) - 1))

    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(this_batch_size)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(this_batch_size)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the decoder input shifted forward by one;
        # padding targets and the final step get weight 0.
        weights = np.ones(this_batch_size, dtype=np.float32)
        for row in xrange(this_batch_size):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def train2vec(self, dialogs, iters):
    """Convert a batch of dialogues into history / true / false inputs.

    Each dialogue is a list of turns; ``turn[0]`` and ``turn[1]`` are lists
    of token ids (used below as the true and the false candidate of the
    odd-numbered turns).

    Args:
        dialogs: list of dialogues.  NOTE: each dialogue is truncated in
            place to at most ``2 * self.get_max(iters)`` turns, exactly as
            the original code did.
        iters: forwarded to ``self.get_max``.

    Returns:
        ``(batch_history, batch_true, batch_false)``, or
        ``(None, None, None)`` when ``dialogs`` is ``None`` or empty.

        * ``batch_history[s][p]`` is an array over the batch holding
          position ``p`` of turn ``s`` (padded with ``PAD_ID`` and reversed).
        * ``batch_true`` / ``batch_false`` have one entry per odd turn index,
          laid out the same way, built from ``turn[0]`` / ``turn[1]``.
    """
    # Guard first: the original checked for None only after calling
    # len(dialogs), so a None argument crashed before reaching the check.
    if dialogs is None or len(dialogs) == 0:
        return None, None, None

    batch_size = len(dialogs)
    max_border = self.get_max(iters)
    pad_id = data_utils.PAD_ID
    width = self.max_sentence_size

    def padded_reversed(ids):
        # Right-pad to the sentence width, then reverse the whole row.
        return list(reversed(ids + [pad_id] * (width - len(ids))))

    for i in range(batch_size):
        border = min(len(dialogs[i]), max_border * 2)
        dialogs[i] = dialogs[i][:border]

    history_inputs, true_inputs, false_inputs = [], [], []
    for one_session in dialogs:
        # History: every turn slot, missing turns are all padding.
        cache = []
        for j in range(self.max_dialogue_size):
            if j < len(one_session):
                cache.append(padded_reversed(one_session[j][0]))
            else:
                cache.append([pad_id] * width)
        history_inputs.append(cache)

        # Candidates: only odd turn indices (1, 3, 5, ...) are used.
        true_cache, false_cache = [], []
        for j in range(1, self.max_dialogue_size, 2):
            if j < len(one_session):
                true_cache.append(padded_reversed(one_session[j][0]))
                false_cache.append(padded_reversed(one_session[j][1]))
            else:
                # NOTE(review): missing true candidates are filled with
                # EOS_ID while false ones use PAD_ID; kept as in the
                # original -- confirm this asymmetry is intended.
                true_cache.append([data_utils.EOS_ID] * width)
                false_cache.append([pad_id] * width)
        true_inputs.append(true_cache)
        false_inputs.append(false_cache)

    # Re-index to [turn][position] -> array over the batch.
    n_rows = len(history_inputs)
    batch_history, batch_true, batch_false = [], [], []
    for sent_index in range(self.max_dialogue_size):
        batch_history.append([
            np.array([history_inputs[b][sent_index][pos]
                      for b in range(n_rows)])
            for pos in range(width)
        ])
        if sent_index % 2 != 0:
            cand_index = sent_index // 2
            batch_true.append([
                np.array([true_inputs[b][cand_index][pos]
                          for b in range(n_rows)])
                for pos in range(width)
            ])
            batch_false.append([
                np.array([false_inputs[b][cand_index][pos]
                          for b in range(n_rows)])
                for pos in range(width)
            ])

    return batch_history, batch_true, batch_false
def test2vec(self,history):
    """Convert one dialogue history into history / candidate inputs.

    ``history`` is a list of turns.  ``turn[0]`` of every turn is used as the
    history sentence; for odd turn indices every element of the turn is used
    as a candidate.  The number of candidates is ``len(history[1])``.

    Args:
        history: list of turns, each a list of token-id lists.

    Returns:
        ``(batch_history, batch_candidate)``, or ``(None, None)`` when
        ``history`` is ``None`` or empty.

        * ``batch_history[s][p]`` is a one-element array holding position
          ``p`` of turn ``s`` (padded with ``PAD_ID`` and reversed).
        * ``batch_candidate`` has one entry per odd turn index; entry
          ``[k][p]`` is an array over the candidates holding position ``p``.
    """
    if history is None or len(history) == 0:
        return None, None

    pad_id = data_utils.PAD_ID
    width = self.max_sentence_size
    n_turns = len(history)
    candidate_size = len(history[1])

    # History: every turn slot, missing turns are all padding.
    history_inputs = []
    for turn_idx in range(self.max_dialogue_size):
        if turn_idx < n_turns:
            ids = history[turn_idx][0]
            padded = ids + [pad_id] * (width - len(ids))
            history_inputs.append(padded[::-1])
        else:
            history_inputs.append([pad_id] * width)

    # Candidates: only odd turn indices (1, 3, 5, ...) are used.
    candidate_inputs = []
    for turn_idx in range(self.max_dialogue_size):
        if turn_idx % 2 == 0:
            continue
        per_turn = []
        for cand in range(candidate_size):
            if turn_idx < n_turns:
                ids = history[turn_idx][cand]
                padded = ids + [pad_id] * (width - len(ids))
                per_turn.append(padded[::-1])
            else:
                per_turn.append([pad_id] * width)
        candidate_inputs.append(per_turn)

    # Re-index to [turn][position] -> array.
    batch_history, batch_candidate = [], []
    for sent_index in range(self.max_dialogue_size):
        batch_history.append([
            np.array([history_inputs[sent_index][pos]])
            for pos in range(width)
        ])
        if sent_index % 2 != 0:
            per_turn = candidate_inputs[int(sent_index / 2)]
            batch_candidate.append([
                np.array([per_turn[cand][pos]
                          for cand in range(candidate_size)])
                for pos in range(width)
            ])

    return batch_history, batch_candidate
############################################################################
def test2vec(self,history):
    """Build history and candidate inputs from a single dialogue history.

    The first element of each turn is the history sentence.  Turns at odd
    indices additionally supply ``len(history[1])`` candidates each.

    Args:
        history: list of turns, each a list of token-id lists.

    Returns:
        ``(batch_history, batch_candidate)``; ``(None, None)`` if ``history``
        is ``None`` or has no turns.  ``batch_history[s][p]`` is a
        one-element array for position ``p`` of turn ``s``;
        ``batch_candidate[k][p]`` is an array over all candidates of the
        ``k``-th odd turn.  Sentences are padded with ``PAD_ID`` to
        ``self.max_sentence_size`` and then reversed.
    """
    if history is None or len(history) == 0:
        return None, None

    pad_id = data_utils.PAD_ID
    sent_len = self.max_sentence_size
    max_turns = self.max_dialogue_size
    candidate_size = len(history[1])
    blank = [pad_id] * sent_len

    # One padded/reversed sentence per turn slot; absent turns are blank.
    history_inputs = []
    for slot in range(max_turns):
        if slot >= len(history):
            history_inputs.append(list(blank))
            continue
        sentence = history[slot][0]
        history_inputs.append(
            list(reversed(sentence + [pad_id] * (sent_len - len(sentence)))))

    # One list of candidates per odd turn slot.
    candidate_inputs = []
    for slot in range(max_turns):
        if slot % 2 == 0:
            continue
        group = []
        if slot < len(history):
            for k in range(candidate_size):
                sentence = history[slot][k]
                group.append(list(reversed(
                    sentence + [pad_id] * (sent_len - len(sentence)))))
        else:
            for k in range(candidate_size):
                group.append(list(blank))
        candidate_inputs.append(group)

    batch_history, batch_candidate = [], []
    for sent_index in range(max_turns):
        history_cache = []
        for length_index in range(sent_len):
            history_cache.append(
                np.array([history_inputs[sent_index][length_index]]))
        batch_history.append(history_cache)

        if sent_index % 2 == 0:
            continue
        group = candidate_inputs[int(sent_index / 2)]
        candidate_cache = []
        for length_index in range(sent_len):
            candidate_cache.append(np.array(
                [group[k][length_index] for k in range(candidate_size)]))
        batch_candidate.append(candidate_cache)

    return batch_history, batch_candidate
############################################################################
def read_mrs_data(buckets, source_paths, target_paths, max_size=None, any_length=False, offset_target=-1):
    """Read aligned source/target id files and sort the examples into buckets.

    Every path is read with ``data_utils.read_ids_file(path, max_size)``.
    Example ``i`` is formed from entry ``i`` of every file, so the files are
    assumed to be aligned.

    Args:
        buckets: list of ``(source_size, target_size)`` pairs.
        source_paths: paths of the source id files.
        target_paths: paths of the target id files.  The first target gets
            ``EOS_ID`` appended; the others get ``PAD_ID`` appended.
        max_size: forwarded to ``data_utils.read_ids_file``.
        any_length: if true, examples that fit no bucket are cropped to the
            sizes of the last bucket and stored there; otherwise they are
            dropped.
        offset_target: when positive, the target at this index (counting
            from 0 over ``target_paths``) also gets a leading ``PAD_ID``.

    Returns:
        ``(data_set, data_list)``.  ``data_set[b]`` lists
        ``[source_ids, target_ids]`` for bucket ``b``; ``data_list`` lists
        ``[source_ids, target_ids, bucket_id]`` in reading order.
    """
    pad_id, eos_id = data_utils.PAD_ID, data_utils.EOS_ID

    # Read in all files separately.
    source_inputs = [data_utils.read_ids_file(p, max_size) for p in source_paths]
    target_inputs = [data_utils.read_ids_file(p, max_size) for p in target_paths]

    data_set = [[] for _ in buckets]
    data_list = []

    for example in xrange(len(source_inputs[0])):
        source_ids = [per_file[example] for per_file in source_inputs]

        # The first target type is the one that predicts EOS.
        target_ids = [target_inputs[0][example] + [eos_id]]
        for position, per_file in enumerate(target_inputs[1:], 1):
            shifted = offset_target > 0 and position == offset_target
            prefix = [pad_id] if shifted else []
            target_ids.append(prefix + per_file[example] + [pad_id])

        # Smallest bucket that strictly contains the first source and the
        # first target sequence.
        for bucket_id, (source_size, target_size) in enumerate(buckets):
            if (len(source_ids[0]) < source_size
                    and len(target_ids[0]) < target_size):
                data_set[bucket_id].append([source_ids, target_ids])
                data_list.append([source_ids, target_ids, bucket_id])
                break
        else:
            if any_length:
                # Crop examples that are larger than the largest bucket.
                source_size, target_size = buckets[-1][0], buckets[-1][1]
                if len(source_ids[0]) >= source_size:
                    source_ids = [ids[:source_size] for ids in source_ids]
                if len(target_ids[0]) >= target_size:
                    target_ids = [ids[:target_size] for ids in target_ids]
                bucket_id = len(buckets) - 1
                data_set[bucket_id].append([source_ids, target_ids])
                data_list.append([source_ids, target_ids, bucket_id])

    return data_set, data_list
def get_batch(self, data, bucket_id):
    """Sample a random batch from a bucket and re-index it by time step.

    ``self.batch_size`` pairs are drawn with ``random.choice`` from
    ``data[bucket_id]``.  Encoder inputs are padded with ``PAD_ID`` and then
    reversed; decoder inputs start with ``GO_ID`` and are padded.

    Args:
        data: indexed by bucket id; each bucket is a sequence of
            ``(encoder_input, decoder_input)`` pairs of token-id lists.
        bucket_id: which entry of ``self.buckets`` / ``data`` to use.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights)``.
        Each is a list with one ``batch_size``-long array per time step,
        i.e. laid out as (steps, batch_size) rather than (batch_size, steps).
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    rows = self.batch_size
    pad_id = data_utils.PAD_ID

    encoder_rows, decoder_rows = [], []
    for _ in xrange(rows):
        source, target = random.choice(data[bucket_id])
        filler = [pad_id] * (encoder_size - len(source))
        encoder_rows.append(list(reversed(source + filler)))
        # One decoder slot is taken by the GO symbol.
        filler = [pad_id] * (decoder_size - len(target) - 1)
        decoder_rows.append([data_utils.GO_ID] + target + filler)

    # encoder_rows has one entry per example; the result has one per step.
    batch_encoder_inputs = [
        np.array([encoder_rows[r][t] for r in xrange(rows)], dtype=np.int32)
        for t in xrange(encoder_size)
    ]

    batch_decoder_inputs, batch_weights = [], []
    for t in xrange(decoder_size):
        batch_decoder_inputs.append(np.array(
            [decoder_rows[r][t] for r in xrange(rows)], dtype=np.int32))

        # The target of step t is the decoder input at t + 1.  The last step
        # has no target, and padding targets do not count: both get 0.
        batch_weight = np.ones(rows, dtype=np.float32)
        at_end = t == decoder_size - 1
        for r in xrange(rows):
            if at_end or decoder_rows[r][t + 1] == pad_id:
                batch_weight[r] = 0.0
        batch_weights.append(batch_weight)

    return batch_encoder_inputs, batch_decoder_inputs, batch_weights
def get_batch(self, train_data, bucket_id):
    """Sample a random batch and also return the raw pairs that were drawn.

    Args:
        train_data: indexed by bucket id; each bucket is a sequence of
            ``(encoder_input, decoder_input)`` pairs of token-id lists.
        bucket_id: which entry of ``self.buckets`` / ``train_data`` to use.

    Returns:
        ``(batch_encoder_inputs, batch_decoder_inputs, batch_weights,
        batch_source_encoder, batch_source_decoder)``.  The first three are
        lists with one ``self.batch_size``-long array per time step; the last
        two are the unpadded encoder and decoder inputs in draw order.
    """
    enc_steps, dec_steps = self.buckets[bucket_id]
    n_rows = self.batch_size
    pad_id = data_utils.PAD_ID

    enc_rows, dec_rows = [], []
    batch_source_encoder, batch_source_decoder = [], []
    for _ in xrange(n_rows):
        source, target = random.choice(train_data[bucket_id])
        batch_source_encoder.append(source)
        batch_source_decoder.append(target)

        # Pad first, then reverse the padded encoder input.
        padded_source = source + [pad_id] * (enc_steps - len(source))
        enc_rows.append(padded_source[::-1])
        # One decoder slot is taken by the GO symbol.
        dec_rows.append([data_utils.GO_ID] + target
                        + [pad_id] * (dec_steps - len(target) - 1))

    batch_encoder_inputs = []
    for step in xrange(enc_steps):
        column = [enc_rows[row][step] for row in xrange(n_rows)]
        batch_encoder_inputs.append(np.array(column, dtype=np.int32))

    batch_decoder_inputs, batch_weights = [], []
    final_step = dec_steps - 1
    for step in xrange(dec_steps):
        column = [dec_rows[row][step] for row in xrange(n_rows)]
        batch_decoder_inputs.append(np.array(column, dtype=np.int32))

        # The target of a step is the decoder input shifted forward by one;
        # padding targets and the final step get weight 0.
        weights = np.ones(n_rows, dtype=np.float32)
        for row in xrange(n_rows):
            if step == final_step or dec_rows[row][step + 1] == pad_id:
                weights[row] = 0.0
        batch_weights.append(weights)

    return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
            batch_source_encoder, batch_source_decoder)
def get_batch(self, train_data, bucket_id, type=0):
    """Build a batch from one bucket and re-index it by time step.

    Encoder inputs are padded with ``data_utils.PAD_ID`` up to the bucket's
    encoder size and then reversed.  Decoder inputs are prefixed with
    ``data_utils.GO_ID`` and padded with ``data_utils.PAD_ID`` up to the
    bucket's decoder size.

    Args:
        train_data: container indexed by ``bucket_id``.  The expected
            layout of ``train_data[bucket_id]`` depends on ``type``.
        bucket_id: index into ``self.buckets`` and ``train_data``.
        type: how the ``self.batch_size`` examples are selected
            (the name shadows the builtin but is kept for callers):
            ``0`` - sample ``(encoder, decoder)`` pairs at random, with
            replacement, and print each sampled pair;
            ``1`` - take the pair at position ``batch_i`` for each batch
            slot, in order;
            ``2`` - read one ``(encoder_inputs, decoder_input)`` pair from
            position 0, use ``encoder_inputs[batch_i]`` as the encoder
            input for each slot and share the same decoder input.

    Returns:
        A 5-tuple ``(batch_encoder_inputs, batch_decoder_inputs,
        batch_weights, batch_source_encoder, batch_source_decoder)``:
        the first three are lists with one 1-D array of length
        ``self.batch_size`` per time step (int32, int32 and float32
        respectively); the last two are the raw, unpadded selected
        sequences.

    Raises:
        ValueError: if ``type`` is not 0, 1 or 2.
    """
    encoder_size, decoder_size = self.buckets[bucket_id]
    encoder_inputs, decoder_inputs = [], []
    batch_source_encoder, batch_source_decoder = [], []

    # `range` instead of the Python-2-only `xrange`, so the method runs on
    # Python 3 without relying on a compatibility shim.
    for batch_i in range(self.batch_size):
        if type == 1:
            encoder_input, decoder_input = train_data[bucket_id][batch_i]
        elif type == 2:
            encoder_input_a, decoder_input = train_data[bucket_id][0]
            encoder_input = encoder_input_a[batch_i]
        elif type == 0:
            encoder_input, decoder_input = random.choice(train_data[bucket_id])
            print("train en: %s, de: %s" % (encoder_input, decoder_input))
        else:
            # Previously an unknown mode fell through and failed below with
            # an unbound-local error on `encoder_input`.
            raise ValueError("unsupported batch type: %r" % (type,))

        batch_source_encoder.append(encoder_input)
        batch_source_decoder.append(decoder_input)

        # Encoder inputs are padded first and then reversed, so the padding
        # ends up at the front of the sequence.
        encoder_pad = [data_utils.PAD_ID] * (encoder_size - len(encoder_input))
        encoder_inputs.append(list(reversed(encoder_input + encoder_pad)))

        # One slot of the decoder size is taken by the leading GO symbol.
        decoder_pad_size = decoder_size - len(decoder_input) - 1
        decoder_inputs.append(
            [data_utils.GO_ID]
            + decoder_input
            + [data_utils.PAD_ID] * decoder_pad_size
        )

    # Re-index from one row per example to one array per time step.
    batch_encoder_inputs = [
        np.array([row[length_idx] for row in encoder_inputs], dtype=np.int32)
        for length_idx in range(encoder_size)
    ]

    batch_decoder_inputs, batch_weights = [], []
    last_step = decoder_size - 1
    for length_idx in range(decoder_size):
        batch_decoder_inputs.append(
            np.array([row[length_idx] for row in decoder_inputs],
                     dtype=np.int32)
        )

        # The target for a step is the decoder input shifted forward by one.
        # Its weight is zeroed when that target is padding, and always on
        # the final step, which has no following target.
        batch_weight = np.ones(self.batch_size, dtype=np.float32)
        for batch_idx, row in enumerate(decoder_inputs):
            if length_idx == last_step or row[length_idx + 1] == data_utils.PAD_ID:
                batch_weight[batch_idx] = 0.0
        batch_weights.append(batch_weight)

    return (batch_encoder_inputs, batch_decoder_inputs, batch_weights,
            batch_source_encoder, batch_source_decoder)