The following 50 code examples, extracted from open-source Python projects, illustrate how to use tensorflow.python.platform.gfile.GFile().
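Before the project excerpts, here is a minimal, hedged sketch of the pattern they all share: check a path with gfile.Exists() and open it with gfile.GFile() as a context manager for line-oriented reads and writes. The file names "vocab.txt" and "ids.txt" are placeholders, not files from any of the projects below.

from tensorflow.python.platform import gfile

# Read a one-item-per-line file if it exists; gfile.GFile accepts local paths
# as well as paths on file systems TensorFlow supports (e.g. GCS), which is
# why these projects prefer it over the built-in open().
lines = []
if gfile.Exists("vocab.txt"):
  with gfile.GFile("vocab.txt", mode="r") as f:
    lines = [line.strip() for line in f]

# Write the items back out, one per line.
with gfile.GFile("ids.txt", mode="w") as out:
  for line in lines:
    out.write(line + "\n")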
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        for line in data_file:
          token_ids = sentence_to_token_ids(line, vocab, tokenizer)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def read_data(source_path, target_path, max_seq_len=100):
  data_set = []
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target:
        counter += 1
        if counter % 10000 == 0:
          print(" reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [GO_ID] + [int(x) for x in target.split()]
        y = [int(x) for x in target.split()] + [EOS_ID]
        if len(source_ids) < max_seq_len and len(target_ids) < max_seq_len:
          data_set.append([source_ids, target_ids, y])
        source, target = source_file.readline(), target_file.readline()
  return data_set
def create_vocabulary(vocabulary_path, data_paths, tokenizer=None):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths)))
    vocab = {}
    for path in data_paths:
      with open(path, mode="rb") as f:
        counter = 0
        for line in f:
          counter += 1
          if counter % 100000 == 0:
            print("processing line %d" % counter)
          tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
          for w in tokens:
            if w in vocab:
              vocab[w] += 1
            else:
              vocab[w] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    print("Vocabulary size: %d" % len(vocab_list))
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + b"\n")
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 1000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def combine_shuffle(data_dir, filename):
  if not (gfile.Exists(os.path.join(data_dir, filename + '_train.txt')) and
          gfile.Exists(os.path.join(data_dir, filename + '_test.txt'))):
    data_train = []
    data_test = []
    print('Shuffle file in %s' % data_dir)
    for subdir, dirs, files in os.walk(data_dir):
      for afile in files:
        with gfile.GFile(os.path.join(subdir, afile), mode="r") as f:
          if afile.endswith("train.txt"):
            data_train.append(f.read())
          else:
            data_test.append(f.read())
    with gfile.GFile(os.path.join(data_dir, filename + '_train.txt'), mode="w") as train_file:
      train_file.write(''.join(data_train))
      train_file.close()
    with gfile.GFile(os.path.join(data_dir, filename + '_test.txt'), mode="w") as test_file:
      test_file.write(''.join(data_test))
      test_file.close()
def create_label_vocab(vocabulary_path, data_path):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="r") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        # tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        label = line.strip()
        vocab[label] = 1
    label_list = START_VOCAB_dict['no_padding'] + sorted(vocab)
    # label_list = sorted(vocab)
    with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
      for k in label_list:
        vocab_file.write(k + "\n")
def data_to_token_ids(data_path, target_path, vocabulary_path):
  """Tokenize preprocess file and turn into token-ids using given vocabulary file.

  This function loads preprocess line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the preprocess file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing preprocess in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        for line in tqdm(data_file):
          token_ids = sentence_to_token_ids(line, vocab)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
    filename: string, name of the file in the directory.
    work_directory: string, path to working directory.
    source_url: url to download from if file doesn't exist.

  Returns:
    Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    with tempfile.NamedTemporaryFile() as tmpfile:
      temp_file_name = tmpfile.name
      urllib.request.urlretrieve(source_url, temp_file_name)
      gfile.Copy(temp_file_name, filepath)
      with gfile.GFile(filepath) as f:
        size = f.size()
      print('Successfully downloaded', filename, size, 'bytes.')
  return filepath
def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
    filename: string, name of the file in the directory.
    work_directory: string, path to working directory.
    source_url: url to download from if file doesn't exist.

  Returns:
    Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    temp_file_name, _ = urlretrieve_with_retry(source_url)
    gfile.Copy(temp_file_name, filepath)
    with gfile.GFile(filepath) as f:
      size = f.size()
    print('Successfully downloaded', filename, size, 'bytes.')
  return filepath
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=False):
  print(target_path)
  if True:  # gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          line = line.strip()
          sentence_array = line.split(SPLITER)
          token_array = []
          for sentence in sentence_array:
            token_ids = sentence_to_token_ids(sentence, vocab, tokenizer,
                                              normalize_digits)
            token_array.append(" ".join([str(tok) for tok in token_ids]))
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          tokens_file.write("\t".join(token_array) + "\n")
def data_to_token_ids(self, tokenizer=None, normalize_digits=False):
  data_path = self.ut_path
  target_path = self.ids_path
  vocab_path = self.vocab_path
  print("Tokenizing data in %s" % data_path)
  vocab, _ = self.initialize_vocabulary()
  with gfile.GFile(data_path, mode="rb") as data_file:
    with gfile.GFile(target_path, mode="w") as tokens_file:
      counter = 0
      for line in data_file:
        line = line.strip()
        sentence_array = line.split(SPLITER)
        token_array = []
        for sentence in sentence_array:
          token_ids = self.sentence_to_token_ids(sentence, vocab, tokenizer,
                                                 normalize_digits)
          token_array.append(" ".join([str(tok) for tok in token_ids]))
        counter += 1
        if counter % 100000 == 0:
          print(" tokenizing line %d" % counter)
        tokens_file.write("\t".join(token_array) + "\n")
def create_token_ids(self, ids_path="", vocabulary_path="", progress_interval=100000): if ids_path: self.ids_path = ids_path if vocabulary_path: self.vocabulary_path = vocabulary_path if gfile.Exists(self.ids_path): print("ids file already exists.") return 0 else: print("Tokenizing data at %s" % self.data_path) vocab, _ = self.load_vocabulary(self.vocabulary_path) with gfile.GFile(self.data_path, mode="rb") as data_file: with gfile.GFile(self.ids_path, mode="w") as tokens_file: counter = 0 for line in data_file: counter += 1 if counter % progress_interval == 0: print(" tokenizing line %d" % counter) token_ids = self.sentence_to_token_ids(line, vocab) tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def split_tweets_replies(tweets_path, enc_path, dec_path):
  """Read data from tweets_path and split it into tweets and replies.

  Args:
    tweets_path: original tweets data
    enc_path: path to write tweets
    dec_path: path to write replies

  Returns:
    None
  """
  i = 1
  with gfile.GFile(tweets_path, mode="rb") as f, \
       gfile.GFile(enc_path, mode="w+") as ef, \
       gfile.GFile(dec_path, mode="w+") as df:
    for line in f:
      if not isinstance(line, str):
        line = line.decode('utf-8')
      line = sanitize_line(line)
      # Odd lines are tweets
      if i % 2 == 1:
        ef.write(line)
      # Even lines are replies
      else:
        df.write(line)
      i = i + 1
def create_train_validation(source_path, train_path, validation_path, train_ratio=0.75):
  """Split source file into train and validation data.

  Args:
    source_path: source file path
    train_path: path to write train data
    validation_path: path to write validation data
    train_ratio: train data ratio

  Returns:
    None
  """
  nb_lines = num_lines(source_path)
  nb_train = int(nb_lines * train_ratio)
  counter = 0
  with gfile.GFile(source_path, "r") as f, \
       gfile.GFile(train_path, "w") as tf, \
       gfile.GFile(validation_path, "w") as vf:
    for line in f:
      if counter < nb_train:
        tf.write(line)
      else:
        vf.write(line)
      counter = counter + 1


# Originally from https://github.com/1228337123/tensorflow-seq2seq-chatbot
def initialize_vocabulary(vocabulary_path):
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    # Dictionary of (word, idx)
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)


# From https://github.com/1228337123/tensorflow-seq2seq-chatbot
def create_vocabulary(source_path, vocabulary_path, max_vocabulary_size,
                      tokenizer=japanese_tokenizer):
  """Create vocabulary file. Please see comments in head for file format.

  Args:
    source_path: source file path
    vocabulary_path: path to write vocabulary
    max_vocabulary_size: max vocabulary size
    tokenizer: tokenizer used to tokenize each line

  Returns:
    None
  """
  if gfile.Exists(vocabulary_path):
    print("Found vocabulary file")
    return
  with gfile.GFile(source_path, mode="r") as f:
    counter = 0
    vocab = {}  # (word, word_freq)
    for line in f:
      counter += 1
      words = tokenizer(line)
      if counter % 5000 == 0:
        sys.stdout.write(".")
        sys.stdout.flush()
      for word in words:
        # Normalize numbers. Not sure if it's necessary.
        word = re.sub(DIGIT_RE, "0", word)
        if word in vocab:
          vocab[word] += 1
        else:
          vocab[word] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    if len(vocab_list) > max_vocabulary_size:
      vocab_list = vocab_list[:max_vocabulary_size]
    with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + "\n")
    print("\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    # with gfile.GFile(data_path, mode="r") as f:
    with open(data_path, 'rb') as f:
      counter = 0
      for line in f.readlines():
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line.decode('utf-8')) if tokenizer else basic_tokenizer(line.decode('utf-8'))
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")
def read_data(tokenized_dialog_path, max_size=None):
  """Read data from source file and put into buckets.

  Args:
    source_path: path to the files with token-ids.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in BUCKETS]
  with gfile.GFile(tokenized_dialog_path, mode="r") as fh:
    source, target = fh.readline(), fh.readline()
    counter = 0
    while source and target and (not max_size or counter < max_size):
      counter += 1
      if counter % 100000 == 0:
        print(" reading data line %d" % counter)
        sys.stdout.flush()
      source_ids = [int(x) for x in source.split()]
      target_ids = [int(x) for x in target.split()]
      target_ids.append(EOS_ID)
      for bucket_id, (source_size, target_size) in enumerate(BUCKETS):
        if len(source_ids) < source_size and len(target_ids) < target_size:
          data_set[bucket_id].append([source_ids, source_ids, target_ids])
          break
      source, target = fh.readline(), fh.readline()
  print('num_samples in each bucket:', [len(ele) for ele in data_set])
  return data_set
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer=None):
  """Create vocabulary file from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      for line in f:
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for word in tokens:
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_word_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def print_out(s, newline=True):
  """Print a message out and log it to file."""
  if log_filename:
    try:
      with gfile.GFile(log_filename, mode="a") as f:
        f.write(s + ("\n" if newline else ""))
    # pylint: disable=bare-except
    except:
      sys.stdout.write("Error appending to %s\n" % log_filename)
  sys.stdout.write(s + ("\n" if newline else ""))
  sys.stdout.flush()
def __init__(self, file_path, word_alphabet, pos_alphabet, type_alphabet):
  self.__source_file = gfile.GFile(file_path, mode='r')
  self.__word_alphabet = word_alphabet
  self.__pos_alphabet = pos_alphabet
  self.__type_alphabet = type_alphabet
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        line = tf.compat.as_bytes(line)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def initialize_vocabulary(vocabulary_path):
  # Map vocab to word embeddings
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip('\n') for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 5000 == 0:
            print("tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 1000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      print('>> Full Vocabulary Size :', len(vocab_list))
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")
def initialize_vocabulary(vocabulary_path):
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          # try:
          #   line = line.decode('utf8', 'ignore')
          # except Exception as e:
          #   print(e, line)
          #   continue
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
                                            tokenizer, normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def read_data(tokenized_dialog_path, buckets, max_size=None, reversed=False):
  """Read data from source file and put into buckets.

  Args:
    source_path: path to the files with token-ids.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  with gfile.GFile(tokenized_dialog_path, mode="r") as fh:
    source, target = fh.readline(), fh.readline()
    if reversed:
      source, target = target, source  # reverse Q-A pair, for bi-direction model
    counter = 0
    while source and target and (not max_size or counter < max_size):
      counter += 1
      if counter % 100000 == 0:
        print(" reading data line %d" % counter)
        sys.stdout.flush()
      source_ids = [int(x) for x in source.split()]
      target_ids = [int(x) for x in target.split()]
      target_ids.append(EOS_ID)
      for bucket_id, (source_size, target_size) in enumerate(buckets):
        if len(source_ids) < source_size and len(target_ids) < target_size:
          data_set[bucket_id].append([source_ids, target_ids])
          break
      source, target = fh.readline(), fh.readline()
  return data_set
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      print('>> Full Vocabulary Size :', len(vocab_list))
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="r") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = START_VOCAB_dict['lm'] + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")