The following 50 code examples, extracted from open-source Python projects, illustrate how to use tensorflow.python.platform.gfile.Exists().
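Before the project examples, here is a minimal, self-contained sketch of the check-before-write pattern most of them share. The paths in it are hypothetical and it assumes a TensorFlow installation where tensorflow.python.platform.gfile is importable.

import os
from tensorflow.python.platform import gfile

# Hypothetical locations used only for illustration.
cache_dir = "/tmp/gfile_demo"
vocab_path = os.path.join(cache_dir, "vocab.txt")

# Create the directory if it is missing, then write the file only once.
if not gfile.Exists(cache_dir):
  gfile.MakeDirs(cache_dir)
if not gfile.Exists(vocab_path):
  with gfile.GFile(vocab_path, mode="w") as f:
    f.write("hello\nworld\n")

print(gfile.Exists(vocab_path))  # True once the file has been written
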
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=japanese_tokenizer, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="wb") as tokens_file:  # edit w to wb
        counter = 0
        for line in data_file:
          # line = tf.compat.as_bytes(line)  # added by Ken
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          # line is binary here
          line = line.decode('utf-8')
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

# Originally from https://github.com/1228337123/tensorflow-seq2seq-chatbot

def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)

def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        for line in data_file:
          token_ids = sentence_to_token_ids(line, vocab, tokenizer)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           bottleneck_tensor):
  """Create a single bottleneck file."""
  print('Creating bottleneck at ' + bottleneck_path)
  image_path = get_image_path(image_lists, label_name, index, image_dir,
                              category)
  if not gfile.Exists(image_path):
    tf.logging.fatal('File does not exist %s', image_path)
  image_data = gfile.FastGFile(image_path, 'rb').read()
  try:
    bottleneck_values = run_bottleneck_on_image(
        sess, image_data, jpeg_data_tensor, bottleneck_tensor)
  except:
    raise RuntimeError('Error during processing file %s' % image_path)
  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
  with open(bottleneck_path, 'w') as bottleneck_file:
    bottleneck_file.write(bottleneck_string)

def create_vocabulary(vocabulary_path, data_paths, tokenizer=None):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path,
                                                   str(data_paths)))
    vocab = {}
    for path in data_paths:
      with open(path, mode="rb") as f:
        counter = 0
        for line in f:
          counter += 1
          if counter % 100000 == 0:
            print("processing line %d" % counter)
          tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
          for w in tokens:
            if w in vocab:
              vocab[w] += 1
            else:
              vocab[w] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    print("Vocabulary size: %d" % len(vocab_list))
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + b"\n")

def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 1000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

def create_data_list(image_dir):
  if not gfile.Exists(image_dir):
    print("Image directory '" + image_dir + "' not found.")
    return None
  extensions = ['jpg', 'JPG', 'jpeg', 'JPEG', 'png', 'PNG']
  print("Looking for images in '" + image_dir + "'")
  file_list = []
  for extension in extensions:
    file_glob = os.path.join(image_dir, '*.' + extension)
    file_list.extend(gfile.Glob(file_glob))
  if not file_list:
    print("No files found in '" + image_dir + "'")
    return None

  images = []
  labels = []
  for file_name in file_list:
    image = Image.open(file_name)
    image_gray = image.convert('L')
    image_resize = image_gray.resize(size=(IMAGE_WIDTH, IMAGE_HEIGHT))
    input_img = np.array(image_resize, dtype='int16')
    image.close()
    label_name = os.path.basename(file_name).split('_')[0]
    images.append(input_img)
    labels.append(label_name)
  return zip(images, labels)

def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

def combine_shuffle(data_dir, filename):
  if not (gfile.Exists(os.path.join(data_dir, filename + '_train.txt')) and
          gfile.Exists(os.path.join(data_dir, filename + '_test.txt'))):
    data_train = []
    data_test = []
    print('Shuffle file in %s' % data_dir)
    for subdir, dirs, files in os.walk(data_dir):
      for afile in files:
        with gfile.GFile(os.path.join(subdir, afile), mode="r") as f:
          if afile.endswith("train.txt"):
            data_train.append(f.read())
          else:
            data_test.append(f.read())
    with gfile.GFile(os.path.join(data_dir, filename + '_train.txt'),
                     mode="w") as train_file:
      train_file.write(''.join(data_train))
      train_file.close()
    with gfile.GFile(os.path.join(data_dir, filename + '_test.txt'),
                     mode="w") as test_file:
      test_file.write(''.join(data_test))
      test_file.close()

def create_label_vocab(vocabulary_path, data_path):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="r") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        # tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        label = line.strip()
        vocab[label] = 1
      label_list = START_VOCAB_dict['no_padding'] + sorted(vocab)
      # label_list = sorted(vocab)
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for k in label_list:
          vocab_file.write(k + "\n")

def createBottleneckFile(bottleneckPath, imageLists, labelName, index,
                         imageDir, category, sess, jpegDataTensor,
                         bottleneckTensor):
  print('Create bottleneck at ' + bottleneckPath)
  imagePath = getImagePath(imageLists, labelName, index, imageDir, category)
  if not gfile.Exists(imagePath):
    tf.logging.fatal('File not exist %s', imagePath)
  imageData = gfile.FastGFile(imagePath, 'rb').read()
  try:
    bottleneckValues = runBottleneckOnImage(sess, imageData, jpegDataTensor,
                                            bottleneckTensor)
  except:
    # Re-raise instead of silently passing, otherwise bottleneckValues
    # would be undefined below.
    raise RuntimeError('Error during processing file %s' % imagePath)
  bottleneckString = ','.join(str(x) for x in bottleneckValues)
  with open(bottleneckPath, 'w') as f:
    f.write(bottleneckString)

def getRandomDistortedBottlenecks(sess, imageLists, num, category, imageDir,
                                  inputJpegTensor, distortedImage,
                                  resizedInputTensor, bottleneckTensor):
  classCount = len(imageLists.keys())
  bottlenecks = []
  groundTruths = []
  for _ in range(num):
    labelIndex = random.randrange(classCount)
    labelName = list(imageLists.keys())[labelIndex]
    imageIndex = random.randrange(MAX_NUM_IMAGES_PER_CLASS + 1)
    imagePath = getImagePath(imageLists, labelName, imageIndex, imageDir,
                             category)
    if not gfile.Exists(imagePath):
      tf.logging.fatal('File not exist %s', imagePath)
    jpegData = gfile.FastGFile(imagePath, 'rb').read()
    distortedImageData = sess.run(distortedImage, {inputJpegTensor: jpegData})
    bottleneck = runBottleneckOnImage(sess, distortedImageData,
                                      resizedInputTensor, bottleneckTensor)
    groundTruth = np.zeros(classCount, dtype=np.float32)
    groundTruth[labelIndex] = 1.0
    bottlenecks.append(bottleneck)
    groundTruths.append(groundTruth)
  return bottlenecks, groundTruths

def data_to_token_ids(data_path, target_path, vocabulary_path):
  """Tokenize preprocess file and turn into token-ids using given vocabulary file.

  This function loads preprocess line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the preprocess file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing preprocess in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        for line in tqdm(data_file):
          token_ids = sentence_to_token_ids(line, vocab)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

def testPathsWithParse(self):
  base_dir = os.path.join(tf.test.get_temp_dir(), "paths_parse")
  self.assertFalse(gfile.Exists(base_dir))
  for p in xrange(3):
    gfile.MakeDirs(os.path.join(base_dir, "%d" % p))
  # add a base_directory to ignore
  gfile.MakeDirs(os.path.join(base_dir, "ignore"))

  # create a simple parser that pulls the export_version from the directory.
  def parser(path):
    match = re.match("^" + base_dir + "/(\\d+)$", path.path)
    if not match:
      return None
    return path._replace(export_version=int(match.group(1)))

  self.assertEquals(
      gc.get_paths(base_dir, parser=parser),
      [gc.Path(os.path.join(base_dir, "0"), 0),
       gc.Path(os.path.join(base_dir, "1"), 1),
       gc.Path(os.path.join(base_dir, "2"), 2)])

def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
    filename: string, name of the file in the directory.
    work_directory: string, path to working directory.
    source_url: url to download from if file doesn't exist.

  Returns:
    Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    with tempfile.NamedTemporaryFile() as tmpfile:
      temp_file_name = tmpfile.name
      urllib.request.urlretrieve(source_url, temp_file_name)
      gfile.Copy(temp_file_name, filepath)
      with gfile.GFile(filepath) as f:
        size = f.size()
      print('Successfully downloaded', filename, size, 'bytes.')
  return filepath

def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
    filename: string, name of the file in the directory.
    work_directory: string, path to working directory.
    source_url: url to download from if file doesn't exist.

  Returns:
    Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    temp_file_name, _ = urlretrieve_with_retry(source_url)
    gfile.Copy(temp_file_name, filepath)
    with gfile.GFile(filepath) as f:
      size = f.size()
    print('Successfully downloaded', filename, size, 'bytes.')
  return filepath

def create_bottleneck_file(bottleneck_path, image_lists, label_name, index,
                           image_dir, category, sess, jpeg_data_tensor,
                           decoded_image_tensor, resized_input_tensor,
                           bottleneck_tensor):
  """Create a single bottleneck file."""
  tf.logging.info('Creating bottleneck at ' + bottleneck_path)
  image_path = get_image_path(image_lists, label_name, index,
                              image_dir, category)
  if not gfile.Exists(image_path):
    tf.logging.fatal('File does not exist %s', image_path)
  image_data = gfile.FastGFile(image_path, 'rb').read()
  try:
    bottleneck_values = run_bottleneck_on_image(
        sess, image_data, jpeg_data_tensor, decoded_image_tensor,
        resized_input_tensor, bottleneck_tensor)
  except Exception as e:
    raise RuntimeError('Error during processing file %s (%s)' % (image_path,
                                                                 str(e)))
  bottleneck_string = ','.join(str(x) for x in bottleneck_values)
  with open(bottleneck_path, 'w') as bottleneck_file:
    bottleneck_file.write(bottleneck_string)

def convert_to(x, y, z, filename):
  """Converts data to tfrecords.

  Args:
    :param x, y: list - [img1, img2, ...]. img: ndarray.
    :param filename: str.
  """
  if not gfile.Exists(filename):
    print('Writing', filename)
    writer = tf.python_io.TFRecordWriter(filename)
    for index in range(NUM_PER_IMAGE):
      image_x = x[index].tostring()
      image_y = y[index].tostring()
      example = tf.train.Example(features=tf.train.Features(feature={
          'label': _float_feature(z[index]),
          'image_x': _bytes_feature(image_x),
          'image_y': _bytes_feature(image_y)
      }))
      writer.write(example.SerializeToString())
    writer.close()

def load_meanstddev(path):
  # load precomputed mean/stddev
  if not gfile.Exists(path):
    raise ValueError('Mean/stddev file not found.')
  assert gfile.Exists(path)
  mean_stddev_string = open(path, 'r').read().split('\n')
  mean_str = mean_stddev_string[0][1:-1].split(',')
  stddev_str = mean_stddev_string[1][1:-1].split(',')
  eigval_str = mean_stddev_string[2][1:-1].split(',')
  eigvecs_str = mean_stddev_string[3][1:-1].split(' ')
  mean = [float(mean_str[0]), float(mean_str[1]), float(mean_str[2])]
  stddev = [float(stddev_str[0]), float(stddev_str[1]), float(stddev_str[2])]
  eigvals = [float(eigval_str[0]), float(eigval_str[1]), float(eigval_str[2])]
  eigvecs = []
  for eigvec_str in eigvecs_str:
    eigvec = eigvec_str[1:-1].split(',')
    eigvecs.append([float(eigvec[0]), float(eigvec[1]), float(eigvec[2])])
  return mean, stddev, eigvals, eigvecs

def output_predict(depths, images, output_dir):
  print("output predict into %s" % output_dir)
  if not gfile.Exists(output_dir):
    gfile.MakeDirs(output_dir)
  for i, (image, depth) in enumerate(zip(images, depths)):
    pilimg = Image.fromarray(np.uint8(image))
    image_name = "%s/%05d_org.png" % (output_dir, i)
    pilimg.save(image_name)
    depth = depth.transpose(2, 0, 1)
    if np.max(depth) != 0:
      ra_depth = (depth / np.max(depth)) * 255.0
    else:
      ra_depth = depth * 255.0
    depth_pil = Image.fromarray(np.uint8(ra_depth[0]), mode="L")
    depth_name = "%s/%05d_dep.png" % (output_dir, i)
    depth_pil.save(depth_name)

def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)

def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

def initialize_vocabulary(vocabulary_path):
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    # Dictionary of (word, idx)
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)

# From https://github.com/1228337123/tensorflow-seq2seq-chatbot

def create_model(session, forward_only, batch_size=None):
  """Create translation model and initialize or load parameters in session."""
  model = seq2seq_model.Seq2SeqModel(
      vocab_size=FLAGS.vocab_size,
      embedding_dim=FLAGS.embedding_dim,
      buckets=BUCKETS,
      size=FLAGS.size,
      num_layers=FLAGS.num_layers,
      max_gradient_norm=FLAGS.max_gradient_norm,
      batch_size=FLAGS.batch_size if not batch_size else batch_size,
      learning_rate=FLAGS.learning_rate,
      learning_rate_decay_factor=FLAGS.learning_rate_decay_factor,
      use_lstm=True,
      forward_only=forward_only)
  ckpt = tf.train.get_checkpoint_state(FLAGS.model_dir)
  # print('path:', ckpt.model_checkpoint_path)
  # print('gfile:', gfile.Exists(ckpt.model_checkpoint_path))
  # if ckpt and gfile.Exists(ckpt.model_checkpoint_path):
  if ckpt:
    print("Reading model parameters from %s" % ckpt.model_checkpoint_path)
    model.saver.restore(session, ckpt.model_checkpoint_path)
  else:
    print("Created model with fresh parameters.")
    session.run(tf.global_variables_initializer())
  return model

def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    # with gfile.GFile(data_path, mode="r") as f:
    with open(data_path, 'rb') as f:
      counter = 0
      for line in f.readlines():
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        tokens = (tokenizer(line.decode('utf-8')) if tokenizer
                  else basic_tokenizer(line.decode('utf-8')))
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")

def create_image_lists(image_dir, testing_percentage=0.0,
                       validation_percentage=0.0):
  """
  Code modified from tensorflow/tensorflow/examples/image_retraining
  """
  if not gfile.Exists(image_dir):
    print("Image directory '" + image_dir + "' not found.")
    return None
  training_images = []
  extensions = ['jpg', 'jpeg', 'JPG', 'JPEG']
  sub_dirs = [x[0] for x in os.walk(image_dir)]
  file_list = []
  for extension in extensions:
    file_glob = os.path.join(image_dir, '*.' + extension)
    file_list.extend(glob.glob(file_glob))
  if not file_list:
    print('No files found')
  else:
    # print "No. of files found: %d" % len(file_list)
    training_images.extend([f for f in file_list])
  random.shuffle(training_images)
  no_of_images = len(training_images)
  validation_offset = int(validation_percentage * no_of_images)
  validation_images = training_images[:validation_offset]
  test_offset = int(testing_percentage * no_of_images)
  testing_images = training_images[validation_offset:validation_offset + test_offset]
  training_images = training_images[validation_offset + test_offset:]
  result = {
      'train': training_images,
      'test': testing_images,
      'validation': validation_images,
  }
  return result

def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None):
  """Create vocabulary file from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      for line in f:
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for word in tokens:
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")

def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_word_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")

def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)

def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print("  tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")

def get_wmt_enfr_train_set(directory):
  """Download the WMT en-fr training corpus to directory unless it's there."""
  train_path = os.path.join(directory, "giga-fren.release2.fixed")
  if not (gfile.Exists(train_path + ".fr") and gfile.Exists(train_path + ".en")):
    corpus_file = maybe_download(directory, "training-giga-fren.tar",
                                 _WMT_ENFR_TRAIN_URL)
    print("Extracting tar file %s" % corpus_file)
    with tarfile.open(corpus_file, "r") as corpus_tar:
      corpus_tar.extractall(directory)
    gunzip_file(train_path + ".fr.gz", train_path + ".fr")
    gunzip_file(train_path + ".en.gz", train_path + ".en")
  return train_path

def get_wmt_enfr_dev_set(directory):
  """Download the WMT en-fr dev set to directory unless it's there."""
  dev_name = "newstest2013"
  dev_path = os.path.join(directory, dev_name)
  if not (gfile.Exists(dev_path + ".fr") and gfile.Exists(dev_path + ".en")):
    dev_file = maybe_download(directory, "dev-v2.tgz", _WMT_ENFR_DEV_URL)
    print("Extracting tgz file %s" % dev_file)
    with tarfile.open(dev_file, "r:gz") as dev_tar:
      fr_dev_file = dev_tar.getmember("dev/" + dev_name + ".fr")
      en_dev_file = dev_tar.getmember("dev/" + dev_name + ".en")
      fr_dev_file.name = dev_name + ".fr"  # Extract without "dev/" prefix.
      en_dev_file.name = dev_name + ".en"
      dev_tar.extract(fr_dev_file, directory)
      dev_tar.extract(en_dev_file, directory)
  return dev_path

def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print("  processing line %d" % counter)
        line = tf.compat.as_bytes(line)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")