The following 50 code examples, extracted from open-source Python projects, illustrate how to use tensorflow.python.platform.gfile.GFile().
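Before the project excerpts, here is a minimal, hedged sketch of the pattern they all share: check a path with gfile.Exists() and open it with gfile.GFile() as a context manager for line-oriented reads and writes. The file names "vocab.txt" and "ids.txt" are placeholders, not files from any of the projects below.

from tensorflow.python.platform import gfile

# Read a one-item-per-line file if it exists; gfile.GFile accepts local paths
# as well as paths on file systems TensorFlow supports (e.g. GCS), which is
# why these projects prefer it over the built-in open().
lines = []
if gfile.Exists("vocab.txt"):
  with gfile.GFile("vocab.txt", mode="r") as f:
    lines = [line.strip() for line in f]

# Write the items back out, one per line.
with gfile.GFile("ids.txt", mode="w") as out:
  for line in lines:
    out.write(line + "\n")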
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        for line in data_file:
          token_ids = sentence_to_token_ids(line, vocab, tokenizer)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def read_data(source_path, target_path, max_seq_len=100):
  data_set = []
  with tf.gfile.GFile(source_path, mode="r") as source_file:
    with tf.gfile.GFile(target_path, mode="r") as target_file:
      source, target = source_file.readline(), target_file.readline()
      counter = 0
      while source and target:
        counter += 1
        if counter % 10000 == 0:
          print(" reading data line %d" % counter)
          sys.stdout.flush()
        source_ids = [int(x) for x in source.split()]
        target_ids = [GO_ID] + [int(x) for x in target.split()]
        y = [int(x) for x in target.split()] + [EOS_ID]
        if len(source_ids) < max_seq_len and len(target_ids) < max_seq_len:
          data_set.append([source_ids, target_ids, y])
        source, target = source_file.readline(), target_file.readline()
  return data_set
def create_vocabulary(vocabulary_path, data_paths, tokenizer=None):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, str(data_paths)))
    vocab = {}
    for path in data_paths:
      with open(path, mode="rb") as f:
        counter = 0
        for line in f:
          counter += 1
          if counter % 100000 == 0:
            print("processing line %d" % counter)
          tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
          for w in tokens:
            if w in vocab:
              vocab[w] += 1
            else:
              vocab[w] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    print("Vocabulary size: %d" % len(vocab_list))
    with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + b"\n")
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 1000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def combine_shuffle(data_dir, filename):
  if not (gfile.Exists(os.path.join(data_dir, filename + '_train.txt')) and
          gfile.Exists(os.path.join(data_dir, filename + '_test.txt'))):
    data_train = []
    data_test = []
    print('Shuffle file in %s' % data_dir)
    for subdir, dirs, files in os.walk(data_dir):
      for afile in files:
        with gfile.GFile(os.path.join(subdir, afile), mode="r") as f:
          if afile.endswith("train.txt"):
            data_train.append(f.read())
          else:
            data_test.append(f.read())
    with gfile.GFile(os.path.join(data_dir, filename + '_train.txt'), mode="w") as train_file:
      train_file.write(''.join(data_train))
      train_file.close()
    with gfile.GFile(os.path.join(data_dir, filename + '_test.txt'), mode="w") as test_file:
      test_file.write(''.join(data_test))
      test_file.close()
def create_label_vocab(vocabulary_path, data_path):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="r") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        # tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        label = line.strip()
        vocab[label] = 1
    label_list = START_VOCAB_dict['no_padding'] + sorted(vocab)
    # label_list = sorted(vocab)
    with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
      for k in label_list:
        vocab_file.write(k + "\n")
def data_to_token_ids(data_path, target_path, vocabulary_path):
  """Tokenize preprocess file and turn into token-ids using given vocabulary file.

  This function loads preprocess line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the preprocess file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing preprocess in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        for line in tqdm(data_file):
          token_ids = sentence_to_token_ids(line, vocab)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
    filename: string, name of the file in the directory.
    work_directory: string, path to working directory.
    source_url: url to download from if file doesn't exist.

  Returns:
    Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    with tempfile.NamedTemporaryFile() as tmpfile:
      temp_file_name = tmpfile.name
      urllib.request.urlretrieve(source_url, temp_file_name)
      gfile.Copy(temp_file_name, filepath)
      with gfile.GFile(filepath) as f:
        size = f.size()
      print('Successfully downloaded', filename, size, 'bytes.')
  return filepath
def maybe_download(filename, work_directory, source_url):
  """Download the data from source url, unless it's already here.

  Args:
    filename: string, name of the file in the directory.
    work_directory: string, path to working directory.
    source_url: url to download from if file doesn't exist.

  Returns:
    Path to resulting file.
  """
  if not gfile.Exists(work_directory):
    gfile.MakeDirs(work_directory)
  filepath = os.path.join(work_directory, filename)
  if not gfile.Exists(filepath):
    temp_file_name, _ = urlretrieve_with_retry(source_url)
    gfile.Copy(temp_file_name, filepath)
    with gfile.GFile(filepath) as f:
      size = f.size()
    print('Successfully downloaded', filename, size, 'bytes.')
  return filepath
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=False):
  print(target_path)
  if True:  # gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          line = line.strip()
          sentence_array = line.split(SPLITER)
          token_array = []
          for sentence in sentence_array:
            token_ids = sentence_to_token_ids(sentence, vocab, tokenizer,
                                              normalize_digits)
            token_array.append(" ".join([str(tok) for tok in token_ids]))
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          tokens_file.write("\t".join(token_array) + "\n")
def data_to_token_ids(self, tokenizer=None, normalize_digits=False):
  data_path = self.ut_path
  target_path = self.ids_path
  vocab_path = self.vocab_path
  print("Tokenizing data in %s" % data_path)
  vocab, _ = self.initialize_vocabulary()
  with gfile.GFile(data_path, mode="rb") as data_file:
    with gfile.GFile(target_path, mode="w") as tokens_file:
      counter = 0
      for line in data_file:
        line = line.strip()
        sentence_array = line.split(SPLITER)
        token_array = []
        for sentence in sentence_array:
          token_ids = self.sentence_to_token_ids(sentence, vocab, tokenizer,
                                                 normalize_digits)
          token_array.append(" ".join([str(tok) for tok in token_ids]))
        counter += 1
        if counter % 100000 == 0:
          print(" tokenizing line %d" % counter)
        tokens_file.write("\t".join(token_array) + "\n")
def create_token_ids(self, ids_path="", vocabulary_path="", progress_interval=100000): if ids_path: self.ids_path = ids_path if vocabulary_path: self.vocabulary_path = vocabulary_path if gfile.Exists(self.ids_path): print("ids file already exists.") return 0 else: print("Tokenizing data at %s" % self.data_path) vocab, _ = self.load_vocabulary(self.vocabulary_path) with gfile.GFile(self.data_path, mode="rb") as data_file: with gfile.GFile(self.ids_path, mode="w") as tokens_file: counter = 0 for line in data_file: counter += 1 if counter % progress_interval == 0: print(" tokenizing line %d" % counter) token_ids = self.sentence_to_token_ids(line, vocab) tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="r") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def split_tweets_replies(tweets_path, enc_path, dec_path):
  """Read data from tweets_path and split it into tweets and replies.

  Args:
    tweets_path: original tweets data
    enc_path: path to write tweets
    dec_path: path to write replies

  Returns:
    None
  """
  i = 1
  with gfile.GFile(tweets_path, mode="rb") as f, \
       gfile.GFile(enc_path, mode="w+") as ef, \
       gfile.GFile(dec_path, mode="w+") as df:
    for line in f:
      if not isinstance(line, str):
        line = line.decode('utf-8')
      line = sanitize_line(line)
      # Odd lines are tweets
      if i % 2 == 1:
        ef.write(line)
      # Even lines are replies
      else:
        df.write(line)
      i = i + 1
def create_train_validation(source_path, train_path, validation_path, train_ratio=0.75):
  """Split source file into train and validation data.

  Args:
    source_path: source file path
    train_path: path to write train data
    validation_path: path to write validation data
    train_ratio: train data ratio

  Returns:
    None
  """
  nb_lines = num_lines(source_path)
  nb_train = int(nb_lines * train_ratio)
  counter = 0
  with gfile.GFile(source_path, "r") as f, \
       gfile.GFile(train_path, "w") as tf, \
       gfile.GFile(validation_path, "w") as vf:
    for line in f:
      if counter < nb_train:
        tf.write(line)
      else:
        vf.write(line)
      counter = counter + 1


# Originally from https://github.com/1228337123/tensorflow-seq2seq-chatbot
def initialize_vocabulary(vocabulary_path):
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    # Dictionary of (word, idx)
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)


# From https://github.com/1228337123/tensorflow-seq2seq-chatbot
def create_vocabulary(source_path, vocabulary_path, max_vocabulary_size,
                      tokenizer=japanese_tokenizer):
  """Create vocabulary file. Please see comments in head for file format.

  Args:
    source_path: source file path
    vocabulary_path: path to write vocabulary
    max_vocabulary_size: max vocabulary size
    tokenizer: tokenizer used to tokenize each line

  Returns:
    None
  """
  if gfile.Exists(vocabulary_path):
    print("Found vocabulary file")
    return
  with gfile.GFile(source_path, mode="r") as f:
    counter = 0
    vocab = {}  # (word, word_freq)
    for line in f:
      counter += 1
      words = tokenizer(line)
      if counter % 5000 == 0:
        sys.stdout.write(".")
        sys.stdout.flush()
      for word in words:
        # Normalize numbers. Not sure if it's necessary.
        word = re.sub(DIGIT_RE, "0", word)
        if word in vocab:
          vocab[word] += 1
        else:
          vocab[word] = 1
    vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
    if len(vocab_list) > max_vocabulary_size:
      vocab_list = vocab_list[:max_vocabulary_size]
    with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
      for w in vocab_list:
        vocab_file.write(w + "\n")
    print("\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    # with gfile.GFile(data_path, mode="r") as f:
    with open(data_path, 'rb') as f:
      counter = 0
      for line in f.readlines():
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line.decode('utf-8')) if tokenizer else basic_tokenizer(line.decode('utf-8'))
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")
def read_data(tokenized_dialog_path, max_size=None):
  """Read data from source file and put into buckets.

  Args:
    source_path: path to the files with token-ids.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in BUCKETS]
  with gfile.GFile(tokenized_dialog_path, mode="r") as fh:
    source, target = fh.readline(), fh.readline()
    counter = 0
    while source and target and (not max_size or counter < max_size):
      counter += 1
      if counter % 100000 == 0:
        print(" reading data line %d" % counter)
        sys.stdout.flush()
      source_ids = [int(x) for x in source.split()]
      target_ids = [int(x) for x in target.split()]
      target_ids.append(EOS_ID)
      for bucket_id, (source_size, target_size) in enumerate(BUCKETS):
        if len(source_ids) < source_size and len(target_ids) < target_size:
          data_set[bucket_id].append([source_ids, source_ids, target_ids])
          break
      source, target = fh.readline(), fh.readline()
  print('num_samples in each bucket:', [len(ele) for ele in data_set])
  return data_set
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size, tokenizer=None):
  """Create vocabulary file from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      for line in f:
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for word in tokens:
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_word_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_word_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def print_out(s, newline=True):
  """Print a message out and log it to file."""
  if log_filename:
    try:
      with gfile.GFile(log_filename, mode="a") as f:
        f.write(s + ("\n" if newline else ""))
    # pylint: disable=bare-except
    except:
      sys.stdout.write("Error appending to %s\n" % log_filename)
  sys.stdout.write(s + ("\n" if newline else ""))
  sys.stdout.flush()
def __init__(self, file_path, word_alphabet, pos_alphabet, type_alphabet):
  self.__source_file = gfile.GFile(file_path, mode='r')
  self.__word_alphabet = word_alphabet
  self.__pos_alphabet = pos_alphabet
  self.__type_alphabet = type_alphabet
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        line = tf.compat.as_bytes(line)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = _DIGIT_RE.sub(b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer,
                                            normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def initialize_vocabulary(vocabulary_path):
  # Map vocab to word embeddings
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="r") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip('\n') for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path, tokenizer=None):
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          counter += 1
          if counter % 5000 == 0:
            print("tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(line, vocab, tokenizer)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 1000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      print('>> Full Vocabulary Size :', len(vocab_list))
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")
def initialize_vocabulary(vocabulary_path):
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [line.strip() for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def initialize_vocabulary(vocabulary_path):
  """Initialize vocabulary from file.

  We assume the vocabulary is stored one-item-per-line, so a file:
    dog
    cat
  will result in a vocabulary {"dog": 0, "cat": 1}, and this function will
  also return the reversed-vocabulary ["dog", "cat"].

  Args:
    vocabulary_path: path to the file containing the vocabulary.

  Returns:
    a pair: the vocabulary (a dictionary mapping string to integers), and
    the reversed vocabulary (a list, which reverses the vocabulary mapping).

  Raises:
    ValueError: if the provided vocabulary_path does not exist.
  """
  if gfile.Exists(vocabulary_path):
    rev_vocab = []
    with gfile.GFile(vocabulary_path, mode="rb") as f:
      rev_vocab.extend(f.readlines())
    rev_vocab = [tf.compat.as_bytes(line.strip()) for line in rev_vocab]
    vocab = dict([(x, y) for (y, x) in enumerate(rev_vocab)])
    return vocab, rev_vocab
  else:
    raise ValueError("Vocabulary file %s not found.", vocabulary_path)
def data_to_token_ids(data_path, target_path, vocabulary_path,
                      tokenizer=None, normalize_digits=True):
  """Tokenize data file and turn into token-ids using given vocabulary file.

  This function loads data line-by-line from data_path, calls the above
  sentence_to_token_ids, and saves the result to target_path. See comment
  for sentence_to_token_ids on the details of token-ids format.

  Args:
    data_path: path to the data file in one-sentence-per-line format.
    target_path: path where the file with token-ids will be created.
    vocabulary_path: path to the vocabulary file.
    tokenizer: a function to use to tokenize each sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(target_path):
    print("Tokenizing data in %s" % data_path)
    vocab, _ = initialize_vocabulary(vocabulary_path)
    with gfile.GFile(data_path, mode="rb") as data_file:
      with gfile.GFile(target_path, mode="w") as tokens_file:
        counter = 0
        for line in data_file:
          # try:
          #   line = line.decode('utf8', 'ignore')
          # except Exception as e:
          #   print(e, line)
          #   continue
          counter += 1
          if counter % 100000 == 0:
            print(" tokenizing line %d" % counter)
          token_ids = sentence_to_token_ids(tf.compat.as_bytes(line), vocab,
                                            tokenizer, normalize_digits)
          tokens_file.write(" ".join([str(tok) for tok in token_ids]) + "\n")
def read_data(tokenized_dialog_path, buckets, max_size=None, reversed=False):
  """Read data from source file and put into buckets.

  Args:
    source_path: path to the files with token-ids.
    max_size: maximum number of lines to read, all other will be ignored;
      if 0 or None, data files will be read completely (no limit).

  Returns:
    data_set: a list of length len(_buckets); data_set[n] contains a list of
      (source, target) pairs read from the provided data files that fit
      into the n-th bucket, i.e., such that len(source) < _buckets[n][0] and
      len(target) < _buckets[n][1]; source and target are lists of token-ids.
  """
  data_set = [[] for _ in buckets]
  with gfile.GFile(tokenized_dialog_path, mode="r") as fh:
    source, target = fh.readline(), fh.readline()
    if reversed:
      source, target = target, source  # reverse Q-A pair, for bi-direction model
    counter = 0
    while source and target and (not max_size or counter < max_size):
      counter += 1
      if counter % 100000 == 0:
        print(" reading data line %d" % counter)
        sys.stdout.flush()
      source_ids = [int(x) for x in source.split()]
      target_ids = [int(x) for x in target.split()]
      target_ids.append(EOS_ID)
      for bucket_id, (source_size, target_size) in enumerate(buckets):
        if len(source_ids) < source_size and len(target_ids) < target_size:
          data_set[bucket_id].append([source_ids, target_ids])
          break
      source, target = fh.readline(), fh.readline()
  return data_set
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="rb") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, b"0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = _START_VOCAB + sorted(vocab, key=vocab.get, reverse=True)
      print('>> Full Vocabulary Size :', len(vocab_list))
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="wb") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + b"\n")
def create_vocabulary(vocabulary_path, data_path, max_vocabulary_size,
                      tokenizer=None, normalize_digits=True):
  """Create vocabulary file (if it does not exist yet) from data file.

  Data file is assumed to contain one sentence per line. Each sentence is
  tokenized and digits are normalized (if normalize_digits is set).
  Vocabulary contains the most-frequent tokens up to max_vocabulary_size.
  We write it to vocabulary_path in a one-token-per-line format, so that later
  token in the first line gets id=0, second line gets id=1, and so on.

  Args:
    vocabulary_path: path where the vocabulary will be created.
    data_path: data file that will be used to create vocabulary.
    max_vocabulary_size: limit on the size of the created vocabulary.
    tokenizer: a function to use to tokenize each data sentence;
      if None, basic_tokenizer will be used.
    normalize_digits: Boolean; if true, all digits are replaced by 0s.
  """
  if not gfile.Exists(vocabulary_path):
    print("Creating vocabulary %s from data %s" % (vocabulary_path, data_path))
    vocab = {}
    with gfile.GFile(data_path, mode="r") as f:
      counter = 0
      for line in f:
        counter += 1
        if counter % 100000 == 0:
          print(" processing line %d" % counter)
        tokens = tokenizer(line) if tokenizer else basic_tokenizer(line)
        for w in tokens:
          word = re.sub(_DIGIT_RE, "0", w) if normalize_digits else w
          if word in vocab:
            vocab[word] += 1
          else:
            vocab[word] = 1
      vocab_list = START_VOCAB_dict['lm'] + sorted(vocab, key=vocab.get, reverse=True)
      if len(vocab_list) > max_vocabulary_size:
        vocab_list = vocab_list[:max_vocabulary_size]
      with gfile.GFile(vocabulary_path, mode="w") as vocab_file:
        for w in vocab_list:
          vocab_file.write(w + "\n")