The following code examples, extracted from open-source Python projects, illustrate how to use spacy.en().
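Note that the examples below target the older spaCy 1.x API (spacy.load('en', vectors=...), spacy.en, nlp.vocab.dump). For reference, a minimal sketch of roughly equivalent loading and tokenization with a current spaCy release might look like this; the model name en_core_web_md is an assumption and has to be installed separately:

import spacy

# Assumption: a vector-bearing model has been installed first, e.g.
#   python -m spacy download en_core_web_md
nlp = spacy.load('en_core_web_md')

doc = nlp(u'What color is the car?')
for token in doc:
    # each token carries a 300-dimensional GloVe-style vector in the md/lg models
    print(token.text, token.vector.shape)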
def main(params):
    input_train_json = json.load(open(params['input_train_json'], 'r'))
    print("Load spaCy with GloVe vectors")
    nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    words_to_keep = build_vocab(
        nlp.tokenizer,
        [img['question'] for img in input_train_json],
        int(params['word_count_threshold']))
    vectors = sense2vec.vectors.VectorMap(nlp.vocab.vectors_length)
    for string in words_to_keep:
        word = nlp.vocab[string]
        vectors.borrow(word.orth_, 1, numpy.ascontiguousarray(word.vector))
    replaced = 0
    paraphrases = []
    for i, word in enumerate(nlp.vocab):
        if word.orth_ in words_to_keep:
            word.norm_ = word.orth_
        elif word.lower_ in words_to_keep:
            word.norm_ = word.lower_
        elif word.is_alpha and word.has_vector:
            vector = numpy.ascontiguousarray(word.vector, dtype='float32')
            synonyms, scores = vectors.most_similar(vector, 1)
            word.norm_ = synonyms[0]
            paraphrases.append((word.orth_, word.norm_))
        else:
            word.norm_ = word.shape_
        if i and i % 10000 == 0:
            print(i, 'words processed. Example: %s --> %s' % random.choice(paraphrases))
    print('%d vector-based paraphrases' % len(paraphrases))
    if not os.path.exists(params['spacy_data']):
        os.mkdir(params['spacy_data'])
    if not os.path.exists(os.path.join(params['spacy_data'], 'vocab')):
        os.mkdir(os.path.join(params['spacy_data'], 'vocab'))
    if not os.path.exists(os.path.join(params['spacy_data'], 'tokenizer')):
        os.mkdir(os.path.join(params['spacy_data'], 'tokenizer'))
    nlp.vocab.dump(os.path.join(params['spacy_data'], 'vocab', 'lexemes.bin'))
    with io.open(os.path.join(params['spacy_data'], 'vocab', 'strings.json'), 'w', encoding='utf8') as file_:
        nlp.vocab.strings.dump(file_)
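This function builds a reduced question vocabulary, maps every remaining lexeme to an in-vocabulary norm (falling back to the nearest vector neighbour or the word shape), and dumps the resulting vocab to disk. The snippet assumes module-level imports of json, io, os, random, numpy, spacy and sense2vec, plus a build_vocab helper defined elsewhere in the project. A hypothetical invocation might look like this; the parameter keys are the ones read inside the function, and the values are placeholders:

# Hypothetical invocation; the paths and threshold are placeholders,
# not values from the original project.
params = {
    'input_train_json': 'data/vqa_raw_train.json',
    'word_count_threshold': 5,
    'spacy_data': 'spacy_data',
}
main(params)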
def is_stop(w):
    return w in spacy.en.STOP_WORDS
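spacy.en.STOP_WORDS is the spaCy 1.x location of the English stop list; in spaCy 2.x and later the same set lives under spacy.lang.en. A minimal sketch of an equivalent check, assuming a current spaCy install:

from spacy.lang.en.stop_words import STOP_WORDS

def is_stop(w):
    # same membership test against spaCy's built-in English stop-word set
    return w in STOP_WORDS

print(is_stop('the'))   # True
print(is_stop('car'))   # False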
def get_question_features(question):
    ''' For a given question, a unicode string, returns the time-series
    vector with each word (token) transformed into a 300-dimension
    representation calculated using GloVe vectors '''
    word_embeddings = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # word_embeddings = spacy.load('en')#, vectors='en_glove_cc_300_1m_vectors')
    # nlp = English()
    # n_dimensions = nlp.vocab.load_vectors('glove.840B.300d.txt.bz2')
    # print n_dimensions
    # tokens = n_dimensions
    # embeddings_index = {}
    # f = open('glove.6B.300d.txt')
    # for line in f:
    #     values = line.split()
    #     word = values[0]
    #     coefs = np.asarray(values[1:], dtype='float32')
    #     embeddings_index[word] = coefs
    # f.close()
    #
    # print('Found %s word vectors.' % len(embeddings_index))
    #
    # word_embeddings = spacy.load('en', vectors='glove.6B.30d.txt')
    tokens = word_embeddings(question)
    question_tensor = np.zeros((1, 30, 300))
    for j in range(len(tokens)):
        question_tensor[0, j, :] = tokens[j].vector
    return question_tensor
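This snippet assumes import spacy and import numpy as np at module level. A hypothetical call might look like this; the question text is a placeholder:

# Hypothetical usage; the question text is a placeholder.
features = get_question_features(u'What color is the car?')
print(features.shape)   # (1, 30, 300)

Note the fixed 30-token buffer: questions longer than 30 tokens would overflow the loop, and the model is reloaded on every call, so in practice the load would be hoisted out of the function.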