The following 50 code examples were extracted from open-source Python projects to illustrate how spacy.load() is used.
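Before the project excerpts, here is a minimal, self-contained sketch of the call these examples revolve around. It assumes the small English pipeline en_core_web_sm is installed (python -m spacy download en_core_web_sm); the model name is an assumption, not something taken from the excerpts below.

import spacy

# Load an installed pipeline package by name; the returned object is a callable
# nlp pipeline that turns raw text into a processed Doc.
nlp = spacy.load("en_core_web_sm")

doc = nlp("spaCy loads a full processing pipeline with a single call.")
print([(token.text, token.pos_) for token in doc])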
def __init__(self, batchsize=64, max_length=15, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
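The call spacy.load('en', vectors='en_glove_cc_300_1m_vectors') in the excerpt above is the spaCy 1.x API, where 'en' was a shortcut name and a separate vectors package supplied GloVe embeddings. A rough modern equivalent, assuming spaCy 2+ and the medium English model (which bundles word vectors), might look like this sketch:

import spacy

# Assumed modern replacement for the legacy 1.x call above:
# en_core_web_md ships with word vectors, so token.vector is populated directly.
nlp = spacy.load("en_core_web_md")

token = nlp("glove")[0]
print(token.has_vector, token.vector.shape)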
def load_vqa_json(data_split):
    """
    Parses the question and answer json files for the given data split.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
        qdata = json.load(f)['questions']
        for q in qdata:
            qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
                {'qstr': q['question'], 'iid': q['image_id']}

    if 'test' not in data_split:
        with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
            adata = json.load(f)['annotations']
            for a in adata:
                adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
                    a['answers']

    print 'parsed', len(qdic), 'questions for', data_split
    return qdic, adic
def load_genome_json():
    """
    Parses the genome json file.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
        qdata = json.load(f)
        for q in qdata:
            key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
            qdic[key] = {'qstr': q['question'], 'iid': q['image']}
            adic[key] = [{'answer': q['answer']}]

    print 'parsed', len(qdic), 'questions for genome'
    return qdic, adic
def __init__(self, vdict_path, adict_path,
             batchsize=128, max_length=15, n_ans_vocabulary=1000, mode='train', data_shape=(2048)):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.n_ans_vocabulary = n_ans_vocabulary
    self.mode = mode
    self.data_shape = data_shape

    assert self.mode == 'test'

    # load vocabulary
    with open(vdict_path, 'r') as f:
        vdict = json.load(f)
    with open(adict_path, 'r') as f:
        adict = json.load(f)
    self.n_vocabulary, self.vdict = len(vdict), vdict
    self.n_ans_vocabulary, self.adict = len(adict), adict

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
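For context, a usage sketch of the helper above. It assumes LOADED_SPACY_MODELS is a module-level dict (and SpacyModelType a type alias for spaCy's Language class), neither of which is shown in the excerpt:

# Assumed module-level cache used by get_spacy_model.
LOADED_SPACY_MODELS = {}

nlp_tagging = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
nlp_again = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
assert nlp_tagging is nlp_again  # the second call hits the cache instead of reloading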
def load(self, filename):
    """Load pre-existing dictionary in 'token[<TAB>count]' format.

    Initialize counts from other dictionary, or 0 if they aren't included.
    """
    print('Dictionary: loading dictionary from {}'.format(filename))
    with open(filename) as read:
        for line in read:
            split = line.strip().split('\t')
            token = unescape(split[0])
            cnt = int(split[1]) if len(split) > 1 else 0
            self.freq[token] = cnt
            if token not in self.tok2ind:
                index = len(self.tok2ind)
                self.tok2ind[token] = index
                self.ind2tok[index] = token
    print('[ num words = %d ]' % len(self))
def load_spacy_model(disable=False):
    """
    Returns loaded spacy pipeline

    Args:
        disable: a list of pipeline components to disable from loaded spacy model.
                 Can significantly increase speed.

    Returns:
        spacy pipeline
    """
    # if disable is not false, load the spacy model with a modified pipeline;
    # otherwise, load the default pipeline
    if disable:
        try:
            nlp = spacy.load('en_core_web_sm', disable=disable)
        except:
            print('[ERROR] You likely passed an invalid disable argument to get_spacy_doc!')
            raise  # re-raise so a failed load is not silently swallowed
    else:
        nlp = spacy.load('en_core_web_sm')
    return nlp
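A hedged usage sketch of the loader above; the component names follow spaCy's standard pipeline, and disabling the parser and NER is a common way to speed up plain tokenization and tagging:

# Load a slimmed-down pipeline for faster processing; the component names are
# the standard spaCy ones and may vary by model version.
nlp = load_spacy_model(disable=['parser', 'ner'])
doc = nlp("Disabling unused components can make large batches much faster.")
print([token.text for token in doc])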
def train(filePath):
    try:
        if not filePath.lower().endswith('json'):
            return {'success': False, 'message': 'Training file should be in json format'}
        with open(filePath) as file:
            ent_data = json.load(file)
        dataset = [jsonToCrf(q, nlp) for q in ent_data['entity_examples']]
        X_train = [sent2features(s) for s in dataset]
        y_train = [sent2labels(s) for s in dataset]
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train, y_train)
        if not os.path.exists("crfModel"):
            os.mkdir("crfModel")
        if os.path.isfile("crfModel/classifier.pkl"):
            os.remove("crfModel/classifier.pkl")
        joblib.dump(crf, "crfModel/classifier.pkl")
        return {'success': True, 'message': 'Model Trained Successfully'}
    except Exception as ex:
        return {'success': False, 'message': 'Error while Training the model - ' + str(ex)}
def predict(utterance):
    try:
        tagged = []
        finallist = []
        parsed = nlp(utterance)
        for i in range(len(parsed)):
            tagged.append((str(parsed[i]), parsed[i].tag_))
        finallist.append(tagged)
        test = [sent2features(s) for s in finallist]
        if os.path.isfile("crfModel/classifier.pkl"):
            crf = joblib.load("crfModel/classifier.pkl")
        else:
            return {'success': False, 'message': 'Please Train the model first'}
        predicted = crf.predict(test)
        entityList = extractEntities(predicted[0], tagged)
        return {'success': True, 'entitiesPredicted': entityList}
    except Exception as ex:
        return {'success': False, 'message': 'Error while prediction - ' + str(ex)}
def __init__(self, language='en'):
    """
    Create a Parser object that will use Spacy for parsing.
    It uses Spacy and offers all the same languages that Spacy offers.
    Check out: https://spacy.io/usage/models.
    Note that the language model needs to be downloaded first (e.g. python -m spacy download en)

    :param language: Language to parse (en/de/es/pt/fr/it/nl)
    :type language: str
    """
    # We only load spacy if a Parser is created (to allow ReadTheDocs to build the documentation easily)
    import spacy

    acceptedLanguages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl']
    assert language in acceptedLanguages, "Language for parser (%s) not in accepted languages: %s" % (language, str(acceptedLanguages))

    self.language = language

    if language not in Parser.languageModels:
        Parser.languageModels[language] = spacy.load(language, disable=['ner'])

    self.nlp = Parser.languageModels[language]
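A brief usage sketch, under the assumption that Parser.languageModels is a class-level dict shared by all instances (it is referenced but not defined in the excerpt), so repeated construction for the same language reuses one loaded model:

# Hypothetical usage of the Parser above; languageModels is assumed to be
# a class attribute such as `languageModels = {}` on Parser.
p1 = Parser(language='en')
p2 = Parser(language='en')
assert p1.nlp is p2.nlp  # the spaCy model for 'en' is loaded only once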
def build_word_frequency_distribution():
    path = os.path.join(data_dir, 'word_freq.pickle')

    try:
        with open(path, 'rb') as freq_dist_f:
            freq_dist_f = pickle.load(freq_dist_f)
            print('frequency distribution loaded')
            return freq_dist_f
    except IOError:
        pass

    print('building frequency distribution')
    freq = defaultdict(int)
    for i, review in enumerate(read_reviews()):
        doc = en.tokenizer(review['text'])
        for token in doc:
            freq[token.orth_] += 1
        if i % 10000 == 0:
            with open(path, 'wb') as freq_dist_f:
                pickle.dump(freq, freq_dist_f)
            print('dump at {}'.format(i))
    return freq
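The excerpt above calls en.tokenizer(...) on a module-level model; en is presumably the result of an earlier spacy.load() not shown here. Running only the tokenizer skips tagging and parsing, which is what makes a full frequency pass over the corpus cheap. A minimal sketch of that pattern, with en_core_web_sm as an assumed model:

import spacy
from collections import defaultdict

# Assumption: any installed English pipeline works here; only its tokenizer is used.
en = spacy.load("en_core_web_sm")

freq = defaultdict(int)
for token in en.tokenizer("counting tokens with the tokenizer only is cheap"):
    freq[token.orth_] += 1
print(dict(freq))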
def build_vocabulary(lower=3, n=50000):
    try:
        with open(vocab_fn, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)
            print('vocabulary loaded')
            return vocab
    except IOError:
        print('building vocabulary')

    freq = build_word_frequency_distribution()
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n-lower+1]

    vocab = {}
    i = lower
    for w, freq in top_words:
        vocab[w] = i
        i += 1

    with open(vocab_fn, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)
    return vocab
def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./%s/vdict.json' % folder, 'r') as f:
        self.vdict = json.load(f)
    with open('./%s/adict.json' % folder, 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def _set_tokenizer(self, tokenizer):
    """
    Set tokenizer

    :param tokenizer: tokenization method
    :return: None
    """
    if tokenizer == "nltk":
        self.tokenizer = nltk.word_tokenize
    elif tokenizer == "spacy":
        spacy_en = spacy.load("en")

        def spacy_tokenizer(seq):
            return [w.text for w in spacy_en(seq)]

        self.tokenizer = spacy_tokenizer
    else:
        raise ValueError("Invalid tokenizing method %s" % tokenizer)
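A standalone sketch of the "spacy" branch above, for readers who only need the tokenizer function itself; en_core_web_sm is an assumption, whereas the original project uses the legacy 'en' shortcut:

import spacy

# Build a simple word-level tokenizer on top of a loaded spaCy pipeline.
spacy_en = spacy.load("en_core_web_sm")

def spacy_tokenizer(seq):
    return [w.text for w in spacy_en(seq)]

print(spacy_tokenizer("Which tokenizer should be used?"))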
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    # self.max_length, self.qdic, self.adic = VQADataProvider.load_data(mode)
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def load_vqa_json(data_split):
    """
    Parses the question and answer json files for the given data split.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
        qdata = json.load(f)['questions']
        for q in qdata:
            qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
                {'qstr': q['question'], 'iid': q['image_id']}

    if 'test' not in data_split:
        with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
            adata = json.load(f)['annotations']
            for a in adata:
                adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
                    a['answers']

    write_log('parsed ' + str(len(qdic)) + ' questions for ' + data_split, 'log.txt')
    return qdic, adic
def load_genome_json():
    """
    Parses the genome json file.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
        qdata = json.load(f)
        for q in qdata:
            key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
            qdic[key] = {'qstr': q['question'], 'iid': q['image']}
            adic[key] = [{'answer': q['answer']}]

    write_log('parsed ' + str(len(qdic)) + ' questions for genome', 'log.txt')
    return qdic, adic
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION,
             max_w_length=config.LENGTH_OF_LONGEST_WORD, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.max_w_length = max_w_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/cdict.json', 'r') as f:
        self.cdict = json.load(f)
    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    # self.max_length, self.qdic, self.adic = VQADataProvider.load_data(mode)
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def __init__(self, batchsize=64, max_length=config.MAX_CHARS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/cdict.json', 'r') as f:
        self.cdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector