The following 50 code examples were extracted from open-source Python projects to illustrate how spacy.load() is used.
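Before the project excerpts, here is a minimal, self-contained sketch of the call these examples revolve around. It assumes the small English pipeline en_core_web_sm is installed (python -m spacy download en_core_web_sm); the model name is an assumption, not something taken from the excerpts below.

import spacy

# Load an installed pipeline package by name; the returned object is a callable
# nlp pipeline that turns raw text into a processed Doc.
nlp = spacy.load("en_core_web_sm")

doc = nlp("spaCy loads a full processing pipeline with a single call.")
print([(token.text, token.pos_) for token in doc])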
def __init__(self, batchsize=64, max_length=15, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
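The call spacy.load('en', vectors='en_glove_cc_300_1m_vectors') in the excerpt above is the spaCy 1.x API, where 'en' was a shortcut name and a separate vectors package supplied GloVe embeddings. A rough modern equivalent, assuming spaCy 2+ and the medium English model (which bundles word vectors), might look like this sketch:

import spacy

# Assumed modern replacement for the legacy 1.x call above:
# en_core_web_md ships with word vectors, so token.vector is populated directly.
nlp = spacy.load("en_core_web_md")

token = nlp("glove")[0]
print(token.has_vector, token.vector.shape)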
def load_vqa_json(data_split):
    """
    Parses the question and answer json files for the given data split.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
        qdata = json.load(f)['questions']
        for q in qdata:
            qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
                {'qstr': q['question'], 'iid': q['image_id']}

    if 'test' not in data_split:
        with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
            adata = json.load(f)['annotations']
            for a in adata:
                adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
                    a['answers']

    print 'parsed', len(qdic), 'questions for', data_split
    return qdic, adic
def load_genome_json():
    """
    Parses the genome json file.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
        qdata = json.load(f)
        for q in qdata:
            key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
            qdic[key] = {'qstr': q['question'], 'iid': q['image']}
            adic[key] = [{'answer': q['answer']}]

    print 'parsed', len(qdic), 'questions for genome'
    return qdic, adic
def __init__(self, vdict_path, adict_path,
             batchsize=128, max_length=15, n_ans_vocabulary=1000, mode='train', data_shape=(2048)):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.n_ans_vocabulary = n_ans_vocabulary
    self.mode = mode
    self.data_shape = data_shape

    assert self.mode == 'test'

    # load vocabulary
    with open(vdict_path, 'r') as f:
        vdict = json.load(f)
    with open(adict_path, 'r') as f:
        adict = json.load(f)
    self.n_vocabulary, self.vdict = len(vdict), vdict
    self.n_ans_vocabulary, self.adict = len(adict), adict

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def get_spacy_model(spacy_model_name: str, pos_tags: bool, parse: bool, ner: bool) -> SpacyModelType:
    """
    In order to avoid loading spacy models a whole bunch of times, we'll save references to them,
    keyed by the options we used to create the spacy model, so any particular configuration only
    gets loaded once.
    """
    options = (spacy_model_name, pos_tags, parse, ner)
    if options not in LOADED_SPACY_MODELS:
        disable = ['vectors', 'textcat']
        if not pos_tags:
            disable.append('tagger')
        if not parse:
            disable.append('parser')
        if not ner:
            disable.append('ner')
        spacy_model = spacy.load(spacy_model_name, disable=disable)
        LOADED_SPACY_MODELS[options] = spacy_model
    return LOADED_SPACY_MODELS[options]
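For context, a usage sketch of the helper above. It assumes LOADED_SPACY_MODELS is a module-level dict (and SpacyModelType a type alias for spaCy's Language class), neither of which is shown in the excerpt:

# Assumed module-level cache used by get_spacy_model.
LOADED_SPACY_MODELS = {}

nlp_tagging = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
nlp_again = get_spacy_model('en_core_web_sm', pos_tags=True, parse=False, ner=False)
assert nlp_tagging is nlp_again  # the second call hits the cache instead of reloading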
def load(self, filename):
    """Load pre-existing dictionary in 'token[<TAB>count]' format.

    Initialize counts from other dictionary, or 0 if they aren't included.
    """
    print('Dictionary: loading dictionary from {}'.format(filename))
    with open(filename) as read:
        for line in read:
            split = line.strip().split('\t')
            token = unescape(split[0])
            cnt = int(split[1]) if len(split) > 1 else 0
            self.freq[token] = cnt
            if token not in self.tok2ind:
                index = len(self.tok2ind)
                self.tok2ind[token] = index
                self.ind2tok[index] = token
    print('[ num words = %d ]' % len(self))
def load_spacy_model(disable=False):
    """
    Returns loaded spacy pipeline

    Args:
        disable: a list of pipeline components to disable from loaded spacy model.
                 Can significantly increase speed.

    Returns:
        spacy pipeline
    """
    # if disable is not false, load the spacy model with a modified pipeline;
    # otherwise, load the default pipeline
    if disable:
        try:
            nlp = spacy.load('en_core_web_sm', disable=disable)
        except:
            print('[ERROR] You likely passed an invalid disable argument to get_spacy_doc!')
            raise  # re-raise so a failed load is not silently swallowed
    else:
        nlp = spacy.load('en_core_web_sm')
    return nlp
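A hedged usage sketch of the loader above; the component names follow spaCy's standard pipeline, and disabling the parser and NER is a common way to speed up plain tokenization and tagging:

# Load a slimmed-down pipeline for faster processing; the component names are
# the standard spaCy ones and may vary by model version.
nlp = load_spacy_model(disable=['parser', 'ner'])
doc = nlp("Disabling unused components can make large batches much faster.")
print([token.text for token in doc])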
def train(filePath):
    try:
        if not filePath.lower().endswith('json'):
            return {'success': False, 'message': 'Training file should be in json format'}
        with open(filePath) as file:
            ent_data = json.load(file)
        dataset = [jsonToCrf(q, nlp) for q in ent_data['entity_examples']]
        X_train = [sent2features(s) for s in dataset]
        y_train = [sent2labels(s) for s in dataset]
        crf = sklearn_crfsuite.CRF(
            algorithm='lbfgs',
            c1=0.1,
            c2=0.1,
            max_iterations=100,
            all_possible_transitions=True
        )
        crf.fit(X_train, y_train)
        if not os.path.exists("crfModel"):
            os.mkdir("crfModel")
        if os.path.isfile("crfModel/classifier.pkl"):
            os.remove("crfModel/classifier.pkl")
        joblib.dump(crf, "crfModel/classifier.pkl")
        return {'success': True, 'message': 'Model Trained Successfully'}
    except Exception as ex:
        return {'success': False, 'message': 'Error while Training the model - ' + str(ex)}
def predict(utterance):
    try:
        tagged = []
        finallist = []
        parsed = nlp(utterance)
        for i in range(len(parsed)):
            tagged.append((str(parsed[i]), parsed[i].tag_))
        finallist.append(tagged)
        test = [sent2features(s) for s in finallist]
        if os.path.isfile("crfModel/classifier.pkl"):
            crf = joblib.load("crfModel/classifier.pkl")
        else:
            return {'success': False, 'message': 'Please Train the model first'}
        predicted = crf.predict(test)
        entityList = extractEntities(predicted[0], tagged)
        return {'success': True, 'entitiesPredicted': entityList}
    except Exception as ex:
        return {'success': False, 'message': 'Error while prediction - ' + str(ex)}
def __init__(self, language='en'):
    """
    Create a Parser object that will use Spacy for parsing.
    It uses Spacy and offers all the same languages that Spacy offers.
    Check out: https://spacy.io/usage/models.
    Note that the language model needs to be downloaded first (e.g. python -m spacy download en)

    :param language: Language to parse (en/de/es/pt/fr/it/nl)
    :type language: str
    """
    # We only load spacy if a Parser is created (to allow ReadTheDocs to build the documentation easily)
    import spacy

    acceptedLanguages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl']
    assert language in acceptedLanguages, "Language for parser (%s) not in accepted languages: %s" % (language, str(acceptedLanguages))

    self.language = language

    if language not in Parser.languageModels:
        Parser.languageModels[language] = spacy.load(language, disable=['ner'])

    self.nlp = Parser.languageModels[language]
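A brief usage sketch, under the assumption that Parser.languageModels is a class-level dict shared by all instances (it is referenced but not defined in the excerpt), so repeated construction for the same language reuses one loaded model:

# Hypothetical usage of the Parser above; languageModels is assumed to be
# a class attribute such as `languageModels = {}` on Parser.
p1 = Parser(language='en')
p2 = Parser(language='en')
assert p1.nlp is p2.nlp  # the spaCy model for 'en' is loaded only once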
def build_word_frequency_distribution():
    path = os.path.join(data_dir, 'word_freq.pickle')

    try:
        with open(path, 'rb') as freq_dist_f:
            freq_dist_f = pickle.load(freq_dist_f)
            print('frequency distribution loaded')
            return freq_dist_f
    except IOError:
        pass

    print('building frequency distribution')
    freq = defaultdict(int)
    for i, review in enumerate(read_reviews()):
        doc = en.tokenizer(review['text'])
        for token in doc:
            freq[token.orth_] += 1
        if i % 10000 == 0:
            with open(path, 'wb') as freq_dist_f:
                pickle.dump(freq, freq_dist_f)
            print('dump at {}'.format(i))
    return freq
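The excerpt above calls en.tokenizer(...) on a module-level model; en is presumably the result of an earlier spacy.load() not shown here. Running only the tokenizer skips tagging and parsing, which is what makes a full frequency pass over the corpus cheap. A minimal sketch of that pattern, with en_core_web_sm as an assumed model:

import spacy
from collections import defaultdict

# Assumption: any installed English pipeline works here; only its tokenizer is used.
en = spacy.load("en_core_web_sm")

freq = defaultdict(int)
for token in en.tokenizer("counting tokens with the tokenizer only is cheap"):
    freq[token.orth_] += 1
print(dict(freq))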
def build_vocabulary(lower=3, n=50000):
    try:
        with open(vocab_fn, 'rb') as vocab_file:
            vocab = pickle.load(vocab_file)
            print('vocabulary loaded')
            return vocab
    except IOError:
        print('building vocabulary')

    freq = build_word_frequency_distribution()
    top_words = list(sorted(freq.items(), key=lambda x: -x[1]))[:n-lower+1]

    vocab = {}
    i = lower
    for w, freq in top_words:
        vocab[w] = i
        i += 1

    with open(vocab_fn, 'wb') as vocab_file:
        pickle.dump(vocab, vocab_file)
    return vocab
def __init__(self, folder='result', batchsize=64, max_length=15, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./%s/vdict.json' % folder, 'r') as f:
        self.vdict = json.load(f)
    with open('./%s/adict.json' % folder, 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    self.glove_dict = {}  # word -> glove vector
def _set_tokenizer(self, tokenizer):
    """
    Set tokenizer

    :param tokenizer: tokenization method
    :return: None
    """
    if tokenizer == "nltk":
        self.tokenizer = nltk.word_tokenize
    elif tokenizer == "spacy":
        spacy_en = spacy.load("en")

        def spacy_tokenizer(seq):
            return [w.text for w in spacy_en(seq)]

        self.tokenizer = spacy_tokenizer
    else:
        raise ValueError("Invalid tokenizing method %s" % tokenizer)
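A standalone sketch of the "spacy" branch above, for readers who only need the tokenizer function itself; en_core_web_sm is an assumption, whereas the original project uses the legacy 'en' shortcut:

import spacy

# Build a simple word-level tokenizer on top of a loaded spaCy pipeline.
spacy_en = spacy.load("en_core_web_sm")

def spacy_tokenizer(seq):
    return [w.text for w in spacy_en(seq)]

print(spacy_tokenizer("Which tokenizer should be used?"))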
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    # self.max_length, self.qdic, self.adic = VQADataProvider.load_data(mode)
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def load_vqa_json(data_split):
    """
    Parses the question and answer json files for the given data split.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS[data_split]['ques_file'], 'r') as f:
        qdata = json.load(f)['questions']
        for q in qdata:
            qdic[data_split + QID_KEY_SEPARATOR + str(q['question_id'])] = \
                {'qstr': q['question'], 'iid': q['image_id']}

    if 'test' not in data_split:
        with open(config.DATA_PATHS[data_split]['ans_file'], 'r') as f:
            adata = json.load(f)['annotations']
            for a in adata:
                adic[data_split + QID_KEY_SEPARATOR + str(a['question_id'])] = \
                    a['answers']

    write_log('parsed ' + str(len(qdic)) + ' questions for ' + data_split, 'log.txt')
    return qdic, adic
def load_genome_json():
    """
    Parses the genome json file.
    Returns the question dictionary and the answer dictionary.
    """
    qdic, adic = {}, {}

    with open(config.DATA_PATHS['genome']['genome_file'], 'r') as f:
        qdata = json.load(f)
        for q in qdata:
            key = 'genome' + QID_KEY_SEPARATOR + str(q['id'])
            qdic[key] = {'qstr': q['question'], 'iid': q['image']}
            adic[key] = [{'answer': q['answer']}]

    write_log('parsed ' + str(len(qdic)) + ' questions for genome', 'log.txt')
    return qdic, adic
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION,
             max_w_length=config.LENGTH_OF_LONGEST_WORD, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.max_w_length = max_w_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/cdict.json', 'r') as f:
        self.cdict = json.load(f)
    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    # self.max_length, self.qdic, self.adic = VQADataProvider.load_data(mode)
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def __init__(self, batchsize=64, max_length=config.MAX_WORDS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/vdict.json', 'r') as f:
        self.vdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector
def __init__(self, batchsize=64, max_length=config.MAX_CHARS_IN_QUESTION, mode='train'):
    self.batchsize = batchsize
    self.d_vocabulary = None
    self.batch_index = None
    self.batch_len = None
    self.rev_adict = None
    self.max_length = max_length
    self.mode = mode
    self.qdic, self.adic = VQADataProvider.load_data(mode)

    with open('./result/cdict.json', 'r') as f:
        self.cdict = json.load(f)
    with open('./result/adict.json', 'r') as f:
        self.adict = json.load(f)
    self.n_ans_vocabulary = len(self.adict)

    # self.nlp = spacy.load('en', vectors='en_glove_cc_300_1m_vectors')
    # self.glove_dict = {}  # word -> glove vector