我们从 Python 开源项目中提取了以下 6 个代码示例,用于说明如何使用 spacy 库(参见 https://spacy.io)。
def __init__(self, language='en'):
    """Create a Parser object that will use Spacy for parsing.

    It uses Spacy and offers all the same languages that Spacy offers.
    Check out: https://spacy.io/usage/models. Note that the language model
    needs to be downloaded first (e.g. ``python -m spacy download en``).

    :param language: Language to parse (en/de/es/pt/fr/it/nl)
    :type language: str
    :raises ValueError: if ``language`` is not one of the accepted languages
    """
    # We only load spacy if a Parser is created (to allow ReadTheDocs to
    # build the documentation easily)
    import spacy

    accepted_languages = ['en', 'de', 'es', 'pt', 'fr', 'it', 'nl']
    # Raise instead of assert: asserts are stripped under ``python -O``,
    # which would silently disable this input validation.
    if language not in accepted_languages:
        raise ValueError(
            "Language for parser (%s) not in accepted languages: %s"
            % (language, str(accepted_languages)))

    self.language = language
    # Cache loaded pipelines on the class so every Parser for the same
    # language shares one spacy model (loading a model is expensive).
    if language not in Parser.languageModels:
        Parser.languageModels[language] = spacy.load(language, disable=['ner'])
    self.nlp = Parser.languageModels[language]
def __init__(self, lang='en', lower=True, lemmatize=False, remove_punct=True,
             remove_digits=True, remove_stop_words=False, exclude_oov=False,
             exclude_pos_tags=None, exclude_entities=('PERSON',)):
    """Encodes text into `(samples, words)`

    Note: the ``exclude_entities`` default is an immutable tuple rather than
    the list ``['PERSON']`` to avoid the shared-mutable-default-argument
    pitfall; the value is only ever read via ``set(...)``, so behavior is
    unchanged.

    Args:
        lang: The spacy language to use. (Default value: 'en')
        lower: Lower cases the tokens if True. (Default value: True)
        lemmatize: Lemmatizes words when set to True. This also makes the
            word lower case irrespective if the `lower` setting.
            (Default value: False)
        remove_punct: Removes punct words if True. (Default value: True)
        remove_digits: Removes digit words if True. (Default value: True)
        remove_stop_words: Removes stop words if True. (Default value: False)
        exclude_oov: Exclude words that are out of spacy embedding's
            vocabulary. By default, GloVe 1 million, 300 dim are used. You
            can override spacy vocabulary with a custom embedding to change
            this. (Default value: False)
        exclude_pos_tags: A list of parts of speech tags to exclude. Can be
            any of spacy.parts_of_speech.IDS (Default value: None)
        exclude_entities: A list of entity types to be excluded. Supported
            entity types can be found here:
            https://spacy.io/docs/usage/entity-recognition#entity-types
            (Default value: ('PERSON',))
    """
    super(WordTokenizer, self).__init__(lang, lower)
    self.lemmatize = lemmatize
    self.remove_punct = remove_punct
    self.remove_digits = remove_digits
    self.remove_stop_words = remove_stop_words
    self.exclude_oov = exclude_oov
    # Normalize to sets for O(1) membership tests; ``or []`` maps a None
    # (or empty) argument to an empty set exactly as before.
    self.exclude_pos_tags = set(exclude_pos_tags or [])
    self.exclude_entities = set(exclude_entities or [])
def __init__(self, lang='en', lower=True, lemmatize=False, remove_punct=True,
             remove_digits=True, remove_stop_words=False, exclude_oov=False,
             exclude_pos_tags=None, exclude_entities=('PERSON',)):
    """Encodes text into `(samples, sentences, words)`

    Note: the ``exclude_entities`` default is an immutable tuple rather than
    the list ``['PERSON']`` to avoid the shared-mutable-default-argument
    pitfall; the base class only reads it via ``set(...)``, so behavior is
    unchanged.

    Args:
        lang: The spacy language to use. (Default value: 'en')
        lower: Lower cases the tokens if True. (Default value: True)
        lemmatize: Lemmatizes words when set to True. This also makes the
            word lower case irrespective if the `lower` setting.
            (Default value: False)
        remove_punct: Removes punct words if True. (Default value: True)
        remove_digits: Removes digit words if True. (Default value: True)
        remove_stop_words: Removes stop words if True. (Default value: False)
        exclude_oov: Exclude words that are out of spacy embedding's
            vocabulary. By default, GloVe 1 million, 300 dim are used. You
            can override spacy vocabulary with a custom embedding to change
            this. (Default value: False)
        exclude_pos_tags: A list of parts of speech tags to exclude. Can be
            any of spacy.parts_of_speech.IDS (Default value: None)
        exclude_entities: A list of entity types to be excluded. Supported
            entity types can be found here:
            https://spacy.io/docs/usage/entity-recognition#entity-types
            (Default value: ('PERSON',))
    """
    # All settings are handled by the parent tokenizer; this subclass only
    # changes the encoded shape (adds a sentence axis).
    super(SentenceWordTokenizer, self).__init__(lang, lower, lemmatize,
                                                remove_punct, remove_digits,
                                                remove_stop_words, exclude_oov,
                                                exclude_pos_tags,
                                                exclude_entities)
def ensure_proper_language_model(nlp):
    # type: (Optional[Language]) -> None
    """Checks if the spacy language model is properly loaded.

    Raises an exception if the model is invalid.

    :param nlp: the loaded spacy pipeline, or ``None`` if loading failed
    :raises ValueError: if ``nlp`` is ``None`` or is a stub that was not
        loaded from disk. ``ValueError`` subclasses ``Exception``, so any
        existing ``except Exception`` callers keep working.
    """
    if nlp is None:
        raise ValueError("Failed to load spacy language model. Loading the model returned 'None'.")
    if nlp.path is None:
        # Spacy sets the path to `None` if it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        raise ValueError("Failed to load spacy language model for lang '{}'. ".format(nlp.lang) +
                         "Make sure you have downloaded the correct model (https://spacy.io/docs/usage/).")
def get_tokenizer(tokenizer):
    """Resolve ``tokenizer`` to a tokenizing callable.

    A callable argument is returned unchanged. The strings "spacy", "moses",
    "revtok" and "subword" select the corresponding tokenizer package; a
    missing package is reported on stdout and the original import/lookup
    error is re-raised. Any other value raises ``ValueError``.
    """
    if callable(tokenizer):
        return tokenizer

    if tokenizer == "spacy":
        try:
            import spacy
            spacy_en = spacy.load('en')
            return lambda s: [tok.text for tok in spacy_en.tokenizer(s)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise

    if tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            return MosesTokenizer().tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise

    if tokenizer in ("revtok", "subword"):
        try:
            import revtok
        except ImportError:
            print("Please install revtok.")
            raise
        if tokenizer == "subword":
            # Caps-aware variant of the revtok tokenizer.
            return lambda x: revtok.tokenize(x, decap=True)
        return revtok.tokenize

    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))