Python spacy 模块,io() 实例源码

我们从Python开源项目中,提取了以下6个代码示例。这些示例均在文档字符串或提示信息中引用了 spaCy 官网 https://spacy.io(注意:spacy.io 是网址,并不是可调用的函数)。

项目:kindred    作者:jakelever    | 项目源码 | 文件源码
def __init__(self,language='en'):
        """
        Build a Parser that delegates parsing to a Spacy language model.

        The requested model is loaded with the 'ner' component disabled and
        is stored in ``Parser.languageModels`` keyed by language code, so a
        model already present there is reused rather than loaded again.
        The model must have been downloaded beforehand
        (e.g. python -m spacy download en). See https://spacy.io/usage/models.

        :param language: Language to parse (en/de/es/pt/fr/it/nl)
        :type language: str
        """

        # Deferred import: spacy is only needed once a Parser is actually
        # constructed, which lets ReadTheDocs build the docs without it.
        import spacy

        supported = ['en','de','es','pt','fr','it','nl']
        assert language in supported, "Language for parser (%s) not in accepted languages: %s" % (language,str(supported))

        self.language = language

        # Class-level cache of loaded models, keyed by language code.
        loaded = Parser.languageModels
        if language not in loaded:
            loaded[language] = spacy.load(language, disable=['ner'])

        self.nlp = loaded[language]
项目:keras-text    作者:raghakot    | 项目源码 | 文件源码
def __init__(self,
                 lang='en',
                 lower=True,
                 lemmatize=False,
                 remove_punct=True,
                 remove_digits=True,
                 remove_stop_words=False,
                 exclude_oov=False,
                 exclude_pos_tags=None,
                 exclude_entities=['PERSON']):
        """Tokenizer that encodes text into `(samples, words)`.

        Args:
            lang: Spacy language to use. (Default value: 'en')
            lower: When True, tokens are lower cased. (Default value: True)
            lemmatize: When True, words are lemmatized. Lemmatizing also lower cases
                the word irrespective of the `lower` setting. (Default value: False)
            remove_punct: When True, punct words are removed. (Default value: True)
            remove_digits: When True, digit words are removed. (Default value: True)
            remove_stop_words: When True, stop words are removed. (Default value: False)
            exclude_oov: Exclude words that are out of spacy embedding's vocabulary.
                By default, GloVe 1 million, 300 dim are used. You can override spacy vocabulary with a custom
                embedding to change this. (Default value: False)
            exclude_pos_tags: A list of parts of speech tags to exclude. Can be any of
                spacy.parts_of_speech.IDS. `None` or an empty list excludes nothing.
                (Default value: None)
            exclude_entities: A list of entity types to be excluded. `None` or an empty
                list excludes nothing. Supported entity types can be found here:
                https://spacy.io/docs/usage/entity-recognition#entity-types
                (Default value: ['PERSON'])
        """

        super(WordTokenizer, self).__init__(lang, lower)

        # Exclusion lists are copied into sets; a falsy value becomes an empty set.
        self.exclude_oov = exclude_oov
        self.exclude_pos_tags = set(exclude_pos_tags) if exclude_pos_tags else set()
        self.exclude_entities = set(exclude_entities) if exclude_entities else set()

        # Plain on/off switches are stored as given.
        self.lemmatize = lemmatize
        self.remove_punct = remove_punct
        self.remove_digits = remove_digits
        self.remove_stop_words = remove_stop_words
项目:keras-text    作者:raghakot    | 项目源码 | 文件源码
def __init__(self,
                 lang='en',
                 lower=True,
                 lemmatize=False,
                 remove_punct=True,
                 remove_digits=True,
                 remove_stop_words=False,
                 exclude_oov=False,
                 exclude_pos_tags=None,
                 exclude_entities=['PERSON']):
        """Tokenizer that encodes text into `(samples, sentences, words)`.

        Every argument is forwarded unchanged, in order, to the parent initializer.

        Args:
            lang: Spacy language to use. (Default value: 'en')
            lower: When True, tokens are lower cased. (Default value: True)
            lemmatize: When True, words are lemmatized. Lemmatizing also lower cases
                the word irrespective of the `lower` setting. (Default value: False)
            remove_punct: When True, punct words are removed. (Default value: True)
            remove_digits: When True, digit words are removed. (Default value: True)
            remove_stop_words: When True, stop words are removed. (Default value: False)
            exclude_oov: Exclude words that are out of spacy embedding's vocabulary.
                By default, GloVe 1 million, 300 dim are used. You can override spacy vocabulary with a custom
                embedding to change this. (Default value: False)
            exclude_pos_tags: A list of parts of speech tags to exclude. Can be any of
                spacy.parts_of_speech.IDS (Default value: None)
            exclude_entities: A list of entity types to be excluded.
                Supported entity types can be found here:
                https://spacy.io/docs/usage/entity-recognition#entity-types
                (Default value: ['PERSON'])
        """
        # Positional pass-through, in the same order as this signature.
        super(SentenceWordTokenizer, self).__init__(
            lang, lower, lemmatize,
            remove_punct, remove_digits, remove_stop_words,
            exclude_oov, exclude_pos_tags, exclude_entities)
项目:rasa_nlu    作者:RasaHQ    | 项目源码 | 文件源码
def ensure_proper_language_model(nlp):
        # type: (Optional[Language]) -> None
        """Validate a loaded spacy language model.

        Returns None when `nlp` is not None and its `path` attribute is set.

        Raises:
            Exception: if `nlp` is None, or if `nlp.path` is None.
        """

        if nlp is None:
            raise Exception("Failed to load spacy language model. Loading the model returned 'None'.")

        if nlp.path is not None:
            return

        # Spacy sets the path to `None` if it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        message = ("Failed to load spacy language model for lang '{}'. ".format(nlp.lang) +
                   "Make sure you have downloaded the correct model (https://spacy.io/docs/usage/).")
        raise Exception(message)
项目:Rasa_NLU_Chi    作者:crownpku    | 项目源码 | 文件源码
def ensure_proper_language_model(nlp):
        # type: (Optional[Language]) -> None
        """Raise if the given spacy language model is missing or unusable.

        A model is rejected when it is None or when its `path` attribute is
        None; otherwise the function returns None without side effects.
        """

        if nlp is None:
            raise Exception("Failed to load spacy language model. Loading the model returned 'None'.")

        # Spacy sets the path to `None` if it did not load the model from disk.
        # In this case `nlp` is an unusable stub.
        loaded_from_disk = nlp.path is not None
        if not loaded_from_disk:
            reason = "Failed to load spacy language model for lang '{}'. ".format(nlp.lang)
            hint = "Make sure you have downloaded the correct model (https://spacy.io/docs/usage/)."
            raise Exception(reason + hint)
项目:text    作者:pytorch    | 项目源码 | 文件源码
def get_tokenizer(tokenizer):
    """Resolve a tokenizer specification into a tokenizing callable.

    Args:
        tokenizer: Either a callable, which is returned unchanged, or one of
            the names "spacy", "moses", "revtok" or "subword".

    Returns:
        A callable for the requested tokenizer. Backing libraries are
        imported lazily, only for the name that was requested.

    Raises:
        ImportError, AttributeError, LookupError: re-raised after printing an
            installation hint, when setting up the requested backend fails
            with one of these errors.
        ValueError: if `tokenizer` is neither callable nor a known name.
    """
    if callable(tokenizer):
        return tokenizer

    # Each branch below either returns or re-raises, so plain `if`s suffice.
    if tokenizer == "spacy":
        try:
            import spacy
            nlp = spacy.load('en')
            return lambda s: [tok.text for tok in nlp.tokenizer(s)]
        except (ImportError, AttributeError):
            print("Please install SpaCy and the SpaCy English tokenizer. "
                  "See the docs at https://spacy.io for more information.")
            raise

    if tokenizer == "moses":
        try:
            from nltk.tokenize.moses import MosesTokenizer
            return MosesTokenizer().tokenize
        except ImportError:
            print("Please install NLTK. "
                  "See the docs at http://nltk.org for more information.")
            raise
        except LookupError:
            print("Please install the necessary NLTK corpora. "
                  "See the docs at http://nltk.org for more information.")
            raise

    if tokenizer == 'revtok':
        try:
            import revtok
            return revtok.tokenize
        except ImportError:
            print("Please install revtok.")
            raise

    if tokenizer == 'subword':
        try:
            import revtok
            return lambda x: revtok.tokenize(x, decap=True)
        except ImportError:
            print("Please install revtok.")
            raise

    raise ValueError("Requested tokenizer {}, valid choices are a "
                     "callable that takes a single string as input, "
                     "\"revtok\" for the revtok reversible tokenizer, "
                     "\"subword\" for the revtok caps-aware tokenizer, "
                     "\"spacy\" for the SpaCy English tokenizer, or "
                     "\"moses\" for the NLTK port of the Moses tokenization "
                     "script.".format(tokenizer))