The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.download().
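Before the project examples, here is a minimal sketch of the pattern most of them follow: probe for a resource with nltk.data.find() and call nltk.download() only when it is missing. The helper name ensure_resource and the chosen resource names ('punkt', 'stopwords') are illustrative assumptions, not part of any project below.

import nltk

def ensure_resource(resource_path, package_name):
    """Download an NLTK package only if the resource is not already present."""
    try:
        nltk.data.find(resource_path)            # e.g. 'tokenizers/punkt'
    except LookupError:
        nltk.download(package_name, quiet=True)  # e.g. 'punkt'

# Usage: make sure the Punkt sentence tokenizer and the stopword list exist.
ensure_resource('tokenizers/punkt', 'punkt')
ensure_resource('corpora/stopwords', 'stopwords')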
def get_only_text_washingtonpost_url(url):
    # this func will take the URL as an argument and return only
    # the raw text of the url.
    # this function works specifically for the washPost articles
    # because we know the structure of the pages
    page = urllib.urlopen(url).read().decode('utf8')  # we download the URL
    soup = BeautifulSoup(page)  # initialize a beautifulsoup object with the page we downloaded
    text = ' '.join(map(lambda p: p.text, soup.find_all('article')))
    # the above gets everything between a pair of HTML tags
    # that look a certain way e.g. <article> stuff </article>
    # the above format is specific to the washington post
    soup2 = BeautifulSoup(text)
    # find all the paragraph tags <p>
    text = ' '.join(map(lambda p: p.text, soup2.find_all('p')))
    return soup.title.text, text

#######################################################################
# TEST
######################################################################
def activate(self, *args, **kwargs):
    np.random.seed(1337)  # for reproducibility
    st = datetime.now()
    self._classifierModel = load_model(self.savedModelPath)
    logger.info("{} {}".format(datetime.now() - st, "loaded _classifierModel"))
    st = datetime.now()
    self._tokenizer = self.get_tokenizer()
    logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer"))
    #st = datetime.now()
    #nltk.download()
    #self._tokenizer_nltk = nltk.data.load('tokenizers/punkt/english.pickle')
    #logger.info("{} {}".format(datetime.now() - st, "loaded _tokenizer_nltk"))
    logger.info("SuggestionMiningDL plugin is ready to go!")
def prepare_data():
    make_dirs("data/cache")
    make_dirs("data/embedding/char")
    make_dirs("data/embedding/word")
    make_dirs("data/squad")
    make_dirs("data/trained_model")
    make_dirs("checkpoint")

    nltk.download("punkt")

    train_filename = "train-v1.1.json"
    dev_filename = "dev-v1.1.json"
    squad_base_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/"
    train_url = os.path.join(squad_base_url, train_filename)
    dev_url = os.path.join(squad_base_url, dev_filename)

    download_prefix = os.path.join("data", "squad")
    maybe_download(train_url, download_prefix, train_filename)
    maybe_download(dev_url, download_prefix, dev_filename)

    char_embedding_pretrain_url = "https://raw.githubusercontent.com/minimaxir/char-embeddings/master/glove.840B.300d-char.txt"
    char_embedding_filename = "glove_char.840B.300d.txt"
    maybe_download(char_embedding_pretrain_url, "data/embedding/char", char_embedding_filename)
def download():
    '''
    Install required libraries.
    Note this library will install nltk dependencies into your user directory.
    '''
    click.echo("Installing nltk packages into your user directories in " +
               "the following order of existence (first found):\n" +
               '\n'.join(nltk.data.path))

    extensions = [("taggers", "averaged_perceptron_tagger"),
                  ("corpora", "wordnet"),
                  ("tokenizers", "punkt")]

    missing = check_packages_exist(extensions)

    for ext_tuple in missing:
        nltk.download(ext_tuple[1])
def retrieve_onet_titles(self):
    onet_titles = pd.concat(
        (pd.read_csv(self.onet_downloader.download(
            version,
            'Occupation Data.txt',
            'occupation_data.txt'
        ), sep='\t') for version in ONET_VERSIONS),
        ignore_index=True
    )
    # Assumes pandas 0.19, keeps newest duplicate Title
    onet_titles.drop_duplicates('Title', inplace=True, keep='last')
    onet_titles['Major'] = onet_titles.iloc[:, 0].apply(lambda x: x[:2])

    LOWER = True
    if LOWER:  # all RDD strings are unicode
        onet_titles['Title'] = onet_titles['Title'].str.lower()
        onet_titles['Description'] = onet_titles['Description'].str.lower()

    # now we can do a title -> Major, Minor lookup
    onet_titles.set_index('Title', inplace=True)
    # access with onet_titles.loc[u'Sales Agents, Financial Services']

    return onet_titles
def __init__(self, opt, embedding_dim):
    """Initialize the class according to given parameters."""
    self.tok2emb = {}
    self.embedding_dim = embedding_dim
    self.opt = copy.deepcopy(opt)
    self.load_items()

    nltk.download('punkt')

    if not self.opt.get('fasttext_model'):
        raise RuntimeError('No pretrained fasttext model provided')
    self.fasttext_model_file = self.opt.get('fasttext_model')
    if not os.path.isfile(self.fasttext_model_file):
        emb_path = os.environ.get('EMBEDDINGS_URL')
        if not emb_path:
            raise RuntimeError('No pretrained fasttext model provided')
        fname = os.path.basename(self.fasttext_model_file)
        try:
            print('Trying to download a pretrained fasttext model from the repository')
            url = urllib.parse.urljoin(emb_path, fname)
            urllib.request.urlretrieve(url, self.fasttext_model_file)
            print('Downloaded a fasttext model')
        except Exception as e:
            raise RuntimeError('Looks like the `EMBEDDINGS_URL` variable is set incorrectly', e)
    self.fasttext_model = fasttext.load_model(self.fasttext_model_file)
def main():
    nltk_deps = ['punkt', 'averaged_perceptron_tagger']
    print 'Checking nltk deps...'
    map(nltk.download, nltk_deps)
    print 'nltk deps done'
def __init__(self, tokenizer_type="PTBTokenizer"):
    # Sanity checks
    if tokenizer_type in ['SpaceTokenizer', 'NLTKWhiteSpaceTokenizer', 'PTBTokenizer']:
        self.tokenizer_type = tokenizer_type
    else:
        print("Unrecognized tokenizer type : setting back to default (PTBTokenizer)")
        self.tokenizer_type = "PTBTokenizer"

    try:
        nltk.data.find('punkt.zip')
    except LookupError:
        nltk.download('punkt')
def __init__(self):
    try:
        nltk.data.find('taggers/averaged_perceptron_tagger/averaged_perceptron_tagger.pickle')
    except LookupError:
        nltk.download('averaged_perceptron_tagger')
def load_nltk_data():
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('snowball_data')
def download_packages(self):
    import nltk
    for x in [comp for comp in self._missing if "/" in comp]:
        package = x.split("/")[1]
        self.updateLabel.emit(package)
        nltk.download(package, raise_on_error=True)
        self.progressTheBar.emit()
def prepare():
    """Prepare tagger for run.

    This should be run after installation to initialize tagger's resources.
    """
    import nltk
    import requests
    from libarchive import extract_memory
    import os
    from shutil import move
    from f8a_tagger.utils import get_files_dir

    nltk.download("punkt")
    nltk.download("wordnet")

    maven_index_checker_url = 'https://github.com/fabric8-analytics/' \
                              'maven-index-checker/files/1275145/' \
                              'maven-index-checker-v0.1-alpha.zip'
    response = requests.get(maven_index_checker_url)
    if response.ok is not True:
        raise RemoteDependencyMissingError("Failed to download maven-index-checker with "
                                           "response code %s",
                                           response.status_code)

    # Unfortunately no way how to know name or path of extracted file,
    # so assume it's maven-index-checker.jar
    jar_name = "maven-index-checker.jar"
    jar_path = get_files_dir()
    extract_memory(response.content)
    move(jar_name, os.path.join(jar_path, jar_name))
def run(self):
    # setuptools is an oldie goldie. super() is not supported by base class (it's an "old style class")
    SetuptoolsInstallCommand.do_egg_install(self)
    import nltk
    for corpus in _required_nltk_corpora:
        nltk.download(corpus)
def install_nltk_corpora(*packages):
    nltk_packages = list(packages)
    try:
        installed = (set(os.listdir(nltk.data.find("corpora"))) |
                     set(os.listdir(nltk.data.find("taggers"))) |
                     set(os.listdir(nltk.data.find("tokenizers"))))
    except LookupError:
        installed = set()
    if not set(nltk_packages) <= set(installed):
        nltk.download(nltk_packages)
def run(self):
    # PUT YOUR POST-INSTALL SCRIPT HERE or CALL A FUNCTION
    import nltk
    nltk.download('punkt')
    install.run(self)
def ensure_nltk_packages():
    for package in ('stopwords', 'punkt', 'wordnet'):
        nltk.download(package, quiet=True)
def download_nltk_resource_if_missing(resource_path, resource):
    """
    Download a missing resource from the Natural Language Processing Toolkit.

    :param resource_path: Link / path for NLTK resource.
    :type resource_path: str
    :param resource: Identifier / name of resource (will be used to download the resource if it's not found).
    :type resource: str
    """
    try:
        nltk.data.find(resource_path)
    except LookupError:
        nltk.download(resource)
def download():
    """
    Download reuters data and stopwords if not already present
    """
    nltk.download("reuters")
    nltk.download("stopwords")
def __init__(self, num_topics=6, num_iterations=500, random_state=None, clean_text=True, vectorizer=None):
    """
    Init for LDA estimator

    :param num_topics: Number of topics to model (generally 3-10)
    :type num_topics: int
    :param num_iterations: Number of iterations to allow before locking in topics
    :type num_iterations: int
    :param random_state: Random seed, for consistent topics
    :type random_state: int
    :param clean_text: Whether to clean text using self.preprocess(). Recommended if you have not
        preprocessed the text already
    :type clean_text: bool
    :param vectorizer: Word vectorizer to use. The word vectorizer should convert a collection of
        text documents to a matrix of token counts
    """
    self.num_topics = num_topics
    self.num_iterations = num_iterations
    self.random_state = random_state
    self.lda_model = lda.LDA(n_topics=self.num_topics, n_iter=self.num_iterations,
                             random_state=self.random_state)
    self.clean_text = clean_text
    self.get_topic_description_df = None

    if vectorizer is not None:
        self.vectorizer = vectorizer
    else:
        self.vectorizer = CountVectorizer()

    # Make sure nltk has required data sets
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('wordnet')
def __init__(self, onet_source=OnetSourceDownloader):
    self.onet_downloader = onet_source()
    self.onet_titles = self.retrieve_onet_titles()
    logging.info('Retrieved onet titles')
    # ... Following the ESA description:
    # https://en.wikipedia.org/wiki/Explicit_semantic_analysis
    self.tfidf_vectorizer = TfidfVectorizer(stop_words='english')
    # optimization note: convert from CSR to CSC
    self.tf = self.tfidf_vectorizer.fit_transform(self.onet_titles['Description'].values)
    self.concept_row = self.onet_titles.index.values

    try:
        wn.synset
    except LookupError:
        nltk.download('wordnet')
def download_lite():
    for each in MIN_CORPORA:
        nltk.download(each)
def download_all():
    for each in ALL_CORPORA:
        nltk.download(each)
def nltkDownload(self):
    try:
        nltk.data.find("tokenizers")
    except LookupError:
        #self.dis.spinner("Downloading NLTK Data")
        print("No NLTK data found, downloading now...")
        nltk.download("all")
        #self.dis.stop()

# The searcher finds tweets in the database with the search term handed to it.
# It will return the tweets, the term, and the number of times it appears in
# the database in a dictionary.
# It must be handed:
#     * a search term as a string
def run():
    nltk.download('punkt')
def _post():
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
def __init__(self, wiki, vocab, n_consec):
    self.wiki = wiki
    self.vocab = vocab
    self.n_consec = n_consec  # number of consecutive sections that are used to form a query
    nltk.download('punkt')
    self.tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
def nltk_download_packages():
    nltk.download("words")
    nltk.download("brown")
    nltk.download("abc")
    nltk.download("inaugural")
    nltk.download("genesis")
def tokenize(str_stream, eos=True, remove_punct=False):
    """
    Given a str or str_stream (f.read()) convert the str to a list of sentences,
        e.g.: [[word, word], [word, word, ...], ...]
    :param str_stream: a str or a str_stream
    :param eos: whether to turn '.' into an <eos> tag
    :param remove_punct: whether to remove punctuations: ':', ';', '--', ',', "'"
    :return: a list of sentences, each sentence is a list of words (str)
    """
    # do lazy import coz import nltk is very slow
    import nltk
    try:
        nltk.data.load('tokenizers/punkt/english.pickle')
    except LookupError:
        print('punct resource not found, using nltk.download("punkt") to download resource data...')
        nltk.download('punkt')
    tokens = [nltk.word_tokenize(t) for t in nltk.sent_tokenize(str_stream.lower())]
    # get POS Tags
    tokens_tags = nltk.pos_tag_sents(tokens, tagset='universal')
    pos_tags = []
    for token_tags in tokens_tags:
        _, tags = zip(*token_tags)
        pos_tags.append(tags)
    # tag number
    tokens = [['N' if isfloat(t) else t for t in sublist] for sublist in tokens]
    if eos:
        for token in tokens:
            token[-1] = '<eos>'
    if remove_punct:
        tokens = [[t for t in sublist if t not in __punct_set] for sublist in tokens]
    return tokens, pos_tags
def load_movie_reviews():
    # movie_reviews is a sizeable corpus to import, so only load it if we have to
    from nltk.corpus import movie_reviews
    try:
        movie_reviews.categories()
    except:
        import nltk
        print('This appears to be your first time using the NLTK Movie Reviews corpus. '
              'We will first download the necessary corpus '
              '(this is a one-time download that might take a little while)')
        nltk.download('movie_reviews')
        from nltk.corpus import movie_reviews

    raw_data = []

    # NLTK's corpus is structured in an interesting way
    # first iterate through the two categories (pos and neg)
    for category in movie_reviews.categories():
        if category == 'pos':
            pretty_category_name = 'positive'
        elif category == 'neg':
            pretty_category_name = 'negative'

        # each of these categories is just fileids, so grab those
        for fileid in movie_reviews.fileids(category):
            # then each review is a NLTK class where each item in that class instance is a word
            review_words = movie_reviews.words(fileid)
            review_text = ''
            for word in review_words:
                review_text += ' ' + word
            review_dictionary = {
                'text': review_text,
                'sentiment': pretty_category_name
            }
            raw_data.append(review_dictionary)

    return raw_data
def download():
    """skip unverified certificate and show download dialog"""
    try:
        create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = create_unverified_https_context
    nltk.download()
def download(self, name: str) -> None:
    if not self.exists(name):
        nltk.download(name, download_dir=self.nltk_dir)
def run(self):
    import nltk
    from memex_dossier.models.tests.test_features import nltk_data_packages
    for data_name in nltk_data_packages:
        print('nltk.download(%r)' % data_name)
        nltk.download(data_name)
def nltk_data():
    for data_name in nltk_data_packages:
        print('nltk.download(%r)' % data_name)
        nltk.download(data_name)
def setup_nltk(self, **kw):
    import nltk
    from nltk.data import find

    tagger = "averaged_perceptron_tagger"
    try:
        find("taggers/%s" % tagger)
    except LookupError:
        click.echo("Downloading NLTK data (~2MB)...")
        nltk.download(tagger)
        return True
    return False
def initstopwords(self):
    try:
        s = set(stopwords.words('english'))
    except LookupError as e:
        import nltk
        nltk.download()
        s = set(stopwords.words('english'))
    st = LancasterStemmer()
    for each in s:
        self.stopwords.append(st.stem(each))

# Given a dictionary of key: frequency, value: array of words
# build the opposite
def install():
    for d in dependencies:
        pip.main(['install', d])

    # after nltk module was installed
    import nltk
    for data in nltk_data:
        nltk.download(data)
def build_dict_from_nltk(output_file, corpus=None, stopwords=None, stemmer=Stemmer(),
                         measure='IDF', verbose=False):
    '''
    @param output_file: the name of the file where the dictionary should be saved
    @param corpus:      the NLTK corpus to use (defaults to nltk.corpus.reuters)
    @param stopwords:   a list of (not stemmed) stopwords (defaults to
                        nltk.corpus.reuters.words('stopwords'))
    @param stemmer:     the L{Stemmer} object to be used
    @param measure:     the measure used to compute the weights ('IDF'
                        i.e. 'inverse document frequency' or 'ICF' i.e.
                        'inverse collection frequency'; defaults to 'IDF')
    @param verbose:     whether information on the progress should be printed
                        on screen
    '''
    from build_dict import build_dict
    import nltk
    import pickle

    if not (corpus and stopwords):
        nltk.download('reuters')

    corpus = corpus or nltk.corpus.reuters
    stopwords = stopwords or nltk.corpus.reuters.words('stopwords')

    corpus_list = []

    if verbose:
        print 'Processing corpus...'
    for file in corpus.fileids():
        doc = [stemmer(Tag(w.lower())).stem for w in corpus.words(file)
               if w[0].isalpha()]
        corpus_list.append(doc)

    if verbose:
        print 'Processing stopwords...'
    stopwords = [stemmer(Tag(w.lower())).stem for w in stopwords]

    if verbose:
        print 'Building dictionary... '
    dictionary = build_dict(corpus_list, stopwords, measure)
    with open(output_file, 'wb') as out:
        pickle.dump(dictionary, out, -1)
def download():
    nltk.download()
def download_nltk_data(package_name='all'):
    """
    download necessary data from NLTK

    args:
        package_name: string containing the package name to install
    returns:
        None
    """
    if package_name == 'all':
        data = ['punkt', 'wordnet', 'stopwords', 'averaged_perceptron_tagger']
        for package in data:
            download(package)
    else:
        download(package_name)
def create_data_paths():
    if not os.path.isdir(DATA_DIR):
        raise EnvironmentError('Needs to be run from project directory containing ' + DATA_DIR)
    needed_paths = [
        os.path.join(DATA_DIR, 'samples'),
        os.path.join(DATA_DIR, 'val_samples'),
        os.path.join(DATA_DIR, 'Models'),
    ]
    for p in needed_paths:
        make_sure_path_exists(p)

# adapted from http://stackoverflow.com/questions/51212/how-to-write-a-download-progress-indicator-in-python
def _sentence_tokenizer(self, language):
    try:
        path = to_string("tokenizers/punkt/%s.pickle") % to_string(language)
        return nltk.data.load(path)
    except (LookupError, zipfile.BadZipfile):
        raise LookupError(
            "NLTK tokenizers are missing. Download them by following command: "
            '''python -c "import nltk; nltk.download('punkt')"'''
        )
def english_sentence_segment(text):
    """segment text into sentences"""
    try:
        sent_detector = nltk.data.load(
            'tokenizers/punkt/english.pickle'
        )
        extra_abbrev = ["e.g", "al", "i.e"]
        sent_detector._params.abbrev_types.update(extra_abbrev)
        return sent_detector.tokenize(text)
    except LookupError as e:
        raise LookupError(
            "NLTK tokenizers are missing. Download them by following command: "
            '''python -c "import nltk; nltk.download('punkt')"'''
        )
def download_preferences(self):
    import nltk  # importing the natural language processing module
    nltk.download()  # opening the gui based Natural language processing download kit