The following code examples, collected from open-source Python projects, illustrate how to use the nltk.data module.
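Before the project snippets, here is a minimal sketch (not taken from any of the projects below) of the three nltk.data entry points most of the examples rely on: nltk.data.path, nltk.data.find, and nltk.data.load. It assumes the punkt resource has already been fetched with nltk.download('punkt').

import nltk.data

# Directories that NLTK searches for downloaded resources.
print(nltk.data.path)

# Locate a resource on disk; raises LookupError if it is missing.
punkt_path = nltk.data.find('tokenizers/punkt/english.pickle')

# Load (and unpickle) the resource in one step.
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
print(tokenizer.tokenize("NLTK ships a sentence splitter. It is loaded via nltk.data."))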
def copy_packages(self):
    import nltk.data
    target_path = nltk.data.path[0]
    for x in [comp for comp in self._missing if "/" in comp]:
        parts = x.split("/")
        subdir = os.path.join(target_path, parts[0])
        package = parts[1]
        zip_name = "{}.zip".format(package)
        self.updateLabel.emit(package)
        src = os.path.join(_NLTK_dir, zip_name)
        dst = os.path.join(subdir, zip_name)
        if not os.path.exists(subdir):
            os.makedirs(subdir)
        if os.path.exists(src):
            shutil.copyfile(src, dst)
        else:
            raise ValueError("Package file {}.zip not found in {}".format(package, _NLTK_dir))
        with zipfile.ZipFile(dst) as zipped:
            for member in zipped.infolist():
                zipped.extract(member, subdir)
        self.progressTheBar.emit()
def batch_iter(data, batch_size, num_epochs, shuffle=True):
    """
    Generates a batch iterator for a dataset.
    """
    data = np.array(data)
    data_size = len(data)
    num_batches_per_epoch = int(len(data) / batch_size) + 1
    for epoch in range(num_epochs):
        # Shuffle the data at each epoch
        if shuffle:
            shuffle_indices = np.random.permutation(np.arange(data_size))
            shuffled_data = data[shuffle_indices]
        else:
            shuffled_data = data
        for batch_num in range(num_batches_per_epoch):
            start_index = batch_num * batch_size
            end_index = min((batch_num + 1) * batch_size, data_size)
            yield shuffled_data[start_index:end_index]
def read_json_file(path_to_json):
    objects = []
    data = ''
    with io.open(path_to_json, 'r', encoding='utf8') as f:
        for line in f:
            if line in ['\n', '\n\r']:
                objects.append(json.loads(data))
                data = ''
            else:
                data += line
    try:
        objects.append(json.loads(data))
    except:
        return objects
    return objects

# get original sentence, compression sentence
def __init__(self, root, fileids=DOC_PATTERN, tags=None,
             word_tokenizer=WordPunctTokenizer(),
             sent_tokenizer=nltk.data.LazyLoader(
                 'tokenizers/punkt/english.pickle'),
             encoding='utf8', **kwargs):
    """
    Initialize the corpus reader. Categorization arguments
    (``cat_pattern``, ``cat_map``, and ``cat_file``) are passed to
    the ``CategorizedCorpusReader`` constructor. The remaining
    arguments are passed to the ``CorpusReader`` constructor.
    """
    # Add the default category pattern if not passed into the class.
    if not any(key.startswith('cat_') for key in kwargs.keys()):
        kwargs['cat_pattern'] = CAT_PATTERN

    CategorizedCorpusReader.__init__(self, kwargs)
    CorpusReader.__init__(self, root, fileids, encoding)

    self._word_tokenizer = word_tokenizer
    self._sent_tokenizer = sent_tokenizer
    self._good_tags = tags or self.TAGS
def docs(self, fileids=None, categories=None):
    """
    Returns the complete JSON document for every file in the corpus.
    Note that I attempted to use the nltk ``CorpusView`` and ``concat``
    methods here, but was not getting memory safe iteration. Instead
    the simple Python generator by far did a better job of ensuring
    that file handles got closed and that not all data was loaded into
    memory at a time. In the future, I will try to re-implement the
    corpus view.
    """
    # Resolve the fileids and the categories
    fileids = self._resolve(fileids, categories)

    # Create a generator, loading one document into memory at a time.
    for path, enc, fileid in self.abspaths(fileids, True, True):
        with codecs.open(path, 'r', encoding=enc) as f:
            yield json.load(f)
def sql_query(dbname, query):
    """
    Execute an SQL query over a database.

    :param dbname: filename of persistent store
    :type dbname: str
    :param query: SQL query
    :type query: str
    """
    import sqlite3
    try:
        path = nltk.data.find(dbname)
        connection = sqlite3.connect(str(path))
        cur = connection.cursor()
        return cur.execute(query)
    except (ValueError, sqlite3.OperationalError):
        import warnings
        warnings.warn("Make sure the database file %s is installed and uncompressed." % dbname)
        raise
def load_data_and_labels():
    """
    Loads MR polarity data from files, splits the data into words and generates labels.
    Returns split sentences and labels.
    """
    # Load data from files
    positive_examples = list(open("./data/rt-polaritydata/rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples]
    negative_examples = list(open("./data/rt-polaritydata/rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples]
    # Split by words
    #x_text = list(open("./trainUNK.txt", "r").readlines())
    x_text = positive_examples + negative_examples
    x_text = [clean_str(sent) for sent in x_text]
    x_text = [s.split(" ") for s in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)
    return [x_text, y]
def load_data_for_books(path):
    text = ''.join(open(path).readlines()).decode('utf8')
    tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    book = tokenizer.tokenize(text)
    #book = re.split(r' *[\.\?!][\'"\)\]]* *', text)
    #book = list(open(path, "r").readlines())
    book = [s.strip() for s in book]
    book = [clean_str(sent) for sent in book]
    book = [s.split(" ") for s in book]
    x_text = book
    y = np.vstack([np.zeros(len(book)), np.ones(len(book))]).T
    sentences, labels = x_text, y
    sentences_padded = pad_sentences(sentences)
    sentencesT, labelsT = load_data_and_labels()
    sentences_paddedT = pad_sentences(sentencesT)
    vocabulary, vocabulary_inv = build_vocab(sentences_paddedT)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv, sentencesT]
def add_full_stops_to_the_end(infile, outfile):
    # clean data of small titles and add full stops for NLTK to work
    output_format = '{}.\n'.format
    with open(infile) as fin, codecs.open(outfile, 'w+', 'utf-8') as fout:
        for line in fin:
            if line[0] == ' ':
                pass
            # ignore headlines with less than three words
            elif len(line.split()) <= 3:
                pass
            elif line.endswith('.\n') or line.endswith('!\n') or line.endswith('?\n') or line.endswith('\'\n') or line.endswith('"\n'):
                print >> fout, line.decode('utf-8'),
            else:
                print >> fout, output_format(line.strip()).decode('utf-8'),

############################################
# Convert All except first word and quotes #
# to lower case                            #
############################################
def location(url):
    fdata = {'Accept': '*/*',
             'Accept-Encoding': 'gzip, deflate',
             'Accept-Language': 'en-US,en;q=0.8',
             'Connection': 'keep-alive',
             'Content-Length': '29',
             'Content-type': 'application/x-www-form-urlencoded',
             'Cookie': 'PHPSESSID=hisbu0rrh09nssn99vckkqr740; __utma=103585558.1324897437.1443987736.1443987736.1443987736.1; __utmb=103585558.2.10.1443987736; __utmc=103585558; __utmz=103585558.1443987736.1.1.utmcsr=google|utmccn=(organic)|utmcmd=organic|utmctr=(not%20provided)',
             'Host': 'get-site-ip.com',
             'Origin': 'http://get-site-ip.com',
             'Referer': 'http://get-site-ip.com/',
             'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
    response = requests.post('http://get-site-ip.com/_pages/_moduler/ajaxSkrivUtIpPaNamn.php',
                             data={'dnsNakeLookUp_In': url})
    #print response.content
    soup = BeautifulSoup(response.content, "lxml")
    #print "Location : "
    for i in soup.find_all("div", {"class": "response"}):
        # print i.get_text()
        # print i.get_text().split('-')[2].replace(' ','')
        return i.get_text().split('-')[2].replace(' ', '')

# Finds number of special characters
def nofoutofplacefeatures(url):
    # pdb.set_trace()
    if url[:4] == "http":
        r = requests.get(url)
    else:
        url = "http://" + url
        r = requests.get(url)
    #r = requests.get(url)
    data = r.text
    data2 = r.content
    document, errors = tidy_document(data, options={'numeric-entities': 1})
    #print document
    #print errors
    #print "Number of Elements Out of Place : " + str(len(errors))
    return len(errors)
def reg_date(url):
    url = url.strip("www.")
    print url
    ur = "http://www.whois.com/whois/" + url
    r = requests.get(ur)
    data = r.content.decode("utf-8")
    #print data
    try:
        soup = BeautifulSoup(data)
        #<div class="whois_result"
        for link in soup.find_all("div", {"class": "whois_result"}):
            site = link.get_text().lower()
            print site.decode("utf-8")
            print "\n date is \n"
            print re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
    except:
        pass
def nofoutofplacefeatures(url):
    try:
        # pdb.set_trace()
        if url[:4] == "http":
            r = requests.get(url)
        else:
            url = "http://" + url
            r = requests.get(url)
        #r = requests.get(url)
        data = r.text
        data2 = r.content
        document, errors = tidy_document(data, options={'numeric-entities': 1})
        #print document
        #print errors
        #print "Number of Elements Out of Place : " + str(len(errors))
        return len(errors)
    except:
        pass
def reg_date(url):
    url = url.strip("www.")
    #print url
    ur = "http://www.whois.com/whois/" + url
    r = requests.get(ur)
    data = r.content.decode("utf-8")
    #print data
    try:
        soup = BeautifulSoup(data, "lxml")
        #<div class="whois_result"
        for link in soup.find_all("div", {"class": "whois_result"}):
            site = link.get_text().lower()
            #print site.decode("utf-8")
            print "\n Domain registration date is " + re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
            return re.findall("\d\d-[a-z][a-z][a-z]-\d\d\d\d", site.decode("utf-8"))[1]
    except:
        pass
def read_data(source):
    """
    Reads the sentence data from the csv file, which is of the form
    (sentence, is_summary_sentence).

    Args:
        source = the data file to read the data from
    Returns:
        A list of tuples where each tuple is of the form
        (sentence, is_summary_sentence).
    """
    sentences = []
    count = 0
    with open(source, "r") as csvfile:
        reader = csv.reader(csvfile)
        for row in reader:
            sentence = row[0]
            sentence = sentence.strip("\"")
            sentence = sentence.strip("[")
            sentence = sentence.strip("]")
            sentence = sentence.replace("'", "")
            sentence = sentence.replace(" ", "")
            sentence = sentence.split(",")
            sentences.append(sentence)
            count += 1
    return sentences

# ============================================
# ================ MAIN PROGRAM ==============
# Read in all of the papers into a list of lists. Each item in the list is a sentence, in the form of a list of words.
def tokenize_to_sentence(text):
    parser = nltk.data.load('tokenizers/punkt/english.pickle')

    # split into sentences
    sentences = parser.tokenize(text.strip())
    return [lemmatize_sentence(sentence) for sentence in sentences]
def getMeta(self, fileName):
    """Return the meta data for a given fileName e.g. year, url, MOH, borough, bID."""
    splitReport = fileName.split('.')
    bID = splitReport[2]
    year = splitReport[1]
    url = self.getUrl(bID)
    try:
        region = mapping[bID][1]
        mohRegion = mapping[bID][0]
    except:
        # TODO there is a problem with mappings e.g. Acton.1915.b19783905.txt. Region cannot be found
        print(fileName)
        return (None, None, None, None, None)
    return year, region, bID, url, mohRegion
def get_app_data(app_id):
    url = 'http://store.steampowered.com/api/appdetails?appids=' + str(app_id)
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
        if not data[str(app_id)]['success'] or data[str(app_id)]['data']['type'] != 'game':
            return None
        return data[str(app_id)]
    except:
        return None
def get_apps():
    url = 'http://api.steampowered.com/ISteamApps/GetAppList/v2/'
    response = urllib.urlopen(url)
    try:
        data = json.loads(response.read())
        apps = data['applist']['apps']
        return apps
    except:
        return None
def get_description_from_app_data(app_data):
    description = clean_string(app_data['data']['detailed_description'])
    sentences = SENTENCE_DETECTOR.tokenize(description.strip())
    if len(sentences) > 0:
        sentences = sentences[0:(min(3, len(sentences)))]
        sentences = [x for x in sentences if len(x.split(' ')) > 5 and not x.split(' ')[0].isupper() and x.find('\r') == -1]
        combined_sentence = ' '.join(sentences)
        if len(combined_sentence) == 0 or not combined_sentence[0].isalpha() or len(combined_sentence.split(' ')) < 5:
            return None
        return combined_sentence
    return None
def get_title_from_app_data(app_data):
    return clean_string(app_data['data']['name'])
def load_data_from_json2(path_to_json, test_split, vocabulary_size):
    '''
    Load data for training and testing from json file
    :param path_to_json: path to json file
    :param word2vec_dict: dictionary of word2vec
    :return: X_train, y_train, X_test, y_test
    '''
    X = []
    y = []
    len_sent_array = []
    sample_weight = []
    objests = read_json_file(path_to_json)
    print 'Data %d sentences' % len(objests)
    i = 0
    original_sentence_array = []
    compression_sentence_array = []
    word2indext_dict, _ = word2index(objests, vocabulary_size)
    for object in objests:
        original_sentence, compression_sentence = get_originalSent_compressionSent(object)
        (array_sent, sample_w) = word2vec(original_sentence, word2indext_dict)
        X.append(array_sent)
        sample_weight.append(sample_w)
        (y_l, l) = label_compress(original_sentence, compression_sentence)
        y.append(y_l)
        len_sent_array.append(l)
        i += 1
        if i % 100 == 0:
            sys.stdout.write('.')
        # get text array:
        original_sentence_array.append(original_sentence)
        compression_sentence_array.append(compression_sentence)
    return ((X[int(len(X)*test_split):], y[int(len(y)*test_split):],
             len_sent_array[int(len(len_sent_array)*test_split):],
             sample_weight[int(len(sample_weight)*test_split):]),
            (X[:int(len(X)*test_split)], y[:int(len(y)*test_split)],
             len_sent_array[:int(len(len_sent_array)*test_split)],
             sample_weight[:int(len(sample_weight)*test_split)]),
            (original_sentence_array, compression_sentence_array))
def feeds(self):
    """
    Opens and returns the collection of feeds associated with the corpus.
    """
    data = self.open('feeds.json')
    return json.load(data)
def describe(self, fileids=None, categories=None):
    """
    Performs a single pass of the corpus and returns a dictionary with a
    variety of metrics concerning the state of the corpus.
    """
    # Structures to perform counting.
    counts = nltk.FreqDist()
    tokens = nltk.FreqDist()
    started = time.time()

    # Perform single pass over paragraphs, tokenize and count
    for para in self.paras(fileids, categories):
        counts['paras'] += 1

        for sent in self._sent_tokenizer.tokenize(para):
            counts['sents'] += 1

            for word in self._word_tokenizer.tokenize(sent):
                counts['words'] += 1
                tokens[word] += 1

    # Compute the number of files and categories in the corpus
    n_fileids = len(self._resolve(fileids, categories) or self.fileids())
    n_topics = len(self.categories(self._resolve(fileids, categories)))

    # Return data structure with information
    return {
        'files': n_fileids,
        'topics': n_topics,
        'paras': counts['paras'],
        'sents': counts['sents'],
        'words': counts['words'],
        'vocab': len(tokens),
        'lexdiv': float(counts['words']) / float(len(tokens)),
        'ppdoc': float(counts['paras']) / float(n_fileids),
        'sppar': float(counts['sents']) / float(counts['paras']),
        'secs': time.time() - started,
    }
def html(self, fileids=None, categories=None):
    """
    The preprocessed pickles do not contain HTML data.
    """
    raise TypeError(
        "Preprocessed corpus does not contain HTML data."
    )
def prep_data(data):
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_detector.tokenize(data['content'].strip())
    sent_dict = {str(uuid.uuid4()): {'text': x} for x in sents[:2]}
    data['sents'] = sent_dict
    return data
def fcfg_demo():
    import nltk.data
    g = nltk.data.load('grammars/book_grammars/feat0.fcfg')
    print(g)
    print()
def setup_module(module):
    from nose import SkipTest
    import nltk.data
    try:
        nltk.data.find('corpora/childes/data-xml/Eng-USA-MOR/')
    except LookupError as e:
        print(e)
        raise SkipTest("The CHILDES corpus is not found. "
                       "It should be manually downloaded and saved/unpacked "
                       "to [NLTK_Data_Dir]/corpora/childes/")
def data(self):
    for name in self.names:
        f = nltk.data.find(name)
        with f.open() as fp:
            file_data = fp.read().decode('utf8')
        yield f, file_data
def test_correct_length(self):
    # Check that the corpus views report the correct lengths:
    for f, file_data in self.data():
        v = StreamBackedCorpusView(f, read_whitespace_block)
        self.assertEqual(len(v), len(file_data.split()))

        v = StreamBackedCorpusView(f, read_line_block)
        self.assertEqual(len(v), len(self.linetok.tokenize(file_data)))
def augment(self, data):
    """
    Add more data to the ``Concept``'s extension set.

    :param data: a new semantic value
    :type data: string or pair of strings
    :rtype: set
    """
    self._extension.add(data)
    self.extension = sorted(list(self._extension))
    return self._extension
def _str2records(filename, rel):
    """
    Read a file into memory and convert each relation clause into a list.
    """
    recs = []
    contents = nltk.data.load("corpora/chat80/%s" % filename, format="text")
    for line in contents.splitlines():
        if line.startswith(rel):
            line = re.sub(rel+r'\(', '', line)
            line = re.sub(r'\)\.$', '', line)
            record = line.split(',')
            recs.append(record)
    return recs
def process_bundle(rels):
    """
    Given a list of relation metadata bundles, make a corresponding
    dictionary of concepts, indexed by the relation name.

    :param rels: bundle of metadata needed for constructing a concept
    :type rels: list(dict)
    :return: a dictionary of concepts, indexed by the relation name.
    :rtype: dict(str): Concept
    """
    concepts = {}
    for rel in rels:
        rel_name = rel['rel_name']
        closures = rel['closures']
        schema = rel['schema']
        filename = rel['filename']

        concept_list = clause2concepts(filename, rel_name, schema, closures)
        for c in concept_list:
            label = c.prefLabel
            if (label in concepts):
                for data in c.extension:
                    concepts[label].augment(data)
                concepts[label].close()
            else:
                concepts[label] = c
    return concepts
def val_load(db):
    """
    Load a ``Valuation`` from a persistent database.

    :param db: name of file from which data is read.
               The suffix '.db' should be omitted from the name.
    :type db: str
    """
    dbname = db + ".db"

    if not os.access(dbname, os.R_OK):
        sys.exit("Cannot read file: %s" % dbname)
    else:
        db_in = shelve.open(db)
        from nltk.sem import Valuation
        val = Valuation(db_in)
        # val.read(db_in.items())
        return val


#def alpha(str):
    #"""
    #Utility to filter out non-alphabetic constants.

    #:param str: candidate constant
    #:type str: string
    #:rtype: bool
    #"""
    #try:
        #int(str)
        #return False
    #except ValueError:
        ## some unknown values in records are labeled '?'
        #if not str == '?':
            #return True
def tokenize_sentences(text):
    import nltk.data
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    return sent_tokenizer.tokenize(text)
def readFileOfReviews():
    # Read each review from file
    global reviewsLst
    preview = open("data.txt", "rb")
    reviewsLst = pickle.load(preview)
def load_data():
    """
    Loads and preprocesses data for the MR dataset.
    Returns input vectors, labels, vocabulary, and inverse vocabulary.
    """
    # Load and preprocess data
    sentences, labels = load_data_and_labels()
    sentences_padded = pad_sentences(sentences)
    vocabulary, vocabulary_inv = build_vocab(sentences_padded)
    x, y = build_input_data(sentences_padded, labels, vocabulary)
    return [x, y, vocabulary, vocabulary_inv]