The following 50 code examples, extracted from open-source Python projects, illustrate how to use nltk.tokenize.sent_tokenize().
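Before the project examples, here is a minimal usage sketch (the sample text, variable names, and the result shown in the comment are illustrative only, not taken from any project below). sent_tokenize() returns a list of sentence strings and relies on NLTK's pretrained Punkt model, which must be downloaded once:

import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# One-time download of the Punkt sentence tokenizer model used by sent_tokenize().
nltk.download('punkt')

text = "This is the first sentence. Here is another one."
sentences = sent_tokenize(text)   # -> ['This is the first sentence.', 'Here is another one.']
words = [word_tokenize(s) for s in sentences]  # optional word-level tokenization per sentence
print(sentences)
print(words)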
def read_folder(self, folder_name, number_of_files_to_read=10000):
    """
    Reads all files in a directory, splits them into sentences
    and puts these sentences in a list to return.

    Args:
        folder_name = the name of the folder to read files from
        number_of_files_to_read = optional parameter for how many files in a directory to read
    Returns:
        A list of all sentences from all text files in the folder
    """
    count = 0
    all_sentences = []
    for filename in os.listdir(folder_name):
        if filename.endswith(".txt") and count < number_of_files_to_read:
            main_text_to_open = folder_name + "/" + filename
            main_text = self.open_file_single_string(main_text_to_open)
            udata = main_text.decode("utf-8")
            main_text = udata.encode("ascii", "ignore")
            sentences = sent_tokenize(main_text)
            for sentence in sentences:
                all_sentences.append(sentence)
            count += 1
    return all_sentences
def create_batch(self, sentence_li):
    """Create a batch for a list of sentences."""
    embeddings_batch = []
    for sen in sentence_li:
        embeddings = []
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            embeddings.append(self.embdict.tok2emb.get(tok))
        if len(tokens) < self.max_sequence_length:
            pads = [np.zeros(self.embedding_dim)
                    for _ in range(self.max_sequence_length - len(tokens))]
            embeddings = pads + embeddings
        else:
            embeddings = embeddings[-self.max_sequence_length:]
        embeddings = np.asarray(embeddings)
        embeddings_batch.append(embeddings)
    embeddings_batch = np.asarray(embeddings_batch)
    return embeddings_batch
def article_to_pairs(arg):
    article, direction = arg
    pairs = []
    if 'text' not in article:
        return []
    sents = sent_tokenize(article['text'], language='norwegian')
    translations = translate(sents, direction)
    for sent, trans in zip(sents, translations):
        trans_tokens = tokenize(trans)
        tokens = tokenize(sent)
        pairs += compare(tokens, trans_tokens)
    del article
    del sents
    del translations
    return pairs
def extractFeatures(self, article, n, customStopWords=None):
    # pass in article as a tuple (text, title)
    text = article[0]   # extract the text
    title = article[1]  # extract the title
    sentences = sent_tokenize(text)  # split text into sentences
    word_sent = [word_tokenize(s.lower()) for s in sentences]  # split sentences into words
    self._freq = self._compute_frequencies(word_sent, customStopWords)  # calculate word freq using member func created above
    if n < 0:
        # how many features (words) to return - a -ve number means
        # no feature (word) selection, just return all features
        return nlargest(len(self._freq.keys()), self._freq, key=self._freq.get)
    else:
        # here we say if the calling func has asked for a subset
        # then return only the 'n' largest features, i.e. the
        # most important words (important == frequent, fewer stopwords)
        return nlargest(n, self._freq, key=self._freq.get)
def summarize(self, article, n):
    text = article[0]
    text = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sentence in enumerate(word_sent):
        for word in sentence:
            if word in self._freq:
                ranking[i] += self._freq[word]
    sentences_index = nlargest(n, ranking, key=ranking.get)
    return [sentences[j] for j in sentences_index]


##############################################################################
# TEST
def mmap_extract(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - concepts: list,
        list of metamap concepts extracted
    """
    # Tokenize into sentences
    sents = sent_tokenize(text)
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)),
                                           word_sense_disambiguation=True)
    if errors:
        print 'Errors with extracting concepts!'
        print errors
    return concepts
def person_connotation(tweet, name):
    """
    Decide whether a person is talked favorably about or not, based on
    the tone of the sentences in which their name appears
    """
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    mentions = 0
    # analyze each sentence talking about `name` person
    for s in twtcontent:
        tags = get_tweet_tags(s)
        # if the name appears in the tagged sentence, get its tone
        if (name, 'NNP') in tags:
            sentence = util.untag(tags)
            scores = tweet_connotation(' '.join(sentence))
            # add it up to the overall tweet's tone
            for i, z in enumerate(scores):
                overall[z] += scores[z]
            mentions += 1
    # averaging all sentences' scores. don't wanna divide by zero now do we
    if mentions != 0:
        for v in overall:
            overall[v] = round(overall[v] / mentions, 3)
    return overall
def make_summaries():
    terms = Terms.objects.all()
    removals = ['DEFINITION', 'BREAKING DOWN', 'What is']
    for term in terms:
        try:
            summary = summarizer(term.text, settings.SUMMARIZER_SENTENCES)
            sentence_tokens = sent_tokenize(summary)
            text = ''
            for sentence in sentence_tokens:
                if not any(to_remove in sentence for to_remove in removals):
                    text += "{0} ".format(sentence.replace(r'\A[\d]\S\s', ''))
            term.summary = summarizer(text, settings.SUMMARIZER_SENTENCES)
            term.save()
        except Exception as e:
            print((colored.red("[ERROR] At terms summarizer: {0}".format(e))))
def clean_video(video):
    text = []
    try:
        if len(video.description) > 0:
            sentence_tokens = sent_tokenize(video.description)
            for sentence in sentence_tokens:
                if not ('http' in sentence):
                    text.append("{0} ".format(sentence))
            video.description = "".join("{} ".format(s) for s in text)
            video.save()
            if settings.SHOW_DEBUG:
                print(colored.green("Cleaned video description saved to db: {0}".format(video.title)))
    except Exception as e:
        print(colored.red("At clean_video {}".format(e)))
def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    window = 150
    # doc = doc.replace("–", " ")
    # doc = sent_tokenize(doc)
    for sentence in doc:
        miniArray = []
        for term in sentence:
            id = self.term_to_id(term, training)
            if id != None:
                miniArray.append(id)
                if not id in words:
                    words[id] = 1
                    self.docfreq[id] += 1
        if not len(miniArray):
            continue
        if len(miniArray) > window:
            l.extend([np.array(miniArray[i:i + window])
                      for i in xrange(0, len(miniArray), window)])
        else:
            l.append(np.array(miniArray))
    return l
def summarize(self, text, n):
    """
    Return a list of n sentences
    which represent the summary of text.
    """
    sents = sent_tokenize(text)
    assert n <= len(sents)
    word_sent = [word_tokenize(s.lower()) for s in sents]
    self._freq = self._compute_frequencies(word_sent)
    ranking = defaultdict(int)
    for i, sent in enumerate(word_sent):
        for w in sent:
            if w in self._freq:
                ranking[i] += self._freq[w]
    sents_idx = self._rank(ranking, n)
    return [sents[j] for j in sents_idx]
def parse_xml_language_similarity(file_read, file_write):
    count = 0
    with open(file_read, 'r') as f, open(file_write, 'w') as out:
        for line in f:
            count += 1
            if count % 1000 == 0:
                print(count)
            if "row Id" in line:
                line = line.strip()
                root = xml.etree.ElementTree.fromstring(line)
                try:
                    body = remove_tags(root.get('Body'))
                    title = remove_tags(root.get('Title'))
                    body_sentences = sent_tokenize(body)
                    title_sentences = sent_tokenize(title)
                    for line in body_sentences:
                        out.write(line + "\n")
                    for line in title_sentences:
                        out.write(line + "\n")
                except:
                    continue
def train(self, chain_len=None):
    """
    Trains the markov data structure by creating chains of desired length
    """
    if not chain_len:
        chain_len = self.CHAIN_LENGTH
    self.CHAIN_LEN = chain_len
    self.everything['corpus'] = {}
    self.corpus = self.everything['corpus']
    for f in self.everything['input']:
        for line in sent_tokenize(self.everything['input'][f]):
            words = word_tokenize(line)
            for chain in self._make_chains(words):
                k = " ".join(chain[:-1])  # key is everything but last word
                v = chain[-1]             # value is last word
                try:
                    self.corpus[k].append(v)
                except:
                    self.corpus[k] = [v]
def nltk_extract_claims(text):
    """
    Attempts to extract claims as a list from a large text string.
    Uses nltk sent_tokenize function in tokenize library

    param string text: string containing several claims
    """
    sent_list = sent_tokenize(text)
    # On a test string this returned a list with the claim number
    # and then the claim text as separate items
    claims_list = []
    for i in range(0, len(sent_list), 2):
        try:
            number = int(sent_list[i].split(".")[0])
        except:
            number = 0
        claims_list.append((number, sent_list[i + 1]))
    return claims_list
def check_sentence(text):
    """
    Check, that only one sentence was provided.

    >>> QASystem.check_sentence("Example sentence.")
    >>> QASystem.check_sentence("Example sentence. Another example.")
    Traceback (most recent call last):
    core.MultipleSentences: ['Example sentence.', 'Another example.']

    Args:
        text (str): provided question/answer.

    Returns:
        None

    Raises:
        MultipleSentenceQuestion: in case of more than one sentence
        inside of the text string.
    """
    sent_tokenize_list = sent_tokenize(text)  # nltk tokenize sentence
    if len(sent_tokenize_list) > 1:
        raise MultipleSentences(sent_tokenize_list)
def read_yelp(file_name='yelp_academic_dataset_review.json'):
    f = open(file_name)
    f = f.readlines()
    f = [eval(l.strip()) for l in f]
    stars = [i['stars'] for i in f]
    text = [i['text'] for i in f]
    df = pd.DataFrame()
    df['stars'] = stars
    df['text'] = text
    # compute the number of sentences in each doc
    l = list(df.text)
    text = [sent_tokenize(i) for i in list(df.text)]
    text_len = [len(i) for i in text]
    # 2225188 in total
    # 2089287 for length<=20
    # 1654640 for length<=10
    # We decide to only consider length<=7 here
    df['length'] = text_len
    df['text_split'] = text
    return df
def get_sentiment(song):
    scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])
    if not song:
        return scores
    raw_text = song
    raw_text = re.sub("\n", ". ", str(raw_text))
    # Using already trained analyzer
    sid = SentimentIntensityAnalyzer()
    sentences = tokenize.sent_tokenize(raw_text)
    scores = dict([('pos', 0), ('neu', 0), ('neg', 0), ('compound', 0)])
    for sentence in sentences:
        ss = sid.polarity_scores(sentence)
        for k in sorted(ss):
            scores[k] += ss[k]
    return scores
def offset_tokenize(text):
    tail = text
    accum = 0
    tokens = [word for sent in sent_tokenize(text) for word in word_tokenize(sent)]
    info_tokens = []
    for tok in tokens:
        scaped_tok = re.escape(tok)
        m = re.search(scaped_tok, tail)
        start, end = m.span()
        # global offsets
        gs = accum + start
        ge = accum + end
        accum += end
        # keep searching in the rest
        tail = tail[end:]
        info_tokens.append((tok, (gs, ge)))
    return info_tokens
def parse_xml_all(self, data_file, doc_type, language='english'):
    e = ET.parse(data_file)
    cluster_data = {}
    root = e.getroot()
    for topics in root:
        data = []
        topic_id = topics.attrib.get('id')
        for documents in topics.findall(doc_type):
            doc_id = documents.attrib.get('id')
            if doc_type == 'document':
                title_text = documents.find('title').text
            doc_text = documents.find('text').text
            text = text_normalization(doc_text)
            doc_sents = sent_tokenize(text, language)
            data.append([doc_id, doc_sents])
        cluster_data[topic_id] = data
    return cluster_data
def analysis(self, paragraph):
    ''' analysis sentiment given paragraph '''
    result = 0
    counter = 0
    sentences = tokenize.sent_tokenize(paragraph)
    for sentence in sentences:
        sentiment = self.analyzer.polarity_scores(sentence)['compound']
        if sentiment > SentimentAnalyzer.neutral_threshold[0] and \
           sentiment < SentimentAnalyzer.neutral_threshold[1]:
            continue
        counter += 1
        result += sentiment
    result = result / float(counter) if counter > 0 else 0
    return result
def add_items(self, sentence_li):
    """Add new items to the tok2emb dictionary from a given text."""
    for sen in sentence_li:
        sent_toks = sent_tokenize(sen)
        word_toks = [word_tokenize(el) for el in sent_toks]
        tokens = [val for sublist in word_toks for val in sublist]
        tokens = [el for el in tokens if el != '']
        for tok in tokens:
            if self.tok2emb.get(tok) is None:
                self.tok2emb[tok] = self.fasttext_model[tok]
def get_sentiment_from_paragraph(paragraph):
    sentence_list = tokenize.sent_tokenize(paragraph)
    paragraphSentiments = 0.0
    for sentence in sentence_list:
        vs = analyzer.polarity_scores(sentence)
        paragraphSentiments += vs["compound"]
    return round(paragraphSentiments / len(sentence_list), 4)
def pre_processing(tokenizer, truecaser, info):
    # SPLIT THE WHITESPACES
    source_file_t = re.split('([\t\n\r\f\v]+)', info['src'])
    # SENTENCE TOKENIZE
    for i in range(len(source_file_t)):
        if i % 2 == 0:
            source_file_t[i] = sent_tokenize(source_file_t[i])
    # TOKENIZATION
    if info['tok']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(
                            tokenizer.tokenize(source_file_t[j][i], return_str=True).encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))
    # TRUECASING
    if info['tc']:
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    source_file_t[j][i] = str((truecasing(truecaser, source_file_t[j][i].split(' ')[0]).decode(
                        'utf-8') + " " + (' '.join(source_file_t[j][i].split(' ')[1:]).decode('utf-8'))).encode('utf-8'))
                    print source_file_t[j][i]
    # IF NEITHER
    if not (info['tc'] or info['tok']):
        for j in range(len(source_file_t)):
            if j % 2 == 0:
                for i in range(len(source_file_t[j])):
                    try:
                        source_file_t[j][i] = str(source_file_t[j][i].encode('utf-8'))
                    except NameError:
                        source_file_t[j][i] = str(' '.join(source_file_t[j][i].split('.') + ['.']))
    return source_file_t
def extractRawFrequencies(self, article):
    # this method is similar to the one above but returns
    # the raw frequencies (all word counts)
    text = article[0]
    text = article[1]
    sentences = sent_tokenize(text)
    word_sent = [word_tokenize(s.lower()) for s in sentences]
    freq = defaultdict(int)
    for s in word_sent:
        for word in s:
            if word not in self._stopwords:
                freq[word] += 1
    return freq
def sentence(text):
    '''Break the text into sentences'''
    return sent_tokenize(text)
def getSentences(self):
    self.sentences = sent_tokenize(self.text)
def metamap_wrapper(text):
    """
    Function-wrapper for metamap binary. Extracts concepts
    found in text.

    !!!! REMEMBER TO START THE METAMAP TAGGER AND
        WordSense DISAMBIGUATION SERVER !!!!

    Input:
        - text: str,
        a piece of text or sentence
    Output:
        - a dictionary with key sents and values
        a list of the concepts found
    """
    # Tokenize into sentences
    sents = sent_tokenize(text)
    # Load Metamap Instance
    mm = MetaMap.get_instance(settings['load']['path']['metamap'])
    concepts, errors = mm.extract_concepts(sents, range(len(sents)))
    # Keep the sentence ids
    ids = np.array([int(concept[0]) for concept in concepts])
    sentences = []
    for i in xrange(len(sents)):
        tmp = {'sent_id': i + 1, 'entities': [], 'relations': []}
        # Wanted concepts according to sentence
        wanted = np.where(ids == i)[0].tolist()
        for w_ind in wanted:
            w_conc = concepts[w_ind]
            if hasattr(w_conc, 'cui'):
                tmp_conc = {'label': w_conc.preferred_name, 'cui': w_conc.cui,
                            'sem_types': w_conc.semtypes, 'score': w_conc.score}
                tmp['entities'].append(tmp_conc)
        sentences.append(tmp)
    if errors:
        time_log('Errors with extracting concepts!')
        time_log(errors)
    return {'sents': sentences, 'sent_text': text}
def reverb_wrapper(text, stop=None):
    """
    Function-wrapper for ReVerb binary. Extracts relations
    found in text.

    Input:
        - text: str,
        a piece of text or sentence
        - stop: list,
        list of stopwords to remove from the relations
    Output:
        - total: list,
        list of lists. Each inner list contains one relation in the form
        [subject, predicate, object]
    """
    total = []
    for sent in sent_tokenize(text):
        cmd = 'echo "' + sent + '"' "| ./reverb -q | tr '\t' '\n' | cat -n"
        reverb_dir = settings['load']['path']['reverb']
        result = runProcess(cmd, reverb_dir)
        # Extract relations from reverb output
        result = result[-3:]
        result = [row.split('\t')[1].strip('\n') for row in result]
        # Remove common stopwords from relations
        if stop:
            result = [stopw_removal(res, stop) for res in result]
        total.append(result)
    # Remove empty relations
    total = [t for t in total if t]
    return total
def extract_entities(text, json_={}):
    """
    Extract entities from a given text using metamap and
    generate a json, preserving info regarding the sentence
    of each entity that was found. For the time being, we preserve
    both concepts and the entities related to them.

    Input:
        - text: str,
        a piece of text or sentence
        - json_: dic,
        sometimes the json to be returned is given to us to be enriched
        Defaults to an empty json_
    Output:
        - json_: dic,
        json with fields text, sents, concepts and entities
        containing the final results
    """
    json_['text'] = text
    # Tokenize the text
    sents = sent_tokenize(text)
    json_['sents'] = [{'sent_id': i, 'sent_text': sent} for i, sent in enumerate(sents)]
    json_['concepts'], _ = mmap_extract(text)
    json_['entities'] = {}
    for i, sent in enumerate(json_['sents']):
        ents = metamap_ents(sent)
        json_['entities'][sent['sent_id']] = ents
    return json_
def tweet_connotation(tweet):
    """ Decide whether a tweet is generally positive or negative """
    anlyzr = SentimentIntensityAnalyzer()
    # break tweet up into sentences and analyze each separately
    twtcontent = sent_tokenize(tweet)
    overall = {'compound': 0, 'neg': 0, 'neu': 0, 'pos': 0}
    for s in twtcontent:
        scores = anlyzr.polarity_scores(s)
        # tally up each sentence's overall tone
        for i, z in enumerate(scores):
            overall[z] += scores[z]
    # average it all together for the tweet as a whole
    for v in overall:
        overall[v] = round(overall[v] / len(twtcontent), 3)
    return overall
def tokenize_into_opinion_units(text):
    output = []
    for str in sent_tokenize(text):
        for output_str in str.split(' but '):
            output.append(output_str)
    return output


# Take positive.csv and negative.csv and mix them into
# positiveandnegative.csv
# This has each unit tagged with its booking.com sentiment
# This is the data I tagged with Mechanical Turk
def ask_confirmation(self, best_matching_action):
    alternative_formulations = sent_tokenize(self.trigger_dict[best_matching_action])
    alternative_formulation = choice(alternative_formulations)
    self.speak("Excuse me, I didn't understand your request very well. Do you want me to " + alternative_formulation)
    answer = self.active_listen()
    if "no" in answer:
        self.speak("Please reformulate your request.")
        return 0
    if "yes" in answer:
        self.speak("Very good")
        return 1
def stem_and_tokenize_text(text):
    sents = sent_tokenize(text)
    tokens = list(itertools.chain(*[TreebankWordTokenizer().tokenize(sent) for sent in sents]))
    terms = [Term(token) for token in tokens]
    return filter(lambda term: not term.is_punctuation(), terms)
def convert_text2bin1(docs, writer):
    global counter
    for i, fi in enumerate(docs):
        with open(os.path.join(curdir, "input", "cnn", "stories", fi), 'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
            wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
            wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
            wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
            wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
            wholetext = wholetext.replace(".", " . ")
            wholetext = wholetext.replace(",", " , ")
            wholetext = wholetext.replace('-', ' - ')
            wholetext = wholetext.replace('?', ' ? ')
            wholetext = wholetext.replace('(', '( ')
            wholetext = wholetext.replace(')', ' )')
            data = wholetext.split("@highlight")
            news = data[0]
            highlights = data[1].replace('\n\n', '')
            news = (" ".join(news.split('\n\n'))).strip()
            sentences = sent_tokenize(news)
            news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
            highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
            words = (news + " " + highlights).split()
            counter.update(words)
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
            tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            if i % 3000 == 0:
                print(int((float(i) / len(docs)) * 100), "%")
    print((float(len(docs)) / len(docs)) * 100, "%...." "converted\n\n")
def convert_text2bin2(docs, writer):
    global counter
    for i, fi in enumerate(docs):
        with open(os.path.join(curdir, "input", "dailymail", "stories", fi), 'r', encoding="UTF-8") as f:
            wholetext = f.read().lower()
            wholetext = re.sub(r'[^\x00-\x7F]+', '', wholetext)
            wholetext = re.sub(r"(\s?[\']\s+|\s+[\']\s?)", " ' ", wholetext)
            wholetext = re.sub(r'(\s?[\"]\s+|\s+[\"]\s?)', ' " ', wholetext)
            wholetext = re.sub(r"(\'[s]\s+)", " 's ", wholetext)
            wholetext = wholetext.replace(".", " . ")
            wholetext = wholetext.replace(",", " , ")
            wholetext = wholetext.replace('-', ' - ')
            wholetext = wholetext.replace('?', ' ? ')
            wholetext = wholetext.replace('(', '( ')
            wholetext = wholetext.replace(')', ' )')
            data = wholetext.split("@highlight")
            news = data[0]
            try:
                news = news.split("updated:")[1]
                news = news[news.find('20') + 4:]
            except:
                None
            news = (" ".join(news.split('\n'))).strip()
            highlights = data[1].replace('\n\n', '')
            news = (" ".join(news.split('\n\n'))).strip()
            sentences = sent_tokenize(news)
            news = '<d> <p> ' + ' '.join(['<s> ' + sentence + ' </s>' for sentence in sentences]) + ' </p> </d>'
            highlights = '<d> <p> <s> ' + highlights + ' </s> </p> </d>'
            words = (news + " " + highlights).split()
            counter.update(words)
            tf_example = example_pb2.Example()
            tf_example.features.feature['article'].bytes_list.value.extend([(' '.join(news.split())).encode('utf-8')])
            tf_example.features.feature['abstract'].bytes_list.value.extend([(' '.join(highlights.split())).encode('utf-8')])
            tf_example_str = tf_example.SerializeToString()
            str_len = len(tf_example_str)
            writer.write(struct.pack('q', str_len))
            writer.write(struct.pack('%ds' % str_len, tf_example_str))
            if i % 3000 == 0:
                print(int((float(i) / len(docs)) * 100), "%")
    print((float(len(docs)) / len(docs)) * 100, "%...." "converted\n\n")
def text_cleaner(data):
    paragraphs_ = ""
    try:
        keep_endings = ['.', '?']
        removals_ = open(join(settings.BASE_DIR, "aggregator", 'data', 'stop_sentences.txt'), 'r')
        removals = [r.replace('\n', '') for r in removals_]
        if not (data is None):
            text = data.split('\n')
            paragraphs = []
            for p in text:
                if len(p) > settings.MINIMUM_PARAGRAPH:
                    paragraphs.append(p)
            for p in paragraphs:
                sentence_tokens = sent_tokenize(p)
                paragraph = ""
                for sentence in sentence_tokens:
                    if sentence[-1] in keep_endings:
                        if len(sentence) > settings.MINIMUM_SENTENCE:
                            # should remove most of the code:
                            if sentence[0].isupper():
                                if not any(to_remove in sentence for to_remove in removals):
                                    # eliminate some bad ending strings:
                                    if not sentence.endswith(('e.g.', 'i.e.')):
                                        paragraph += "{0} ".format(sentence)
                paragraphs_ += "<p>{0}</p>".format(paragraph)
    except Exception as e:
        print(colored.red("At text_cleaner {}".format(e)))
    return paragraphs_
def write_paragraph_lines(paragraph_lines):
    paragraph_str = ' '.join(paragraph_lines)
    for sent in sent_tokenize(paragraph_str):
        if lowercase:
            sent = sent.lower()
        output_file.write(' '.join(word_tokenize(sent)) + '\n')
def extract_target_context(self, paragraph, isolate_target_sentence):
    if isolate_target_sentence:
        for sent in sent_tokenize(paragraph):
            words, position = self.extract_context(sent)
            if words is not None:
                break
    else:
        words, position = self.extract_context(paragraph)
    return words, position
def doc_to_ids(self, doc, training=True):
    l = []
    words = dict()
    doc_sents = sent_tokenize(doc)
    for sentence in doc_sents:
        miniArray = []
        for term in sentence.split():
            id = self.term_to_id(term, training)
            if id != None:
                miniArray.append(id)
                if not id in words:
                    words[id] = 1
                    # It counts in how many documents a word appears. If it appears in
                    # only a few, remove it from the vocabulary using cut_low_freq()
                    self.docfreq[id] += 1
        l.append(np.array(miniArray, dtype=np.int32))
    return l
def text2sentences(text):
    '''Tokenize text into sentence tokens.'''
    content = '\n'.join([open(f).read() for f in text])
    sentences = []
    try:
        sentences = sent_tokenize(content)
    except LookupError as err:
        click.echo(message="Error with tokenization", nl=True)
        click.echo(message="Have you run \"textkit download\"?", nl=True)
        click.echo(message="\nOriginal Error:", nl=True)
        click.echo(err)
    [output(s.strip()) for s in sentences]
def make_phrases(self, start=1, end=None):
    if not end:
        end = start + 1
    for chain_len in range(start, end):  # +1 because of the way range works
        self.phrases[chain_len] = []
        for f in self.everything['input']:
            for line in sent_tokenize(self.everything['input'][f]):
                words = word_tokenize(line)
                for chain in self._make_chains(words, chain_len):
                    try:
                        # print "ERROR.0:", chain
                        chain = chain[:-1]  # drop last item in chain as it's "value" for markov
                        chain = [c for c in chain if c is not None]  # quick clean as None is breaking join
                    except:
                        print "ERROR.1:", chain
                        # sys.exit(-1)
                    # print chain_len, " => ", chain
                    try:
                        self.phrases[chain_len].append(" ".join(chain))
                    except:
                        print "ERROR.2:", chain
                        sys.exit(-1)
    return Counter(self.phrases[chain_len])
def buildGraph(text):
    vertices = []
    sentences = sent_tokenize(text, language='english')
    for sentence_raw in sentences:
        sentence_processed = sub("[^a-zA-Z ]+", '', sentence_raw).lower()
        words = word_tokenize(sentence_processed, language='english')
        vertices.append(vertex(sentence_raw, sentence_processed, words))
    for v1 in vertices:
        for v2 in vertices:
            if v1.order != v2.order:
                v1.scores.append(overlap(v1.words, v2.words))
        v1.averageScores()
    return vertices
def updateSentiment(dbLoc, tableName):
    sid = SentimentIntensityAnalyzer()
    conn = sqlite3.connect(dbLoc)
    cursor = conn.execute("SELECT * from %s" % tableName)

    # Go through every sentence
    for row in cursor:
        text = cleanTweet(row[TWEET_INDEX])
        # blob = TextBlob(text)
        sent = 0.0
        count = 0
        sentList = tokenize.sent_tokenize(text)
        # Go through each sentence in tweet
        for sentence in sentList:
            count += 1
            ss = sid.polarity_scores(sentence)
            sent += ss['compound']  # Tally up the overall sentiment
        if count != 0:
            sent = float(sent / count)
        # Update into DB
        conn.execute("UPDATE " + tableName + " set SENTIMENT = ? where ID = ?",
                     (sent, row[ID_INDEX]))
    conn.commit()
    conn.close()
def getSentiment(tweet):
    sid = SentimentIntensityAnalyzer()
    tweet = cleanTweet(tweet)
    sent = 0.0
    count = 0
    sentList = tokenize.sent_tokenize(tweet)
    # Go through each sentence in tweet
    for sentence in sentList:
        count += 1
        ss = sid.polarity_scores(sentence)
        sent += ss['compound']  # Tally up the overall sentiment
    if count != 0:
        sent = float(sent / count)
    return sent


# Update the sentiment
def _preprocess(self, text):
    """
    Return a list of lists. Each list is a preprocessed
    sentence of text in bag-of-words format.
    """
    stemmer = PorterStemmer()
    self._sents = sent_tokenize(text)
    # tokenize sentences
    word_sents = [word_tokenize(sent.lower()) for sent in self._sents]
    # remove stop-words and stem words
    word_sents = [[stemmer.stem(word) for word in sent if word not in self._stopwords]
                  for sent in word_sents]
    return word_sents
def text_to_sentences(self, text, tokenizer, remove_stopwords=False):
    print "text_to_sentence"
    # from nltk.tokenize import wordpunct_tokenize
    # Function to split a review into parsed sentences. Returns a
    # list of sentences, where each sentence is a list of words
    # text = text.decode("utf8")
    from nltk.tokenize import sent_tokenize, wordpunct_tokenize
    # 1. Use the NLTK tokenizer to split the paragraph into sentences
    # raw_sentences = tokenizer.tokenize(text.strip())
    raw_sentences = sent_tokenize(text.strip())
    print "finish tokenize sentence", len(raw_sentences)
    #
    # 2. Loop over each sentence
    sentences = []
    for raw_sentence in raw_sentences:
        # print "sentence:", raw_sentence
        # If a sentence is empty, skip it
        if len(raw_sentence) > 0:
            # Otherwise, call review_to_wordlist to get a list of words
            # sentences.append(text_to_wordlist(raw_sentence, remove_stopwords))
            # print removePunctuation(raw_sentence).lower().split()
            print raw_sentence
            sentences.append(wordpunct_tokenize(raw_sentence))  # raw_sentence.split()
            print wordpunct_tokenize(raw_sentence)
            # print text_to_wordlist(raw_sentence, remove_stopwords)
    #
    # Return the list of sentences (each sentence is a list of words,
    # so this returns a list of lists)
    return sentences
def create_example(text):
    raw_sentences = sent_tokenize(text)
    sentences = [word_tokenize(s) for s in raw_sentences]
    speakers = [["" for _ in sentence] for sentence in sentences]
    return {
        "doc_key": "nw",
        "clusters": [],
        "sentences": sentences,
        "speakers": speakers,
    }
def getSentences(paragraph):
    """
    Extracts sentences from a paragraph

    :param paragraph: (str) paragraph text
    :returns: list of sentences
    """
    indexed = {}
    i = 0
    sentenceList = tokenize.sent_tokenize(paragraph)
    for s in sentenceList:
        indexed[i] = s
        i += 1
    return sentenceList, indexed