The following code examples, extracted from open-source Python projects, illustrate how to use nltk.sent_tokenize().
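The project examples below pull in project-specific helpers and data, so before diving in, here is a minimal, self-contained sketch of the basic call. The sample text and variable names are made up for illustration, and it assumes the Punkt models have been fetched once via nltk.download('punkt').

import nltk

# One-time setup: sent_tokenize relies on NLTK's pre-trained Punkt models.
# nltk.download('punkt')

text = "Dr. Smith went to Washington. He arrived on Jan. 5, 2020. It rained."

# Split the raw string into a list of sentence strings.
sentences = nltk.sent_tokenize(text)

# A common follow-up, used by many of the examples below: tokenize each sentence into words.
tokens = [nltk.word_tokenize(sent) for sent in sentences]

print(sentences)  # typically: ['Dr. Smith went to Washington.', 'He arrived on Jan. 5, 2020.', 'It rained.']
print(tokens[0])  # typically: ['Dr.', 'Smith', 'went', 'to', 'Washington', '.']

Most of the project examples below follow exactly this sent_tokenize-then-word_tokenize pattern before filtering, tagging, or counting the resulting tokens.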
def tokenize_and_stem(text):
    """
    First tokenize by sentence, then by word to ensure that punctuation
    is caught as its own token
    """
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'intern' == token:
                token = ''
            if 'student' == token:
                token = ''
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
def preprocessing(text):
    text = text.decode("utf8")
    # tokenize into words
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    # remove stopwords
    stop = stopwords.words('english')
    tokens = [token for token in tokens if token not in stop]
    # remove words less than three letters
    tokens = [word for word in tokens if len(word) >= 3]
    # lower capitalization
    tokens = [word.lower() for word in tokens]
    # lemmatize
    lmtzr = WordNetLemmatizer()
    tokens = [lmtzr.lemmatize(word) for word in tokens]
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text
def get_sentence_tokens(text):
    '''
    Given a text (review), return the token list of each sentence
    :param text:
    :return:
    '''
    sentences = sent_tokenize(text)
    sent_tokens = []
    for sentence in sentences:
        # remove stop words and short tokens
        sent_token = word_tokenize(sentence)
        sent_token = [token for token in sent_token
                      if (not token.strip() == '') and (not token in stopwords)]
        sent_tokens.append(sent_token)
    # stemming, experiment shows that stemming works nothing...
    # if (stemming):
    #     stemmer = PorterStemmer()
    #     texts = [[stemmer.stem(token) for token in text] for text in texts]
    return sent_tokens
def parse_gender(text):
    sentences = [
        [word.lower() for word in nltk.word_tokenize(sentence)]
        for sentence in nltk.sent_tokenize(text)
    ]
    sents, words = count_gender(sentences)
    total = sum(words.values())
    for gender, count in words.items():
        pcent = (count / total) * 100
        nsents = sents[gender]
        print(
            "{:0.3f}% {} ({} sentences)".format(pcent, gender, nsents)
        )
def ie_preprocess(self, document):
    """This function takes raw text, chops it up, and then connects the
    pieces to break it down into sentences"""
    # Pre-processing: expand "e.g." to "exempli gratia" so it is not treated
    # as a sentence boundary
    document = document.replace("e.g.", "exempli gratia")
    # Split on newlines and bullets, then sentence-tokenize each piece
    # with nltk.sent_tokenize
    split = re.split('\n|\*', document)
    sentences = []
    for sent in split:
        sents = nltk.sent_tokenize(sent)
        length = len(sents)
        if length == 0:
            continue
        elif length == 1:
            sentences.append(sents[0])
        else:
            for i in range(length):
                sentences.append(sents[i])
    return sentences
def maybe_build_sentences(text_filename, sent_filename):
    sents = []
    if os.path.exists(sent_filename):
        fsent = open(sent_filename, "rb")
        for line in fsent:
            docid, sent_id, sent = line.strip().split("\t")
            sents.append(sent)
        fsent.close()
    else:
        ftext = open(text_filename, "rb")
        fsent = open(sent_filename, "wb")
        for line in ftext:
            docid, text = line.strip().split("\t")
            sent_id = 1
            for sent in nltk.sent_tokenize(text):
                sents.append(sent)
                fsent.write("{:d}\t{:d}\t{:s}\n"
                            .format(int(docid), sent_id, sent))
                sent_id += 1
        fsent.close()
        ftext.close()
    return sents
def get_review_sentences():
    '''
    Read the Yelp reviews and return them after sentence segmentation
    :return:
    '''
    review_file = io.open(FULL_YELP_REVIEW_PATH, 'r', encoding='utf-8')
    count_sentence = 0
    sentences = []
    for line in review_file:
        json_review = json.loads(line.strip())
        text = json_review.get("text").replace('\n', '').lower()
        raw_sentences = sent_tokenize(text)
        for raw_sentence in raw_sentences:
            if len(raw_sentence.strip()) > 0:
                sent_tokens = word_tokenize(raw_sentence)
                sentences.append(sent_tokens)
    return sentences
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda (word, pos, chunk): chunk != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def print_symptoms_from_page(url='', model='', stanford_jar=''):
    html_reader = HTMLReader(url)
    cleaned_text = html_reader.get_text_from_page()
    symptoms = set()
    st = NERTagger(model, stanford_jar, encoding='utf-8')
    sentences = nltk.sent_tokenize(cleaned_text)
    for sentence in sentences:
        tags = st.tag(nltk.word_tokenize(sentence))
        tag_index = 0
        while tag_index < len(tags):
            if tags[tag_index][1] == 'SYMP':
                symptom = []
                while tag_index < len(tags) and tags[tag_index][1] != 'O':
                    symptom.append(tags[tag_index][0])
                    tag_index += 1
                symptoms.add(' '.join(symptom))
            else:
                tag_index += 1
    print "Found %d symptoms:" % len(symptoms)
    for symptom in symptoms:
        print symptom
def extract_candidate_chunks(text, grammar=r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'):
    '''
    This function will extract text of a specific POS sequence rather than just Noun Phrase
    '''
    import itertools, nltk, string

    # exclude candidates that are stop words or entirely punctuation
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))

    # tokenize, POS-tag, and chunk using regular expressions
    chunker = nltk.chunk.regexp.RegexpParser(grammar)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text))
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # join constituent chunk words into a single chunked phrase
    candidates = [' '.join(word for word, pos, chunk in group)
                  for key, group in itertools.groupby(all_chunks, lambda (word, pos, chunk): chunk != 'O')
                  if key]

    return [cand for cand in candidates
            if cand not in stop_words and not all(char in punct for char in cand)]
def tokenize(self, document):
    # Break the document into sentences
    for sent in sent_tokenize(document):
        # Break the sentence into part of speech tagged tokens
        for token, tag in pos_tag(wordpunct_tokenize(sent)):
            # Apply preprocessing to the token
            token = token.lower() if self.lower else token
            token = token.strip() if self.strip else token
            token = token.strip('_') if self.strip else token
            token = token.strip('*') if self.strip else token

            # If stopword, ignore token and continue
            # if token in self.stopwords:
            #     continue

            # If punctuation, ignore token and continue
            if all(char in self.punct for char in token):
                continue

            # Lemmatize the token and yield
            lemma = self.lemmatize(token, tag)
            yield lemma
def process(self, fc, context=None):
    text_source = self.config.get('text_source')
    if text_source and text_source in fc:
        text = fc[text_source]
    else:
        return fc
    names = defaultdict(StringCounter)
    for sent in nltk.sent_tokenize(text):
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
            if hasattr(chunk, 'label'):
                label = chunk.label()
                name = ' '.join(c[0] for c in chunk.leaves())
                if not isinstance(name, unicode):
                    name = unicode(name, 'utf-8')
                name = cleanse(name)
                #print chunk.node, name
                names[label][name] += 1
    for entity_type, name_counts in names.items():
        fc[entity_type] = name_counts
    return fc
def generate_vocab(filename, min_fre=5, prefix=""):
    vf = open("../data/" + prefix + "vocab_generate.txt", 'w')
    word = {}
    for line in file(filename):
        line = line.strip()
        try:
            sentencesToken = nltk.sent_tokenize(line)
        except:
            continue
        for i in range(len(sentencesToken)):
            tokens = nltk.word_tokenize(sentencesToken[i])
            for token in tokens:
                word.setdefault(token, 0)
                word[token] += 1
    for char, num in sorted(word.items(), key=lambda x: x[1], reverse=True):
        if num < min_fre:
            break
        vf.write(char + " " + str(num) + "\n")
def extract_chunks(text_string, max_words=3, lemmatize=False):
    # Any number of adjectives followed by any number of nouns and (optionally) again
    # any number of adjectives followed by any number of nouns
    grammar = r'KT: {(<JJ>* <NN.*>+ <IN>)? <JJ>* <NN.*>+}'

    # Makes chunks using grammar regex
    chunker = nltk.RegexpParser(grammar)

    # Get grammatical functions of words
    # What this is doing: tag(sentence -> words)
    tagged_sents = nltk.pos_tag_sents(nltk.word_tokenize(sent) for sent in nltk.sent_tokenize(text_string))

    # Make chunks from the sentences, using grammar. Output in IOB.
    all_chunks = list(itertools.chain.from_iterable(nltk.chunk.tree2conlltags(chunker.parse(tagged_sent))
                                                    for tagged_sent in tagged_sents))

    # Join phrases based on IOB syntax.
    candidates = [' '.join(w[0] for w in group).lower()
                  for key, group in itertools.groupby(all_chunks, lambda l: l[2] != 'O') if key]

    # Filter by maximum keyphrase length
    candidates = list(filter(lambda l: len(l.split()) <= max_words, candidates))

    # Filter phrases consisting of punctuation or stopwords
    punct = set(string.punctuation)
    stop_words = set(nltk.corpus.stopwords.words('english'))
    candidates = list(filter(lambda l: l not in stop_words and not all(c in punct for c in l), candidates))

    # lemmatize
    if lemmatize:
        lemmatizer = nltk.stem.WordNetLemmatizer().lemmatize
        candidates = [lemmatizer(x) for x in candidates]

    return candidates
def tokenizer():
    if len(request.vars) != 0:
        user_input = request.vars
        import sys
        reload(sys)
        sys.setdefaultencoding('utf-8')
        if user_input.parameter == "sentence":
            our_output = nltk.sent_tokenize(user_input.input, "english")
            print user_input
            if request.vars.filename != '' and len(request.vars.filename.value) != "":
                file_input = user_input.filename.value
                file_output = nltk.word_tokenize(file_input, "english")
                print our_output
        else:
            our_output = nltk.word_tokenize(user_input.input, "english")
            if request.vars.filename != '' and len(request.vars.filename.value) != None:
                file_input = user_input.filename.value
                file_output = nltk.word_tokenize(file_input, "english")
        user_input.output = our_output
    return locals()
def extract(text, paper=None, logger=logger):
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'data documentation.*?shared']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match:
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    # if no match found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
def extract(text, paper=None, logger=logger):
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    for sentence in nltk.sent_tokenize(text):
        if search_any([r'data mine.*?source', r'text mine.*?shared'], sentence):
            # yapf: disable
            match = search_any([
                "data mine.*?(\w*\d[\w\d/-]*)",
                "text mine.*?(\w*\d[\w\d/-]*)"
            ], sentence)
            # yapf: enable
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            try:
                value_result = match.group(1).strip()
                return (value_text, value_result, source_type, source_detail)
            except AttributeError:
                # no match was found
                return None
    return None
def extract(text, paper=None, logger=logger):
    search_any = functools.partial(re_util.search_any, logger=logger)
    if not text and paper:
        try:
            text, _ = paper.get_text()
        except pdfutil.pdfutil.MalformedPDF as e:
            return None
    filters = [r'analys(is|es)']
    for sentence in nltk.sent_tokenize(text):
        match = search_any(filters, sentence)
        if match and search_any([r'algorithm', r'summary', r'outline',
                                 r'statistic', r'table|graph', r'following'], sentence):
            source_type = "extracted"
            source_detail = "nltk search v1"
            value_text = sentence
            value_result = "Yes"
            return (value_text, value_result, source_type, source_detail)
    # if no match found:
    source_type = "extracted"
    source_detail = "nltk search v1"
    value_text = "Not Found"
    value_result = "No"
    return (value_text, value_result, source_type, source_detail)
def get_story_question_answer_triples(sqa_file):
    sqatriples = []
    fsqa = open(sqa_file, "rb")
    for line in fsqa:
        line = line.strip().decode("utf8").encode("ascii", "ignore")
        if line.startswith("#"):
            continue
        story, question, answer, correct = line.split("\t")
        swords = []
        story_sents = nltk.sent_tokenize(story)
        for story_sent in story_sents:
            swords.extend(nltk.word_tokenize(story_sent))
        qwords = nltk.word_tokenize(question)
        awords = nltk.word_tokenize(answer)
        is_correct = int(correct) == 1
        sqatriples.append((swords, qwords, awords, is_correct))
    fsqa.close()
    return sqatriples
def maybe_build_vocab(reuters_dir, vocab_file):
    vocab = collections.defaultdict(int)
    if os.path.exists(vocab_file):
        fvoc = open(vocab_file, "rb")
        for line in fvoc:
            word, idx = line.strip().split("\t")
            vocab[word] = int(idx)
        fvoc.close()
    else:
        counter = collections.Counter()
        num_docs_read = 0
        for doc in stream_reuters_documents(reuters_dir):
            if num_docs_read % 100 == 0:
                print("building vocab from {:d} docs"
                      .format(num_docs_read))
            topics = doc["topics"]
            if len(topics) == 0:
                continue
            title = doc["title"]
            body = doc["body"]
            title_body = ". ".join([title, body]).lower()
            for sent in nltk.sent_tokenize(title_body):
                for word in nltk.word_tokenize(sent):
                    counter[word] += 1
            for i, c in enumerate(counter.most_common(VOCAB_SIZE)):
                vocab[c[0]] = i + 1
            num_docs_read += 1
        print("vocab built from {:d} docs, complete"
              .format(num_docs_read))
        fvoc = open(vocab_file, "wb")
        for k in vocab.keys():
            fvoc.write("{:s}\t{:d}\n".format(k, vocab[k]))
        fvoc.close()
    return vocab
def build_numeric_text(vocab, text):
    wids = []
    for sent in nltk.sent_tokenize(text):
        for word in nltk.word_tokenize(sent):
            wids.append(vocab[word])
    return ",".join([str(x) for x in wids])


##################### main ######################
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation
    # is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens]
    return stems
def tokenize_and_stem(text):
    # first tokenize by sentence, then by word to ensure that punctuation
    # is caught as its own token
    tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
    filtered_tokens = []
    # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
    for token in tokens:
        if re.search('[a-zA-Z]', token):
            if 'and' == token:
                token = ''
            filtered_tokens.append(token)
    stems = [stemmer.stem(t) for t in filtered_tokens if len(t) > 0]
    return stems
def split_sentences(text):
    """
    Returns a list of the sentences in the text that is passed in.
    """
    return sent_tokenize(text)
def tokenize_documents(documents):
    for document in documents:
        text = document.text
        tokenized_doc = []
        for sent in nltk.sent_tokenize(text):
            tokenized_doc += nltk.word_tokenize(sent)
        document.text = tokenized_doc
def _tokenize_corpus_into_list_of_tokenized_sentences(cls, corpus):
    tokenized_corpus = nltk.sent_tokenize(corpus)
    tokenized_corpus = [cls._clean_sentence(sentence) for sentence in tokenized_corpus]
    return [nltk.word_tokenize(sentence) for sentence in tokenized_corpus]
def extract(self, text, max_length=3, metric='avg', incl_scores=False):
    """Extract keywords and keyphrases from input text in descending order of score"""
    sentences = nltk.sent_tokenize(text)
    phrase_list = self._generate_candidate_keywords(sentences, max_length=max_length)
    word_scores = self._calculate_word_scores(phrase_list)
    phrase_scores = self._calculate_phrase_scores(phrase_list, word_scores, metric=metric)
    sorted_phrase_scores = sorted(phrase_scores.iteritems(),
                                  key=operator.itemgetter(1), reverse=True)
    n_phrases = len(sorted_phrase_scores)
    if incl_scores:
        return sorted_phrase_scores[0:int(n_phrases / self.top_fraction)]
    else:
        return map(lambda x: x[0], sorted_phrase_scores[0:int(n_phrases / self.top_fraction)])
def parse_text(filename, vocabulary_size=9000, type="word"):
    with open(filename, 'rb') as f:
        txt = f.read()
    if type == "word":
        sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
        # sentences = ["%s %s %s" % (sentence_start_token, x, sentence_end_token) for x in sentences]
        tokenized_sentences = [nltk.word_tokenize(sent) for sent in sentences]
        word_freq = nltk.FreqDist(itertools.chain(*tokenized_sentences))
        print("Found %d unique words tokens." % len(word_freq.items()))
        vocab = word_freq.most_common(vocabulary_size - 1)
        index = [sentence_start_token, sentence_end_token, unknown_token] + [x[0] for x in vocab]
        word_to_index = dict([(w, i) for i, w in enumerate(index)])
        print("The least frequent word in our vocabulary is '%s' and appeared %d times."
              % (vocab[-1][0], vocab[-1][1]))
        for i, sent in enumerate(tokenized_sentences):
            tokenized_sentences[i] = [w if w in word_to_index else unknown_token for w in sent]
        X_train = np.asarray([[0] + [word_to_index[w] for w in sent] for sent in tokenized_sentences])
        y_train = np.asarray([[word_to_index[w] for w in sent] + [1] for sent in tokenized_sentences])
        # X_train, y_train = [], []
        # for sent in tokenized_sentences:
        #     l = len(sent) - 1
        #     X_train.append(coo_matrix((np.ones((l)), (range(l), [word_to_index[w] for w in sent[:-1]])),
        #                               shape=(l, vocabulary_size)).toarray())
        #     y_train.append([word_to_index[w] for w in sent[1:]])
    else:
        sentences = nltk.sent_tokenize(txt.decode('utf-8').lower().replace('\n', ' '))
        index = ['^', '$'] + list(set(txt))
        char_to_index = dict([(w, i) for i, w in enumerate(index)])
        X_train = np.asarray([[0] + [char_to_index[w] for w in sent] for sent in sentences])
        y_train = np.asarray([[char_to_index[w] for w in sent] + [1] for sent in sentences])
    return X_train, y_train, index
def word_tokenize(tokens):
    # return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]


#from my.corenlp_interface import CoreNLPInterface
#url = 'vision-server2.corp.ai2'
#port = 8000
#interface = CoreNLPInterface(url, port)
#sent_tokenize = interface.split_doc
#word_tokenize = interface.split_sent
def tokenize(self, fileid):
    """
    Segments, tokenizes, and tags a document in the corpus. Returns a
    generator of paragraphs, which are lists of sentences, which in turn
    are lists of part of speech tagged words.
    """
    for paragraph in self.corpus.paras(fileids=fileid):
        yield [
            nltk.pos_tag(nltk.wordpunct_tokenize(sent))
            for sent in nltk.sent_tokenize(paragraph)
        ]
def get_sentences_nltk(text):
    text = text.replace('\n', ' ')
    text = text.replace('\t', ' ')
    sentences = [s.lower() for s in nltk.sent_tokenize(text) if s]
    return sentences
def performNameExtraction(text):
    # Returns a list of what NLTK defines as persons after processing the text passed into it.
    try:
        entity_names = []
        for sent in nltk.sent_tokenize(text):
            for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sent))):
                if hasattr(chunk, 'label') and chunk.label:
                    if chunk.label() == 'PERSON':
                        name_value = ' '.join(child[0] for child in chunk.leaves())
                        if name_value not in entity_names:
                            entity_names.append(name_value)
    except:
        print "Unexpected error:", sys.exc_info()[0]
    return entity_names
def tokenizeSentence(args):
    document = args['sentences']
    tokenized_sentences = nltk.sent_tokenize(document)
    return jsonify(tokenized_sentences)
def tokenizeWord(args):
    document = args['sentences']
    tokenized_sentences = nltk.tokenize.sent_tokenize(document)
    tokenized_words = nltk.word_tokenize(tokenized_sentences[0])
    return jsonify(tokenized_words)
def tokenize_individual_text(raw_text):
    '''
    Given raw_text, a string, return a list of tokens.
    '''
    return sum(map(nltk.word_tokenize, nltk.sent_tokenize(raw_text)), [])
def handle_negation(comments):
    sentences = nltk.sent_tokenize(comments)
    modified_st = []
    for st in sentences:
        allwords = nltk.word_tokenize(st)
        modified_words = []
        if negated(allwords):
            part_of_speech = nltk.tag.pos_tag(allwords, tagset='universal')
            chunked = chunk_parser.parse(part_of_speech)
            #print("---------------------------")
            #print(st)
            for n in chunked:
                if isinstance(n, nltk.tree.Tree):
                    words = [pair[0] for pair in n.leaves()]
                    #print(words)
                    if n.label() == 'NegP' and negated(words):
                        for i, (word, pos) in enumerate(n.leaves()):
                            if (pos == "ADV" or pos == "ADJ" or pos == "VERB") and (word != "not"):
                                modified_words.append(prepend_not(word))
                            else:
                                modified_words.append(word)
                    else:
                        modified_words.extend(words)
                else:
                    modified_words.append(n[0])
            newst = ' '.join(modified_words)
            #print(newst)
            modified_st.append(newst)
        else:
            modified_st.append(st)
    return ". ".join(modified_st)
def scored_document_phrases(documents, segmented=True):
    # If documents are not segmented and tagged, do so.
    if not segmented:
        documents = [
            nltk.sent_tokenize(document) for document in documents
        ]

    # Compose the documents as a list of their keyphrases
    documents = [
        list(extract_candidate_phrases(document, tagged=segmented))
        for document in documents
    ]

    # Create a lexicon of candidate phrases
    lexicon = gensim.corpora.Dictionary(documents)

    # Vectorize the documents by phrases for scoring
    vectors = [
        lexicon.doc2bow(document) for document in documents
    ]

    # Create the TF-IDF Model and compute the scores
    model = gensim.models.TfidfModel(vectors)
    scores = model[vectors]

    for doc in scores:
        yield [
            (lexicon[vec], score) for vec, score in doc
        ]
def preprocess(text):
    return [
        [
            list(nltk.pos_tag(nltk.word_tokenize(sent)))
            for sent in nltk.sent_tokenize(para)
        ]
        for para in text.split("\n\n")
    ]
def augment(pair):
    # convert single pair into multiple pairs
    question, answer = map(sent_tokenize, pair)
    q_sents = list(reversed(question))
    for _ in range(len(q_sents)):
        a_sents = answer[:]
        for _ in range(len(a_sents)):
            yield (' '.join(reversed(q_sents)), ' '.join(a_sents))
            a_sents.pop()
        q_sents.pop()
def factAnalysis(text):
    '''
    Goes through the text, tokenizes it by sentence, and returns a tuple containing:
    a boolean representing whether the text as a whole is real or fake; a confidence
    score determined by the number of votes against the verdict; and a list of tuples
    pairing each sentence with a boolean representing whether that sentence is true or fake.
    :return: tuple of boolean, double, and list
    '''
    text = sent_tokenize(text)
    trueCount = 0
    falseCount = 0
    sentenceLabels = []
    for sentence in text:
        features = findFeatures(sentence)
        if voteClassifier.classify(features):
            trueCount += 1
            sentenceLabels.append((sentence, True))
        else:
            falseCount += 1
            sentenceLabels.append((sentence, False))
    if not sentenceLabels:
        return False, False, False
    elif trueCount > falseCount:
        return True, 1 - falseCount / trueCount, sentenceLabels
    else:
        return False, 1 - trueCount / falseCount, sentenceLabels
def getSentences(corpus):
    '''tokenize the corpus into sentences'''
    sentences = nltk.sent_tokenize(corpus)
    sentences = [removePunctuations(sentence) for sentence in sentences]
    return sentences
def split_text(filename):
    with open(filename, 'rU') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # extra decoding to account for non UTF-8 characters
        sentences = itertools.chain(*[nltk.sent_tokenize(
            x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower()) for x in reader])
    return sentences
def split_sentences(self):
    print("Reading CSV file...")
    with open(self.train_file, 'rU') as f:
        reader = csv.reader(f, skipinitialspace=True)
        reader.next()
        # extra decoding to account for non UTF-8 characters
        self.sentences = itertools.chain(*[nltk.sent_tokenize(
            x[0].decode('latin-1').encode('utf-8').decode('utf-8').lower()) for x in reader])
        self.sentences = ["%s %s %s" % (
            self.sentence_start_token, x, self.sentence_end_token) for x in self.sentences]
    print("Parsed %d sentences." % (len(self.sentences)))
def __get_sentences(self, content, length):
    sentences = nltk.sent_tokenize(content.decode('utf-8'))
    res = " ".join(sentences[0:length])
    return res
def fetch_name(resume_text):
    tokenized_sentences = nltk.sent_tokenize(resume_text)
    for sentence in tokenized_sentences:
        for chunk in nltk.ne_chunk(nltk.pos_tag(nltk.word_tokenize(sentence), tagset='universal')):
            if hasattr(chunk, 'label'):  # and chunk.label() == 'PERSON':
                chunk = chunk[0]
                (name, tag) = chunk
                if tag == 'NOUN':
                    return name
    return "Applicant name couldn't be processed"