The following code examples, extracted from open-source Python projects, illustrate how string.punctuation is used in practice.
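As a quick orientation before the project examples: string.punctuation is a plain string constant, not a callable, and a very common way to strip those characters in Python 3 is a translation table built with str.maketrans. The short sketch below assumes only the standard library; the sample sentence is made up for illustration.

import string

# The constant is the set of ASCII punctuation characters:
# !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
print(string.punctuation)

# Common removal idiom (also used in several examples below):
# map every punctuation character to None via a translation table.
table = str.maketrans('', '', string.punctuation)
print("Hello, world! It's fine.".translate(table))  # -> Hello world Its fine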
def test_bag_of_words_for_series():
    dataset = fetch_20newsgroups(shuffle=True, random_state=1,
                                 remove=('headers', 'footers', 'quotes'))
    series = XSeries(dataset.data[:10])
    assert series.data_type == str

    translator = str.maketrans('', '', string.punctuation)
    tokenizer_transformer = XSeriesTransformer(
        transform_function=lambda text: text.lower().translate(translator).strip().split()
    )

    transformed_series = tokenizer_transformer.fit_transform(series)
    # print(transformed_series)

    bag_transform = BagOfWordsTransformer()
    transformed_series = bag_transform.fit_transform(transformed_series)
    # print(transformed_series)

    assert type(transformed_series) == XDataFrame
def __iter__(self):
    """
    Read a file where each line is of the form "word1 word2 ..."
    Yields lists of the form [word1, word2, ...]
    """
    if os.path.isdir(self.fname):
        filenames = [os.path.join(self.fname, f) for f in os.listdir(self.fname)]
    else:
        filenames = [self.fname]
    for filename in filenames:
        # with io.open(filename, encoding='utf-8') as f:
        with open(filename) as f:
            doc = f.read()
        for line in doc.split("\n"):
            # if not line: continue
            sent = "".join([ch for ch in line.lower() if ch not in string.punctuation]).strip().split()
            # sent = [word for word in line.strip().split()]
            sent = [self.begin] + sent + [self.end]
            yield sent
def __iter__(self):
    """
    Read a file where each line is of the form "word1 word2 ..."
    Yields lists of the form [word1, word2, ...]
    """
    if os.path.isdir(self.fname):
        filenames = [os.path.join(self.fname, f) for f in os.listdir(self.fname)]
    else:
        filenames = [self.fname]
    for filename in filenames:
        with open(filename) as f:
            doc = f.read()
        for line in doc.split("\n"):
            # if not line: continue
            sent = "".join([ch for ch in line.lower() if ch not in string.punctuation]).strip().split()
            # sent = [word for word in line.strip().split()]
            sent = [self.begin] + sent + [self.end]
            yield sent
def hexdump(src, length=16, sep='.'):
    """
    Displays a hex output of the content it is passed.

    This was based on https://gist.github.com/7h3rAm/5603718 with some
    minor modifications
    """
    allowed = digits + ascii_letters + punctuation + ' '
    print_map = ''.join(((x if x in allowed else '.') for x in map(chr, range(256))))
    lines = []

    for c in xrange(0, len(src), length):
        chars = src[c:c + length]
        hex = ' '.join(["%02x" % ord(x) for x in chars])
        if len(hex) > 24:
            hex = "%s %s" % (hex[:24], hex[24:])
        printable = ''.join(["%s" % ((ord(x) <= 127 and print_map[ord(x)]) or sep) for x in chars])
        lines.append("%08x: %-*s |%s|" % (c, length * 3, hex, printable))
    return '\n'.join(lines)
def attack():
    ip = socket.gethostbyname(host)
    global n
    msg = str(string.letters + string.digits + string.punctuation)
    data = "".join(random.sample(msg, 5))
    dos = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        n += 1
        dos.connect((ip, port))
        dos.send("GET /%s HTTP/1.1\r\n" % data)
        print "\n " + time.ctime().split(" ")[3] + " " + "[" + str(n) + "] #-#-# Hold Your Tears #-#-#"
    except socket.error:
        print "\n [ No connection! Server maybe down ] "
    dos.close()
def removeCommonWords(self, sentence, common_words, tokenized=False):
    """Takes a sentence and list of stopwords and removes the stopwords
    from the sentence."""
    if not tokenized:
        words = sentence.split(' ')
    else:
        words = sentence
    final_sentence = []
    for word in words:
        word = word.translate(string.maketrans("", ""), string.punctuation)
        word = word.lower()
        if word in common_words:
            continue
        else:
            final_sentence.append(word)
    return final_sentence
def ex3(argv):
    password = ''
    for i in range(len(argv)):
        for j in range(int(argv[i])):
            if i == 0:
                password += string.uppercase[random.randint(0, len(string.uppercase) - 1)]
            elif i == 1:
                password += string.lowercase[random.randint(0, len(string.lowercase) - 1)]
            elif i == 2:
                password += string.digits[random.randint(0, len(string.digits) - 1)]
            elif i == 3:
                password += string.punctuation[random.randint(0, len(string.punctuation) - 1)]
    return ''.join(random.sample(password, len(password)))
def mark(line):
    tmp_line = ''
    for c in line:
        if c in string.punctuation:
            if c != "'":
                tmp_line += ' ' + c + ' '
            else:
                tmp_line += ' ' + c
        else:
            tmp_line += c
    tmp_line = tmp_line.lower()
    words = [w for w in tmp_line.split() if len(w) > 0]
    for w in words:
        if w not in word2freq:
            word2freq[w] = 1
        else:
            word2freq[w] += 1
    return words
def _normalize_answer(s):
    """Normalize string to score answers according to SQuAD dataset scoring rules.

    Remove articles, remove punctuation, fix multiple whitespaces in string,
    and convert all characters to lowercase.
    """
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def tiny_tokenize(text, stem=False, stop_words=[]):
    words = []
    for token in wordpunct_tokenize(re.sub('[%s]' % re.escape(string.punctuation), ' ',
                                           text.decode(encoding='UTF-8', errors='ignore'))):
        if not token.isdigit() and not token in stop_words:
            if stem:
                try:
                    w = EnglishStemmer().stem(token)
                except Exception as e:
                    w = token
            else:
                w = token
            words.append(w)

    return words

    # return [EnglishStemmer().stem(token) if stem else token for token in wordpunct_tokenize(
    #     re.sub('[%s]' % re.escape(string.punctuation), ' ', text.decode(encoding='UTF-8', errors='ignore'))) if
    #     not token.isdigit() and not token in stop_words]
def _extract_values_from_rpdr_notes(
        rpdr_notes, phrase_type, phrases, ignore_punctuation,
        show_n_words_context_before, show_n_words_context_after):
    """Return a list of NotePhraseMatches for each note in rpdr_notes."""
    note_phrase_matches = []
    if ignore_punctuation:
        logging.info('ignore_punctuation is True, so we will also ignore '
                     'any punctuation in the entered phrases.')
        phrases = [_remove_punctuation(phrase) for phrase in phrases]
    match_contexts = PhraseMatchContexts(
        show_n_words_context_before, show_n_words_context_after)
    for rpdr_note in rpdr_notes:
        if ignore_punctuation:
            rpdr_note.remove_punctuation_from_note()
        phrase_matches = _extract_phrase_from_notes(phrase_type, phrases,
                                                    rpdr_note, match_contexts)
        note_phrase_matches.append(phrase_matches)
    match_contexts.print_ordered_contexts()
    return note_phrase_matches
def _words_plus_punc(self):
    """
    Returns mapping of form:
    {
        'cat,': 'cat',
        ',cat': 'cat',
    }
    """
    no_punc_text = REGEX_REMOVE_PUNCTUATION.sub('', self.text)
    # removes punctuation (but loses emoticons & contractions)
    words_only = no_punc_text.split()
    # remove singletons
    words_only = set(w for w in words_only if len(w) > 1)
    # the product gives ('cat', ',') and (',', 'cat')
    punc_before = {''.join(p): p[1] for p in product(PUNC_LIST, words_only)}
    punc_after = {''.join(p): p[0] for p in product(words_only, PUNC_LIST)}
    words_punc_dict = punc_before
    words_punc_dict.update(punc_after)
    return words_punc_dict
def ping(self, user, text=None):
    """Measure round-trip delay to another IRC client."""
    if self._pings is None:
        self._pings = {}

    if text is None:
        chars = string.letters + string.digits + string.punctuation
        key = ''.join([random.choice(chars) for i in range(12)])
    else:
        key = str(text)
    self._pings[(user, key)] = time.time()
    self.ctcpMakeQuery(user, [('PING', key)])

    if len(self._pings) > self._MAX_PINGRING:
        # Remove some of the oldest entries.
        byValue = [(v, k) for (k, v) in self._pings.items()]
        byValue.sort()
        excess = len(self._pings) - self._MAX_PINGRING
        for i in xrange(excess):
            del self._pings[byValue[i][1]]
def __init__(self, min_cut=0.1, max_cut=0.9):
    # indentation changes - we are inside the constructor
    # here we set up the behaviour
    # this is called each time an object of the frequency summarizer class is
    # created or instantiated
    self._min_cut = min_cut
    self._max_cut = max_cut
    # we save the val of the 2 parameters passed by assigning them to
    # two member variables - the 'self.' prefix identifies them as part
    # of the self argument - using underscore as first char.
    self._stopwords = set(stopwords.words('english') + list(punctuation))
    # this is a list of all common words and punctuation symbols

# indentation changes - we are out of the constructor here
# This is still the body of the class
# Defining a var here (outside a member function) but within the class makes
# the member var STATIC. This means it belongs to the class, and not
# to any specific individual instance (object) of the class
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def lemmatize_text(text, stop_words=STOPLIST, keep_pos=KEEP_POS):
    '''
    Function to lemmatize a single document of the corpus

    INPUT:
        text: string, text of review
        stop_words: words to remove from text, default STOPLIST defined above
        keep_pos: parts of speech to keep in text, default KEEP_POS def above
    OUTPUT:
        lemmatized text
    '''
    x = nlp(text)
    words = [tok.lemma_.strip(punctuation) for tok in x
             if (tok.pos_ in keep_pos) and
             (tok.lemma_.strip(punctuation) not in STOPLIST)]
    words.extend(['boss' for tok in x if tok.lemma_ == 'bos'])
    return ' '.join(words)
def header_link(title):
    """Return a github-style link target for a title.

    >>> header_link('Hello there!')
    'hello-there'
    """
    # This doesn't do the-title-1, the-title-2 etc. with multiple titles
    # with same text, but usually this doesn't matter.
    result = ''
    for character in title:
        if character in string.whitespace:
            result += '-'
        elif character in string.punctuation:
            pass
        else:
            result += character.lower()
    return result
def header_link(title):
    """Return a github-style link target for a title.

    >>> header_link('Hello there!')
    'hello-there'
    """
    # This doesn't handle multiple titles with the same text in the
    # same file, but usually that's not a problem. GitHub makes
    # links like the-title, the-title-1, the-title-2 etc.
    result = ''
    for character in title:
        if character in string.whitespace:
            result += '-'
        elif character in string.punctuation:
            pass
        else:
            result += character.lower()
    return result
async def _bots(self, ctx, amount: int = 100):
    """Clears bots and bot calls."""
    def check(m):
        if m.author.bot:
            return True
        for mem in m.mentions:
            if mem.bot:
                return True
        if m.content.startswith(tuple(i for i in string.punctuation)) and not bool(re.search(r'^<@!?(\d+)>', m.content)):
            return True
        return False

    messages = await self.bot.purge_from(ctx.message.channel, limit=amount, before=ctx.message, check=check)
    await self.bot.delete_message(ctx.message)
    send = await self.bot.say("Successfully cleared **{}** messages".format(len(messages)))
    await asyncio.sleep(3)
    await self.bot.delete_message(send)
def could_be(self, other):
    """Return True if the other PersonName is not explicitly inconsistent."""
    # TODO: Some suffix and title differences should be allowed
    if type(other) is not type(self):
        return NotImplemented
    if self == other:
        return True
    for attr in ['title', 'firstname', 'middlename', 'nickname',
                 'prefix', 'lastname', 'suffix']:
        if attr not in self or attr not in other:
            continue
        puncmap = dict((ord(char), None) for char in string.punctuation)
        s = self[attr].lower().translate(puncmap)
        o = other[attr].lower().translate(puncmap)
        if s == o:
            continue
        if attr in {'firstname', 'middlename', 'lastname'}:
            if (({len(comp) for comp in s.split()} == {1} and
                 [el[0] for el in o.split()] == s.split()) or
                ({len(comp) for comp in o.split()} == {1} and
                 [el[0] for el in s.split()] == o.split())):
                continue
        return False
    return True
def posNegCount(self, tweet):
    pos = 0
    neg = 0
    for p in list(punctuation):
        tweet = tweet.replace(p, '')
    tweet = tweet.lower()
    words = tweet.split(' ')
    word_count = len(words)
    for word in words:
        if word in self.positive_words:
            pos = pos + 1
        elif word in self.negative_words:
            neg = neg + 1
    return pos, neg
def LemNormalize(text):
    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)
    # shortword = re.compile(r'\W*\b\w{1,2}\b')
    # transformed = shortword.sub('', transformed)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # remove short words (less than 3 char)
    tokenized = [w for w in tokenized if len(w) > 3]
    tokenizer = LemTokens(tokenized)

    return tokenizer
def LemNormalizeIt(text):
    # convert non ascii characters
    text = text.encode('ascii', 'replace').decode()
    # remove punctuation and digits
    remove_punct_and_digits = dict([(ord(punct), ' ') for punct in string.punctuation + string.digits])
    transformed = text.lower().translate(remove_punct_and_digits)

    # tokenize the transformed string
    tokenized = nltk.word_tokenize(transformed)

    # apply lemming with morph it
    morph_it = load_morph_it()
    tokenized = [morph_it.get(w, w) for w in tokenized if len(w) > 3]

    return tokenized
def preprocessing(content):
    remove_punc = ('? ? ? ? ? ? ? ? ? —').split(' ')

    ## preprocessing #1 : remove XXenglishXX and numbers
    preprocessing_1 = re.compile(r'\d*', re.L)  ## only substitute numbers
    # preprocessing_1 = re.compile(r'\w*', re.L)  ## substitute number & English
    content = preprocessing_1.sub("", content)

    ## preprocessing #2 : remove punctuation
    preprocessing_2 = re.compile('[%s]' % re.escape(string.punctuation))
    content = preprocessing_2.sub("", content)

    ## preprocessing #3 : remove Chinese punctuation and multiple whitespaces
    content = content.replace('\n', '')
    for punc in remove_punc:
        content = content.replace(punc, '')
    try:
        content = parsing.strip_multiple_whitespaces(content)
    except:
        print 'Warning : failed to strip whitespaces @ '
    return content
def get_anilist_links(title):
    """Iterates through all search methods until link is constructed"""
    exclude = set(string.punctuation)
    title = ''.join(ch for ch in title if ch not in exclude)
    title = title.lower().split(' ')
    if 'season' in title:
        title.remove('season')
    title = ' '.join(title)
    anilist_regex = re.compile(r'http(s)?://anilist.co/anime/([0-9]){1,5}(/.*)?')
    link_dispatcher = {'api': _get_anilist_link_by_api}
    for _, v in link_dispatcher.items():
        anilist_url = v(title)
        if anilist_url is None:
            continue
        if re.match(anilist_regex, anilist_url) is not None:
            return anilist_url
    return
def search_crunchyroll(anime):
    """Searches if anime exists on Crunchyroll and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        crunchy_api = MetaApi()
        crunchyroll_listing = []
        while len(keywords) > 0:
            crunchyroll_listing = list(crunchy_api.search_anime_series(' '.join(keywords)))
            if len(crunchyroll_listing) <= 0:
                print('No crunchyroll listings found')
                keywords.pop()
                continue
            else:
                break
    except:
        print('Crunchyroll url couldn\'t be retrieved')
        return
    return crunchyroll_listing[0].url if len(crunchyroll_listing) > 0 else None
def search_funimation(anime):
    """Checks if anime exists on Funimation website and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        funi_url = None
        while len(keywords) > 0:
            show_slug = '-'.join(keywords).lower()
            funi_url = f'https://www.funimation.com/shows/{show_slug}/'
            funi_url = utilities.make_get_request(funi_url)
            if funi_url is None:
                keywords.pop()
                continue
            else:
                break
    except:
        print('Funimation url couldn\'t be retrieved')
        return
    return funi_url.url if funi_url is not None else None
def search_animelab(anime):
    """Checks if anime title exists on AnimeLab website and returns a link"""
    try:
        exclude = set(string.punctuation)
        anime = ''.join(ch for ch in anime if ch not in exclude)
        keywords = anime.split(' ')
        animelab_url = None
        while len(keywords) > 0:
            show_slug = '-'.join(keywords).lower()
            animelab_url = f'https://www.animelab.com/shows/{show_slug}'
            animelab_url = utilities.make_get_request(animelab_url)
            if animelab_url is None:
                keywords.pop()
                return
            else:
                break
    except:
        print('Animelab url couldn\'t be retrieved')
        return
    return animelab_url.url
def __init__(self, clean_config=None):
    self.cc = {
        'lower': False,
        'punctuation': False,
        'whitespace': False,
        'digit': False,
    }

    # Override clean config and validation check
    if clean_config != None:
        for key, value in clean_config.iteritems():
            if key in self.cc:
                if value not in [True, False, 1, 0]:
                    print ("Invalid: Incorrect boolean value: " + str(value) + " for key: " + str(key))
                else:
                    self.cc[key] = value
            else:
                print ("Invalid: Cleaner not recognized: " + str(key) + ", available Cleaners: " + ", ".join(self.cc.keys()))

    cleaners_applied = [key for key in self.cc if self.cc[key]]
    if cleaners_applied:
        print ("Applying Cleaners: " + ", ".join(cleaners_applied))
    else:
        print ("Warning: No cleaners in config")
def clean_text(self, txt):
    """
    function to clean a text on the basis of configurations
    mentioned in clean config.
    """
    txt = str(txt)
    if self.cc['lower']:
        txt = txt.lower()
    if self.cc['punctuation']:
        txt = "".join([x for x in txt if x not in punctuations])
    if self.cc['whitespace']:
        txt = "".join(txt.split()).strip()
    if self.cc['digit']:
        txt = "".join(x for x in txt if x not in "0987654321")
    return txt
def list_returns(fileToCheck, desiredInterface):
    returnsList = []
    newLine = ""
    with open(fileToCheck, 'r') as pyFile:
        for line in pyFile:
            if line.find("#") == -1:
                newFront = line.find("return")
                if newFront != -1:
                    possibleErrorMessageCheck1 = line.find("'")
                    bracketBefore = line.find("{")
                    lastBracket = line.find("}")
                    newLine = line[possibleErrorMessageCheck1:]
                    possibleErrorMessageCheck2 = newLine.find(" ")
                    if possibleErrorMessageCheck2 == -1:
                        line = line[newFront + 7:]
                        line.split()
                        line = [word.strip(punctuation) for word in line.split()]
                        returnsList.extend(line)
                    elif possibleErrorMessageCheck1 == bracketBefore + 1:
                        line = line[newFront + 7:lastBracket + 1]
                        line.split()
                        returnsList.append(line)
    return returnsList
def make_wifipassword(args):
    import random, string, hashlib
    if args.password is None:
        printable = string.digits + string.letters + string.punctuation
        args.password = ''.join([random.choice(printable) for i in xrange(32)])
    if args.password_id is None:
        args.password_id = random.randint(0x0010, 0xFFFF)
    pkhash = hashlib.sha256(args.pubkey.read()).digest()[0:20]

    record = nfc.ndef.WifiPasswordRecord()
    record.password['public-key-hash'] = pkhash
    record.password['password-id'] = args.password_id
    record.password['password'] = args.password

    message = nfc.ndef.Message(record)
    if args.outfile.name == "<stdout>":
        args.outfile.write(str(message).encode("hex"))
    else:
        args.outfile.write(str(message))
def _insert(self, x, y, text):
    """ Insert text at given x, y coordinates --- used with drag-and-drop. """

    # Clean text.
    import string
    text = filter(lambda x: x in (string.letters + string.digits + string.punctuation + ' '), text)

    # Find insertion point.
    index, flags = self.HitTest((x, y))

    if index == wx.NOT_FOUND:
        if flags & wx.LIST_HITTEST_NOWHERE:
            index = self.GetItemCount()
        else:
            return

    # Get bounding rectangle for the item the user is dropping over.
    rect = self.GetItemRect(index)

    # If the user is dropping into the lower half of the rect,
    # we want to insert _after_ this item.
    if y > rect.y + rect.height / 2:
        index += 1

    self.InsertStringItem(index, text)
def public_posts(self):
    now = datetime.now()
    # collect posts from the rss source and posts from the db that still have message_id=0
    posts_from_db = self.db.get_post_without_message_id()
    today_news = [i for i in self.src.news if (now - datetime.fromtimestamp(i.date)).days < 1]
    # keep only today's posts that have not been published yet, ordered by date
    for_publishing = list(set(today_news) & set(posts_from_db))
    for_publishing = sorted(for_publishing, key=lambda news: news.date)
    # for_publishing = sorted(today_news, key=lambda news: news.date)
    # publish each remaining post
    for post in tqdm(for_publishing, desc="Posting news"):
        header = base64.b64decode(post.text).decode('utf8')
        header = ''.join(c for c in header if c not in set(punctuation + '—«»'))
        header = '#' + '_'.join(header.lower().split())
        text = '%s %s' % (header, self.bit_ly.short_link(base64.b64decode(post.link).decode('utf8')))
        a = self.send_message(chat_id=self.chat_id, text=text)  # , parse_mode=telegram.ParseMode.HTML)
        message_id = a.message_id
        chat_id = a['chat']['id']
        self.db.update(post.link, chat_id, message_id)
        logging.info(u'Public: %s;%s;' % (str(post), message_id))
        time.sleep(self.delay_between_messages)
def rem_whitespace(string):
    """ careful to keep this order of patterns or
        duplicate whitespace created in first round
        will not be removed """
    unwanted_chars = punctuation + whitespace
    pat_l = [r'[' + unwanted_chars + ']',
             r'\s+',
             r' ',
             r' \\',
             r' \ ']
    for p in pat_l:
        rx = re.compile(p)
        string = re.sub(rx, ' ', string)
    return string.strip()
def get_tag_translate(self, tag):
    translate_dict = {
        "p": "p",
        "punctuation": "",
        "heading": "span style='font-style: bold'",
        # "heading": "span style='font-style: bold; font-size:150%'",
        # "h1": "span style='font-style: bold; font-size:150%'",
        "boldface": "b",
        "italics": "i",
        "underline": "u",
        "superscript": "sup",
        "subscript": "sup",
        "object": "object",
        "text": "html"}

    if tag in translate_dict:
        return translate_dict[tag]
    else:
        print("unsupported tag: ", tag)
        return tag
def add_token(self, token_string, token_pos=None):
    # get lemma string:
    if all(x in string.punctuation for x in token_string):
        token_pos = "PUNCT"
        lemma = token_string
    else:
        try:
            # use the current lemmatizer to assign the token to a lemma:
            lemma = self._lemmatize(token_string, self._pos_translate(token_pos)).lower()
        except Exception:
            lemma = token_string.lower()

    # get word id, and create new word if necessary:
    word_dict = {self.word_lemma: lemma, self.word_label: token_string}
    if token_pos and self.arguments.use_nltk:
        word_dict[self.word_pos] = token_pos
    word_id = self.table(self.word_table).get_or_insert(word_dict, case=True)

    # store new token in corpus table:
    return self.add_token_to_corpus(
        {self.corpus_word_id: word_id,
         self.corpus_sentence: self._sentence_id,
         self.corpus_file_id: self._file_id})
def normalize_answer(self, s):
    """Lower text and remove punctuation, articles and extra whitespace."""
    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords
    texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]

    return(texts)

# Build dictionary of words
def normalize_text(texts, stops):
    # Lower case
    texts = [x.lower() for x in texts]

    # Remove punctuation
    texts = [''.join(c for c in x if c not in string.punctuation) for x in texts]

    # Remove numbers
    texts = [''.join(c for c in x if c not in '0123456789') for x in texts]

    # Remove stopwords
    texts = [' '.join([word for word in x.split() if word not in (stops)]) for x in texts]

    # Trim extra whitespace
    texts = [' '.join(x.split()) for x in texts]

    return(texts)
def strip_punctuation(text):
    """
    strips the punctuation from a bunch of text
    """
    # build a translation table for str.translate:
    # there are other ways to do this:
    # create a translation table to replace all punctuation with spaces
    # -- then split() will remove the extra spaces
    punctuation = string.punctuation
    punctuation = punctuation.replace("'", "")  # keep apostrophes
    punctuation = punctuation.replace("-", "")  # keep hyphenated words

    # building a translation table
    table = {}
    for c in punctuation:
        table[ord(c)] = ' '

    # remove punctuation with the translation table
    text = text.translate(table)
    # remove "--" -- can't do multiple characters with translate
    text = text.replace("--", " ")
    return text
def make_words(text):
    """
    make a list of words from a large bunch of text

    strips all the punctuation and other stuff from a string
    """
    text = strip_punctuation(text)

    # lower-case everything to remove that complication:
    text = text.lower()

    # split into words
    words = text.split()

    # remove the bare single quotes: "'" is both a quote and an apostrophe
    # and capitalize "i"
    words2 = []
    for word in words:
        if word != "'":  # remove quote by itself
            # "i" by itself should be capitalized
            words2.append("I" if word == 'i' else word)
    # could be done with list comprehension too -- next week!
    # words2 = [("I" if word == 'i' else word) for word in words if word != "'"]
    return words2
def _get_base_doge_words(self, eng_text):
    """
    Get all base words from text to make doge phrases from.

    eg. 'Hello there, I am happy' -> ['hello', 'are', 'happy']

    Args:
        eng_text (str): Text to get words from.

    Returns:
        list[str]: List of lower case words to use from text.
    """
    phrase_no_punct = "".join([ch for ch in eng_text if ch not in string.punctuation])
    tagged_words = nltk.pos_tag([w.lower() for w in phrase_no_punct.split(' ') if w.isalpha()])
    chosen_words = []
    for word, tag in tagged_words:
        if tag[0] in ['N', 'V', 'J']:
            # make noun singular
            if tag[0] == 'N':
                word = self._lemmatizer.lemmatize(word, pos='n')
            # make verb infinitive
            elif tag[0] == 'V':
                word = self._lemmatizer.lemmatize(word, pos='v')
            chosen_words.append(word.encode('ascii', 'ignore'))  # lemmatize makes word unicode
    return list(set(chosen_words))
def str2index(str_):
    # clean white space
    str_ = ' '.join(str_.split())
    # remove punctuation and make lower case
    str_ = str_.translate(None, string.punctuation).lower()

    res = []
    for ch in str_:
        try:
            res.append(byte2index[ch])
        except KeyError:
            # drop OOV
            pass
    return res

# convert index list to string