def analyze(content, font_path="/Library/Fonts/AppleGothic.ttf"):
    """Show a word cloud of the most frequent nouns extracted from ``content``.

    ``content`` is converted with ``str()`` and passed to ``t.nouns``; the
    resulting nouns are filtered against a fixed stop-word list, counted, and
    the 100 most common ones are rendered with ``WordCloud`` and displayed
    through matplotlib (``plt.show()``).

    Args:
        content: Object whose string form is the text to analyse.
        font_path: Font file handed to ``WordCloud``. Defaults to the macOS
            AppleGothic font the function originally hard-coded.

    Returns:
        None. The figure is shown as a side effect.
    """
    # `t` is a module-level object defined outside this block
    # (presumably a noun-extracting tagger -- confirm against its definition).
    nouns = t.nouns(str(content))

    # Stop words to drop before counting.
    # NOTE(review): these literals look like they lost their original
    # non-ASCII encoding in this copy of the file; they are kept exactly as
    # found -- restore them from the original source if available.
    trash = ["??", "????", "??", "??", "??", "??", "?????"]
    stop_words = frozenset(trash)

    # Build a new list rather than calling nouns.remove() while iterating
    # over nouns: removing during iteration skips the following element, so
    # adjacent stop words were left behind.
    nouns = [noun for noun in nouns if noun not in stop_words]

    ko = nltk.Text(nouns, name="??")

    # Frequency table of the 100 most common nouns, as {word: count}.
    ranking = ko.vocab().most_common(100)
    frequencies = dict(ranking)

    wordcloud = WordCloud(
        font_path=font_path,
        relative_scaling=0.2,
        background_color="white",
    ).generate_from_frequencies(frequencies)

    # Display the rendered cloud without axes.
    plt.figure(figsize=(16, 8))
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
def guess_by_frequency(self):
    """Replace words containing ``'?'`` in ``self.text`` with wordlist matches.

    The wordlist is read from ``Lingvo/wordlist.txt`` next to this module.
    Every distinct word of ``self.text`` that contains a ``'?'`` is compared,
    most frequent first, against the wordlist entries using
    ``check_similarity``; the first entry that matches becomes its
    replacement. ``self.text`` is then rebuilt from the (possibly replaced)
    words joined by single spaces.

    Returns:
        bool: ``True`` when the text was processed, ``False`` when the
        wordlist file could not be found (``self.text`` is left untouched
        and a critical log record is emitted).
    """
    wordlist_path = os.path.join(os.path.dirname(__file__), "Lingvo/wordlist.txt")
    try:
        with open(wordlist_path, 'r') as words_file:
            input_data = words_file.read().split()
    except FileNotFoundError:
        logging.critical("Wordlist could not be found.")
        return False

    words = self.text.split()

    # Choose a replacement for every distinct word that needs one.
    to_replace = {}
    for word, _count in nltk.FreqDist(words).most_common():
        if '?' not in word:
            continue
        for sample_word in input_data:
            if check_similarity(word, sample_word):
                to_replace[word] = sample_word
                break

    # Join the words directly. The previous nltk.Text(words).name[:-3] trick
    # relied on the auto-generated Text name, which only contains the first
    # 8 tokens, so longer texts were silently truncated.
    self.text = " ".join(to_replace.get(word, word) for word in words)
    return True
def analyze(content, url, title):
    """Derive up to three topic keywords for ``content`` with a one-topic LDA.

    ``content`` is split on ``\\w+`` runs; purely numeric tokens and stop
    words are dropped, the remainder is stemmed, and a single-topic gensim
    LDA model is trained on that one document. The three highest-weighted
    terms of the topic are returned.

    Args:
        content: Text to analyse.
        url: Passed through unchanged into the result.
        title: Passed through unchanged into the result.

    Returns:
        tuple: ``(keywords, url, title)`` where ``keywords`` is a list of
        topic terms, or ``["Not English", "Maybe Chinese?"]`` when no usable
        token could be extracted from ``content``.
    """
    tokenizer = RegexpTokenizer(r'\w+')
    p_stemmer = LancasterStemmer()

    # One set for O(1) membership: library stop words plus the extra tokens
    # the original filtered in a second pass.
    stop_words = set(get_stop_words('en'))
    stop_words.update(['The', 'can', 's', 'I', 't', 'am', 'are'])

    stemmed_tokens = [
        p_stemmer.stem(token)
        for token in tokenizer.tokenize(content)
        if not token.isdigit() and token not in stop_words
    ]

    # With nothing left to model, gensim's LdaModel raises on the empty
    # vocabulary, which made the fallback below unreachable in that case.
    # Return the fallback directly instead.
    if not stemmed_tokens:
        return (["Not English", "Maybe Chinese?"], url, title)

    texts = [stemmed_tokens]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus, num_topics=1, id2word=dictionary, passes=20
    )

    # show_topic yields pairs; the first element of each is kept as the term.
    Rtopic = [topic for topic, _rate in ldamodel.show_topic(0, 3)]
    if not Rtopic:
        Rtopic = ["Not English", "Maybe Chinese?"]
    return (Rtopic, url, title)
def Text(str1):
    """Wrap ``str1`` in an ``nltk.Text``.

    A list is used as the token sequence as-is; any other value is converted
    with ``str()`` and tokenized with ``word_tokenize`` first.
    """
    if isinstance(str1, list):
        return nltk.Text(str1)
    tokens = word_tokenize(str(str1))
    return nltk.Text(tokens)