The following 11 code examples, extracted from open-source Python projects, illustrate how to use gensim.corpora.WikiCorpus().
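The recurring pattern across these examples is to open a compressed Wikipedia dump with WikiCorpus, passing dictionary={} so that gensim skips its expensive vocabulary-building pass, and then stream articles through get_texts(). A minimal sketch of that pattern, with a hypothetical dump filename and output path:

    import logging
    from gensim.corpora import WikiCorpus

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

    # dictionary={} skips building a gensim Dictionary, which plain-text extraction does not need
    wiki_corpus = WikiCorpus('enwiki-latest-pages-articles.xml.bz2', dictionary={})

    with open('wiki_texts.txt', 'w', encoding='utf-8') as output:
        for tokens in wiki_corpus.get_texts():
            # newer gensim versions yield str tokens; older ones yield bytes,
            # in which case join with b' ' and decode to UTF-8 as several examples below do
            output.write(' '.join(tokens) + '\n')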
Example 1

def formatTime(seconds):
    """
    Takes a number of elapsed seconds and returns a string in the format h:mm.
    """
    m, s = divmod(seconds, 60)
    h, m = divmod(m, 60)
    return "%d:%02d" % (h, m)

# TODO - Add example code for loading each item back from disk (if needed).
#      - Maybe a commented line below the 'save' command?

# ======== main ========
# Main entry point for the script.

# This little check has to do with the multiprocessing module (which is used by
# WikiCorpus). Without it, the code will spawn infinite processes and hang!
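The check mentioned in the comment above is truncated in this excerpt; it is the standard Python entry-point guard. A minimal sketch, assuming the script's work is wrapped in a main() function:

    if __name__ == '__main__':
        # Only the launching process runs main(); worker processes spawned by
        # multiprocessing re-import the module and skip this block, which is what
        # prevents the runaway process creation the comment warns about.
        main()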
Example 2

def wiki2texts(self, wiki_data_path, wiki_texts_path='./wiki_texts.txt'):
    """
    Convert a Wikipedia dump into plain text.

    Arguments:
        wiki_data_path -- path to the Wikipedia dump
    """
    if not wiki_data_path:
        print("Please provide a Wiki dump; it can be downloaded from https://dumps.wikimedia.org/zhwiki/")
        exit()

    # Load the Wikipedia dump
    wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})
    texts_num = 0

    with open(wiki_texts_path, 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
    print("Conversion finished. Use OpenCC to convert the text between Simplified and Traditional Chinese.")
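The closing message above refers to OpenCC for converting between Simplified and Traditional Chinese. A minimal sketch using the opencc Python package; the exact configuration name ('s2t' here, 's2t.json' in some builds) depends on the installed package, so treat it as an assumption:

    from opencc import OpenCC

    cc = OpenCC('s2t')  # Simplified-to-Traditional; use 't2s' for the opposite direction
    with open('wiki_texts.txt', encoding='utf-8') as src, \
         open('wiki_texts_zht.txt', 'w', encoding='utf-8') as dst:
        for line in src:
            dst.write(cc.convert(line))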
Example 3

import re
from gensim.corpora import WikiCorpus


def zhwiki2chars(in_file, out_file):
    reg = re.compile(r'^[a-zA-Z]+$')

    def _isalpha(string):
        return reg.match(string) is not None

    i = 0
    out = open(out_file, 'w')
    wiki = WikiCorpus(in_file, lemmatize=False, dictionary={})
    for article in wiki.get_texts():
        tokens = []
        for token in article:
            token = token.decode("utf-8").strip()
            if _isalpha(token):
                continue
            tokens.append(" ".join(token))  # divided by character
        out.write(" ".join(tokens) + "\n")
        i += 1
        if i % 10000 == 0:
            print("processed %d articles" % i)
    out.close()
Example 4

import sys
import io
import logging
from gensim.corpora import WikiCorpus


def main():
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0
    with io.open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
Example 5

import sys
import logging
from gensim.corpora import WikiCorpus


def main():
    if len(sys.argv) != 2:
        print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
        exit()

    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    texts_num = 0
    with open("wiki_texts.txt", 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(' '.join(text) + '\n')  # newer gensim yields str tokens, no decoding needed
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
Example 6

def set_wiki_to_txt(self, wiki_data_path=None):
    if wiki_data_path is None:
        # No path given, so read it from the command line
        if len(sys.argv) != 2:
            print("Usage: python3 " + sys.argv[0] + " wiki_data_path")
            exit()
        else:
            wiki_corpus = WikiCorpus(sys.argv[1], dictionary={})
    else:
        wiki_corpus = WikiCorpus(wiki_data_path, dictionary={})

    # Convert wiki.xml to wiki.txt
    with open("wiki_text.txt", 'w', encoding='utf-8') as output:
        text_count = 0
        for text in wiki_corpus.get_texts():
            # tokens are stored as bytes here, so join and decode as UTF-8
            output.write(b' '.join(text).decode('utf-8') + '\n')
            text_count += 1
            if text_count % 10000 == 0:
                logging.info("Converted %d articles" % text_count)
    print("Conversion finished!")
Example 7

def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    self.fname = fname
    self.logger = startlog()
    self.corpus = WikiCorpus(fname, lemmatize=_lemmatize, dictionary=_dictionary)
Example 8

def __init__(self, fname, _lemmatize=False, _dictionary={}, filter_namespaces=('0',)):
    self.fname = fname
    self.logger = startlog()
    self.corpus = WikiCorpus(fname, lemmatize=_lemmatize, dictionary=_dictionary)
    self.traincorpusfname = None
Example 9

def wikiToTxt(self):
    # This function takes about 25 minutes
    from gensim.corpora import WikiCorpus
    wiki_corpus = WikiCorpus('./build/zhwiki-latest-pages-articles.xml.bz2', dictionary={})
    texts_num = 0
    with open('./build/wiki_texts.txt', 'w', encoding='utf-8') as output:
        for text in wiki_corpus.get_texts():
            output.write(b' '.join(text).decode('utf-8') + '\n')
            texts_num += 1
            if texts_num % 10000 == 0:
                logging.info("Processed %d articles" % texts_num)
Example 10

import logging
from gensim import corpora


def get_save_wikitext(wiki_filename, text_filename):
    output = open(text_filename, 'w')
    wiki = corpora.WikiCorpus(wiki_filename, lemmatize=False, dictionary={})
    i = 0
    for text in wiki.get_texts():
        # text = delNOTNeedWords(text, "../../stopwords.txt")[1]
        output.write(" ".join(text) + "\n")
        i = i + 1
        if (i % 10000 == 0):
            logging.info("Saved " + str(i) + " articles")
    output.close()
Example 11

def main():
    parser = argparse.ArgumentParser(description='Create a corpus from a collection of tweets and/or build an LDA model')
    subparsers = parser.add_subparsers(dest='mode')

    text_corpus_parser = subparsers.add_parser('text', help='Build corpus from directory of text files')
    text_corpus_parser.add_argument('-d', '--docs_loc', required=True, action='store', dest='docs_loc', help='Directory where tweet documents stored')
    text_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
    text_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')

    wiki_corpus_parser = subparsers.add_parser('wiki', help='Build corpus from compressed Wikipedia articles')
    wiki_corpus_parser.add_argument('-w', '--wiki_loc', required=True, action='store', dest='wiki_loc', help='Location of compressed Wikipedia dump')
    wiki_corpus_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location and name to save corpus')
    wiki_corpus_parser.add_argument('-m', '--lemma', action='store_true', dest='lemma', help='Use this option to lemmatize words')

    lda_model_parser = subparsers.add_parser('lda', help='Create LDA model from saved corpus')
    lda_model_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
    lda_model_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_model_parser.add_argument('-n', '--num_topics', required=True, action='store', dest='num_topics', help='Number of topics to assign to LDA model')
    lda_model_parser.add_argument('-p', '--num_pass', required=True, action='store', dest='num_pass', help='Number of passes through corpus when training the LDA model')
    lda_model_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location and name to save LDA model')

    lda_vis_parser = subparsers.add_parser('ldavis', help='Create visualization of LDA model')
    lda_vis_parser.add_argument('-c', '--corp_loc', required=True, action='store', dest='corp_loc', help='Location of corpus')
    lda_vis_parser.add_argument('-d', '--dict_loc', required=True, action='store', dest='dict_loc', help='Location of dictionary')
    lda_vis_parser.add_argument('-l', '--lda_loc', required=True, action='store', dest='lda_loc', help='Location of LDA model')

    argcomplete.autocomplete(parser)
    args = parser.parse_args()

    if args.mode == 'text':
        doc_corpus = DocCorpus(args.docs_loc, args.lemma)
        doc_corpus.dictionary.filter_extremes(no_below=1, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
        MmCorpus.serialize(args.corp_loc + '.mm', doc_corpus)
        doc_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'wiki':
        if args.lemma:
            wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=True, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)
        else:
            wiki_corpus = WikiCorpus(args.wiki_loc, lemmatize=False, tokenizer_func=wiki_tokenizer, article_min_tokens=100, token_min_len=3, token_max_len=15)
        wiki_corpus.dictionary.filter_extremes(no_below=5, no_above=0.5, keep_n=DEFAULT_DICT_SIZE)
        MmCorpus.serialize(args.corp_loc + '.mm', wiki_corpus)
        wiki_corpus.dictionary.save(args.corp_loc + '.dict')

    if args.mode == 'lda':
        build_LDA_model(args.corp_loc, args.dict_loc, args.num_topics, args.num_pass, args.lda_loc)

    if args.mode == 'ldavis':
        build_pyLDAvis_output(args.corp_loc, args.dict_loc, args.lda_loc)
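The 'wiki' branch above serializes the corpus in Matrix Market format and saves the dictionary next to it. Loading both back for later LDA training could look like the following sketch, where the wiki_corpus base name stands in for a hypothetical --corp_loc value:

    from gensim.corpora import Dictionary, MmCorpus

    corpus = MmCorpus('wiki_corpus.mm')               # bag-of-words corpus in Matrix Market format
    dictionary = Dictionary.load('wiki_corpus.dict')  # token-id mapping saved by dictionary.save()
    print('%d documents, %d unique tokens' % (len(corpus), len(dictionary)))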