Python jieba module: lcut() usage examples

The following code examples, extracted from open-source Python projects, illustrate how to use jieba.lcut().
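
Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of the jieba.lcut calls these snippets rely on: precise mode, full mode via cut_all=True, disabling HMM-based new-word discovery via HMM=False, and the POS-tagged variant jieba.posseg.lcut.

# -*- coding: utf-8 -*-
# Minimal illustrative sketch of jieba.lcut (standalone example, not from the projects below)
import jieba
import jieba.posseg as pseg

sentence = u"我们正在学习中文分词"

words = jieba.lcut(sentence)                    # precise mode: returns a list of tokens
full_mode = jieba.lcut(sentence, cut_all=True)  # full mode: emit every possible word
no_hmm = jieba.lcut(sentence, HMM=False)        # disable HMM-based discovery of unseen words
tagged = pseg.lcut(sentence)                    # POS-tagged segmentation: list of pair(word, flag)

print(words)
print(full_mode)
print(no_hmm)
print([(p.word, p.flag) for p in tagged])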

Project: tensorflow_novelist-master    Author: charlesXu86    | Project source | File source
def process_sent(sent, vocab_int, steps):
    """
    Tokenize a sentence and turn it into a fixed-length 2-D numpy array.
    :param sent: raw sentence string
    :param vocab_int: dict mapping word -> integer index
    :param steps: fixed sequence length
    :return: numpy array of shape (1, steps)
    """
    sent_list = jieba.lcut(sent)
    # if a word is not in the vocab dict, give it a random index (which may collide with another word)
    index_list = [vocab_int[i] if i in vocab_int else np.random.randint(0, 90) for i in sent_list]
    if len(index_list) < steps:
        index_list = np.hstack((index_list, np.random.randint(0, 90, steps - len(index_list))))
    else:
        index_list = index_list[0: steps]
    return np.array([index_list])
Project: tensorflow_novelist-master    Author: charlesXu86    | Project source | File source
def prepare_data(self):
        corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
        vocabs = []
        for l in corpus_cut:
            for i in l:
                vocabs.append(i)
        # vocabs = reduce(lambda x, y: x+y, corpus_cut)
        # count the frequency of every word
        # keep even the most frequent words for now; no filtering is applied
        counter = collections.Counter(vocabs)
        counter = counter.most_common()
        vocabs_set, _ = zip(*counter)
        vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}

        data_flatten = np.array([vocab_int_map[v] for v in vocabs])
        # build overlapping windows of length n_steps + 1 with a stride of 3
        data = np.array([data_flatten[i: i + self.n_steps + 1]
                         for i in range(0, data_flatten.shape[0] - self.n_steps - 1, 3)])
        # shuffle the data
        np.random.shuffle(data)
        return len(vocabs_set), vocab_int_map, data
Project: tensorflow_novelist-master    Author: charlesXu86    | Project source | File source
def prepare_data(self):
        corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
        vocabs = []
        for l in corpus_cut:
            for i in l:
                vocabs.append(i)
        # vocabs = reduce(lambda x, y: x+y, corpus_cut)
        # count the frequency of every word
        # keep even the most frequent words for now; no filtering is applied
        counter = collections.Counter(vocabs)
        counter = counter.most_common()
        vocabs_set, _ = zip(*counter)
        vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}

        data_flatten = np.array([vocab_int_map[v] for v in vocabs])
        # take a window of length n_steps + 1 starting at each index 0 .. N // (n_steps + 1) - 1
        data = np.array([data_flatten[i: i + self.n_steps + 1]
                         for i in range(data_flatten.shape[0] // (self.n_steps + 1))])
        # shuffle the data
        np.random.shuffle(data)
        return len(vocabs_set), vocab_int_map, data
Project: Book_DeepLearning_Practice    Author: wac81    | Project source | File source
def delNOTNeedWords(content,customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords is None:
        customstopwords = "stopwords.txt"
    import os
    if os.path.exists(customstopwords):
        stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
        customstopwords = stop_words

    result=''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # optionally append "/" + POS flag + " "
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword)>0 and flag in [u'n',u'nr',u'ns',u'nt',u'nz',u'ng',u't',u'tg',u'f',u'v',u'vd',u'vn',u'vf',u'vx',u'vi',u'vl',u'vg', u'a',u'an',u'ag',u'al',u'm',u'mq',u'o',u'x']):
            # and flag[0] in [u'n', u'f', u'a', u'z']):
            # ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]): #??????????????????
            result += tempword # +"/"+str(w.flag)+" "  #????
            return_words.append(tempword)
    return result,return_words
Project: news-search-engine    Author: 01joy    | Project source | File source
def result_by_time(self, sentence):
        seg_list = jieba.lcut(sentence, cut_all=False)
        n, cleaned_dict = self.clean_list(seg_list)
        time_scores = {}
        for term in cleaned_dict.keys():
            r = self.fetch_from_db(term)
            if r is None:
                continue
            docs = r[2].split('\n')
            for doc in docs:
                docid, date_time, tf, ld = doc.split('\t')
                docid = int(docid)
                if docid in time_scores:
                    continue
                news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
                now_datetime = datetime.now()
                td = now_datetime - news_datetime
                td = td.total_seconds() / 3600  # age in hours
                time_scores[docid] = td
        time_scores = sorted(time_scores.items(), key = operator.itemgetter(1))
        if len(time_scores) == 0:
            return 0, []
        else:
            return 1, time_scores
Project: news-search-engine    Author: 01joy    | Project source | File source
def gen_idf_file(self):
        files = listdir(self.doc_dir_path)
        n = float(len(files))
        idf = {}
        for i in files:
            root = ET.parse(self.doc_dir_path + i).getroot()
            title = root.find('title').text
            body = root.find('body').text
            seg_list = jieba.lcut(title + '?' + body, cut_all=False)
            seg_list = set(seg_list) - self.stop_words
            for word in seg_list:
                word = word.strip().lower()
                if word == '' or self.is_number(word):
                    continue
                if word not in idf:
                    idf[word] = 1
                else:
                    idf[word] = idf[word] + 1
        idf_file = open(self.idf_path, 'w', encoding = 'utf-8')
        for word, df in idf.items():
            idf_file.write('%s %.9f\n'%(word, math.log(n / df)))
        idf_file.close()
Project: recommended_system    Author: wac81    | Project source | File source
def delstopwords(content):
    result=''

    words = jieba.lcut(content)
    return_words = []
    for w in words:
        if w not in app.config['stopwords']:
            result += w.encode('utf-8')  # optionally append "/" + POS flag + " "
            return_words.append(w.encode('utf-8'))

    # words = pseg.lcut(content)
    # with app.test_request_context():
    # for word, flag in words:
    #     if (word not in app.config['stopwords'] and flag not in ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]):  # drop stopwords and unwanted POS tags
    #         result += word.encode('utf-8')  # optionally append "/" + str(flag) + " "
    #             print result
    return result,return_words
Project: recommended_system    Author: wac81    | Project source | File source
def delNOTNeedWords(content,stopwords):
    # words = jieba.lcut(content)
    result=''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # optionally append "/" + POS flag + " "

    words = pseg.lcut(content)
    # jieba.cut()
    text_list = []
    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and flag not in ["/x","/zg","/uj","/ul","/e","/d","/uz","/y"]):  # drop stopwords and unwanted POS tags
            # text_list.append(word.encode('utf-8'))
            result += word.encode('utf-8')  # optionally append "/" + str(flag) + " "
        # ''.join(text_list)
    return result
    # return ''.join(text_list)
Project: Answer_Selection    Author: xjtushilei    | Project source | File source
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = []
    for seg in segs:
        if seg not in stopwords:
            final.append(seg)
    return final
Project: Answer_Selection    Author: xjtushilei    | Project source | File source
def cut_with_stop_words(string):
    # stopword filtering is disabled in this variant; return the raw segmentation
    segs = jieba.lcut(string)
    return segs
Project: Answer_Selection    Author: xjtushilei    | Project source | File source
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = ''
    for seg in segs:
        if seg not in stopwords:
            final = final + seg
    return final
Project: basic-encoder-decoder    Author: pemywei    | Project source | File source
def main(_):
    print("Loading vocabulary")
    cn_vocab_path = os.path.join(FLAGS.data_dir, "source_vocab.txt")
    en_vocab_path = os.path.join(FLAGS.data_dir, "target_vocab.txt")

    cn_vocab, _ = data_utils.initialize_vocabulary(cn_vocab_path)
    _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path)

    print("Building model...")
    config = tf.ConfigProto(allow_soft_placement=True)
    with tf.Session(config=config) as sess:
        model = create_model(sess, False)
        # Decode from standard input.
        sys.stdout.write("> ")
        sys.stdout.flush()
        sentence = sys.stdin.readline()
        while sentence:
            seg_list = jieba.lcut(sentence.strip())
            #print(" ".join(seg_list))
            token_ids = [cn_vocab.get(w.encode(encoding="utf-8"), data_utils.UNK_ID) for w in seg_list]
            #print(token_ids)
            outputs = model.test(sess, token_ids)
            outputs = outputs.tolist()
            if data_utils.EOS_ID in outputs:
                outputs = outputs[:outputs.index(data_utils.EOS_ID)]
            output = " ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs])
            print(output.capitalize())
            print("> ")
            sys.stdout.flush()
            sentence = sys.stdin.readline()
Project: CNKICrawler    Author: roliygu    | Project source | File source
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
Project: CNKICrawler    Author: roliygu    | Project source | File source
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """

    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]

    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    else:
        return res
Project: tf_tang_poems    Author: charlesXu86    | Project source | File source
def process_lyrics(file_name):
    lyrics = []
    content = clean_cn_corpus(file_name, clean_level='all', is_save=False)
    for l in content:
        if len(l) < 40:
            continue
        l = start_token + l + end_token
        lyrics.append(l)
    lyrics = sorted(lyrics, key=lambda line: len(line))
    print('all %d songs...' % len(lyrics))

    # if not os.path.exists(os.path.dirname(segment_list_file)):
    #     os.mkdir(os.path.dirname(segment_list_file))
    # if os.path.exists(segment_list_file):
    #     print('load segment file from %s' % segment_list_file)
    #     with open(segment_list_file, 'rb') as p:
    #         all_words = pickle.load(p)
    # else:
    all_words = []
    for lyric in lyrics:
        all_words += jieba.lcut(lyric, cut_all=False)
        # with open(segment_list_file, 'wb') as p:
        #     pickle.dump(all_words, p)
        #     print('segment result have been save into %s' % segment_list_file)

    # count how many times each word appears
    counter = collections.Counter(all_words)
    print(counter['E'])
    # sort words by frequency, most frequent first
    counter_pairs = sorted(counter.items(), key=lambda x: -x[1])
    words, _ = zip(*counter_pairs)
    print('E' in words)

    words = words[:len(words)] + (' ',)  # append a blank token to the vocabulary
    word_int_map = dict(zip(words, range(len(words))))
    # translate all lyrics into int vector
    lyrics_vector = [list(map(lambda word: word_int_map.get(word, len(words)), lyric)) for lyric in lyrics]
    return lyrics_vector, word_int_map, words
Project: seq2seq_chatterbot    Author: StephenLee2016    | Project source | File source
def segement(self, strs):
        return jieba.lcut(strs)
Project: Chinese-QA    Author: distantJing    | Project source | File source
def word_tokenize(tokens):
#   return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]

#from my.corenlp_interface import CoreNLPInterface
#url = 'vision-server2.corp.ai2'
#port = 8000
#interface = CoreNLPInterface(url, port)
#sent_tokenize = interface.split_doc
#word_tokenize = interface.split_sent
Project: SwissCheese-at-SemEval-2016    Author: naivechen    | Project source | File source
def get_train_data(language):

    # Load data from files
    path = "./data/" + language + "/"
    positive_examples = list(open(path + "rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples[:100]]   # -1000
    negative_examples = list(open(path + "rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples[:100]]

    x_text = positive_examples + negative_examples

    x_text = [sent for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    # Build vocabulary
    max_length_of_sentence = max([len(jieba.lcut(x)) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_length_of_sentence)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(1234)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/cross-validation set
    cross_validation_indices = np.array(random.sample(np.arange(len(y)), int(len(y) * 0.1) )) 
    train_indices = np.array(list(set(np.arange(len(y))) - set(cross_validation_indices)))

    x_train, x_dev = x_shuffled[train_indices], x_shuffled[cross_validation_indices]
    y_train, y_dev = y_shuffled[train_indices], y_shuffled[cross_validation_indices]

    return [x_train, x_dev, y_train, y_dev, vocab_processor]
Project: FineGrainedOpinionMining    Author: chaoming0625    | Project source | File source
def cut(sentence):
    if not __init_seg:
        __init()
    return jieba.lcut(sentence)
Project: FAQrobot    Author: ofooo    | Project source | File source
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
        """
        ????????????????????
        simType=simple, simple_POS, vec
        """
        self.lastTxt.append(intxt)
        if simType not in ('simple', 'simple_pos', 'vec'):
            return 'error:  maxSimTxt?simType?????: {}'.format(simType)

        # ??????????????? simple_pos ??
        embedding = self.vecModel
        if simType == 'vec' and not embedding:
            simType = 'simple_pos'

        for t in self.zhishiku:
            questions = t.q_vec if simType == 'vec' else t.q_word
            in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

            t.sim = max(
                similarity(in_vec, question, method=simType, embedding=embedding)
                for question in questions
            )
        maxSim = max(self.zhishiku, key=lambda x: x.sim)
        logger.info('maxSim=' + format(maxSim.sim, '.0%'))

        if maxSim.sim < simCondision:
            return 'Sorry, no sufficiently similar question was found in the knowledge base.'

        return maxSim.a
Project: wende    Author: h404bi    | Project source | File source
def tokenize(question, on='jieba'):
    """ ???????????
    :param question: ???????
    :return: ?????????
    """
    if on == 'ltp':
        # LTP ??
        words = segmentor.segment(question.encode('utf-8'))
        rv = _remove_stopwords([i.decode('utf-8') for i in words])
    else:
        # jieba ??
        rv = _remove_stopwords(jieba.lcut(question))
    logging.debug("NLP:tokenize: {}".format(" ".join(rv)))
    return rv
Project: Text-Classifier    Author: daniellaah    | Project source | File source
def words_extract(news_folder):
    """??????????
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "<category>")

    jieba.enable_parallel(4)
    # tokenize every news file under each category folder
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
               content = f.read()
            word_list = jieba.lcut(content)
            data_list.append((word_list, news_class))  # element: ([word1, word2, ...], "<category>")
    jieba.disable_parallel()
    return data_list
Project: Text-Classifier    Author: daniellaah    | Project source | File source
def predict_with_content(classifier, news_content, feature_words):
    word_list = jieba.lcut(news_content)
    x = np.array([1 if word in word_list else 0 for word in feature_words]).reshape(1, -1)
    return classifier.predict(x)[0]
Project: Text-Classifier    Author: daniellaah    | Project source | File source
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"):
    """????????????
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    stopwords = get_stopwords(stopwords_file)
    feature_words_dict = {}
    # tokenize every news file in each category and count word frequencies
    jieba.enable_parallel(4)
    for news_class in news_classes:
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in word_list:
                    if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords:
                        if word in feature_words_dict:
                            feature_words_dict[word] += 1
                        else:
                            feature_words_dict[word] = 1
    jieba.disable_parallel()
    feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True)
    feature_words = list(list(zip(*feature_words_tuple))[0])
    return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
Project: Text-Classifier    Author: daniellaah    | Project source | File source
def get_probability(news_folder, feature_words):
    """????, prob_matrix, prob_classes
    Args:
        news_folder/
            ??/
            ??/
            ??/
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder) \
                        if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "<category>")
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0
    # count, for each class, how many news articles contain each feature word
    jieba.enable_parallel(4)
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1  # Laplace smoothing: start every count at 1
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \
                        if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in prob_count.keys():
                    if word in word_list:
                        prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)  # Laplace-smoothed P(word | class)
    jieba.disable_parallel()
    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
Project: Text-Classifier    Author: daniellaah    | Project source | File source
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = set(jieba.lcut(content))
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls])
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
Project: Text-Classifier    Author: daniellaah    | Project source | File source
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = jieba.lcut(content)
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls] * word_list.count(word))
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
Project: Neural-Headline-Generator-CN    Author: QuantumLiu    | Project source | File source
def cut(text,custom_words=['FLOAT','TIME','DATE','EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words=jieba.lcut(text)
    return words
Project: momoCrawler    Author: njames741    | Project source | File source
def if_contains(self, one_page_des):
        kw_dict_high_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
        kw_dict_low_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
        # kw_dict = {u'??'}
        # kw_dict = {u'???'}
        seg_list = jieba.lcut(one_page_des, cut_all=False)
        for item in seg_list:
            if item in kw_dict_high_ratio or item in kw_dict_low_ratio:
                # a keyword was matched
                return 1
        # no keyword matched in the page description
        return 0
Project: text_clustering    Author: WennieZhi    | Project source | File source
def tokenize(sentence):
    cn_sent = get_cnstr(sentence)
    term_list = jieba.lcut(cn_sent, cut_all=False)
    final_term_list = [term for term in term_list if len(term)>1 and is_cn_char(term)]
    return final_term_list
Project: RobotWriter    Author: Moicen    | Project source | File source
def process(file_name):

    content = read(file_name)

    words = jieba.lcut(content, cut_all=False)
    words = words + ['\n']
    vocab = set(words)
    word2int = { w: i for i, w in enumerate(vocab)}
    int2word = dict(enumerate(vocab))

    data = np.array([word2int[c] for c in words], dtype=np.int32)

    return data, word2int, int2word, vocab
Project: deeplearning4chatbot    Author: liangjz92    | Project source | File source
def jieba_tokenizer(sentence):
    sentence = sentence.replace("^", " ")  # replace the '^' separator with spaces before segmentation
    return jieba.lcut(sentence)
Project: deeplearning4chatbot    Author: liangjz92    | Project source | File source
def jieba_tokenizer(self,sentence):
        return jieba.lcut(sentence)
Project: dynamic-seq2seq    Author: yanwii    | Project source | File source
def segement(self, strs):
        return jieba.lcut(strs)
Project: slack_emoji_bot    Author: linnil1    | Project source | File source
def init(self):
        # cut
        self.img = []
        self.imgs = []
        if os.path.exists(self.food_dir):
            self.imgs = json.loads(open(self.food_dir).read())
            for img in self.imgs:
                img['jieba'] = jieba.lcut(img['title'])
        open(self.food_dir, "w").write(json.dumps(self.imgs))

        # build
        self.jieba_dic = {}
        for img in self.imgs:
            for jiba in img['jieba']:
                self.jieba_dic[jiba] = img
Project: slack_emoji_bot    Author: linnil1    | Project source | File source
def wordSearch(self, text):
        textarr = jieba.lcut(text)
        self.colorPrint("Jieba cut", textarr)
        for t in textarr:
            if t in self.jieba_dic:
                return self.jieba_dic[t]
        raise ValueError("not found")
Project: slack_emoji_bot    Author: linnil1    | Project source | File source
def imageAdd(self, img):
        self.colorPrint("Add Foods", img)
        img['jieba'] = (jieba.lcut(img['title']))
        for jiba in img['jieba']:
            self.jieba_dic[jiba] = img
        self.img.append(img)
        open(self.food_dir, "w").write(json.dumps(self.imgs))
Project: baidu-ner-contest    Author: bojone    | Project source | File source
def mycut(s):
    result = []
    j = 0
    s = re_replace.sub(' ', s)
    for i in not_cuts.finditer(s):
        result.extend(jieba.lcut(s[j:i.start()], HMM=False))
        if s[i.start()] in [u'?', u'“']:
            result.extend([s[i.start()], s[i.start()+1:i.end()-1], s[i.end()-1]])
        else:
            result.append(s[i.start():i.end()])
        j = i.end()
    result.extend(jieba.lcut(s[j:], HMM=False))
    return result