The following 48 code examples, extracted from open-source Python projects, illustrate how to use jieba.lcut().
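Before the project examples, here is a minimal sketch of the call itself (the sample sentence and variable names are illustrative): jieba.lcut() returns a plain Python list of tokens, while jieba.cut() returns a generator; cut_all=True switches to full mode, and jieba.lcut_for_search() produces the finer-grained tokens used for indexing.

import jieba

text = "结巴分词是一个常用的中文分词工具"  # illustrative sample sentence

print(jieba.lcut(text))                 # accurate mode (default): returns a list of tokens
print(jieba.lcut(text, cut_all=True))   # full mode: emit every word the dictionary can find
print(jieba.lcut_for_search(text))      # search-engine mode: extra fine-grained tokens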
def process_sent(sent, vocab_int, steps):
    """
    Tokenize a sentence and turn it into a fixed-length 2-D numpy array.
    :param sent:
    :param vocab_int:
    :param steps:
    :return:
    """
    sent_list = jieba.lcut(sent)
    # if a word is not in the vocab dict, fall back to a random index,
    # which may collide with another word's index
    index_list = [vocab_int[i] if i in vocab_int.keys() else np.random.randint(0, 90) for i in sent_list]
    if len(index_list) < steps:
        index_list = np.hstack((index_list, np.random.randint(0, 90, steps - len(index_list))))
    else:
        index_list = index_list[0: steps]
    return np.array([index_list])
def prepare_data(self):
    corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
    vocabs = []
    for l in corpus_cut:
        for i in l:
            vocabs.append(i)
    # vocabs = reduce(lambda x, y: x+y, corpus_cut)
    # count every vocab's frequency,
    # but for now ignore which one is the 'most' frequent
    counter = collections.Counter(vocabs)
    counter = counter.most_common()
    vocabs_set, _ = zip(*counter)
    vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
    data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    # step = 3
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(0, data_flatten.shape[0] - self.n_steps - 1, 3)])
    # shuffle the data
    np.random.shuffle(data)
    return len(vocabs_set), vocab_int_map, data
def prepare_data(self):
    corpus_cut = np.array([jieba.lcut(s) for s in self.raw_corpus])
    vocabs = []
    for l in corpus_cut:
        for i in l:
            vocabs.append(i)
    # vocabs = reduce(lambda x, y: x+y, corpus_cut)
    # count every vocab's frequency,
    # but for now ignore which one is the 'most' frequent
    counter = collections.Counter(vocabs)
    counter = counter.most_common()
    vocabs_set, _ = zip(*counter)
    vocab_int_map = {vocab: index for index, vocab in enumerate(vocabs_set)}
    data_flatten = np.array([vocab_int_map[v] for v in vocabs])
    data = np.array([data_flatten[i: i + self.n_steps + 1]
                     for i in range(data_flatten.shape[0] // (self.n_steps + 1))])
    # shuffle the data
    np.random.shuffle(data)
    return len(vocabs_set), vocab_int_map, data
def delNOTNeedWords(content, customstopwords=None):
    # words = jieba.lcut(content)
    if customstopwords is None:
        customstopwords = "stopwords.txt"
        import os
        if os.path.exists(customstopwords):
            stop_words = codecs.open(customstopwords, encoding='UTF-8').read().split(u'\n')
            customstopwords = stop_words

    result = ''
    return_words = []
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "
    words = pseg.lcut(content)

    for word, flag in words:
        # print word.encode('utf-8')
        tempword = word.encode('utf-8').strip(' ')
        if (word not in customstopwords and len(tempword) > 0 and flag in
                [u'n', u'nr', u'ns', u'nt', u'nz', u'ng', u't', u'tg', u'f',
                 u'v', u'vd', u'vn', u'vf', u'vx', u'vi', u'vl', u'vg',
                 u'a', u'an', u'ag', u'al', u'm', u'mq', u'o', u'x']):
            # alternative filters tried previously:
            # and flag[0] in [u'n', u'f', u'a', u'z']
            # flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]
            result += tempword  # +"/"+str(w.flag)+" "
            return_words.append(tempword)
    return result, return_words
def result_by_time(self, sentence):
    seg_list = jieba.lcut(sentence, cut_all=False)
    n, cleaned_dict = self.clean_list(seg_list)
    time_scores = {}
    for term in cleaned_dict.keys():
        r = self.fetch_from_db(term)
        if r is None:
            continue
        docs = r[2].split('\n')
        for doc in docs:
            docid, date_time, tf, ld = doc.split('\t')
            if docid in time_scores:
                continue
            news_datetime = datetime.strptime(date_time, "%Y-%m-%d %H:%M:%S")
            now_datetime = datetime.now()
            td = now_datetime - news_datetime
            docid = int(docid)
            td = (timedelta.total_seconds(td) / 3600)  # hours
            time_scores[docid] = td
    time_scores = sorted(time_scores.items(), key=operator.itemgetter(1))
    if len(time_scores) == 0:
        return 0, []
    else:
        return 1, time_scores
def gen_idf_file(self):
    files = listdir(self.doc_dir_path)
    n = float(len(files))
    idf = {}
    for i in files:
        root = ET.parse(self.doc_dir_path + i).getroot()
        title = root.find('title').text
        body = root.find('body').text
        seg_list = jieba.lcut(title + '?' + body, cut_all=False)
        seg_list = set(seg_list) - self.stop_words
        for word in seg_list:
            word = word.strip().lower()
            if word == '' or self.is_number(word):
                continue
            if word not in idf:
                idf[word] = 1
            else:
                idf[word] = idf[word] + 1
    idf_file = open(self.idf_path, 'w', encoding='utf-8')
    for word, df in idf.items():
        idf_file.write('%s %.9f\n' % (word, math.log(n / df)))
    idf_file.close()
def delstopwords(content):
    result = ''
    words = jieba.lcut(content)
    return_words = []
    for w in words:
        if w not in app.config['stopwords']:
            result += w.encode('utf-8')  # +"/"+str(w.flag)+" "
            return_words.append(w.encode('utf-8'))
    # words = pseg.lcut(content)
    # with app.test_request_context():
    #     for word, flag in words:
    #         if (word not in app.config['stopwords'] and
    #                 flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]):
    #             result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # print result
    return result, return_words
def delNOTNeedWords(content, stopwords):
    # words = jieba.lcut(content)
    result = ''
    # for w in words:
    #     if w not in stopwords:
    #         result += w.encode('utf-8')  # +"/"+str(w.flag)+" "
    words = pseg.lcut(content)  # jieba.cut()
    text_list = []
    for word, flag in words:
        # print word.encode('utf-8')
        if (word not in stopwords and
                flag not in ["/x", "/zg", "/uj", "/ul", "/e", "/d", "/uz", "/y"]):
            # text_list.append(word.encode('utf-8'))
            result += word.encode('utf-8')  # +"/"+str(w.flag)+" "
    # ''.join(text_list)
    return result
    # return ''.join(text_list)
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = []
    if True:
        for seg in segs:
            if seg not in stopwords:
                final.append(seg)
        return final
    else:
        return segs
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = []
    if False:
        for seg in segs:
            if seg not in stopwords:
                final.append(seg)
        return final
    else:
        return segs
def cut_with_stop_words(string):
    segs = jieba.lcut(string)
    final = ''
    for seg in segs:
        if seg not in stopwords:
            final = final + seg
    return final
def main(_): print("Loading vocabulary") cn_vocab_path = os.path.join(FLAGS.data_dir, "source_vocab.txt") en_vocab_path = os.path.join(FLAGS.data_dir, "target_vocab.txt") cn_vocab, _ = data_utils.initialize_vocabulary(cn_vocab_path) _, rev_en_vocab = data_utils.initialize_vocabulary(en_vocab_path) print("Building model...") config = tf.ConfigProto(allow_soft_placement=True) with tf.Session(config=config) as sess: model = create_model(sess, False) # Decode from standard input. sys.stdout.write("> ") sys.stdout.flush() sentence = sys.stdin.readline() while sentence: seg_list = jieba.lcut(sentence.strip()) #print(" ".join(seg_list)) token_ids = [cn_vocab.get(w.encode(encoding="utf-8"), data_utils.UNK_ID) for w in seg_list] #print(token_ids) outputs = model.test(sess, token_ids) outputs = outputs.tolist() if data_utils.EOS_ID in outputs: outputs = outputs[:outputs.index(data_utils.EOS_ID)] output = " ".join([tf.compat.as_str(rev_en_vocab[output]) for output in outputs]) print(output.capitalize()) print("> ") sys.stdout.flush() sentence = sys.stdin.readline()
def jieba_example():
    raw = "????S5????,123,?,?"
    raw_seq = jieba.cut(raw)
    raw_seq_list = jieba.lcut(raw)
    raw_keyword = jieba.analyse.extract_tags(raw, topK=3, withWeight=False, allowPOS=())
    raw_with_ictclas = pseg.cut(raw)
    for word, flag in raw_with_ictclas:
        print word, flag
def cut_with_flag(raw_str, filter_invalid_word_flag=True):
    """
    :param raw_str: str
    :return: list[(str, str)]
    """
    res = [(a, b) for a, b in pseg.lcut(raw_str)]
    if filter_invalid_word_flag:
        return filter_invalid_word(res)
    else:
        return res
def process_lyrics(file_name):
    lyrics = []
    content = clean_cn_corpus(file_name, clean_level='all', is_save=False)
    for l in content:
        if len(l) < 40:
            continue
        l = start_token + l + end_token
        lyrics.append(l)
    lyrics = sorted(lyrics, key=lambda line: len(line))
    print('all %d songs...' % len(lyrics))

    # if not os.path.exists(os.path.dirname(segment_list_file)):
    #     os.mkdir(os.path.dirname(segment_list_file))
    # if os.path.exists(segment_list_file):
    #     print('load segment file from %s' % segment_list_file)
    #     with open(segment_list_file, 'rb') as p:
    #         all_words = pickle.load(p)
    # else:
    all_words = []
    for lyric in lyrics:
        all_words += jieba.lcut(lyric, cut_all=False)
    # with open(segment_list_file, 'wb') as p:
    #     pickle.dump(all_words, p)
    # print('segment result has been saved into %s' % segment_list_file)

    # count how many times each word appears
    counter = collections.Counter(all_words)
    print(counter['E'])
    # sort by frequency
    counter_pairs = sorted(counter.items(), key=lambda x: -x[1])
    words, _ = zip(*counter_pairs)
    print('E' in words)
    words = words[:len(words)] + (' ',)
    word_int_map = dict(zip(words, range(len(words))))
    # translate all lyrics into int vectors
    lyrics_vector = [list(map(lambda word: word_int_map.get(word, len(words)), lyric)) for lyric in lyrics]
    return lyrics_vector, word_int_map, words
def segement(self, strs):
    return jieba.lcut(strs)
def word_tokenize(tokens):
    # return [token.replace("''", '"').replace("``", '"') for token in jieba.lcut(tokens, cut_all=False)]
    return [token.replace("''", '"').replace("``", '"') for token in nltk.word_tokenize(tokens)]

# from my.corenlp_interface import CoreNLPInterface
# url = 'vision-server2.corp.ai2'
# port = 8000
# interface = CoreNLPInterface(url, port)
# sent_tokenize = interface.split_doc
# word_tokenize = interface.split_sent
def get_train_data(language):
    # Load data from files
    path = "./data/" + language + "/"
    positive_examples = list(open(path + "rt-polarity.pos", "r").readlines())
    positive_examples = [s.strip() for s in positive_examples[:100]]  # -1000
    negative_examples = list(open(path + "rt-polarity.neg", "r").readlines())
    negative_examples = [s.strip() for s in negative_examples[:100]]
    x_text = positive_examples + negative_examples
    x_text = [sent for sent in x_text]
    # Generate labels
    positive_labels = [[0, 1] for _ in positive_examples]
    negative_labels = [[1, 0] for _ in negative_examples]
    y = np.concatenate([positive_labels, negative_labels], 0)

    # Build vocabulary
    max_length_of_sentence = max([len(jieba.lcut(x)) for x in x_text])
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_length_of_sentence)
    x = np.array(list(vocab_processor.fit_transform(x_text)))

    # Randomly shuffle data
    np.random.seed(1234)
    shuffle_indices = np.random.permutation(np.arange(len(y)))
    x_shuffled = x[shuffle_indices]
    y_shuffled = y[shuffle_indices]

    # Split train/cross-validation set
    cross_validation_indices = np.array(random.sample(np.arange(len(y)), int(len(y) * 0.1)))
    train_indices = np.array(list(set(np.arange(len(y))) - set(cross_validation_indices)))
    x_train, x_dev = x_shuffled[train_indices], x_shuffled[cross_validation_indices]
    y_train, y_dev = y_shuffled[train_indices], y_shuffled[cross_validation_indices]
    return [x_train, x_dev, y_train, y_dev, vocab_processor]
def cut(sentence):
    if not __init_seg:
        __init()
    return jieba.lcut(sentence)
def maxSimTxt(self, intxt, simCondision=0.1, simType='simple'):
    """
    Find the most similar question in the knowledge base for the input text.
    simType: simple, simple_pos, vec
    """
    self.lastTxt.append(intxt)
    if simType not in ('simple', 'simple_pos', 'vec'):
        return 'error: maxSimTxt got an unsupported simType: {}'.format(simType)

    # fall back to simple_pos when no word-vector model is loaded
    embedding = self.vecModel
    if simType == 'vec' and not embedding:
        simType = 'simple_pos'

    for t in self.zhishiku:
        questions = t.q_vec if simType == 'vec' else t.q_word
        in_vec = jieba.lcut(intxt) if simType == 'simple' else pseg.lcut(intxt)

        t.sim = max(
            similarity(in_vec, question, method=simType, embedding=embedding)
            for question in questions
        )
    maxSim = max(self.zhishiku, key=lambda x: x.sim)
    logger.info('maxSim=' + format(maxSim.sim, '.0%'))
    if maxSim.sim < simCondision:
        return 'Sorry, no sufficiently similar question was found in the knowledge base.'
    return maxSim.a
def tokenize(question, on='jieba'):
    """
    Tokenize a question and remove stopwords.
    :param question: the question text
    :return: list of tokens
    """
    if on == 'ltp':  # LTP segmentation
        words = segmentor.segment(question.encode('utf-8'))
        rv = _remove_stopwords([i.decode('utf-8') for i in words])
    else:  # jieba segmentation
        rv = _remove_stopwords(jieba.lcut(question))
    logging.debug("NLP:tokenize: {}".format(" ".join(rv)))
    return rv
def words_extract(news_folder):
    """Segment every news article under news_folder.

    Args:
        news_folder/
            <category>/
            <category>/
            ...
    """
    subfolder_list = [subfolder for subfolder in os.listdir(news_folder)
                      if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "class")

    jieba.enable_parallel(4)  # parallel segmentation
    for subfolder in subfolder_list:
        news_class = subfolder
        subfolder = os.path.join(news_folder, subfolder)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                data_list.append((word_list, news_class))  # element: ([word1, word2, ...], "class")
    jieba.disable_parallel()

    return data_list
def predict_with_content(classifier, news_content, feature_words):
    word_list = jieba.lcut(news_content)
    x = np.array([1 if word in word_list else 0 for word in feature_words]).reshape(1, -1)
    return classifier.predict(x)[0]
def get_feature_words(news_folder, size=1000, stopwords_file="stopwords.txt"): """???????????? Args: news_folder/ ??/ ??/ ??/ """ news_classes = [subfolder for subfolder in os.listdir(news_folder) \ if os.path.isdir(os.path.join(news_folder, subfolder))] stopwords = get_stopwords(stopwords_file) feature_words_dict = {} # ?????????? jieba.enable_parallel(4) for news_class in news_classes: subfolder = os.path.join(news_folder, news_class) news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder) \ if os.path.isfile(os.path.join(subfolder, news))] for news in news_list: with open(news, 'r') as f: content = f.read() word_list = jieba.lcut(content) for word in word_list: if not re.match("[a-z0-9A-Z]", word) and len(word) > 1 and word not in stopwords: if word in feature_words_dict: feature_words_dict[word] += 1 else: feature_words_dict[word] = 1 jieba.disable_parallel() feature_words_tuple = sorted(feature_words_dict.items(), key=lambda x:x[1], reverse=True) feature_words = list(list(zip(*feature_words_tuple))[0]) return set(feature_words[:size]) if len(feature_words) > size else set(feature_words)
def get_probability(news_folder, feature_words):
    """Compute the word probability matrix and class priors (prob_matrix, prob_classes).

    Args:
        news_folder/
            <category>/
            <category>/
            ...
    """
    news_classes = [subfolder for subfolder in os.listdir(news_folder)
                    if os.path.isdir(os.path.join(news_folder, subfolder))]
    data_list = []  # element: ([word1, word2, ...], "class")
    prob_matrix = pd.DataFrame(index=feature_words, columns=news_classes)
    num_of_all_news = 0
    prob_classes = {}
    for cls in news_classes:
        prob_classes[cls] = 0

    jieba.enable_parallel(4)  # parallel segmentation
    for news_class in news_classes:
        prob_count = {}
        for word in feature_words:
            prob_count[word] = 1  # additive smoothing
        subfolder = os.path.join(news_folder, news_class)
        news_list = [os.path.join(subfolder, news) for news in os.listdir(subfolder)
                     if os.path.isfile(os.path.join(subfolder, news))]
        for news in news_list:
            with open(news, 'r') as f:
                content = f.read()
                word_list = jieba.lcut(content)
                for word in prob_count.keys():
                    if word in word_list:
                        prob_count[word] += 1
        news_nums = len(news_list)
        num_of_all_news += news_nums
        prob_classes[news_class] = news_nums
        for word in prob_count.keys():
            prob_matrix.loc[word, news_class] = prob_count[word] / (news_nums + 2)  # additive smoothing
    jieba.disable_parallel()

    for cls in prob_classes.keys():
        prob_classes[cls] = prob_classes[cls] / num_of_all_news
    return prob_matrix, prob_classes
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = set(jieba.lcut(content))
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls])
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
def predict_with_content(prob_matrix, prob_classes, feature_words, content):
    word_list = jieba.lcut(content)
    result = {}
    for cls in prob_classes.keys():
        result[cls] = np.log(prob_classes[cls])
    for cls in result.keys():
        for word in feature_words:
            if word in word_list:
                result[cls] += np.log(prob_matrix.loc[word, cls] * word_list.count(word))
            else:
                result[cls] += np.log(1 - prob_matrix.loc[word, cls])
    return max(result, key=result.get)
def cut(text, custom_words=['FLOAT', 'TIME', 'DATE', 'EOS']):
    jieba.enable_parallel(32)
    for word in custom_words:
        jieba.add_word(word)
    words = jieba.lcut(text)
    return words
def if_contains(self, one_page_des):
    kw_dict_high_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0,
                          u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
    kw_dict_low_ratio = {u'??': 0, u'??': 0, u'??': 0, u'??': 0, u'??': 0,
                         u'??': 0, u'??': 0, u'???': 0, u'??': 0, u'??': 0}
    # kw_dict = {u'??'}
    # kw_dict = {u'???'}
    # NOTE: kw_dict is expected to be defined elsewhere in the original project
    # (e.g. one of the keyword dicts above).
    seg_list = jieba.lcut(one_page_des, cut_all=False)
    for item in seg_list:
        if item in kw_dict:
            return 1
    return 0
def tokenize(sentence):
    cn_sent = get_cnstr(sentence)
    term_list = jieba.lcut(cn_sent, cut_all=False)
    final_term_list = [term for term in term_list if len(term) > 1 and is_cn_char(term)]
    return final_term_list
def process(file_name):
    content = read(file_name)
    words = jieba.lcut(content, cut_all=False)
    words = words + ['\n']
    vocab = set(words)
    word2int = {w: i for i, w in enumerate(vocab)}
    int2word = dict(enumerate(vocab))
    data = np.array([word2int[c] for c in words], dtype=np.int32)
    return data, word2int, int2word, vocab
def jieba_tokenizer(sentence):
    sentence = sentence.replace("^", " ")  # replace the "^" separator with a space before segmenting
    return jieba.lcut(sentence)
def jieba_tokenizer(self, sentence):
    return jieba.lcut(sentence)
def init(self):
    # cut
    self.img = []
    if os.path.exists(self.food_dir):
        self.imgs = json.loads(open(self.food_dir).read())
        for img in self.imgs:
            img['jieba'] = (jieba.lcut(img['title']))
        open(self.food_dir, "w").write(json.dumps(self.imgs))
    # build
    self.jieba_dic = {}
    for img in self.imgs:
        for jiba in img['jieba']:
            self.jieba_dic[jiba] = img
def wordSearch(self, text):
    textarr = jieba.lcut(text)
    self.colorPrint("Jieba cut", textarr)
    for t in textarr:
        if t in self.jieba_dic:
            return self.jieba_dic[t]
    raise ValueError("not found")
def imageAdd(self, img):
    self.colorPrint("Add Foods", img)
    img['jieba'] = (jieba.lcut(img['title']))
    for jiba in img['jieba']:
        self.jieba_dic[jiba] = img
    self.img.append(img)
    open(self.food_dir, "w").write(json.dumps(self.imgs))
def mycut(s):
    result = []
    j = 0
    s = re_replace.sub(' ', s)
    for i in not_cuts.finditer(s):
        result.extend(jieba.lcut(s[j:i.start()], HMM=False))
        if s[i.start()] in [u'?', u'“']:
            result.extend([s[i.start()], s[i.start()+1:i.end()-1], s[i.end()-1]])
        else:
            result.append(s[i.start():i.end()])
        j = i.end()
    result.extend(jieba.lcut(s[j:], HMM=False))
    return result