Python jieba module: cut_for_search() example source code

The following 20 code examples, extracted from open-source Python projects, illustrate how to use jieba.cut_for_search().
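
Before the project snippets, here is a minimal standalone sketch of the call itself (assuming only that the jieba package is installed): cut_for_search() returns a generator of tokens in search-engine mode, which re-segments long words so that their sub-words are emitted as well.

# -*- coding: utf-8 -*-
import jieba

# Search-engine mode: on top of precise mode, long words are split again,
# so both the long word and its sub-words appear in the output.
tokens = jieba.cut_for_search("小明硕士毕业于中国科学院计算所")
print(", ".join(tokens))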

Project: AlphaLogo    Author: gigaflw    | project source | file source
def _index_docs(indexFile, writer):
    for line in indexFile:

        ind, ent_name, info, keywords, imgurl, filename, url = line.split('\t')
        print("adding %s" % ind)

        filename = "{:05d}".format(int(ind)) + '.jpg'
        keywords = keywords.replace('%', ' ')

        ent_name = " ".join(x.strip() for x in jieba.cut_for_search(ent_name))
        keywords = " ".join(x.strip() for x in jieba.cut_for_search(keywords))

        try:
            doc = Document()

            doc.add(Field('ind', ind, Field.Store.YES, Field.Index.NO))
            doc.add(Field('ent_name', ent_name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field('keywords', keywords, Field.Store.NO, Field.Index.ANALYZED))
            # doc.add(Field('n_colors', n_colors, Field.Store.NO, Field.Index.ANALYZED))

            writer.addDocument(doc)

        except Exception as e:
            print("Failed in indexDocs: %r" % e)
Project: Information_retrieva_Projectl-    Author: Google1234    | project source | file source
def GET(self):
        data=web.input()
        if data:
            searchword=data.searchword
        else:
            searchword=''
        news_list=list()
        topic=list()
        if searchword:
            cut = jieba.cut_for_search(searchword)
            word_list = []
            for word in cut:
                if word not in punct and word not in Letters_and_numbers:
                    word_list.append(word.encode("utf-8"))
            topK=query.calculate(word_list,config.query_return_numbers)
            for k in topK:
                data = dict()
                title, content, url= id_index.get_data(k)
                data['id'] = k
                data['content'] = content.decode("utf-8")[:config.query_return_snipper_size]
                data['title']=title.decode("utf-8")
                data['url'] = url.decode("utf-8")
                news_list.append(data)
            del data,cut,word_list,word,topK,title,content,url
            # use word2vec to recommend topics related to the search word
            word2vec.cal(searchword.encode('utf-8'))
            print word2vec.result.length
            if word2vec.result.length == 0:  # no related words found; skip topic recommendation
                pass
            else:
                for i in range(config.recommand_topic_numbers):
                    topic.append(word2vec.result.word[i].char)
        return render.index(searchword,news_list,topic)
Project: Information_retrieva_Projectl-    Author: Google1234    | project source | file source
def GET(self):
        data=web.input()
        if data:
            ID=data.id
            news = dict()
            title, content, url=id_index.get_data(int(ID))
            news['content'] = content.decode("utf-8")
            news['title'] = title.decode("utf-8")
            news['url'] = url.decode("utf-8")
            recomand=[]
            # build recommendations of related news from the article content
            cut = jieba.cut_for_search(content)
            word_list = []
            for word in cut:
                if word not in punct and word not in Letters_and_numbers:
                    # skip the word if it is in the stop-word list, otherwise keep it
                    if recommand.stopword.has_key(word.encode("utf-8")):
                        pass
                    else:
                        word_list.append(word.encode("utf-8"))
            topk= recommand.calculate(word_list, config.recommand_numbers, 10)
            for i in topk:  # recommendations computed online from content similarity
            #for i in recommand.dic[int(ID)]:  # alternative: precomputed (offline) recommendations
                if i !=int(ID):
                    title, content, url=id_index.get_data(i)
                    recomand.append([title.decode('utf-8'),content.decode('utf-8'),url.decode('utf-8')])
            news['recommand']=recomand
            del title,content,url,recomand
        else:
            ID=''
            news = dict()
            news['title'] = "No Such News"
            news['content'] = "Oh No!"
            news['url'] = "#"
            news['recommand']=[['','',''] for m in range(config.recommand_numbers)]
        return render.news(news)
Project: Information_retrieva_Projectl-    Author: Google1234    | project source | file source
def calculate(self,doc_id,Top_numbers=10,multiple=10):
        title,content,url=self.index.get_data(doc_id)
        cut=jieba.cut_for_search(content)
        word_list=[]
        for word in cut:
            if word not in self.punct and word not in self.Letters_and_numbers:
                # skip the word if it is in the stop-word list, otherwise keep it
                if self.stopword.has_key(word.encode("utf-8")):
                    pass
                else:
                    word_list.append(word.encode("utf-8"))
        return self.FastCos.calculate(word_list, Top_numbers, multiple)
Project: HtmlExtract-Python    Author: xinyi-spark    | project source | file source
def cut_search(data):
    '''
    Search-engine-mode segmentation: on top of precise mode, long words are
    split again to improve recall (suitable for search-engine indexing).
    The result is the input text segmented and joined with '/': word/word/.../long word
    '''
    temp_result = jieba.cut_for_search(data)
    temp_result = '/'.join(temp_result)
    return temp_result
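
For comparison, a short sketch (assuming jieba is installed) of the difference between the default precise mode and the search-engine mode that cut_search() uses; the exact tokens depend on jieba's dictionary.

# -*- coding: utf-8 -*-
import jieba

sentence = "中国科学院计算技术研究所"  # example input; any Chinese text works

# Precise mode: each character ends up in exactly one word.
print("/".join(jieba.cut(sentence)))

# Search-engine mode: long words from precise mode are split again,
# so shorter sub-words are emitted as well, improving recall for search.
print("/".join(jieba.cut_for_search(sentence)))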
Project: AlphaLogo    Author: gigaflw    | project source | file source
def search_func_factory(analyzer, searcher, vm_env):
    """Search function factory"""

    def retrieve(doc):
        return doc.get('ind')

    def search(**kwargs):
        vm_env.attachCurrentThread()
        query = BooleanQuery() 

        print("Searched keywords:")
        for field_name, keywords in kwargs.items():
            # assert field_name in SearchConfig.searchable_fields

            # keywords = list(filter(None, jieba.cut(keywords, cut_all=True)))
            keywords = list(filter(None, (k.strip() for k in jieba.cut_for_search(keywords))))

            for kw in keywords:
                print(kw)

            # construct query
            for kw in keywords:
                q = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(kw)
                query.add(q, BooleanClause.Occur.SHOULD)

            if field_name == 'keywords':
                for kw in keywords:
                    q = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(kw)
                    query.add(q, BooleanClause.Occur.SHOULD)

        # search
        scoreDocs = searcher.search(query, 50).scoreDocs

        return [retrieve(searcher.doc(scoreDoc.doc)) for scoreDoc in scoreDocs]

    return search
Project: newsAnalyzer    Author: Rolight    | project source | file source
def CUT(f):
    s = jieba.cut_for_search(f)
    return ' '.join(s)
Project: Malicious_Domain_Whois    Author: h-j-13    | project source | file source
def testCutForSearch(self):
        for content in test_contents:
            result = jieba.cut_for_search(content)
            assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testCutForSearch", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13    | project source | file source
def testCutForSearch_NOHMM(self):
        for content in test_contents:
            result = jieba.cut_for_search(content,HMM=False)
            assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
            print(" , ".join(result), file=sys.stderr)
        print("testCutForSearch_NOHMM", file=sys.stderr)
Project: Malicious_Domain_Whois    Author: h-j-13    | project source | file source
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print(word, "/", end=' ') 
    print("")
Project: Malicious_Domain_Whois    Author: h-j-13    | project source | file source
def run(self):
        seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
        print("Full Mode:" + "/ ".join(seg_list))  # full mode

        seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
        print("Default Mode:" + "/ ".join(seg_list))  # precise mode

        seg_list = jieba.cut("他来到了网易杭研大厦")
        print(", ".join(seg_list))

        seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
        print(", ".join(seg_list))
Project: Malicious_Domain_Whois    Author: h-j-13    | project source | file source
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print(word, "/", end=' ') 
    print("")
Project: internet-content-detection    Author: liubo0621    | project source | file source
def cut_for_search(self, text):
        '''
        @summary: search-engine-mode segmentation; long words are split again on top
                  of precise mode to improve recall, and stop words are removed
        ---------
        @param text: the text to segment
        ---------
        @result:
        '''
        result = list(jieba.cut_for_search(text))
        result = self.__del_stop_key(result)
        return result
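
The __del_stop_key helper is not shown in this snippet; a minimal sketch of such a filter, assuming the stop words live in a plain Python set (the actual project may load them from a file), could look like this:

# -*- coding: utf-8 -*-
# Hypothetical stand-in for the project's __del_stop_key helper:
# drop whitespace-only tokens and tokens found in a stop-word set.
STOP_WORDS = {u"的", u"了", u"是"}  # assumption: normally loaded from a stop-word file

def del_stop_key(words):
    return [w for w in words if w.strip() and w not in STOP_WORDS]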
Project: Rnews    Author: suemi994    | project source | file source
def extractSearchWords(self,sentence):
        return list(jieba.cut_for_search(sentence))
Project: KnowledgeGraph-QA-Service    Author: kangzhun    | project source | file source
def seg_for_search(self, sentence):
        words = list()
        for item in jieba.cut_for_search(sentence):
            words.append(item)
        return words
Project: jieba    Author: isuhao    | project source | file source
def testCutForSearch(self):
        for content in test_contents:
            result = jieba.cut_for_search(content)
            assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
            result = list(result)
            assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
            print >> sys.stderr, " , ".join(result)
        print  >> sys.stderr, "testCutForSearch"
Project: jieba    Author: isuhao    | project source | file source
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print word, "/", 
    print ""
Project: jieba    Author: isuhao    | project source | file source
def run(self):
        seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
        print "Full Mode:" + "/ ".join(seg_list)  # full mode

        seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
        print "Default Mode:" + "/ ".join(seg_list)  # precise mode

        seg_list = jieba.cut("他来到了网易杭研大厦")
        print ", ".join(seg_list)

        seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所,后在日本京都大学深造")  # search-engine mode
        print ", ".join(seg_list)
Project: jieba    Author: isuhao    | project source | file source
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print word, "/", 
    print ""
Project: wiki_w2v_demo    Author: Trangle    | project source | file source
def test_demo1():
    text = "我来到北京清华大学"
    seg_list = jieba.cut(text, cut_all=True)
    print u"[full mode]: ", "/ ".join(seg_list)
    seg_list = jieba.cut(text, cut_all=False)
    print u"[precise mode]: ", "/ ".join(seg_list)
    seg_list = jieba.cut(text)
    print u"[default mode]: ", "/ ".join(seg_list)
    seg_list = jieba.cut("他来到了网易杭研大厦")
    print u"[new word recognition]: ", "/ ".join(seg_list)
    seg_list = jieba.cut_for_search(text)
    print u"[search-engine mode]: ", "/ ".join(seg_list)


# Read file and cut