The following code examples, extracted from open-source Python projects, illustrate how to use jieba.cut_for_search().
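jieba.cut_for_search() first performs the standard (accurate-mode) segmentation and then re-splits longer words into shorter fragments, which improves recall when the tokens feed a search-engine index or query. It returns a generator and accepts an optional HMM flag, as several of the examples below show. A minimal, self-contained usage sketch (the sample sentence is arbitrary and used only for illustration):

# -*- coding: utf-8 -*-
# Minimal usage sketch for jieba.cut_for_search(); the sample sentence
# is arbitrary and chosen only for illustration.
import jieba

sentence = u"小明硕士毕业于中国科学院计算所"

# cut_for_search returns a generator of tokens.
tokens = list(jieba.cut_for_search(sentence))
print("/ ".join(tokens))

# HMM-based new-word discovery can be disabled explicitly.
tokens_no_hmm = list(jieba.cut_for_search(sentence, HMM=False))
print("/ ".join(tokens_no_hmm))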
def _index_docs(indexFile, writer):
    for line in indexFile:
        ind, ent_name, info, keywords, imgurl, filename, url = line.split('\t')
        print("adding %s" % ind)
        filename = "{:05d}".format(int(ind)) + '.jpg'
        keywords = keywords.replace('%', ' ')
        ent_name = " ".join(x.strip() for x in jieba.cut_for_search(ent_name))
        keywords = " ".join(x.strip() for x in jieba.cut_for_search(keywords))
        try:
            doc = Document()
            doc.add(Field('ind', ind, Field.Store.YES, Field.Index.NO))
            doc.add(Field('ent_name', ent_name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field('keywords', keywords, Field.Store.NO, Field.Index.ANALYZED))
            # doc.add(Field('n_colors', n_colors, Field.Store.NO, Field.Index.ANALYZED))
            writer.addDocument(doc)
        except Exception, e:
            print("Failed in indexDocs: %r" % e)
def GET(self):
    data = web.input()
    if data:
        searchword = data.searchword
    else:
        searchword = ''
    news_list = list()
    topic = list()
    if searchword:
        cut = jieba.cut_for_search(searchword)
        word_list = []
        for word in cut:
            if word not in punct and word not in Letters_and_numbers:
                word_list.append(word.encode("utf-8"))
        topK = query.calculate(word_list, config.query_return_numbers)
        for k in topK:
            data = dict()
            title, content, url = id_index.get_data(k)
            data['id'] = k
            data['content'] = content.decode("utf-8")[:config.query_return_snipper_size]
            data['title'] = title.decode("utf-8")
            data['url'] = url.decode("utf-8")
            news_list.append(data)
        del data, cut, word_list, word, topK, title, content, url
        # word2vec-based related-topic suggestions
        word2vec.cal(searchword.encode('utf-8'))
        print word2vec.result.length
        if word2vec.result.length == 0:  # no related words returned
            pass
        else:
            for i in range(config.recommand_topic_numbers):
                topic.append(word2vec.result.word[i].char)
    return render.index(searchword, news_list, topic)
def GET(self):
    data = web.input()
    if data:
        ID = data.id
        news = dict()
        title, content, url = id_index.get_data(int(ID))
        news['content'] = content.decode("utf-8")
        news['title'] = title.decode("utf-8")
        news['url'] = url.decode("utf-8")
        recomand = []
        # segment the article body in search-engine mode
        cut = jieba.cut_for_search(content)
        word_list = []
        for word in cut:
            if word not in punct and word not in Letters_and_numbers:
                # drop stop words before computing similarity
                if recommand.stopword.has_key(word.encode("utf-8")):
                    pass
                else:
                    word_list.append(word.encode("utf-8"))
        topk = recommand.calculate(word_list, config.recommand_numbers, 10)
        for i in topk:  # related articles
            # for i in recommand.dic[int(ID)]:
            if i != int(ID):
                title, content, url = id_index.get_data(i)
                recomand.append([title.decode('utf-8'), content.decode('utf-8'), url.decode('utf-8')])
        news['recommand'] = recomand
        del title, content, url, recomand
    else:
        ID = ''
        news = dict()
        news['title'] = "No Such News"
        news['content'] = "Oh No!"
        news['url'] = "#"
        news['recommand'] = [['', '', ''] for m in range(config.recommand_numbers)]
    return render.news(news)
def calculate(self, doc_id, Top_numbers=10, multiple=10):
    title, content, url = self.index.get_data(doc_id)
    cut = jieba.cut_for_search(content)
    word_list = []
    for word in cut:
        if word not in self.punct and word not in self.Letters_and_numbers:
            # skip stop words; keep the remaining tokens as UTF-8 terms
            if self.stopword.has_key(word.encode("utf-8")):
                pass
            else:
                word_list.append(word.encode("utf-8"))
    return self.FastCos.calculate(word_list, Top_numbers, multiple)
def cut_search(data):
    '''
    Segment the input text with jieba's search-engine mode and join the
    resulting tokens with '/'.
    '''
    temp_result = jieba.cut_for_search(data)
    temp_result = '/'.join(temp_result)
    return temp_result
def search_func_factory(analyzer, searcher, vm_env):
    """Search function factory"""
    def retrieve(doc):
        return doc.get('ind')

    def search(**kwargs):
        vm_env.attachCurrentThread()
        query = BooleanQuery()
        print("Searched keywords:")
        for field_name, keywords in kwargs.items():
            # assert field_name in SearchConfig.searchable_fields
            # keywords = list(filter(None, jieba.cut(keywords, cut_all=True)))
            keywords = list(filter(None, (k.strip() for k in jieba.cut_for_search(keywords))))
            for kw in keywords:
                print(kw)
            # construct query
            for kw in keywords:
                q = QueryParser(Version.LUCENE_CURRENT, field_name, analyzer).parse(kw)
                query.add(q, BooleanClause.Occur.SHOULD)
            if field_name == 'keywords':
                for kw in keywords:
                    q = QueryParser(Version.LUCENE_CURRENT, 'ent_name', analyzer).parse(kw)
                    query.add(q, BooleanClause.Occur.SHOULD)
        # search
        scoreDocs = searcher.search(query, 50).scoreDocs
        return [retrieve(searcher.doc(scoreDoc.doc)) for scoreDoc in scoreDocs]

    return search
def CUT(f):
    s = jieba.cut_for_search(f)
    return ' '.join(s)
def testCutForSearch(self):
    for content in test_contents:
        result = jieba.cut_for_search(content)
        assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testCutForSearch", file=sys.stderr)
def testCutForSearch_NOHMM(self):
    for content in test_contents:
        result = jieba.cut_for_search(content, HMM=False)
        assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
        print(" , ".join(result), file=sys.stderr)
    print("testCutForSearch_NOHMM", file=sys.stderr)
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print(word, "/", end=' ')
    print("")
def run(self):
    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    print("Full Mode:" + "/ ".join(seg_list))  # full mode
    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    print("Default Mode:" + "/ ".join(seg_list))  # accurate mode
    seg_list = jieba.cut("他来到了网易杭研大厦")
    print(", ".join(seg_list))
    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # search engine mode
    print(", ".join(seg_list))
def cut_for_search(self, text):
    '''
    @summary: segment text in search-engine mode and strip stop words
    ---------
    @param text: the text to segment
    ---------
    @result: list of tokens with stop words removed
    '''
    result = list(jieba.cut_for_search(text))
    result = self.__del_stop_key(result)
    return result
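The __del_stop_key helper used above is not shown in this snippet. As a rough sketch of the same stop-word filtering idea, assuming a hypothetical UTF-8 stop-word file named stopwords.txt (one word per line), not part of the original project:

# -*- coding: utf-8 -*-
# Hypothetical stop-word filter around jieba.cut_for_search();
# 'stopwords.txt' is an assumed input file, one stop word per line.
import io
import jieba

def load_stopwords(path='stopwords.txt'):
    # Read the stop-word list into a set for O(1) membership tests.
    with io.open(path, encoding='utf-8') as f:
        return set(line.strip() for line in f if line.strip())

def cut_for_search_filtered(text, stopwords):
    # Keep only search-mode tokens that are neither whitespace nor stop words.
    return [w for w in jieba.cut_for_search(text) if w.strip() and w not in stopwords]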
def extractSearchWords(self, sentence):
    return list(jieba.cut_for_search(sentence))
def seg_for_search(self, sentence):
    words = list()
    for item in jieba.cut_for_search(sentence):
        words.append(item)
    return words
def testCutForSearch(self):
    for content in test_contents:
        result = jieba.cut_for_search(content)
        assert isinstance(result, types.GeneratorType), "Test CutForSearch Generator error"
        result = list(result)
        assert isinstance(result, list), "Test CutForSearch error on content: %s" % content
        print >> sys.stderr, " , ".join(result)
    print >> sys.stderr, "testCutForSearch"
def cuttest(test_sent):
    result = jieba.cut_for_search(test_sent)
    for word in result:
        print word, "/",
    print ""
def run(self):
    seg_list = jieba.cut("我来到北京清华大学", cut_all=True)
    print "Full Mode:" + "/ ".join(seg_list)  # full mode
    seg_list = jieba.cut("我来到北京清华大学", cut_all=False)
    print "Default Mode:" + "/ ".join(seg_list)  # accurate mode
    seg_list = jieba.cut("他来到了网易杭研大厦")
    print ", ".join(seg_list)
    seg_list = jieba.cut_for_search("小明硕士毕业于中国科学院计算所，后在日本京都大学深造")  # search engine mode
    print ", ".join(seg_list)
def test_demo1():
    text = "我来到北京清华大学"
    seg_list = jieba.cut(text, cut_all=True)
    print u"[Full mode]: ", "/ ".join(seg_list)
    seg_list = jieba.cut(text, cut_all=False)
    print u"[Accurate mode]: ", "/ ".join(seg_list)
    seg_list = jieba.cut(text)
    print u"[Default mode]: ", "/ ".join(seg_list)
    seg_list = jieba.cut("他来到了网易杭研大厦")
    print u"[New word recognition]: ", "/ ".join(seg_list)
    seg_list = jieba.cut_for_search(text)
    print u"[Search engine mode]: ", "/ ".join(seg_list)

# Read file and cut