/**
 * Builds one highlighted fragment string: plain text is passed through the
 * {@link Encoder} while each matched term span is wrapped in its pre/post tag.
 *
 * @param buffer   scratch buffer shared across fragments (passed through to
 *                 {@code getFragmentSourceMSO})
 * @param index    single-element cursor tracking the next stored-field value
 * @param values   stored field values the fragment text is drawn from
 * @param fragInfo the fragment to render, with its term offsets
 * @param preTags  opening tags, selected per sub-info sequence number
 * @param postTags closing tags, selected per sub-info sequence number
 * @param encoder  encodes (e.g. HTML-escapes) the non-tag text
 * @return the rendered fragment
 */
protected String makeFragment(StringBuilder buffer, int[] index, Field[] values,
    WeightedFragInfo fragInfo, String[] preTags, String[] postTags, Encoder encoder) {
  final StringBuilder out = new StringBuilder();
  final int fragStart = fragInfo.getStartOffset();
  // getFragmentSourceMSO may shift the effective start; it reports the shift here.
  final int[] adjustedStart = { fragStart };
  final String source =
      getFragmentSourceMSO(buffer, index, values, fragStart, fragInfo.getEndOffset(), adjustedStart);
  int copied = 0; // index into source up to which text has already been emitted
  for (SubInfo info : fragInfo.getSubInfos()) {
    final int seq = info.getSeqnum();
    for (Toffs offsets : info.getTermsOffsets()) {
      // Term offsets are absolute; rebase them onto the extracted source string.
      final int termStart = offsets.getStartOffset() - adjustedStart[0];
      final int termEnd = offsets.getEndOffset() - adjustedStart[0];
      out.append(encoder.encodeText(source.substring(copied, termStart)));
      out.append(getPreTag(preTags, seq));
      out.append(encoder.encodeText(source.substring(termStart, termEnd)));
      out.append(getPostTag(postTags, seq));
      copied = termEnd;
    }
  }
  // Trailing text after the last highlighted term.
  out.append(encoder.encodeText(source.substring(copied)));
  return out.toString();
}
/**
 * Wires up a Lucene {@link Highlighter} for the given query: HTML formatting,
 * minimal HTML encoding, query-based scoring, and span fragmenting at the
 * default fragment length.
 */
HighlightingHelper(Query query, Analyzer analyzer) {
  this.analyzer = analyzer;
  scorer = new QueryScorer(query);
  // Formatter and encoder are only needed to construct the highlighter.
  highlighter = new Highlighter(new SimpleHTMLFormatter(), new MinimalHTMLEncoder(), scorer);
  fragmentLength = DEFAULT_FRAGMENT_LENGTH;
  highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, fragmentLength));
}
/**
 * Convenience variant of {@link #createFragments}: returns just the single
 * best fragment, or {@code null} when none could be produced.
 */
@Override
public String createFragment(IndexReader reader, int docId, String fieldName,
    FieldFragList fieldFragList, String[] preTags, String[] postTags, Encoder encoder)
    throws IOException {
  final String[] result =
      createFragments(reader, docId, fieldName, fieldFragList, 1, preTags, postTags, encoder);
  return (result == null || result.length == 0) ? null : result[0];
}
/**
 * Creates up to {@code maxNumFragments} highlighted fragments for the given
 * document field, weighted/ordered by {@code getWeightedFragInfoList}.
 *
 * @return the fragments, or {@code null} when the field has no stored values
 * @throws IllegalArgumentException if {@code maxNumFragments} is negative
 */
@Override
public String[] createFragments(IndexReader reader, int docId, String fieldName,
    FieldFragList fieldFragList, int maxNumFragments,
    String[] preTags, String[] postTags, Encoder encoder) throws IOException {
  // FIX: the guard accepts zero, so the message must not claim "positive".
  if (maxNumFragments < 0) {
    throw new IllegalArgumentException(
        "maxNumFragments(" + maxNumFragments + ") must not be negative.");
  }
  List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
  Field[] values = getFields(reader, docId, fieldName);
  if (values.length == 0) {
    return null;
  }
  // Multi-valued fields can be highlighted per value rather than as one blob.
  if (discreteMultiValueHighlighting && values.length > 1) {
    fragInfos = discreteMultiValueHighlighting(fragInfos, values);
  }
  fragInfos = getWeightedFragInfoList(fragInfos);
  // FIX: Math.min replaces the hand-rolled ternary.
  final int limitFragments = Math.min(maxNumFragments, fragInfos.size());
  List<String> fragments = new ArrayList<>(limitFragments);
  StringBuilder buffer = new StringBuilder();
  // Single-element cursor shared across makeFragment calls.
  int[] nextValueIndex = { 0 };
  for (int n = 0; n < limitFragments; n++) {
    WeightedFragInfo fragInfo = fragInfos.get(n);
    fragments.add(makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder));
  }
  return fragments.toArray(new String[fragments.size()]);
}
/**
 * Creates up to {@code maxNumFragments} highlighted fragments for the given
 * document field, weighted/ordered by {@code getWeightedFragInfoList}.
 *
 * @return the fragments, or {@code null} when the field has no stored values
 * @throws IllegalArgumentException if {@code maxNumFragments} is negative
 */
@Override
public String[] createFragments(IndexReader reader, int docId, String fieldName,
    FieldFragList fieldFragList, int maxNumFragments,
    String[] preTags, String[] postTags, Encoder encoder) throws IOException {
  // FIX: the guard accepts zero, so the message must not claim "positive".
  if (maxNumFragments < 0) {
    throw new IllegalArgumentException(
        "maxNumFragments(" + maxNumFragments + ") must not be negative.");
  }
  List<WeightedFragInfo> fragInfos = fieldFragList.getFragInfos();
  Field[] values = getFields(reader, docId, fieldName);
  if (values.length == 0) {
    return null;
  }
  // Multi-valued fields can be highlighted per value rather than as one blob.
  if (discreteMultiValueHighlighting && values.length > 1) {
    fragInfos = discreteMultiValueHighlighting(fragInfos, values);
  }
  fragInfos = getWeightedFragInfoList(fragInfos);
  // FIX: Math.min replaces the hand-rolled ternary; diamond operator for
  // consistency with the sibling createFragments implementation.
  final int limitFragments = Math.min(maxNumFragments, fragInfos.size());
  List<String> fragments = new ArrayList<>(limitFragments);
  StringBuilder buffer = new StringBuilder();
  // Single-element cursor shared across makeFragment calls.
  int[] nextValueIndex = { 0 };
  for (int n = 0; n < limitFragments; n++) {
    WeightedFragInfo fragInfo = fragInfos.get(n);
    fragments.add(makeFragment(buffer, nextValueIndex, values, fragInfo, preTags, postTags, encoder));
  }
  return fragments.toArray(new String[fragments.size()]);
}
/**
 * Creates a passage formatter that surrounds each highlighted term with
 * {@code preTag}/{@code postTag} and encodes the remaining text with
 * {@code encoder}.
 */
public CustomPassageFormatter(String preTag, String postTag, Encoder encoder) {
  this.encoder = encoder;
  this.postTag = postTag;
  this.preTag = preTag;
}
/**
 * Repairs the fragment's offsets via {@link FragmentBuilderHelper} before
 * delegating fragment rendering to the superclass.
 */
@Override
protected String makeFragment(StringBuilder buffer, int[] index, Field[] values,
    WeightedFragInfo fragInfo, String[] preTags, String[] postTags, Encoder encoder) {
  final WeightedFragInfo fixed =
      FragmentBuilderHelper.fixWeightedFragInfo(mapper, values, fragInfo);
  return super.makeFragment(buffer, index, values, fixed, preTags, postTags, encoder);
}
public void testMultiValuedSortByScore() throws IOException { Directory dir = newDirectory(); IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig(new MockAnalyzer( random() ) ) ); Document doc = new Document(); FieldType type = new FieldType( TextField.TYPE_STORED ); type.setStoreTermVectorOffsets( true ); type.setStoreTermVectorPositions( true ); type.setStoreTermVectors( true ); type.freeze(); doc.add( new Field( "field", "zero if naught", type ) ); // The first two fields contain the best match doc.add( new Field( "field", "hero of legend", type ) ); // but total a lower score (3) than the bottom doc.add( new Field( "field", "naught of hero", type ) ); // two fields (4) doc.add( new Field( "field", "naught of hero", type ) ); writer.addDocument(doc); FastVectorHighlighter highlighter = new FastVectorHighlighter(); ScoreOrderFragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(); fragmentsBuilder.setDiscreteMultiValueHighlighting( true ); IndexReader reader = DirectoryReader.open(writer, true ); String[] preTags = new String[] { "<b>" }; String[] postTags = new String[] { "</b>" }; Encoder encoder = new DefaultEncoder(); int docId = 0; BooleanQuery query = new BooleanQuery(); query.add( clause( "field", "hero" ), Occur.SHOULD); query.add( clause( "field", "of" ), Occur.SHOULD); query.add( clause( "field", "legend" ), Occur.SHOULD); FieldQuery fieldQuery = highlighter.getFieldQuery( query, reader ); for ( FragListBuilder fragListBuilder : new FragListBuilder[] { new SimpleFragListBuilder(), new WeightedFragListBuilder() } ) { String[] bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 20, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); assertEquals("<b>hero</b> <b>of</b> <b>legend</b>", bestFragments[0]); bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 28, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); assertEquals("<b>hero</b> 
<b>of</b> <b>legend</b>", bestFragments[0]); bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 30000, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); assertEquals("<b>hero</b> <b>of</b> <b>legend</b>", bestFragments[0]); } reader.close(); writer.close(); dir.close(); }
private void matchedFieldsTestCase( boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses ) throws IOException { Document doc = new Document(); FieldType stored = new FieldType( TextField.TYPE_STORED ); stored.setStoreTermVectorOffsets( true ); stored.setStoreTermVectorPositions( true ); stored.setStoreTermVectors( true ); stored.freeze(); FieldType matched = new FieldType( TextField.TYPE_NOT_STORED ); matched.setStoreTermVectorOffsets( true ); matched.setStoreTermVectorPositions( true ); matched.setStoreTermVectors( true ); matched.freeze(); doc.add( new Field( "field", fieldValue, stored ) ); // Whitespace tokenized with English stop words doc.add( new Field( "field_exact", fieldValue, matched ) ); // Whitespace tokenized without stop words doc.add( new Field( "field_super_exact", fieldValue, matched ) ); // Whitespace tokenized without toLower doc.add( new Field( "field_characters", fieldValue, matched ) ); // Each letter is a token doc.add( new Field( "field_tripples", fieldValue, matched ) ); // Every three letters is a token doc.add( new Field( "field_sliced", fieldValue.substring( 0, // Sliced at 10 chars then analyzed just like field Math.min( fieldValue.length() - 1 , 10 ) ), matched ) ); doc.add( new Field( "field_der_red", new CannedTokenStream( // Hacky field containing "der" and "red" at pos = 0 token( "der", 1, 0, 3 ), token( "red", 0, 0, 3 ) ), matched ) ); final Map<String, Analyzer> fieldAnalyzers = new TreeMap<>(); fieldAnalyzers.put( "field", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET ) ); fieldAnalyzers.put( "field_exact", new MockAnalyzer( random() ) ); fieldAnalyzers.put( "field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) ); fieldAnalyzers.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) ); fieldAnalyzers.put( "field_tripples", new MockAnalyzer( 
random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) ); fieldAnalyzers.put( "field_sliced", fieldAnalyzers.get( "field" ) ); fieldAnalyzers.put( "field_der_red", fieldAnalyzers.get( "field" ) ); // This is required even though we provide a token stream Analyzer analyzer = new DelegatingAnalyzerWrapper(Analyzer.PER_FIELD_REUSE_STRATEGY) { public Analyzer getWrappedAnalyzer(String fieldName) { return fieldAnalyzers.get( fieldName ); } }; Directory dir = newDirectory(); IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig(analyzer)); writer.addDocument( doc ); FastVectorHighlighter highlighter = new FastVectorHighlighter(); FragListBuilder fragListBuilder = new SimpleFragListBuilder(); FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(); IndexReader reader = DirectoryReader.open( writer, true ); String[] preTags = new String[] { "<b>" }; String[] postTags = new String[] { "</b>" }; Encoder encoder = new DefaultEncoder(); int docId = 0; BooleanQuery query = new BooleanQuery(); for ( Query clause : queryClauses ) { query.add( clause, Occur.MUST ); } FieldQuery fieldQuery = new FieldQuery( query, reader, true, fieldMatch ); String[] bestFragments; if ( useMatchedFields ) { Set< String > matchedFields = new HashSet<>(); matchedFields.add( "field" ); matchedFields.add( "field_exact" ); matchedFields.add( "field_super_exact" ); matchedFields.add( "field_characters" ); matchedFields.add( "field_tripples" ); matchedFields.add( "field_sliced" ); matchedFields.add( "field_der_red" ); bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); } else { bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); } assertEquals( expected, bestFragments[ 0 ] ); reader.close(); writer.close(); dir.close(); }
/**
 * Always returns a {@link SimpleHTMLEncoder}, ignoring the field name and
 * request params, so highlighted output is HTML-escaped unconditionally.
 */
@Override
public Encoder getEncoder(String fieldName, SolrParams params) {
  return new SimpleHTMLEncoder();
}
/**
 * Always returns Lucene's {@code DefaultEncoder}, ignoring the field name and
 * request params.
 */
@Override
public Encoder getEncoder(String fieldName, SolrParams params) {
  // NOTE(review): fully qualified — presumably to avoid a clash with another
  // DefaultEncoder import in this file; confirm before simplifying.
  return new org.apache.lucene.search.highlight.DefaultEncoder();
}
private void alternateField(NamedList docSummaries, SolrParams params, Document doc, String requestFieldname, String schemaFieldName, SolrQueryRequest req) { String requestAlternateField = params.getFieldParam(requestFieldname, HighlightParams.ALTERNATE_FIELD); if (requestAlternateField != null && requestAlternateField.length() > 0) { String schemaAlternateFieldName = AlfrescoSolrDataModel.getInstance().mapProperty(requestAlternateField, FieldUse.HIGHLIGHT, req); IndexableField[] docFields = doc.getFields(schemaAlternateFieldName); if (docFields.length == 0) { // The alternate field did not exist, treat the original field as fallback instead docFields = doc.getFields(schemaFieldName); } List<String> listFields = new ArrayList<>(); for (IndexableField field : docFields) { if (field.binaryValue() == null) listFields.add(field.stringValue()); } String[] altTexts = listFields.toArray(new String[listFields.size()]); if (altTexts != null && altTexts.length > 0) { Encoder encoder = getEncoder(requestFieldname, params); int alternateFieldLen = params.getFieldInt(requestFieldname, HighlightParams.ALTERNATE_FIELD_LENGTH, 0); List<String> altList = new ArrayList<>(); int len = 0; for (String altText : altTexts) { if (alternateFieldLen <= 0) { altList.add(encoder.encodeText(altText)); } else { altList.add(len + altText.length() > alternateFieldLen ? encoder.encodeText(new String(altText.substring(0, alternateFieldLen - len))) : encoder .encodeText(altText)); len += altText.length(); if (len >= alternateFieldLen) break; } } docSummaries.add(requestFieldname, altList); } } }
private void matchedFieldsTestCase( boolean useMatchedFields, boolean fieldMatch, String fieldValue, String expected, Query... queryClauses ) throws IOException { Document doc = new Document(); FieldType stored = new FieldType( TextField.TYPE_STORED ); stored.setStoreTermVectorOffsets( true ); stored.setStoreTermVectorPositions( true ); stored.setStoreTermVectors( true ); stored.freeze(); FieldType matched = new FieldType( TextField.TYPE_NOT_STORED ); matched.setStoreTermVectorOffsets( true ); matched.setStoreTermVectorPositions( true ); matched.setStoreTermVectors( true ); matched.freeze(); doc.add( new Field( "field", fieldValue, stored ) ); // Whitespace tokenized with English stop words doc.add( new Field( "field_exact", fieldValue, matched ) ); // Whitespace tokenized without stop words doc.add( new Field( "field_super_exact", fieldValue, matched ) ); // Whitespace tokenized without toLower doc.add( new Field( "field_characters", fieldValue, matched ) ); // Each letter is a token doc.add( new Field( "field_tripples", fieldValue, matched ) ); // Every three letters is a token doc.add( new Field( "field_sliced", fieldValue.substring( 0, // Sliced at 10 chars then analyzed just like field Math.min( fieldValue.length() - 1 , 10 ) ), matched ) ); doc.add( new Field( "field_der_red", new CannedTokenStream( // Hacky field containing "der" and "red" at pos = 0 token( "der", 1, 0, 3 ), token( "red", 0, 0, 3 ) ), matched ) ); final Map<String, Analyzer> fieldAnalyzers = new TreeMap<String, Analyzer>(); fieldAnalyzers.put( "field", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, true, MockTokenFilter.ENGLISH_STOPSET ) ); fieldAnalyzers.put( "field_exact", new MockAnalyzer( random() ) ); fieldAnalyzers.put( "field_super_exact", new MockAnalyzer( random(), MockTokenizer.WHITESPACE, false ) ); fieldAnalyzers.put( "field_characters", new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp(".").toAutomaton() ), true ) ); fieldAnalyzers.put( "field_tripples", 
new MockAnalyzer( random(), new CharacterRunAutomaton( new RegExp("...").toAutomaton() ), true ) ); fieldAnalyzers.put( "field_sliced", fieldAnalyzers.get( "field" ) ); fieldAnalyzers.put( "field_der_red", fieldAnalyzers.get( "field" ) ); // This is required even though we provide a token stream Analyzer analyzer = new AnalyzerWrapper() { public Analyzer getWrappedAnalyzer(String fieldName) { return fieldAnalyzers.get( fieldName ); } }; Directory dir = newDirectory(); IndexWriter writer = new IndexWriter( dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer ) ); writer.addDocument( doc ); FastVectorHighlighter highlighter = new FastVectorHighlighter(); FragListBuilder fragListBuilder = new SimpleFragListBuilder(); FragmentsBuilder fragmentsBuilder = new ScoreOrderFragmentsBuilder(); IndexReader reader = DirectoryReader.open( writer, true ); String[] preTags = new String[] { "<b>" }; String[] postTags = new String[] { "</b>" }; Encoder encoder = new DefaultEncoder(); int docId = 0; BooleanQuery query = new BooleanQuery(); for ( Query clause : queryClauses ) { query.add( clause, Occur.MUST ); } FieldQuery fieldQuery = new FieldQuery( query, reader, true, fieldMatch ); String[] bestFragments; if ( useMatchedFields ) { Set< String > matchedFields = new HashSet< String >(); matchedFields.add( "field" ); matchedFields.add( "field_exact" ); matchedFields.add( "field_super_exact" ); matchedFields.add( "field_characters" ); matchedFields.add( "field_tripples" ); matchedFields.add( "field_sliced" ); matchedFields.add( "field_der_red" ); bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", matchedFields, 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); } else { bestFragments = highlighter.getBestFragments( fieldQuery, reader, docId, "field", 25, 1, fragListBuilder, fragmentsBuilder, preTags, postTags, encoder ); } assertEquals( expected, bestFragments[ 0 ] ); reader.close(); writer.close(); dir.close(); }