protected String makeFragment( StringBuilder buffer, int[] index, Field[] values,
    WeightedFragInfo fragInfo, String[] preTags, String[] postTags, Encoder encoder ){
  StringBuilder fragment = new StringBuilder();
  final int s = fragInfo.getStartOffset();
  int[] modifiedStartOffset = { s };
  String src = getFragmentSourceMSO( buffer, index, values, s, fragInfo.getEndOffset(), modifiedStartOffset );
  int srcIndex = 0;
  for( SubInfo subInfo : fragInfo.getSubInfos() ){
    for( Toffs to : subInfo.getTermsOffsets() ){
      fragment
        .append( encoder.encodeText( src.substring( srcIndex, to.getStartOffset() - modifiedStartOffset[0] ) ) )
        .append( getPreTag( preTags, subInfo.getSeqnum() ) )
        .append( encoder.encodeText( src.substring( to.getStartOffset() - modifiedStartOffset[0], to.getEndOffset() - modifiedStartOffset[0] ) ) )
        .append( getPostTag( postTags, subInfo.getSeqnum() ) );
      srcIndex = to.getEndOffset() - modifiedStartOffset[0];
    }
  }
  fragment.append( encoder.encodeText( src.substring( srcIndex ) ) );
  return fragment.toString();
}
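The subtraction of modifiedStartOffset[0] above converts absolute term offsets (relative to the whole field value) into offsets relative to the fragment source string returned by getFragmentSourceMSO. A minimal sketch of that arithmetic, using hypothetical values not taken from the class:

// Hypothetical illustration of the offset arithmetic in makeFragment().
String field = "a quick brown fox";
int fragStart = 2;                         // absolute start of the fragment
String src = field.substring( fragStart ); // "quick brown fox" (fragment source)
int termStart = 8, termEnd = 13;           // absolute offsets of "brown" in field
String highlighted =
    src.substring( 0, termStart - fragStart )                   // "quick "
    + "<b>" + src.substring( termStart - fragStart,
                             termEnd - fragStart ) + "</b>"     // "<b>brown</b>"
    + src.substring( termEnd - fragStart );                     // " fox"
// highlighted == "quick <b>brown</b> fox"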
private void testCase( Query query, int fragCharSize, String expectedFragInfo,
    double expectedTotalSubInfoBoost ) throws Exception {
  makeIndexLongMV();

  FieldQuery fq = new FieldQuery( query, true, true );
  FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
  FieldPhraseList fpl = new FieldPhraseList( stack, fq );
  WeightedFragListBuilder wflb = new WeightedFragListBuilder();
  FieldFragList ffl = wflb.createFieldFragList( fpl, fragCharSize );
  assertEquals( 1, ffl.getFragInfos().size() );
  assertEquals( expectedFragInfo, ffl.getFragInfos().get( 0 ).toString() );

  float totalSubInfoBoost = 0;
  for ( WeightedFragInfo info : ffl.getFragInfos() ) {
    for ( SubInfo subInfo : info.getSubInfos() ) {
      totalSubInfoBoost += subInfo.getBoost();
    }
  }
  assertEquals( expectedTotalSubInfoBoost, totalSubInfoBoost, .0000001 );
}
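A hypothetical invocation of this helper might look like the following; the expected fragment string and boost depend entirely on the content indexed by makeIndexLongMV(), so the literal values below are placeholders rather than real expectations:

// Placeholder values only: the expected WeightedFragInfo.toString() and the
// total boost are determined by the test index built in makeIndexLongMV().
testCase( new TermQuery( new Term( F, "lucene" ) ), 100,
    "subInfos=(lucene((10,16)))/1.0(0,100)", 1.0 );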
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  float totalBoost = 0;
  List<SubInfo> subInfos = new ArrayList<>();
  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(),
        phraseInfo.getSeqnum(), phraseInfo.getBoost() ) );
    totalBoost += phraseInfo.getBoost();
  }
  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  List<SubInfo> tempSubInfos = new ArrayList<>();
  List<SubInfo> realSubInfos = new ArrayList<>();
  HashSet<String> distinctTerms = new HashSet<>();
  int length = 0;

  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    float phraseTotalBoost = 0;
    for ( TermInfo ti : phraseInfo.getTermsInfos() ) {
      if ( distinctTerms.add( ti.getText() ) )
        phraseTotalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
    tempSubInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(),
        phraseInfo.getSeqnum(), phraseTotalBoost ) );
  }

  // We want the number of terms per fragment (length) to factor into the weight;
  // otherwise a one-word query would weight all fragments equally, no matter how
  // many words they contain. To keep fragments with many words from "outranking"
  // more relevant fragments, we dampen the length with a standard normalization.
  float norm = length * ( 1 / (float)Math.sqrt( length ) );

  float totalBoost = 0;
  for ( SubInfo tempSubInfo : tempSubInfos ) {
    float subInfoBoost = tempSubInfo.getBoost() * norm;
    realSubInfos.add( new SubInfo( tempSubInfo.getText(), tempSubInfo.getTermsOffsets(),
        tempSubInfo.getSeqnum(), subInfoBoost ) );
    totalBoost += subInfoBoost;
  }

  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, realSubInfos, totalBoost ) );
}
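Since length * (1 / sqrt(length)) reduces algebraically to sqrt(length), the normalization grows sub-linearly with fragment size. A quick sketch of the factor:

// norm(length) = length * (1 / sqrt(length)) = sqrt(length)
static float norm( int length ) {
  return length * ( 1 / (float)Math.sqrt( length ) );
}
// norm(1) == 1.0f, norm(4) == 2.0f, norm(9) == 3.0f:
// a nine-term fragment gets three times the per-term weight, not nine times.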
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  float totalBoost = 0;
  List<SubInfo> subInfos = new ArrayList<SubInfo>();
  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    // This variant constructs SubInfo without a per-phrase boost; only the
    // fragment-level totalBoost is accumulated.
    subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
    totalBoost += phraseInfo.getBoost();
  }
  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  float totalBoost = 0;
  List<SubInfo> subInfos = new ArrayList<SubInfo>();
  HashSet<String> distinctTerms = new HashSet<String>();
  int length = 0;
  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(), phraseInfo.getSeqnum() ) );
    for ( TermInfo ti : phraseInfo.getTermsInfos() ) {
      if ( distinctTerms.add( ti.getText() ) )
        totalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
  }
  // We want the number of terms per fragment (length) to factor into the weight;
  // otherwise a one-word query would weight all fragments equally, no matter how
  // many words they contain. To keep fragments with many words from "outranking"
  // more relevant fragments, we dampen the length with a standard normalization.
  totalBoost *= length * ( 1 / Math.sqrt( length ) );
  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
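For context, here is a sketch of how one of these builders plugs into FastVectorHighlighter. It assumes a Query, an IndexReader, and a document id are in scope, and that the "content" field was indexed with term vectors including positions and offsets, which the highlighter requires:

// Sketch: wiring the weighted builder into the highlighter.
FastVectorHighlighter highlighter = new FastVectorHighlighter(
    true, true,                          // phraseHighlight, fieldMatch
    new WeightedFragListBuilder(),       // produces the WeightedFragInfos above
    new ScoreOrderFragmentsBuilder() );
FieldQuery fieldQuery = highlighter.getFieldQuery( query );
String fragment = highlighter.getBestFragment( fieldQuery, reader, docId, "content", 100 );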