public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
  this.boost = boost;
  this.seqnum = seqnum;

  // We keep TermInfos for further operations
  termsInfos = new ArrayList<>( terms );

  termsOffsets = new ArrayList<>( terms.size() );
  TermInfo ti = terms.get( 0 );
  termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
  if( terms.size() == 1 ){
    return;
  }
  int pos = ti.getPosition();
  for( int i = 1; i < terms.size(); i++ ){
    ti = terms.get( i );
    if( ti.getPosition() - pos == 1 ){
      Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
      to.setEndOffset( ti.getEndOffset() );
    } else{
      termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
    }
    pos = ti.getPosition();
  }
}
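// Minimal usage sketch (illustration only, not part of the original sources; offsets and
// positions are made up, and it mirrors the package-level TermInfo constructor used by the
// tests below): terms at consecutive positions are merged into a single Toffs, while a
// position gap would start a new offset range.
// LinkedList<TermInfo> terms = new LinkedList<>();
// terms.add( new TermInfo( "personal", 10, 18, 2, 1 ) );
// terms.add( new TermInfo( "computer", 19, 27, 3, 1 ) );  // position 3 == 2 + 1, so offsets merge
// WeightedPhraseInfo wpi = new WeightedPhraseInfo( terms, 1f, 0 );
// wpi.getTermsOffsets() then holds a single Toffs spanning offsets 10..27.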
public boolean isValidTermOrPhrase( final List<TermInfo> phraseCandidate ){
  // check terminal
  if( !terminal ) return false;
  // if the candidate is a term, it is valid
  if( phraseCandidate.size() == 1 ) return true;
  // else check whether the candidate is a valid phrase:
  // compare the position-gaps between terms to the slop
  int pos = phraseCandidate.get( 0 ).getPosition();
  for( int i = 1; i < phraseCandidate.size(); i++ ){
    int nextPos = phraseCandidate.get( i ).getPosition();
    if( Math.abs( nextPos - pos - 1 ) > slop ) return false;
    pos = nextPos;
  }
  return true;
}
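// Worked example for the slop check above (illustration only): with slop = 1, positions
// 0, 2, 4 give gaps |2 - 0 - 1| = 1 and |4 - 2 - 1| = 1, both <= slop, so the candidate
// validates; positions 0, 3, 6 give gaps of 2 and fail. testSearchPhraseSlop below
// exercises exactly these two cases.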
public void testTermInfoComparisonConsistency() {
  TermInfo a = new TermInfo( TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 );
  TermInfo b = new TermInfo( TestUtil.randomUnicodeString( random() ), 0, 0, 1, 1 );
  TermInfo c = new TermInfo( TestUtil.randomUnicodeString( random() ), 0, 0, 2, 1 );
  TermInfo d = new TermInfo( TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 );

  assertConsistentEquals( a, a );
  assertConsistentEquals( b, b );
  assertConsistentEquals( c, c );
  assertConsistentEquals( d, d );
  assertConsistentEquals( a, d );
  assertConsistentLessThan( a, b );
  assertConsistentLessThan( b, c );
  assertConsistentLessThan( a, c );
  assertConsistentLessThan( d, b );
  assertConsistentLessThan( d, c );
}
public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
  this.boost = boost;
  this.seqnum = seqnum;

  // We keep TermInfos for further operations
  termsInfos = new ArrayList<TermInfo>( terms );

  termsOffsets = new ArrayList<Toffs>( terms.size() );
  TermInfo ti = terms.get( 0 );
  termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
  if( terms.size() == 1 ){
    return;
  }
  int pos = ti.getPosition();
  for( int i = 1; i < terms.size(); i++ ){
    ti = terms.get( i );
    if( ti.getPosition() - pos == 1 ){
      Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
      to.setEndOffset( ti.getEndOffset() );
    } else{
      termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
    }
    pos = ti.getPosition();
  }
}
public void testTermInfoComparisonConsistency() {
  TermInfo a = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 );
  TermInfo b = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 1, 1 );
  TermInfo c = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 2, 1 );
  TermInfo d = new TermInfo( _TestUtil.randomUnicodeString( random() ), 0, 0, 0, 1 );

  assertConsistentEquals( a, a );
  assertConsistentEquals( b, b );
  assertConsistentEquals( c, c );
  assertConsistentEquals( d, d );
  assertConsistentEquals( a, d );
  assertConsistentLessThan( a, b );
  assertConsistentLessThan( b, c );
  assertConsistentLessThan( a, c );
  assertConsistentLessThan( d, b );
  assertConsistentLessThan( d, c );
}
/**
 * Text of the match, calculated on the fly. Use for debugging only.
 * @return the text
 */
public String getText() {
  StringBuilder text = new StringBuilder();
  for( TermInfo ti : termsInfos ) {
    text.append( ti.getText() );
  }
  return text.toString();
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  List<SubInfo> tempSubInfos = new ArrayList<>();
  List<SubInfo> realSubInfos = new ArrayList<>();
  HashSet<String> distinctTerms = new HashSet<>();
  int length = 0;

  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    float phraseTotalBoost = 0;
    for( TermInfo ti : phraseInfo.getTermsInfos() ) {
      if( distinctTerms.add( ti.getText() ) )
        phraseTotalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
    tempSubInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(),
        phraseInfo.getSeqnum(), phraseTotalBoost ) );
  }

  // We want the number of terms per fragment (length) to be included in the weight.
  // Otherwise a one-word query would yield an equal weight for all fragments, regardless
  // of how many words they contain. To avoid fragments containing a high number of words
  // possibly "outranking" more relevant fragments, we "bend" the length a little bit
  // with a standard normalization.
  float norm = length * ( 1 / (float)Math.sqrt( length ) );

  float totalBoost = 0;
  for( SubInfo tempSubInfo : tempSubInfos ) {
    float subInfoBoost = tempSubInfo.getBoost() * norm;
    realSubInfos.add( new SubInfo( tempSubInfo.getText(), tempSubInfo.getTermsOffsets(),
        tempSubInfo.getSeqnum(), subInfoBoost ) );
    totalBoost += subInfoBoost;
  }

  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, realSubInfos, totalBoost ) );
}
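// Note (illustration only): the normalization factor simplifies algebraically, since
// length * ( 1 / sqrt(length) ) == sqrt(length). For a fragment with four matching terms,
// norm = 4 * (1 / 2) = 2, so each SubInfo boost is doubled; fragment weight thus grows
// sub-linearly with the number of matching terms rather than linearly.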
public QueryPhraseMap searchPhrase( final List<TermInfo> phraseCandidate ){
  QueryPhraseMap currMap = this;
  for( TermInfo ti : phraseCandidate ){
    currMap = currMap.subMap.get( ti.getText() );
    if( currMap == null ) return null;
  }
  return currMap.isValidTermOrPhrase( phraseCandidate ) ? currMap : null;
}
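// Note (illustration only): searchPhrase descends the subMap trie keyed by term text; a
// miss returns null, and even a complete walk only succeeds if the final node passes
// isValidTermOrPhrase (terminal flag plus slop check). This is why, with phraseHighlight
// enabled, the tests below get null for the prefixes "a" and "a b" of the phrase "a b c".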
public void testSearchPhrase() throws Exception {
  Query query = pqF( "a", "b", "c" );

  // phraseHighlight = true, fieldMatch = true
  FieldQuery fq = new FieldQuery( query, true, true );

  // "a"
  List<TermInfo> phraseCandidate = new ArrayList<>();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b"
  phraseCandidate.add( new TermInfo( "b", 2, 3, 1, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b c"
  phraseCandidate.add( new TermInfo( "c", 4, 5, 2, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  assertNull( fq.searchPhrase( "x", phraseCandidate ) );

  // phraseHighlight = true, fieldMatch = false
  fq = new FieldQuery( query, true, false );

  // "a b c"
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  assertNotNull( fq.searchPhrase( "x", phraseCandidate ) );

  // phraseHighlight = false, fieldMatch = true
  fq = new FieldQuery( query, false, true );

  // "a"
  phraseCandidate.clear();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b"
  phraseCandidate.add( new TermInfo( "b", 2, 3, 1, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b c"
  phraseCandidate.add( new TermInfo( "c", 4, 5, 2, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  assertNull( fq.searchPhrase( "x", phraseCandidate ) );
}
public void testSearchPhraseSlop() throws Exception {
  // "a b c"~0
  Query query = pqF( "a", "b", "c" );

  // phraseHighlight = true, fieldMatch = true
  FieldQuery fq = new FieldQuery( query, true, true );

  // "a b c" w/ position-gap = 2
  List<TermInfo> phraseCandidate = new ArrayList<>();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  phraseCandidate.add( new TermInfo( "b", 2, 3, 2, 1 ) );
  phraseCandidate.add( new TermInfo( "c", 4, 5, 4, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );

  // "a b c"~1
  query = pqF( 1F, 1, "a", "b", "c" );

  // phraseHighlight = true, fieldMatch = true
  fq = new FieldQuery( query, true, true );

  // "a b c" w/ position-gap = 2
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );

  // "a b c" w/ position-gap = 3
  phraseCandidate.clear();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  phraseCandidate.add( new TermInfo( "b", 2, 3, 3, 1 ) );
  phraseCandidate.add( new TermInfo( "c", 4, 5, 6, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
}
private void defgMultiTermQueryTest(Query query) throws IOException {
  FieldQuery fq = new FieldQuery( query, reader, true, true );
  QueryPhraseMap qpm = fq.getFieldTermMap( F, "defg" );
  assertNotNull( qpm );
  assertNull( fq.getFieldTermMap( F, "dog" ) );
  List<TermInfo> phraseCandidate = new ArrayList<>();
  phraseCandidate.add( new TermInfo( "defg", 0, 12, 0, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
}
public void testFieldTermStackIndex1wSearch2terms() throws Exception {
  makeIndex1w();

  BooleanQuery bq = new BooleanQuery();
  bq.add( tq( "Mac" ), Occur.SHOULD );
  bq.add( tq( "MacBook" ), Occur.SHOULD );
  FieldQuery fq = new FieldQuery( bq, true, true );
  FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
  assertEquals( 1, stack.termList.size() );
  TermInfo ti = stack.pop();
  assertEquals( "Mac(11,20,3)", ti.toString() );
  assertEquals( "MacBook(11,20,3)", ti.getNext().toString() );
  assertSame( ti, ti.getNext().getNext() );
}
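// Note (illustration only): the assertions above rely on TermInfo's intrusive linked list.
// Terms that share a position (here "Mac" and "MacBook", both at position 3) are chained
// through getNext(), and the chain is circular, so following next twice in a two-element
// ring returns the starting TermInfo (assertSame). This is also why termList has size 1
// even though two query terms matched.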
public void testFieldTermStackIndex1w2wSearch1term1phrase() throws Exception {
  makeIndex1w2w();

  BooleanQuery bq = new BooleanQuery();
  bq.add( tq( "pc" ), Occur.SHOULD );
  bq.add( pqF( "personal", "computer" ), Occur.SHOULD );
  FieldQuery fq = new FieldQuery( bq, true, true );
  FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
  assertEquals( 2, stack.termList.size() );
  TermInfo ti = stack.pop();
  assertEquals( "pc(3,5,1)", ti.toString() );
  assertEquals( "personal(3,5,1)", ti.getNext().toString() );
  assertSame( ti, ti.getNext().getNext() );
  assertEquals( "computer(3,5,2)", stack.pop().toString() );
}
public void testFieldTermStackIndex2w1wSearch1term1phrase() throws Exception {
  makeIndex2w1w();

  BooleanQuery bq = new BooleanQuery();
  bq.add( tq( "pc" ), Occur.SHOULD );
  bq.add( pqF( "personal", "computer" ), Occur.SHOULD );
  FieldQuery fq = new FieldQuery( bq, true, true );
  FieldTermStack stack = new FieldTermStack( reader, 0, F, fq );
  assertEquals( 2, stack.termList.size() );
  TermInfo ti = stack.pop();
  assertEquals( "pc(3,20,1)", ti.toString() );
  assertEquals( "personal(3,20,1)", ti.getNext().toString() );
  assertSame( ti, ti.getNext().getNext() );
  assertEquals( "computer(3,20,2)", stack.pop().toString() );
}
public WeightedPhraseInfo( LinkedList<TermInfo> terms, float boost, int seqnum ){
  this.boost = boost;
  this.seqnum = seqnum;

  // We keep TermInfos for further operations
  termsInfos = new ArrayList<TermInfo>( terms );

  termsOffsets = new ArrayList<Toffs>( terms.size() );
  TermInfo ti = terms.get( 0 );
  termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
  if( terms.size() == 1 ){
    text = ti.getText();
    return;
  }
  StringBuilder sb = new StringBuilder();
  sb.append( ti.getText() );
  int pos = ti.getPosition();
  for( int i = 1; i < terms.size(); i++ ){
    ti = terms.get( i );
    sb.append( ti.getText() );
    if( ti.getPosition() - pos == 1 ){
      Toffs to = termsOffsets.get( termsOffsets.size() - 1 );
      to.setEndOffset( ti.getEndOffset() );
    } else{
      termsOffsets.add( new Toffs( ti.getStartOffset(), ti.getEndOffset() ) );
    }
    pos = ti.getPosition();
  }
  text = sb.toString();
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  float totalBoost = 0;
  List<SubInfo> subInfos = new ArrayList<SubInfo>();
  HashSet<String> distinctTerms = new HashSet<String>();
  int length = 0;
  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    subInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(),
        phraseInfo.getSeqnum() ) );
    for( TermInfo ti : phraseInfo.getTermsInfos() ) {
      if( distinctTerms.add( ti.getText() ) )
        totalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
  }
  // We want the number of terms per fragment (length) to be included in the weight.
  // Otherwise a one-word query would yield an equal weight for all fragments, regardless
  // of how many words they contain. To avoid fragments containing a high number of words
  // possibly "outranking" more relevant fragments, we "bend" the length a little bit
  // with a standard normalization.
  totalBoost *= length * ( 1 / Math.sqrt( length ) );
  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, subInfos, totalBoost ) );
}
public void testSearchPhrase() throws Exception {
  Query query = pqF( "a", "b", "c" );

  // phraseHighlight = true, fieldMatch = true
  FieldQuery fq = new FieldQuery( query, true, true );

  // "a"
  List<TermInfo> phraseCandidate = new ArrayList<TermInfo>();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b"
  phraseCandidate.add( new TermInfo( "b", 2, 3, 1, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b c"
  phraseCandidate.add( new TermInfo( "c", 4, 5, 2, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  assertNull( fq.searchPhrase( "x", phraseCandidate ) );

  // phraseHighlight = true, fieldMatch = false
  fq = new FieldQuery( query, true, false );

  // "a b c"
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  assertNotNull( fq.searchPhrase( "x", phraseCandidate ) );

  // phraseHighlight = false, fieldMatch = true
  fq = new FieldQuery( query, false, true );

  // "a"
  phraseCandidate.clear();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b"
  phraseCandidate.add( new TermInfo( "b", 2, 3, 1, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
  // "a b c"
  phraseCandidate.add( new TermInfo( "c", 4, 5, 2, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
  assertNull( fq.searchPhrase( "x", phraseCandidate ) );
}
public void testSearchPhraseSlop() throws Exception {
  // "a b c"~0
  Query query = pqF( "a", "b", "c" );

  // phraseHighlight = true, fieldMatch = true
  FieldQuery fq = new FieldQuery( query, true, true );

  // "a b c" w/ position-gap = 2
  List<TermInfo> phraseCandidate = new ArrayList<TermInfo>();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  phraseCandidate.add( new TermInfo( "b", 2, 3, 2, 1 ) );
  phraseCandidate.add( new TermInfo( "c", 4, 5, 4, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );

  // "a b c"~1
  query = pqF( 1F, 1, "a", "b", "c" );

  // phraseHighlight = true, fieldMatch = true
  fq = new FieldQuery( query, true, true );

  // "a b c" w/ position-gap = 2
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );

  // "a b c" w/ position-gap = 3
  phraseCandidate.clear();
  phraseCandidate.add( new TermInfo( "a", 0, 1, 0, 1 ) );
  phraseCandidate.add( new TermInfo( "b", 2, 3, 3, 1 ) );
  phraseCandidate.add( new TermInfo( "c", 4, 5, 6, 1 ) );
  assertNull( fq.searchPhrase( F, phraseCandidate ) );
}
private void defgMultiTermQueryTest(Query query) throws IOException {
  FieldQuery fq = new FieldQuery( query, reader, true, true );
  QueryPhraseMap qpm = fq.getFieldTermMap( F, "defg" );
  assertNotNull( qpm );
  assertNull( fq.getFieldTermMap( F, "dog" ) );
  List<TermInfo> phraseCandidate = new ArrayList<TermInfo>();
  phraseCandidate.add( new TermInfo( "defg", 0, 12, 0, 1 ) );
  assertNotNull( fq.searchPhrase( F, phraseCandidate ) );
}
@Override
public void add( int startOffset, int endOffset, List<WeightedPhraseInfo> phraseInfoList ) {
  List<SubInfo> tempSubInfos = new ArrayList<SubInfo>();
  List<SubInfo> realSubInfos = new ArrayList<SubInfo>();
  HashSet<String> distinctTerms = new HashSet<String>();
  int length = 0;

  for( WeightedPhraseInfo phraseInfo : phraseInfoList ){
    float phraseTotalBoost = 0;
    for( TermInfo ti : phraseInfo.getTermsInfos() ) {
      if( distinctTerms.add( ti.getText() ) )
        phraseTotalBoost += ti.getWeight() * phraseInfo.getBoost();
      length++;
    }
    tempSubInfos.add( new SubInfo( phraseInfo.getText(), phraseInfo.getTermsOffsets(),
        phraseInfo.getSeqnum(), phraseTotalBoost ) );
  }

  // We want the number of terms per fragment (length) to be included in the weight.
  // Otherwise a one-word query would yield an equal weight for all fragments, regardless
  // of how many words they contain. To avoid fragments containing a high number of words
  // possibly "outranking" more relevant fragments, we "bend" the length a little bit
  // with a standard normalization.
  float norm = length * ( 1 / (float)Math.sqrt( length ) );

  float totalBoost = 0;
  for( SubInfo tempSubInfo : tempSubInfos ) {
    float subInfoBoost = tempSubInfo.getBoost() * norm;
    realSubInfos.add( new SubInfo( tempSubInfo.getText(), tempSubInfo.getTermsOffsets(),
        tempSubInfo.getSeqnum(), subInfoBoost ) );
    totalBoost += subInfoBoost;
  }

  getFragInfos().add( new WeightedFragInfo( startOffset, endOffset, realSubInfos, totalBoost ) );
}