Java examples for class org.apache.lucene.index.TermFreqVector (usage source code)

项目:opensearchserver    文件:Facet.java   
/**
 * Builds facet counts for a multivalued field from stored term vectors.
 * For every document in the result set, each term with a positive frequency
 * contributes one count to that term's {@link FacetCounter}.
 * Documents without a vector (or without terms/frequencies) are skipped.
 */
private static Map<String, FacetCounter> computeMultivaluedTFV(ReaderAbstract reader, String fieldName,
        DocIdInterface docIdInterface) throws IOException, SearchLibException {
    final Map<String, FacetCounter> counters = new LinkedHashMap<>();
    if (docIdInterface.getSize() == 0)
        return counters;
    for (final int docId : docIdInterface.getIds()) {
        final TermFreqVector vector = reader.getTermFreqVector(docId, fieldName);
        if (vector == null)
            continue;
        final String[] terms = vector.getTerms();
        final int[] frequencies = vector.getTermFrequencies();
        if (terms == null || frequencies == null)
            continue;
        // terms and frequencies are parallel arrays
        for (int pos = 0; pos < terms.length; pos++) {
            if (frequencies[pos] <= 0)
                continue;
            final FacetCounter counter = counters.get(terms[pos]);
            if (counter == null)
                counters.put(terms[pos], new FacetCounter(1));
            else
                counter.increment();
        }
    }
    return counters;
}
项目:opensearchserver    文件:ReaderLocal.java   
/**
 * Collects, for each requested field, the terms stored in the document's
 * term vector, wrapped as {@code FieldValueItem}s of origin TERM_VECTOR.
 * Fields with no vector (or no terms) are silently omitted from the result.
 */
public Set<FieldValue> getTermsVectorFields(int docId, Set<String> fieldNameSet) throws IOException {
    final Set<FieldValue> result = new HashSet<FieldValue>();
    for (final String fieldName : fieldNameSet) {
        final TermFreqVector vector = indexReader.getTermFreqVector(docId, fieldName);
        if (vector == null)
            continue;
        final String[] terms = vector.getTerms();
        if (terms == null)
            continue;
        final FieldValueItem[] items = new FieldValueItem[terms.length];
        for (int i = 0; i < terms.length; i++)
            items[i] = new FieldValueItem(FieldValueOriginEnum.TERM_VECTOR, terms[i]);
        result.add(new FieldValue(fieldName, items));
    }
    return result;
}
项目:opensearchserver    文件:ResultDocuments.java   
/**
 * Populates one {@code IndexDocumentResult} per document id in {@code docArray}.
 * For each schema field: stored values come from the stored-field map; indexed
 * terms come either from the inverted index (no term vector configured) or
 * from the document's stored term vector.
 */
@Override
public void populate(List<IndexDocumentResult> indexDocuments) throws IOException, SearchLibException {
    final SchemaFieldList fields = request.getConfig().getSchema().getFieldList();
    for (final int docId : docArray) {
        final IndexDocumentResult document = new IndexDocumentResult(fields.size());
        final Map<String, FieldValue> storedFields = reader.getDocumentStoredField(docId);
        for (final SchemaField field : fields) {
            final String name = field.getName();
            List<IndexTerm> terms = null;
            if (field.checkIndexed(Indexed.YES)) {
                // Without a term vector, terms must be read from the index itself.
                terms = field.getTermVector() == TermVector.NO
                        ? IndexTerm.toList(reader, name, docId)
                        : IndexTerm.toList(reader.getTermFreqVector(docId, name));
            }
            document.add(new IndexField(name, storedFields.get(name), terms));
        }
        indexDocuments.add(document);
    }
}
项目:t4f-data    文件:CategorizerTest.java   
/**
 * Builds, for each "category" value found in the book index, a term->frequency
 * map aggregated from the "subject" term vectors of all live documents.
 * Results are accumulated into {@code categoryMap} (category -> TreeMap of terms).
 */
private void buildCategoryVectors() throws IOException {
  IndexReader reader = DirectoryReader.open(TestUtil.getBookIndexDirectory());
  try {
    int maxDoc = reader.maxDoc();

    for (int i = 0; i < maxDoc; i++) {
      if (!reader.isDeleted(i)) {
        Document doc = reader.document(i);
        String category = doc.get("category");

        // Lazily create the per-category term map on first sight of a category.
        Map vectorMap = (Map) categoryMap.get(category);
        if (vectorMap == null) {
          vectorMap = new TreeMap();
          categoryMap.put(category, vectorMap);
        }

        TermFreqVector termFreqVector =
            reader.getTermFreqVector(i, "subject");

        addTermFreqToMap(vectorMap, termFreqVector);
      }
    }
  } finally {
    reader.close(); // FIX: the reader was previously opened and never closed (leak)
  }
}
项目:t4f-data    文件:CategorizerTest.java   
/**
 * Folds one document's term-frequency vector into the running per-category
 * totals ({@code term -> summed frequency}).
 *
 * Tolerates a {@code null} vector or {@code null} term array: callers obtain
 * the vector from {@code IndexReader.getTermFreqVector}, which other code in
 * this codebase null-checks, so guard here instead of throwing NPE.
 */
private void addTermFreqToMap(Map vectorMap,
                              TermFreqVector termFreqVector) {
  if (termFreqVector == null)
    return; // document had no "subject" term vector
  String[] terms = termFreqVector.getTerms();
  int[] freqs = termFreqVector.getTermFrequencies();
  if (terms == null || freqs == null)
    return;

  for (int i = 0; i < terms.length; i++) {
    String term = terms[i];

    // Integer.valueOf instead of the deprecated Integer constructor
    // (valueOf also benefits from the small-value cache).
    Integer previous = (Integer) vectorMap.get(term);
    if (previous == null) {
      vectorMap.put(term, Integer.valueOf(freqs[i]));
    } else {
      vectorMap.put(term, Integer.valueOf(previous.intValue() + freqs[i]));
    }
  }
}
项目:opensearchserver    文件:ReaderLocal.java   
/**
 * Appends the term-frequency vector of every requested document id to the
 * supplied collection, preserving the order of {@code docIds}.
 *
 * NOTE(review): {@code getTermFreqVector} may return {@code null} (other call
 * sites null-check it), so {@code null} entries can be added here — callers
 * must be prepared for that.
 */
public void putTermFreqVectors(final int[] docIds, final String field,
        final Collection<TermFreqVector> termFreqVectors) throws IOException {
    if (termFreqVectors == null || docIds == null || docIds.length == 0)
        return;
    for (int i = 0; i < docIds.length; i++)
        termFreqVectors.add(indexReader.getTermFreqVector(docIds[i], field));
}
项目:opensearchserver    文件:ReaderLocal.java   
/**
 * Appends the term arrays of the requested documents' term-frequency vectors
 * to {@code termVectors}.
 *
 * FIX: {@code putTermFreqVectors} adds a {@code null} entry for every document
 * that has no stored vector ({@code getTermFreqVector} returns null in that
 * case, as other call sites in this file show). The previous code called
 * {@code getTerms()} on those entries and threw a NullPointerException;
 * documents without a vector are now skipped.
 */
@Override
public void putTermVectors(int[] docIds, String field, Collection<String[]> termVectors) throws IOException {
    if (docIds == null || docIds.length == 0 || field == null || termVectors == null)
        return;
    List<TermFreqVector> termFreqVectors = new ArrayList<TermFreqVector>(docIds.length);
    putTermFreqVectors(docIds, field, termFreqVectors);
    for (TermFreqVector termFreqVector : termFreqVectors)
        if (termFreqVector != null)
            termVectors.add(termFreqVector.getTerms());
}
项目:opensearchserver    文件:IndexSingle.java   
/**
 * Returns the term-frequency vector of {@code field} on {@code docId}.
 * Verifies the index is online, then borrows a local reader for the duration
 * of the lookup and always releases it, even on exception.
 */
@Override
final public TermFreqVector getTermFreqVector(final int docId, final String field)
        throws IOException, SearchLibException {
    checkOnline(true);
    final ReaderLocal localReader = acquire();
    try {
        return localReader.getTermFreqVector(docId, field);
    } finally {
        // acquire/release must always be paired
        release(localReader);
    }
}
项目:opensearchserver    文件:SnippetVectors.java   
/**
 * Returns a positional term vector for the given document/field.
 * The vector stored in the index is used when it already carries positions;
 * otherwise one is rebuilt by re-analyzing the stored field values with the
 * supplied analyzer. Returns {@code null} when no positional vector exists
 * and no analyzer is available to rebuild one.
 */
private static final TermPositionVector getTermPositionVector(
        final String[] terms, final ReaderInterface readerInterface,
        final int docId, final String field, List<FieldValueItem> values,
        CompiledAnalyzer analyzer, Timer timer) throws IOException,
        SearchLibException, ParseException, SyntaxError {
    final TermFreqVector stored = readerInterface.getTermFreqVector(docId, field);
    // instanceof is null-safe, so the explicit null check is folded in.
    if (stored instanceof TermPositionVector)
        return (TermPositionVector) stored;
    if (analyzer == null)
        return null;
    final SnippetTermPositionVector rebuilt =
            new SnippetTermPositionVector(field, terms);
    final List<TokenTerm> tokens = new ArrayList<TokenTerm>();
    int position = 0;
    int character = 0;
    for (final FieldValueItem item : values) {
        if (item.value == null)
            continue;
        analyzer.populate(item.value, tokens);
        position = rebuilt.addCollection(tokens, character, position);
        character += item.value.length() + 1; // +1 keeps values from abutting
        tokens.clear();
    }
    rebuilt.compile();
    return rebuilt;
}
项目:opensearchserver    文件:IndexDocumentResult.java   
/**
 * Converts a term-frequency vector into a list of {@code IndexTerm}s.
 * Positional data is included when the vector is a {@link TermPositionVector}.
 * Returns {@code null} when the vector itself or its term array is absent.
 */
public final static List<IndexTerm> toList(TermFreqVector termVector) {
    if (termVector == null)
        return null;
    final String[] terms = termVector.getTerms();
    if (terms == null)
        return null;
    final int[] frequencies = termVector.getTermFrequencies();
    final List<IndexTerm> indexTerms = new ArrayList<IndexTerm>(terms.length);
    if (!(termVector instanceof TermPositionVector))
        toListFreq(termVector, terms, frequencies, indexTerms);
    else
        toListPosition((TermPositionVector) termVector, terms, frequencies, indexTerms);
    return indexTerms;
}
项目:opensearchserver    文件:IndexDocumentResult.java   
/**
 * Appends one {@code IndexTerm} per term/frequency pair; no position or
 * offset data is available in a plain frequency vector, hence the nulls.
 */
private final static void toListFreq(TermFreqVector termVector, String[] terms, int[] frequencies,
        List<IndexTerm> indexTerms) {
    // terms and frequencies are parallel arrays
    for (int i = 0; i < terms.length; i++)
        indexTerms.add(new IndexTerm(terms[i], frequencies[i], null, null));
}
项目:WikiKreator    文件:CosineDocumentSimilarity.java   
/**
 * Indexes each sentence as its own document (term vectors enabled) into an
 * in-memory index, then builds one tf-idf weighted, normalized DocVector per
 * sentence, suitable for cosine-similarity comparison.
 *
 * Fixes over the previous version:
 *  - FileReader, IndexWriter, TermEnum and IndexReader are now closed even on
 *    exception (all were leaked on error paths; the FileReader always).
 *  - reader.terms(t) returns an enumeration already positioned on the first
 *    matching term, so the old while(next()) loop skipped the first term of
 *    the "contents" field; a do/while now processes it.
 */
public static DocVector[] getCosineSimilarityMatrix(List<String> fileSentences) throws IOException{

        RAMDirectory ramDir = new RAMDirectory();

        // NOTE(review): FileReader uses the platform default charset — confirm
        // the stop-word file encoding matches.
        Analyzer analyzer;
        FileReader stopWordsReader = new FileReader(new File("lib/stoplists/en.txt"));
        try {
            analyzer = new StopAnalyzer(Version.LUCENE_36, stopWordsReader);
        } finally {
            stopWordsReader.close();
        }

        // Index the full text of every sentence as a separate document.
        IndexWriter writer = new IndexWriter(ramDir, new IndexWriterConfig(Version.LUCENE_36, analyzer));
        try {
            for (String sentence : fileSentences) {
                Document doc = new Document();
                doc.add(new Field("contents", new StringReader(sentence), TermVector.YES));
                writer.addDocument(doc);
            }
        } finally {
            writer.close();
        }

        DocVector[] docs = new DocVector[fileSentences.size()];
        IndexReader reader = IndexReader.open(ramDir);
        try {
            // Assign a stable column index to every distinct term of "contents".
            Map<String, Integer> terms = new HashMap<String, Integer>();
            TermEnum termEnum = reader.terms(new Term("contents"));
            try {
                int pos = 0;
                do {
                    Term term = termEnum.term();
                    if (term == null || !"contents".equals(term.field()))
                        break;
                    terms.put(term.text(), pos++);
                } while (termEnum.next());
            } finally {
                termEnum.close();
            }

            // Build a tf-idf vector for each document, then normalize it.
            for (int i = 0; i < fileSentences.size(); i++) {
                docs[i] = new DocVector(terms);
                TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
                if (tfvs == null)
                    continue; // document has no stored vectors
                for (TermFreqVector tfv : tfvs) {
                    String[] termTexts = tfv.getTerms();
                    int[] termFreqs = tfv.getTermFrequencies();
                    for (int j = 0; j < termTexts.length; j++) {
                        double idfValue = getIDF(reader, termTexts[j]);
                        docs[i].setEntry(termTexts[j], termFreqs[j] * idfValue);
                    }
                }
                docs[i].normalize();
            }
        } finally {
            reader.close();
        }
        ramDir.close();
        return docs;
    }
项目:dash-xtf    文件:LimIndexReader.java   
/**
 * Pass-through to the wrapped reader's term-frequency-vector lookup;
 * this wrapper adds no behavior of its own here.
 */
public TermFreqVector getTermFreqVector(int docNumber, String field)
  throws IOException 
{
  return wrapped.getTermFreqVector(docNumber, field);
}
项目:dash-xtf    文件:LimIndexReader.java   
/**
 * Pass-through returning all term-frequency vectors of a document from the
 * wrapped reader; this wrapper adds no behavior of its own here.
 */
public TermFreqVector[] getTermFreqVectors(int docNumber)
  throws IOException 
{
  return wrapped.getTermFreqVectors(docNumber);
}
项目:opensearchserver    文件:ReaderLocal.java   
/**
 * Direct delegation to the underlying Lucene reader. Other call sites in
 * this codebase null-check the result, so callers should expect {@code null}
 * when the document has no stored vector for the field.
 */
@Override
public TermFreqVector getTermFreqVector(final int docId, final String field) throws IOException {
    return indexReader.getTermFreqVector(docId, field);
}
项目:eventspotter    文件:CosineSimilarity.java   
/**
 * Computes the cosine similarity of two strings by indexing each as a
 * document in an in-memory index and comparing their term-frequency vectors.
 *
 * Fixes over the previous version:
 *  - IndexWriter, TermEnum and IndexReader are now closed even when an
 *    exception occurs (TermEnum was never closed at all).
 *  - reader.terms(t) returns an enumeration already positioned on the first
 *    matching term; the old while(next()) loop skipped the first term of the
 *    "content" field (the misleadingly indented terms.put also ran
 *    unconditionally). A do/while now processes every term.
 *  - Dead commented-out code and the mangled loop braces were removed.
 */
public double run(String doc1,String doc2) throws IOException 
    {
        // index both strings (s is a field; s[0]/s[1] assignments kept)
        s[0]=doc1;
        s[1]=doc2;

        Directory index = new RAMDirectory();
        StandardAnalyzer analyzer = new StandardAnalyzer(Version.LUCENE_36);
        IndexWriterConfig config = new IndexWriterConfig(Version.LUCENE_36, analyzer);
        IndexWriter writer = new IndexWriter(index, config);
        try {
            for (String si : s) {
                Document doc = new Document();
                doc.add(new Field("content", si, Field.Store.YES, Field.Index.ANALYZED,TermVector.WITH_POSITIONS_OFFSETS));
                writer.addDocument(doc);
            }
        } finally {
            writer.close();
        }

        DocVector[] docs = new DocVector[s.length];
        IndexReader reader = IndexReader.open(index);
        try {
            // assign a stable vector position to every distinct "content" term
            Map<String,Integer> terms = new HashMap<String,Integer>();
            TermEnum termEnum = reader.terms(new Term("content"));
            try {
                int pos = 0;
                do {
                    Term term = termEnum.term();
                    if (term == null || !"content".equals(term.field()))
                        break;
                    terms.put(term.text(), pos++);
                } while (termEnum.next());
            } finally {
                termEnum.close();
            }

            // build and normalize one raw term-frequency vector per document
            for (int i = 0; i < s.length; i++) {
                docs[i] = new DocVector(terms);
                TermFreqVector[] tfvs = reader.getTermFreqVectors(i);
                for (TermFreqVector tfv : tfvs) {
                    String[] termTexts = tfv.getTerms();
                    int[] termFreqs = tfv.getTermFrequencies();
                    for (int j = 0; j < termTexts.length; j++)
                        docs[i].setEntry(termTexts[j], termFreqs[j]);
                }
                docs[i].normalize();
            }
        } finally {
            reader.close();
        }

        // similarity between doc[0] and doc[1]
        return getCosineSimilarity(docs[0], docs[1]);
    }
项目:opensearchserver    文件:ReaderInterface.java   
/**
 * Returns the term-frequency vector stored for {@code field} on document
 * {@code docId}. Implementations may return {@code null} when no vector was
 * stored — callers in this codebase null-check the result.
 */
TermFreqVector getTermFreqVector(final int docId, final String field) throws IOException, SearchLibException;