Example source code for the Java class org.apache.lucene.util.fst.FST
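The snippets below show how various open-source projects use org.apache.lucene.util.fst.FST. For orientation, here is a minimal sketch (not taken from any of the projects below) of building and querying a small FST with the Builder/PositiveIntOutputs/Util API that these examples rely on; the helper name buildExampleFst and the key/value pairs are made up:

import java.io.IOException;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.PositiveIntOutputs;
import org.apache.lucene.util.fst.Util;

static FST<Long> buildExampleFst() throws IOException {
  // Outputs define the value type stored along FST paths; here non-negative longs.
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(FST.INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  // Keys must be added in sorted (unsigned byte) order.
  builder.add(Util.toIntsRef(new BytesRef("cat"), scratch), 5L);
  builder.add(Util.toIntsRef(new BytesRef("dog"), scratch), 7L);
  return builder.finish();
}

// Exact-match lookup; Util.get returns null when the key is absent.
Long value = Util.get(buildExampleFst(), new BytesRef("dog"));  // 7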

Project: elasticsearch_my    File: XFuzzySuggester.java
@Override
protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(
    List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths, Automaton lookupAutomaton,
    FST<PairOutputs.Pair<Long,BytesRef>> fst)
        throws IOException {

    // TODO: right now there's no penalty for fuzzy/edits,
    // ie a completion whose prefix matched exactly what the
    // user typed gets no boost over completions that
    // required an edit, which get no boost over completions
    // requiring two edits.  I suspect a multiplicative
    // factor is appropriate (eg, say a fuzzy match must be at
    // least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...

    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
  Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
  w.write(levA.toDot());
  w.close();
  System.out.println("Wrote LevA to out.dot");
*/
    return FSTUtil.intersectPrefixPaths(levA, fst);
}
Project: lams    File: NormalizeCharMap.java
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
        while(true) {
          assert scratchArc.label != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!!  (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
Project: lams    File: NormalizeCharMap.java
/** Builds the NormalizeCharMap; call this once you
 *  are done calling {@link #add}. */
public NormalizeCharMap build() {

  final FST<CharsRef> map;
  try {
    final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
      builder.add(Util.toUTF16(ent.getKey(), scratch),
                  new CharsRef(ent.getValue()));
    }
    map = builder.finish();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }

  return new NormalizeCharMap(map);
}
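A rough usage sketch of the Builder/build API shown above (assuming Lucene's NormalizeCharMap.Builder and MappingCharFilter; the mappings are made up):

import java.io.Reader;
import java.io.StringReader;
import org.apache.lucene.analysis.charfilter.MappingCharFilter;
import org.apache.lucene.analysis.charfilter.NormalizeCharMap;

NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
builder.add("ß", "ss");                  // register input -> replacement mappings
builder.add("æ", "ae");
NormalizeCharMap map = builder.build();  // call once, after all add() calls
Reader normalized = new MappingCharFilter(map, new StringReader("straße"));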
Project: lams    File: Stemmer.java
/**
 * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
 *
 * @param dictionary Dictionary that will be used to create the stems
 */
public Stemmer(Dictionary dictionary) {
  this.dictionary = dictionary;
  this.affixReader = new ByteArrayDataInput(dictionary.affixData);
  for (int level = 0; level < 3; level++) {
    if (dictionary.prefixes != null) {
      prefixArcs[level] = new FST.Arc<>();
      prefixReaders[level] = dictionary.prefixes.getBytesReader();
    }
    if (dictionary.suffixes != null) {
      suffixArcs[level] = new FST.Arc<>();
      suffixReaders[level] = dictionary.suffixes.getBytesReader();
    }
  }
  formStep = dictionary.hasStemExceptions ? 2 : 1;
}
Project: lams    File: Dictionary.java
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    String parts[] = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return builder.finish();
}
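As a hedged illustration, the affix-file lines this method parses look like Hunspell-style ICONV/OCONV entries: each data line has three whitespace-separated tokens, which is why parts[1] is the input and parts[2] the replacement, and the count line is assumed to have been consumed by the caller, which passes it as num (the mappings here are made up):

ICONV 2
ICONV á a
ICONV é e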
Project: lams    File: StemmerOverrideFilter.java
/**
 * Returns a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}.
 * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs
 */
public StemmerOverrideMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
      FST.INPUT_TYPE.BYTE4, outputs);
  final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
  IntsRefBuilder intsSpare = new IntsRefBuilder();
  final int size = hash.size();
  BytesRef spare = new BytesRef();
  for (int i = 0; i < size; i++) {
    int id = sort[i];
    BytesRef bytesRef = hash.get(id, spare);
    intsSpare.copyUTF8Bytes(bytesRef);
    builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
  }
  return new StemmerOverrideMap(builder.finish(), ignoreCase);
}
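A rough usage sketch of this builder (assuming Lucene's StemmerOverrideFilter.Builder; upstreamTokens stands in for an existing TokenStream and the override entries are made up):

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter;
import org.apache.lucene.analysis.miscellaneous.StemmerOverrideFilter.StemmerOverrideMap;

StemmerOverrideFilter.Builder builder = new StemmerOverrideFilter.Builder(true); // true = ignoreCase
builder.add("mice", "mouse");   // surface form -> forced stem
builder.add("geese", "goose");
StemmerOverrideMap map = builder.build();
// Wrapping an existing TokenStream makes matching terms keep the forced stem
// (the filter also marks them as keywords so later stemmers skip them):
TokenStream overridden = new StemmerOverrideFilter(upstreamTokens, map);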
Project: Elasticsearch    File: XFuzzySuggester.java
@Override
protected List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<PairOutputs.Pair<Long,BytesRef>>> prefixPaths,
                                                                                 Automaton lookupAutomaton,
                                                                                 FST<PairOutputs.Pair<Long,BytesRef>> fst)
        throws IOException {

    // TODO: right now there's no penalty for fuzzy/edits,
    // ie a completion whose prefix matched exactly what the
    // user typed gets no boost over completions that
    // required an edit, which get no boost over completions
    // requiring two edits.  I suspect a multiplicative
    // factor is appropriate (eg, say a fuzzy match must be at
    // least 2X better weight than the non-fuzzy match to
    // "compete") ... in which case I think the wFST needs
    // to be log weights or something ...

    Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
/*
  Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
  w.write(levA.toDot());
  w.close();
  System.out.println("Wrote LevA to out.dot");
*/
    return FSTUtil.intersectPrefixPaths(levA, fst);
}
Project: elasticsearch-analysis-dynamic-synonym    File: DynamicSynonymFilter.java
/**
 * Adds the update logic. Every field assigned in this method was originally final and was reworked to be assignable; note that these fields must only be assigned inside this method, otherwise bugs may result.
 *
 * @param synonymMap
 */
@Override
public void update(SynonymMap synonymMap) {
    this.synonyms = synonymMap;
    this.fst = synonyms.fst;
    if(this.fst == null) {
        throw new IllegalArgumentException("fst must be non-null");
    } else {
        this.fstReader = this.fst.getBytesReader();
        this.rollBufferSize = 1 + synonyms.maxHorizontalContext;
        this.futureInputs = new DynamicSynonymFilter.PendingInput[this.rollBufferSize];
        this.futureOutputs = new DynamicSynonymFilter.PendingOutputs[this.rollBufferSize];

        for(int pos = 0; pos < this.rollBufferSize; ++pos) {
            this.futureInputs[pos] = new DynamicSynonymFilter.PendingInput();
            this.futureOutputs[pos] = new DynamicSynonymFilter.PendingOutputs();
        }

        this.scratchArc = new FST.Arc();
    }
}
Project: search    File: FSTTermsReader.java
/** Lazily accumulate metadata when we get an accepted term */
void loadMetaData() throws IOException {
  FST.Arc<FSTTermOutputs.TermData> last, next;
  last = stack[metaUpto].fstArc;
  while (metaUpto != level) {
    metaUpto++;
    next = stack[metaUpto].fstArc;
    next.output = fstOutputs.add(next.output, last.output);
    last = next;
  }
  if (last.isFinal()) {
    meta = fstOutputs.add(last.output, last.nextFinalOutput);
  } else {
    meta = last.output;
  }
  state.docFreq = meta.docFreq;
  state.totalTermFreq = meta.totalTermFreq;
}
Project: search    File: FSTTermsReader.java
static<T> void walk(FST<T> fst) throws IOException {
  final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
  final BitSet seen = new BitSet();
  final FST.BytesReader reader = fst.getBytesReader();
  final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
  queue.add(startArc);
  while (!queue.isEmpty()) {
    final FST.Arc<T> arc = queue.remove(0);
    final long node = arc.target;
    //System.out.println(arc);
    if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
      seen.set((int) node);
      fst.readFirstRealTargetArc(node, arc, reader);
      while (true) {
        queue.add(new FST.Arc<T>().copyFrom(arc));
        if (arc.isLast()) {
          break;
        } else {
          fst.readNextRealArc(arc, reader);
        }
      }
    }
  }
}
Project: search    File: MemoryDocValuesConsumer.java
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  long ord = 0;
  for (BytesRef v : values) {
    builder.add(Util.toIntsRef(v, scratch), ord);
    ord++;
  }
  FST<Long> fst = builder.finish();
  if (fst != null) {
    fst.save(data);
  }
  meta.writeVLong(ord);
}
Project: search    File: FSTOrdTermsReader.java
static<T> void walk(FST<T> fst) throws IOException {
  final ArrayList<FST.Arc<T>> queue = new ArrayList<>();
  final BitSet seen = new BitSet();
  final FST.BytesReader reader = fst.getBytesReader();
  final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
  queue.add(startArc);
  while (!queue.isEmpty()) {
    final FST.Arc<T> arc = queue.remove(0);
    final long node = arc.target;
    //System.out.println(arc);
    if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
      seen.set((int) node);
      fst.readFirstRealTargetArc(node, arc, reader);
      while (true) {
        queue.add(new FST.Arc<T>().copyFrom(arc));
        if (arc.isLast()) {
          break;
        } else {
          fst.readNextRealArc(arc, reader);
        }
      }
    }
  }
}
Project: search    File: OrdsSegmentTermsEnum.java
OrdsSegmentTermsEnumFrame pushFrame(FST.Arc<Output> arc, Output frameData, int length) throws IOException {
  scratchReader.reset(frameData.bytes.bytes, frameData.bytes.offset, frameData.bytes.length);
  final long code = scratchReader.readVLong();
  final long fpSeek = code >>> OrdsBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
  // System.out.println("    fpSeek=" + fpSeek);
  final OrdsSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
  f.hasTerms = (code & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  f.hasTermsOrig = f.hasTerms;
  f.isFloor = (code & OrdsBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;

  // Must setFloorData before pushFrame in case pushFrame tries to rewind:
  if (f.isFloor) {
    f.termOrdOrig = frameData.startOrd;
    f.setFloorData(scratchReader, frameData.bytes);
  }

  pushFrame(arc, fpSeek, length, frameData.startOrd);

  return f;
}
Project: search    File: FuzzySuggester.java
@Override
protected List<FSTUtil.Path<Pair<Long,BytesRef>>> getFullPrefixPaths(List<FSTUtil.Path<Pair<Long,BytesRef>>> prefixPaths,
                                                                     Automaton lookupAutomaton,
                                                                     FST<Pair<Long,BytesRef>> fst)
  throws IOException {

  // TODO: right now there's no penalty for fuzzy/edits,
  // ie a completion whose prefix matched exactly what the
  // user typed gets no boost over completions that
  // required an edit, which get no boost over completions
  // requiring two edits.  I suspect a multiplicative
  // factor is appropriate (eg, say a fuzzy match must be at
  // least 2X better weight than the non-fuzzy match to
  // "compete") ... in which case I think the wFST needs
  // to be log weights or something ...

  Automaton levA = convertAutomaton(toLevenshteinAutomata(lookupAutomaton));
  /*
    Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), StandardCharsets.UTF_8);
    w.write(levA.toDot());
    w.close();
    System.out.println("Wrote LevA to out.dot");
  */
  return FSTUtil.intersectPrefixPaths(levA, fst);
}
Project: search    File: FreeTextSuggester.java
@Override
public boolean load(DataInput input) throws IOException {
  CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
  count = input.readVLong();
  byte separatorOrig = input.readByte();
  if (separatorOrig != separator) {
    throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
  }
  int gramsOrig = input.readVInt();
  if (gramsOrig != grams) {
    throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
  }
  totTokens = input.readVLong();

  fst = new FST<>(input, PositiveIntOutputs.getSingleton());

  return true;
}
Project: search    File: FreeTextSuggester.java
private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
                          BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {

  Long output = fst.outputs.getNoOutput();

  fst.getFirstArc(arc);

  byte[] bytes = scratch.bytes;
  int pos = scratch.offset;
  int end = pos + scratch.length;
  while (pos < end) {
    if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
      return null;
    } else {
      output = fst.outputs.add(output, arc.output);
    }
  }

  return output;
}
Project: search    File: FSTCompletion.java
/**
 * Cache the root node's output arcs starting with completions with the
 * highest weights.
 */
@SuppressWarnings({"unchecked","rawtypes"})
private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
  try {
    List<Arc<Object>> rootArcs = new ArrayList<>();
    Arc<Object> arc = automaton.getFirstArc(new Arc<>());
    FST.BytesReader fstReader = automaton.getBytesReader();
    automaton.readFirstTargetArc(arc, arc, fstReader);
    while (true) {
      rootArcs.add(new Arc<>().copyFrom(arc));
      if (arc.isLast()) break;
      automaton.readNextArc(arc, fstReader);
    }

    Collections.reverse(rootArcs); // we want highest weights first.
    return rootArcs.toArray(new Arc[rootArcs.size()]);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Project: search    File: IDVersionSegmentTermsEnum.java
IDVersionSegmentTermsEnumFrame pushFrame(FST.Arc<Pair<BytesRef,Long>> arc, Pair<BytesRef,Long> frameData, int length) throws IOException {
  scratchReader.reset(frameData.output1.bytes, frameData.output1.offset, frameData.output1.length);
  final long code = scratchReader.readVLong();
  final long fpSeek = code >>> VersionBlockTreeTermsWriter.OUTPUT_FLAGS_NUM_BITS;
  final IDVersionSegmentTermsEnumFrame f = getFrame(1+currentFrame.ord);
  f.maxIDVersion = Long.MAX_VALUE - frameData.output2;
  f.hasTerms = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_HAS_TERMS) != 0;
  f.hasTermsOrig = f.hasTerms;
  f.isFloor = (code & VersionBlockTreeTermsWriter.OUTPUT_FLAG_IS_FLOOR) != 0;
  if (f.isFloor) {
    f.setFloorData(scratchReader, frameData.output1);
  }
  pushFrame(arc, fpSeek, length);

  return f;
}
Project: search    File: TokenInfoDictionary.java
private TokenInfoDictionary() throws IOException {
  super();
  InputStream is = null;
  FST<Long> fst = null;
  boolean success = false;
  try {
    is = getResource(FST_FILENAME_SUFFIX);
    is = new BufferedInputStream(is);
    fst = new FST<>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton());
    success = true;
  } finally {
    if (success) {
      IOUtils.close(is);
    } else {
      IOUtils.closeWhileHandlingException(is);
    }
  }
  // TODO: some way to configure?
  this.fst = new TokenInfoFST(fst, true);
}
Project: search    File: NormalizeCharMap.java
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
        while(true) {
          assert scratchArc.label != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!!  (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
Project: search    File: NormalizeCharMap.java
/** Builds the NormalizeCharMap; call this once you
 *  are done calling {@link #add}. */
public NormalizeCharMap build() {

  final FST<CharsRef> map;
  try {
    final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
      builder.add(Util.toUTF16(ent.getKey(), scratch),
                  new CharsRef(ent.getValue()));
    }
    map = builder.finish();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }

  return new NormalizeCharMap(map);
}
Project: search    File: Stemmer.java
/**
 * Constructs a new Stemmer which will use the provided Dictionary to create its stems.
 *
 * @param dictionary Dictionary that will be used to create the stems
 */
public Stemmer(Dictionary dictionary) {
  this.dictionary = dictionary;
  this.affixReader = new ByteArrayDataInput(dictionary.affixData);
  for (int level = 0; level < 3; level++) {
    if (dictionary.prefixes != null) {
      prefixArcs[level] = new FST.Arc<>();
      prefixReaders[level] = dictionary.prefixes.getBytesReader();
    }
    if (dictionary.suffixes != null) {
      suffixArcs[level] = new FST.Arc<>();
      suffixReaders[level] = dictionary.suffixes.getBytesReader();
    }
  }
  formStep = dictionary.hasStemExceptions ? 2 : 1;
}
Project: search    File: Dictionary.java
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    String parts[] = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return builder.finish();
}
Project: search    File: StemmerOverrideFilter.java
/**
 * Returns a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}.
 * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs
 */
public StemmerOverrideMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<>(
      FST.INPUT_TYPE.BYTE4, outputs);
  final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
  IntsRefBuilder intsSpare = new IntsRefBuilder();
  final int size = hash.size();
  BytesRef spare = new BytesRef();
  for (int i = 0; i < size; i++) {
    int id = sort[i];
    BytesRef bytesRef = hash.get(id, spare);
    intsSpare.copyUTF8Bytes(bytesRef);
    builder.add(intsSpare.get(), new BytesRef(outputValues.get(id)));
  }
  return new StemmerOverrideMap(builder.finish(), ignoreCase);
}
Project: search    File: Lucene42DocValuesConsumer.java
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  long ord = 0;
  for (BytesRef v : values) {
    builder.add(Util.toIntsRef(v, scratch), ord);
    ord++;
  }
  FST<Long> fst = builder.finish();
  if (fst != null) {
    fst.save(data);
  }
  meta.writeVLong(ord);
}
Project: cc-analysis    File: CcWordsFilter.java
protected CcWordsFilter(TokenStream input, CcArgs args) {
    super(input);
    this.args = args;
    //
    this.fst = args.wordSet.fst;
    this.fstReader = args.wordSet.fst.getBytesReader();
    this.fstWords = args.wordSet.words;
    this.fstFirstArc = new FST.Arc<>();
    this.fst.getFirstArc(fstFirstArc);
    this.scratchWordBytesRef = new BytesRef();
    this.scratchArc = new FST.Arc<>();
    this.scratchArcOfSep = new FST.Arc<>();
    this.scatchArcOfEnd = new FST.Arc<>();
    //
    this.pendingOutputs = new LinkedList<>();
}
Project: elasticsearch-analysis-synonym    File: NGramSynonymTokenizer.java
@Override
public void reset() throws IOException {
    super.reset();
    block.setLength(0);
    prevToken = null;
    readBufferIndex = BUFFER_SIZE;
    readBufferLen = 0;
    ch = 0;
    blkStart = 0;
    nextBlkStart = 0;
    if (synonymLoader != null && synonymLoader.isUpdate(lastModified)) {
        lastModified = synonymLoader.getLastModified();
        final SynonymMap map = synonymLoader.getSynonymMap();
        if (map != null) {
            synonymMap = map;
            fst = synonymMap.fst;
            if (fst == null) {
                throw new IllegalArgumentException("fst must be non-null");
            }
            fstReader = fst.getBytesReader();
            scratchArc = new FST.Arc<>();
            clearAttributes();
        }
    }
}
Project: read-open-source-code    File: MemoryDocValuesConsumer.java
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<>(INPUT_TYPE.BYTE1, outputs);
  IntsRefBuilder scratch = new IntsRefBuilder();
  long ord = 0;
  for (BytesRef v : values) {
    builder.add(Util.toIntsRef(v, scratch), ord);
    ord++;
  }
  FST<Long> fst = builder.finish();
  if (fst != null) {
    fst.save(data);
  }
  meta.writeVLong(ord);
}
Project: NYBC    File: FSTCompletion.java
/**
 * Cache the root node's output arcs starting with completions with the
 * highest weights.
 */
@SuppressWarnings({"unchecked","rawtypes"})
private static Arc<Object>[] cacheRootArcs(FST<Object> automaton) {
  try {
    List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
    Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
    FST.BytesReader fstReader = automaton.getBytesReader();
    automaton.readFirstTargetArc(arc, arc, fstReader);
    while (true) {
      rootArcs.add(new Arc<Object>().copyFrom(arc));
      if (arc.isLast()) break;
      automaton.readNextArc(arc, fstReader);
    }

    Collections.reverse(rootArcs); // we want highest weights first.
    return rootArcs.toArray(new Arc[rootArcs.size()]);
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}
Project: read-open-source-code    File: FreeTextSuggester.java
@Override
public boolean load(DataInput input) throws IOException {
  CodecUtil.checkHeader(input, CODEC_NAME, VERSION_START, VERSION_START);
  count = input.readVLong();
  byte separatorOrig = input.readByte();
  if (separatorOrig != separator) {
    throw new IllegalStateException("separator=" + separator + " is incorrect: original model was built with separator=" + separatorOrig);
  }
  int gramsOrig = input.readVInt();
  if (gramsOrig != grams) {
    throw new IllegalStateException("grams=" + grams + " is incorrect: original model was built with grams=" + gramsOrig);
  }
  totTokens = input.readVLong();

  fst = new FST<Long>(input, PositiveIntOutputs.getSingleton());

  return true;
}
Project: read-open-source-code    File: FreeTextSuggester.java
private Long lookupPrefix(FST<Long> fst, FST.BytesReader bytesReader,
                          BytesRef scratch, Arc<Long> arc) throws /*Bogus*/IOException {

  Long output = fst.outputs.getNoOutput();

  fst.getFirstArc(arc);

  byte[] bytes = scratch.bytes;
  int pos = scratch.offset;
  int end = pos + scratch.length;
  while (pos < end) {
    if (fst.findTargetArc(bytes[pos++] & 0xff, arc, arc, bytesReader) == null) {
      return null;
    } else {
      output = fst.outputs.add(output, arc.output);
    }
  }

  return output;
}
Project: NYBC    File: TokenInfoDictionary.java
private TokenInfoDictionary() throws IOException {
  super();
  IOException priorE = null;
  InputStream is = null;
  FST<Long> fst = null;
  try {
    is = getResource(FST_FILENAME_SUFFIX);
    is = new BufferedInputStream(is);
    fst = new FST<Long>(new InputStreamDataInput(is), PositiveIntOutputs.getSingleton(true));
  } catch (IOException ioe) {
    priorE = ioe;
  } finally {
    IOUtils.closeWhileHandlingException(priorE, is);
  }
  // TODO: some way to configure?
  this.fst = new TokenInfoFST(fst, true);
}
Project: read-open-source-code    File: MemoryDocValuesConsumer.java
private void writeFST(FieldInfo field, Iterable<BytesRef> values) throws IOException {
  meta.writeVInt(field.number);
  meta.writeByte(FST);
  meta.writeLong(data.getFilePointer());
  PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton();
  Builder<Long> builder = new Builder<Long>(INPUT_TYPE.BYTE1, outputs);
  IntsRef scratch = new IntsRef();
  long ord = 0;
  for (BytesRef v : values) {
    builder.add(Util.toIntsRef(v, scratch), ord);
    ord++;
  }
  FST<Long> fst = builder.finish();
  if (fst != null) {
    fst.save(data);
  }
  meta.writeVLong(ord);
}
Project: read-open-source-code    File: NormalizeCharMap.java
/** Builds the NormalizeCharMap; call this once you
 *  are done calling {@link #add}. */
public NormalizeCharMap build() {

  final FST<CharsRef> map;
  try {
    final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
    final IntsRef scratch = new IntsRef();
    for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
      builder.add(Util.toUTF16(ent.getKey(), scratch),
                  new CharsRef(ent.getValue()));
    }
    map = builder.finish();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }

  return new NormalizeCharMap(map);
}
Project: NYBC    File: BlockTreeTermsWriter.java
TermsWriter(FieldInfo fieldInfo) {
  this.fieldInfo = fieldInfo;

  noOutputs = NoOutputs.getSingleton();

  // This Builder is just used transiently to fragment
  // terms into "good" blocks; we don't save the
  // resulting FST:
  blockBuilder = new Builder<Object>(FST.INPUT_TYPE.BYTE1,
                                     0, 0, true,
                                     true, Integer.MAX_VALUE,
                                     noOutputs,
                                     new FindBlocks(), false,
                                     PackedInts.COMPACT,
                                     true, 15);

  postingsWriter.setField(fieldInfo);
}
Project: read-open-source-code    File: StemmerOverrideFilter.java
/**
 * Returns a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}.
 * @return a {@link StemmerOverrideMap} to be used with the {@link StemmerOverrideFilter}
 * @throws IOException if an {@link IOException} occurs
 */
public StemmerOverrideMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  org.apache.lucene.util.fst.Builder<BytesRef> builder = new org.apache.lucene.util.fst.Builder<BytesRef>(
      FST.INPUT_TYPE.BYTE4, outputs);
  final int[] sort = hash.sort(BytesRef.getUTF8SortedAsUnicodeComparator());
  IntsRef intsSpare = new IntsRef();
  final int size = hash.size();
  for (int i = 0; i < size; i++) {
    int id = sort[i];
    BytesRef bytesRef = hash.get(id, spare);
    UnicodeUtil.UTF8toUTF32(bytesRef, intsSpare);
    builder.add(intsSpare, new BytesRef(outputValues.get(id)));
  }
  return new StemmerOverrideMap(builder.finish(), ignoreCase);
}
Project: read-open-source-code    File: NormalizeCharMap.java
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<CharsRef>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
        while(true) {
          assert scratchArc.label != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!!  (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
Project: read-open-source-code    File: NormalizeCharMap.java
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
        while(true) {
          assert scratchArc.label != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!!  (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
Project: read-open-source-code    File: FSTOrdTermsReader.java
static<T> void walk(FST<T> fst) throws IOException {
  final ArrayList<FST.Arc<T>> queue = new ArrayList<FST.Arc<T>>();
  final BitSet seen = new BitSet();
  final FST.BytesReader reader = fst.getBytesReader();
  final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
  queue.add(startArc);
  while (!queue.isEmpty()) {
    final FST.Arc<T> arc = queue.remove(0);
    final long node = arc.target;
    //System.out.println(arc);
    if (FST.targetHasArcs(arc) && !seen.get((int) node)) {
      seen.set((int) node);
      fst.readFirstRealTargetArc(node, arc, reader);
      while (true) {
        queue.add(new FST.Arc<T>().copyFrom(arc));
        if (arc.isLast()) {
          break;
        } else {
          fst.readNextRealArc(arc, reader);
        }
      }
    }
  }
}
Project: read-open-source-code    File: FSTTermsReader.java
/** Lazily accumulate metadata when we get an accepted term */
void loadMetaData() throws IOException {
  FST.Arc<FSTTermOutputs.TermData> last, next;
  last = stack[metaUpto].fstArc;
  while (metaUpto != level) {
    metaUpto++;
    next = stack[metaUpto].fstArc;
    next.output = fstOutputs.add(next.output, last.output);
    last = next;
  }
  if (last.isFinal()) {
    meta = fstOutputs.add(last.output, last.nextFinalOutput);
  } else {
    meta = last.output;
  }
  state.docFreq = meta.docFreq;
  state.totalTermFreq = meta.totalTermFreq;
}