Java class org.apache.lucene.util.CharsRef: example source code

Project: search    File: TestLimitTokenPositionFilter.java
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = new MockTokenizer(new StringReader("one two three four five"), MockTokenizer.WHITESPACE, false);
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
Project: lams    File: SynonymMap.java
/** Sugar: just joins the provided terms with {@link
 *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
 *  must not be null. */
public static CharsRef join(String[] words, CharsRefBuilder reuse) {
  int upto = 0;
  char[] buffer = reuse.chars();
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars();
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }

    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.setLength(upto);
  return reuse.get();
}
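
A hedged usage sketch of join(): the words land in the reused buffer separated by SynonymMap.WORD_SEPARATOR ('\u0000'), and the returned CharsRef can then be registered as a multi-word synonym, just as the test above does. The helper name wifiMap is hypothetical; the classes come from org.apache.lucene.analysis.synonym and org.apache.lucene.util.

static SynonymMap wifiMap() throws IOException {
  CharsRefBuilder scratch = new CharsRefBuilder();
  CharsRef joined = SynonymMap.Builder.join(new String[]{"wi", "fi"}, scratch);
  // joined now holds "wi\u0000fi": two words delimited by WORD_SEPARATOR
  SynonymMap.Builder builder = new SynonymMap.Builder(true); // dedup = true
  builder.add(new CharsRef("wifi"), joined, true);           // keepOrig = true
  return builder.build();
}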
Project: lams    File: SynonymMap.java
/** only used for asserting! */
private boolean hasHoles(CharsRef chars) {
  final int end = chars.offset + chars.length;
  for(int idx=chars.offset+1;idx<end;idx++) {
    if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) {
      return true;
    }
  }
  if (chars.chars[chars.offset] == '\u0000') {
    return true;
  }
  if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
    return true;
  }

  return false;
}
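
For illustration, the shapes this assertion rejects (a hedged sketch; WORD_SEPARATOR is '\u0000', and a "hole" is an empty word inside a multi-word entry):

CharsRef ok          = new CharsRef("wi\u0000fi");       // two non-empty words: no hole
CharsRef leadingHole = new CharsRef("\u0000wifi");       // empty first word: hole
CharsRef middleHole  = new CharsRef("wi\u0000\u0000fi"); // empty middle word: hole
// hasHoles(ok) == false; hasHoles(leadingHole) == hasHoles(middleHole) == true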
Project: lams    File: WordnetSynonymParser.java
private void addInternal(CharsRef synset[], int size) {
  if (size <= 1) {
    return; // nothing to do
  }

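  // expand == true: emit every ordered pair in the synset (size * size mappings),
  // e.g. {a, b} yields a->a, a->b, b->a, b->b.
  // expand == false: map every member onto the head word synset[0] only.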
  if (expand) {
    for (int i = 0; i < size; i++) {
      for (int j = 0; j < size; j++) {
        add(synset[i], synset[j], false);
      }
    }
  } else {
    for (int i = 0; i < size; i++) {
      add(synset[i], synset[0], false);
    }
  }
}
Project: lams    File: NormalizeCharMap.java
private NormalizeCharMap(FST<CharsRef> map) {
  this.map = map;
  if (map != null) {
    try {
      // Pre-cache root arcs:
      final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
      final FST.BytesReader fstReader = map.getBytesReader();
      map.getFirstArc(scratchArc);
      if (FST.targetHasArcs(scratchArc)) {
        map.readFirstRealTargetArc(scratchArc.target, scratchArc, fstReader);
        while(true) {
          assert scratchArc.label != FST.END_LABEL;
          cachedRootArcs.put(Character.valueOf((char) scratchArc.label), new FST.Arc<CharsRef>().copyFrom(scratchArc));
          if (scratchArc.isLast()) {
            break;
          }
          map.readNextRealArc(scratchArc, fstReader);
        }
      }
      //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
    } catch (IOException ioe) {
      // Bogus FST IOExceptions!!  (will never happen)
      throw new RuntimeException(ioe);
    }
  }
}
Project: lams    File: NormalizeCharMap.java
/** Builds the NormalizeCharMap; call this once you
 *  are done calling {@link #add}. */
public NormalizeCharMap build() {

  final FST<CharsRef> map;
  try {
    final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
    final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
    final IntsRefBuilder scratch = new IntsRefBuilder();
    for(Map.Entry<String,String> ent : pendingPairs.entrySet()) {
      builder.add(Util.toUTF16(ent.getKey(), scratch),
                  new CharsRef(ent.getValue()));
    }
    map = builder.finish();
    pendingPairs.clear();
  } catch (IOException ioe) {
    // Bogus FST IOExceptions!!  (will never happen)
    throw new RuntimeException(ioe);
  }

  return new NormalizeCharMap(map);
}
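
A minimal usage sketch for this builder (the mappings and the helper name normalizedReader are illustrative); the finished map is typically consumed by MappingCharFilter from org.apache.lucene.analysis.charfilter, ahead of the tokenizer:

static Reader normalizedReader(String text) {
  NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
  builder.add("ß", "ss");
  builder.add("æ", "ae");
  NormalizeCharMap norm = builder.build();
  return new MappingCharFilter(norm, new StringReader(text));
}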
Project: lams    File: Stemmer.java
/**
 * Find the unique stem(s) of the provided word
 * 
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
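
A hedged usage sketch: given a Hunspell Dictionary loaded elsewhere, uniqueStems returns the deduplicated stems; the helper name printStems is hypothetical.

static void printStems(Dictionary dictionary, String input) {
  Stemmer stemmer = new Stemmer(dictionary);
  char[] word = input.toCharArray();
  for (CharsRef stem : stemmer.uniqueStems(word, word.length)) {
    System.out.println(stem); // CharsRef.toString() materializes the chars
  }
}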
Project: lams    File: Dictionary.java
private FST<CharsRef> parseConversions(LineNumberReader reader, int num) throws IOException, ParseException {
  Map<String,String> mappings = new TreeMap<>();

  for (int i = 0; i < num; i++) {
    String line = reader.readLine();
    String parts[] = line.split("\\s+");
    if (parts.length != 3) {
      throw new ParseException("invalid syntax: " + line, reader.getLineNumber());
    }
    if (mappings.put(parts[1], parts[2]) != null) {
      throw new IllegalStateException("duplicate mapping specified for: " + parts[1]);
    }
  }

  Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
  Builder<CharsRef> builder = new Builder<>(FST.INPUT_TYPE.BYTE2, outputs);
  IntsRefBuilder scratchInts = new IntsRefBuilder();
  for (Map.Entry<String,String> entry : mappings.entrySet()) {
    Util.toUTF16(entry.getKey(), scratchInts);
    builder.add(scratchInts.get(), new CharsRef(entry.getValue()));
  }

  return builder.finish();
}
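
For context, a hedged example of the Hunspell conversion table this method parses; the caller consumes the header line and passes its count as num:

// ICONV 2      <- header line, read by the caller; 2 becomes num
// ICONV á a    <- parts[1] = input form, parts[2] = replacement
// ICONV é e
// The TreeMap keeps the keys sorted, which the FST Builder.add calls require.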
Project: lams    File: DaciukMihovAutomatonBuilder.java
/**
 * Add another character sequence to this automaton. The sequence must be
 * lexicographically larger or equal compared to any previous sequences added
 * to this automaton (the input must be sorted).
 */
public void add(CharsRef current) {
  assert stateRegistry != null : "Automaton already built.";
  assert previous == null
      || comparator.compare(previous, current) <= 0 : "Input must be in sorted UTF-8 order: "
      + previous + " >= " + current;
  assert setPrevious(current);

  // Descend in the automaton (find matching prefix).
  int pos = 0, max = current.length();
  State next, state = root;
  while (pos < max && (next = state.lastChild(Character.codePointAt(current, pos))) != null) {
    state = next;
    // todo, optimize me
    pos += Character.charCount(Character.codePointAt(current, pos));
  }

  if (state.hasChildren()) replaceOrRegister(state);

  addSuffix(state, current, pos);
}
Project: lams    File: DaciukMihovAutomatonBuilder.java
/**
 * Build a minimal, deterministic automaton from a sorted list of {@link BytesRef} representing
 * strings in UTF-8. These strings must be binary-sorted.
 */
public static Automaton build(Collection<BytesRef> input) {
  final DaciukMihovAutomatonBuilder builder = new DaciukMihovAutomatonBuilder();

  char[] chars = new char[0];
  CharsRef ref = new CharsRef();
  for (BytesRef b : input) {
    chars = ArrayUtil.grow(chars, b.length);
    final int len = UnicodeUtil.UTF8toUTF16(b, chars);
    ref.chars = chars;
    ref.length = len;
    builder.add(ref);
  }

  Automaton.Builder a = new Automaton.Builder();
  convert(a,
      builder.complete(), 
      new IdentityHashMap<State,Integer>());

  return a.finish();
}
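
A hedged sketch of calling build(): the input must be in binary (UTF-8) order, which is BytesRef's natural ordering; note the builder class may not be accessible from every package in every Lucene version.

static Automaton dictionaryAutomaton() {
  List<BytesRef> terms = new ArrayList<>();
  terms.add(new BytesRef("foo"));
  terms.add(new BytesRef("bar"));
  terms.add(new BytesRef("baz"));
  Collections.sort(terms); // binary order, as build() asserts
  return DaciukMihovAutomatonBuilder.build(terms);
}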
Project: lams    File: CharSequenceOutputs.java
@Override
public CharsRef subtract(CharsRef output, CharsRef inc) {
  assert output != null;
  assert inc != null;
  if (inc == NO_OUTPUT) {
    // no prefix removed
    return output;
  } else if (inc.length == output.length) {
    // entire output removed
    return NO_OUTPUT;
  } else {
    assert inc.length < output.length: "inc.length=" + inc.length + " vs output.length=" + output.length;
    assert inc.length > 0;
    return new CharsRef(output.chars, output.offset + inc.length, output.length-inc.length);
  }
}
Project: lams    File: CharSequenceOutputs.java
@Override
public CharsRef add(CharsRef prefix, CharsRef output) {
  assert prefix != null;
  assert output != null;
  if (prefix == NO_OUTPUT) {
    return output;
  } else if (output == NO_OUTPUT) {
    return prefix;
  } else {
    assert prefix.length > 0;
    assert output.length > 0;
    CharsRef result = new CharsRef(prefix.length + output.length);
    System.arraycopy(prefix.chars, prefix.offset, result.chars, 0, prefix.length);
    System.arraycopy(output.chars, output.offset, result.chars, prefix.length, output.length);
    result.length = prefix.length + output.length;
    return result;
  }
}
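
A worked example of the prefix arithmetic the two methods above implement (a hedged sketch):

CharSequenceOutputs outputs = CharSequenceOutputs.getSingleton();
CharsRef sum = outputs.add(new CharsRef("ab"), new CharsRef("cd"));
// sum is a freshly allocated CharsRef holding "abcd"
CharsRef rest = outputs.subtract(sum, new CharsRef("ab"));
// rest is "cd", a view into sum's backing array rather than a copy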
Project: SolrPlugins    File: DiceMultipleCaseSuggester.java
private List<LookupResult> getLookupResults(SpellingOptions options, Token currentToken) throws IOException {
    CharsRef scratch = new CharsRef();
    scratch.chars = currentToken.buffer();
    scratch.offset = 0;
    scratch.length = currentToken.length();
    boolean onlyMorePopular = (options.suggestMode == SuggestMode.SUGGEST_MORE_POPULAR) &&
            !(lookup instanceof WFSTCompletionLookup) &&
            !(lookup instanceof AnalyzingSuggester);

    List<LookupResult> suggestions = lookup.lookup(scratch, onlyMorePopular, options.count);
    if (suggestions == null || suggestions.size() == 0) {
        return null;
    }

    return suggestions;
}
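
Note the zero-copy trick above: the CharsRef merely aliases the token's internal buffer. Because CharsRef implements CharSequence, it can be handed straight to Lookup.lookup(); a hedged sketch against an already-built suggester:

List<LookupResult> top5 = lookup.lookup(new CharsRef("lucen"), false, 5);
for (LookupResult result : top5) {
    System.out.println(result.key + " (" + result.value + ")");
}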
Project: cc-analysis    File: CcWordSet.java
/** only used for asserting! */
private boolean hasHoles(CharsRef chars) {
    final int end = chars.offset + chars.length;
    for (int idx = chars.offset + 1; idx < end; idx++) {
        if (chars.chars[idx] == WORD_SEPARATOR && chars.chars[idx - 1] == WORD_SEPARATOR) {
            return true;
        }
    }
    if (chars.chars[chars.offset] == '\u0000') {
        return true;
    }
    if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
        return true;
    }

    return false;
}
Project: cc-analysis    File: CcWordsFilterTest.java
private CharsRef analyze(Analyzer analyzer, String text) throws IOException {
    CharsRefBuilder charsRefBuilder = new CharsRefBuilder();
    try (TokenStream ts = analyzer.tokenStream("", text)) {
        CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
        PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
        ts.reset();
        while (ts.incrementToken()) {
            int length = termAtt.length();
            if (length == 0) {
                throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
            }
            charsRefBuilder.grow(charsRefBuilder.length() + length + 1); /* current + word + separator */
            if (charsRefBuilder.length() > 0) {
                charsRefBuilder.append(CcWordSet.WORD_SEPARATOR);
            }
            charsRefBuilder.append(termAtt);
        }
        ts.end();
    }
    if (charsRefBuilder.length() == 0) {
        return null;
    }
    charsRefBuilder.append(CcWordSet.WORD_END);
    return charsRefBuilder.get();
}
Project: read-open-source-code    File: SynonymMap.java
/** Sugar: just joins the provided terms with {@link
 *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
 *  must not be null. */
public static CharsRef join(String[] words, CharsRef reuse) {
  int upto = 0;
  char[] buffer = reuse.chars;
  for (String word : words) {
    final int wordLen = word.length();
    final int needed = (0 == upto ? wordLen : 1 + upto + wordLen); // Add 1 for WORD_SEPARATOR
    if (needed > buffer.length) {
      reuse.grow(needed);
      buffer = reuse.chars;
    }
    if (upto > 0) {
      buffer[upto++] = SynonymMap.WORD_SEPARATOR;
    }

    word.getChars(0, wordLen, buffer, upto);
    upto += wordLen;
  }
  reuse.length = upto;
  return reuse;
}
Project: NYBC    File: MoreLikeThis.java
/**
 * Adds terms and frequencies found in vector into the Map termFreqMap
 *
 * @param termFreqMap a Map of terms and their frequencies
 * @param vector List of terms and their frequencies for a doc/field
 */
private void addTermFrequencies(Map<String, Int> termFreqMap, Terms vector) throws IOException {
  final TermsEnum termsEnum = vector.iterator(null);
  final CharsRef spare = new CharsRef();
  BytesRef text;
  while((text = termsEnum.next()) != null) {
    UnicodeUtil.UTF8toUTF16(text, spare);
    final String term = spare.toString();
    if (isNoiseWord(term)) {
      continue;
    }
    final int freq = (int) termsEnum.totalTermFreq();

    // increment frequency
    Int cnt = termFreqMap.get(term);
    if (cnt == null) {
      cnt = new Int();
      termFreqMap.put(term, cnt);
      cnt.x = freq;
    } else {
      cnt.x += freq;
    }
  }
}
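
The UTF8toUTF16 call converts each term's UTF-8 bytes into the reused CharsRef before materializing a String. The same conversion in isolation (a hedged sketch; this overload grows the target array as needed):

BytesRef utf8 = new BytesRef("häuser"); // BytesRef(CharSequence) encodes UTF-8
CharsRef utf16 = new CharsRef();
UnicodeUtil.UTF8toUTF16(utf8, utf16);
String term = utf16.toString(); // "häuser"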
Project: NYBC    File: JaspellLookup.java
@Override
public void build(TermFreqIterator tfit) throws IOException {
  if (tfit.getComparator() != null) {
    // make sure it's unsorted
    // WTF - this could result in yet another sorted iteration....
    tfit = new UnsortedTermFreqIteratorWrapper(tfit);
  }
  trie = new JaspellTernarySearchTrie();
  trie.setMatchAlmostDiff(editDistance);
  BytesRef spare;
  final CharsRef charsSpare = new CharsRef();

  while ((spare = tfit.next()) != null) {
    final long weight = tfit.weight();
    if (spare.length == 0) {
      continue;
    }
    charsSpare.grow(spare.length);
    UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
    trie.put(charsSpare.toString(), Long.valueOf(weight));
  }
}
Project: NYBC    File: TSTLookup.java
@Override
public void build(TermFreqIterator tfit) throws IOException {
  root = new TernaryTreeNode();
  // buffer first
  if (tfit.getComparator() != BytesRef.getUTF8SortedAsUTF16Comparator()) {
    // make sure it's sorted and the comparator uses UTF16 sort order
    tfit = new SortedTermFreqIteratorWrapper(tfit, BytesRef.getUTF8SortedAsUTF16Comparator());
  }

  ArrayList<String> tokens = new ArrayList<String>();
  ArrayList<Number> vals = new ArrayList<Number>();
  BytesRef spare;
  CharsRef charsSpare = new CharsRef();
  while ((spare = tfit.next()) != null) {
    charsSpare.grow(spare.length);
    UnicodeUtil.UTF8toUTF16(spare.bytes, spare.offset, spare.length, charsSpare);
    tokens.add(charsSpare.toString());
    vals.add(Long.valueOf(tfit.weight()));
  }
  autocomplete.balancedTree(tokens.toArray(), vals.toArray(), 0, tokens.size() - 1, root);
}
Project: NYBC    File: TestIndexWriterUnicode.java
public void testRandomUnicodeStrings() throws Throwable {
  char[] buffer = new char[20];
  char[] expected = new char[20];

  BytesRef utf8 = new BytesRef(20);
  CharsRef utf16 = new CharsRef(20);

  int num = atLeast(100000);
  for (int iter = 0; iter < num; iter++) {
    boolean hasIllegal = fillUnicode(buffer, expected, 0, 20);

    UnicodeUtil.UTF16toUTF8(buffer, 0, 20, utf8);
    if (!hasIllegal) {
      byte[] b = new String(buffer, 0, 20).getBytes("UTF-8");
      assertEquals(b.length, utf8.length);
      for(int i=0;i<b.length;i++)
        assertEquals(b[i], utf8.bytes[i]);
    }

    UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
    assertEquals(utf16.length, 20);
    for(int i=0;i<20;i++)
      assertEquals(expected[i], utf16.chars[i]);
  }
}
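
The round trip the test exercises, reduced to a minimal sketch (same Lucene 4.x UnicodeUtil signatures as above):

char[] source = "héllo".toCharArray();
BytesRef utf8 = new BytesRef(10);
UnicodeUtil.UTF16toUTF8(source, 0, source.length, utf8);
CharsRef utf16 = new CharsRef(10);
UnicodeUtil.UTF8toUTF16(utf8.bytes, 0, utf8.length, utf16);
assert "héllo".equals(utf16.toString());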
Project: read-open-source-code    File: AnalyzingSuggester.java
private LookupResult getLookupResult(Long output1, BytesRef output2, CharsRef spare) {
  LookupResult result;
  if (hasPayloads) {
    int sepIndex = -1;
    for(int i=0;i<output2.length;i++) {
      if (output2.bytes[output2.offset+i] == PAYLOAD_SEP) {
        sepIndex = i;
        break;
      }
    }
    assert sepIndex != -1;
    spare.grow(sepIndex);
    final int payloadLen = output2.length - sepIndex - 1;
    UnicodeUtil.UTF8toUTF16(output2.bytes, output2.offset, sepIndex, spare);
    BytesRef payload = new BytesRef(payloadLen);
    System.arraycopy(output2.bytes, sepIndex+1, payload.bytes, 0, payloadLen);
    payload.length = payloadLen;
    result = new LookupResult(spare.toString(), decodeWeight(output1), payload);
  } else {
    spare.grow(output2.length);
    UnicodeUtil.UTF8toUTF16(output2, spare);
    result = new LookupResult(spare.toString(), decodeWeight(output1));
  }

  return result;
}
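
The bytes being decoded above are laid out as [surface form in UTF-8][PAYLOAD_SEP][payload]. A standalone sketch of the same split, with a hypothetical separator byte standing in for the suggester's private PAYLOAD_SEP constant (imports assumed: java.nio.charset.StandardCharsets, java.util.Arrays):

byte PAYLOAD_SEP = 0x1f; // hypothetical stand-in
byte[] packed = {'t', 'o', 'p', PAYLOAD_SEP, 42}; // surface + separator + payload
int sepIndex = 0;
while (packed[sepIndex] != PAYLOAD_SEP) {
  sepIndex++;
}
String surface = new String(packed, 0, sepIndex, StandardCharsets.UTF_8); // "top"
byte[] payload = Arrays.copyOfRange(packed, sepIndex + 1, packed.length); // {42}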