Java 类org.apache.lucene.analysis.cn.smart.hhmm.SegToken 实例源码

项目:search    文件:WordSegmenter.java   
/**
 * Prepare a {@link SegToken} for indexing: fill in its surface text where
 * needed, normalize it with {@link SegTokenFilter}, and shift its offsets
 * from sentence-relative to stream-relative.
 *
 * @param st input {@link SegToken}
 * @param sentence the sentence the token was segmented from
 * @param sentenceStartOffset offset of the sentence within the stream
 * @return the normalized {@link SegToken} with adjusted offsets
 */
public SegToken convertSegToken(SegToken st, String sentence,
    int sentenceStartOffset) {

  // Latin/numeric token types (half- and full-width) need their char
  // array re-extracted from the sentence text; other types keep theirs.
  boolean extractText = st.wordType == WordType.STRING
      || st.wordType == WordType.NUMBER
      || st.wordType == WordType.FULLWIDTH_NUMBER
      || st.wordType == WordType.FULLWIDTH_STRING;
  if (extractText) {
    st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
  }

  SegToken normalized = tokenFilter.filter(st);
  // Rebase the sentence-relative offsets onto the whole input stream.
  normalized.startOffset += sentenceStartOffset;
  normalized.endOffset += sentenceStartOffset;
  return normalized;
}
项目:NYBC    文件:WordSegmenter.java   
/**
 * Prepare a {@link SegToken} for indexing: fill in its surface text where
 * needed, normalize it with {@link SegTokenFilter}, and shift its offsets
 * from sentence-relative to stream-relative.
 *
 * @param st input {@link SegToken}
 * @param sentence the sentence the token was segmented from
 * @param sentenceStartOffset offset of the sentence within the stream
 * @return the normalized {@link SegToken} with adjusted offsets
 */
public SegToken convertSegToken(SegToken st, String sentence,
    int sentenceStartOffset) {

  // Latin/numeric token types (half- and full-width) need their char
  // array re-extracted from the sentence text; other types keep theirs.
  boolean extractText = st.wordType == WordType.STRING
      || st.wordType == WordType.NUMBER
      || st.wordType == WordType.FULLWIDTH_NUMBER
      || st.wordType == WordType.FULLWIDTH_STRING;
  if (extractText) {
    st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
  }

  SegToken normalized = tokenFilter.filter(st);
  // Rebase the sentence-relative offsets onto the whole input stream.
  normalized.startOffset += sentenceStartOffset;
  normalized.endOffset += sentenceStartOffset;
  return normalized;
}
项目:read-open-source-code    文件:WordSegmenter.java   
/**
 * Prepare a {@link SegToken} for indexing: fill in its surface text where
 * needed, normalize it with {@link SegTokenFilter}, and shift its offsets
 * from sentence-relative to stream-relative.
 *
 * @param st input {@link SegToken}
 * @param sentence the sentence the token was segmented from
 * @param sentenceStartOffset offset of the sentence within the stream
 * @return the normalized {@link SegToken} with adjusted offsets
 */
public SegToken convertSegToken(SegToken st, String sentence,
    int sentenceStartOffset) {

  // Latin/numeric token types (half- and full-width) need their char
  // array re-extracted from the sentence text; other types keep theirs.
  boolean extractText = st.wordType == WordType.STRING
      || st.wordType == WordType.NUMBER
      || st.wordType == WordType.FULLWIDTH_NUMBER
      || st.wordType == WordType.FULLWIDTH_STRING;
  if (extractText) {
    st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
  }

  SegToken normalized = tokenFilter.filter(st);
  // Rebase the sentence-relative offsets onto the whole input stream.
  normalized.startOffset += sentenceStartOffset;
  normalized.endOffset += sentenceStartOffset;
  return normalized;
}
项目:read-open-source-code    文件:WordSegmenter.java   
/**
 * Prepare a {@link SegToken} for indexing: fill in its surface text where
 * needed, normalize it with {@link SegTokenFilter}, and shift its offsets
 * from sentence-relative to stream-relative.
 *
 * @param st input {@link SegToken}
 * @param sentence the sentence the token was segmented from
 * @param sentenceStartOffset offset of the sentence within the stream
 * @return the normalized {@link SegToken} with adjusted offsets
 */
public SegToken convertSegToken(SegToken st, String sentence,
    int sentenceStartOffset) {

  // Latin/numeric token types (half- and full-width) need their char
  // array re-extracted from the sentence text; other types keep theirs.
  boolean extractText = st.wordType == WordType.STRING
      || st.wordType == WordType.NUMBER
      || st.wordType == WordType.FULLWIDTH_NUMBER
      || st.wordType == WordType.FULLWIDTH_STRING;
  if (extractText) {
    st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
  }

  SegToken normalized = tokenFilter.filter(st);
  // Rebase the sentence-relative offsets onto the whole input stream.
  normalized.startOffset += sentenceStartOffset;
  normalized.endOffset += sentenceStartOffset;
  return normalized;
}
项目:Maskana-Gestor-de-Conocimiento    文件:WordSegmenter.java   
/**
 * Prepare a {@link SegToken} for indexing: fill in its surface text where
 * needed, normalize it with {@link SegTokenFilter}, and shift its offsets
 * from sentence-relative to stream-relative.
 *
 * @param st input {@link SegToken}
 * @param sentence the sentence the token was segmented from
 * @param sentenceStartOffset offset of the sentence within the stream
 * @return the normalized {@link SegToken} with adjusted offsets
 */
public SegToken convertSegToken(SegToken st, String sentence,
    int sentenceStartOffset) {

  // Latin/numeric token types (half- and full-width) need their char
  // array re-extracted from the sentence text; other types keep theirs.
  boolean extractText = st.wordType == WordType.STRING
      || st.wordType == WordType.NUMBER
      || st.wordType == WordType.FULLWIDTH_NUMBER
      || st.wordType == WordType.FULLWIDTH_STRING;
  if (extractText) {
    st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
  }

  SegToken normalized = tokenFilter.filter(st);
  // Rebase the sentence-relative offsets onto the whole input stream.
  normalized.startOffset += sentenceStartOffset;
  normalized.endOffset += sentenceStartOffset;
  return normalized;
}
项目:search    文件:WordSegmenter.java   
/**
 * Segment a sentence into words with {@link HHMMSegmenter}.
 *
 * @param sentence input sentence
 * @param startOffset offset of the sentence within the stream
 * @return {@link List} of {@link SegToken}, normalized and offset-adjusted
 */
public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> tokens = hhmmSegmenter.process(sentence);

  // Two or fewer entries means only the sentence-boundary markers
  // (SENTENCE_BEGIN / SENTENCE_END) are present: no actual words.
  if (tokens.size() <= 2) {
    return Collections.emptyList();
  }

  // Strip the boundary markers at both ends, then normalize each word.
  List<SegToken> words = tokens.subList(1, tokens.size() - 1);
  for (SegToken word : words) {
    convertSegToken(word, sentence, startOffset);
  }
  return words;
}
项目:search    文件:WordTokenFilter.java   
@Override
public boolean incrementToken() throws IOException {
  // When the current sentence has no words left, pull and segment the
  // next sentence from the upstream tokenizer.
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (!input.incrementToken()) {
      return false; // upstream exhausted: end of stream
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If start + term length doesn't land on the end offset, the incoming
    // token is likely a synonym; in that case we keep its offsets as-is.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
    tokenIter = tokenBuffer.iterator();
    // A sentence should never segment to zero words; guard anyway and,
    // like the original code, treat it as end-of-stream.
    if (!tokenIter.hasNext()) {
      return false;
    }
  }
  // This filter produces new tokens, so all attributes must be cleared first.
  clearAttributes();
  SegToken word = tokenIter.next();
  termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(word.startOffset, word.endOffset);
  }
  typeAtt.setType("word");
  return true;
}
项目:search    文件:HHMMSegmenter.java   
/**
 * Return the best segmentation of a sentence as a list of {@link SegToken}.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
public List<SegToken> process(String sentence) {
  // Build the segmentation graph, lift it to a bigram graph, and take
  // the shortest (cheapest) path through it as the result.
  return new BiSegGraph(createSegGraph(sentence)).getShortPath();
}
项目:search    文件:HMMChineseTokenizer.java   
@Override
protected boolean incrementWord() {
  // No segmented words pending for the current sentence.
  if (tokens == null || !tokens.hasNext()) {
    return false;
  }
  SegToken word = tokens.next();
  // New token: clear all attributes before populating them.
  clearAttributes();
  termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
  offsetAtt.setOffset(correctOffset(word.startOffset), correctOffset(word.endOffset));
  typeAtt.setType("word");
  return true;
}
项目:NYBC    文件:WordSegmenter.java   
/**
 * Segment a sentence into words with {@link HHMMSegmenter}.
 *
 * @param sentence input sentence
 * @param startOffset offset of the sentence within the stream
 * @return {@link List} of {@link SegToken}, normalized and offset-adjusted
 */
public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> tokens = hhmmSegmenter.process(sentence);

  // Two or fewer entries means only the sentence-boundary markers
  // (SENTENCE_BEGIN / SENTENCE_END) are present: no actual words.
  if (tokens.size() <= 2) {
    return Collections.emptyList();
  }

  // Strip the boundary markers at both ends, then normalize each word.
  List<SegToken> words = tokens.subList(1, tokens.size() - 1);
  for (SegToken word : words) {
    convertSegToken(word, sentence, startOffset);
  }
  return words;
}
项目:NYBC    文件:WordTokenFilter.java   
@Override
public boolean incrementToken() throws IOException {
  // When the current sentence has no words left, pull and segment the
  // next sentence from the upstream tokenizer.
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (!input.incrementToken()) {
      return false; // upstream exhausted: end of stream
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If start + term length doesn't land on the end offset, the incoming
    // token is likely a synonym; in that case we keep its offsets as-is.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
    tokenIter = tokenBuffer.iterator();
    // A sentence should never segment to zero words; guard anyway and,
    // like the original code, treat it as end-of-stream.
    if (!tokenIter.hasNext()) {
      return false;
    }
  }
  // This filter produces new tokens, so all attributes must be cleared first.
  clearAttributes();
  SegToken word = tokenIter.next();
  termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(word.startOffset, word.endOffset);
  }
  typeAtt.setType("word");
  return true;
}
项目:NYBC    文件:HHMMSegmenter.java   
/**
 * Return the best segmentation of a sentence as a list of {@link SegToken}.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
public List<SegToken> process(String sentence) {
  // Build the segmentation graph, lift it to a bigram graph, and take
  // the shortest (cheapest) path through it as the result.
  return new BiSegGraph(createSegGraph(sentence)).getShortPath();
}
项目:read-open-source-code    文件:WordSegmenter.java   
/**
 * Segment a sentence into words with {@link HHMMSegmenter}.
 *
 * @param sentence input sentence
 * @param startOffset offset of the sentence within the stream
 * @return {@link List} of {@link SegToken}, normalized and offset-adjusted
 */
public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> tokens = hhmmSegmenter.process(sentence);

  // Two or fewer entries means only the sentence-boundary markers
  // (SENTENCE_BEGIN / SENTENCE_END) are present: no actual words.
  if (tokens.size() <= 2) {
    return Collections.emptyList();
  }

  // Strip the boundary markers at both ends, then normalize each word.
  List<SegToken> words = tokens.subList(1, tokens.size() - 1);
  for (SegToken word : words) {
    convertSegToken(word, sentence, startOffset);
  }
  return words;
}
项目:read-open-source-code    文件:WordTokenFilter.java   
@Override
public boolean incrementToken() throws IOException {
  // When the current sentence has no words left, pull and segment the
  // next sentence from the upstream tokenizer.
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (!input.incrementToken()) {
      return false; // upstream exhausted: end of stream
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If start + term length doesn't land on the end offset, the incoming
    // token is likely a synonym; in that case we keep its offsets as-is.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
    tokenIter = tokenBuffer.iterator();
    // A sentence should never segment to zero words; guard anyway and,
    // like the original code, treat it as end-of-stream.
    if (!tokenIter.hasNext()) {
      return false;
    }
  }
  // This filter produces new tokens, so all attributes must be cleared first.
  clearAttributes();
  SegToken word = tokenIter.next();
  termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(word.startOffset, word.endOffset);
  }
  typeAtt.setType("word");
  return true;
}
项目:read-open-source-code    文件:HHMMSegmenter.java   
/**
 * Return the best segmentation of a sentence as a list of {@link SegToken}.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
public List<SegToken> process(String sentence) {
  // Build the segmentation graph, lift it to a bigram graph, and take
  // the shortest (cheapest) path through it as the result.
  return new BiSegGraph(createSegGraph(sentence)).getShortPath();
}
项目:read-open-source-code    文件:WordSegmenter.java   
/**
 * Segment a sentence into words with {@link HHMMSegmenter}.
 *
 * @param sentence input sentence
 * @param startOffset offset of the sentence within the stream
 * @return {@link List} of {@link SegToken}, normalized and offset-adjusted
 */
public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> tokens = hhmmSegmenter.process(sentence);

  // Two or fewer entries means only the sentence-boundary markers
  // (SENTENCE_BEGIN / SENTENCE_END) are present: no actual words.
  if (tokens.size() <= 2) {
    return Collections.emptyList();
  }

  // Strip the boundary markers at both ends, then normalize each word.
  List<SegToken> words = tokens.subList(1, tokens.size() - 1);
  for (SegToken word : words) {
    convertSegToken(word, sentence, startOffset);
  }
  return words;
}
项目:read-open-source-code    文件:WordTokenFilter.java   
@Override
public boolean incrementToken() throws IOException {
  // When the current sentence has no words left, pull and segment the
  // next sentence from the upstream tokenizer.
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (!input.incrementToken()) {
      return false; // upstream exhausted: end of stream
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If start + term length doesn't land on the end offset, the incoming
    // token is likely a synonym; in that case we keep its offsets as-is.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
    tokenIter = tokenBuffer.iterator();
    // A sentence should never segment to zero words; guard anyway and,
    // like the original code, treat it as end-of-stream.
    if (!tokenIter.hasNext()) {
      return false;
    }
  }
  // This filter produces new tokens, so all attributes must be cleared first.
  clearAttributes();
  SegToken word = tokenIter.next();
  termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(word.startOffset, word.endOffset);
  }
  typeAtt.setType("word");
  return true;
}
项目:read-open-source-code    文件:HHMMSegmenter.java   
/**
 * Return the best segmentation of a sentence as a list of {@link SegToken}.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
public List<SegToken> process(String sentence) {
  // Build the segmentation graph, lift it to a bigram graph, and take
  // the shortest (cheapest) path through it as the result.
  return new BiSegGraph(createSegGraph(sentence)).getShortPath();
}
项目:Maskana-Gestor-de-Conocimiento    文件:WordSegmenter.java   
/**
 * Segment a sentence into words with {@link HHMMSegmenter}.
 *
 * @param sentence input sentence
 * @param startOffset offset of the sentence within the stream
 * @return {@link List} of {@link SegToken}, normalized and offset-adjusted
 */
public List<SegToken> segmentSentence(String sentence, int startOffset) {
  List<SegToken> tokens = hhmmSegmenter.process(sentence);

  // Two or fewer entries means only the sentence-boundary markers
  // (SENTENCE_BEGIN / SENTENCE_END) are present: no actual words.
  if (tokens.size() <= 2) {
    return Collections.emptyList();
  }

  // Strip the boundary markers at both ends, then normalize each word.
  List<SegToken> words = tokens.subList(1, tokens.size() - 1);
  for (SegToken word : words) {
    convertSegToken(word, sentence, startOffset);
  }
  return words;
}
项目:Maskana-Gestor-de-Conocimiento    文件:WordTokenFilter.java   
@Override
public boolean incrementToken() throws IOException {
  // When the current sentence has no words left, pull and segment the
  // next sentence from the upstream tokenizer.
  if (tokenIter == null || !tokenIter.hasNext()) {
    if (!input.incrementToken()) {
      return false; // upstream exhausted: end of stream
    }
    tokStart = offsetAtt.startOffset();
    tokEnd = offsetAtt.endOffset();
    // If start + term length doesn't land on the end offset, the incoming
    // token is likely a synonym; in that case we keep its offsets as-is.
    hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
    tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
    tokenIter = tokenBuffer.iterator();
    // A sentence should never segment to zero words; guard anyway and,
    // like the original code, treat it as end-of-stream.
    if (!tokenIter.hasNext()) {
      return false;
    }
  }
  // This filter produces new tokens, so all attributes must be cleared first.
  clearAttributes();
  SegToken word = tokenIter.next();
  termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
  if (hasIllegalOffsets) {
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(word.startOffset, word.endOffset);
  }
  typeAtt.setType("word");
  return true;
}
项目:Maskana-Gestor-de-Conocimiento    文件:HHMMSegmenter.java   
/**
 * Return the best segmentation of a sentence as a list of {@link SegToken}.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
public List<SegToken> process(String sentence) {
  // Build the segmentation graph, lift it to a bigram graph, and take
  // the shortest (cheapest) path through it as the result.
  return new BiSegGraph(createSegGraph(sentence)).getShortPath();
}