/**
 * Prepares a {@link SegToken} for indexing: fills in its character data for
 * literal word types, normalizes it with the {@link SegTokenFilter}, and
 * shifts its offsets from sentence-relative to stream-relative.
 *
 * @param st input {@link SegToken}
 * @param sentence associated Sentence
 * @param sentenceStartOffset offset into sentence
 * @return Lucene {@link SegToken}
 */
public SegToken convertSegToken(SegToken st, String sentence, int sentenceStartOffset) {
  // For string/number tokens (half- or full-width), the character data is
  // taken verbatim from the sentence text; other word types keep their
  // existing charArray.
  boolean isLiteral =
      st.wordType == WordType.STRING
          || st.wordType == WordType.NUMBER
          || st.wordType == WordType.FULLWIDTH_NUMBER
          || st.wordType == WordType.FULLWIDTH_STRING;
  if (isLiteral) {
    st.charArray = sentence.substring(st.startOffset, st.endOffset).toCharArray();
  }

  // Normalize the token through the configured filter.
  st = tokenFilter.filter(st);

  // Rebase offsets so they are relative to the whole stream, not the sentence.
  st.startOffset += sentenceStartOffset;
  st.endOffset += sentenceStartOffset;
  return st;
}
/** * Segment a sentence into words with {@link HHMMSegmenter} * * @param sentence input sentence * @param startOffset start offset of sentence * @return {@link List} of {@link SegToken} */ public List<SegToken> segmentSentence(String sentence, int startOffset) { List<SegToken> segTokenList = hhmmSegmenter.process(sentence); // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END List<SegToken> result = Collections.emptyList(); if (segTokenList.size() > 2) // if its not an empty sentence result = segTokenList.subList(1, segTokenList.size() - 1); for (SegToken st : result) convertSegToken(st, sentence, startOffset); return result; }
/**
 * Emits the next word token, pulling and segmenting a new sentence from the
 * upstream {@code input} stream whenever the current sentence's tokens are
 * exhausted.
 *
 * @return true if a token was produced, false at end of stream
 * @throws IOException if the upstream token stream throws
 */
@Override
public boolean incrementToken() throws IOException {
  if (tokenIter == null || !tokenIter.hasNext()) {
    // there are no remaining tokens from the current sentence... are there more sentences?
    if (input.incrementToken()) {
      // Remember the upstream sentence offsets so they can be reused for
      // every word when the offsets look synthetic (see below).
      tokStart = offsetAtt.startOffset();
      tokEnd = offsetAtt.endOffset();
      // if length by start + end offsets doesn't match the term text then assume
      // this is a synonym and don't adjust the offsets.
      hasIllegalOffsets = (tokStart + termAtt.length()) != tokEnd;
      // a new sentence is available: process it.
      tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
      tokenIter = tokenBuffer.iterator();
      /*
       * it should not be possible to have a sentence with 0 words, check just in case.
       * returning EOS isn't the best either, but its the behavior of the original code.
       */
      if (!tokenIter.hasNext())
        return false;
    } else {
      return false; // no more sentences, end of stream!
    }
  }
  // WordTokenFilter must clear attributes, as it is creating new tokens.
  clearAttributes();
  // There are remaining tokens from the current sentence, return the next one.
  SegToken nextWord = tokenIter.next();
  termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
  if (hasIllegalOffsets) {
    // Synonym-style token: keep the sentence's original offsets unchanged.
    offsetAtt.setOffset(tokStart, tokEnd);
  } else {
    offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
  }
  typeAtt.setType("word");
  return true;
}
/**
 * Returns a list of {@link SegToken} representing the best segmentation of a
 * sentence.
 *
 * @param sentence input sentence
 * @return best segmentation as a {@link List}
 */
public List<SegToken> process(String sentence) {
  // Build the sentence's token graph, expand it into a bigram graph, and
  // take the shortest path through it as the best segmentation.
  BiSegGraph biSegGraph = new BiSegGraph(createSegGraph(sentence));
  return biSegGraph.getShortPath();
}
/**
 * Advances to the next segmented word, populating the term, offset, and type
 * attributes from it.
 *
 * @return true if a word was emitted, false when no tokens remain
 */
@Override
protected boolean incrementWord() {
  if (tokens != null && tokens.hasNext()) {
    SegToken word = tokens.next();
    // New token: reset all attributes before filling them in.
    clearAttributes();
    termAtt.copyBuffer(word.charArray, 0, word.charArray.length);
    offsetAtt.setOffset(correctOffset(word.startOffset), correctOffset(word.endOffset));
    typeAtt.setType("word");
    return true;
  }
  return false;
}