Java 类org.apache.lucene.analysis.cn.smart.WordType 实例源码

项目:search    文件:SegTokenFilter.java   
/**
 * Normalizes the characters of a {@link SegToken} according to its word type.
 * <p>
 * For {@code FULLWIDTH_NUMBER} and {@code FULLWIDTH_STRING} tokens, full-width characters in
 * the range U+FF10..U+FF5E are shifted to their half-width (ASCII) counterparts and upper-case
 * latin letters are then lowercased. For {@code STRING} tokens only the lowercasing is applied.
 * For {@code DELIMITER} tokens the character array is replaced by a reference to
 * {@link Utility#COMMON_DELIMITER} (the array is not copied).
 * Tokens of any other type are returned untouched.
 * </p>
 * <p>
 * The token is modified in place; the returned object is the same instance that was passed in.
 * </p>
 * 
 * @param token input {@link SegToken}; its {@code charArray} is rewritten in place
 * @return the same {@link SegToken} instance, normalized
 */
public SegToken filter(SegToken token) {
  switch (token.wordType) {
    case WordType.FULLWIDTH_NUMBER:
    case WordType.FULLWIDTH_STRING: {
      final char[] chars = token.charArray;
      for (int i = 0; i < chars.length; i++) {
        char c = chars[i];
        /*
         * Only U+FF10..U+FF5E sit exactly 0xFEE0 above an ASCII character. Anything higher
         * (half-width katakana/hangul, full-width symbols) has no such counterpart, and
         * shifting it would yield DEL, C1 control or Latin-1 code points, so it is left alone.
         */
        if (c >= 0xFF10 && c <= 0xFF5E) {
          c -= 0xFEE0;
        }
        if (c >= 'A' && c <= 'Z') { /* lowercase latin */
          c += 0x0020;
        }
        chars[i] = c;
      }
      break;
    }
    case WordType.STRING: {
      final char[] chars = token.charArray;
      for (int i = 0; i < chars.length; i++) {
        if (chars[i] >= 'A' && chars[i] <= 'Z') { /* lowercase latin */
          chars[i] += 0x0020;
        }
      }
      break;
    }
    case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
      token.charArray = Utility.COMMON_DELIMITER;
      break;
    default:
      break;
  }
  return token;
}
项目:NYBC    文件:SegTokenFilter.java   
/**
 * Filter an input {@link SegToken}, rewriting its characters in place.
 * <p>
 * The action depends on {@code token.wordType}:
 * </p>
 * <ul>
 * <li>{@code FULLWIDTH_NUMBER} / {@code FULLWIDTH_STRING}: full-width characters in
 * U+FF10..U+FF5E become half-width, then upper-case latin letters are lowercased.</li>
 * <li>{@code STRING}: upper-case latin letters are lowercased.</li>
 * <li>{@code DELIMITER}: {@code charArray} is set to {@link Utility#COMMON_DELIMITER}.</li>
 * <li>anything else: the token is returned unchanged.</li>
 * </ul>
 * 
 * @param token input {@link SegToken}
 * @return the normalized {@link SegToken} (same instance as {@code token})
 */
public SegToken filter(SegToken token) {
  /* Decide what has to be done; the actual character rewriting happens in one loop below. */
  boolean narrowFullWidth = false;
  switch (token.wordType) {
    case WordType.FULLWIDTH_NUMBER:
    case WordType.FULLWIDTH_STRING:
      narrowFullWidth = true;
      break;
    case WordType.STRING:
      break;
    case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
      token.charArray = Utility.COMMON_DELIMITER;
      return token;
    default:
      return token;
  }

  final char[] chars = token.charArray;
  for (int pos = 0; pos < chars.length; pos++) {
    char current = chars[pos];
    /*
     * The upper bound matters: code points above U+FF5E are not full-width ASCII variants,
     * and subtracting 0xFEE0 from them would produce unrelated control/Latin-1 characters.
     */
    if (narrowFullWidth && current >= 0xFF10 && current <= 0xFF5E) {
      current -= 0xFEE0; /* full-width -> half-width */
    }
    if (current >= 0x0041 && current <= 0x005A) {
      current += 0x0020; /* lowercase latin */
    }
    chars[pos] = current;
  }
  return token;
}
项目:read-open-source-code    文件:SegTokenFilter.java   
/**
 * Filter an input {@link SegToken}
 * <p>
 * Full-width latin letters and digits (U+FF10..U+FF5E) are converted to half-width, then all
 * upper-case latin letters are lowercased. All punctuation tokens get
 * {@link Utility#COMMON_DELIMITER} as their character array.
 * </p>
 * <p>
 * The conversion is done directly on {@code token.charArray}; no new token is created.
 * </p>
 * 
 * @param token input {@link SegToken}
 * @return normalized {@link SegToken}, the same object that was passed in
 */
public SegToken filter(SegToken token) {
  switch (token.wordType) {
    case WordType.FULLWIDTH_NUMBER:
    case WordType.FULLWIDTH_STRING:
      /* first pass: convert full-width -> half-width */
      for (int i = 0; i < token.charArray.length; i++) {
        final char ch = token.charArray[i];
        /*
         * Bounded on both sides: U+FF5E (full-width tilde) is the last character that has an
         * ASCII twin 0xFEE0 below it. Higher code points would be turned into garbage.
         */
        if (ch >= 0xFF10 && ch <= 0xFF5E) {
          token.charArray[i] = (char) (ch - 0xFEE0);
        }
      }
      /* intentional fall-through: the narrowed characters still need lowercasing */
    case WordType.STRING:
      for (int i = 0; i < token.charArray.length; i++) {
        final char ch = token.charArray[i];
        if (ch >= 'A' && ch <= 'Z') { /* lowercase latin */
          token.charArray[i] = (char) (ch + 0x0020);
        }
      }
      break;
    case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
      token.charArray = Utility.COMMON_DELIMITER;
      break;
    default:
      /* other word types are passed through unchanged */
      break;
  }
  return token;
}
项目:read-open-source-code    文件:SegTokenFilter.java   
/**
 * Brings a {@link SegToken} into normalized form.
 * <p>
 * Tokens typed {@code FULLWIDTH_NUMBER} or {@code FULLWIDTH_STRING} have every character in
 * U+FF10..U+FF5E mapped to the half-width character 0xFEE0 below it, and the result is
 * lowercased if it is an upper-case latin letter. Tokens typed {@code STRING} are only
 * lowercased. Tokens typed {@code DELIMITER} have their {@code charArray} replaced by
 * {@link Utility#COMMON_DELIMITER}. Every other token type is left as it is.
 * </p>
 * 
 * @param token input {@link SegToken}, mutated by this call
 * @return normalized {@link SegToken} (identical reference to {@code token})
 */
public SegToken filter(SegToken token) {
  /* Distance between a full-width character and its ASCII counterpart. */
  final int fullWidthOffset = 0xFEE0;
  /* Distance between an upper-case latin letter and its lower-case form. */
  final int caseOffset = 0x0020;

  switch (token.wordType) {
    case WordType.FULLWIDTH_NUMBER:
    case WordType.FULLWIDTH_STRING: {
      char[] buffer = token.charArray;
      int index = 0;
      while (index < buffer.length) {
        int code = buffer[index];
        /*
         * 0xFF10 is full-width '0' and 0xFF5E is full-width '~'. Characters beyond that
         * range are not ASCII variants, so the offset must not be applied to them.
         */
        if (code >= 0xFF10 && code <= 0xFF5E) {
          code -= fullWidthOffset;
        }
        if (code >= 0x0041 && code <= 0x005A) { /* lowercase latin */
          code += caseOffset;
        }
        buffer[index] = (char) code;
        index++;
      }
      break;
    }
    case WordType.STRING: {
      char[] buffer = token.charArray;
      int index = 0;
      while (index < buffer.length) {
        int code = buffer[index];
        if (code >= 0x0041 && code <= 0x005A) { /* lowercase latin */
          buffer[index] = (char) (code + caseOffset);
        }
        index++;
      }
      break;
    }
    case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
      token.charArray = Utility.COMMON_DELIMITER;
      break;
    default:
      break;
  }
  return token;
}
项目:Maskana-Gestor-de-Conocimiento    文件:SegTokenFilter.java   
/**
 * Filter an input {@link SegToken}
 * <p>
 * Depending on the token's word type this method, working in place on the token:
 * narrows full-width characters (U+FF10..U+FF5E) to half-width and lowercases latin letters
 * ({@code FULLWIDTH_NUMBER}, {@code FULLWIDTH_STRING}); lowercases latin letters only
 * ({@code STRING}); or swaps the character array for {@link Utility#COMMON_DELIMITER}
 * ({@code DELIMITER}). Other word types are not modified.
 * </p>
 * 
 * @param token input {@link SegToken}
 * @return normalized {@link SegToken}; this is {@code token} itself, not a copy
 */
public SegToken filter(SegToken token) {
  switch (token.wordType) {
    case WordType.FULLWIDTH_NUMBER:
    case WordType.FULLWIDTH_STRING:
      for (int k = 0; k < token.charArray.length; k++) {
        final char raw = token.charArray[k];
        /*
         * Narrow only genuine full-width ASCII variants. Without the U+FF5E ceiling, characters
         * such as half-width katakana (U+FF61 and up) would be shifted to meaningless values.
         */
        final char narrowed =
            (raw >= 0xFF10 && raw <= 0xFF5E) ? (char) (raw - 0xFEE0) : raw;
        /* lowercase latin */
        token.charArray[k] =
            (narrowed >= 0x0041 && narrowed <= 0x005A) ? (char) (narrowed + 0x0020) : narrowed;
      }
      break;
    case WordType.STRING:
      for (int k = 0; k < token.charArray.length; k++) {
        final char raw = token.charArray[k];
        /* lowercase latin */
        token.charArray[k] = (raw >= 0x0041 && raw <= 0x005A) ? (char) (raw + 0x0020) : raw;
      }
      break;
    case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
      token.charArray = Utility.COMMON_DELIMITER;
      break;
    default:
      break;
  }
  return token;
}