private void sortEachItems() { char[] tmpArray; int tmpFreq; for (int i = 0; i < wordItem_charArrayTable.length; i++) { if (wordItem_charArrayTable[i] != null && wordItem_charArrayTable[i].length > 1) { for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) { for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) { if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, wordItem_charArrayTable[i][j2], 0) > 0) { tmpArray = wordItem_charArrayTable[i][j]; tmpFreq = wordItem_frequencyTable[i][j]; wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2]; wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; wordItem_charArrayTable[i][j2] = tmpArray; wordItem_frequencyTable[i][j2] = tmpFreq; } } } } } }
/** * Look up the text string corresponding with the word char array, * and return the position of the word list. * * @param knownHashIndex already figure out position of the first word * symbol charArray[0] in hash table. If not calculated yet, can be * replaced with function int findInTable(char[] charArray). * @param charArray look up the char array corresponding with the word. * @return word location in word array. If not found, then return -1. */ private int findInTable(short knownHashIndex, char[] charArray) { if (charArray == null || charArray.length == 0) return -1; char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; int start = 0, end = items.length - 1; int mid = (start + end) / 2, cmpResult; // Binary search for the index of idArray while (start <= end) { cmpResult = Utility.compareArray(items[mid], 0, charArray, 1); if (cmpResult == 0) return mid;// find it else if (cmpResult < 0) start = mid + 1; else if (cmpResult > 0) end = mid - 1; mid = (start + end) / 2; } return -1; }
/** * Filter an input {@link SegToken} * <p> * Full-width latin will be converted to half-width, then all latin will be lowercased. * All punctuation is converted into {@link Utility#COMMON_DELIMITER} * </p> * * @param token input {@link SegToken} * @return normalized {@link SegToken} */ public SegToken filter(SegToken token) { switch (token.wordType) { case WordType.FULLWIDTH_NUMBER: case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */ for (int i = 0; i < token.charArray.length; i++) { if (token.charArray[i] >= 0xFF10) token.charArray[i] -= 0xFEE0; if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */ token.charArray[i] += 0x0020; } break; case WordType.STRING: for (int i = 0; i < token.charArray.length; i++) { if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */ token.charArray[i] += 0x0020; } break; case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */ token.charArray = Utility.COMMON_DELIMITER; break; default: break; } return token; }
/** * Get the character types for every character in a sentence. * * @see Utility#getCharType(char) * @param sentence input sentence * @return array of character types corresponding to character positions in the sentence */ private static int[] getCharTypes(String sentence) { int length = sentence.length(); int[] charTypeArray = new int[length]; // the type of each character by position for (int i = 0; i < length; i++) { charTypeArray[i] = Utility.getCharType(sentence.charAt(i)); } return charTypeArray; }
private void mergeSameWords() { int i; for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { if (wordItem_charArrayTable[i] == null) continue; int len = 1; for (int j = 1; j < wordItem_charArrayTable[i].length; j++) { if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, wordItem_charArrayTable[i][j - 1], 0) != 0) len++; } if (len < wordItem_charArrayTable[i].length) { char[][] tempArray = new char[len][]; int[] tempFreq = new int[len]; int k = 0; tempArray[0] = wordItem_charArrayTable[i][0]; tempFreq[0] = wordItem_frequencyTable[i][0]; for (int j = 1; j < wordItem_charArrayTable[i].length; j++) { if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, tempArray[k], 0) != 0) { k++; // temp[k] = wordItemTable[i][j]; tempArray[k] = wordItem_charArrayTable[i][j]; tempFreq[k] = wordItem_frequencyTable[i][j]; } else { // temp[k].frequency += wordItemTable[i][j].frequency; tempFreq[k] += wordItem_frequencyTable[i][j]; } } // wordItemTable[i] = temp; wordItem_charArrayTable[i] = tempArray; wordItem_frequencyTable[i] = tempFreq; } } }
/** * Find the nth word in the dictionary that starts with the supplied prefix * * @see #getPrefixMatch(char[]) * @param charArray input prefix * @param knownStart relative position in the dictionary to start * @return index of word, or -1 if not found */ public int getPrefixMatch(char[] charArray, int knownStart) { short index = getWordItemTableIndex(charArray[0]); if (index == -1) return -1; char[][] items = wordItem_charArrayTable[wordIndexTable[index]]; int start = knownStart, end = items.length - 1; int mid = (start + end) / 2, cmpResult; // Binary search for the index of idArray while (start <= end) { cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0); if (cmpResult == 0) { // Get the first item which match the current word while (mid >= 0 && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0) mid--; mid++; return mid;// Find the first word that uses charArray as prefix. } else if (cmpResult < 0) end = mid - 1; else start = mid + 1; mid = (start + end) / 2; } return -1; }
private void generateBiSegGraph(SegGraph segGraph) { double smooth = 0.1; int wordPairFreq = 0; int maxStart = segGraph.getMaxStart(); double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE; int next; char[] idBuffer; // get the list of tokens ordered and indexed segTokenList = segGraph.makeIndex(); // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1 int key = -1; List<SegToken> nextTokens = null; while (key < maxStart) { if (segGraph.isStartExist(key)) { List<SegToken> tokenList = segGraph.getStartList(key); // Calculate all tokens for a given key. for (SegToken t1 : tokenList) { oneWordFreq = t1.weight; next = t1.endOffset; nextTokens = null; // Find the next corresponding Token. // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore". // If we cannot find the next Token, then go to the end and repeat the same cycle. while (next <= maxStart) { // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken. if (segGraph.isStartExist(next)) { nextTokens = segGraph.getStartList(next); break; } next++; } if (nextTokens == null) { break; } for (SegToken t2 : nextTokens) { idBuffer = new char[t1.charArray.length + t2.charArray.length + 1]; System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length); idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR; System.arraycopy(t2.charArray, 0, idBuffer, t1.charArray.length + 1, t2.charArray.length); // Two linked Words frequency wordPairFreq = bigramDict.getFrequency(idBuffer); // Smoothing // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 weight = -Math .log(smooth * (1.0 + oneWordFreq) / (Utility.MAX_FREQUENCE + 0.0) + (1.0 - smooth) * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble)); SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index, t2.index, weight); this.addSegTokenPair(tokenPair); } } } key++; } }
/** * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray * * @param charArray input word * @param itemIndex item index for table charArray[0] * @return true if the entry exists */ public boolean isEqual(char[] charArray, int itemIndex) { short hashIndex = getWordItemTableIndex(charArray[0]); return Utility.compareArray(charArray, 1, wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; }