private boolean oneStepStatistic(Instance inst) {
    if (inst == null)
        return false;
    int[] type;
    Object t = inst.getTarget();
    if (t instanceof Integer) {
        type = new int[1];
        type[0] = (Integer) t;
    } else {
        return false;
    }
    HashSparseVector data = (HashSparseVector) inst.getData();
    TIntFloatIterator it = data.data.iterator();
    while (it.hasNext()) {
        it.advance();
        int feature = it.key();
        for (int i = 0; i < type.length; i++) {
            addItemFrequency(feature, type[i], (int) it.value());
        }
    }
    return true;
}
private float distanceEuclidean(int n, HashSparseVector sv, float baseDistance) {
    HashSparseVector center = classCenter.get(n);
    int count = classCount.get(n);
    // baseDistance holds the squared norm of the unnormalized class center,
    // so baseDistance / count^2 is the squared norm of the centroid center/count.
    float dist = baseDistance / (count * count);
    TIntFloatHashMap data = center.data;
    TIntFloatIterator it = sv.data.iterator();
    // Correct only the coordinates where sv is non-zero.
    while (it.hasNext()) {
        it.advance();
        int key = it.key();
        if (!data.containsKey(key)) {
            dist += it.value() * it.value();
        } else {
            float temp = data.get(key) / count;
            dist -= temp * temp;
            dist += (it.value() - temp) * (it.value() - temp);
        }
    }
    return dist;
}
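/*
 * Illustrative sketch only (not part of the original class; all names below are
 * invented for the example). It checks, with plain dense arrays, the identity that
 * distanceEuclidean(n, sv, baseDistance) above relies on: starting from
 * ||center||^2 / count^2 and correcting only the coordinates where the instance is
 * non-zero yields the squared distance to the normalized centroid center/count.
 */
public final class IncrementalDistanceCheck {
    public static void main(String[] args) {
        float[] center = {2f, 0f, 4f, 6f};   // unnormalized sum of the class members
        int count = 2;                       // number of members in the class
        float[] x = {1f, 3f, 0f, 2f};        // the instance being measured

        // Direct squared distance to the normalized centroid center/count.
        float direct = 0f;
        for (int k = 0; k < x.length; k++) {
            float c = center[k] / count;
            direct += (x[k] - c) * (x[k] - c);
        }

        // Incremental form: baseDistance = ||center||^2, then per-coordinate corrections.
        float base = 0f;
        for (float c : center) {
            base += c * c;
        }
        float incremental = base / (count * count);
        for (int k = 0; k < x.length; k++) {
            if (x[k] == 0f) continue;        // sparse instance: untouched coordinates stay in base
            float c = center[k] / count;
            if (center[k] == 0f) {
                incremental += x[k] * x[k];
            } else {
                incremental -= c * c;
                incremental += (x[k] - c) * (x[k] - c);
            }
        }
        System.out.println(direct + " == " + incremental);  // both print 14.0
    }
}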
private void updateBaseDist(int classid, HashSparseVector vector) {
    float base = baseDistList.get(classid);
    TIntFloatHashMap center = classCenter.get(classid).data;
    TIntFloatIterator it = vector.data.iterator();
    while (it.hasNext()) {
        it.advance();
        if (!center.containsKey(it.key())) {
            base += it.value() * it.value();
        } else {
            float temp = center.get(it.key());
            base -= temp * temp;
            base += (it.value() - temp) * (it.value() - temp);
        }
    }
    baseDistList.set(classid, base);
}
/**
 * Sorts entries by absolute value in descending order.
 * @param tmap
 * @return the keys, ordered by decreasing |value|
 */
public static int[] sort(TIntFloatHashMap tmap) {
    HashMap<Integer, Float> map = new HashMap<Integer, Float>();
    TIntFloatIterator it = tmap.iterator();
    while (it.hasNext()) {
        it.advance();
        map.put(it.key(), Math.abs(it.value()));
    }
    List<Entry> list = sort(map);
    int[] idx = new int[list.size()];
    Iterator<Entry> it1 = list.iterator();
    int i = 0;
    while (it1.hasNext()) {
        Entry entry = it1.next();
        idx[i++] = (Integer) entry.getKey();
    }
    return idx;
}
public double compare(TIntFloatMap v1, TIntFloatMap v2, boolean sorted) {
    // It does not matter whether the vectors are sorted or not.
    // Keys of v1 that are missing from v2 are skipped rather than treated as
    // zero-probability events (which would make the divergence infinite).
    double DKL = 0.0;
    TIntFloatIterator iter = v1.iterator();
    while (iter.hasNext()) {
        iter.advance();
        int key = iter.key();
        if (!v2.containsKey(key)) {
            continue;
        }
        double P = iter.value();
        double Q = v2.get(key);
        DKL += Math.log(P / Q) * P;
    }
    return DKL;
}
public static int[] getSortedIndices(TIntFloatMap vector) {
    // NOTE: it's probably possible to do this using purely primitive
    // operations without having to resort to pushing things into an
    // Index[].  However, this code is much cleaner to have and since we
    // sort at most once per vector and the result is memoized, we don't
    // lose too much from the Object-based sorting.
    Index[] keyValPairs = new Index[vector.size()];
    TIntFloatIterator iter = vector.iterator();
    int i = 0;
    while (iter.hasNext()) {
        iter.advance();
        keyValPairs[i++] = new Index(iter.key(), iter.value());
    }
    Arrays.sort(keyValPairs);
    int[] sortedIndices = new int[keyValPairs.length];
    for (i = 0; i < keyValPairs.length; ++i)
        sortedIndices[i] = keyValPairs[i].key;
    return sortedIndices;
}
/**
 * Normalizes the probability values in a vector so that they sum to 1.0.
 * @param vector
 * @return the normalized vector
 */
public static TIntFloatMap normalizeVector(TIntFloatMap vector) {
    float total = 0;
    TFloatIterator iter = vector.valueCollection().iterator();
    while (iter.hasNext())
        total += iter.next();
    TIntFloatMap normalized = new TIntFloatHashMap(vector.size());
    TIntFloatIterator iter2 = vector.iterator();
    while (iter2.hasNext()) {
        iter2.advance();
        normalized.put(iter2.key(), iter2.value() / total);
    }
    return normalized;
}
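/*
 * Illustrative sketch only (assumes Trove 3 on the classpath; class and variable
 * names are invented). It mirrors normalizeVector(...) and the KL comparison in
 * compare(...) above, rather than calling them, so it runs on its own: raw counts
 * are normalized to probabilities, then KL(P||Q) is accumulated over the keys
 * present in both maps.
 */
import gnu.trove.iterator.TIntFloatIterator;
import gnu.trove.map.TIntFloatMap;
import gnu.trove.map.hash.TIntFloatHashMap;

public class KLDivergenceDemo {
    static TIntFloatMap normalize(TIntFloatMap v) {
        float total = 0f;
        TIntFloatIterator it = v.iterator();
        while (it.hasNext()) { it.advance(); total += it.value(); }
        TIntFloatMap out = new TIntFloatHashMap(v.size());
        it = v.iterator();
        while (it.hasNext()) { it.advance(); out.put(it.key(), it.value() / total); }
        return out;
    }

    public static void main(String[] args) {
        TIntFloatMap p = new TIntFloatHashMap();
        p.put(1, 3f); p.put(2, 1f);                // raw term counts
        TIntFloatMap q = new TIntFloatHashMap();
        q.put(1, 1f); q.put(2, 1f); q.put(3, 2f);  // raw term counts

        TIntFloatMap P = normalize(p);             // {1: 0.75, 2: 0.25}
        TIntFloatMap Q = normalize(q);             // {1: 0.25, 2: 0.25, 3: 0.5}

        double dkl = 0.0;
        TIntFloatIterator it = P.iterator();
        while (it.hasNext()) {
            it.advance();
            if (!Q.containsKey(it.key())) continue;  // missing keys skipped, not treated as Q = 0
            dkl += it.value() * Math.log(it.value() / Q.get(it.key()));
        }
        System.out.println("KL(P||Q) = " + dkl);     // 0.75 * ln(3), about 0.824
    }
}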
/**
 * Computes all probabilities in a single pass to save time.
 */
private void statisticProb() {
    System.out.println("Computing probabilities");
    float totalword = alpahbet.size();
    TIntFloatIterator it = wordProb.iterator();
    while (it.hasNext()) {
        it.advance();
        float v = it.value() / totalword;
        it.setValue(v);
        Cluster cluster = new Cluster(it.key(), v, alpahbet.lookupString(it.key()));
        clusters.put(it.key(), cluster);
    }
    TIntObjectIterator<TIntFloatHashMap> it1 = pcc.iterator();
    while (it1.hasNext()) {
        it1.advance();
        TIntFloatHashMap map = it1.value();
        TIntFloatIterator it2 = map.iterator();
        while (it2.hasNext()) {
            it2.advance();
            it2.setValue(it2.value() / totalword);
        }
    }
}
/**
 * Sorts entries by absolute value in descending order.
 * @param tmap
 * @return the keys, ordered by decreasing |value|
 */
public static int[] sort(TIntFloatHashMap tmap) {
    HashMap<Integer, Float> map = new HashMap<Integer, Float>();
    TIntFloatIterator it = tmap.iterator();
    while (it.hasNext()) {
        it.advance();
        map.put(it.key(), Math.abs(it.value()));
    }
    List<Entry> list = sort(map);
    int[] idx = new int[list.size()];
    Iterator<Entry> it1 = list.iterator();
    int i = 0;
    while (it1.hasNext()) {
        Entry entry = it1.next();
        idx[i++] = (Integer) entry.getKey();
    }
    return idx;
}
public HashSparseVector select(HashSparseVector vec) {
    HashSparseVector sv = new HashSparseVector();
    TIntFloatIterator it = vec.data.iterator();
    while (it.hasNext()) {
        it.advance();
        if (isUseful[it.key()])
            sv.put(it.key(), it.value());
    }
    return sv;
}
@Override
public float dotProduct(float[] vector) {
    float v = 0f;
    TIntFloatIterator it = data.iterator();
    while (it.hasNext()) {
        it.advance();
        v += vector[it.key()] * it.value();
    }
    return v;
}
public float l2Norm2() {
    TIntFloatIterator it = data.iterator();
    float norm = 0f;
    while (it.hasNext()) {
        it.advance();
        norm += it.value() * it.value();
    }
    return norm;
}
/**
 * Divides every element of the vector by c, in place.
 * @param c the divisor
 */
public void scaleDivide(float c) {
    TIntFloatIterator it = data.iterator();
    while (it.hasNext()) {
        it.advance();
        data.put(it.key(), it.value() / c);
    }
}
/**
 * Squared Euclidean distance between two sparse vectors (no square root is taken).
 * Keys are looked up directly in each map because the iteration order of a
 * TIntFloatHashMap is unspecified, so a sorted-merge traversal cannot be relied on.
 * @param sv1
 * @param sv2
 * @return the squared Euclidean distance
 */
public static float distanceEuclidean(HashSparseVector sv1, HashSparseVector sv2) {
    float dist = 0.0f;
    TIntFloatIterator it1 = sv1.data.iterator();
    while (it1.hasNext()) {
        it1.advance();
        if (sv2.data.containsKey(it1.key())) {
            float t = it1.value() - sv2.data.get(it1.key());
            dist += t * t;
        } else {
            dist += it1.value() * it1.value();
        }
    }
    TIntFloatIterator it2 = sv2.data.iterator();
    while (it2.hasNext()) {
        it2.advance();
        if (!sv1.data.containsKey(it2.key())) {
            dist += it2.value() * it2.value();
        }
    }
    return dist;
}
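/*
 * Illustrative check only (plain Trove 3 maps; class and variable names are invented).
 * It applies the same key-lookup traversal as distanceEuclidean(...) above to two
 * small sparse vectors and verifies the result against a dense computation. Because
 * hash-map iteration order is arbitrary, this kind of dense cross-check is a useful
 * sanity test for sparse distance code.
 */
import gnu.trove.iterator.TIntFloatIterator;
import gnu.trove.map.hash.TIntFloatHashMap;

public class SparseDistanceCheck {
    static float squaredDistance(TIntFloatHashMap a, TIntFloatHashMap b) {
        float dist = 0f;
        TIntFloatIterator ia = a.iterator();
        while (ia.hasNext()) {
            ia.advance();
            if (b.containsKey(ia.key())) {
                float d = ia.value() - b.get(ia.key());
                dist += d * d;
            } else {
                dist += ia.value() * ia.value();
            }
        }
        TIntFloatIterator ib = b.iterator();
        while (ib.hasNext()) {
            ib.advance();
            if (!a.containsKey(ib.key())) {
                dist += ib.value() * ib.value();
            }
        }
        return dist;
    }

    public static void main(String[] args) {
        TIntFloatHashMap a = new TIntFloatHashMap();
        a.put(0, 1f); a.put(5, 2f);
        TIntFloatHashMap b = new TIntFloatHashMap();
        b.put(5, 4f); b.put(9, 3f);

        float[] da = new float[10];
        float[] db = new float[10];
        da[0] = 1f; da[5] = 2f;
        db[5] = 4f; db[9] = 3f;
        float dense = 0f;
        for (int i = 0; i < da.length; i++) {
            dense += (da[i] - db[i]) * (da[i] - db[i]);
        }
        System.out.println(squaredDistance(a, b) + " == " + dense);  // both print 14.0
    }
}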
public String toString() {
    StringBuilder sb = new StringBuilder();
    TIntFloatIterator it = data.iterator();
    while (it.hasNext()) {
        it.advance();
        sb.append(it.key());
        sb.append(":");
        sb.append(it.value());
        if (it.hasNext())
            sb.append(", ");
    }
    return sb.toString();
}
/**
 * Rebuilds the feature alphabet and the weight vectors after dropping features
 * whose weights are (near) zero, and prints the old and new alphabet sizes.
 */
public void removeZero() {
    boolean freeze = false;
    if (feature.isStopIncrement()) {
        feature.setStopIncrement(false);
        freeze = true;
    }
    TIntObjectHashMap<String> index = (TIntObjectHashMap<String>) feature.toInverseIndexMap();
    System.out.println("Original alphabet size (inverse index): " + index.size());
    System.out.println("Original alphabet size: " + feature.size());
    StringFeatureAlphabet newfeat = new StringFeatureAlphabet();
    cl.factory.setDefaultFeatureAlphabet(newfeat);
    for (int i = 0; i < weights.length; i++) {
        TIntFloatIterator itt = weights[i].data.iterator();
        HashSparseVector ww = new HashSparseVector();
        while (itt.hasNext()) {
            itt.advance();
            float v = itt.value();
            if (Math.abs(v) < 1e-3f)
                continue;
            String fea = index.get(itt.key());
            int newidx = newfeat.lookupIndex(fea);
            ww.put(newidx, v);
        }
        weights[i] = ww;
    }
    newfeat.setStopIncrement(freeze);
    System.out.println("New alphabet size: " + newfeat.size());
    System.out.println("New alphabet size (original alphabet object): " + feature.size());
    index.clear();
}
@Override
public Predict classify(Instance instance, int n) {
    int typeSize = tf.getTypeSize();
    float[] score = new float[typeSize];
    Arrays.fill(score, 0.0f);
    Object obj = instance.getData();
    if (!(obj instanceof HashSparseVector)) {
        System.out.println("error: input data is not a HashSparseVector!");
        return null;
    }
    HashSparseVector data = (HashSparseVector) obj;
    if (fs != null)
        data = fs.select(data);
    TIntFloatIterator it = data.data.iterator();
    float feaSize = tf.getFeatureSize();
    while (it.hasNext()) {
        it.advance();
        if (it.key() == 0)
            continue;
        int feature = it.key();
        for (int type = 0; type < typeSize; type++) {
            float itemF = tf.getItemFrequency(feature, type);
            float typeF = tf.getTypeFrequency(type);
            // additive smoothing: log((count(feature, type) + 0.1) / (count(type) + |features|))
            score[type] += it.value() * Math.log((itemF + 0.1) / (typeF + feaSize));
        }
    }
    Predict<Integer> res = new Predict<Integer>(n);
    for (int type = 0; type < typeSize; type++)
        res.add(type, score[type]);
    return res;
}
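/*
 * Illustrative sketch only (plain arrays; all counts and names are invented). It
 * reproduces the scoring rule used in classify(...) above on a toy problem: each
 * class score is the sum over non-zero features of
 * value * log((itemFrequency + 0.1) / (typeFrequency + featureAlphabetSize)),
 * i.e. an additively smoothed, naive-Bayes-style log score without a class prior.
 */
public class SmoothedScoreDemo {
    public static void main(String[] args) {
        int typeSize = 2;                                   // two classes
        float feaSize = 4f;                                 // size of the feature alphabet
        float[][] itemFreq = { {5f, 0f, 2f, 1f},            // feature counts for class 0
                               {1f, 3f, 0f, 4f} };          // feature counts for class 1
        float[] typeFreq = {8f, 8f};                        // total counts per class
        float[] doc = {2f, 0f, 1f, 0f};                     // feature values of one document

        float[] score = new float[typeSize];
        for (int type = 0; type < typeSize; type++) {
            for (int f = 0; f < doc.length; f++) {
                if (doc[f] == 0f) continue;                 // sparse: only non-zero features count
                score[type] += doc[f] * Math.log((itemFreq[type][f] + 0.1) / (typeFreq[type] + feaSize));
            }
        }
        int best = score[0] >= score[1] ? 0 : 1;
        System.out.println("score[0]=" + score[0] + " score[1]=" + score[1] + " -> class " + best);
    }
}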
/**
 * Squared Euclidean distance between two sparse vectors. Keys are looked up
 * directly because hash-map iteration order is unspecified.
 */
float distanceEuclidean(HashSparseVector sv1, HashSparseVector sv2) {
    float dist = 0.0f;
    TIntFloatIterator it1 = sv1.data.iterator();
    while (it1.hasNext()) {
        it1.advance();
        if (sv2.data.containsKey(it1.key())) {
            float t = it1.value() - sv2.data.get(it1.key());
            dist += t * t;
        } else {
            dist += it1.value() * it1.value();
        }
    }
    TIntFloatIterator it2 = sv2.data.iterator();
    while (it2.hasNext()) {
        it2.advance();
        if (!sv1.data.containsKey(it2.key())) {
            dist += it2.value() * it2.value();
        }
    }
    return dist;
}
private float getBaseDist(int classid) {
    // Squared norm of the (unnormalized) class center.
    float base = 0.0f;
    TIntFloatIterator it = classCenter.get(classid).data.iterator();
    while (it.hasNext()) {
        it.advance();
        base += it.value() * it.value();
    }
    return base;
}
/**
 * Computes all probabilities in a single pass to save time.
 */
private void statisticProb() {
    System.out.println("Computing probabilities");
    TIntFloatIterator it = wordProb.iterator();
    while (it.hasNext()) {
        it.advance();
        float v = it.value() / totalword;
        it.setValue(v);
        int key = it.key();
        if (key < 0)
            continue;
        Cluster cluster = new Cluster(key, v, alpahbet.lookupString(key));
        clusters.put(key, cluster);
    }
    TIntObjectIterator<TIntFloatHashMap> it1 = pcc.iterator();
    while (it1.hasNext()) {
        it1.advance();
        TIntFloatHashMap map = it1.value();
        TIntFloatIterator it2 = map.iterator();
        while (it2.hasNext()) {
            it2.advance();
            it2.setValue(it2.value() / totalword);
        }
    }
}
@Override
public void addThruPipe(Instance inst) {
    // Reweights each stored term frequency to tf * log(docNum / df); assumes the
    // document-frequency counts in idf have already been accumulated by a separate pass.
    HashSparseVector data = (HashSparseVector) inst.getData();
    TIntFloatIterator it = data.data.iterator();
    while (it.hasNext()) {
        it.advance();
        int id = it.key();
        if (idf[id] > 0) {
            float value = (float) (it.value() * Math.log(docNum / idf[id]));
            data.put(id, value);
        }
    }
}
@Override
public void addThruPipe(Instance inst) {
    // Accumulates document frequency: each term id present in this instance
    // contributes one count.
    HashSparseVector data = (HashSparseVector) inst.getData();
    TIntFloatIterator it = data.data.iterator();
    while (it.hasNext()) {
        it.advance();
        idf[it.key()]++;
    }
}
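/*
 * Illustrative two-pass TF-IDF sketch (plain Trove 3 maps; class and variable names
 * are invented). Pass 1 mirrors the document-frequency pipe immediately above: each
 * term id present in a document contributes one count. Pass 2 mirrors the weighting
 * pipe shown before it: every stored term frequency is replaced by tf * log(docNum / df).
 */
import java.util.Arrays;
import java.util.List;
import gnu.trove.iterator.TIntFloatIterator;
import gnu.trove.map.hash.TIntFloatHashMap;

public class TfIdfDemo {
    public static void main(String[] args) {
        TIntFloatHashMap d1 = new TIntFloatHashMap();
        d1.put(0, 3f); d1.put(1, 1f);                     // term frequencies of document 1
        TIntFloatHashMap d2 = new TIntFloatHashMap();
        d2.put(0, 1f); d2.put(2, 2f);                     // term frequencies of document 2
        List<TIntFloatHashMap> docs = Arrays.asList(d1, d2);

        // Pass 1: document frequency per term id.
        float[] df = new float[3];
        for (TIntFloatHashMap doc : docs) {
            TIntFloatIterator it = doc.iterator();
            while (it.hasNext()) { it.advance(); df[it.key()]++; }
        }

        // Pass 2: replace tf with tf * log(docNum / df).
        float docNum = docs.size();
        for (TIntFloatHashMap doc : docs) {
            TIntFloatIterator it = doc.iterator();
            while (it.hasNext()) {
                it.advance();
                if (df[it.key()] > 0) {
                    doc.put(it.key(), (float) (it.value() * Math.log(docNum / df[it.key()])));
                }
            }
        }
        // Term 0 appears in every document, so its weight becomes 0; term 1 keeps ln(2).
        System.out.println(d1.get(0) + " " + d1.get(1));
    }
}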
/**
 * Rebuilds the feature alphabet and the weight vectors after dropping features
 * whose weights are (near) zero, and prints the old and new alphabet sizes.
 */
public void removeZero() {
    boolean freeze = false;
    if (feature.isStopIncrement()) {
        feature.setStopIncrement(false);
        freeze = true;
    }
    TIntObjectHashMap<String> index = (TIntObjectHashMap<String>) feature.toInverseIndexMap();
    System.out.println("Original alphabet size (inverse index): " + index.size());
    System.out.println("Original alphabet size: " + feature.size());
    StringFeatureAlphabet newfeat = new StringFeatureAlphabet();
    cl.factory.setDefaultFeatureAlphabet(newfeat);
    for (int i = 0; i < weights.length; i++) {
        TIntFloatIterator itt = weights[i].data.iterator();
        HashSparseVector ww = new HashSparseVector();
        while (itt.hasNext()) {
            itt.advance();
            float v = itt.value();
            if (Math.abs(v) < 1e-3f)
                continue;
            String fea = index.get(itt.key());
            int newidx = newfeat.lookupIndex(fea);
            ww.put(newidx, v);
        }
        weights[i] = ww;
    }
    newfeat.setStopIncrement(freeze);
    System.out.println("New alphabet size: " + newfeat.size());
    System.out.println("New alphabet size (original alphabet object): " + feature.size());
    index.clear();
}
public TIntFloatIterator iterator() {
    return container.iterator();
}