The following code examples, extracted from open-source Python projects, illustrate how to use scipy.sparse.dok_matrix().
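Before the project snippets, here is a minimal, self-contained sketch of the workflow most of them follow: build an empty DOK (dictionary-of-keys) matrix, fill it entry by entry, then convert it to CSR for arithmetic. The shape, dtype, and values below are illustrative only.

import numpy as np
from scipy.sparse import dok_matrix

# Create an empty 5x5 DOK matrix; element-wise assignment is cheap in this format.
S = dok_matrix((5, 5), dtype=np.float32)
for i in range(5):
    for j in range(5):
        if (i + j) % 2 == 0:
            S[i, j] = i + j

# Convert to CSR before doing matrix products or other heavy arithmetic.
S_csr = S.tocsr()
print(S_csr.toarray())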
def _compute_features(self, model):
    bls = [b[0] for b in tuple(model.repr_model())]
    nfeats_other = 1
    nfeats_ngrams = len(self.module_ngram_to_id)
    nfeats = nfeats_other + nfeats_ngrams
    feats = sp.dok_matrix((1, nfeats), dtype=np.float32)

    # other features
    feats[0, 0] = len(bls)

    # ngrams features
    for k in xrange(1, self.ngram_maxlen):
        for i in xrange(len(bls) - k):
            ngram = tuple(bls[i:i + k])
            if ngram in self.module_ngram_to_id:
                ngram_i = self.module_ngram_to_id[ngram]
                feats_i = nfeats_other + ngram_i
                feats[0, feats_i] += 1.0

    return sp.csr_matrix(feats)
def buildMatrix(inputDict, inputFolder, outputMatrix):
    wordDict = gensim.corpora.Dictionary.load(inputDict)
    wordDict.filter_extremes()
    docs = glob(inputFolder + '/**/*.')
    nDocs = len(docs)
    nWords = len(wordDict)
    sp = sparse.dok_matrix((nWords, nDocs))
    for docId, doc in enumerate(docs):
        docTokens = loadTokens(doc)
        for wordIdx, wordCount in wordDict.doc2bow(docTokens):
            sp[wordIdx, docId] = wordCount
    print 'Words,Documents: ', (nWords, nDocs)
    mmwrite(outputMatrix, sp)

# Main script
def get_pi_c(self, popSize, theta, rho):
    if not self.exact:
        return numpy.array([0.0] * self.n + [1.0])
    n = self.n
    coalRate = 1. / popSize
    recomRate = float(rho) / 2.
    if rho == 0.0:
        return numpy.array([0.0] * self.n + [1.0])
    else:
        numCoupledLinsRates = sparse.dok_matrix((n+1, n+1))
        for i in range(n+1):
            if i < n:
                numCoupledLinsRates[i, i+1] = ((n-i)**2) * coalRate
                numCoupledLinsRates[i, i] -= numCoupledLinsRates[i, i+1]
            if i > 0:
                numCoupledLinsRates[i, i-1] = recomRate * i
                numCoupledLinsRates[i, i] -= numCoupledLinsRates[i, i-1]
        return stationary1d_tridiagonal(numCoupledLinsRates)
def _init_model(self):
    self.user_num, self.item_num = self.train_matrix.shape
    self.rating_mean = np.mean(self.train_matrix.values())
    self.predictions = dok_matrix((self.user_num, self.item_num))

    if self.config_handler['Output', 'is_load', 'bool']:
        self._load_model()
        assert(self.user_factors.shape[1] == self.item_factors.shape[1])
        self.factor_num = self.user_factors.shape[1]
    else:
        self.factor_num = self.config_handler['Parameters', 'factor_num', 'int']
        self.user_factors = np.random.normal(0, 1, size=(self.user_num, self.factor_num)) * 0.1
        self.item_factors = np.random.normal(0, 1, size=(self.item_num, self.factor_num)) * 0.1

    # Other Parameters
    self.learn_rate = self.config_handler['Parameters', 'learn_rate', 'float']
    self.momentum = self.config_handler['Parameters', 'momentum', 'float']
    self.user_lambda = self.config_handler['Parameters', 'user_lambda', 'float']
    self.item_lambda = self.config_handler['Parameters', 'item_lambda', 'float']

    # Momentum for update factors
    self.user_factors_inc = np.zeros((self.user_num, self.factor_num))
    self.item_factors_inc = np.zeros((self.item_num, self.factor_num))
def read_data(self, filename):
    """
    read raw dataset, and convert to sparse matrix format.
    :param filename:
    """
    users, items = set(), set()
    ratings = list()
    with codecs.open(filename, mode="r", encoding="utf-8") as read_file:
        for line in read_file:
            user_item_rating = re.split('\t|,|::', line.strip())
            user_id = int(user_item_rating[0])
            item_id = int(user_item_rating[1])
            rating = int(user_item_rating[2])
            users.add(user_id)
            items.add(item_id)
            ratings.append((user_id, item_id, rating))

    # Convert
    user_num, item_num = len(users), len(items)
    users_dict = {user_id: index for index, user_id in enumerate(list(users))}
    items_dict = {item_id: index for index, item_id in enumerate(list(items))}

    data_model = dok_matrix((user_num, item_num))
    for user_id, item_id, rating in ratings:
        data_model[users_dict[user_id], items_dict[item_id]] = rating

    return data_model
def initModel(self):
    ''''''
    self.numUsers, self.numItems = self.trainMatrix.shape()
    self.prediction = dok_matrix((self.numUsers, self.numItems))
    self.MAX_Iterations = int(self.configHandler.getParameter('AspectModel', 'MAX_Iterations'))
    self.numFactors = int(self.configHandler.getParameter('AspectModel', 'numFactors'))
    self.threshold = float(self.configHandler.getParameter('AspectModel', 'threshold'))

    self.X = np.random.uniform(0, 1, size=(self.numUsers, self.numFactors))    # P(x|z)
    self.X = normalize(self.X)
    self.Y = np.random.uniform(0, 1, size=(self.numItems, self.numFactors))    # P(y|z)
    self.Y = normalize(self.Y)
    self.Z = np.random.uniform(0, 1, size=self.numFactors)                     # P(z)
    self.Z = normalize(self.Z)
    self.Q = np.zeros((self.numUsers, self.numFactors, self.numItems))         # P(z|x,y)
def load_data(self, filepath, V):
    f = open(filepath)
    S = []
    for line in f:
        s = {}
        words = line.rstrip('\n').split(' ')
        for word in words:
            if word in V:
                wid = V[word]
                if wid in s:
                    s[wid] += 1
                else:
                    s[wid] = 1
        S.append(s)
    f.close()

    # Transform to dok and to csr.
    Sdok = dok_matrix((len(S), len(V)), dtype=int)
    for n, s in enumerate(S):
        for wid in s:
            Sdok[n, wid] = s[wid]
    S = Sdok.tocsr()
    return S
def predict(self, X):
    predictions = dok_matrix((X.shape[0], self.y.shape[1]), dtype=np.int)
    distances = self.base_classifier.predict_proba(X)
    topNIndices, topNDistances = self._get_top_labels(distances)
    for entry, (label_list, dist_list) in enumerate(zip(topNIndices, topNDistances)):
        for rank, label in enumerate(label_list):
            if not self.dependencies:
                training_sample = [[rank, dist_list[rank]]]
            else:
                training_sample = [distances[entry, :]]
            if label in self.meta_classifiers:
                prediction = self.meta_classifiers[label].predict(training_sample)[0]
                if prediction == 1:
                    predictions[entry, label] = 1
    return csr_matrix(predictions)
def _a(self, neighbor_ids):
    result = sp.csr_matrix((0, self.y.shape[1]))
    for ns in neighbor_ids:
        neighbor_labels = self.y[ns]
        # By squeezing we support matrix output from scipy.sparse.sum and 1D array from np.sum
        labels_sum = np.squeeze(np.array(neighbor_labels.sum(0)))
        predicted_labels = sp.csr_matrix([np.floor(np.divide(labels_sum, len(ns)) + (1 - self.threshold))])
        # If there are no labels, we take the most frequent label.
        if predicted_labels.sum() == 0:
            divide = np.divide(labels_sum, len(ns))
            max_label = divide.argmax()
            predicted_labels = sp.dok_matrix((1, predicted_labels.shape[1]))
            predicted_labels[0, max_label] = 1
            predicted_labels = sp.csr_matrix(predicted_labels)
        result = sp.vstack((result, predicted_labels))
    return result
def _b(self, neighbor_ids):
    result = sp.csr_matrix((0, self.y.shape[1]))
    for ns in neighbor_ids:
        average_label_nums = int(np.floor(np.mean([self.y[n].sum() for n in ns])))
        neighbor_labels = self.y[ns]
        labels_sum = np.array(neighbor_labels.sum(0))
        # By squeezing we support matrix output from scipy.sparse.sum and 1D array from np.sum
        divide = np.squeeze(np.divide(labels_sum, len(ns)))
        predicted_indices = np.argsort(divide)[-average_label_nums:]
        predicted_labels = sp.dok_matrix((1, len(divide)))
        # noinspection PyTypeChecker
        for index in predicted_indices:
            predicted_labels[0, index] = 1
        predicted_labels = sp.csr_matrix(predicted_labels)
        result = sp.vstack((result, predicted_labels))
    return result
def load_data(self):
    print('loading data')
    user_ids = list(
        Rating.objects.values('user_id')
        .annotate(movie_count=Count('movie_id'))
        .order_by('-movie_count'))
    content_ids = list(Rating.objects.values('movie_id').distinct())
    content_map = {content_ids[i]['movie_id']: i for i in range(len(content_ids))}
    num_users = len(user_ids)
    user_ratings = dok_matrix((num_users, len(content_ids)), dtype=np.float32)
    for i in range(num_users):
        # each user corresponds to a row, in the order of all_user_names
        ratings = Rating.objects.filter(user_id=user_ids[i]['user_id'])
        for user_rating in ratings:
            user_ratings[i, content_map[user_rating.movie_id]] = user_rating.rating
    print('data loaded')
    return user_ids, user_ratings
def infer_unit(self, _h):
    k = _NUM_TOPICS
    n = _SKETCH_BUCKET_SIZE
    m2 = dok_matrix((n, n), dtype=np.float64)
    m3 = dok_matrix((n, n), dtype=np.float64)

    container = self.sketch_m2[_h]
    for key, value in container.container.iteritems():
        i, j = self._inverse_index(key)
        m2[i, j] = value.get(self.timestamp)[2]
        if i != j:
            m2[j, i] = m2[i, j]

    container = self.sketch_m3[_h]
    for key, value in container.container.iteritems():
        i, j = self._inverse_index(key)
        m3[i, j] = value.get(self.timestamp)[2]
        if i != j:
            m3[j, i] = m3[i, j]

    return solver.solve(csr_matrix(m2), csr_matrix(m3), n, k)
def update_theta(self):
    # compute w_over_tau_mu
    w_over_tau_mu = sp.dok_matrix(self.w)
    for ((s, t), value) in w_over_tau_mu.items():
        w_over_tau_mu[(s, t)] = 1. * self.w[(s, t)] / (self.tau[s] * self.mu[t])
    w_over_tau_mu = w_over_tau_mu.toarray()

    # objective function w.r.t. theta
    def cur_obj(x):
        return self.update_theta_obj(x, self.phi, w_over_tau_mu)

    # Jacobian w.r.t. theta
    def cur_jac(x):
        return self.update_theta_jac(x, self.phi, w_over_tau_mu)

    # optimize
    self.theta, _ = gradient_descent(self.theta, cur_obj, cur_jac, self.eta, self.delta_D * 2.,
                                     step_limit=1000, step_len_init=0.00001)

    # update phi*theta once
    self.update_phi_times_theta()
def bow2matrix(bow, numDocs, numWords):
    s = dok_matrix((numWords, numDocs))
    for docNum in range(len(bow)):
        for wordId, count in bow[docNum]:
            s[wordId, docNum] = count
    return s
def normalize(self):
    m2 = self.m.copy()
    m2.data **= 2
    norm = np.reciprocal(np.sqrt(np.array(m2.sum(axis=1))[:, 0]))
    normalizer = dok_matrix((len(norm), len(norm)))
    normalizer.setdiag(norm)
    self.m = normalizer.tocsr().dot(self.m)
def read_counts_matrix(counts_path):
    """
    Reads the counts into a sparse matrix (CSR) from the count-word-context textual format.
    """
    words = load_count_vocabulary(counts_path + '.words.vocab')
    contexts = load_count_vocabulary(counts_path + '.contexts.vocab')
    words = list(words.keys())
    contexts = list(contexts.keys())
    iw = sorted(words)
    ic = sorted(contexts)
    wi = dict([(w, i) for i, w in enumerate(iw)])
    ci = dict([(c, i) for i, c in enumerate(ic)])

    counts = csr_matrix((len(wi), len(ci)), dtype=np.float32)
    tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
    update_threshold = 100000
    i = 0
    with open(counts_path) as f:
        for line in f:
            count, word, context = line.strip().split()
            if word in wi and context in ci:
                tmp_counts[wi[word], ci[context]] = int(count)
            i += 1
            if i == update_threshold:
                counts = counts + tmp_counts.tocsr()
                tmp_counts = dok_matrix((len(wi), len(ci)), dtype=np.float32)
                i = 0
    counts = counts + tmp_counts.tocsr()

    return counts, iw, ic
def multiply_by_rows(matrix, row_coefs):
    normalizer = dok_matrix((len(row_coefs), len(row_coefs)))
    normalizer.setdiag(row_coefs)
    return normalizer.tocsr().dot(matrix)
def multiply_by_columns(matrix, col_coefs):
    normalizer = dok_matrix((len(col_coefs), len(col_coefs)))
    normalizer.setdiag(col_coefs)
    return matrix.dot(normalizer.tocsr())
def citeulike(tag_occurence_thres=10):
    user_dict = defaultdict(set)
    for u, item_list in enumerate(open("citeulike-t/users.dat").readlines()):
        items = item_list.strip().split(" ")
        # ignore the first element in each line, which is the number of items the user liked.
        for item in items[1:]:
            user_dict[u].add(int(item))

    n_users = len(user_dict)
    n_items = max([item for items in user_dict.values() for item in items]) + 1

    user_item_matrix = dok_matrix((n_users, n_items), dtype=np.int32)
    for u, item_list in enumerate(open("citeulike-t/users.dat").readlines()):
        items = item_list.strip().split(" ")
        # ignore the first element in each line, which is the number of items the user liked.
        for item in items[1:]:
            user_item_matrix[u, int(item)] = 1

    n_features = 0
    for l in open("citeulike-t/tag-item.dat").readlines():
        items = l.strip().split(" ")
        if len(items) >= tag_occurence_thres:
            n_features += 1
    print("{} features over tag_occurence_thres ({})".format(n_features, tag_occurence_thres))

    features = dok_matrix((n_items, n_features), dtype=np.int32)
    feature_index = 0
    for l in open("citeulike-t/tag-item.dat").readlines():
        items = l.strip().split(" ")
        if len(items) >= tag_occurence_thres:
            features[[int(i) for i in items], feature_index] = 1
            feature_index += 1

    return user_item_matrix, features
def split_data(user_item_matrix, split_ratio=(3, 1, 1), seed=1):
    # set the seed to have deterministic results
    np.random.seed(seed)
    train = dok_matrix(user_item_matrix.shape)
    validation = dok_matrix(user_item_matrix.shape)
    test = dok_matrix(user_item_matrix.shape)
    # convert it to lil format for fast row access
    user_item_matrix = lil_matrix(user_item_matrix)
    for user in tqdm(range(user_item_matrix.shape[0]), desc="Split data into train/valid/test"):
        items = list(user_item_matrix.rows[user])
        if len(items) >= 5:
            np.random.shuffle(items)
            train_count = int(len(items) * split_ratio[0] / sum(split_ratio))
            valid_count = int(len(items) * split_ratio[1] / sum(split_ratio))
            for i in items[0: train_count]:
                train[user, i] = 1
            for i in items[train_count: train_count + valid_count]:
                validation[user, i] = 1
            for i in items[train_count + valid_count:]:
                test[user, i] = 1
    print("{}/{}/{} train/valid/test samples".format(
        len(train.nonzero()[0]),
        len(validation.nonzero()[0]),
        len(test.nonzero()[0])))
    return train, validation, test
def construct_s_matrix(self, growth_rate):
    """build the stoichiometric matrix at a specific growth rate"""
    # initialize to 0
    s = dok_matrix((len(self.metabolites), len(self.reactions)))
    # populate with stoichiometry
    for i, r in enumerate(self.reactions):
        for met, value in iteritems(r._metabolites):
            met_index = self.metabolites.index(met)
            if hasattr(value, "subs"):
                s[met_index, i] = float(value.subs(mu, growth_rate))
            else:
                s[met_index, i] = float(value)
    return s
def sparse_dok_matrices():
    dok = sparse.dok_matrix([[i, j, k], [l, m, n], [p, q, r]])
    #print "dok matrices ="
    #print dok
    return dok
def one_locus_probs(popSize, theta, n):
    coalRate = 1. / popSize
    mutRate = float(theta) / 2.

    numOnesRates = sparse.dok_matrix((n+1, n+1))
    for i in range(n+1):
        if i < n:
            numOnesRates[i, i+1] = (n-i) * mutRate + i * (n-i) / 2.0 * coalRate
            numOnesRates[i, i] -= numOnesRates[i, i+1]
        if i > 0:
            numOnesRates[i, i-1] = i * mutRate + i * (n-i) / 2.0 * coalRate
            numOnesRates[i, i] -= numOnesRates[i, i-1]

    return stationary1d_tridiagonal(numOnesRates)
def build_symmetries(self):
    start = time.time()

    # the index of the folded version in all_configs
    folded_list = get_folded_config_idxs(self)

    # foldedIdx = the index in folded_configs, allIdx = the index in all_configs
    foldedIdx_to_allIdx = numpy.array(list(set(folded_list)))
    allIdx_to_foldedIdx = {v: k for k, v in enumerate(foldedIdx_to_allIdx)}
    allIdx_to_foldedIdx = [allIdx_to_foldedIdx[x] for x in folded_list]

    self.hash_to_foldedIdx = {k: allIdx_to_foldedIdx[v] for k, v in self.hash_to_allIdx.items()}
    self.folded_config_array = self.config_array[foldedIdx_to_allIdx, :, :]
    self.numC = self.folded_config_array[:, 0, 0] + self.folded_config_array[:, 0, 1] + self.folded_config_array[:, 1, 0] + self.folded_config_array[:, 1, 1]

    symm_mat = sparse.dok_matrix((len(allIdx_to_foldedIdx), self.folded_config_array.shape[0]))
    symm_mat.update(dict(zip(enumerate(allIdx_to_foldedIdx), [1] * len(folded_list))))
    symm_mat = symm_mat.tocsc()

    antisymm_mat = symm_mat.transpose().tocsr(copy=True)
    # normalize rows
    self.n_unfolded_versions = numpy.array(antisymm_mat.sum(axis=1))[:, 0]
    row_indices, col_indices = antisymm_mat.nonzero()
    antisymm_mat.data /= self.n_unfolded_versions[row_indices]

    self.symmetries = symm_mat.tocsr()
    self.antisymmetries = antisymm_mat.tocsr()

    logging.info("%f seconds to build symmetry matrices" % (time.time() - start))
def create_generalized_matrix(tmodel, array_type='dense'):
    """
    Returns the generalized stoichiometric matrix used for TFA

    :param tmodel: pytfa.ThermoModel
    :returns: matrix.
    """
    if array_type not in ('DataFrame', 'dense') and not dok_matrix:
        raise ValueError('Sparse matrices require scipy')

    dtype = np.float64
    array_constructor = {
        'dense': np.zeros,
        'dok': dok_matrix,
        'lil': lil_matrix,
        'DataFrame': np.zeros,
    }

    n_constraints = len(tmodel.constraints)
    n_variables = len(tmodel.variables)
    array = array_constructor[array_type]((n_constraints, n_variables), dtype=dtype)

    c_ind = {x: e for e, x in enumerate(tmodel.constraints)}
    v_ind = {x: e for e, x in enumerate(tmodel.variables)}

    for this_cons in tmodel.constraints:
        var_coeff_dict = this_cons.get_linear_coefficients(this_cons.variables)
        for this_var, coeff in var_coeff_dict.items():
            array[c_ind[this_cons], v_ind[this_var]] = coeff

    if array_type == 'DataFrame':
        metabolite_ids = [met.id for met in tmodel.constraints]
        reaction_ids = [rxn.id for rxn in tmodel.variables]
        return pd.DataFrame(array, index=metabolite_ids, columns=reaction_ids)
    else:
        return array
def find_tree(sub_network, weight='x_pu'):
    """Get the spanning tree of the graph, choose the node with the
    highest degree as a central "tree slack" and then see for each
    branch which paths from the slack to each node go through the branch.
    """
    branches_bus0 = sub_network.branches()["bus0"]
    branches_i = branches_bus0.index
    buses_i = sub_network.buses_i()

    graph = sub_network.graph(weight=weight)
    sub_network.tree = nx.minimum_spanning_tree(graph)

    # find bus with highest degree to use as slack
    tree_slack_bus, slack_degree = max(degree(sub_network.tree), key=itemgetter(1))
    logger.info("Tree slack bus is %s with degree %d.", tree_slack_bus, slack_degree)

    # determine which buses are supplied in tree through branch from slack

    # matrix to store tree structure
    sub_network.T = dok_matrix((len(branches_i), len(buses_i)))

    for j, bus in enumerate(buses_i):
        path = nx.shortest_path(sub_network.tree, bus, tree_slack_bus)
        for i in range(len(path)-1):
            branch = next(iterkeys(graph[path[i]][path[i+1]]))
            branch_i = branches_i.get_loc(branch)
            sign = +1 if branches_bus0.iat[branch_i] == path[i] else -1
            sub_network.T[branch_i, j] = sign
def initModel(self):
    self.numUsers, self.numItems = self.trainMatrix.shape()
    self.prediction = dok_matrix((self.numUsers, self.numItems))
    self.MAX_Iterations = int(self.configHandler.getParameter('BPMF', 'MAX_Iterations'))
    self.numFactors = int(self.configHandler.getParameter('BPMF', 'numFactors'))
    self.beta0 = float(self.configHandler.getParameter('BPMF', 'beta0'))
    self.nu0 = float(self.configHandler.getParameter('BPMF', 'nu0'))
    self.wh0 = np.eye(self.numFactors)

    self.learnRate = float(self.configHandler.getParameter('BPMF', 'learning_rate'))
    self.regU = float(self.configHandler.getParameter('BPMF', 'regU'))
    self.regI = float(self.configHandler.getParameter('BPMF', 'regI'))

    self.P = np.random.normal(0, 1, size=(self.numUsers, self.numFactors))
    self.Q = np.random.normal(0, 1, size=(self.numItems, self.numFactors))

    self.alpha = 2
    self.alpha_k = self.alpha / self.numFactors
    self.numRatings = 5

    self.theta = np.random.dirichlet(np.array([self.alpha_k for i in range(self.numFactors)]))
    self.gamma = np.zeros((self.numUsers, self.numFactors, self.numItems))
    self.sigma = np.random.normal(0, 1, size=self.numRatings)
    self.omega = np.random.normal(0, 1, size=self.numUsers)
    self.mu_vd = 1.0 / (1.0 + np.exp(-(self.omega[newaxis, ...] + self.sigma[..., newaxis])))

    self.xi = 10.0
    self.nu = 10.0
    self.phi = 2.0
def initModel(self):
    '''
    Read the model parameters, and get some common values.
    '''
    self.numUsers, self.numItems = self.trainMatrix.shape()
    self.prediction = dok_matrix((self.numUsers, self.numItems))
    self.MAX_Iterations = int(self.configHandler.getParameter('BPoissMF', 'MAX_Iterations'))
    self.numFactors = int(self.configHandler.getParameter('BPoissMF', 'numFactors'))
    self.threshold = float(self.configHandler.getParameter('BPoissMF', 'threshold'))

    # Get the Parameters
    self.user_alpha = float(self.configHandler.getParameter('BPoissMF', 'user_alpha'))
    self.user_c = float(self.configHandler.getParameter('BPoissMF', 'user_c'))
    self.item_a = float(self.configHandler.getParameter('BPoissMF', 'item_a'))
    self.item_b = float(self.configHandler.getParameter('BPoissMF', 'item_b'))

    # The model parameters for users
    self.gamma0 = np.zeros(self.numUsers)
    self.gamma1 = np.zeros(self.numUsers)
    self.s = np.zeros(self.numUsers)
    self.nu = np.zeros((self.numUsers, self.numFactors))
    self.theta = np.zeros((self.numUsers, self.numFactors))

    # The model parameters for stick proportions
    self.tau = np.zeros((self.numUsers, self.numFactors))

    # The model parameters for item weights
    self.lambda0 = np.zeros((self.numItems, self.numFactors))
    self.lambda1 = np.zeros((self.numItems, self.numFactors))
    self.beta = np.zeros((self.numItems, self.numFactors))

    self.z = np.zeros((self.numUsers, self.numItems))
    self.pi = np.zeros((self.numUsers, self.numItems))
    self.logPi = np.zeros((self.numUsers, self.numItems))
def _init_model(self):
    self.user_num, self.item_num = self.train_matrix.shape
    self.mean_rating = np.mean(self.train_matrix.values())
    self.predictions = dok_matrix((self.user_num, self.item_num))

    if self.config_handler['Output', 'is_load', 'bool']:
        self._load_model()
        assert(self.user_factors.shape[1] == self.item_factors.shape[1])
        self.factor_num = self.user_factors.shape[1]
    else:
        self._read_cfg()
        if self.config_handler['Parameters', 'is_init_path', 'bool']:
            self._load_init_model()
        else:
            self.factor_num = self.config_handler['Parameters', 'factor_num', 'int']
            self.user_factors = np.random.normal(0, 1, size=(self.user_num, self.factor_num))
            self.item_factors = np.random.normal(0, 1, size=(self.item_num, self.factor_num))

    self.markov_num = 0
    validation_rmse, test_rmse = self.__evaluate_epoch__()
    self.logger['Process'].debug('Epoch {0}: Training RMSE - {1}, Testing RMSE - {2}'.format(0, validation_rmse, test_rmse))

    self.user_normal_dist_mu0 = np.zeros(self.factor_num, np.float) + self.user_normal_dist_mu0_init
    self.user_normal_dist_beta0 = self.user_normal_dist_beta0_init
    self.user_Wishart_dist_W0 = np.eye(self.factor_num) * self.user_Wishart_dist_W0_init
    self.user_Wishart_dist_nu0 = self.factor_num

    self.item_normal_dist_mu0 = np.zeros(self.factor_num, np.float) + self.item_normal_dist_mu0_init
    self.item_normal_dist_beta0 = self.item_normal_dist_beta0_init
    self.item_Wishart_dist_W0 = np.eye(self.factor_num) * self.item_Wishart_dist_W0_init
    self.item_Wishart_dist_nu0 = self.factor_num

    self.rating_sigma = self.rating_sigma_init
def _build_model(self):
    user_train_matrix = dict()
    item_train_matrix = dict()
    for user_id, item_id in self.train_matrix.keys():
        user_train_matrix.setdefault(user_id, dok_matrix((1, self.item_num)))
        user_train_matrix[user_id][0, item_id] = self.train_matrix.get((user_id, item_id))
        item_train_matrix.setdefault(item_id, dok_matrix((1, self.user_num)))
        item_train_matrix[item_id][0, user_id] = self.train_matrix.get((user_id, item_id))

    self.previous_loss = -np.inf
    max_iterations = self.config_handler['Parameters', 'max_iterations', 'int']
    for iteration in range(max_iterations):
        self.logger['Process'].debug('Epoch {0}: update hyper-parameters'.format(iteration))
        user_factors_mu, user_factors_variance = \
            self._sampling_hyperparameters(self.user_factors, self.user_normal_dist_mu0, self.user_normal_dist_beta0,
                                           self.user_Wishart_dist_nu0, self.user_Wishart_dist_W0)
        item_factors_mu, item_factors_variance = \
            self._sampling_hyperparameters(self.item_factors, self.item_normal_dist_mu0, self.item_normal_dist_beta0,
                                           self.item_Wishart_dist_nu0, self.item_Wishart_dist_W0)

        self.logger['Process'].debug('Epoch {0}: update latent factors'.format(iteration))
        for gibbs_iteration in range(2):
            for user_id in range(self.user_num):
                user_ratings = user_train_matrix[user_id] if user_id in user_train_matrix else dict()
                if len(user_ratings.keys()) == 0:
                    continue
                self.user_factors[user_id] = self._update_parameters(
                    self.item_factors, user_ratings, user_factors_mu, user_factors_variance)

            for item_id in range(self.item_num):
                item_ratings = item_train_matrix[item_id] if item_id in item_train_matrix else dict()
                if len(item_ratings.keys()) == 0:
                    continue
                self.item_factors[item_id] = self._update_parameters(
                    self.user_factors, item_ratings, item_factors_mu, item_factors_variance)

        validation_rmse, test_rmse = self.__evaluate_epoch__()
        self.logger['Process'].debug('Epoch {0}: Training RMSE - {1}, Testing RMSE - {2}'.format(iteration, validation_rmse, test_rmse))
def initModel(self):
    self.numUsers, self.numItems = self.trainMatrix.shape()
    self.prediction = dok_matrix((self.numUsers, self.numItems))
    self.MAX_Iterations = int(self.configHandler.getParameter('BPoissMF', 'MAX_Iterations'))
    self.numFactors = int(self.configHandler.getParameter('BPoissMF', 'numFactors'))
    self.threshold = float(self.configHandler.getParameter('BPoissMF', 'threshold'))

    # Get the Parameters
    self.a = float(self.configHandler.getParameter('BPoissMF', 'a'))
    self.ap = float(self.configHandler.getParameter('BPoissMF', 'ap'))
    self.bp = float(self.configHandler.getParameter('BPoissMF', 'bp'))
    self.c = float(self.configHandler.getParameter('BPoissMF', 'c'))
    self.cp = float(self.configHandler.getParameter('BPoissMF', 'cp'))
    self.dp = float(self.configHandler.getParameter('BPoissMF', 'dp'))

    # Init xi
    self.xi = gammaRnd(self.ap, self.ap / self.bp, size=self.numUsers)

    # Init theta
    self.theta = np.zeros((self.numUsers, self.numFactors))
    for i in range(self.numUsers):
        self.theta[i, :] = gammaRnd(self.a, self.xi[i])

    # Init eta
    self.eta = gammaRnd(self.cp, self.cp / self.dp, size=self.numItems)

    # Init beta
    self.beta = np.zeros((self.numItems, self.numFactors))
    for i in range(self.numItems):
        self.beta[i, :] = gammaRnd(self.c, self.eta[i])

    # Init z
    self.zs = np.zeros((self.numUsers, self.numItems, self.numFactors))
    for user_id, item_id in self.trainMatrix.keys():
        p = self.theta[user_id, :] * self.beta[item_id, :]
        p /= np.sum(p)
        self.zs[user_id, item_id, :] = np.random.multinomial(self.trainMatrix[user_id, item_id], p)
def initModel(self):
    ''''''
    self.numUsers, self.numItems = self.trainMatrix.shape()
    self.prediction = dok_matrix((self.numUsers, self.numItems))
    self.MAX_Iterations = int(self.configHandler.getParameter('CTR', 'MAX_Iterations'))
    self.numFactors = int(self.configHandler.getParameter('CTR', 'numFactors'))
    self.threshold = float(self.configHandler.getParameter('CTR', 'threshold'))

    self.U = np.zeros((self.numUsers, self.numFactors))
    self.V = np.zeros((self.numItems, self.numFactors))
def get_train_matrix(self):
    if len(self.train_data.shape) == 2:
        return self.train_data
    train_matrix = dok_matrix((self.train_data.shape[0], self.train_data.shape[1]))
    for key in self.train_data.keys():
        train_matrix[key[0], key[1]] = self.train_data[key]
    return train_matrix
def tensor_matrix(self, tensor_data):
    user_num, item_num = tensor_data.shape[0], tensor_data.shape[1]
    matrix_data = dok_matrix((user_num, item_num))
    for user_id, item_id, time_id in tensor_data.keys():
        matrix_data[user_id, item_id] += tensor_data.get((user_id, item_id, time_id))
    return matrix_data
def read_given_train_test(self, train_file, test_file):
    """
    read given data set
    """
    users, items = set(), set()
    ratings = list()
    with codecs.open(train_file, mode="r", encoding="utf-8") as read_file:
        for line in read_file:
            user_item_rating = re.split('\t|,|::', line.strip())
            user_id = int(user_item_rating[0])
            item_id = int(user_item_rating[1])
            rating = int(user_item_rating[2])
            users.add(user_id)
            items.add(item_id)
            ratings.append((user_id, item_id, rating))

    # Convert
    user_num, item_num = len(users), len(items)
    users_dict = {user_id: index for index, user_id in enumerate(list(users))}
    items_dict = {item_id: index for index, item_id in enumerate(list(items))}

    train_matrix = dok_matrix((user_num, item_num))
    test_matrix = dok_matrix((user_num, item_num))
    for user_id, item_id, rating in ratings:
        train_matrix[users_dict[user_id], items_dict[item_id]] = rating

    with codecs.open(test_file, mode='r', encoding='utf-8') as read_file:
        for line in read_file:
            user_item_rating = re.split('\t|,|::', line.strip())
            user_id = int(user_item_rating[0])
            item_id = int(user_item_rating[1])
            rating = int(user_item_rating[2])
            test_matrix[users_dict[user_id], items_dict[item_id]] = rating

    return train_matrix, test_matrix
def initModel(self):
    self.numUsers, self.numItems = self.trainMatrix.shape()
    self.prediction = dok_matrix((self.numUsers, self.numItems))
    self.MAX_Iterations = int(self.configHandle.getParameter('PMF', 'MAX_Iterations'))
def __init__(self, dict_size, file_name, shuffle_buffer_size):
    self.dict_size = dict_size
    self.data = sparse.dok_matrix((dict_size, dict_size), dtype=np.uint32)
def setUp(self):
    self.soinn = Soinn()
    self.soinn.nodes = np.array([[0, 0], [1, 0], [1, 1], [0, 1]], dtype=np.float64)
    self.soinn.adjacent_mat = dok_matrix((4, 4))
    self.soinn.winning_times = [1] * 4
def test_increment_edge_ages(self):
    self.soinn.adjacent_mat[0, 1:3] = 1
    self.soinn.adjacent_mat[1:3, 0] = 1
    self.soinn._Soinn__increment_edge_ages(0)
    expected = dok_matrix([[0, 2, 2, 0], [2, 0, 0, 0], [2, 0, 0, 0], [0, 0, 0, 0]])
    np.testing.assert_array_equal(self.soinn.adjacent_mat.toarray(), expected.toarray())
    self.soinn._Soinn__increment_edge_ages(1)
    expected = dok_matrix([[0, 3, 2, 0], [3, 0, 0, 0], [2, 0, 0, 0], [0, 0, 0, 0]])
    np.testing.assert_array_equal(self.soinn.adjacent_mat.toarray(), expected.toarray())
def test_delete_old_edges(self):
    self.soinn.winning_times = [i for i in range(4)]
    m = self.soinn.max_edge_age
    self.soinn.adjacent_mat[[0, 1], [1, 0]] = m + 2
    self.soinn.adjacent_mat[[0, 2], [2, 0]] = m + 1
    self.soinn._Soinn__delete_old_edges(0)
    actual = self.soinn.adjacent_mat.toarray()
    expected = dok_matrix([[0, m+1, 0], [m+1, 0, 0], [0, 0, 0]]).toarray()
    np.testing.assert_array_equal(actual, expected)
    expected = np.array([[0, 0], [1, 1], [0, 1]], dtype=np.float64)
    np.testing.assert_array_equal(self.soinn.nodes, expected)
    self.assertEqual(self.soinn.winning_times, [0, 2, 3])
def test_delete_old_edges_with_deleting_no_node(self):
    # No node is deleted by the function
    self.soinn.winning_times = [i for i in range(4)]
    m = self.soinn.max_edge_age
    self.soinn.adjacent_mat[[0, 1], [1, 0]] = m + 2
    self.soinn.adjacent_mat[[1, 2], [2, 1]] = 1
    previous_nodes = self.soinn.nodes
    previous_winning_times = self.soinn.winning_times
    self.soinn._Soinn__delete_old_edges(0)
    actual = self.soinn.adjacent_mat.toarray()
    expected = dok_matrix([[0, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 0]]).toarray()
    np.testing.assert_array_equal(actual, expected)
    np.testing.assert_array_equal(self.soinn.nodes, previous_nodes)
    self.assertEqual(self.soinn.winning_times, previous_winning_times)
def test_delete_old_edges_with_deleting_several_nodes(self):
    # delete several nodes simultaneously
    self.soinn.winning_times = [i for i in range(4)]
    m = self.soinn.max_edge_age
    self.soinn.adjacent_mat[[0, 1, 0, 3], [1, 0, 3, 0]] = m + 2
    self.soinn.adjacent_mat[[0, 2], [2, 0]] = m + 1
    self.soinn._Soinn__delete_old_edges(0)
    actual = self.soinn.adjacent_mat.toarray()
    expected = dok_matrix([[0, m+1], [m+1, 0]]).toarray()
    np.testing.assert_array_equal(actual, expected)
    self.assertEqual(self.soinn.winning_times, [0, 2])
def test_delete_nodes_with_deleting_several_nodes(self):
    # delete several nodes simultaneously
    self.soinn.winning_times = [i for i in range(4)]
    self.soinn.adjacent_mat[[0, 1], [1, 0]] = 1
    self.soinn.adjacent_mat[[2, 3], [3, 2]] = 2
    self.soinn._Soinn__delete_nodes([1, 3])
    expected = np.array([[0, 0], [1, 1]], dtype=np.float64)
    np.testing.assert_array_equal(self.soinn.nodes, expected)
    self.assertEqual(self.soinn.winning_times, [0, 2])
    expected = dok_matrix((2, 2)).toarray()
    np.testing.assert_array_equal(self.soinn.adjacent_mat.toarray(), expected)
def directInitialMatrix(self):
    """
    We generate an initial sparse matrix with all the transition rates (or probabilities).
    We later transform this matrix into a rate or probability matrix depending on the
    preferred method of obtaining pi.
    """
    # First initialize state codes and the mapping with states.
    self.setStateCodes()

    # For each state, calculate the indices of reached states and rates using the transition function.
    results = imap(self.transitionStates, self.mapping.values())

    # Simpler alternative that uses less memory.
    # Would be competitive if the conversion from dok to csr is faster.
    # D = dok_matrix((self.size, self.size), dtype=float)
    # for index, (col, rate) in enumerate(results):
    #     D.update({(index, c): r for c, r in zip(col, rate)})
    # return D.tocsr()

    # preallocate memory for the rows, cols and rates of the sparse matrix
    rows = np.empty(self.size, dtype=int)
    cols = np.empty(self.size, dtype=int)
    rates = np.empty(self.size, dtype=float)

    # now fill the arrays with the results, increasing their size if current memory is too small.
    right = 0
    for index, (col, rate) in enumerate(results):  # more robust alternative: in izip(self.mapping.keys(), results)
        left = right
        right += len(col)
        if right >= len(cols):
            # increase the allocated memory if the vectors turn out to be too small.
            new_capacity = int(round(right * 1.5))
            cols.resize(new_capacity)
            rates.resize(new_capacity)
            rows.resize(new_capacity)
        rows[left:right] = index  # since states are sorted, the index indeed corresponds to the state.
        cols[left:right] = col
        rates[left:right] = rate

    # Place all data in a coo_matrix and convert to a csr_matrix for quick computations.
    return coo_matrix((rates[:right], (rows[:right], cols[:right])), shape=(self.size, self.size)).tocsr()
def imitate_tr(self, graph, root):
    def tr():
        pass

    tr.nx_graph = graph
    tr.nx_root = root
    return tr

#
# def test_speed(self):
#     _, _, tr = load_dataset('econ62k')
#     graph = tr.nx_graph
#
#     def random_labels():
#         def set_random_ones(n_nodes):
#             ids = np.random.choice(n_nodes, 5)
#             zeros = sp.dok_matrix((1, n_nodes), dtype=np.bool_)
#             for index in ids:
#                 zeros[0, index] = True
#             return zeros
#
#         number_of_nodes = graph.number_of_nodes()
#         matrix = set_random_ones(number_of_nodes)
#         for i in range(0, 62000):
#             zeros = set_random_ones(number_of_nodes)
#             matrix = sp.vstack((matrix, zeros))
#         return sp.csr_matrix(matrix)
#
#     y_true = random_labels()
#     y_pred = random_labels()
#     print('random constructed')
#
#     start = default_timer()
#     hierarchical_f_measure(graph, y_true, y_pred)
#     print(default_timer() - start)
def _make_sparse(self, scores):
    n_features = len(self.vocabulary)
    result = sp.csr_matrix((0, n_features))
    for score in scores:
        sparse_score = sp.dok_matrix((1, n_features))
        for s in score.items():
            sparse_score[0, self.vocabulary[s[0]]] = s[1]
        result = sp.vstack((result, sp.csr_matrix(sparse_score)))
    return result
def init_Q():
    from scipy.sparse import dok_matrix
    return dok_matrix((17 ** 16, 16 * 16))