The following 50 code examples, extracted from open-source Python projects, illustrate how to use numpy.intc().
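Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below) of what numpy.intc is: the scalar type matching the platform C "int", usually 32 bits, which is also the index dtype that several of the sparse-matrix examples below insist on.

import numpy as np

# np.intc maps to the platform C "int" (typically 32-bit).
a = np.array([1, 2, 3], dtype=np.intc)
print(a.dtype, a.dtype.itemsize)   # e.g. int32 4

# Wrapping a Python int as np.intc gives a C-int scalar, handy when a value
# must be passed to compiled code (ctypes, Cython, PyCUDA) as an "int".
n = np.intc(a.size)
print(type(n))                     # <class 'numpy.intc'>
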
def _validate_X_predict(self, X: np.ndarray, check_input: bool) -> np.ndarray:
    if check_input:
        X = check_array(X, dtype=DTYPE, accept_sparse="csr")
        if issparse(X) and (X.indices.dtype != np.intc or
                            X.indptr.dtype != np.intc):
            raise ValueError(
                "No support for np.int64 index based sparse matrices")

    n_features = X.shape[1]
    if self.n_features_ != n_features:
        raise ValueError(
            "Number of features of the model must match the input."
            " Model n_features is %s and input n_features is %s "
            % (self.n_features_, n_features))

    return X

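Several of the examples in this listing (this one and the similar validators further down) reject sparse matrices whose index arrays are int64. A minimal, hedged sketch of how a caller could coerce a SciPy CSR matrix to the expected np.intc index dtype before calling such a validator; the variable names are illustrative, not from the project above.

import numpy as np
import scipy.sparse as sp

X = sp.random(5, 3, density=0.5, format="csr")
# SciPy may build the index arrays as int64; cast them to the C int
# dtype (np.intc) so validators like the one above accept the matrix.
if X.indices.dtype != np.intc or X.indptr.dtype != np.intc:
    X.indices = X.indices.astype(np.intc)
    X.indptr = X.indptr.astype(np.intc)
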
def default(self, obj):
    # convert dates and numpy objects in a json serializable format
    if isinstance(obj, datetime):
        return obj.strftime('%Y-%m-%dT%H:%M:%SZ')
    elif isinstance(obj, date):
        return obj.strftime('%Y-%m-%d')
    elif type(obj) in (np.int_, np.intc, np.intp, np.int8, np.int16,
                       np.int32, np.int64, np.uint8, np.uint16,
                       np.uint32, np.uint64):
        return int(obj)
    elif type(obj) in (np.bool_,):
        return bool(obj)
    elif type(obj) in (np.float_, np.float16, np.float32, np.float64,
                       np.complex_, np.complex64, np.complex128):
        return float(obj)

    # Let the base class default method raise the TypeError
    return json.JSONEncoder.default(self, obj)

def _validate_X_predict(self, X, check_input):
    """Validate X whenever one tries to predict, apply, predict_proba"""
    if self.tree_ is None:
        raise NotFittedError("Estimator not fitted, "
                             "call `fit` before exploiting the model.")

    if check_input:
        X = check_array(X, dtype=DTYPE, accept_sparse="csr")
        if issparse(X) and (X.indices.dtype != np.intc or
                            X.indptr.dtype != np.intc):
            raise ValueError("No support for np.int64 index based "
                             "sparse matrices")

    n_features = X.shape[1]
    if self.n_features_ != n_features:
        raise ValueError("Number of features of the model must "
                         "match the input. Model n_features is %s and "
                         "input n_features is %s "
                         % (self.n_features_, n_features))

    return X

def pairFeatureMatrix(self, elementList):
    """ Construction of pair-distance matrices """
    # Initiate
    nSpecies = len(elementList)

    # Get the molecular structure
    pos = np.array(self.molecule.positions, dtype=float)   # Atomic positions
    elInd = np.array(self.molecule.elInd, dtype=np.intc)   # Element indices matching to elementList
    natoms = len(self.molecule.names)                      # Total number of atoms in the molecule

    # Initiate the matrix (integer division keeps the dimensions ints for np.zeros)
    dim1 = natoms * (natoms - 1) // 2       # First dimension (pairwise distances)
    dim2 = nSpecies * (nSpecies + 1) // 2   # Number of possible pairs
    featMat = np.zeros((dim1, dim2))        # To be passed to fun_pairFeatures (compiled C code)

    # Call the C function to store the pairFeatures
    pairFeatures.fun_pairFeatures(nSpecies, natoms, elInd, pos, featMat)

    # Return
    return featMat

def execute(self, actions):
    """
    Pass action to universe environment, return reward, next step, terminal
    state and additional info.

    :param action: action to execute as numpy array, should have dtype
        np.intc and should adhere to the specification given in
        DeepMindLabEnvironment.action_spec(level_id)
    :return: dict containing the next state, the reward, and a boolean
        indicating if the next state is a terminal state
    """
    adjusted_actions = list()
    for action_spec in self.level.action_spec():
        if action_spec['min'] == -1 and action_spec['max'] == 1:
            adjusted_actions.append(actions[action_spec['name']] - 1)
        else:
            adjusted_actions.append(actions[action_spec['name']])  # clip?
    actions = np.array(adjusted_actions, dtype=np.intc)

    reward = self.level.step(action=actions, num_steps=self.repeat_action)
    state = self.level.observations()['RGB_INTERLACED']
    terminal = not self.level.is_running()

    return state, terminal, reward

def default(self, obj):
    # convert dates and numpy objects in a json serializable format
    if isinstance(obj, datetime):
        return obj.strftime('%Y-%m-%dT%H:%M:%SZ')
    elif isinstance(obj, date):
        return obj.strftime('%Y-%m-%d')
    elif type(obj) in [np.int_, np.intc, np.intp, np.int8, np.int16,
                       np.int32, np.int64, np.uint8, np.uint16,
                       np.uint32, np.uint64]:
        return int(obj)
    elif type(obj) in [np.bool_]:
        return bool(obj)
    elif type(obj) in [np.float_, np.float16, np.float32, np.float64,
                       np.complex_, np.complex64, np.complex128]:
        return float(obj)

    # Let the base class default method raise the TypeError
    return json.JSONEncoder.default(self, obj)

def predict(self, queries, n_jobs=1):
    '''
    Predict the ranking score for each individual document of the given queries.

    n_jobs: int, optional (default is 1)
        The number of working threads that will be spawned to compute
        the ranking scores. If -1, the current number of CPUs will be used.
    '''
    if self.trained is False:
        raise ValueError('the model has not been trained yet')

    predictions = np.zeros(queries.document_count(), dtype=np.float64)

    n_jobs = max(1, min(n_jobs if n_jobs >= 0 else n_jobs + cpu_count() + 1,
                        queries.document_count()))

    indices = np.linspace(0, queries.document_count(),
                          n_jobs + 1).astype(np.intc)

    Parallel(n_jobs=n_jobs, backend="threading")(
        delayed(parallel_helper, check_pickle=False)(
            LambdaRandomForest, '_LambdaRandomForest__predict',
            self.estimators,
            queries.feature_vectors[indices[i]:indices[i + 1]],
            predictions[indices[i]:indices[i + 1]])
        for i in range(indices.size - 1))

    predictions /= len(self.estimators)

    return predictions

def perform(self, node, inputs, out):
    # TODO support broadcast!
    # TODO assert all input have the same shape
    z, = out
    if (z[0] is None or z[0].shape != inputs[0].shape or
            not z[0].is_c_contiguous()):
        z[0] = theano.sandbox.cuda.CudaNdarray.zeros(inputs[0].shape)
    if inputs[0].shape != inputs[1].shape:
        raise TypeError("PycudaElemwiseSourceModuleOp:"
                        " inputs don't have the same shape!")

    if inputs[0].size > 512:
        grid = (int(numpy.ceil(inputs[0].size / 512.)), 1)
        block = (512, 1, 1)
    else:
        grid = (1, 1)
        block = (inputs[0].shape[0], inputs[0].shape[1], 1)
    self.pycuda_fct(inputs[0], inputs[1], z[0],
                    numpy.intc(inputs[1].size), block=block, grid=grid)

def make_thunk(self, node, storage_map, _, _2):
    mod = SourceModule("""
__global__ void my_fct(float * i0, float * o0, int size) {
    int i = blockIdx.x*blockDim.x + threadIdx.x;
    if(i<size){
        o0[i] = i0[i]*2;
    }
}""")
    pycuda_fct = mod.get_function("my_fct")
    inputs = [storage_map[v] for v in node.inputs]
    outputs = [storage_map[v] for v in node.outputs]

    def thunk():
        z = outputs[0]
        if z[0] is None or z[0].shape != inputs[0][0].shape:
            z[0] = cuda.CudaNdarray.zeros(inputs[0][0].shape)
        grid = (int(numpy.ceil(inputs[0][0].size / 512.)), 1)
        pycuda_fct(inputs[0][0], z[0], numpy.intc(inputs[0][0].size),
                   block=(512, 1, 1), grid=grid)
    return thunk

def npy2py_type(npy_type):
    int_types = [
        np.int_, np.intc, np.intp, np.int8, np.int16, np.int32, np.int64,
        np.uint8, np.uint16, np.uint32, np.uint64
    ]

    float_types = [np.float_, np.float16, np.float32, np.float64]

    bytes_types = [np.str_, np.string_]

    if npy_type in int_types:
        return int
    if npy_type in float_types:
        return float
    if npy_type in bytes_types:
        return bytes

    if hasattr(npy_type, 'char'):
        if npy_type.char in ['S', 'a']:
            return bytes
        raise TypeError

    return npy_type

def _open_and_load(f, dtype, multilabel, zero_based, query_id):
    if hasattr(f, "read"):
        actual_dtype, data, ind, indptr, labels, query = \
            _load_svmlight_file(f, dtype, multilabel, zero_based, query_id)
    # XXX remove closing when Python 2.7+/3.1+ required
    else:
        with closing(_gen_open(f)) as f:
            actual_dtype, data, ind, indptr, labels, query = \
                _load_svmlight_file(f, dtype, multilabel, zero_based, query_id)

    # convert from array.array, give data the right dtype
    if not multilabel:
        labels = frombuffer_empty(labels, np.float64)
    data = frombuffer_empty(data, actual_dtype)
    indices = frombuffer_empty(ind, np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)   # never empty
    query = frombuffer_empty(query, np.intc)

    data = np.asarray(data, dtype=dtype)    # no-op for float{32,64}
    return data, indices, indptr, labels, query

def to_dense(A):
    """
    Convert a sparse matrix A to dense.
    For debugging only.
    """
    if hasattr(A, "getrow"):
        n = A.size(0)
        m = A.size(1)
        B = np.zeros((n, m), dtype=np.float64)
        for i in range(0, n):
            [j, val] = A.getrow(i)
            B[i, j] = val
        return B
    else:
        x = Vector()
        Ax = Vector()
        A.init_vector(x, 1)
        A.init_vector(Ax, 0)

        n = get_local_size(Ax)
        m = get_local_size(x)
        B = np.zeros((n, m), dtype=np.float64)
        for i in range(0, m):
            i_ind = np.array([i], dtype=np.intc)
            x.set_local(np.ones(i_ind.shape), i_ind)
            A.mult(x, Ax)
            B[:, i] = Ax.get_local()
            x.set_local(np.zeros(i_ind.shape), i_ind)
        return B

def _create_lookups(self, X):
    """
    Create document and term lookups for all tokens.
    """
    docs, terms = np.nonzero(X)
    if issparse(X):
        x = np.array(X[docs, terms])[0]
    else:
        x = X[docs, terms]
    doc_lookup = np.ascontiguousarray(np.repeat(docs, x), dtype=np.intc)
    term_lookup = np.ascontiguousarray(np.repeat(terms, x), dtype=np.intc)
    return doc_lookup, term_lookup

def _create_edges(self, y, order='tail'):
    y.sort(order=order)
    _docs, _counts = np.unique(y[order], return_counts=True)
    counts = np.zeros(self.n_docs)
    counts[_docs] = _counts
    docs = np.ascontiguousarray(
        np.concatenate(([0], np.cumsum(counts))), dtype=np.intc)
    edges = np.ascontiguousarray(y['index'].flatten(), dtype=np.intc)
    return docs, edges

def fit(self, X, y):
    """
    Estimate the topic distributions per document (theta), term
    distributions per topic (phi), and regression coefficients (eta).

    Parameters
    ----------
    X : array-like, shape = (n_docs, n_terms)
        The document-term matrix.

    y : array-like, shape = (n_edges, 3)
        Each entry of y is an ordered triple (d_1, d_2, y_(d_1, d_2)),
        where d_1 and d_2 are documents and y_(d_1, d_2) is an indicator
        of a directed edge from d_1 to d_2.
    """
    self.doc_term_matrix = X
    self.n_docs, self.n_terms = X.shape
    self.n_tokens = X.sum()
    self.n_edges = y.shape[0]
    doc_lookup, term_lookup = self._create_lookups(X)

    # edge info
    y = np.ascontiguousarray(np.column_stack((range(self.n_edges), y)))
    # we use a view here so that we can sort in-place using named columns
    y_rec = y.view(dtype=list(zip(('index', 'tail', 'head', 'data'),
                                  4 * [y.dtype])))
    edge_tail = np.ascontiguousarray(y_rec['tail'].flatten(), dtype=np.intc)
    edge_head = np.ascontiguousarray(y_rec['head'].flatten(), dtype=np.intc)
    edge_data = np.ascontiguousarray(y_rec['data'].flatten(),
                                     dtype=np.float64)
    out_docs, out_edges = self._create_edges(y_rec, order='tail')
    in_docs, in_edges = self._create_edges(y_rec, order='head')

    # iterate
    self.theta, self.phi, self.H, self.loglikelihoods = gibbs_sampler_grtm(
        self.n_iter, self.n_report_iter, self.n_topics, self.n_docs,
        self.n_terms, self.n_tokens, self.n_edges, self.alpha, self.beta,
        self.mu, self.nu2, self.b, doc_lookup, term_lookup, out_docs,
        out_edges, in_docs, in_edges, edge_tail, edge_head, edge_data,
        self.seed)

def fit(self, X, y, hier):
    """
    Estimate the topic distributions per document (theta), term
    distributions per topic (phi), and regression coefficients (eta).

    Parameters
    ----------
    X : array-like, shape = (n_docs, n_terms)
        The document-term matrix.

    y : array-like, shape = (n_docs, n_labels)
        Response values for each document for each label.

    hier : 1D array-like, size = n_labels
        The index of the list corresponds to the current label and the
        value of the indexed position is the parent of the label.
        Set -1 as the root.
    """
    self.doc_term_matrix = X
    self.n_docs, self.n_terms = X.shape
    self.n_tokens = X.sum()
    doc_lookup, term_lookup = self._create_lookups(X)

    # iterate
    self.theta, self.phi, self.eta, self.loglikelihoods = gibbs_sampler_blhslda(
        self.n_iter, self.n_report_iter, self.n_topics, self.n_docs,
        self.n_terms, self.n_tokens, self.alpha, self.beta, self.mu,
        self.nu2, self.b, doc_lookup, term_lookup,
        np.ascontiguousarray(y, dtype=np.intc),
        np.ascontiguousarray(hier, dtype=np.intc), self.seed)

def test_dtype(self):
    dt = np.intc
    p = ndpointer(dtype=dt)
    self.assertTrue(p.from_param(np.array([1], dt)))
    dt = '<i4'
    p = ndpointer(dtype=dt)
    self.assertTrue(p.from_param(np.array([1], dt)))
    dt = np.dtype('>i4')
    p = ndpointer(dtype=dt)
    p.from_param(np.array([1], dt))
    self.assertRaises(TypeError, p.from_param,
                      np.array([1], dt.newbyteorder('swap')))
    dtnames = ['x', 'y']
    dtformats = [np.intc, np.float64]
    dtdescr = {'names': dtnames, 'formats': dtformats}
    dt = np.dtype(dtdescr)
    p = ndpointer(dtype=dt)
    self.assertTrue(p.from_param(np.zeros((10,), dt)))
    samedt = np.dtype(dtdescr)
    p = ndpointer(dtype=samedt)
    self.assertTrue(p.from_param(np.zeros((10,), dt)))
    dt2 = np.dtype(dtdescr, align=True)
    if dt.itemsize != dt2.itemsize:
        self.assertRaises(TypeError, p.from_param, np.zeros((10,), dt2))
    else:
        self.assertTrue(p.from_param(np.zeros((10,), dt2)))

def predict(self, X, check_input=True):
    """Predict class or regression value for X.

    For a classification model, the predicted class for each sample in X is
    returned. For a regression model, the predicted value based on X is
    returned.

    Parameters
    ----------
    X : array-like of shape = [n_samples, n_features]
        The input samples.

    Returns
    -------
    y : array of shape = [n_samples] or [n_samples, n_outputs]
        The predicted classes, or the predict values.
    """
    X = check_array(X, dtype=DTYPE, accept_sparse="csr")
    if issparse(X) and (X.indices.dtype != np.intc or
                        X.indptr.dtype != np.intc):
        raise ValueError("No support for np.int64 index based "
                         "sparse matrices")

    n_samples, n_features = X.shape

    if self.tree_ is None:
        raise Exception("Tree not initialized. Perform a fit first")

    if self.n_features_ != n_features:
        raise ValueError("Number of features of the model must "
                         " match the input. Model n_features is %s and "
                         " input n_features is %s "
                         % (self.n_features_, n_features))

    return (self.tree_.get('coefficient') *
            (X[:, self.tree_.get('best_dim')] > self.tree_.get('threshold')) +
            self.tree_.get('constant'))

def _action(*entries):
    return np.array(entries, dtype=np.intc)

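In DeepMind Lab agent code, a helper like the one above is typically used to predefine named action vectors. A hypothetical usage sketch; the seven-entry ordering follows DeepMind Lab's default action spec, but treat the exact layout as an assumption here.

# Hypothetical named actions built with the helper above.
ACTIONS = {
    'look_left':  _action(-20, 0, 0, 0, 0, 0, 0),
    'look_right': _action(20, 0, 0, 0, 0, 0, 0),
    'forward':    _action(0, 0, 0, 1, 0, 0, 0),
}
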
def __init__(self, points, fraction):
    super(Graph, self).__init__(points, fraction)
    self.order = _np.ascontiguousarray(
        _np.argsort(self.density).astype(_np.intc)[::-1])
    self.delta, self.neighbour = _core.get_delta_and_neighbour(
        self.order, self.distances, self.max_distance)

def assign(self, min_density, min_delta, border_only=False):
    self.min_density = min_density
    self.min_delta = min_delta
    self.border_only = border_only

    if self.autoplot:
        self.draw_decision_graph(self.min_density, self.min_delta)

    self._get_cluster_indices()

    self.membership = _core.get_membership(self.clusters, self.order,
                                           self.neighbour)
    self.border_density, self.border_member = _core.get_border(
        self.kernel_size, self.distances, self.density, self.membership,
        self.nclusters)
    self.halo_idx, self.core_idx = _core.get_halo(
        self.density, self.membership, self.border_density,
        self.border_member.astype(_np.intc), border_only=border_only)

def _get_cluster_indices(self):
    self.clusters = _np.intersect1d(
        _np.where(self.density > self.min_density)[0],
        _np.where(self.delta > self.min_delta)[0],
        assume_unique=True).astype(_np.intc)
    self.nclusters = self.clusters.shape[0]

def _get_membership(self):
    self.membership = -1 * _np.ones(shape=self.order.shape, dtype=_np.intc)
    for i in range(self.ncl):
        self.membership[self.clusters[i]] = i
    for i in range(self.npoints):
        if self.membership[self.order[i]] == -1:
            self.membership[self.order[i]] = \
                self.membership[self.neighbour[self.order[i]]]

def MapActions(self, action_raw):
    self.action = np.zeros([self.num_actions])

    if (action_raw == 0):
        self.action[self.indices["LOOK_LEFT_RIGHT_PIXELS_PER_FRAME"]] = -25
    elif (action_raw == 1):
        self.action[self.indices["LOOK_LEFT_RIGHT_PIXELS_PER_FRAME"]] = 25

    """if (action_raw==2):
        self.action[self.indices["LOOK_DOWN_UP_PIXELS_PER_FRAME"]] = -25
    elif (action_raw==3):
        self.action[self.indices["LOOK_DOWN_UP_PIXELS_PER_FRAME"]] = 25

    if (action_raw==4):
        self.action[self.indices["STRAFE_LEFT_RIGHT"]] = -1
    elif (action_raw==5):
        self.action[self.indices["STRAFE_LEFT_RIGHT"]] = 1

    if (action_raw==6):
        self.action[self.indices["MOVE_BACK_FORWARD"]] = -1
    el"""
    if (action_raw == 2):  # 7
        self.action[self.indices["MOVE_BACK_FORWARD"]] = 1

    # all binary actions need reset
    """if (action_raw==8):
        self.action[self.indices["FIRE"]] = 0
    elif (action_raw==9):
        self.action[self.indices["FIRE"]] = 1

    if (action_raw==10):
        self.action[self.indices["JUMP"]] = 0
    elif (action_raw==11):
        self.action[self.indices["JUMP"]] = 1

    if (action_raw==12):
        self.action[self.indices["CROUCH"]] = 0
    elif (action_raw==13):
        self.action[self.indices["CROUCH"]] = 1"""

    return np.clip(self.action, self.mins, self.maxs).astype(np.intc)

def _to_ctypes_array(tup, dtype=numpy.intc):
    return numpy.array(tup, dtype=dtype).ctypes

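A hedged usage sketch for the helper above: because np.intc matches the platform C int, the returned ctypes interface can be handed to a C function that expects an int pointer. The library call at the end is hypothetical and only shown commented out.

import ctypes
import numpy

arr = numpy.array((1, 2, 3), dtype=numpy.intc)
ct = arr.ctypes                                   # what _to_ctypes_array(...) returns
ptr = ct.data_as(ctypes.POINTER(ctypes.c_int))    # valid because intc == C int
# lib.fun_sum_ints(ptr, len(arr))                 # hypothetical C call taking (int *, int)
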
def __init__(self, bins, mapq_thresh=30, clip_thresh=1):
    # set parameters
    self.bins = bins
    self.mapQT = mapq_thresh
    self.clip_thresh = clip_thresh

    # initialise data structures
    self.depth_stats = DepthStats(bins, mapq_thresh=mapq_thresh,
                                  dtype=np.intc)
    self.aln_stats = np.zeros((bins.num, len(AlignStats.aln_stats_cols)),
                              dtype=np.intc)
    self.fwd_inserts = np.empty(bins.num, dtype=list)
    self.rvs_inserts = np.empty(bins.num, dtype=list)
    for j in range(0, bins.num):
        self.fwd_inserts[j] = []
        self.rvs_inserts[j] = []

def generate_data(n_samples, n_features, size_groups, rho=0.5,
                  random_state=24):
    """ Data generation process with Toeplitz-like correlated features:
        this corresponds to the synthetic dataset used in our paper
        "GAP Safe Screening Rules for Sparse-Group Lasso".
    """
    rng = check_random_state(random_state)
    n_groups = len(size_groups)
    # g_start = np.zeros(n_groups, order='F', dtype=np.intc)
    # for i in range(1, n_groups):
    #     g_start[i] = size_groups[i - 1] + g_start[i - 1]
    g_start = np.cumsum(size_groups, dtype=np.intc) - size_groups[0]

    # 10% of groups are actives
    gamma1 = int(np.ceil(n_groups * 0.1))
    selected_groups = rng.random_integers(0, n_groups - 1, gamma1)
    true_beta = np.zeros(n_features)

    for i in selected_groups:
        begin = g_start[i]
        end = g_start[i] + size_groups[i]
        # 10% of features are actives
        gamma2 = int(np.ceil(size_groups[i] * 0.1))
        selected_features = rng.random_integers(begin, end - 1, gamma2)

        ns = len(selected_features)
        s = 2 * rng.rand(ns) - 1
        u = rng.rand(ns)
        true_beta[selected_features] = np.sign(s) * (10 * u + (1 - u) * 0.5)

    vect = rho ** np.arange(n_features)
    covar = toeplitz(vect, vect)

    X = rng.multivariate_normal(np.zeros(n_features), covar, n_samples)
    y = np.dot(X, true_beta) + 0.01 * rng.normal(0, 1, n_samples)

    return X, y

def expected_support():
    numpy_datatypes = [numpy.bool_, numpy.bool, numpy.int_, numpy.intc,
                       numpy.intp, numpy.int8, numpy.int16, numpy.int32,
                       numpy.int64, numpy.uint8, numpy.uint16, numpy.uint32,
                       numpy.uint64, numpy.float_, numpy.float16,
                       numpy.float32, numpy.float64]

    python_datatypes = [bool, int, float, object]

    return numpy_datatypes + python_datatypes

def predict_rankings(self, queries, compact=False, n_jobs=1):
    '''
    Predict rankings of the documents for the given queries.

    If `compact` is set to True then the output will be one long 1d array
    containing the rankings for all the queries instead of a list of
    1d arrays.

    The compact array can subsequently be indexed using the query index
    pointer array, see `queries.query_indptr`.

    query: Query
        The query whose documents should be ranked.

    compact: bool
        Specify to return rankings in compact format.

    n_jobs: int, optional (default is 1)
        The number of working threads that will be spawned to compute
        the ranking scores. If -1, the current number of CPUs will be used.
    '''
    # Predict the ranking scores for the documents.
    predictions = self.predict(queries, n_jobs)

    rankings = np.zeros(queries.document_count(), dtype=np.intc)

    ranksort_queries(queries.query_indptr, predictions, rankings)

    if compact or len(queries) == 1:
        return rankings
    else:
        return np.array_split(rankings, queries.query_indptr[1:-1])

def predict_rankings(self, queries, compact=False, n_jobs=1):
    '''
    Predict rankings of the documents for the given queries.

    If `compact` is set to True then the output will be one long 1d array
    containing the rankings for all the queries instead of a list of
    1d arrays.

    The compact array can subsequently be indexed using the query index
    pointer array, see `queries.query_indptr`.

    query: Query
        The query whose documents should be ranked.

    compact: bool
        Specify to return rankings in compact format.

    n_jobs: int, optional (default is 1)
        The number of working threads that will be spawned to compute
        the ranking scores. If -1, the current number of CPUs will be used.
    '''
    if self.trained is False:
        raise ValueError('the model has not been trained yet')

    # Predict the ranking scores for the documents.
    predictions = self.predict(queries, n_jobs)

    rankings = np.zeros(queries.document_count(), dtype=np.intc)

    ranksort_queries(queries.query_indptr, predictions, rankings)

    if compact or queries.query_count() == 1:
        return rankings
    else:
        return np.array_split(rankings, queries.query_indptr[1:-1])

def compute_scale(self, queries, relevance_scores=None):
    '''
    Return the ideal DCG value for each query. Optionally, external
    relevance assessments can be used instead of the relevances present
    in the queries.

    Parameters
    ----------
    queries: Queries
        The queries for which the ideal DCG should be computed.

    relevance_scores: array of integers, optional, (default is None)
        The relevance scores that should be used instead of the relevance
        scores inside queries. Note, this argument is experimental.
    '''
    ideal_values = np.empty(queries.query_count(), dtype=np.float64)

    if relevance_scores is not None:
        if queries.document_count() != relevance_scores.shape[0]:
            raise ValueError('number of documents and relevance scores '
                             'do not match')

        # Need to sort the relevance labels first.
        indices = np.empty(relevance_scores.shape[0], dtype=np.intc)
        relevance_argsort_v1(relevance_scores, indices,
                             relevance_scores.shape[0])
        # Creates a copy.
        relevance_scores = relevance_scores[indices]
    else:
        # Assuming these are sorted.
        relevance_scores = queries.relevance_scores

    self.metric_.evaluate_queries_ideal(queries.query_indptr,
                                        relevance_scores, ideal_values)

    return ideal_values

def evaluate(self, ranking=None, labels=None, ranked_labels=None, scales=None):
    '''
    Evaluate NDCG metric on the specified ranked list of document
    relevance scores.

    The function input can be either a ranked list of relevance labels
    (`ranked_labels`), which is most convenient from the computational
    point of view, or it can be in the form of a ranked list of documents
    (`ranking`) and corresponding relevance scores (`labels`), from which
    the ranked document relevance labels are computed.

    Parameters:
    -----------
    ranking: array, shape = (n_documents,)
        Specify list of ranked documents.

    labels: array, shape = (n_documents,)
        Specify relevance score for each document.

    ranked_labels: array, shape = (n_documents,)
        Relevance scores of the ranked documents. If not given, then
        `ranking` and `labels` must not be None, and `ranked_labels`
        will then be inferred from them.

    scales: float, optional (default is None)
        The ideal DCG value on the given documents. If None is given
        it will be computed from the document relevance scores.
    '''
    if ranked_labels is not None:
        return self.get_score_from_labels_list(ranked_labels)
    elif ranking is not None and labels is not None:
        if ranking.shape[0] != labels.shape[0]:
            raise ValueError('number of ranked documents != number of '
                             'relevance labels (%d, %d)'
                             % (ranking.shape[0], labels.shape[0]))
        ranked_labels = np.array(sorted(labels,
                                        key=dict(zip(labels, ranking)).get,
                                        reverse=True), dtype=np.intc)
        return self.get_score_from_labels_list(ranked_labels)

def _get_partition_indices(start, end, n_jobs):
    '''
    Get boundary indices for ``n_jobs`` number of sub-arrays dividing
    a (contiguous) array of indices starting with ``start`` (inclusive)
    and ending with ``end`` (exclusive) into equal parts.
    '''
    if (end - start) >= n_jobs:
        return np.linspace(start, end, n_jobs + 1).astype(np.intc)
    else:
        return np.arange(end - start + 1, dtype=np.intc)

def save_as_text(self, filepath, shuffle=False):
    '''
    Save queries into the specified file in svmlight format.

    Parameters:
    -----------
    filepath: string
        The filepath where this object will be saved.

    shuffle: bool
        Specify to shuffle the query document lists prior to writing
        into the file.
    '''
    # Inflate the query_ids array such that each id covers
    # the corresponding feature vectors.
    query_ids = np.fromiter(
        chain(*[[qid] * cnt for qid, cnt in zip(self.query_ids,
                                                np.diff(self.query_indptr))]),
        dtype=int)

    relevance_scores = self.relevance_scores
    feature_vectors = self.feature_vectors

    if shuffle:
        shuffle_indices = np.random.permutation(self.document_count())
        reshuffle_indices = np.argsort(query_ids[shuffle_indices])
        document_shuffle_indices = np.arange(
            self.document_count(),
            dtype=np.intc)[shuffle_indices[reshuffle_indices]]

        query_ids = query_ids[document_shuffle_indices]
        relevance_scores = relevance_scores[document_shuffle_indices]
        feature_vectors = feature_vectors[document_shuffle_indices]

    with open(filepath, 'w') as ofile:
        for score, qid, feature_vector in zip(relevance_scores,
                                              query_ids, feature_vectors):
            ofile.write('%d' % score)
            ofile.write(' qid:%d' % qid)
            for feature in zip(self.feature_indices, feature_vector):
                output = ' %d:%.12f' % feature
                ofile.write(output.rstrip('0').rstrip('.'))
            ofile.write('\n')

def get_idxs_thread(comm, npoints):
    """
    Get indices for processor using Scatterv

    Note:
    -----
    Uppercase mpi4py functions require everything to be in C-compatible
    types or they will return garbage!
    """
    size = comm.Get_size()
    rank = comm.Get_rank()

    npoints_thread = np.zeros(size, dtype=np.intc)
    offsets_thread = np.zeros(size, dtype=np.intc)

    for idx in range(size):
        npoints_thread[idx] = npoints // size  # integer division per rank
        offsets_thread[idx] = sum(npoints_thread[:idx])

    for idx in range(npoints % size):
        npoints_thread[idx] += 1
        offsets_thread[idx + 1:] += 1

    npoints_thread = tuple(npoints_thread)
    offsets_thread = tuple(offsets_thread)

    idxs_thread = np.zeros(npoints_thread[rank], dtype=np.intc)
    idxs = np.arange(npoints, dtype=np.intc)

    comm.Scatterv((idxs, npoints_thread, offsets_thread, MPI.INT),
                  idxs_thread, root=0)

    return idxs_thread, npoints_thread, offsets_thread

def get_ravel_offsets(npoints_thread, natoms):
    """
    Get lengths and offsets for gathering trajectory fragments
    """
    size = len(npoints_thread)
    ravel_lengths = np.zeros(size, dtype=np.intc)
    ravel_offsets = np.zeros(size, dtype=np.intc)

    for i in range(size):
        ravel_lengths[i] = npoints_thread[i] * 3 * natoms
        ravel_offsets[i] = sum(ravel_lengths[:i])

    ravel_lengths = tuple(ravel_lengths)
    ravel_offsets = tuple(ravel_offsets)

    return ravel_lengths, ravel_offsets

def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = _make_int_array()
    indptr = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        for feature in analyze(doc):
            try:
                j_indices.append(vocabulary[feature])
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = frombuffer_empty(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = np.ones(len(j_indices))

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sum_duplicates()
    return vocabulary, X

def _count_vocab(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    values = _make_int_array()
    indptr.append(0)
    for doc in raw_documents:
        feature_counter = {}
        for feature in analyze(doc):
            try:
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue

        j_indices.extend(feature_counter.keys())
        values.extend(feature_counter.values())
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = frombuffer_empty(values, dtype=np.intc)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)),
                      dtype=self.dtype)
    X.sort_indices()
    return vocabulary, X

def _count_vocab_2(self, raw_documents, fixed_vocab):
    """Create sparse feature matrix, and vocabulary where fixed_vocab=False
    """
    if fixed_vocab:
        vocabulary = self.vocabulary_
    else:
        # Add a new value when a new vocabulary item is seen
        vocabulary = defaultdict()
        vocabulary.default_factory = vocabulary.__len__

    analyze = self.build_analyzer()
    j_indices = []
    indptr = _make_int_array()
    # values = _make_int_array()
    values = array.array(str("f"))
    indptr.append(0)
    for doc in raw_documents:
        feature_counter = {}
        for feature in analyze(doc):
            try:
                feature_idx = vocabulary[feature]
                if feature_idx not in feature_counter:
                    feature_counter[feature_idx] = 1
                else:
                    feature_counter[feature_idx] += 1
            except KeyError:
                # Ignore out-of-vocabulary items for fixed_vocab=True
                continue

        j_indices.extend(feature_counter.keys())
        values.extend([i * 1.0 / sum(feature_counter.values())
                       for i in feature_counter.values()])
        indptr.append(len(j_indices))

    if not fixed_vocab:
        # disable defaultdict behaviour
        vocabulary = dict(vocabulary)
        if not vocabulary:
            raise ValueError("empty vocabulary; perhaps the documents only"
                             " contain stop words")

    j_indices = np.asarray(j_indices, dtype=np.intc)
    indptr = np.frombuffer(indptr, dtype=np.intc)
    values = frombuffer_empty(values, dtype=np.float32)

    X = sp.csr_matrix((values, j_indices, indptr),
                      shape=(len(indptr) - 1, len(vocabulary)))
    X.sort_indices()
    return vocabulary, X
