We have extracted the following 50 code examples from open-source Python projects to illustrate how to use numpy.isnan().
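Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of what numpy.isnan does: it returns an element-wise boolean mask that is True wherever the input is NaN, and that mask is typically used to drop or replace missing values, exactly as the examples that follow do.

import numpy as np

a = np.array([1.0, np.nan, 3.0])
mask = np.isnan(a)    # array([False,  True, False])
a_clean = a[~mask]    # drop NaN entries          -> array([1., 3.])
a[mask] = 0.0         # or replace them in place  -> array([1., 0., 3.])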
def replace_missing(X):
    # This is ugly, but
    try:
        if X.getformat() == 'csr':
            return X
    except:
        X[np.isnan(X)] = -999.0  # djajetic 05.09.2015
        return X  # djajetic 05.09.2015
    p = len(X)
    nn = len(X[0]) * 2
    XX = np.zeros([p, nn])
    for i in range(len(X)):
        line = X[i]
        line1 = [0 if np.isnan(x) else x for x in line]
        line2 = [1 if np.isnan(x) else 0 for x in line]  # indicator of missingness
        XX[i] = line1 + line2
    return XX
def rhoA(self):
    # rhoA
    rhoA = pd.DataFrame(0, index=np.arange(1), columns=self.latent)

    for i in range(self.lenlatent):
        weights = pd.DataFrame(self.outer_weights[self.latent[i]])
        weights = weights[(weights.T != 0).any()]
        result = pd.DataFrame.dot(weights.T, weights)
        result_ = pd.DataFrame.dot(weights, weights.T)

        S = self.data_[self.Variables['measurement'][
            self.Variables['latent'] == self.latent[i]]]
        S = pd.DataFrame.dot(S.T, S) / S.shape[0]
        numerador = (
            np.dot(np.dot(weights.T, (S - np.diag(np.diag(S)))), weights))
        denominador = (
            (np.dot(np.dot(weights.T, (result_ - np.diag(np.diag(result_)))),
                    weights)))
        rhoA_ = ((result)**2) * (numerador / denominador)
        if(np.isnan(rhoA_.values)):
            rhoA[self.latent[i]] = 1
        else:
            rhoA[self.latent[i]] = rhoA_.values

    return rhoA.T
def get(self, X):
    X = np.array(X)
    X_nan = np.isnan(X)
    imputed = self.meanImput(X.copy())

    if len(self.estimators_) > 1:
        for i, estimator_ in enumerate(self.estimators_):
            X_s = np.delete(imputed, i, 1)
            y_nan = X_nan[:, i]

            X_unk = X_s[y_nan]

            result_ = []
            if len(X_unk) > 0:
                for unk in X_unk:
                    result_.append(estimator_.predict(unk))
                X[y_nan, i] = result_
    return X
def treegauss_remove_row(
        data_row,
        tree_grid,
        latent_row,
        vert_ss,
        edge_ss,
        feat_ss, ):
    # Update sufficient statistics.
    for v in range(latent_row.shape[0]):
        z = latent_row[v, :]
        vert_ss[v, :, :] -= np.outer(z, z)
    for e in range(tree_grid.shape[1]):
        z1 = latent_row[tree_grid[1, e], :]
        z2 = latent_row[tree_grid[2, e], :]
        edge_ss[e, :, :] -= np.outer(z1, z2)
    for v, x in enumerate(data_row):
        if np.isnan(x):
            continue
        z = latent_row[v, :]
        feat_ss[v] -= 1
        feat_ss[v, 1] -= x
        feat_ss[v, 2:] -= x * z  # TODO Use central covariance.
def test_train(self):
    model, fetches_ = self._test_pipeline(tf.contrib.learn.ModeKeys.TRAIN)
    predictions_, loss_, _ = fetches_

    target_len = self.sequence_length + 10 + 2
    max_decode_length = model.params["target.max_seq_len"]
    expected_decode_len = np.minimum(target_len, max_decode_length)

    np.testing.assert_array_equal(predictions_["logits"].shape, [
        self.batch_size, expected_decode_len - 1,
        model.target_vocab_info.total_size
    ])
    np.testing.assert_array_equal(predictions_["losses"].shape,
                                  [self.batch_size, expected_decode_len - 1])
    np.testing.assert_array_equal(predictions_["predicted_ids"].shape,
                                  [self.batch_size, expected_decode_len - 1])
    self.assertFalse(np.isnan(loss_))
def information_ratio(algorithm_returns, benchmark_returns):
    """
    http://en.wikipedia.org/wiki/Information_ratio

    Args:
        algorithm_returns (np.array-like):
            All returns during algorithm lifetime.
        benchmark_returns (np.array-like):
            All benchmark returns during algo lifetime.

    Returns:
        float. Information ratio.
    """
    relative_returns = algorithm_returns - benchmark_returns

    relative_deviation = relative_returns.std(ddof=1)

    if zp_math.tolerant_equals(relative_deviation, 0) or \
            np.isnan(relative_deviation):
        return 0.0

    return np.mean(relative_returns) / relative_deviation
def raw_data_gen(self):
    for dt, series in self.data.iterrows():
        for sid, price in series.iteritems():
            # Skip SIDs that can not be forward filled
            if np.isnan(price) and \
                    sid not in self.started_sids:
                continue
            self.started_sids.add(sid)

            event = {
                'dt': dt,
                'sid': sid,
                'price': price,
                # Just chose something large
                # if no volume available.
                'volume': 1e9,
            }
            yield event
def test_nan_filter_dataframe(self):
    dates = pd.date_range('1/1/2000', periods=2, freq='B', tz='UTC')
    df = pd.DataFrame(np.random.randn(2, 2),
                      index=dates,
                      columns=[4, 5])
    # should be filtered
    df.loc[dates[0], 4] = np.nan
    # should not be filtered, should have been ffilled
    df.loc[dates[1], 5] = np.nan
    source = DataFrameSource(df)
    event = next(source)
    self.assertEqual(5, event.sid)
    event = next(source)
    self.assertEqual(4, event.sid)
    event = next(source)
    self.assertEqual(5, event.sid)
    self.assertFalse(np.isnan(event.price))
def df_type_to_str(i):
    '''
    Convert into simple datatypes from pandas/numpy types
    '''
    if isinstance(i, np.bool_):
        return bool(i)
    if isinstance(i, np.int_):
        return int(i)
    if isinstance(i, np.float):
        if np.isnan(i):
            return 'NaN'
        elif np.isinf(i):
            return str(i)
        return float(i)
    if isinstance(i, np.uint):
        return int(i)
    if type(i) == bytes:
        return i.decode('UTF-8')
    if isinstance(i, (tuple, list)):
        return str(i)
    if i is pd.NaT:  # not identified as a float null
        return 'NaN'
    return str(i)
def calc_reward(self, action=0, state=None, **kw):
    """Calculate the reward for the specified transition."""
    eps1, eps2 = self.eps_values_for_actions[action]
    if state is None:
        state = self.observe()
    if self.logspace:
        T1, T2, T1s, T2s, V, E = 10**state
    else:
        T1, T2, T1s, T2s, V, E = state
    # the reward function penalizes treatment because of side-effects
    reward = -0.1 * V - 2e4 * eps1**2 - 2e3 * eps2**2 + 1e3 * E
    # Constrain reward to be within specified range
    if np.isnan(reward):
        reward = -self.reward_bound
    elif reward > self.reward_bound:
        reward = self.reward_bound
    elif reward < -self.reward_bound:
        reward = -self.reward_bound
    return reward
def to_rgb(img):
    """
    Converts the given array into a RGB image. If the number of channels is
    not 3 the array is tiled such that it has 3 channels. Finally, the values
    are rescaled to [0,255)

    :param img: the array to convert [nx, ny, channels]

    :returns img: the rgb image [nx, ny, 3]
    """
    img = np.atleast_3d(img)
    channels = img.shape[2]
    if channels < 3:
        img = np.tile(img, 3)

    img[np.isnan(img)] = 0
    img -= np.amin(img)
    img /= np.amax(img)
    img *= 255
    return img
def SMA(Series, N, M=1):
    ret = []
    i = 1
    length = len(Series)
    # skip any leading NaN values in the series
    while i < length:
        if np.isnan(Series[i]):
            i += 1
        else:
            break
    preY = Series[i]  # Y'
    ret.append(preY)
    while i < length:
        Y = (M * Series[i] + (N - M) * preY) / float(N)
        ret.append(Y)
        preY = Y
        i += 1
    return pd.Series(ret)
def map(self, data):
    data = data[self.fieldName]

    colors = np.empty((len(data), 4))
    default = np.array(fn.colorTuple(self['Default'])) / 255.
    colors[:] = default

    for v in self.param('Values'):
        mask = data == v.maskValue
        c = np.array(fn.colorTuple(v.value())) / 255.
        colors[mask] = c
    #scaled = np.clip((data-self['Min']) / (self['Max']-self['Min']), 0, 1)
    #cmap = self.value()
    #colors = cmap.map(scaled, mode='float')

    #mask = np.isnan(data) | np.isinf(data)
    #nanColor = self['NaN']
    #nanColor = (nanColor.red()/255., nanColor.green()/255., nanColor.blue()/255., nanColor.alpha()/255.)
    #colors[mask] = nanColor

    return colors
def round_solution_pool(pool, constraints):

    pool.distinct().sort()
    P = pool.P
    L0_reg_ind = np.isnan(constraints['coef_set'].C_0j)
    L0_max = constraints['L0_max']
    rounded_pool = SolutionPool(P)

    for solution in pool.solutions:
        # sort from largest to smallest coefficients
        feature_order = np.argsort([-abs(x) for x in solution])
        rounded_solution = np.zeros(shape=(1, P))
        l0_norm_count = 0
        for k in range(0, P):
            j = feature_order[k]
            if not L0_reg_ind[j]:
                rounded_solution[0, j] = np.round(solution[j], 0)
            elif l0_norm_count < L0_max:
                rounded_solution[0, j] = np.round(solution[j], 0)
                l0_norm_count += L0_reg_ind[j]

        rounded_pool.add(objvals=np.nan, solutions=rounded_solution)

    rounded_pool.distinct().sort()
    return rounded_pool
def checkFSXvalsAgainstADNIMERGE(tadpoleDF, mriADNI1FileFSX, otherSSvisCodeStr,
                                 ssNameTag, ignoreMissingCols=False):

    nrRows, nrCols = tadpoleDF.shape
    colListOtherSS = list(ssDF.columns.values)
    colListTadpoleDF = list(tadpoleDF.columns.values)

    tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]] = \
        tadpoleDF[['Hippocampus', 'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag]].apply(
            pd.to_numeric, errors='coerce')
    tadpoleDF['HIPPOSUM'] = tadpoleDF['ST29SV%s' % ssNameTag] + tadpoleDF['ST88SV%s' % ssNameTag]

    for r in range(nrRows):
        valsNan = np.isnan(tadpoleDF['Hippocampus'][r]) or \
            (np.isnan(tadpoleDF['ST29SV%s' % ssNameTag][r]) and
             np.isnan(tadpoleDF['ST88SV%s' % ssNameTag][r]))
        if valsNan:
            continue
        valsNotEq = tadpoleDF['Hippocampus'][r] != \
            (tadpoleDF['ST29SV%s' % ssNameTag][r] + tadpoleDF['ST88SV%s' % ssNameTag][r])
        if valsNotEq:
            print('entries dont match\n ', tadpoleDF[['RID', 'VISCODE', 'Hippocampus',
                  'ST29SV%s' % ssNameTag, 'ST88SV%s' % ssNameTag, 'HIPPOSUM']].iloc[r])

    # Conclusion: the reason why entries above don't match is because UCSFFSX has
    # duplicate entries for the same subject and viscode.
def test_hz():
    """Test the hz function."""
    df, _ = readSC()
    for (teff, logg, mass) in df.loc[:, ['teff', 'logg', 'mass']].values:
        lum = (teff / 5777)**4 * (mass / ((10**logg) / (10**4.44)))**2
        assert isinstance(hz(teff, lum, model=2), float)
        assert isinstance(hz(teff, lum, model=4), float)

    teff = 5777
    lum = 1
    invalids = [{teff: lum}, [teff, lum], (teff, lum), "..."]
    for model in range(1, 6):
        assert isinstance(hz(teff, lum, model), float)
    results = [0.75, 0.98, 0.99, 1.71, 1.77]
    for model, result in enumerate(results, start=1):
        assert round(hz(teff, lum, model), 2) == result
    for invalid in invalids:
        assert np.isnan(hz(invalid, lum, model))
        assert np.isnan(hz(teff, invalid, model))
    assert hz(teff, lum, 2) < hz(teff, lum, 4)  # hz1 < hz2
def generateWekaFile(X, Y, features, path, name):
    f = open(path + name + '.arff', 'w')
    f.write("@relation '" + name + "'\n\n")

    for feat in features:
        f.write("@attribute " + feat + " numeric\n")
    f.write("@attribute cluster {True,False}\n\n")
    f.write("@data\n\n")

    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            if np.isnan(X[i, j]):
                f.write("?,")
            else:
                f.write(str(X[i, j]) + ",")
        if Y[i] == 1.0 or Y[i] == True:
            f.write("True\n")
        else:
            f.write("False\n")
    f.close()
def test_posterior_zeros(self):
    p = np.asarray([.5, 0., 0.]).reshape((1, 3))

    posterior = self.eval(self.posterior, p)
    print 'posterior', posterior
    posterior_grad = self.eval(self.posterior_grad, p)
    print 'posterior grad', posterior_grad

    kl = self.eval(self.posterior_kl, p)
    print kl
    self.assertGreater(kl.sum(), 0)
    self.assertFalse(np.isnan(kl).any())
    self.assertTrue(np.isfinite(kl).all())

    grad = self.eval(self.posterior_kl_grad, p)
    print grad
    self.assertFalse(np.isnan(grad).any())
    self.assertTrue(np.isfinite(grad).all())
def update_summary(
        var_up,
        var,
        start,
        end, ):
    diff = np.abs(var_up - var)
    reldiff = diff / var
    # filter out nan's
    try:
        reldiff = reldiff[~np.isnan(reldiff)]
    except:
        pass
    return (np.mean(diff), np.std(diff), np.mean(reldiff), np.std(reldiff),
            (end - start).microseconds)
def test_bootstrap_replicate_1d(data, seed):
    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.mean)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.mean)
    assert (np.isnan(x) and np.isnan(x_correct)) \
        or np.isclose(x, x_correct, atol=atol, equal_nan=True)

    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.median)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.median)
    assert (np.isnan(x) and np.isnan(x_correct)) \
        or np.isclose(x, x_correct, atol=atol, equal_nan=True)

    np.random.seed(seed)
    x = dcst.bootstrap_replicate_1d(data, np.std)
    np.random.seed(seed)
    x_correct = original.bootstrap_replicate_1d(data[~np.isnan(data)], np.std)
    assert (np.isnan(x) and np.isnan(x_correct)) \
        or np.isclose(x, x_correct, atol=atol, equal_nan=True)
def nan_helper(y):
    """Helper to handle indices and logical indices of NaNs.

    Input:
        - y, 1d numpy array with possible NaNs
    Output:
        - nans, logical indices of NaNs
        - index, a function, with signature indices = index(logical_indices),
          to convert logical indices of NaNs to 'equivalent' indices
    Example:
        >>> # linear interpolation of NaNs
        >>> nans, x = nan_helper(y)
        >>> y[nans] = NP.interp(x(nans), x(~nans), y[~nans])
    """
    # Source: http://stackoverflow.com/questions/6518811/interpolate-nan-values-in-a-numpy-array
    return NP.isnan(y), lambda z: z.nonzero()[0]
def step4():
    key_vec = pickle.loads(open("key_vec.pkl", "rb").read())
    vecs = []
    for ev, vec in enumerate(key_vec.values()):
        x = np.array(vec)
        if np.isnan(x).any():
            # print(vec)
            continue
        vecs.append(x)
    vecs = np.array(vecs)
    kmeans = KMeans(n_clusters=128, init='k-means++', n_init=10, max_iter=300,
                    tol=0.0001, precompute_distances='auto', verbose=0,
                    random_state=None, copy_x=True, n_jobs=1)
    print("now fitting...")
    kmeans.fit(vecs)

    open("kmeans.model", "wb").write(pickle.dumps(kmeans))

    for p in kmeans.predict(vecs):
        print(p)
def _step5(arr):
    kmeans = pickle.loads(open("kmeans.model", "rb").read())
    key, lines, tipe = arr
    print(key)
    open("./tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key), "w").write("\n".join(lines))
    res = os.popen("./fasttext print-sentence-vectors ./models/model.bin < tmp/tmp.{tipe}.{key}.txt".format(tipe=tipe, key=key)).read()
    w = open("tmp/tmp.{tipe}.{key}.json".format(tipe=tipe, key=key), "w")
    for line in res.split("\n"):
        try:
            vec = list(map(float, line.split()[-100:]))
        except:
            print(line)
            print(res)
            continue
        x = np.array(vec)
        if np.isnan(x).any():
            continue
        cluster = kmeans.predict([vec])
        txt = line.split()[:-100]
        obj = {"txt": txt, "cluster": cluster.tolist()}
        data = json.dumps(obj, ensure_ascii=False)
        w.write(data + "\n")
def test_lm(self):
    hps = get_test_hparams()

    with tf.variable_scope("model"):
        model = LM(hps)

    with self.test_session() as sess:
        tf.initialize_all_variables().run()
        tf.initialize_local_variables().run()

        loss = 1e5
        for i in range(50):
            x, y, w = simple_data_generator(hps.batch_size, hps.num_steps)
            loss, _ = sess.run([model.loss, model.train_op],
                               {model.x: x, model.y: y, model.w: w})
            print("%d: %.3f %.3f" % (i, loss, np.exp(loss)))
            if np.isnan(loss):
                print("NaN detected")
                break

        self.assertLess(loss, 1.0)
def get_series_median_peryear(word_time_series, i_year_words, one_minus=False,
                              start_year=1900, end_year=2000, year_inc=10,
                              exclude_partial_missing=False):
    """
    Return the median array for the values of the words specified
    per year in i_year_words for the specified years.
    """
    medians = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.iteritems():
            if not np.isnan(np.sum(time_series.values())):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in xrange(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year]
                               for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])
                               and not r_word_time_series[word][year] == 0])
        if len(word_array) == 0:
            continue
        if one_minus:
            word_array = 1 - word_array
        medians.append(np.median(word_array))
    return np.array(medians)
def get_series_mean_std_peryear(word_time_series, i_year_words, one_minus=False,
                                start_year=1900, end_year=2000, year_inc=1,
                                exclude_partial_missing=False):
    """
    Return the mean and stderr arrays for the values of the words specified
    per year in i_year_words for the specified years.
    """
    means = []
    stderrs = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.iteritems():
            if not np.isnan(np.sum(time_series.values())):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in xrange(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year]
                               for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])
                               and not np.isinf(r_word_time_series[word][year])])
        if len(word_array) == 0:
            continue
        if one_minus:
            word_array = 1 - word_array
        means.append(word_array.mean())
        stderrs.append(word_array.std())
    return np.array(means), np.array(stderrs)
def get_series_mean_stderr_peryear(word_time_series, i_year_words, one_minus=False,
                                   start_year=1900, end_year=2000, year_inc=1,
                                   exclude_partial_missing=False):
    """
    Return the mean and stderr arrays for the values of the words specified
    per year in i_year_words for the specified years.
    """
    means = []
    stderrs = []
    r_word_time_series = {}
    if exclude_partial_missing:
        for word, time_series in word_time_series.iteritems():
            time_series = {year: val for year, val in time_series.iteritems()
                           if year >= start_year and year <= end_year}
            if not np.isnan(np.sum(time_series.values())):
                r_word_time_series[word] = time_series
    else:
        r_word_time_series = word_time_series
    for year in xrange(start_year, end_year + 1, year_inc):
        word_array = np.array([r_word_time_series[word][year]
                               for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])])
        if one_minus:
            word_array = 1 - word_array
        means.append(word_array.mean())
        stderrs.append(word_array.std() / len(word_array))
    return np.array(means), np.array(stderrs)
def get_yearly_set_dev(series, i_year_words, one_minus=False, start_year=1900,
                       end_year=2000, method='diff'):
    """
    Gets the mean relative deviation of the words in words vs. the full series.
    """
    base_mat = _make_series_mat(series, series.keys(), one_minus=one_minus,
                                start_year=start_year, end_year=end_year)
    means = []
    stderrs = []
    r_word_time_series = series
    for year in xrange(start_year, end_year + 1):
        word_array = np.array([r_word_time_series[word][year]
                               for word in i_year_words[year]
                               if word in r_word_time_series
                               and not np.isnan(r_word_time_series[word][year])])
        if one_minus:
            word_array = 1 - word_array
        if method == 'diff':
            word_array = word_array - base_mat.mean(0)[year - start_year]
        elif method == 'ratio':
            word_array = word_array / base_mat.mean(0)[year - start_year]
        else:
            raise RuntimeError("Unknown deviation method. Use diff or ratio.")
        means.append(word_array.mean())
        stderrs.append(word_array.std() / len(word_array))
    return np.array(means), np.array(stderrs)
def log_likelihood(self, data):
    nks = np.bincount(self.labels_, minlength=self.n_clusters)  # number of points in each cluster
    n, d = data.shape
    log_likelihood = 0
    covar_matrices = self.covariances(self.labels_, cluster_centers=self.cluster_centers_, data=data)
    covar_matrix_det_v = np.linalg.det(covar_matrices)
    self._inv_covar_matrices = self._matrix_inverses(covar_matrices)
    for k, nk in enumerate(nks):
        if self.verbose == 1:
            print('log_likelihood: covar_matrix_det = {}'.format(covar_matrix_det_v[k]))
        term_1 = nk * (np.log(float(nk) / n) - 0.5 * d * np.log(2 * np.pi) -
                       0.5 * np.log(abs(covar_matrix_det_v[k])))
        cdist_result = cdist(data[self.labels_ == k],
                             np.array([self.cluster_centers_[k]]),
                             metric='mahalanobis',
                             VI=self._inv_covar_matrices[k])
        cdist_no_nan = cdist_result[~np.isnan(cdist_result)]  # to deal with nans returned by cdist
        term_2 = -0.5 * (np.sum(cdist_no_nan))
        k_sum = term_1 + term_2
        log_likelihood += k_sum
    if np.isnan(log_likelihood) or log_likelihood == float('inf'):
        raise Exception('ll is nan or inf')
    return log_likelihood
def test_alpha(self, returns, benchmark, expected):
    observed = self.empyrical.alpha(returns, benchmark)
    assert_almost_equal(
        observed,
        expected,
        DECIMAL_PLACES)

    if len(returns) == len(benchmark):
        # Compare to scipy linregress
        returns_arr = returns.values
        benchmark_arr = benchmark.values
        mask = ~np.isnan(returns_arr) & ~np.isnan(benchmark_arr)
        slope, intercept, _, _, _ = stats.linregress(benchmark_arr[mask],
                                                     returns_arr[mask])

        assert_almost_equal(
            observed,
            intercept * 252,
            DECIMAL_PLACES
        )

# Alpha/beta translation tests.
def test_beta(self, returns, benchmark, expected):
    observed = self.empyrical.beta(returns, benchmark)
    assert_almost_equal(
        observed,
        expected,
        DECIMAL_PLACES)

    if len(returns) == len(benchmark):
        # Compare to scipy linregress
        returns_arr = returns.values
        benchmark_arr = benchmark.values
        mask = ~np.isnan(returns_arr) & ~np.isnan(benchmark_arr)
        slope, intercept, _, _, _ = stats.linregress(benchmark_arr[mask],
                                                     returns_arr[mask])

        assert_almost_equal(
            observed,
            slope
        )
def strategy(data, params):
    """
    Stack overlapping intervals.

    Assumes that each set has the same horizontal position.
    """
    vjust = params['vjust']

    y = data['y'].copy()
    y[np.isnan(y)] = 0
    heights = np.append(0, y.cumsum())

    if params['fill']:
        heights = heights / np.abs(heights[-1])

    data['ymin'] = np.min([heights[:-1], heights[1:]], axis=0)
    data['ymax'] = np.max([heights[:-1], heights[1:]], axis=0)
    # less intuitive than (ymin + vjust(ymax-ymin)), but
    # this way avoids subtracting numbers of potentially
    # similar precision
    data['y'] = ((1 - vjust) * data['ymin'] + vjust * data['ymax'])
    return data
def _find_index(bg_df, start_date, end_date, make_col_bool):
    if (make_col_bool):
        # create column with just the date if make_col_bool is True
        bg_df['date'] = bg_df['created_at'].apply(lambda x: x.date())

    # Find the first date with the start date (first entry) and the last date with the end date (last entry)
    # Since the older dates have higher indices, we use max() for start and min() for the end dates
    start_index = bg_df[bg_df['date'] == start_date.date()].index.max()
    end_index = bg_df[bg_df['date'] == end_date.date()].index.min()

    # Raises exception if invalid dates (which are labeled as NaN)
    if np.isnan(start_index):
        raise Exception("Invalid start date: " + str(start_date.date()))
    if np.isnan(end_index):
        raise Exception("Invalid end date: " + str(end_date.date()))

    return bg_df, start_index, end_index


# Function to get the bg data
def plot_heatmaps(data, mis, column_label, cont, topk=30, prefix=''):
    cmap = sns.cubehelix_palette(as_cmap=True, light=.9)
    m, nv = mis.shape
    for j in range(m):
        inds = np.argsort(- mis[j, :])[:topk]
        if len(inds) >= 2:
            plt.clf()
            order = np.argsort(cont[:, j])
            subdata = data[:, inds][order].T
            subdata -= np.nanmean(subdata, axis=1, keepdims=True)
            subdata /= np.nanstd(subdata, axis=1, keepdims=True)
            columns = [column_label[i] for i in inds]
            sns.heatmap(subdata, vmin=-3, vmax=3, cmap=cmap, yticklabels=columns,
                        xticklabels=False, mask=np.isnan(subdata))
            filename = '{}/heatmaps/group_num={}.png'.format(prefix, j)
            if not os.path.exists(os.path.dirname(filename)):
                os.makedirs(os.path.dirname(filename))
            plt.title("Latent factor {}".format(j))
            plt.yticks(rotation=0)
            plt.savefig(filename, bbox_inches='tight')
            plt.close('all')
            #plot_rels(data[:, inds], map(lambda q: column_label[q], inds), colors=cont[:, j],
            #          outfile=prefix + '/relationships/group_num=' + str(j), latent=labels[:, j], alpha=0.1)
def write_data(self, result_dict):
    for key, result in six.iteritems(result_dict):
        if ss.isspmatrix(result):
            if np.isnan(result.data).any():
                raise ValueError("data {} have nan".format(key))
        elif np.isnan(result).any():
            raise ValueError("data {} have nan".format(key))
        with SimpleTimer("Writing generated data {} to hdf5 file".format(key),
                         end_in_new_line=False):
            if key in self.h5f:
                # self.h5f[key][...] = result
                raise NotImplementedError("Overwriting not supported.")
            else:
                if (isinstance(result, ss.csc_matrix)
                        or isinstance(result, ss.csr_matrix)):
                    # sparse matrix
                    h5sparse.Group(self.h5f).create_dataset(key, data=result)
                else:
                    self.h5f.create_dataset(key, data=result)
    self.h5f.flush()
def repeat_until_convergence(labelled_data, labelled_clusters, unlabelled_centroids):
    # find best fitting centroids to the labelled_data
    previous_max_difference = 0
    while True:
        unlabelled_old_centroids = unlabelled_centroids
        unlabelled_centroids = move_centroids(labelled_clusters)
        labelled_clusters = form_clusters(labelled_data, unlabelled_centroids)

        differences = list(map(lambda a, b: np.linalg.norm(a - b),
                               unlabelled_old_centroids, unlabelled_centroids))
        max_difference = max(differences)

        if np.isnan(max_difference - previous_max_difference):
            difference_change = np.nan
        else:
            difference_change = abs((max_difference - previous_max_difference) /
                                    np.mean([previous_max_difference, max_difference])) * 100

        previous_max_difference = max_difference
        # difference change is nan once the list of differences is all zeroes.
        if np.isnan(difference_change):
            break
    return labelled_clusters, unlabelled_centroids
def loadData(self, filename, verbose=True, replace_missing=True):
    ''' Get the data from a text file in one of 3 formats: matrix, sparse, binary_sparse'''
    if verbose:
        print("========= Reading " + filename)
    start = time.time()
    if self.use_pickle and os.path.exists(os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle")):
        with open(os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"), "r") as pickle_file:
            vprint(verbose, "Loading pickle file : " + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
            return pickle.load(pickle_file)
    if 'format' not in self.info.keys():
        self.getFormatData(filename)
    if 'feat_num' not in self.info.keys():
        self.getNbrFeatures(filename)

    data_func = {'dense': data_io.data,
                 'sparse': data_io.data_sparse,
                 'sparse_binary': data_io.data_binary_sparse}

    data = data_func[self.info['format']](filename, self.info['feat_num'])

    # IMPORTANT: when we replace missing values we double the number of variables
    if self.info['format'] == 'dense' and replace_missing and np.any(map(np.isnan, data)):
        vprint(verbose, "Replace missing values by 0 (slow, sorry)")
        data = data_converter.replace_missing(data)

    if self.use_pickle:
        with open(os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"), "wb") as pickle_file:
            vprint(verbose, "Saving pickle file : " + os.path.join(self.tmp_dir, os.path.basename(filename) + ".pickle"))
            p = pickle.Pickler(pickle_file)
            p.fast = True
            p.dump(data)

    end = time.time()
    if verbose:
        print("[+] Success in %5.2f sec" % (end - start))
    return data
def sanitize_array(array):
    ''' Replace NaN and Inf (there should not be any!)'''
    a = np.ravel(array)
    maxi = np.nanmax((filter(lambda x: x != float('inf'), a)))   # Max except NaN and Inf
    mini = np.nanmin((filter(lambda x: x != float('-inf'), a)))  # Mini except NaN and Inf
    array[array == float('inf')] = maxi
    array[array == float('-inf')] = mini
    mid = (maxi + mini) / 2
    array[np.isnan(array)] = mid
    return array
def htmt(self):
    htmt_ = pd.DataFrame(pd.DataFrame.corr(self.data_),
                         index=self.manifests, columns=self.manifests)

    mean = []
    allBlocks = []
    for i in range(self.lenlatent):
        block_ = self.Variables['measurement'][
            self.Variables['latent'] == self.latent[i]]
        allBlocks.append(list(block_.values))
        block = htmt_.ix[block_, block_]
        mean_ = (block - np.diag(np.diag(block))).values
        mean_[mean_ == 0] = np.nan
        mean.append(np.nanmean(mean_))

    comb = [[k, j] for k in range(self.lenlatent)
            for j in range(self.lenlatent)]

    comb_ = [(np.sqrt(mean[comb[i][1]] * mean[comb[i][0]]))
             for i in range(self.lenlatent ** 2)]

    comb__ = []
    for i in range(self.lenlatent ** 2):
        block = (htmt_.ix[allBlocks[comb[i][1]],
                          allBlocks[comb[i][0]]]).values
        # block[block == 1] = np.nan
        comb__.append(np.nanmean(block))

    htmt__ = np.divide(comb__, comb_)
    where_are_NaNs = np.isnan(htmt__)
    htmt__[where_are_NaNs] = 0

    htmt = pd.DataFrame(np.tril(htmt__.reshape(
        (self.lenlatent, self.lenlatent)), k=-1),
        index=self.latent, columns=self.latent)

    return htmt
def get_cubic_root(self):
    # We have the equation x^2 D^2 + (1-x)^4 * C / h_min^2
    # where x = sqrt(mu).
    # We substitute x, which is sqrt(mu), with x = y + 1.
    # It gives y^3 + py = q
    # where p = (D^2 h_min^2)/(2*C) and q = -p.
    # We use Vieta's substitution to compute the root.
    # There is only one real solution y (which is in [0, 1]).
    # http://mathworld.wolfram.com/VietasSubstitution.html
    # eps in the numerator is to prevent momentum = 1 in case of zero gradient
    if np.isnan(self._dist_to_opt) or np.isnan(self._h_min) or np.isnan(self._grad_var) \
            or np.isinf(self._dist_to_opt) or np.isinf(self._h_min) or np.isinf(self._grad_var):
        logging.warning("Input to cubic solver has invalid nan/inf value!")
        raise Exception("Input to cubic solver has invalid nan/inf value!")

    p = (self._dist_to_opt + eps)**2 * (self._h_min + eps)**2 / 2 / (self._grad_var + eps)
    w3 = (-math.sqrt(p**2 + 4.0 / 27.0 * p**3) - p) / 2.0
    w = math.copysign(1.0, w3) * math.pow(math.fabs(w3), 1.0 / 3.0)
    y = w - p / 3.0 / (w + eps)
    x = y + 1

    if self._verbose:
        logging.debug("p %f, denominator %f", p, self._grad_var + eps)
        logging.debug("w3 %f ", w3)
        logging.debug("y %f, denominator %f", y, w + eps)

    if np.isnan(x) or np.isinf(x):
        logging.warning("Output from cubic is invalid nan/inf value!")
        raise Exception("Output from cubic is invalid nan/inf value!")

    return x
def treegauss_add_row(
        data_row,
        tree_grid,
        program,
        latent_row,
        vert_ss,
        edge_ss,
        feat_ss, ):
    # Sample latent state using dynamic programming.
    TODO('https://github.com/posterior/treecat/issues/26')

    # Update sufficient statistics.
    for v in range(latent_row.shape[0]):
        z = latent_row[v, :]
        vert_ss[v, :, :] += np.outer(z, z)
    for e in range(tree_grid.shape[1]):
        z1 = latent_row[tree_grid[1, e], :]
        z2 = latent_row[tree_grid[2, e], :]
        edge_ss[e, :, :] += np.outer(z1, z2)
    for v, x in enumerate(data_row):
        if np.isnan(x):
            continue
        z = latent_row[v, :]
        feat_ss[v] += 1
        feat_ss[v, 1] += x
        feat_ss[v, 2:] += x * z  # TODO Use central covariance.
def imputeSNPs(X):
    snpsMean = np.nanmean(X, axis=0)
    isNan = np.isnan(X)
    for i, m in enumerate(snpsMean):
        X[isNan[:, i], i] = m
    return X
def __call__(self, *args, **kwargs):
    assert len(args) <= len(self.inputs), "Too many arguments provided"
    feed_dict = {}
    # Update the args
    for inpt, value in zip(self.inputs, args):
        self._feed_input(feed_dict, inpt, value)
    # Update the kwargs
    kwargs_passed_inpt_names = set()
    for inpt in self.inputs[len(args):]:
        inpt_name = inpt.name.split(':')[0]
        inpt_name = inpt_name.split('/')[-1]
        assert inpt_name not in kwargs_passed_inpt_names, \
            "this function has two arguments with the same name \"{}\", so kwargs cannot be used.".format(inpt_name)
        if inpt_name in kwargs:
            kwargs_passed_inpt_names.add(inpt_name)
            self._feed_input(feed_dict, inpt, kwargs.pop(inpt_name))
        else:
            assert inpt in self.givens, "Missing argument " + inpt_name
    assert len(kwargs) == 0, "Function got extra arguments " + str(list(kwargs.keys()))
    # Update feed dict with givens.
    for inpt in self.givens:
        feed_dict[inpt] = feed_dict.get(inpt, self.givens[inpt])
    results = get_session().run(self.outputs_update, feed_dict=feed_dict)[:-1]
    if self.check_nan:
        if any(np.isnan(r).any() for r in results):
            raise RuntimeError("Nan detected")
    return results
def test_gradients(self):
    inputs = tf.random_normal(
        [self.batch_size, self.sequence_length, self.input_depth])
    seq_length = tf.ones(self.batch_size, dtype=tf.int32) * self.sequence_length
    labels = np.random.randint(0, self.vocab_size,
                               [self.batch_size, self.sequence_length])

    helper = decode_helper.TrainingHelper(
        inputs=inputs, sequence_length=seq_length)
    decoder_fn = self.create_decoder(
        helper=helper, mode=tf.contrib.learn.ModeKeys.TRAIN)
    initial_state = decoder_fn.cell.zero_state(
        self.batch_size, dtype=tf.float32)
    decoder_output, _ = decoder_fn(initial_state, helper)

    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(
        logits=decoder_output.logits, labels=labels)
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    grads_and_vars = optimizer.compute_gradients(tf.reduce_mean(losses))

    #pylint: disable=E1101
    with self.test_session() as sess:
        sess.run(tf.global_variables_initializer())
        grads_and_vars_ = sess.run(grads_and_vars)

    for grad, _ in grads_and_vars_:
        self.assertFalse(np.isnan(grad).any())

    return grads_and_vars_
def frame_to_series(self, field, frame, columns=None):
    """
    Convert a frame with a DatetimeIndex and sid columns into a series with
    a sid index, using the aggregator defined by the given field.
    """
    if isinstance(frame, pd.DataFrame):
        columns = frame.columns
        frame = frame.values

    if not len(frame):
        return pd.Series(
            data=(0 if field == 'volume' else np.nan),
            index=columns,
        ).values

    if field in ['price', 'close']:
        # shortcircuit for full last row
        vals = frame[-1]
        if np.all(~np.isnan(vals)):
            return vals
        return ffill(frame)[-1]
    elif field == 'open':
        return bfill(frame)[0]
    elif field == 'volume':
        return np.nansum(frame, axis=0)
    elif field == 'high':
        return np.nanmax(frame, axis=0)
    elif field == 'low':
        return np.nanmin(frame, axis=0)
    else:
        raise ValueError("Unknown field {}".format(field))
def update_last_known_values(self):
    """
    Store the non-NaN values from our oldest frame in each frequency.
    """
    ffillable = self.ffillable_fields
    if not len(ffillable):
        return

    for frequency in self.unique_frequencies:
        digest_panel = self.digest_panels.get(frequency, None)
        if digest_panel:
            oldest_known_values = digest_panel.oldest_frame(raw=True)
        else:
            oldest_known_values = self.buffer_panel.oldest_frame(raw=True)

        oldest_vals = oldest_known_values
        oldest_columns = self.fields
        for field in ffillable:
            f_idx = oldest_columns.get_loc(field)
            field_vals = oldest_vals[f_idx]
            # isnan would be fast, possible to use?
            non_nan_sids = np.where(pd.notnull(field_vals))
            key = (frequency.freq_str, field)
            key_loc = self.last_known_prior_values.index.get_loc(key)
            self.last_known_prior_values.values[
                key_loc, non_nan_sids
            ] = field_vals[non_nan_sids]
def check_entry(key, value):
    if key != 'period_label':
        return np.isnan(value) or np.isinf(value)
    else:
        return False


############################
# Risk Metric Calculations #
############################
def _compute_asset_lifetimes(self):
    """
    Compute and cache a recarray of asset lifetimes.
    """
    equities_cols = self.equities.c
    buf = np.array(
        tuple(
            sa.select((
                equities_cols.sid,
                equities_cols.start_date,
                equities_cols.end_date,
            )).execute(),
        ),
        dtype='<f8',  # use doubles so we get NaNs
    )
    lifetimes = np.recarray(
        buf=buf,
        shape=(len(buf),),
        dtype=[
            ('sid', '<f8'),
            ('start', '<f8'),
            ('end', '<f8')
        ],
    )
    start = lifetimes.start
    end = lifetimes.end
    start[np.isnan(start)] = 0  # convert missing starts to 0
    end[np.isnan(end)] = np.iinfo(int).max  # convert missing end to INTMAX
    # Cast the results back down to int.
    return lifetimes.astype([
        ('sid', '<i8'),
        ('start', '<i8'),
        ('end', '<i8'),
    ])
def _compute(self, arrays, dates, assets, mask):
    data = arrays[0]
    bins = self.params['bins']
    to_bin = where(mask, data, nan)
    result = quantiles(to_bin, bins)
    # Write self.missing_value into nan locations, whether they were
    # generated by our input mask or not.
    result[isnan(result)] = self.missing_value
    return result.astype(int64_dtype)