Python pandas module: read_csv() example source code

We extracted the following 50 code examples from open-source Python projects to illustrate how to use pandas.read_csv().
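
Before the project examples, here is a minimal sketch of the read_csv parameters that recur throughout them; the file names and the process() helper are placeholder assumptions, not code from any of the projects below:

import pandas as pd

# basic read: comma-separated, first row used as the header
df = pd.read_csv("data.csv", sep=",", header=0)

# tab-separated table with a parsed datetime index
df = pd.read_csv("data.tsv", sep="\t", index_col="time", parse_dates=True)

# read only selected columns and treat 'ND' as missing
df = pd.read_csv("data.csv", usecols=["date", "close"], na_values=["ND"])

# stream a large file in chunks of 10,000 rows
for chunk in pd.read_csv("big_table.tsv", sep="\t", chunksize=10000):
    process(chunk)  # placeholder for per-chunk work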

Project: KATE    Author: hugochan    | Project source | File source
def calc_word_sim(model, eval_file):
    df = pd.read_csv(eval_file, sep=',', header=0) # eval dataset
    col1, col2, score = df.columns.values
    model_vocab = model.vocab.keys()
    ground = []
    sys = []
    for idx, row in df.iterrows():
        if row[col1] in model_vocab and row[col2] in model_vocab:
            ground.append(float(row[score]))
            sys.append(model.similarity(row[col1], row[col2]))

    # compute Spearman's rank correlation coefficient (https://en.wikipedia.org/wiki/Spearman%27s_rank_correlation_coefficient)
    print sys
    # import pdb;pdb.set_trace()
    corr, p_val = stats.spearmanr(sys, ground)
    logger.info("# of pairs found: %s / %s" % (len(ground), len(df)))
    logger.info("correlation: %s" % corr)
    return corr, p_val
Project: rca-evaluation    Author: sieve-microservices    | Project source | File source
def draw(path, srv):
     filename = os.path.join(path, srv["preprocessed_filename"])
     df = pd.read_csv(filename, sep="\t", index_col='time', parse_dates=True)
     bins = defaultdict(list)
     for i, col in enumerate(df.columns):
         serie = df[col].dropna()
         if pd.algos.is_monotonic_float64(serie.values, False)[0]:
             serie = serie.diff()[1:]
         p_value = adfuller(serie, autolag='AIC')[1]
         if math.isnan(p_value): continue
         nearest = 0.05 * round(p_value/0.05)
         bins[nearest].append(serie)
     for bin, members in bins.items():
         series = [serie.name for serie in members]
         if len(members) <= 10:
             columns = series
         else:
             columns = random.sample(series, 10)

         subset = df[columns]
         name = "%s_adf_confidence_%.2f.png" % (srv["name"], bin)
         print(name)
         axes = subset.plot(subplots=True)
         plt.savefig(os.path.join(path, name))
         plt.close("all")
Project: pyaddepar    Author: lobnek    | Project source | File source
def test_addepar2frame(self):
        r = {'meta': {'columns': [{'key': 'node_id', 'display_name': 'Entity ID', 'output_type': 'Word'},
                                  {'key': '_custom_13_custodian_name_166730', 'display_name': '15. Custodian Name', 'output_type': 'Word'},
                                  {'key': '_custom_15_reference_currency_165485', 'display_name': '17. Reference Currency', 'output_type': 'Currency'},
                                  {'key': '_custom_16_lwm_risk_profile_114480', 'display_name': '18. LWM Risk Profile', 'output_type': 'Word'},
                                  {'key': '_custom_23_lwm_aum_type_293536', 'display_name': '23. LWM - AUM Type', 'output_type': 'Word'},
                                  {'key': 'inception_event_date', 'display_name': 'Inception Date', 'output_type': 'Date'}],
                      'groupings': [{'key': 'top_level_owner', 'display_name': 'Top Level Owner'}]},
             'data': {'type': 'portfolio_views', 'attributes':
                 {'total': {'name': 'Total', 'columns':
                                {'_custom_15_reference_currency_165485': None, 'inception_event_date': '2013-12-31', '_custom_23_lwm_aum_type_293536': None, '_custom_16_lwm_risk_profile_114480': None, '_custom_13_custodian_name_166730': None, 'node_id': None},
                            'children': [{'entity_id': 1146188, 'name': 'A', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2016-10-31', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Balanced', '_custom_13_custodian_name_166730': 'X', 'node_id': 1146188}, 'children': []},
                                         {'entity_id': 1231399, 'name': 'B', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2016-09-21', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Balanced', '_custom_13_custodian_name_166730': 'Y', 'node_id': 1231399}, 'children': []},
                                         {'entity_id': 1511499, 'name': 'C', 'grouping': 'top_level_owner', 'columns': {'_custom_15_reference_currency_165485': 'CHF', 'inception_event_date': '2017-03-31', '_custom_23_lwm_aum_type_293536': 'LWM Consolidation Only', '_custom_16_lwm_risk_profile_114480': 'Conservative', '_custom_13_custodian_name_166730': 'Z', 'node_id': 1511499}, 'children': []},
                                        ]}}, 'links': {'self': '/v1/portfolio_views/null'}}}

        pdt.assert_frame_equal(addepar2frame(r), pd.read_csv("/pyaddepar/test/resources/frame.csv", parse_dates=True), check_dtype=False)
Project: deep-summarization    Author: harpribot    | Project source | File source
def generate_vocabulary(self, review_summary_file):
        """

        :param review_summary_file:
        :return:
        """
        self.rev_sum_pair = pd.read_csv(review_summary_file, header=0).values

        for review,summary in self.rev_sum_pair:
            rev_lst = wordpunct_tokenize(review)
            sum_lst = wordpunct_tokenize(summary)
            self.__add_list_to_dict(rev_lst)
            self.__add_list_to_dict(sum_lst)

        # Now store the "" empty string as the last word of the voacabulary
        self.map[""] = len(self.map)
        self.revmap[len(self.map)] = ""
Project: kiteHistory    Author: mr-karan    | Project source | File source
def plot_csv(stock_data, symbol):
    """
    params:
        - stock_data(list) : list of dict objects containing stock data
        - symbol(str) : stock symbol; used to name the CSV read/written and the HTML output.
    """

    try:
        df = pd.read_csv('{}.csv'.format(symbol))

    except:
        write_to_csv(stock_data, symbol)
        df = pd.read_csv('{}.csv'.format(symbol))

    p1 = figure(x_axis_type="datetime", title="Stock Closing Price")
    p1.grid.grid_line_alpha = 0.3
    p1.xaxis.axis_label = 'Date'
    p1.yaxis.axis_label = 'Price'

    p1.line(pd.to_datetime(df['date']), list(df['close']),
            color='#A6CEE3', legend=symbol)
    output_file("{}.html".format(symbol), title="Stock Closing Prices")

    show(p1)  # open a browser
Project: IgDiscover    Author: NBISweden    | Project source | File source
def fix_columns(df):
    """
    Changes DataFrame in-place
    """
    # Convert all string columns to str to avoid a PerformanceWarning
    for col in _STRING_COLUMNS:
        if col not in df:
            continue
        df[col].fillna('', inplace=True)
        df[col] = df[col].astype('str')
        # Empty strings have been set to NaN by read_csv. Replacing
        # by the empty string avoids problems with groupby, which
        # ignores NaN values.
    # Columns that have any NaN values in them cannot be converted to
    # int due to a numpy limitation.
    for col in _INTEGER_COLUMNS:
        if col not in df.columns:
            continue
        if all(df[col].notnull()):
            df[col] = df[col].astype(int)
Project: IgDiscover    Author: NBISweden    | Project source | File source
def main(args):
    if args.minimum_frequency is None:
        minimum_frequency = max((len(args.tables) + 1) // 2, 2)
    else:
        minimum_frequency = args.minimum_frequency
    logger.info('Minimum frequency set to %s', minimum_frequency)

    # Read in tables
    tables = []
    for path in args.tables:
        table = pd.read_csv(path, sep='\t')
        table = table[table.database_diff >= args.minimum_db_diff]
        table = table.dropna()
        tables.append(table)
        if len(table) == 0:
            logger.warn('Table read from %r is empty after filtering out sequences with database diff >= %s.', path, args.minimum_db_diff)

    # Count V sequence occurrences
    counter = Counter()
    for table in tables:
        counter.update(set(table.consensus))

    # Find most frequent occurrences and print result
    print('count', 'gene', 'database_diff', 'sequence', 'names', sep='\t')
    for sequence, frequency in counter.most_common():
        if frequency < minimum_frequency:
            break
        names = []
        gene = None
        for table in tables:
            matching_rows = table[table.consensus == sequence]
            if matching_rows.empty:
                continue
            names.extend(matching_rows.name)
            if gene is None:
                row = matching_rows.iloc[0]
                gene = row.gene
                database_diff = row.database_diff
                #shm = row['V_SHM']
        print(frequency, gene, database_diff, sequence, *names, sep='\t')
Project: IgDiscover    Author: NBISweden    | Project source | File source
def count_full_text_occurrences(candidates, table_path, other_gene, other_errors, merge, min_count):
    # Use only records that have a chance of reaching the required min_count
    records = {info.sequence: info for info in candidates if info.max_count >= min_count}

    # Count full-text occurrences in the genomic_sequence, circumventing
    # inaccurate IgBLAST alignment boundaries
    # TODO limit the search to the gene region (especially for D genes)
    # Speed up search by looking for most common sequences first
    search_order = sorted(records, key=lambda s: records[s].max_count, reverse=True)
    cols = [other_gene, 'V_errors', 'J_errors', 'CDR3_nt', 'genomic_sequence']
    for chunk in pd.read_csv(table_path, usecols=cols, chunksize=10000, sep='\t'):
        chunk = chunk[chunk[other_errors] == 0]
        for row in chunk.itertuples():
            for needle in search_order:
                if needle in row.genomic_sequence:
                    record = records[needle]
                    record.count += 1
                    record.other_genes.add(getattr(row, other_gene))
                    record.cdr3s.add(row.CDR3_nt)
                    if merge:
                        break
    return records.values()
Project: IgDiscover    Author: NBISweden    | Project source | File source
def main(args):
    n = 0
    first = True
    written = 0
    stats = FilteringStatistics()
    for chunk in pd.read_csv(args.table, chunksize=10000, sep='\t'):
        fix_columns(chunk)
        n += len(chunk)
        filtered, chunk_stats = filtered_table(chunk, v_gene_coverage=args.v_coverage,
            j_gene_coverage=args.j_coverage, v_gene_evalue=args.v_evalue)
        stats += chunk_stats
        print(filtered.to_csv(sep='\t', index=False, header=first), end='')
        first = False
        written += len(filtered)

    logger.info('%s rows in input table', stats.n)
    logger.info('%s rows have both V and J assignment', stats.vjassigned)
    logger.info('%s of those do not have a stop codon', stats.stop)
    logger.info('%s of those have an E-value of at most %s', stats.v_evalue, args.v_evalue)
    logger.info('%s of those cover the V gene by at least %s%%', stats.v_coverage, args.v_coverage)
    logger.info('%s of those cover the J gene by at least %s%%', stats.j_coverage, args.j_coverage)
    logger.info('%d rows written', written)
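
The IgDiscover examples above read large tab-separated tables in chunks via chunksize; the second one also writes the filtered rows incrementally, emitting the CSV header only for the first chunk. A minimal sketch of that streaming pattern (filter_rows and the file name are placeholder assumptions, not IgDiscover code):

import pandas as pd

def filter_rows(chunk):
    # placeholder filter: keep only complete rows
    return chunk.dropna()

first = True
for chunk in pd.read_csv("big_table.tsv", sep="\t", chunksize=10000):
    filtered = filter_rows(chunk)
    # print the header only once, then append the remaining chunks
    print(filtered.to_csv(sep="\t", index=False, header=first), end="")
    first = False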
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def get_treasury_data(start_date, end_date):
    return pd.read_csv(
        "http://www.federalreserve.gov/datadownload/Output.aspx"
        "?rel=H15"
        "&series=bf17364827e38702b42a58cf8eaa3f78"
        "&lastObs="
        "&from="  # An unbounded query is ~2x faster than specifying dates.
        "&to="
        "&filetype=csv"
        "&label=omit"
        "&layout=seriescolumn"
        "&type=package",
        skiprows=1,  # First row is a useless header.
        parse_dates=['Time Period'],
        na_values=['ND'],  # Presumably this stands for "No Data".
        index_col=0,
    ).loc[
        start_date:end_date
    ].dropna(
        how='all'
    ).rename(
        columns=parse_treasury_csv_column
    ).tz_localize('UTC') * 0.01  # Convert from 2.57% to 0.0257.
Project: zipline-chinese    Author: zhanghan1990    | Project source | File source
def storageindex(self):
        #get the filelist
        onlyfiles = [ f for f in listdir(self.indexdata) if isfile(join(self.indexdata,f)) ]
        #read from using pandas
        for f in onlyfiles:
            df = pd.read_csv(self.indexdata+"/"+f)
            s=f.split('.')
            name = s[0][2:8]
            records = json.loads(df.T.to_json()).values()
            for row in records:
                row['date'] = datetime.datetime.strptime(row['date'], "%Y-%m-%d")
            print name
            self.index[name].insert_many(records)



    #storage stock pool into database
Project: table-compositor    Author: InvestmentSystems    | Project source | File source
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
Project: table-compositor    Author: InvestmentSystems    | Project source | File source
def load_names_data():
    fp = os.path.join(tempfile.gettempdir(), ZIP_NAME)
    if not os.path.exists(fp):
        r = requests.get(URL_NAMES)
        with open(fp, 'wb') as f:
            f.write(r.content)

    post = collections.OrderedDict()
    with zipfile.ZipFile(fp) as zf:
        # get ZipInfo instances
        for zi in sorted(zf.infolist(), key=lambda zi: zi.filename):
            fn = zi.filename
            if fn.startswith('yob'):
                year = int(fn[3:7])
                df = pd.read_csv(
                    zf.open(zi),
                    header=None,
                    names=('name', 'gender', 'count'))
                df['year'] = year
                post[year] = df

        df = pd.concat(post.values())
        df.set_index('name', inplace=True, drop=True)
        return df
Project: soccerstan    Author: Torvaney    | Project source | File source
def read_data(fname):
    """ Read football-data.co.uk csv """
    data = (
        pd.read_csv(fname)
        .rename(columns={
                'HomeTeam': 'home_team',
                'AwayTeam': 'away_team',
                'FTHG': 'home_goals',
                'FTAG': 'away_goals'
            })
        .loc[lambda df: ~pd.isnull(df['home_goals'])]  # Remove future games
    )

    team_map = stan_map(pd.concat([data['home_team'], data['away_team']]))
    data['home_team_id'] = data['home_team'].replace(team_map)
    data['away_team_id'] = data['away_team'].replace(team_map)


    for col in ('home_goals', 'away_goals'):
        data[col] = [int(c) for c in data[col]]

    return data, team_map
Project: didi_competition    Author: Heipiao    | Project source | File source
def cluster_map_sheet_pre():
    print("------ load cluster_map data ----------")
    cluster_map_sheet_path = os.path.join(LOAD_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
    print("load data from: ", cluster_map_sheet_path)
    save_path = os.path.join(SAVE_DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR)
    print("save data to: ", save_path)
    file = "cluster_map"

    cluster_sheet = os.path.join(cluster_map_sheet_path, file)
    data = pd.read_csv(cluster_sheet,header=-1)
    data.columns = ["raw"]
    data["district_hash"] = data["raw"].map(lambda x: x.split("\t")[0])
    data["district_map"] = data['raw'].map(lambda x: x.split("\t")[1])

    del data["raw"]

    save_df_to_file(data, save_path, file)


# handle the order_info sheet
Project: didi_competition    Author: Heipiao    | Project source | File source
def create_hash_district_map_dict():
    file = "cluster_map.csv"
    district_hash_map_path = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, file)

    hash_data = pd.read_csv(district_hash_map_path)
    ## convert the dataframe into dict
    hash_map_rule = dict(zip(hash_data.district_hash, hash_data.district_map))

    # print(type(hash_map_rule))

    saved_file = "cluster_map.pickle"
    map_save_file = os.path.join(DATA_DIR, CONCRETE_DIR, CLUSTER_MAP_SHEET_DIR, saved_file)
    ## save into same dir as file
    with open(map_save_file, "wb") as f:
        pickle.dump(hash_map_rule, f)

    #print(hash_map_rule)

# map the district features in the input data_frame into value
Project: rosie    Author: datasciencebr    | Project source | File source
def test_prepare_dataset(self, fetch, chamber_of_deputies):
        """
        * Rename columns.
        * Make `document_type` a category column.
        * Rename values for `category`.
        * Create `is_party_expense` column.
        """
        dataset = self.subject.dataset
        self.assertTrue(set(ADAPTER_COLUMNS.keys()).issubset(set(dataset.columns)))
        document_types = ['bill_of_sale', 'simple_receipt', 'expense_made_abroad']
        self.assertEqual(document_types,
                         dataset['document_type'].cat.categories.tolist())
        fixture = pd.read_csv(os.path.join(self.fixtures_path, 'reimbursements.xz'))
        meal_rows = fixture \
            .query('subquota_description == "Congressperson meal"').index
        self.assertEqual(['Meal'],
                         dataset.loc[meal_rows, 'category'].unique().tolist())
        party_expense_rows = fixture[fixture['congressperson_id'].isnull()].index
        self.assertEqual([True],
                         dataset.loc[party_expense_rows, 'is_party_expense'].unique().tolist())
Project: lung-cancer-detector    Author: YichenGong    | Project source | File source
def _load_sets(self):
        print("Loading datasets")

        train_patients = pd.read_csv("data/stage1/" + "stage1_labels.csv")

        for idx, row in train_patients.iterrows():
            if self._check_sample_exists(row['id']):
                self._test_set.append(row['id'])

        for idx, row in train_patients.iterrows():
            if self._check_sample_exists(row['id']):
                self._train_set.append([row['id'], row['cancer']])

        #Create permutation for random loading
        self.shuffle()

        print("Loading datasets: Done!")
Project: lung-cancer-detector    Author: YichenGong    | Project source | File source
def _load_sets(self):
        print("Loading datasets")

        train_patients = pd.read_csv(os.path.join(self._directory, "stage1_labels.csv"))
        test_patients = pd.read_csv(os.path.join(self._directory, "stage1_sample_submission.csv"))

        for idx, row in test_patients.iterrows():
            self._test_set.append(row['id'])

        for idx, row in train_patients.iterrows():
            self._train_set.append([row['id'], row['cancer']])

        #Create permutation for random loading
        self.shuffle()

        print("Loading datasets: Done!")
Project: saapy    Author: ashapochka    | Project source | File source
def doctable(ctx):
    df = pd.read_csv('./docs/flight-options.csv')

    # open an existing document
    doc = docx.Document('./docs/style-reference.docx')

    as_int = partial(format_decimal, format='#')
    as_usd = partial(format_currency, currency='USD')

    s = doc.sections[0]
    width = s.page_width - s.left_margin - s.right_margin

    doc.add_picture('./docs/diagrams_002.png', width=width)

    formatters = {
        'ticket_price': as_usd,
        'total_hours': as_int,
        'trip': as_int,
        'airline': partial(shorten_long_name, width=20),
        'selected': compose({0: 'No', 1: 'Yes'}.get, int)
    }
    add_table(df, doc, table_style='Plain Table 3', formatters=formatters)

    # save the doc
    doc.save('./docs/test.docx')
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def eval(flags):
    name = flags.pred_path
    yp = pd.read_csv(name)
    classes = len([i for i in yp.columns.values if 'class' in i])
    yp = yp[['class%d'%i for i in range(1,classes+1)]].values
    myDB = personalDB(flags,name="full")
    if "stage1" in name:
        y=myDB.data['test_variants_filter']['Class']-1
    else:
        myDB.get_split()
        va = myDB.split[flags.fold][1]
        y = np.argmax(myDB.y[va],axis=1)
    if np.max(y)>classes:
        y = np.argmax(to4c(onehot_encode(y)),axis=1)
    score = cross_entropy(y,yp)
    print(name,score,'\n')
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def eval(name,clip=False,bar=0.9):
    base = pd.read_csv('../input/stage1_solution_filtered.csv')
    base['Class'] = np.argmax(base[['class%d'%i for i in range(1,10)]].values,axis=1)
    sub = pd.read_csv(name)
    #sub = pd.merge(sub,base[['ID','Class']],on="ID",how='right')
    #print(sub.head())
    y = base['Class'].values
    yp = sub[['class%d'%i for i in range(1,10)]].values
    if clip:
        yp = np.clip(yp,(1.0-bar)/8,bar)
        yp = yp/np.sum(yp,axis=1).reshape([yp.shape[0],1])
    print(name,cross_entropy(y,yp),multiclass_log_loss(y,yp))
    for i in range(9):
        y1 = y[y==i]
        yp1 = yp[y==i]
        print(i,y1.shape,cross_entropy(y1,yp1),multiclass_log_loss(y1,yp1))
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def post(self):
        if self.flags.task == "test_cnn_stage1":
            docs = self.DB.clean_doc['test_text_filter']
        elif self.flags.task == "test_cnn_stage2":
            docs = self.DB.clean_doc['stage2_test_text']
        else:
            self.mDB.get_split()
            docs = self.mDB.split[self.flags.fold][1]
        nrows = len(docs)
        p = np.zeros([nrows,9])
        for i in range(self.flags.epochs):
            if i==0:
                skiprows=None
            else:
                skiprows = nrows*i
            p = p + (pd.read_csv(self.flags.pred_path,header=None,nrows=nrows,skiprows=skiprows).values)
        p = p/self.flags.epochs
        if '_cv' in self.flags.task:
            from utils.np_utils.utils import cross_entropy
            y = np.argmax(self.mDB.y,axis=1)
            print("cross entropy", cross_entropy(y[self.mDB.split[self.flags.fold][1]],p))
        s = pd.DataFrame(p,columns=['class%d'%i for i in range(1,10)])
        s['ID'] = np.arange(nrows)+1
        s.to_csv(self.flags.pred_path.replace(".csv","_sub.csv"),index=False,float_format="%.5f")
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def post_cv(flags):
    import re
    import os
    path = flags.data_path
    files = [i for i in os.listdir(path) if len(re.findall('cv_[0-9].csv',i))]
    s = []
    for name in files:
        s.append(pd.read_csv("%s/%s"%(path,name)))

    s = pd.concat(s,axis=0)
    print(s.head())
    classes = len([i for i in s.columns.values if 'class' in i])
    from utils.np_utils.utils import cross_entropy
    yp = s[['class%d'%i for i in range(1,classes+1)]].values
    y=s['real'].values
    print(cross_entropy(y,yp))
    s.to_csv("%s/cv.csv"%path,index=False)
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def replace(s,n):
    seen = pd.read_csv(s)
    unseen = pd.read_csv(n)
    te = pd.read_csv('../input/stage2_test_variants.csv')
    tr = pd.read_csv('../input/training_variants')
    unseen = pd.merge(unseen,te,on='ID',how='right')
    seen = pd.merge(seen,te,on='ID',how='right')
    mask = seen.Gene.isin(tr.Gene)
    cols = ['class%d'%i for i in range(1,10)]
    seen.loc[~mask,cols] = 0

    mask = unseen.Gene.isin(tr.Gene)
    unseen.loc[mask,cols] = 0

    assert (unseen['ID']==seen['ID']).all()
    seen[cols] = seen[cols] + unseen[cols]

    seen[cols+['ID']].to_csv('mix.csv',index=False)
Project: kaggle-review    Author: daxiongshu    | Project source | File source
def test2():
    s1 = pd.read_csv('../input/test_variants')
    s3 = pd.read_csv('../input/test_variants_filter')
    s1 = pd.merge(s1,s3[['ID','Class']],on='ID',how='left').fillna(1)

    s2 = pd.read_csv('../input/stage2_test_variants.csv')
    s1 = pd.merge(s1,s2,on= ["Gene", "Variation"],how='inner')
    s1['ID'] = s1['ID_y']
    s2 = pd.merge(s1[['ID','Class']],s2,on='ID',how='right').fillna(1)
    yp = onehot_encode(s2['Class'].values-1)

    for i in range(1,10):
        s2['class%d'%i] = yp[:,i-1]
    cols = ['class%d'%i for i in range(1,10)]
    mask = s2['ID'].isin(s1['ID_y'])
    s2.loc[~mask,cols] = 0.1

    s2['ID'] = s2['ID'].astype(int)
    cols = ['ID']+['class%d'%i for i in range(1,10)]
    s2[cols].to_csv('sub.csv',index=False)
Project: powerAI    Author: dreameng28    | Project source | File source
def x_label(feature_path, pred=False):
    X_list = []
    for each in feature_path:
        X = pd.read_csv(feature_paths.format(str(each)))
        X_list.append(X)
    X = pd.DataFrame(pd.concat(X_list, axis=0)).reset_index().drop('index', axis=1)
    if not pred:
        y = X[power_consumption].tolist()
        X = X.drop([record_date, user_id, power_consumption], axis=1)
        columns = X.columns
        X = X.values
        return X, y, columns
    else:
        X = X.drop([record_date, user_id], axis=1)
        columns = X.columns
        X = X.values
        return X, columns
Project: GOS    Author: crcresearch    | Project source | File source
def neighbors():
    """
    Read the neighbors for each country.
    """
    neighbors_csv = pd.read_csv(csv_path("mledoze-countries.csv"), sep=';',
                                usecols=[4, 17])
    neighbors_csv.columns = ["Code", "neighbors"]
    neighbors_csv["neighbors"] = neighbors_csv["neighbors"].str.split(',')
    for row in neighbors_csv.loc[neighbors_csv.neighbors.isnull(), 'neighbors'].index:
        neighbors_csv.at[row, 'neighbors'] = []
    # Island nations are a weird exception
    neighbors_csv.loc[neighbors_csv.Code == "MDG", "neighbors"] = [["MOZ", "ZAF", "TZA"]]
    neighbors_csv.loc[neighbors_csv.Code == "TWN", "neighbors"] = [["CHN", "PHL"]]
    neighbors_csv.loc[neighbors_csv.Code == "AUS", "neighbors"] = [["NZL"]]
    neighbors_csv.loc[neighbors_csv.Code == "NZL", "neighbors"] = [["AUS"]]
    neighbors_csv.loc[neighbors_csv.Code == "JPN", "neighbors"] = [["TWN", "KOR", "PHL"]]
    neighbors_csv.loc[neighbors_csv.Code == "PHL", "neighbors"] = [["TWN", "KOR", "JPN"]]
    neighbors_csv.loc[neighbors_csv.Code == "PRI", "neighbors"] = [["DOM"]]
    neighbors_csv.loc[neighbors_csv.Code == "SGP", "neighbors"] = [["MYS", "IDN"]]
    neighbors_csv.loc[neighbors_csv.Code == "JAM", "neighbors"] = [["CUB", "DOM"]]
    return neighbors_csv
Project: webcrawling    Author: etilelab    | Project source | File source
def loadFile(fileName):
    # resolve the output csv file name via checkFileName(); it returns -1 when no matching file exists
    outputFileName = checkFileName(fileName)

    if outputFileName is not -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]
        print(company)

        print("csv FIle Load Success")
    else:
        print("Error csv File")

# checkFileName()
# Returns -1 when no matching csv file exists; otherwise returns the name of
# the csv file to load.
Project: webcrawling    Author: etilelab    | Project source | File source
def loadFile(fileName,analyzeValue):
    # resolve the output csv file name via checkFileName(); it returns -1 when no matching file exists
    outputFileName = checkFileName(fileName)

    if outputFileName is not -1:
        df = pandas.read_csv(outputFileName)
        content = df["Content"]
        title = df["Title"]
        company = df["Company"]

        print("csv FIle Load Success")

        if analyzeValue==1:
            # analyze(title)
            analyze(content)

    else:
        print("Error csv File")

# checkFileName()
# Returns -1 when no matching csv file exists; otherwise returns the name of
# the csv file to load.
Project: scrapy_projects    Author: morefreeze    | Project source | File source
def filter_data(csv_file, start_day=28, end_day=90, interest=780, state=None, **kwargs):
    f = pd.read_csv(csv_file)
    f['sub_title'] = f['sub_title'].fillna('')
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: v['period'] <= datetime.timedelta(days=20) and v['benefit'] > 6, ok_stop=True, weight=5)
    filter.install_rule(lambda v: v['benefit'] >= 8 and v['period'] < datetime.timedelta(days=230))
    filter.install_rule(lambda v: not v['sub_title'].startswith('????'))
    for row in f.iterrows():
        idx, v = row
        money = money2float(v['money'])
        period = period2timedelta(v['period'])
        # remove percent sign(%)
        benefit = float(v['expected_benefit'][:-1])
        item = {
            'title': v['title'],
            'sub_title': v['sub_title'],
            'money': money,
            'period': period,
            'benefit': benefit,
        }
        if filter.check(item):
            candidate.append(item)
    return candidate
Project: scrapy_projects    Author: morefreeze    | Project source | File source
def filter_data(csv_file, **kwargs):
    f = pd.read_csv(csv_file)
    candidate = []
    filter = Filter()
    filter.install_rule(lambda v: not v['title'].startswith('test'))
    for row in f.iterrows():
        idx, v = row
        item = {
            'title': v['title'],
        }
        if filter.check(item):
            candidate.append(item)
    return candidate


# If len(candicate) > 0 will send to slack, the text will store as slack_txt_file
Project: ssbio    Author: SBRG    | Project source | File source
def parse_psqs(psqs_results_file):
    """Parse a PSQS result file and returns a Pandas DataFrame of the results

    Args:
        psqs_results_file: Path to psqs results file

    Returns:
        Pandas DataFrame: Summary of PSQS results

    """

    # TODO: generalize column names for all results, save as dict instead

    psqs_results = pd.read_csv(psqs_results_file, sep='\t', header=None)
    psqs_results['pdb_file'] = psqs_results[0].apply(lambda x: str(x).strip('./').strip('.pdb'))
    psqs_results = psqs_results.rename(columns = {1:'psqs_local', 2:'psqs_burial', 3:'psqs_contact', 4:'psqs_total'}).drop(0, axis=1)
    psqs_results['u_pdb'] = psqs_results['pdb_file'].apply(lambda x: x.upper() if len(x)==4 else np.nan)
    psqs_results['i_entry_name'] = psqs_results['pdb_file'].apply(lambda x: x.split('_model1')[0] if len(x)>4 else np.nan)
    psqs_results = psqs_results[pd.notnull(psqs_results.psqs_total)]

    return psqs_results
Project: kaggle    Author: RankingAI    | Project source | File source
def LoadFromTextFile(InputDir):

        ## raw data
        TrainData = pd.read_csv('%s/train_2016_v2.csv' % InputDir, parse_dates=['transactiondate'], header=0)
        TestData = pd.read_csv('%s/sample_submission.csv' % InputDir, header=0)
        TestData['parcelid'] = TestData['ParcelId']
        TestData.drop('ParcelId', axis=1, inplace=True)
        PropertyData = pd.read_csv('%s/properties_2016.csv' % InputDir,header=0)
        for c, dtype in zip(PropertyData.columns, PropertyData.dtypes):
            if dtype == np.float64:
                PropertyData[c] = PropertyData[c].astype(np.float32)

        ## join dynamic data with static data
        TrainData = pd.merge(TrainData, PropertyData, how='left', on='parcelid')
        TestData = pd.merge(TestData, PropertyData, how='left', on='parcelid')

        return TrainData,TestData

    ## class method, save data with pkl format
Project: scheduled-bots    Author: SuLab    | Project source | File source
def get_microbe_taxids(force_download=False):
    """
    Download the latest bacterial genome assembly summary from the NCBI genome ftp site
    and generate a pd.DataFrame of relevant data for strain items based on taxids of the bacterial reference genomes.
    :return: pandas dataframe of bacteria reference genome data
    """
    if force_download or not os.path.exists("reference_genomes.csv"):
        assembly = urllib.request.urlretrieve("ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt")
        df = pd.read_csv(assembly[0], sep="\t", dtype=object, skiprows=1, header=0)
        df = df[df['refseq_category'].isin(['reference genome', 'representative genome'])]

        all_tax_wdid = id_mapper('P685')

        df['wdid'] = df['taxid'].apply(lambda x: all_tax_wdid.get(x, None))
        df = df.rename(columns={'# assembly_accession': 'assembly_accession'})
        df.to_csv('reference_genomes.csv', sep="\t")
        df.taxid = df.taxid.astype(int)
        return df
    else:  # use predownloaded and parsed flatfile
        df = pd.read_csv("reference_genomes.csv", sep="\t", dtype=object, index_col=0)
        df.taxid = df.taxid.astype(int)
        return df
Project: scheduled-bots    Author: SuLab    | Project source | File source
def get_assembly_report(self, taxid):
        if self.ass_sum is None:
            self.get_assembly_summaries()
        df = self.ass_sum.query("taxid == {} & refseq_category == 'reference genome'".format(taxid))
        if len(df) == 0:
            # try "representative genome" (needed for mouse and rat)
            df = self.ass_sum.query("taxid == {} & refseq_category == 'representative genome'".format(taxid))
        if len(df) != 1:
            raise ValueError("unknown reference: {}".format(df))
        print(df)
        ftp_path = list(df.ftp_path)[0]
        assembly = os.path.split(ftp_path)[1]
        url = os.path.join(ftp_path, assembly + "_assembly_report.txt")
        print(url)
        # read the column names from the file
        table = request.urlopen(request.Request(url)).read().decode()
        names = [x for x in table.split("\n") if x.startswith("#")][-1].strip().replace("# ", "").split("\t")
        self.chr_df[taxid] = pd.read_csv(StringIO(table), sep="\t", names=names, comment='#')
        self.chr_df[taxid] = self.chr_df[taxid].rename(columns={'Sequence-Name': 'SequenceName', 'Sequence-Role': 'SequenceRole',
                                                                'Assigned-Molecule': 'AssignedMolecule',
                                                                'Assigned-Molecule-Location/Type': 'AssignedMoleculeLocationType',
                                                                'GenBank-Accn': 'GenBankAccn', 'RefSeq-Accn': 'RefSeqAccn',
                                                                'UCSC-style-name': 'UCSCstylename'})
        #print(self.chr_df[taxid].query("SequenceRole == 'assembled-molecule'"))
Project: how_to_convert_text_to_images    Author: llSourcell    | Project source | File source
def load_bbox(data_dir):
    bbox_path = os.path.join(data_dir, 'CUB_200_2011/bounding_boxes.txt')
    df_bounding_boxes = pd.read_csv(bbox_path,
                                    delim_whitespace=True,
                                    header=None).astype(int)
    #
    filepath = os.path.join(data_dir, 'CUB_200_2011/images.txt')
    df_filenames = pd.read_csv(filepath, delim_whitespace=True, header=None)
    filenames = df_filenames[1].tolist()
    print('Total filenames: ', len(filenames), filenames[0])
    #
    filename_bbox = {img_file[:-4]: [] for img_file in filenames}
    numImgs = len(filenames)
    for i in xrange(0, numImgs):
        # bbox = [x-left, y-top, width, height]
        bbox = df_bounding_boxes.iloc[i][1:].tolist()

        key = filenames[i][:-4]
        filename_bbox[key] = bbox
    #
    return filename_bbox
Project: evaluation_tools    Author: JSALT-Rosetta    | Project source | File source
def get_sample_item_file(wav_file_names_sample, item_file, output):
    """
    From a sampled dataset, build the item file needed to run an ABX task.

    Parameters
    ----------
    wav_file_names_sample : string
        path to the saved array of sampled wav file names
    item_file : string
        path to the item file of the whole dataset; a text file containing at
        least the columns #filename, onset, offset, #phoneme and context, plus
        side information such as image ID
    output : string
        path where the sampled item file will be stored
    """
    wav_names=[]
    temp=np.load(wav_file_names_sample)
    for s in temp:
        wav_names.append(s.split(".")[0])

    df=pd.read_csv(item_file, sep="\t", index_col="#filename")
    df_sample=df.loc[wav_names]

    df_sample.to_csv(output, sep="\t", header=True, index=False)

    return(df_sample)
Project: BadParser    Author: stanojevic    | Project source | File source
def meansOfMeans(datafile):

    df = pd.read_csv(datafile, delimiter=",")
    df = df.loc[df["swapsEager"]>0]
    grouped = df.groupby("words", as_index=True)
    idx = grouped.groups.keys()

    all_means=grouped.mean()
    mean_of_means = all_means.mean()
    std_of_means = all_means.std()

    #Print in latex format:
    print "& Average number of swaps & Average jump size \\\\"
    print "\hline"
    for laziness in ("Eager", "Lazy", "Lazier"):
        print "{} & {}({}) & {}({})\\\\".format(laziness, \
                                                mean_of_means["swaps%s"%laziness], \
                                                std_of_means["swaps%s"%laziness], \
                                                mean_of_means["avgAltBlockSize%s"%laziness], \
                                                std_of_means["avgAltBlockSize%s"%laziness])
Project: pyrsss    Author: butala    | Project source | File source
def read_sm_csv(csv_fname):
    """
    Parse the SuperMAG CSV format data record *csv_fname*. For each
    station, store the information in pandas
    :class:`DataFrame`. Return a mapping between the station
    identifier and data frame.
    """
    df = PD.read_csv(csv_fname,
                     header=0,
                     parse_dates=[0],
                     date_parser=lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'),
                     index_col=0)
    df_map = {name: group for name, group in df.groupby('IAGA')}
    for df in df_map.itervalues():
        del df['IAGA']
        df.rename(columns={'N': 'B_N',
                           'E': 'B_E',
                           'Z': 'B_Z'},
                  inplace=True)
    return df_map
Project: bambi    Author: bambinos    | Project source | File source
def crossed_data():
    '''
    Random effects:
    10 subjects, 12 items, 5 sites
    Subjects crossed with items, nested in sites
    Items crossed with sites

    Fixed effects:
    A continuous predictor, a numeric dummy, and a three-level category
    (levels a,b,c)

    Structure:
    Subjects nested in dummy (e.g., gender), crossed with threecats
    Items crossed with dummy, nested in threecats
    Sites partially crossed with dummy (4/5 see a single dummy, 1/5 sees both
    dummies)
    Sites crossed with threecats
    '''
    from os.path import dirname, join
    data_dir = join(dirname(__file__), 'data')
    data = pd.read_csv(join(data_dir, 'crossed_random.csv'))
    return data
Project: chinese-stock-Financial-Index    Author: lfh2016    | Project source | File source
def calcu_all_stocks_3year_average_profit(year):  # compute the 3-year average profit for all stocks
    path = os.path.join(current_folder, '????%s.csv' % today)
    if not os.path.exists(path):
        data = ts.get_stock_basics()
        lie = ['??', '??', '??', '???', '????', '???',
               '???(?)', '????', '????', '???', '?????', '????', '????',
               '???', '????', '????', '?????', '????(%)', '????(%)',
               '???(%)', '????(%)', '????']
        data.columns = lie
        data.index.names = ['??']
        data.to_csv(path, encoding='utf-8')

    data = pd.read_csv(path, encoding='utf-8', index_col=0)
    # print(data)
    data['????'] = 0
    for index, row in data.iterrows():
        try:
            data.loc[index, '????'] = calcu_3year_average_profit('%06d' % index, year)
        except Exception as e:
            print(e)
            data.loc[index, '????'] = 0

        print('??%s' % index)
    data.to_csv(os.path.join(current_folder, '3????????????%s.csv' % today), encoding='utf-8')
Project: ml-rest    Author: apinf    | Project source | File source
def save_csv_as_dataframe(request):
    print("Save CSV as DataFrame")

    if (request.POST):
        # Get CSV URL from post; default to None if not provided
        csv_url = request.POST.get('csv_url', None)

        if (csv_url):
            csv_data = pd.read_csv(csv_url)

            print(csv_data)

            # Create Data Frame instance
            data = Data()

            # Add CSV Data to data_frame field
            data.data_frame = csv_data
            data.source_url = csv_url

            # Save Data Frame
            data.save()
Project: deep-summarization    Author: harpribot    | Project source | File source
def store_test_predictions(self, prediction_id='_final'):
        """
        Stores the test predictions in a CSV file

        :param prediction_id: A simple id appended to the name of the summary for uniqueness
        :return: None
        """
        # prediction id is usually the step count
        print 'Storing predictions on Test Data...'
        review = []
        true_summary = []
        generated_summary = []
        for i in range(self.test_size):
            if not self.checkpointer.is_output_file_present():
                review.append(self._index2sentence(self.test_review[i]))
                true_summary.append(self._index2sentence(self.true_summary[i]))
            if i < (self.test_batch_size * (self.test_size // self.test_batch_size)):
                generated_summary.append(self._index2sentence(self.predicted_test_summary[i]))
            else:
                generated_summary.append('')

        prediction_nm = 'generated_summary' + prediction_id
        if self.checkpointer.is_output_file_present():
            df = pd.read_csv(self.checkpointer.get_result_location(), header=0)
            df[prediction_nm] = np.array(generated_summary)
        else:
            df = pd.DataFrame()
            df['review'] = np.array(review)
            df['true_summary'] = np.array(true_summary)
            df[prediction_nm] = np.array(generated_summary)
        df.to_csv(self.checkpointer.get_result_location(), index=False)
        print 'Stored the predictions. Moving Forward'
        if prediction_id == '_final':
            print 'All done. Exiting..'
            print 'Exited'
Project: deep-summarization    Author: harpribot    | Project source | File source
def load_result(self,result_file):
        """

        :param result_file:
        :return:
        """
        self.result = pd.read_csv(result_file, header=0)
        self.__scrape_reference()
        self.__scrape_all_hypotheses()
Project: numerai    Author: gansanay    | Project source | File source
def training_set(self):
        return pd.read_csv(resource_filename('numerai.data', self.train_file_name))
Project: numerai    Author: gansanay    | Project source | File source
def test_set(self):
        return pd.read_csv(resource_filename('numerai.data', self.test_file_name))
Project: numerai    Author: gansanay    | Project source | File source
def sorted_training_set(self):
        return pd.read_csv(resource_filename('numerai.data', self.sorted_file_name))
Project: toll_road    Author: idosekely    | Project source | File source
def _reader(self):
        if not self.does_exist():
            return
        dateparse = lambda dates: pd.datetime.strptime(dates, '%Y-%m-%d %H:%M:%S.%f')
        df = pd.read_csv(self.data_file, parse_dates='timestamp', index_col='timestamp', date_parser=dateparse)
        return df
Project: DREAM    Author: LaceyChen17    | Project source | File source
def get_orders(self):
        '''
            get order context information
        '''
        orders = pd.read_csv(self.raw_data_dir + 'orders.csv')
        orders = orders.fillna(0.0)
        orders['days'] = orders.groupby(['user_id'])['days_since_prior_order'].cumsum()
        orders['days_last'] = orders.groupby(['user_id'])['days'].transform(max)
        orders['days_up_to_last'] = orders['days_last'] - orders['days']
        del orders['days_last']
        del orders['days']
        return orders