博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
人工智能学习笔记——数据分析处理库Pandas
阅读量:4218 次
发布时间:2019-05-26

本文共 10498 字,大约阅读时间需要 34 分钟。

import pandas

读取csv数据:

food_info = pandas.read_csv("food_info.csv")
print(type(food_info))

print(food_info.dtypes)#字符串为object结构

NDB_No int64Shrt_Desc objectWater_(g) float64Energ_Kcal int64Protein_(g) float64Lipid_Tot_(g) float64Ash_(g) float64Carbohydrt_(g) float64Fiber_TD_(g) float64Sugar_Tot_(g) float64Calcium_(mg) float64Iron_(mg) float64Magnesium_(mg) float64Phosphorus_(mg) float64Potassium_(mg) float64Sodium_(mg) float64Zinc_(mg) float64Copper_(mg) float64Manganese_(mg) float64Selenium_(mcg) float64Vit_C_(mg) float64Thiamin_(mg) float64Riboflavin_(mg) float64Niacin_(mg) float64Vit_B6_(mg) float64Vit_B12_(mcg) float64Vit_A_IU float64Vit_A_RAE float64Vit_E_(mg) float64Vit_D_mcg float64Vit_D_IU float64Vit_K_(mcg) float64FA_Sat_(g) float64FA_Mono_(g) float64FA_Poly_(g) float64Cholestrl_(mg) float64dtype: object

显示数据内容:

food_info.head()

显示前n行数据:

food_info.head(n)
显示后n行:
food_info.tail(n)
显示列名:
food_info.columns

Index(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)',       'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)',       'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)',       'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)',       'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)',       'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)',       'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg',       'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)',       'Cholestrl_(mg)'],      dtype='object')

显示行列数:

food_info.shape

(8618, 36)
取数据:
按索引取数据:
food_info.loc[0]
NDB_No                         1001Shrt_Desc          BUTTER WITH SALTWater_(g)                     15.87Energ_Kcal                      717Protein_(g)                    0.85Lipid_Tot_(g)                 81.11Ash_(g)                        2.11Carbohydrt_(g)                 0.06Fiber_TD_(g)                      0Sugar_Tot_(g)                  0.06Calcium_(mg)                     24Iron_(mg)                      0.02Magnesium_(mg)                    2Phosphorus_(mg)                  24Potassium_(mg)                   24Sodium_(mg)                     643Zinc_(mg)                      0.09Copper_(mg)                       0Manganese_(mg)                    0Selenium_(mcg)                    1Vit_C_(mg)                        0Thiamin_(mg)                  0.005Riboflavin_(mg)               0.034Niacin_(mg)                   0.042Vit_B6_(mg)                   0.003Vit_B12_(mcg)                  0.17Vit_A_IU                       2499Vit_A_RAE                       684Vit_E_(mg)                     2.32Vit_D_mcg                       1.5Vit_D_IU                         60Vit_K_(mcg)                       7FA_Sat_(g)                   51.368FA_Mono_(g)                  21.021FA_Poly_(g)                   3.043Cholestrl_(mg)                  215Name: 0, dtype: object
切片:

food_info.loc[3:6]

定位具体值:

food_info.loc[3,"NDB_No"]

按照列名:
food_info["NDB_No"]
按照列名取多个数据:
columns = ["Zinc_(mg)","Copper_(mg)"]
food_info[columns]

取出以g结尾的数据:

col_names = food_info.columns.tolist()
print(col_names)
gram_columns = []
for c in col_names:
    if c.endswith("(g)"):
        gram_columns.append(c)
gram_df = food_info[gram_columns]
print(gram_df.head(3))

['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)']   Water_(g)  Protein_(g)  Lipid_Tot_(g)  Ash_(g)  Carbohydrt_(g)  \0      15.87         0.85          81.11     2.11            0.06   1      15.87         0.85          81.11     2.11            0.06   2       0.24         0.28          99.48     0.00            0.00      Fiber_TD_(g)  Sugar_Tot_(g)  FA_Sat_(g)  FA_Mono_(g)  FA_Poly_(g)  0           0.0           0.06      51.368       21.021        3.043  1           0.0           0.06      50.489       23.426        3.012  2           0.0           0.00      61.924       28.732        3.694
组合运算:
result = food_info["Water_(g)"]*food_info["Energ_Kcal"]/1000
print(result.head(3))
0    11.378791    11.378792     0.21024dtype: float64
添加数据:
food_info["Iron_(g)"] = result
print(food_info.shape)
(8618, 37)
排序:
food_info.sort_values("Sodium_(mg)",inplace=True,ascending=True)#ascending=True从小到大ascending=False从大到小;inplacenge=True不生成新数据
print(food_info["Sodium_(mg)"].head(100))
760     0.06470    0.0654     0.0655     0.02269    0.0610     0.0634     0.0637     0.0434     0.0673     0.0638     0.0632     0.0606     0.0633     0.06463    0.0630     0.0
判断数据是否非空:
import pandas as pd
import numpy as np
titanic_survival = pd.read_csv("titanic_train.csv")
age = titanic_survival["Age"]
#print(age.loc[0:10])
age_is_null = pd.isnull(age)#NaN返回True
# print (age_is_null)
age_null_true = age[age_is_null]
# print (age_null_true)
age_null_count = len(age_null_true)
print(age_null_count)

求均值:

correct_mean_age = titanic_survival["Age"].mean()
print (correct_mean_age)

29.6h9911764705882

条件操作:

不同等级客舱获救平均人数:
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)
#index="Pclass"按照客舱等级,values="Survived"对获救人数进行操作, aggfunc=np.mean进行求平均
print (passenger_survival)

SurvivedPclass          1       0.6296302       0.4728263       0.242363
不同上船地点的船票总费用和获救总人数:
port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum)
print(port_stats)
Fare  SurvivedEmbarked                      C         10072.2962        93Q          1022.2543        30S         17439.3988       217
丢弃缺失值:
new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])

重新排序:

new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)
print (new_titanic_survival[0:5])
titanic_reindexed = new_titanic_survival.reset_index(drop=True)
print('-------------')
print(titanic_reindexed.loc[0:5])

PassengerId  Survived  Pclass                                  Name  \630          631         1       1  Barkworth, Mr. Algernon Henry Wilson   851          852         0       3                   Svensson, Mr. Johan   493          494         0       1               Artagaveytia, Mr. Ramon   96            97         0       1             Goldschmidt, Mr. George B   116          117         0       3                  Connors, Mr. Patrick         Sex   Age  SibSp  Parch    Ticket     Fare Cabin Embarked  630  male  80.0      0      0     27042  30.0000   A23        S  851  male  74.0      0      0    347060   7.7750   NaN        S  493  male  71.0      0      0  PC 17609  49.5042   NaN        C  96   male  71.0      0      0  PC 17754  34.6542    A5        C  116  male  70.5      0      0    370369   7.7500   NaN        Q  -------------   PassengerId  Survived  Pclass                                  Name   Sex  \0          631         1       1  Barkworth, Mr. Algernon Henry Wilson  male   1          852         0       3                   Svensson, Mr. Johan  male   2          494         0       1               Artagaveytia, Mr. Ramon  male   3           97         0       1             Goldschmidt, Mr. George B  male   4          117         0       3                  Connors, Mr. Patrick  male   5          673         0       2           Mitchell, Mr. Henry Michael  male       Age  SibSp  Parch      Ticket     Fare Cabin Embarked  0  80.0      0      0       27042  30.0000   A23        S  1  74.0      0      0      347060   7.7750   NaN        S  2  71.0      0      0    PC 17609  49.5042   NaN        C  3  71.0      0      0    PC 17754  34.6542    A5        C  4  70.5      0      0      370369   7.7500   NaN        Q  5  70.0      0      0  C.A. 24580  10.5000   NaN        S
使用自定义函数:

返回第一百行数据:

def hundredth_row(column):

    # Extract the hundredth item
    hundredth_item = column.loc[99]
    return hundredth_item
hundredth_row = titanic_survival.apply(hundredth_row)
print (hundredth_row)
PassengerId                  100Survived                       0Pclass                         2Name           Kantor, Mr. SinaiSex                         maleAge                           34SibSp                          1Parch                          0Ticket                    244367Fare                          26Cabin                        NaNEmbarked                       Sdtype: object
返回每一列的空数据个数:
def not_null_count(column):
    column_null = pd.isnull(column)
    null = column[column_null]
    return len(null)
column_null_count = titanic_survival.apply(not_null_count)
print (column_null_count)
PassengerId                  100Survived                       0Pclass                         2Name           Kantor, Mr. SinaiSex                         maleAge                           34SibSp                          1Parch                          0Ticket                    244367Fare                          26Cabin                        NaNEmbarked                       Sdtype: object

判断是否成年(离散化操作):

def is_minor(row):
    if row["Age"] < 18:
        return True
    else:

        return False

minors = titanic_survival.apply(is_minor, axis=1)

#print minors

def generate_age_label(row):
    age = row["Age"]
    if pd.isnull(age):
        return "unknown"
    elif age < 18:
        return "minor"
    else:
        return "adult"
age_labels = titanic_survival.apply(generate_age_label, axis=1)
print (age_labels)
0        adult1        adult2        adult3        adult4        adult5      unknown6        adult7        minor8        adult9        minor10       minor11       adult12       adult13       adult14       minor15       adult16       minor17     unknown18       adult19     unknown20       adult21       adult22       minor23       adult24       minor25       adult
复合操作:
titanic_survival['age_labels'] = age_labels
age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived")
print (age_group_survival)
age_labelsadult      0.381032minor      0.539823unknown    0.293785Name: Survived, dtype: float64

你可能感兴趣的文章
WebGL自学教程——WebGL示例:12. 要有光
查看>>
WebGL自学教程——WebGL示例:13.0 代码整理
查看>>
WebGL自学教程——WebGL示例:14.0 代码整理
查看>>
恶心的社会
查看>>
中国式危机公关9加1策略(第五章 慎用信息控制策略)
查看>>
展现自己的人生智慧
查看>>
深入理解java多态性
查看>>
Java新手进阶:细说引用类型
查看>>
osg中使用MatrixTransform来实现模型的平移/旋转/缩放
查看>>
(一) Qt Model/View 的简单说明
查看>>
(二)使用预定义模型 QStringListModel例子
查看>>
UVM:5.3.2 多domain 的例子
查看>>
UVM:7.4.5 加入存储器
查看>>
UVM:7.5.1 期望值与镜像值
查看>>
UVM:7.5.2 常用操作及其对期望值和镜像值的影响
查看>>
UVM:7.6.1 检查后门访问中hdl 路径的sequence
查看>>
UVM:7.6.2 检查默认值的sequence
查看>>
UVM:7.7.1 使用reg_predictor
查看>>
UVM:7.7.2 使用UVM_PREDICT_DIRECT功能与mirror 操作
查看>>
UVM:7.7.3 寄存器模型的随机化与update
查看>>