本文共 10498 字,大约阅读时间需要 34 分钟。
import pandas
读取csv数据:
food_info = pandas.read_csv("food_info.csv")print(type(food_info))print(food_info.dtypes)#字符串为object结构
NDB_No int64Shrt_Desc objectWater_(g) float64Energ_Kcal int64Protein_(g) float64Lipid_Tot_(g) float64Ash_(g) float64Carbohydrt_(g) float64Fiber_TD_(g) float64Sugar_Tot_(g) float64Calcium_(mg) float64Iron_(mg) float64Magnesium_(mg) float64Phosphorus_(mg) float64Potassium_(mg) float64Sodium_(mg) float64Zinc_(mg) float64Copper_(mg) float64Manganese_(mg) float64Selenium_(mcg) float64Vit_C_(mg) float64Thiamin_(mg) float64Riboflavin_(mg) float64Niacin_(mg) float64Vit_B6_(mg) float64Vit_B12_(mcg) float64Vit_A_IU float64Vit_A_RAE float64Vit_E_(mg) float64Vit_D_mcg float64Vit_D_IU float64Vit_K_(mcg) float64FA_Sat_(g) float64FA_Mono_(g) float64FA_Poly_(g) float64Cholestrl_(mg) float64dtype: object
显示数据内容:
food_info.head()显示前n行数据:
food_info.head(n)显示后n行:food_info.tail(n)显示列名:food_info.columnsIndex(['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)'], dtype='object')
显示行列数:
food_info.shape(8618, 36)取数据: 按索引取数据: food_info.loc[0]
NDB_No 1001Shrt_Desc BUTTER WITH SALTWater_(g) 15.87Energ_Kcal 717Protein_(g) 0.85Lipid_Tot_(g) 81.11Ash_(g) 2.11Carbohydrt_(g) 0.06Fiber_TD_(g) 0Sugar_Tot_(g) 0.06Calcium_(mg) 24Iron_(mg) 0.02Magnesium_(mg) 2Phosphorus_(mg) 24Potassium_(mg) 24Sodium_(mg) 643Zinc_(mg) 0.09Copper_(mg) 0Manganese_(mg) 0Selenium_(mcg) 1Vit_C_(mg) 0Thiamin_(mg) 0.005Riboflavin_(mg) 0.034Niacin_(mg) 0.042Vit_B6_(mg) 0.003Vit_B12_(mcg) 0.17Vit_A_IU 2499Vit_A_RAE 684Vit_E_(mg) 2.32Vit_D_mcg 1.5Vit_D_IU 60Vit_K_(mcg) 7FA_Sat_(g) 51.368FA_Mono_(g) 21.021FA_Poly_(g) 3.043Cholestrl_(mg) 215Name: 0, dtype: object切片:
food_info.loc[3:6]
定位具体值:
food_info.loc[3,"NDB_No"] 按照列名: food_info["NDB_No"] 按照列名取多个数据: columns = ["Zinc_(mg)","Copper_(mg)"] food_info[columns]取出以g结尾的数据:
col_names = food_info.columns.tolist()print(col_names)gram_columns = []for c in col_names: if c.endswith("(g)"): gram_columns.append(c)gram_df = food_info[gram_columns]print(gram_df.head(3))['NDB_No', 'Shrt_Desc', 'Water_(g)', 'Energ_Kcal', 'Protein_(g)', 'Lipid_Tot_(g)', 'Ash_(g)', 'Carbohydrt_(g)', 'Fiber_TD_(g)', 'Sugar_Tot_(g)', 'Calcium_(mg)', 'Iron_(mg)', 'Magnesium_(mg)', 'Phosphorus_(mg)', 'Potassium_(mg)', 'Sodium_(mg)', 'Zinc_(mg)', 'Copper_(mg)', 'Manganese_(mg)', 'Selenium_(mcg)', 'Vit_C_(mg)', 'Thiamin_(mg)', 'Riboflavin_(mg)', 'Niacin_(mg)', 'Vit_B6_(mg)', 'Vit_B12_(mcg)', 'Vit_A_IU', 'Vit_A_RAE', 'Vit_E_(mg)', 'Vit_D_mcg', 'Vit_D_IU', 'Vit_K_(mcg)', 'FA_Sat_(g)', 'FA_Mono_(g)', 'FA_Poly_(g)', 'Cholestrl_(mg)'] Water_(g) Protein_(g) Lipid_Tot_(g) Ash_(g) Carbohydrt_(g) \0 15.87 0.85 81.11 2.11 0.06 1 15.87 0.85 81.11 2.11 0.06 2 0.24 0.28 99.48 0.00 0.00 Fiber_TD_(g) Sugar_Tot_(g) FA_Sat_(g) FA_Mono_(g) FA_Poly_(g) 0 0.0 0.06 51.368 21.021 3.043 1 0.0 0.06 50.489 23.426 3.012 2 0.0 0.00 61.924 28.732 3.694组合运算: result = food_info["Water_(g)"]*food_info["Energ_Kcal"]/1000 print(result.head(3))
0 11.378791 11.378792 0.21024dtype: float64添加数据: food_info["Iron_(g)"] = result print(food_info.shape)
(8618, 37)排序: food_info.sort_values("Sodium_(mg)",inplace=True,ascending=True)#ascending=True从小到大ascending=False从大到小;inplacenge=True不生成新数据 print(food_info["Sodium_(mg)"].head(100))
760 0.06470 0.0654 0.0655 0.02269 0.0610 0.0634 0.0637 0.0434 0.0673 0.0638 0.0632 0.0606 0.0633 0.06463 0.0630 0.0判断数据是否非空: import pandas as pd import numpy as np titanic_survival = pd.read_csv("titanic_train.csv") age = titanic_survival["Age"] #print(age.loc[0:10]) age_is_null = pd.isnull(age)#NaN返回True # print (age_is_null) age_null_true = age[age_is_null] # print (age_null_true) age_null_count = len(age_null_true) print(age_null_count)
求均值:
correct_mean_age = titanic_survival["Age"].mean()print (correct_mean_age)29.6h9911764705882
条件操作:不同等级客舱获救平均人数:
passenger_survival = titanic_survival.pivot_table(index="Pclass", values="Survived", aggfunc=np.mean)#index="Pclass"按照客舱等级,values="Survived"对获救人数进行操作, aggfunc=np.mean进行求平均print (passenger_survival)SurvivedPclass 1 0.6296302 0.4728263 0.242363不同上船地点的船票总费用和获救总人数: port_stats = titanic_survival.pivot_table(index="Embarked", values=["Fare","Survived"], aggfunc=np.sum) print(port_stats)
Fare SurvivedEmbarked C 10072.2962 93Q 1022.2543 30S 17439.3988 217丢弃缺失值: new_titanic_survival = titanic_survival.dropna(axis=0,subset=["Age", "Sex"])
重新排序:
new_titanic_survival = titanic_survival.sort_values("Age",ascending=False)print (new_titanic_survival[0:5])titanic_reindexed = new_titanic_survival.reset_index(drop=True)print('-------------')print(titanic_reindexed.loc[0:5])PassengerId Survived Pclass Name \630 631 1 1 Barkworth, Mr. Algernon Henry Wilson 851 852 0 3 Svensson, Mr. Johan 493 494 0 1 Artagaveytia, Mr. Ramon 96 97 0 1 Goldschmidt, Mr. George B 116 117 0 3 Connors, Mr. Patrick Sex Age SibSp Parch Ticket Fare Cabin Embarked 630 male 80.0 0 0 27042 30.0000 A23 S 851 male 74.0 0 0 347060 7.7750 NaN S 493 male 71.0 0 0 PC 17609 49.5042 NaN C 96 male 71.0 0 0 PC 17754 34.6542 A5 C 116 male 70.5 0 0 370369 7.7500 NaN Q ------------- PassengerId Survived Pclass Name Sex \0 631 1 1 Barkworth, Mr. Algernon Henry Wilson male 1 852 0 3 Svensson, Mr. Johan male 2 494 0 1 Artagaveytia, Mr. Ramon male 3 97 0 1 Goldschmidt, Mr. George B male 4 117 0 3 Connors, Mr. Patrick male 5 673 0 2 Mitchell, Mr. Henry Michael male Age SibSp Parch Ticket Fare Cabin Embarked 0 80.0 0 0 27042 30.0000 A23 S 1 74.0 0 0 347060 7.7750 NaN S 2 71.0 0 0 PC 17609 49.5042 NaN C 3 71.0 0 0 PC 17754 34.6542 A5 C 4 70.5 0 0 370369 7.7500 NaN Q 5 70.0 0 0 C.A. 24580 10.5000 NaN S使用自定义函数:
返回第一百行数据:
def hundredth_row(column): # Extract the hundredth item hundredth_item = column.loc[99] return hundredth_item hundredth_row = titanic_survival.apply(hundredth_row) print (hundredth_row)PassengerId 100Survived 0Pclass 2Name Kantor, Mr. SinaiSex maleAge 34SibSp 1Parch 0Ticket 244367Fare 26Cabin NaNEmbarked Sdtype: object返回每一列的空数据个数: def not_null_count(column): column_null = pd.isnull(column) null = column[column_null] return len(null) column_null_count = titanic_survival.apply(not_null_count) print (column_null_count)
PassengerId 100Survived 0Pclass 2Name Kantor, Mr. SinaiSex maleAge 34SibSp 1Parch 0Ticket 244367Fare 26Cabin NaNEmbarked Sdtype: object
判断是否成年(离散化操作):
def is_minor(row): if row["Age"] < 18: return True else:return False
minors = titanic_survival.apply(is_minor, axis=1)#print minors
def generate_age_label(row): age = row["Age"] if pd.isnull(age): return "unknown" elif age < 18: return "minor" else: return "adult" age_labels = titanic_survival.apply(generate_age_label, axis=1) print (age_labels)0 adult1 adult2 adult3 adult4 adult5 unknown6 adult7 minor8 adult9 minor10 minor11 adult12 adult13 adult14 minor15 adult16 minor17 unknown18 adult19 unknown20 adult21 adult22 minor23 adult24 minor25 adult复合操作: titanic_survival['age_labels'] = age_labels age_group_survival = titanic_survival.pivot_table(index="age_labels", values="Survived") print (age_group_survival)
age_labelsadult 0.381032minor 0.539823unknown 0.293785Name: Survived, dtype: float64