当前位置: 首页 > 后端技术 > Python

Pandas入门教程(一)

时间:2023-03-25 20:55:55 Python

数据分析处理库 import pandas as pd df=pd.read_csv("./pandas/data/titanic.csv")df.head(N)读取前N行数据df.head(6)df.info()得到DataFrame的简要总结df.info()RangeIndex:891entries,0to890Datacolumns(total12columns):#ColumnNon-NullCountDtype----------------------------0PassengerId891non-nullint641Survived891non-nullint642Pclass891non-nullint643Name891non-nullobject4Sex891non-nullobject5Age714non-nullfloat646SibSp891non-nullint647Parch891non-nullint648Ticket891non-nullobject9Fare891non-nullfloat6410Cabin204non-nullobject11Embarked889non-nullobjectdtypes:float64(2),int64(5),object(5)memory usage:83.7+KBdf.index查看索引df.indexRangeIndex(start=0,stop=891,step=1)df.columns查看所有列名df.columnsIndex(['PassengerId','Survived','Pclass','Name','Sex','Age','SibSp','Parch','Ticket','Fare','Cabin','Embarked'],dtype='object')df.dtypes查看每一列的字段类型df.dtypesPassengerIdint64Survivedint64Pclassint64NameobjectSexobjectAgefloat64SibSpint64Parchint64TicketobjectFarefloat64CabinobjectEmbarkedobjectdtype:objectdf.values查看所有数据df.valuesarray([[1,0,3,...,7.25,nan,'S'],[2,1,1,...,71.2833,'C85','C'],[3,1,3,...,7.925,nan,'S'],...,[889,0,3,...,23.45,nan,'S'],[890,1,1,...,30.0,'C148','C'],[891,0,3,...,7.75,nan,'Q']],dtype=object)df['Name']0Braund,Mr.OwenHarris1Cumings,Mrs.JohnBradley(FlorenceBriggsTh...2Heikkinen,Miss.Laina3Futrelle,Mrs.JacquesHeath(LilyMayPeel)4Allen,Mr.WilliamHenry...886Montvila,Rev.Juozas887Graham,Miss.MargaretEdith888Johnston,Miss.CatherineHelen"Carrie"889Behr,Mr.KarlHowell890Dooley,Mr.PatrickName:Name,Length:891,dtype:objectdf=df.set_index('Name')df查询Age列的前8行数据df['Age'][:8]NameBraund,Mr.OwenHarris22.0Cumings,Mrs.JohnBradley(FlorenceBriggsThayer)38.0Heikkinen,Miss.Laina26.0Futrelle,Mrs.JacquesHeath(LilyMayPeel)35.0Allen,Mr.WilliamHenry35.0Moran,Mr.JamesNaNMcCarthy,Mr.TimothyJ54.0Palsson,Master.GostaLeonard2.0Name:Age,dtype:float64对单列数据的操作age=df['Age']ageNameBraund,Mr.OwenHarris22.0Cumings,Mrs.JohnBradley(FlorenceBriggsThayer)38.0Heikkinen,Miss.Laina26.0Futrelle,Mrs.JacquesHeath(LilyMayPeel)35.0Allen,Mr.WilliamHenry35.0...Montvila,Rev.Juozas27.0Graham,Miss.MargaretEdith19.0Johnston,Miss.CatherineHelen"Carrie"NaNBehr,Mr.KarlHowell26.0Dooley,Mr.Patrick32.0Name:Age
,Length:891,dtype:float64#每个Age加上10age=age+10 age NameBraund,Mr.OwenHarris32.0Cumings,Mrs.JohnBradley(FlorenceBriggsThayer)48.0Heikkinen,Miss.Laina36.0Futrelle,Mrs.JacquesHeath(LilyMayPeel)45.0Allen,Mr.WilliamHenry45.0...Montvila,Rev.Juozas37.0Graham,Miss.MargaretEdith29.0Johnston,Miss.CatherineHelen"Carrie"NaNBehr,Mr.KarlHowell36.0Dooley,Mr.Patrick42.0Name:Age,Length:891,dtype:float64#Age的最大值age.max()90.0#Age的最小值age.min()10.42#Age的平均值age.mean()39.69911764705882describe获取数据的基本统计特征df.describe()只查询某个集合的某几列df[['Age','Fare']][:5]通过索引或label查询数据#通过索引查看一行数据df.iloc[0]#查询前5行数据df.iloc[0:5]#查询前5行、第2到第3列数据df.iloc[0:5,1:3]#通过索引列值读取某行数据df.loc['Futrelle,Mrs.JacquesHeath(LilyMayPeel)']#查询某行某列的值df.loc['Futrelle,Mrs.JacquesHeath(LilyMayPeel)','Age']#查询某些行和列的数据df.loc['Braund,Mr.OwenHarris':'Graham,Miss.MargaretEdith','Sex':'Age']#修改某个值df.loc['Heikkinen,Miss.Laina','Age']=2000bool运算#查询前5行Age大于50的数据df[df['Age']>50][:5]#查询性别为女性的数据df[df['Sex']=='female']#计算性别为男性的平均年龄df.loc[df['Sex']=='male','Age'].mean()#统计Age大于50的人数(对布尔值求和即为计数)(df['Age']>50).sum()65DataFramegroupby数据分组dff=pd.DataFrame({'key':['A','B','C','A','B','C','A','B','C'],'data':[0,5,10,5,10,15,10,15,20]})dff根据key分组求和dff.groupby('key').sum() import numpy as np dff.groupby('key').aggregate(np.mean)#按性别分组,计算年龄的平均值df.groupby('Sex')['Age'].mean()Sexfemale35.478927male30.726645Name:Age,dtype:float64数值运算df1=pd.DataFrame([[1,2,3,4],[3,4,5,6]],index=['a','b'],columns=['A','B','C','D'])df1#对每一列求和df1.sum()df1.sum(axis=0)A4B6C8D10dtype:int64#对每一行求和df1.sum(axis=1)a10b18dtype:int64#每列的平均值df1.mean(axis=0)A2.0B3.0C4.0D5.0dtype:float64#每行的平均值df1.mean(axis=1)a2.5b4.5dtype:float64df#协方差df.cov()#相关性df.corr()#统计每个值出现的次数df['Age'].value_counts()24.003022.002718.002628.002519.0025..53.00155.50170.50123.5010.421Name:Age,Length:89,dtype:int64#统计每个值出现的次数,出现次数从少到多排列df['Age'].value_counts(ascending=True)0.42123.50170.50155.50153.001..19.002528.002518.002622.002724.0030Name:Age,Length:89,dtype:int64对象操作(Series,相当于一行或一列数据)data=[1,2,3,4]index=['a','b','c','d']s=pd.Series(inde
x=index,data=data)#查询第一行s[0]#查询1到3行s[1:3]#mask操作只显示a、c行mask=[True,False,True,False]s[mask]#修改某个值s['a']=200#将值3替换为300s.replace(to_replace=3,value=300,inplace=True)#修改索引名s.rename(index={'a':'A'},inplace=True)#添加数据s1=pd.Series(index=['e','f'],data=[5,6])s3=s.append(s1)#删除A行数据del s3['A']#一次删除多行数据s3.drop(['c','d'],inplace=True)s3b2e5f6dtype:int64DataFrame增删改查操作#构造一个DataFramedata=[[1,2,3,4],[5,6,7,8]]index=['a','b']columns=['A','B','C','D']dff=pd.DataFrame(data=data,index=index,columns=columns)ABCDa1234b5678#查询:通过loc('索引值')和iloc(索引位置)dff1=dff.iloc[1]dff1=dff.loc['a']dff1A1B2C3D4Name:a,dtype:int64#修改值dff.loc['a']['A']=1000dffABCDa1000234b5678#修改索引dff.index=['m','n']dffABCDm1000234n5678#添加一行数据dff.loc['o']=[10,11,12,13]dffABCDm1000234n5678o10111213#添加一列数据dff['E']=[5,9,14]dffABCDEm10002345n56789o1011121314#批量添加多列数据df4=pd.DataFrame([[6,10,15],[7,11,16],[8,12,17]],index=['m','n','o'],columns=['F','M','N'])df5=pd.concat([dff,df4],axis=1)df5ABCDEFMNm1000234561015n5678971116o101112131481217#删除一条数据df5.drop(['o'],axis=0,inplace=True)df5ABCDEFMNm1000234561015n5678971116#删除列df5.drop(['E','F'],axis=1,inplace=True)df5ABCDMNm10002341015n56781116