1. Pandas简介
Pandas是Python的一个数据分析包,它是为解决数据分析任务而创建的。Pandas整合了大量库和标准数据模型,提供了高效操作数据集所需的工具。Pandas提供了大量的函数和方法,可以让我们快速方便地处理数据。Pandas采用类似字典的形式组织数据,构建在NumPy之上,让以NumPy为中心的应用变得更加简单。

2. Pandas安装
pip3 install pandas

3. Pandas引入
import pandas as pd  # 为了方便使用,pandas通常缩写为pd

4. Pandas数据结构
Pandas的主要数据结构是Series(一维数据)和DataFrame(二维数据)。这两种数据结构足以处理金融、统计、社会科学、工程等领域的大多数典型用例。

4.1 Series
Series是一个类似于一维数组的对象,它由一组数据(各种NumPy数据类型)以及一组与之相关联的数据标签(即索引)组成。
import numpy as np
import pandas as pd
s=pd.Series([1,2,3,np.nan,5,6])  # nan(NAN、Nan):not a number,表示不是数字;np.nan是float数据类型
print(s)  # 索引(index)在左边,值(value)在右边

4.2 DataFrame
DataFrame既有行索引也有列索引,可以看作是由Series组成的字典(共用同一个索引)。
dates=pd.date_range('20180310',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['A','B','C','D'])  # 生成6行4列的随机数据
print(df)  # 输出一个6行4列的表格
print(df['B'])

# 创建特定数据的DataFrame
df_1=pd.DataFrame({'A':1.,'B':pd.Timestamp('20180310'),'C':pd.Series(1,index=list(range(4)),dtype='float32'),'D':np.array([3]*4,dtype='int32'),'E':pd.Categorical(["test","train","test","train"]),'F':'foo'})
print(df_1)
print(df_1.dtypes)  # 输出各列的数据类型
print(df_1.index)  # 行索引
print(df_1.columns)  # 列索引(列名)
print(df_1.values)  # 打印出所有的值
print(df_1.describe())  # 数值列的统计汇总
print(df_1.T)  # 转置数据,交换行和列
print(df_1.sort_index(axis=1,ascending=False))  # axis=1表示按列名(A、B、C……)排序,ascending=False表示降序显示
print(df_1.sort_values(by='E'))  # 按E列的值排序

5. Pandas选择数据
dates=pd.date_range('20180310',periods=6)
df=pd.DataFrame(np.random.randn(6,4),index=dates,columns=['A','B','C','D'])  # 生成6行4列的随机数据
print(df)

5.1 选择一行或一列
print(df['A'])  # 或者写作df.A,选择一列

5.2 切片选择
print(df[0:3],df['20180310':'20180314'])  # 两种选择:第一个按行位置切片选择,第二个按行标签(日期)范围选择

5.3 根据标签选择数据(loc:行标签)
print(df.loc['20180312',['A','B']])  # 根据行标签进行精确选择

5.4 根据序号选择数据(iloc:行号)
print(df.iloc[3,1])  # 输出行号为3、列号为1(即第4行第2列)的数据
print(df.iloc[3:5,0:2])  # 切片选择
print(df.iloc[[1,2,4],[0,2]])  # 不连续筛选

5.5 根据判断筛选
print(df[df.A>0])  # 筛选出df.A大于0的行(布尔条件筛选)

6. Pandas设置数据
6.1 根据loc和iloc设置
dates=pd.date_range('20180310',periods=6)
df=pd.DataFrame(np.arange(24).reshape((6,4)),index=dates,columns=['A','B','C','D'])
print(df)
df.iloc[2,2]=999  # 单点设置
df.loc['2018-03-13','D']=999
print(df)

6.2 根据条件设置
df[df.A>0]=999  # 将df.A大于0的行全部设置为999
print(df)

6.3 设置整列数据
df['F']=np.nan
print(df)

6.4
根据行或列添加数据
df['E']=pd.Series([1,2,3,4,5,6],index=pd.date_range('20180313',periods=6))  # 添加一列
print(df)

7. Pandas处理丢失数据
处理数据中的NaN值,这里使用测试数据property-data.csv。

7.1 输出csv文件数据
import pandas as pd
df=pd.read_csv('property-data.csv')
print(df)

7.2 使用dropna()函数去除含NaN的行或列
new_df=df.dropna()
print(new_df)

7.3 使用fillna()函数替换NaN值
print(df.fillna(value=0))  # 用0替换NaN值

7.4 使用isnull()函数判断数据是否丢失
print(pd.isnull(df))  # 返回布尔矩阵:是NaN则为True,不是NaN则为False
print(np.any(df.isnull()))  # 判断数据中是否存在NaN值

8. Pandas导入导出
Pandas可以读取和保存csv、excel、json、html、pickle等格式的数据。详细内容请参考官方资料。
data=pd.read_csv('test1.csv')  # 读取csv文件
data.to_pickle('test2.pickle')  # 将数据存入pickle文件
# 其他格式文件的导入导出同理

9. Pandas合并数据
9.1 axis合并方向
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*2,columns=['a','b','c','d'])
res=pd.concat([df1,df2,df3],axis=0,ignore_index=True)  # axis=0表示竖向合并,axis=1表示横向合并;ignore_index=True重置序列索引,index变为0 1 2 3 4 5 6 7 8
print(res)

9.2 join方法
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'],index=[1,2,3])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['b','c','d','e'],index=[2,3,4])
print(df1)
print(df2)
res=pd.concat([df1,df2],axis=1,join='outer')  # outer:取行索引的并集进行合并
print(res)
res=pd.concat([df1,df2],axis=1,join='inner')  # inner:只合并行索引相同的行(取交集)
print(res)
合并时可以指定连接方式,共有四种;DataFrame.join()的how参数默认是left,而pd.concat()的join参数只支持inner和outer(默认outer):
代码:连接方式,说明
inner:内连接,取行索引的交集
outer:外连接,取行索引的并集
left:左连接,使用左边df的行索引
right:右连接,使用右边df的行索引

9.3 append追加数据
注意:DataFrame.append()在pandas 1.4中已弃用,并在pandas 2.0中移除;在新版本中请改用pd.concat()。
df1=pd.DataFrame(np.ones((3,4))*0,columns=['a','b','c','d'])
df2=pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
df3=pd.DataFrame(np.ones((3,4))*1,columns=['a','b','c','d'])
s1=pd.Series([1,2,3,4],index=['a','b','c','d'])
res=df1.append(df2,ignore_index=True)  # 将df2合并到df1下面,并重置index
print(res)
res=df1.append(s1,ignore_index=True)  # 将s1合并到df1下面,并重置index
print(res)

10. Pandas合并merge
10.1 根据一组key合并
left=pd.DataFrame({'key':['K0','K1','K2','K3'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
print(left)
right=pd.DataFrame({'key':['K0','K1','K2','K3'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
print(right)
res=pd.merge(left,r
ight,on='key')
print(res)

10.2 根据两组key合并
left=pd.DataFrame({'key1':['K0','K0','K1','K2'],'key2':['K0','K1','K0','K1'],'A':['A0','A1','A2','A3'],'B':['B0','B1','B2','B3']})
print(left)
right=pd.DataFrame({'key1':['K0','K1','K1','K2'],'key2':['K0','K0','K0','K0'],'C':['C0','C1','C2','C3'],'D':['D0','D1','D2','D3']})
print(right)
res=pd.merge(left,right,on=['key1','key2'],how='inner')  # 内连接合并
print(res)
res=pd.merge(left,right,on=['key1','key2'],how='outer')  # 外连接合并
print(res)
res=pd.merge(left,right,on=['key1','key2'],how='left')  # 左连接合并
print(res)
res=pd.merge(left,right,on=['key1','key2'],how='right')  # 右连接合并
print(res)

10.3 indicator合并
df1=pd.DataFrame({'col1':[0,1],'col_left':['a','b']})
print(df1)
df2=pd.DataFrame({'col1':[1,2,2],'col_right':[2,2,2]})
print(df2)
res=pd.merge(df1,df2,on='col1',how='outer',indicator=True)  # 根据col1合并,indicator=True会增加一列,标明每一行的合并来源(left_only、right_only或both)
print(res)
res=pd.merge(df1,df2,on='col1',how='outer',indicator='indicator_column')  # 自定义indicator列的名称
print(res)

10.4 根据索引合并
left=pd.DataFrame({'A':['A0','A1','A2'],'B':['B0','B1','B2']},index=['K0','K1','K2'])
print(left)
right=pd.DataFrame({'C':['C0','C2','C3'],'D':['D0','D2','D3']},index=['K0','K2','K3'])
res=pd.merge(left,right,left_index=True,right_index=True,how='outer')  # 根据索引index合并,并选择外连接
print(res)
res=pd.merge(left,right,left_index=True,right_index=True,how='inner')
print(res)
