当前位置: 首页 > 后端技术 > Python

Pandas入门教程(六)

时间:2023-03-26 12:13:43 Python

# Pandas tutorial, part 6: measuring and shrinking DataFrame memory, then a few
# DataFrame.apply examples on the Titanic training set.
#
# The walkthrough reads two CSV files from ./pandas/data/, so it only runs
# when this file is executed as a script; importing it just defines helpers.
# "Recorded output" comments reproduce the figures captured in the article.
import numpy as np
import pandas as pd

GAME_LOGS_CSV = './pandas/data/game_logs.csv'
TITANIC_CSV = './pandas/data/titanic_train.csv'

# Passenger-class code -> label used by which_class().
_CLASS_LABELS = {1: "One", 2: "Two", 3: "Three"}


def mem_usage(data) -> str:
    """Return the deep memory footprint of *data* as a string in MB.

    Args:
        data: A DataFrame (per-column usage is summed) or any other object
            exposing ``memory_usage(deep=True)``, such as a Series.

    Returns:
        The size in mebibytes formatted with two decimals, e.g. ``"1.00MB"``.
    """
    mem_b = data.memory_usage(deep=True)
    if isinstance(data, pd.DataFrame):
        # DataFrame.memory_usage yields one figure per column (plus index).
        mem_b = mem_b.sum()
    return "{:03.2f}MB".format(mem_b / 1024 ** 2)


def get_row(data):
    """Return the element at position 99 of *data* (the 100th entry).

    Passed to ``DataFrame.apply`` with the default axis, it receives one
    column at a time, so the combined result is the row at position 99.
    """
    return data.iloc[99]


def get_null_count(data) -> int:
    """Return how many entries of the Series *data* are null/NaN."""
    col_null = pd.isnull(data)
    return len(data[col_null])


def which_class(row):
    """Map a row's ``Pclass`` value to a text label.

    Returns ``"Unknown"`` for a missing value, ``"One"``/``"Two"``/``"Three"``
    for 1/2/3, and ``None`` for any other value.
    """
    pclass = row['Pclass']
    if pd.isnull(pclass):
        return "Unknown"
    return _CLASS_LABELS.get(pclass)


def is_minor(row) -> bool:
    """Return True when the row's ``Age`` is below 18.

    A missing age (NaN) compares false and therefore yields False.
    """
    return bool(row['Age'] < 18)


def shrink_game_logs(path=GAME_LOGS_CSV):
    """Walk through the memory-reduction steps on the game-logs CSV.

    Args:
        path: Location of the game-logs CSV file.

    Returns:
        A copy of the frame with downcast numeric columns and repetitive
        object columns stored as ``category``.
    """
    gl = pd.read_csv(path)

    # Real (deep) memory held by the frame, including string contents.
    gl.info(memory_usage='deep')
    # Recorded output:
    #   RangeIndex: 171907 entries, 0 to 171906
    #   Columns: 161 entries, date to acquisition_info
    #   dtypes: float64(77), int64(6), object(78)
    #   memory usage: 859.4 MB

    # Average per-column memory for each dtype family.
    for dtype in ['float64', 'object', 'int64']:
        selected_dtype = gl.select_dtypes(include=[dtype])
        memory_usage_b = selected_dtype.memory_usage(deep=True).mean()
        memory_usage_mb = memory_usage_b / 1024 / 1024
        print('[%s]内存使用率%0.2fMB' % (dtype, memory_usage_mb))
    # Recorded output: float64 1.29 MB, object 9.50 MB, int64 1.12 MB

    # Value ranges of uint8, int8, int16, int32 and int64.
    for dtype in ['uint8', 'int8', 'int16', 'int32', 'int64']:
        print(np.iinfo(dtype))
    # Recorded output (machine parameters):
    #   uint8: min = 0,                    max = 255
    #   int8:  min = -128,                 max = 127
    #   int16: min = -32768,               max = 32767
    #   int32: min = -2147483648,          max = 2147483647
    #   int64: min = -9223372036854775808, max = 9223372036854775807

    # Memory used after type conversion: downcast the integer columns.
    # The name gl_int32 is kept from the article; the actual dtypes are
    # whatever to_numeric selects for downcast='unsigned'.
    gl_int64 = gl.select_dtypes(include=['int64'])
    gl_int32 = gl_int64.apply(pd.to_numeric, downcast='unsigned')
    print(mem_usage(gl_int64))
    print(mem_usage(gl_int32))

    # float64 -> smaller float.
    gl_float64 = gl.select_dtypes(include=['float64'])
    gl_float = gl_float64.apply(pd.to_numeric, downcast='float')
    print("转换前:" + mem_usage(gl_float64))
    print("转换后" + mem_usage(gl_float))
    # Recorded output: 7.87MB, 1.48MB, before 100.99MB, after 50.49MB

    opt_gl = gl.copy()
    opt_gl[gl_int32.columns] = gl_int32
    opt_gl[gl_float.columns] = gl_float
    print("原始数据的大小:" + mem_usage(gl))
    print("转换后的数据大小:" + mem_usage(opt_gl))
    # Recorded output: original 859.43MB, converted 802.54MB

    gl_obj = gl.select_dtypes(include=['object']).copy()
    print(gl_obj.describe())
    # Recorded output ([4 rows x 78 columns]):
    #          day_of_week  v_name  v_league  h_name  h_league  day_night
    #   count       171907  171907    171907  171907    171907     140150
    #   unique           7     148         7     148         7          2
    #   top            Sat     CHN        NL     CHN        NL          D
    #   freq         28891    8870     88866    9024     88867      82724
    #
    #                      completion  forefeit  protest  park_id  ...  h_player_6_id
    #   count                     116       145      180   171907  ...         140838
    #   unique                    116         3        5      245  ...           4774
    #   top     19590602,PIT06,2,1,39         H        V    STL07  ...       grimc101
    #   freq                        1        69       90     7022  ...            427
    #
    #           h_player_6_name  h_player_7_id  h_player_7_name  h_player_8_id
    #   count            140838         140838           140838         140838
    #   unique             4720           5253             5197           4760
    #   top       Charlie Grimm       grimc101    Charlie Grimm       lopea102
    #   freq                427            491              491            676
    #
    #           h_player_8_name  h_player_9_id  h_player_9_name  additional_info
    #   count            140838         140838           140838             1456
    #   unique             4710           5193             5142              332
    #   top            Al Lopez       spahw101     Warren Spahn             HTBF
    #   freq                676            339              339             1112
    #
    #           acquisition_info
    #   count             140841
    #   unique                 1
    #   top                    Y
    #   freq              140841

    dow = gl_obj.day_of_week
    print(dow.head())
    dow_cat = dow.astype('category')
    print(dow_cat.head())
    print("转换前" + mem_usage(dow))
    print("转换后" + mem_usage(dow_cat))

    # Store highly repetitive object columns as categories to save memory:
    # a column is converted when fewer than half of its values are unique.
    convert_obj = pd.DataFrame()
    for col in gl_obj.columns:
        num_unique = len(gl_obj[col].unique())
        num_total = len(gl_obj[col])
        if num_unique / num_total < 0.5:
            convert_obj[col] = gl_obj[col].astype('category')
        else:
            convert_obj[col] = gl_obj[col]
    print('数据转换前:' + mem_usage(gl_obj))
    print('数据转换后:' + mem_usage(convert_obj))

    opt_gl[convert_obj.columns] = convert_obj
    print(mem_usage(opt_gl))
    return opt_gl


def titanic_apply_demo(path=TITANIC_CSV):
    """Run the DataFrame.apply examples on the Titanic training CSV.

    Args:
        path: Location of the Titanic training CSV file.
    """
    titanic = pd.read_csv(path)

    # Row at position 99, fetched directly and then via a column-wise apply.
    # The article showed these as bare notebook expressions; they are printed
    # here so the script produces the same display.
    print(titanic.iloc[99])
    row = titanic.apply(get_row)
    print(row)

    # Number of NaN values in each column.
    null_count = titanic.apply(get_null_count)
    print(null_count)

    # Data conversion: label every passenger row by class.
    classes = titanic.apply(which_class, axis=1)
    print(classes)

    # Select the rows of passengers younger than 18.
    minor = titanic.apply(is_minor, axis=1)
    print(titanic[minor])


def main():
    """Run both tutorial walkthroughs in the article's order."""
    shrink_game_logs()
    titanic_apply_demo()


if __name__ == "__main__":
    main()