## 一、实战场景

如何使用 Pandas 清洗二手房数据并存储文件

## 二、知识点

- Python 基本语法
- python 文件读写
- pandas 数据清洗

## 三、菜鸟实战

### 清洗前读取源文件

```python
def do_clean_data(self):  # 执行清理
    clean_data_raw_file_path = self.fileManger.get_data_file_path(self.clean_data_raw_file)
    if not os.path.isfile(clean_data_raw_file_path):  # 确认文件存在
        self.logger.error("需要清理的文件不存在notexist")
        print("需要清理的文件不存在")
        return False
    # 存放清理后的数据数组,先存放文件头
    new_row_arr = []
    # 从清理后的文件中读取待清理的数据
    raw_df = pd.read_csv(clean_data_raw_file_path, encoding=self.encoding)
    for idx in tqdm.trange(len(raw_df.values)):
        row = raw_df.values[idx]
        if self.check_row_is_valid(row):  # 检查行的有效性
            new_row = self.clean_row(row)
            new_row_arr.append(new_row)
    # 保存文件
    self.save_to_clean_file(new_row_arr)
```

### 清理二手房数据

```python
from base_cleaner import BaseDataCleaner
import time
import platform


class Tao365Cleaner(BaseDataCleaner):  # 逐行读取文件进行数据清理
    clean_data_raw_file = "tao365_detail.csv"  # 进行数据清洗的文件
    clean_data_result_file = 'tao365_clean.csv'  # 数据清洗的结果文件
    clean_data_result_file_head = ['title', 'price', 'pricepersquare', 'district', 'address',
                                   '户型', '建筑面积', '楼层数', '房屋朝向', '建造年份',
                                   '建造年份', '原户型']  # 数据清洗结果文件头信息

    def check_row_is_valid(self, raw_row):  # 检查当前行是否有效
        area = 'area' in raw_row[4]
        if area == False:
            return False
        year = 'year' in raw_row[9]
        if year == False:
            return False
        return True

    def clean_row(self, raw_row):  # 执行单行清洗
        # self.logger.info("当前清洗数据:", raw_row)
        # print(raw_row)
        # title
        title = raw_row[0]
        # price
        price = self.get_price(raw_row[1])
        # 每平方价格
        avg_price_per_square_meter = self.get_avg_price_per_square_meter(raw_row[2])
        # Community
        housing = self.get_community(raw_row[3])
        # Address
        area = self.get_area(raw_row[4])
        # 房屋类型
        house_type = self.get_room_count(raw_row[5])
        house_type_2 = raw_row[5]
        # 建筑面积
        acreage = self.get_area_size(raw_row[6])
        # floor
        level = self.get_floor(raw_row[7])
        # house facing
        direction = raw_row[8]
        # Age
        year = self.get_year(raw_row[9])
        # 建设年份
        time = self.get_house_age(year)
        # 数据转换
        new_row = [title, price, avg_price_per_square_meter, housing, area, house_type,
                   acreage, level, direction, year, time, house_type_2]
        return new_row

    def get_price(self, row):  # 售价
        price = row.replace('万', '')
        return int(float(price))

    def get_avg_price_per_square_meter(self, row):  # 每平方米均价
        avg_price_per_square_meter = row.replace('yuan/m2', '')
        return avg_price_per_square_meter

    def get_area(self, row):  # 获取属于哪个区域
        area = row[:row.index('area')]
        return area + 'District'

    def get_community(self, row):  # 获取它属于哪个社区
        return row

    def get_room_count(self, row):  # 获取房间类型
        room_count = row[:row.index('room')]
        return room_count

    def get_area_size(self, row):  # 获取面积
        area_size = row.replace('㎡', '')
        return area_size

    def get_floor(self, row):  # 获取楼层
        floor = row[row.index('/'):]
        return floor.replace('/', '').replace('layer', '')

    def get_year(self, row):  # 获取建筑年龄
        return row.replace('year', '')

    def get_house_age(self, row):  # 获取建筑年龄
        localtime = time.localtime(time.time())
        year = int(localtime[0]) - int(row)
        return year

    def test(self):  # test
        raw_row = ['峨眉新村4楼110平方米', '460.4万', '70769元/m2', '冀北西村',
                   'No.玄武区玄武门北集西村21号', '4房1厅2卫', '65㎡', '高层/3层', '南北', '2020']
        print(raw_row)
        new_row = self.clean_row(raw_row)
        print(new_row)


if __name__ == '__main__':
    print("数据清洗开始")
    raw_row = ['峨眉新村4楼110平方米', '460.4万', '70769元/m2', '北极西村',
               '玄武区玄武门北鸡西村21号', '4房1厅2卫', '65㎡', '高层/3层', '南北', '2020']
    cleaner = Tao365Cleaner()
    # cleaner.test()
    cleaner.do_clean_data()
    print("数据清理完成")
    print("python版", platform.python_version())
```

> 注:上面代码中的 `'area'`、`'District'`、`'year'`、`'room'`、`'layer'`、`'yuan/m2'` 等英文标记,与示例数据(如"玄武区……"、"高层/3层"、"70769元/m2")并不一致,疑为转载时被机器翻译所致;原文可能是对应的中文字符,使用前请对照实际数据确认。

### 清理后保存到文件

```python
def save_to_clean_file(self, data_arr):  # 保存到清理后的文件
    file_path = self.fileManger.get_data_file_path(self.clean_data_result_file)
    # 初始化数据
    frame = pd.DataFrame(data_arr)
    frame.columns = self.clean_data_result_file_head
    frame.to_csv(file_path, encoding=self.encoding, index=None)
    self.logger.debug("清理文件保存完成")
```

### 运行结果

运行截图

```
数据清洗开始
100%|██████████| 9/9 [00:00, ?it/s]
数据清理完成
python版本 3.9.10

进程结束,退出代码0
```

结果文件

资源链接

https://download.csdn.net/dow...

菜鸟实战,不断学习!
