当前位置: 首页 > 后端技术 > Python

Python3.x实现特征选择ReliefF算法

时间:2023-03-26 14:25:16 Python

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time: 2019/10/29 0029 9:12
# @Author: tb_youth
# @File: RTest.py
# @SoftWare: PyCharm
# @Blog: https://blog.csdn.net/tb_youth
"""ReliefF feature selection for multi-category problems."""

import pandas as pd
import numpy as np
import numpy.linalg as la
import random
import csv


class Relief:
    """ReliefF feature weighting over a pandas DataFrame.

    The last column of the frame is used as the class label; every other
    column is a feature. Numeric columns are compared by their normalised
    (and squared) difference, all other columns by equality.
    """

    def __init__(self, data_df, sample_rate, t, k):
        """
        :param data_df: data frame (columns are features, rows are samples;
            the last column is the class label)
        :param sample_rate: fraction of rows drawn as ReliefF samples
        :param t: weight threshold used by ``get_final``
        :param k: number of nearest neighbours per class
        """
        # Neighbour lookups mix index labels and row positions; a fresh
        # positional index makes both coincide (and leaves the caller's
        # frame untouched).
        self.__data = data_df.reset_index(drop=True)
        self.__feature = self.__data.columns
        self.__sample_num = int(round(len(self.__data) * sample_rate))
        self.__t = t
        self.__k = k
        self.__encoded = None  # lazily built result of get_data()

    def _is_numeric(self, feature):
        """Return True when *feature* is a numeric (non-boolean) column."""
        column = self.__data[feature]
        return (pd.api.types.is_numeric_dtype(column)
                and not pd.api.types.is_bool_dtype(column))

    def _encoded(self):
        """Return the encoded frame, computing it only once."""
        if self.__encoded is None:
            self.__encoded = self.get_data()
        return self.__encoded

    def _class_distances(self, encoded, cls, target):
        """Return distances from *target* to every row of class *cls*."""
        label = encoded.columns[-1]
        members = encoded[encoded[label] == cls].drop(columns=[label])
        values = members.to_numpy(dtype=float)
        return pd.Series([eulidSim(v, target) for v in values],
                         index=members.index, dtype=float)

    def _diff(self, feature, a, b):
        """Return the ReliefF difference between two values of *feature*.

        Numeric features: range-normalised absolute difference, rounded to
        two decimals and squared. Other features: 0 if equal, else 1.
        """
        if not self._is_numeric(feature):
            return 0 if a == b else 1
        column = self.__data[feature]
        span = column.max() - column.min()
        if span == 0:
            # A constant feature cannot separate samples; avoid 0/0.
            return 0.0
        return pow(round(abs(a - b) / span, 2), 2)

    # Data preparation: map discrete values (e.g. strings) to integer codes
    def get_data(self):
        """Return a copy of the data with non-numeric features encoded.

        Codes are assigned in order of first appearance so that the
        encoding is the same on every run. The label column is copied
        unchanged.
        """
        label = self.__feature[-1]
        new_data = pd.DataFrame(index=self.__data.index)
        for one in self.__feature[:-1]:
            col = self.__data[one]
            if self._is_numeric(one):
                new_data[one] = col
            else:
                codes = {value: code
                         for code, value in enumerate(dict.fromkeys(col))}
                new_data[one] = col.map(codes)
        new_data[label] = self.__data[label]
        return new_data

    # Return the k nearest hits of a sample and the k nearest misses of
    # every other class
    def get_neighbors(self, row):
        """Find the nearest same-class and other-class rows for *row*.

        :param row: one row of the encoded frame (as from ``get_data``)
        :return: ``(right, wrong)`` where ``right`` maps the row's class to
            the indices of its nearest hits and ``wrong`` maps every other
            class to the indices of its nearest misses
        """
        df = self._encoded()
        label = df.columns[-1]
        row_type = row[label]
        aim = np.asarray(row.drop(label), dtype=float)

        right_sim = self._class_distances(df, row_type, aim)
        if not right_sim.empty:
            # The closest same-class row is the sample itself (distance 0).
            right_sim = right_sim.drop(right_sim.idxmin())
        right = {
            row_type: list(
                right_sim.sort_values(kind='mergesort').index[0:self.__k])
        }

        wrong = {}
        for one in pd.unique(df[label]):
            if one == row_type:
                continue
            wrong_sim = self._class_distances(df, one, aim)
            wrong[one] = list(
                wrong_sim.sort_values(kind='mergesort').index[0:self.__k])
        print(right, wrong)
        return right, wrong

    # Compute the weight of one feature for one sample
    def get_weight(self, feature, index, NearHit, NearMiss):
        """Return the ReliefF weight contribution of *feature*.

        :param feature: name of the feature column
        :param index: row position of the sample
        :param NearHit: ``{class: [row indices]}`` of nearest hits
        :param NearMiss: ``{class: [row indices]}`` of nearest misses
        :return: prior-weighted miss difference minus hit difference
        """
        data = self.__data
        label = data.columns[-1]
        row = data.iloc[index]
        numeric = self._is_numeric(feature)

        right = 0
        print('####:', NearHit.values())
        for one in list(NearHit.values())[0]:
            nearhit = data.iloc[one]
            if not numeric:
                print('@@:', row[feature])
                print('$$:', nearhit[feature])
                print('-' * 100)
            right += self._diff(feature, row[feature], nearhit[feature])
        right_w = round(right / self.__k, 2)

        labels = data[label]
        total = len(data)
        # Priors are kept unrounded: rounding could push p_row to 1.0 on
        # imbalanced data and make the denominator below zero.
        p_row = float((labels == row[label]).sum()) / total
        wrong_w = 0
        for one in NearMiss.keys():
            # Share of class `one` in the sample set
            p_one = float((labels == one).sum()) / total
            wrong_one = 0
            for i in NearMiss[one]:
                nearmiss = data.iloc[i]
                wrong_one += self._diff(
                    feature, row[feature], nearmiss[feature])
            wrong_w += round(p_one / (1 - p_row) * wrong_one / self.__k, 2)
        return wrong_w - right_w

    # Filter-style feature selection
    def reliefF(self):
        """Run ReliefF and return the mean weight of every feature.

        :return: Series indexed by feature name
        """
        sample = self._encoded()
        m = len(self.__data)
        score = []
        sample_index = random.sample(range(0, m), self.__sample_num)
        print('采集样本本查询为%s' % sample_index)
        for num, i in enumerate(sample_index, start=1):
            one_score = dict()
            row = sample.iloc[i]
            NearHit, NearMiss = self.get_neighbors(row)
            print('%s样本,样本索引为%s,其NearHitk最近邻行索引为%s,NearMissk最近邻行索引为%s'
                  % (num, i, NearHit, NearMiss))
            for f in self.__feature[0:-1]:
                print('***:', f, i, NearHit, NearMiss)
                w = self.get_weight(f, i, NearHit, NearMiss)
                one_score[f] = w
                print('Feature%s权重为%s。' % (f, w))
            score.append(one_score)
        f_w = pd.DataFrame(score)
        print('每个样本特征的权重如下:')
        print(f_w)
        print('平均特征权重如下:')
        mean_w = f_w.mean()
        print(mean_w)
        return mean_w

    # Return the finally selected features
    def get_final(self):
        """Return the features whose mean weight exceeds the threshold *t*.

        :return: one-column (``weight``) frame indexed by feature name
        """
        f_w = self.reliefF().to_frame('weight')
        final_feature_t = f_w[f_w['weight'] > self.__t]
        print('*' * 100)
        print(final_feature_t)
        return final_feature_t


# Distance / similarity helpers

# Euclidean distance
def eulidSim(vecA, vecB):
    """Return the Euclidean distance between *vecA* and *vecB*."""
    return la.norm(vecA - vecB)


# Cosine similarity
def cosSim(vecA, vecB):
    """
    :param vecA: row vector (1-D array, or 1 x n array/matrix)
    :param vecB: row vector (1-D array, or 1 x n array/matrix)
    :return: cosine similarity rescaled to the range 0-1
    """
    a = np.asarray(vecA, dtype=float).ravel()
    b = np.asarray(vecB, dtype=float).ravel()
    num = float(np.dot(a, b))
    denom = la.norm(a) * la.norm(b)
    return 0.5 + 0.5 * (num / denom)


# Pearson correlation coefficient: a linear correlation measure between
# two variables with r in [-1, 1]; the larger |r|, the stronger the
# correlation. Rescaled here to 0-1.
def pearsSim(vecA, vecB):
    """Return the Pearson correlation of *vecA* and *vecB* rescaled to 0-1.

    Inputs with fewer than three entries (by ``len``) yield 1.0.
    """
    if len(vecA) < 3:
        return 1.0
    else:
        return 0.5 + 0.5 * np.corrcoef(vecA, vecB, rowvar=0)[0][1]


if __name__ == '__main__':
    # NOTE(review): the file name and column names below must match the
    # CSV on disk; they look machine-translated - confirm before running.
    with open('./WatermelonDataset30.csv', 'r', encoding='gbk') as f:
        data = pd.read_csv(f)[['color', 'root', 'Knocking声音', '质地', '肚脐', '触摸', '密度', '含糖量', '好瓜']]
    print(type(data))
    relief = Relief(data, 1, 0.2, 2)
    relief.reliefF()
    relief.get_final()