# -*- coding: utf-8 -*-
"""Crawl category labels and app names from the Huawei AppGallery web API.

Created on Sun Jun 21 09:36:17 2020
@author: Administrator

The walkthrough this file comes from has three parts, kept here as three
functions that share one set of helpers:

1. Python implementation: crawling the [Apps] data (first result page of
   every category only) -> ``crawl_apps_first_page``.
2. Python implementation: crawling the [Games] data.  The Games section has
   one more category level than the Apps section; the process is otherwise
   the same -> ``crawl_games``.
3. [Apps] again, with pagination handled and the part-1 implementation
   partly optimised -> ``getAppLabels``.

Run as a script with an optional mode argument: ``apps`` (default, part 3),
``apps-first-page`` (part 1) or ``games`` (part 2).
"""
import json
import time

import pandas as pd

# Entry page listing the first-level categories.
ROOT_URL = (
    "https://appgallery.cloud.huawei.com/uowap/index"
    "?method=internal.getTabDetail&serviceType=13&reqPageNum=1"
    "&uri=34789c86f4654624ba9e63cf1353c860&maxResults=25&locale=zh_CN"
)
# A tab URL is COM_LEFT_URL + detailId + COM_CENTER_URL + page + COM_RIGHT_URL.
COM_LEFT_URL = (
    "https://appgallery.cloud.huawei.com/uowap/index"
    "?method=internal.getTabDetail&serviceType=13&uri="
)
COM_CENTER_URL = "&maxResults=25&reqPageNum="
COM_RIGHT_URL = "&locale=zh_CN"

USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36"
)

# Text returned by getUrlText when the HTTP request fails.
FETCH_FAILED = "Exception"

# Names under which the games category may appear.  The API is queried with
# locale=zh_CN; the English spelling is accepted as well, case-insensitively.
_GAME_NAMES = frozenset({"游戏", "game"})

# Running count of rows collected by getAppLabels (kept at module level
# because earlier versions exposed it as a global).
row = 0


def getUrlText(url):
    """Fetch *url* and return the response body as text.

    Generic page-fetching skeleton: a non-200 status is turned into an
    exception by ``raise_for_status``.  Any request failure is reported by
    returning ``FETCH_FAILED`` instead of raising.
    """
    # Third-party import kept local so the crawlers can be imported and
    # driven with a custom fetcher on machines without ``requests``.
    import requests

    headers = {"User-Agent": USER_AGENT}
    try:
        r = requests.get(url, headers=headers, timeout=30)
        r.raise_for_status()  # raises unless status_code is 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return FETCH_FAILED


def write_xls(filename, row, first_level, second_level, app_name):
    """Write one (first_level, second_level, app_name) record into *filename*.

    The record goes to columns 0-2 of row *row* on the first sheet.
    xlutils can write into an existing workbook, whereas xlwt alone can only
    rewrite one, so the workbook is opened, copied, modified and saved back.
    """
    from xlrd import open_workbook
    from xlutils.copy import copy

    rb = open_workbook(filename)
    wb = copy(rb)
    ws = wb.get_sheet(0)
    ws.write(row, 0, first_level)
    ws.write(row, 1, second_level)
    ws.write(row, 2, app_name)
    wb.save(filename)


def _tab_url(detail_id, page=1):
    """Return the tab-detail URL for *detail_id* at result page *page*."""
    return COM_LEFT_URL + detail_id + COM_CENTER_URL + str(page) + COM_RIGHT_URL


def _fetch_json(url, fetch):
    """Fetch *url* with *fetch* and decode the body as JSON.

    Raises:
        ValueError: if the body is not JSON (for instance the FETCH_FAILED
            marker); the message names the offending URL.
    """
    text = fetch(url)
    try:
        return json.loads(text)
    except ValueError as err:
        raise ValueError(
            "no JSON payload from %r (got %r)" % (url, text[:40])
        ) from err


def _first_levels(fetch):
    """Return the list of first-level category records from the root page."""
    return _fetch_json(ROOT_URL, fetch)['layoutData'][1]['dataList']


def _is_game(name):
    """Return True when *name* is the games category (handled separately)."""
    return name.strip().casefold() in _GAME_NAMES


def _iter_pages(detail_id, fetch):
    """Yield the ``layoutData`` list of each successive page of a tab.

    Pages are requested starting at 1; iteration stops at the first page
    whose ``layoutData`` is empty.
    """
    page = 1
    while True:
        layouts = _fetch_json(_tab_url(detail_id, page), fetch)['layoutData']
        if not layouts:  # no data: past the last page
            return
        yield layouts
        page += 1


def _report_progress(count, every):
    """Print *count* once per *every* collected rows."""
    if count % every == 1:
        print(count)


def crawl_apps_first_page(fetch=getUrlText):
    """Part 1: collect [first label, second label, app name] rows for Apps.

    Only the first result page of every tab is read.  The games category is
    skipped.  *fetch* maps a URL to response text.
    """
    rows = []
    for first in _first_levels(fetch):
        first_level = first['name']  # first-level label
        if _is_game(first_level):
            continue
        second_levels = _fetch_json(_tab_url(first['detailId']), fetch)['layoutData']
        for second in second_levels:
            second_level = second['name']  # second-level label
            third_data = _fetch_json(_tab_url(second['detailId']), fetch)
            for app in third_data['layoutData'][0]['dataList']:
                rows.append([first_level, second_level, app['name']])
                _report_progress(len(rows), 100)
    return rows


def crawl_games(fetch=getUrlText):
    """Part 2: collect [first, second, third label, app name] rows for Games.

    Only the games category is walked, and only the first result page of
    every tab is read.  *fetch* maps a URL to response text.
    """
    rows = []
    for first in _first_levels(fetch):
        first_level = first['name']  # first-level label
        if not _is_game(first_level):
            continue
        second_data = _fetch_json(_tab_url(first['detailId']), fetch)
        for second in second_data['layoutData'][0]['dataList']:
            second_level = second['name']  # second-level label
            third_data = _fetch_json(_tab_url(second['detailId']), fetch)
            for third_layout in third_data['layoutData']:
                third = third_layout['dataList'][0]
                third_level = third['name']  # third-level label
                four_data = _fetch_json(_tab_url(third['detailId']), fetch)
                for app in four_data['layoutData'][0]['dataList']:
                    rows.append(
                        [first_level, second_level, third_level, app['name']]
                    )
                    _report_progress(len(rows), 100)
    return rows


def getAppLabels(fetch=getUrlText):
    """Part 3: collect [first label, second label, app name] rows for Apps.

    Unlike ``crawl_apps_first_page`` this follows every result page of both
    the second-level tabs and the app lists.  The games category is skipped.
    The time taken is printed once per 500 rows, and each finished
    second-level label is printed as progress.

    Args:
        fetch: callable mapping a URL to response text.

    Returns:
        A list of ``[first_level, second_level, app_name]`` lists.
    """
    global row
    time_start = time.time()  # start timing the next batch of 500 results
    result = []
    for first in _first_levels(fetch):
        first_level = first['name']  # first-level label
        if _is_game(first_level):
            continue
        for second_page in _iter_pages(first['detailId'], fetch):
            for second in second_page:
                second_level = second['name']  # second-level label
                for third_page in _iter_pages(second['detailId'], fetch):
                    for app in third_page[0]['dataList']:
                        result.append([first_level, second_level, app['name']])
                        row += 1
                        if row % 500 == 1:
                            time_end = time.time()
                            print('time cost: %.3f' % (time_end - time_start), 's')
                            time_start = time.time()
                            print(row)
                print(first_level, ":", second_level)
    return result


# mode -> (crawler, column names, output file, sheet name)
_JOBS = {
    "apps": (
        getAppLabels,
        ['一级标签', '二级标签', '应用名称'],
        'result2.xlsx',
        'application',
    ),
    "apps-first-page": (
        crawl_apps_first_page,
        ['一级标签', '二级标签', 'app名称'],
        'result2.xlsx',
        'application',
    ),
    "games": (
        crawl_games,
        ['一级标签', '二级标签', '三级标签', 'app名称'],
        'result.xlsx',
        'game',
    ),
}


def main(argv=None):
    """Run the crawl selected by the first argument and save it to Excel."""
    import sys

    args = sys.argv[1:] if argv is None else list(argv)
    mode = args[0] if args else "apps"
    if mode not in _JOBS:
        raise SystemExit("usage: [%s]" % " | ".join(sorted(_JOBS)))
    crawler, columns, filename, sheet_name = _JOBS[mode]
    # Build the frame once from all rows instead of appending row by row.
    frame = pd.DataFrame(crawler(), columns=columns)
    frame.to_excel(filename, sheet_name=sheet_name, index=False)


# Main program
if __name__ == "__main__":
    main()
