当前位置: 首页 > 后端技术 > Python

使用 Python 爬取豆瓣电影 TOP250

时间:2023-03-26 16:20:56 Python

使用 Python 爬取豆瓣电影 TOP250

摘要:①获取指定接口的 html 文本信息;②获取排名、链接和名称;③获取评分和评价数;④写入 excel 的模块;⑤整理成一个大列表;⑥执行。

步骤说明:

① 获取指定界面的 html 文本信息。

② 获取排名、链接和名称:名称、排名和链接都位于 div class='pic' 之下,获取属性值的方式相同,直接使用 .a.attrs['href'] 即可;用 strip() 去掉文本前后的 \t、\n、\f 非常方便。

③ 获取评分和评价数:使用 select('span')[1] 返回第二个 span 标签的内容。注意:xpath 和 bs4 返回的索引不一样——xpath 的索引从 1 开始,而 bs4 的索引从 0 开始。

④ 写入 excel 的模块(代码中保存为 CSV 文件)。

⑤ 组织成一个大列表。

⑥ 执行。

具体代码:

```python
import requests
from bs4 import BeautifulSoup
import csv

rank = []
link = []
names = []
score = []
assess = []
aList = []


def getHMLText(url):
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36 QIHU 360SE'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        r.encoding = r.apparent_encoding
        return r.text
    except Exception as e:
        print("生成的异常是", e)  # e.status_code 状态码


def getContent(Html):
    soup = BeautifulSoup(Html, "html.parser")
    for li in soup.select(".grid_view li"):
        for item in li.find_all('div', class_='pic'):
            rank.append(item.text.strip())  # 排名
            link.append(item.a.attrs['href'])
            names.append(item.a.img.attrs['alt'])


def getScoreAndassess(Html):
    soup = BeautifulSoup(Html, "html.parser")
    for li in soup.select(".grid_view li"):
        for info in li.find_all('div', class_='info'):
            for bd in info.find_all('div', class_='bd'):
                score.append(bd.div.select('span')[1].text)
                assess.append(bd.div.select('span')[3].text)
                print(bd.div.select('span')[3].text)


def saveListCSV(fileName, aList):
    try:
        with open(fileName, 'w', newline='') as fp:
            writer = csv.writer(fp)
            writer.writerow(["排名", "电影名称", "评分", "评价数", "URL"])
            for item in aList:
                writer.writerow(item)
            print('{0}保存成功!共{1}条记录'.format(fileName, len(aList)))
    except IOError as err:
        print(fileName, '文件创建错误:', err)


def allLsit(aList):
    for i in range(len(rank)):
        aList.append([rank[i], names[i], score[i], assess[i], link[i]])
    return aList


if __name__ == "__main__":
    for i in range(1, 11):
        url = "https://movie.douban.com/top250?start=" + str((int(i) - 1) * 25) + "&filter="
        Html = getHMLText(url)
        getContent(Html)
        getScoreAndassess(Html)
        aList = allLsit(aList)
    print(aList)
    data = aList[1125:]
    saveListCSV('./movie.csv', data)
```

说明:allLsit 在循环内每抓取一页就调用一次,每次都会把此前已有的全部记录重新追加到 aList 中,因此循环结束后 aList 共有 25×(1+2+…+10)=1375 条记录;其中最后 250 条才是完整且不重复的 TOP250,所以用 aList[1125:] 取出后再写入文件。