
Scraping Product Information from Xiaomi Youpin

Date: 2023-03-26 13:03:02 | Category: Python

Description: this script scrapes Xiaomi Youpin. Put the two links together, run it once, and everything gets fetched (around 700 items). It uses the selenium + chrome + lxml combination (it is also fast, since there is only one page per link).

Output: the program generates three files, two csv and one xls. The csv files are small and widely compatible: data_mi.csv uses utf-8 encoding, data_mi-gbk.csv uses gbk encoding, and the xls file is the Excel format (gbk is a Chinese encoding, so that file can only be opened with Excel; utf-8 is Python's default encoding and can be opened with specialized tools).

Here is the selenium-based code:

from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.chrome.options import Options
from lxml import etree
import csv
import xlwt
import time


class Spider:
    def __init__(self):
        self.runtime = None
        self.url = [
            'https://www.xiaomiyoupin.com/goodsbycategory?firstId=115&secondId=115&title=%E5%AE%B6%E7%94%A8%E7%94%B5%E5%99%A8&spmref=YouPinPC.$Home$.list.0.90827029',
            'https://www.xiaomiyoupin.com/goodsbycategory?firstId=116&secondId=116&title=%E6%99%BA%E8%83%BD%E5%AE%B6%E5%BA%AD&spmref=YouPinPC.$Home$.list.0.93586205',
        ]
        self.csvfilename = 'data_mi.csv'
        self.csvfilenamegbk = 'data_mi-gbk.csv'
        chrome_options = Options()
        chrome_options.add_argument('--headless')  # run Chrome in headless mode
        self.browser = webdriver.Chrome(chrome_options=chrome_options)
        self.wait = WebDriverWait(self.browser, 20)

    def run(self):
        '''Entry point.'''
        start = time.time()
        # the Aug 2 link
        for item in self.parse_page(self.get_page(self.url[0])):
            self.save_data(item)
        # the Aug 1 link
        for item in self.parse_page(self.get_page(self.url[1])):
            self.save_data(item)
        self.u8togbk(self.csvfilename, self.csvfilenamegbk)
        end = time.time()
        self.runtime = end - start

    def get_page(self, url):
        '''Request the page.'''
        self.browser.get(url)
        self.wait.until(EC.presence_of_element_located(
            (By.XPATH, '//*[@id="root"]/div/div[3]/div/div[2]/div/div[1]/div[1]/img')))
        # simulate scrolling down so lazy-loaded items appear
        for i in range(50):
            js_to_buttom = "window.scrollBy(0,1000)"
            self.browser.execute_script(js_to_buttom)
            time.sleep(0.05)
        # wait for the page to finish loading
        time.sleep(5)
        return self.browser.page_source

    def parse_page(self, text):
        '''Parse the page.'''
        html = etree.HTML(text)
        for index in range(2, 17):
            classes = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/h2/text()'.format(index))[0]
            names = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[1]/text()'.format(index))
            introduces = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[2]/text()'.format(index))
            prices = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/p[3]/span[2]/text()'.format(index))
            imgs = html.xpath('//*[@id="root"]/div/div[3]/div/div[{}]/div/div/div[1]/img/@src'.format(index))
            # sanity check: every field list for this category should have the same length
            if len(names) != len(introduces) != len(prices) != len(imgs):
                raise Exception
            print(len(names), len(introduces), len(prices), len(imgs))
            for i in range(len(names)):
                yield [classes, names[i], introduces[i], prices[i], imgs[i]]

    def save_data(self, item):
        '''Append one row to the utf-8 csv file.'''
        with open(self.csvfilename, 'a', encoding='utf-8', newline='') as csvfile:
            print('item>>>', item)
            writer = csv.writer(csvfile)
            writer.writerow(item)

    def u8togbk(self, infn, outfn):
        '''Re-encode the utf-8 csv as gbk, skipping rows gbk cannot represent.'''
        with open(infn, 'r', encoding='utf-8') as f:
            reader = csv.reader(f)
            results = list(reader)
        with open(outfn, 'w', encoding='gbk', newline='') as f:
            writer = csv.writer(f)
            for result in results:
                try:
                    writer.writerow(result)
                except Exception:
                    pass

    def mkxls(self, out_filename):
        '''Convert the csv file to an xls file.'''
        def csv_to_xlsx(csvfile, outfile):
            '''
            :param csvfile: str
            :param outfile: str
            :return: None
            '''
            with open(csvfile) as fc:
                r_csv = csv.reader(fc)
                workbook = xlwt.Workbook()
                sheet = workbook.add_sheet('sheet1')  # create the worksheet
                i = 0
                j = 0
                for line in r_csv:
                    j = 0
                    for v in line:
                        sheet.write(i, j, v)
                        j = j + 1
                    i = i + 1
                workbook.save(outfile)  # save the Excel file

        csv_to_xlsx(self.csvfilenamegbk, out_filename)

    @property
    def time(self):
        return 'Total time spent: {} seconds'.format(self.runtime)


if __name__ == '__main__':
    spider = Spider()
    spider.run()  # run the spider
    spider.mkxls('data_mi.xls')  # convert the csv to an xls file that can be opened with Excel
    print(spider.time)  # print the total running time

Output file list / output file format (screenshots).

Note: the web page is utf-8 encoded. When saving as gbk, characters that gbk does not support have to be dropped, so the utf-8 file contains roughly 2-3% more content than the gbk one.
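To see why the gbk copy loses a few rows, here is a minimal sketch (not part of the original script) of the behaviour the note above describes: a character outside the gbk code page, an emoji for instance, raises UnicodeEncodeError when written to data_mi-gbk.csv, and the try/except in u8togbk() silently drops that row. The sample string below is made up for illustration.

# Minimal sketch: a product name containing a character gbk cannot represent.
# The sample string is hypothetical.
sample = '米家智能台灯 🔥 限时特惠'

try:
    sample.encode('gbk')  # this is what writing the row to the gbk csv does internally
except UnicodeEncodeError as err:
    print('row would be dropped from data_mi-gbk.csv:', err)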
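If you want to verify the size difference between the two csv files after a run, a small helper like the one below (also not part of the original script) counts the rows in each file; the utf-8 file should end up a few percent larger than the gbk one.

import csv

def count_rows(path, encoding):
    # Count the rows in a csv file opened with the given encoding.
    with open(path, newline='', encoding=encoding) as f:
        return sum(1 for _ in csv.reader(f))

print('utf-8 rows:', count_rows('data_mi.csv', 'utf-8'))
print('gbk rows:  ', count_rows('data_mi-gbk.csv', 'gbk'))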