
Scraping Douban Book Data with Scrapy and Writing It to MySQL


Project repo: BookSpider

Overview: this post walks through crawling all books under a Douban category and writing the results to MySQL. Environment: Python 3.6, Scrapy, Twisted, and MySQLdb; the demo code follows.

1. Create the project

```
scrapy startproject BookSpider            # create the project
scrapy genspider douban book.douban.com   # create the Douban spider
```

2. Create a test entry point (main.py)

```python
from scrapy.cmdline import execute

execute(['scrapy', 'crawl', 'douban'])
```

3. Adjust the settings (BookSpider/settings.py)

```python
USER_AGENT = 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36'  # pose as a browser
ROBOTSTXT_OBEY = False  # do not obey Douban's robots.txt
```

4. Set the category to crawl (spiders/douban.py)

```python
start_urls = ['https://book.douban.com/tag/neuralnetwork']  # for testing, crawl only the neural-network tag
```

5. Fetch the book list from the category pages (spiders/douban.py)

```python
from scrapy.http import Request
from urllib.parse import urljoin

def parse(self, response):
    get_nodes = response.xpath('//div[@id="subject_list"]/ul/li/div[@class="pic"]/a')
    for node in get_nodes:
        url = node.xpath("@href").get()
        img_url = node.xpath('img/@src').get()
        # pass img_url along in meta; parse_book is the callback that parses the detail page
        yield Request(url=url, meta={"img_url": img_url}, callback=self.parse_book)
    # follow the pagination to the next listing page
    next_url = response.xpath('//div[@class="paginator"]/span[@class="next"]/a/@href').get()
    if next_url:
        yield Request(url=urljoin(response.url, next_url), callback=self.parse)
```

6. Define the data model (BookSpider/items.py)

```python
class BookspiderItem(scrapy.Item):
    # define the fields for your item here
    name = scrapy.Field()
    author = scrapy.Field()
    publish = scrapy.Field()
    page_num = scrapy.Field()
    isbm = scrapy.Field()
    binding = scrapy.Field()
    publish_date = scrapy.Field()
    price = scrapy.Field()
    rate = scrapy.Field()
    img_url = scrapy.Field()
    image_path = scrapy.Field()
```

7. Extract the book details (spiders/douban.py)

```python
import re

from BookSpider.items import BookspiderItem

def parse_book(self, response):
    BookItem = BookspiderItem()
    # title selector: the original line was truncated, so this XPath is an assumption
    BookItem['name'] = response.xpath('//div[@id="wrapper"]/h1/span/text()').get("").strip()
    BookItem['author'] = response.xpath('//span[contains(text(),"作者")]/following-sibling::a[1]/text()').get("").split()[-1]
    BookItem['publish'] = response.xpath('//span[contains(text(),"出版社")]/following-sibling::text()').get("").strip()
    page_num = response.xpath('//span[contains(text(),"页数")]/following-sibling::text()').get("").strip()
    BookItem['page_num'] = 0 if page_num == '' else page_num
    BookItem['isbm'] = response.xpath('//span[contains(text(),"ISBN")]/following-sibling::text()').get("").strip()
    BookItem['binding'] = response.xpath('//span[contains(text(),"装帧")]/following-sibling::text()').get("").strip()
    BookItem['publish_date'] = response.xpath('//span[contains(text(),"出版年")]/following-sibling::text()').get("").strip()
    price = response.xpath('//span[contains(text(),"定价")]/following-sibling::text()').get("").strip()
    BookItem['price'] = '' if len(price) == 0 else re.findall(r'\d+\.?\d*', price)[0]
    BookItem['rate'] = response.xpath('//div[contains(@class,"rating_self")]/strong/text()').get("").strip()
    BookItem['img_url'] = [response.meta.get('img_url')]  # the image field must be a list
    yield BookItem
```

8. Download the cover images

1. Create an images directory.

2. Configure BookSpider/settings.py:

```python
import os

ITEM_PIPELINES = {
    'BookSpider.pipelines.ImageStorePipeline': 1,  # pipelines with smaller numbers run first
}
IMAGES_URLS_FIELD = "img_url"  # must match the item field holding the image URLs
IMAGES_STORE = os.path.join(os.path.abspath(os.path.dirname(__file__)), 'images')
```

3. Create the ImageStorePipeline class (BookSpider/pipelines.py):

```python
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
from scrapy.http import Request

class ImageStorePipeline(ImagesPipeline):
    default_headers = {
        'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'accept-encoding': 'gzip, deflate, br',
        'accept-language': 'zh-CN,zh;q=0.9',
        'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',  # required, or the image server rejects the request
    }

    def get_media_requests(self, item, info):
        for image_url in item['img_url']:
            self.default_headers['referer'] = image_url
            yield Request(image_url, headers=self.default_headers)

    def item_completed(self, results, item, info):
        image_path = [x['path'] for ok, x in results if ok]
        if not image_path:
            raise DropItem("Item contains no images")
        item['image_path'] = image_path
        return item
```
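For reference, the `results` argument that Scrapy passes to `item_completed` is a list of `(success, info)` pairs, one per image request, where successful entries carry the stored path. A sketch of its shape, with illustrative values:

```python
# Shape of `results` as ImagesPipeline delivers it to item_completed;
# the URL, path, and checksum below are made-up examples.
results = [
    (True, {
        'url': 'https://img1.doubanio.com/view/subject/m/public/s12345.jpg',
        'path': 'full/0a1b2c3d4e5f6789.jpg',  # relative to IMAGES_STORE
        'checksum': 'd41d8cd98f00b204e9800998ecf8427e',
    }),
]
```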
9. Write to the database

1. Configure BookSpider/settings.py:

```python
# database settings
MYSQL_HOST = ""
MYSQL_DBNAME = ""
MYSQL_USER = ""
MYSQL_PASSWORD = ""

ITEM_PIPELINES = {
    'BookSpider.pipelines.ImageStorePipeline': 1,
    'BookSpider.pipelines.MysqlTwistedPipeline': 30,
}
```

2. Create the MysqlTwistedPipeline class (BookSpider/pipelines.py):

```python
import MySQLdb
import MySQLdb.cursors
from twisted.enterprise import adbapi

class MysqlTwistedPipeline(object):
    def __init__(self, dbpool):
        self.dbpool = dbpool

    # class method: Scrapy calls from_settings before instantiating the
    # pipeline, which is how self.dbpool gets its value
    @classmethod
    def from_settings(cls, settings):
        dbpool = adbapi.ConnectionPool(
            "MySQLdb",
            host=settings['MYSQL_HOST'],
            db=settings['MYSQL_DBNAME'],
            user=settings['MYSQL_USER'],
            passwd=settings['MYSQL_PASSWORD'],
            charset='utf8',
            cursorclass=MySQLdb.cursors.DictCursor,
            use_unicode=True,
        )
        return cls(dbpool)

    def process_item(self, item, spider):
        query = self.dbpool.runInteraction(self.do_insert, item)
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        insert_sql = """
            insert into douban(name, author, publish, page_num, isbm, binding,
                publish_date, price, rate, img_url, image_path)
            values (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)
        """
        # img_url and image_path are lists, so store their first element
        cursor.execute(insert_sql, (
            item['name'], item['author'], item['publish'], item['page_num'],
            item['isbm'], item['binding'], item['publish_date'], item['price'],
            item['rate'], item['img_url'][0], item['image_path'][0],
        ))

    def handle_error(self, failure, item, spider):
        print(failure)
```

10. Test

Run the main.py file and check that rows appear in the database.
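One prerequisite worth noting: the insert statement in `do_insert` assumes a `douban` table already exists. A minimal one-off creation script is sketched below; the column types and the connection credentials are assumptions, so adjust them to your own setup.

```python
import MySQLdb

# One-off helper: create the `douban` table targeted by MysqlTwistedPipeline.
# Column names mirror the insert statement above; the types are assumptions.
DDL = """
CREATE TABLE IF NOT EXISTS douban (
    id INT AUTO_INCREMENT PRIMARY KEY,
    name VARCHAR(255),
    author VARCHAR(255),
    publish VARCHAR(255),
    page_num VARCHAR(16),
    isbm VARCHAR(32),
    binding VARCHAR(64),
    publish_date VARCHAR(64),
    price VARCHAR(32),
    rate VARCHAR(16),
    img_url VARCHAR(512),
    image_path VARCHAR(512)
) DEFAULT CHARSET = utf8
"""

conn = MySQLdb.connect(host="127.0.0.1", db="books",  # hypothetical credentials
                       user="root", passwd="", charset="utf8")
conn.cursor().execute(DDL)
conn.commit()
conn.close()
```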