# -*- coding: utf-8 -*-
"""Discover a site's sitemap through robots.txt and list the URLs it contains."""
from typing import List, Optional

import requests
from lxml import etree

# Seconds to wait for the server; without a timeout requests.get can block
# indefinitely on an unresponsive host.
REQUEST_TIMEOUT = 10


def _extract_sitemap_url(robots_text):
    """Return the URL of the last ``Sitemap:`` entry in *robots_text*, or None.

    Each line is inspected on its own, so text that follows the entry on
    later lines is never returned as part of the URL. Anything after a ``#``
    is treated as a robots.txt comment and ignored. The field name is matched
    case-insensitively.
    """
    found = None
    for raw_line in robots_text.splitlines():
        line = raw_line.split("#", 1)[0]
        # Split on the first colon only: the URL itself contains "://".
        key, sep, value = line.partition(":")
        value = value.strip()
        if sep and value and key.strip().lower() == "sitemap":
            # Keep scanning so the last entry wins, as the original
            # ``split("Sitemap:")[-1]`` did.
            found = value
    return found


def get_sitemapinfo(robots_url: str) -> Optional[str]:
    """Fetch robots.txt and return the sitemap URL it declares.

    Example result:
    ``https://www.qidian.com/newsitemap2/pcfixedsitemap.xml``

    :param robots_url: address of the site's robots.txt file
    :return: the sitemap URL, or ``None`` when robots.txt declares no sitemap
        (a notice is printed in that case)
    :raises requests.RequestException: if the HTTP request fails or times out
    """
    robots_text = requests.get(robots_url, timeout=REQUEST_TIMEOUT).text
    link = _extract_sitemap_url(robots_text)
    if link is None:
        print("当前网站的robots协议不包含Sitemap")
    return link


def get_links(sitemap_url: str, rule: str) -> List[str]:
    """Download the sitemap and return everything matched by an XPath rule.

    Example result::

        ['https://www.qidian.com/all_pub/chanId13700/',
         'https://www.qidian.com/all_pub/chanId14100/',
         'https://www.qidian.com/all_pub/chanId14400/']

    :param sitemap_url: address of the sitemap document
    :param rule: XPath expression applied to the parsed document
    :return: the XPath result (a list of strings for a ``text()`` rule);
        an empty list when the response body cannot be parsed into a tree
    :raises requests.RequestException: if the HTTP request fails or times out
    """
    response = requests.get(sitemap_url, timeout=REQUEST_TIMEOUT)
    # Bytes rather than str are handed to lxml; presumably this avoids lxml
    # rejecting str input that carries an XML encoding declaration.
    tree = etree.HTML(response.text.encode("UTF-8"))
    if tree is None:
        # etree.HTML yields None for an empty document; report "no links"
        # instead of failing on ``None.xpath``.
        return []
    return tree.xpath(rule)


if __name__ == "__main__":
    # Script entry point.
    # Address of robots.txt.
    url = "https://www.qidian.com/robots.txt"
    sitemap_url = get_sitemapinfo(robots_url=url)
    # Only fetch links when a sitemap was found; get_sitemapinfo has already
    # printed a notice otherwise.
    if sitemap_url is not None:
        links = get_links(sitemap_url=sitemap_url, rule="//url/loc/text()")
        print(f"links:{links}")
