事先声明一点:这与人品无关,只是朋友发来一段 Python 源码,在这里分享给大家。
import os
from multiprocessing.dummy import Pool as ThreadPool

import requests
from lxml import etree
from lxml import html

# Seconds to wait on any single HTTP request, so a stalled connection cannot
# block a worker thread forever (requests has no timeout by default).
REQUEST_TIMEOUT = 30

# Failures that mean "this page or image could not be fetched, parsed or
# saved". They are reported and skipped instead of aborting the whole run.
_FETCH_ERRORS = (
    requests.RequestException,
    etree.LxmlError,
    IndexError,
    ValueError,
    OSError,
)


def header(referer):
    """Return the HTTP headers sent with an image download request.

    Args:
        referer: Value placed in the ``Referer`` header.

    Returns:
        A dict of header names to header values.
    """
    headers = {
        'Host': 'i.meizitu.net',
        'Pragma': 'no-cache',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'Cache-Control': 'no-cache',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_5) AppleWebKit/537.36 (KHTML, like Gecko) '
                      'Chrome/59.0.3071.115 Safari/537.36',
        'Accept': 'image/webp,image/apng,image/*,*/*;q=0.8',
        'Referer': '{}'.format(referer),
    }
    return headers


def _fetch_tree(url):
    """Download *url* and return it parsed as an lxml HTML tree.

    Raises:
        requests.RequestException: On network failure, timeout or a 4xx/5xx
            response.
        lxml.etree.LxmlError: If the response body cannot be parsed.
    """
    response = requests.get(url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    return html.fromstring(response.content)


def getPage(pageNum):
    """Fetch one listing page and return the detail-page links found on it.

    Each link is also printed as it is collected.

    Args:
        pageNum: Page number of the listing, as an int or numeric string.

    Returns:
        A list of detail-page URLs (empty if the page has no entries).
    """
    baseUrl = 'http://www.mzitu.com/page/{}'.format(pageNum)
    selector = _fetch_tree(baseUrl)
    urls = list(selector.xpath('//ul[@id="pins"]/li/a/@href'))
    for link in urls:
        print(link)
    return urls


def getPiclink(url):
    """Download every image of one album into a folder named after it.

    The folder is created in the current working directory and named
    ``【<count>P】<title>``. Images are numbered 1.jpg, 2.jpg, ... in download
    order; a page that fails is reported and skipped without consuming a
    number.

    Args:
        url: URL of the album's detail page.

    Returns:
        None. An album whose detail page cannot be fetched or parsed, or whose
        folder cannot be created, is reported and skipped.
    """
    try:
        sel = _fetch_tree(url)
        # Total number of images, read from the pagination bar.
        total = sel.xpath('//div[@class="pagenavi"]/a[last()-1]/span/text()')[0]
        count = int(total)
        # Album title.
        title = sel.xpath('//h2[@class="main-title"]/text()')[0]
        # Folder name; path separators in the title would otherwise be read
        # as nested directories and make the folder creation fail.
        dirName = u"【{}P】{}".format(total, title).replace('/', '_').replace('\\', '_')
        # exist_ok lets a re-run reuse the folder instead of crashing.
        os.makedirs(dirName, exist_ok=True)
    except _FETCH_ERRORS as err:
        print(u'无法处理图集,已跳过:%s (%s)' % (url, err))
        return

    n = 1
    for page in range(1, count + 1):
        # Each image sits on its own sub-page: <album url>/<page number>.
        link = '{}/{}'.format(url, page)
        try:
            s = _fetch_tree(link)
            # The image address is in the img tag's src attribute.
            jpgLink = s.xpath('//div[@class="main-image"]/p/a/img/@src')[0]
            print(u'开始下载图片:%s 第%s张' % (dirName, n))
            # Fetch before opening the file so a failed download leaves no
            # empty or bogus .jpg behind.
            response = requests.get(
                jpgLink, headers=header(jpgLink), timeout=REQUEST_TIMEOUT)
            response.raise_for_status()
            # Output path: current directory / album folder / file name.
            filename = os.path.join(os.path.abspath('.'), dirName, '%s.jpg' % n)
            with open(filename, 'wb') as jpg:
                jpg.write(response.content)
        except _FETCH_ERRORS as err:
            print(u'下载失败,已跳过:%s (%s)' % (link, err))
            continue
        n += 1


def main():
    """Ask for a listing page number and download its albums on 4 threads."""
    pageNum = input(u'请输入页码:')
    p = getPage(pageNum)
    with ThreadPool(4) as pool:
        pool.map(getPiclink, p)


if __name__ == '__main__':
    main()
至于爬取出来的效果图,我就不发布了。
网络爬虫(爬取网站图片,自动保存本地)
原文地址:https://www.cnblogs.com/chenyanlong/p/9124217.html