import urllib2
import re
import os


def process_item(self, item, spider):
    """Download the image at ``item['addr']`` with urllib2 and save it as a JPEG.

    The file is written to ``./down_pic/<prefix>_<picCount>/<name>.jpg``, where
    ``<prefix>`` is the part of ``item['name']`` before the first underscore,
    and the saved path is appended to ``./savePath.txt``.

    ``item['addr']`` is handed to urllib2 unchanged (no URL escaping); this is
    the version that fails with ``HTTP Error 400: Bad Request`` for some
    addresses.

    Args:
        item: mapping that provides the keys ``addr``, ``name`` and
            ``picCount``.
        spider: not used by this function.

    Raises:
        urllib2.HTTPError: if the server answers with an HTTP error status.
    """
    headers = {
        "Host": 'img31.mtime.cn',
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:53.0) Gecko/20100101 Firefox/53.0',
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        "Accept-Language": 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        "Accept-Encoding": 'gzip, deflate',
        "Connection": 'keep-alive',
        "Upgrade-Insecure-Requests": "1",
    }

    req = urllib2.Request(url=item['addr'], headers=headers)
    res = urllib2.urlopen(req)
    try:
        # Read the whole body first so a failed download leaves no empty file.
        data = res.read()
    finally:
        # Always release the underlying connection.
        res.close()

    saveFilePath = os.path.join(
        os.path.curdir,
        "down_pic",
        item['name'].split("_")[0] + "_" + str(item["picCount"]),
    )
    if not os.path.exists(saveFilePath):
        # os.mkdir() can only create a single directory level, so use
        # os.makedirs() to create the intermediate "down_pic" folder as well.
        os.makedirs(saveFilePath)

    file_name = os.path.join(saveFilePath, item['name'] + '.jpg')
    with open(file_name, 'wb') as fp:
        fp.write(data)
    with open("./savePath.txt", "a") as fh:
        fh.write(file_name + "\n")
    # NOTE(review): nothing is returned; if this is a Scrapy item pipeline,
    # confirm whether ``return item`` was omitted on purpose.
用urllib2抓取图片的时候,部分request报HTTP Error 400: Bad Request
参照:https://stackoverflow.com/questions/8840303/urllib2-http-error-400-bad-request?answertab=active#tab-top
because you aren't escaping the string for a URL.
改用requests成功解决问题。
import os
import requests


def process_item(self, item, spider):
    """Download the image at ``item['addr']`` with requests and save it as a JPEG.

    The file is written to ``./down_pic/<prefix>_<picCount>/<name>.jpg``, where
    ``<prefix>`` is the part of ``item['name']`` before the first underscore,
    and the saved path is appended to ``./savePath.txt``.

    Args:
        item: mapping that provides the keys ``addr``, ``name`` and
            ``picCount``.
        spider: not used by this function.

    Raises:
        requests.HTTPError: if the server answers with an HTTP error status.
    """
    headers = {
        "Host": 'img31.mtime.cn',
        "User-Agent": 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.10; rv:53.0) Gecko/20100101 Firefox/53.0',
        "Accept": 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        "Accept-Language": 'zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3',
        "Accept-Encoding": 'gzip, deflate',
        "Connection": 'keep-alive',
        "Upgrade-Insecure-Requests": "1",
    }

    res = requests.get(item['addr'], headers=headers)
    # Fail loudly on an HTTP error instead of saving the error page as a .jpg.
    res.raise_for_status()

    saveFilePath = os.path.join(
        os.path.curdir,
        "down_pic",
        item['name'].split("_")[0] + "_" + str(item["picCount"]),
    )
    if not os.path.exists(saveFilePath):
        # os.mkdir() can only create a single directory level, so use
        # os.makedirs() to create the intermediate "down_pic" folder as well.
        os.makedirs(saveFilePath)

    file_name = os.path.join(saveFilePath, item['name'] + '.jpg')
    with open(file_name, 'wb') as fp:
        # res.content is the raw (binary) response body.
        fp.write(res.content)
    with open("./savePath.txt", "a") as fh:
        fh.write(file_name + "\n")
    # NOTE(review): nothing is returned; if this is a Scrapy item pipeline,
    # confirm whether ``return item`` was omitted on purpose.
urllib2.HTTPError: HTTP Error 400: Bad Request
原文地址:http://www.cnblogs.com/v-BigdoG-v/p/7436402.html