Urllib2 Basic Operations
1. Opening a web page (urlopen)
Open a web page:
import urllib2

response = urllib2.urlopen('http://www.baidu.com')
html = response.read()
print html
urlopen commonly takes three parameters:

urllib2.urlopen(url, data, timeout)
Using the data parameter (GET)
import urllib
import urllib2

# uri is the target address; for a GET the encoded data is appended as a query string
data = {'email': 'myemail', 'password': 'password'}
params = urllib.urlencode(data)
response = urllib2.urlopen("%s?%s" % (uri, params))
code = response.getcode()
Using the data parameter (POST)
import urllib
import urllib2

data = {'email': 'myemail', 'password': 'password'}
params = urllib.urlencode(data)
# passing the encoded data as the second argument makes this a POST
response = urllib2.urlopen(uri, params)
code = response.getcode()
So when we pass the data parameter, the request is sent as a POST; without it, urlopen issues a GET, as the sketch below shows side by side.
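A minimal sketch of that rule in a single place, assuming a hypothetical login endpoint (http://example.com/login is a placeholder, not from the original article):

import urllib
import urllib2

# hypothetical endpoint, for illustration only
url = 'http://example.com/login'
params = urllib.urlencode({'email': 'myemail', 'password': 'password'})

# with data: urlopen sends a POST
post_response = urllib2.urlopen(url, params)

# without data: the same call issues a GET
get_response = urllib2.urlopen('%s?%s' % (url, params))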
Using the timeout parameter
When the network is poor or the server misbehaves, requests can hang, so set a timeout:
import urllib2

response = urllib2.urlopen('http://www.baidu.com', timeout=1)
print(response.read())
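When the timeout expires, urllib2 raises an exception rather than returning. A minimal sketch of handling it (the tiny timeout value is only there to force the error):

import socket
import urllib2

try:
    response = urllib2.urlopen('http://www.baidu.com', timeout=0.01)
    print(response.read())
except urllib2.URLError as e:
    # a connect timeout is wrapped in URLError with socket.timeout as the reason
    if isinstance(e.reason, socket.timeout):
        print('request timed out')
    else:
        raise
except socket.timeout:
    # a timeout while reading the body is raised directly
    print('read timed out')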
2. Opening a web page (Request)
Open a web page (note this snippet uses Python 3, where urllib2 was merged into urllib.request; the rest of this article uses the Python 2 API):
import urllib.request

request = urllib.request.Request('https://www.baidu.com')
response = urllib.request.urlopen(request)
print(response.read().decode('utf-8'))
Specifying request headers
import urllib2

# specify the request headers
headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64)"}
# wrap the URL and headers in a Request object (url is the target address)
request = urllib2.Request(url=url, headers=headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
print content
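Besides setting request headers, the response object exposes the server's reply metadata; a short sketch of the standard accessors:

import urllib2

response = urllib2.urlopen('http://www.baidu.com')
print response.getcode()  # HTTP status code, e.g. 200
print response.geturl()   # final URL after any redirects
print response.info()     # the response headers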
3. Advanced usage
Adding a proxy
import urllib2

# custom headers
headers = {
    'Host': 'www.dianping.com',
    'Cookie': 'JSESSIONID=F1C38C2F1A7F7BF3BCB0C4E3CCDBE245 aburl=1; cy=2;',
    'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
}

# route requests through an HTTP proxy (replace host:port with a real proxy)
proxy_handler = urllib2.ProxyHandler({'http': 'http://host:port'})
opener = urllib2.build_opener(proxy_handler)
urllib2.install_opener(opener)

request = urllib2.Request(url, headers=headers)
response = urllib2.urlopen(request)
content = response.read().decode('utf-8')
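Note that install_opener() makes the proxy global for every later urllib2.urlopen() call. If you only want it for some requests, a sketch of the scoped alternative:

import urllib2

proxy_handler = urllib2.ProxyHandler({'http': 'http://host:port'})
opener = urllib2.build_opener(proxy_handler)

# opener.open() uses the proxy only through this opener,
# leaving the global urllib2.urlopen() behavior untouched
response = opener.open('http://www.baidu.com')
print response.getcode()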
Working with cookies
import urllib2
import cookielib

cookie = cookielib.CookieJar()
cookie_s = urllib2.HTTPCookieProcessor(cookie)  # create the cookie handler
opener = urllib2.build_opener(cookie_s)  # build the opener
urllib2.install_opener(opener)
response = urllib2.urlopen('http://www.dianping.com').read()  # fetch the page
print response  # page HTML

# inspect the cookies collected during the request
print cookie, type(cookie)
for item in cookie:
    print 'name:' + item.name + '-value:' + item.value
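Each entry in the jar is a cookielib.Cookie, which carries more than a name and a value; a quick sketch of some of its other attributes:

import urllib2
import cookielib

cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
opener.open('http://www.dianping.com')

for item in cookie:
    # each cookie records its domain/path scope, expiry timestamp,
    # and whether the Secure flag was set
    print item.domain, item.path, item.expires, item.secure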
Saving cookies
def saveCookie():
    # file to save the cookies in
    filename = 'cookie.txt'
    # a MozillaCookieJar collects cookies and can write them to a file
    cookie = cookielib.MozillaCookieJar(filename)
    # create the cookie handler
    handler = urllib2.HTTPCookieProcessor(cookie)
    # build the opener
    opener = urllib2.build_opener(handler)
    # make the request
    res = opener.open('http://www.baidu.com')
    # save the cookies to the file
    # ignore_discard: save cookies even if they are marked to be discarded
    # ignore_expires: write cookies even if they already exist in the file, overwriting it
    cookie.save(ignore_discard=True, ignore_expires=True)
Reading cookies back from the file
def getCookie():
    # create a MozillaCookieJar object
    cookie = cookielib.MozillaCookieJar()
    # load the cookie contents from the file into the jar
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    # print the cookies to confirm they were loaded
    for item in cookie:
        print 'name:' + item.name + '-value:' + item.value
    # build an opener that sends the loaded cookies
    handler = urllib2.HTTPCookieProcessor(cookie)
    opener = urllib2.build_opener(handler)
    res = opener.open('http://www.baidu.com')
    print res.read()
A complete example
def my_cookie_test():
    headers = {
        'User-Agent': "Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US) AppleWebKit/532.5 (KHTML, like Gecko) Chrome/4.0.249.0 Safari/532.5",
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6,zh-TW;q=0.4',
        'Connection': 'keep-alive',
        'Cookie': 'cy=2; _lxsdk_cuid=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk=16000a1a16cc8-0629d2ca3b9f7-40544230-100200-16000a1a16dc8; _lxsdk_s=16000a1a16f-c56-870-2aa%7C%7C23; _hc.v=44792549-7147-7394-ac0a-eefed1fa19a2.1511839081; s_ViewType=10',
        'Host': 'www.dianping.com',
        'Referer': 'http://www.dianping.com/shop',
        'Upgrade-Insecure-Requests': '1'
    }
    # cookie jar for the request
    cj_a = cookielib.CookieJar()
    cj_s = urllib2.HTTPCookieProcessor(cj_a)
    proxy_s = urllib2.ProxyHandler({'http': '0.0.0.0:8080'})
    opener = urllib2.build_opener(proxy_s, cj_s)
    urllib2.install_opener(opener)
    try:
        request = urllib2.Request("http://www.dianping.com/shop/000000/", headers=headers)
        response = urllib2.urlopen(request)
        content = response.read().decode('utf-8')
        # page HTML
        print content
        cookie_data = {}
        for item in cj_a:
            # print 'after the request: name:' + item.name + '-value:' + item.value
            cookie_data[item.name] = item.value
        cookie_str = json.dumps(cookie_data)
        with open('cookie.txt', 'w') as f:
            f.write(cookie_str)
        print("cookies saved to local file")
    except Exception as e:
        print e
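Since this example stores the cookies as JSON (not the Netscape format that MozillaCookieJar reads), loading them back means rebuilding a Cookie header by hand; a sketch under that assumption:

import json
import urllib2

# read the name/value pairs saved by my_cookie_test()
with open('cookie.txt') as f:
    cookie_data = json.load(f)

# rebuild a Cookie header string from the saved pairs
cookie_str = '; '.join('%s=%s' % (k, v) for k, v in cookie_data.items())
request = urllib2.Request('http://www.dianping.com/shop',
                          headers={'Cookie': cookie_str})
response = urllib2.urlopen(request)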
Web page information extraction... to be covered in the next installment...
Original article: http://www.cnblogs.com/shangpolu/p/7929272.html