First, install the required libraries:
gevent coroutine library: pip install gevent
selenium browser-automation library: pip install selenium
For setting up the browser driver that selenium needs, see https://www.cnblogs.com/Niuxingyu/p/10490882.html
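Before wiring selenium into the spider, it is worth a quick check that it can actually drive the browser. A minimal smoke test, assuming Chrome and a matching chromedriver are installed and discoverable on the PATH:

# smoke test -- a minimal sketch, assuming Chrome plus a matching
# chromedriver are installed and on the PATH
from selenium import webdriver

browser = webdriver.Chrome()
browser.get('http://military.cctv.com/')
# if the driver is configured correctly, this prints the page title
print(browser.title)
browser.quit()

If this raises a WebDriverException, fix the driver setup (see the link above) before moving on.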
# imports
import gevent
# monkey patch: must run before requests is imported so that
# blocking network I/O becomes cooperative
from gevent import monkey
monkey.patch_all()
import time
import requests
import os
import re
from lxml import etree
# drive a real browser to render the dynamic page
from selenium import webdriver

# global variable holding the URLs scraped from the front page
global_url_list = []

class GeventSpider(object):
    # class attribute holding the same URL list
    encloseing_url_list = []

    # fetch a page, caching the rendered HTML in a local file
    def run(self, url):
        if url == 'http://military.cctv.com/':
            file_name = 'test_cctv.html'
        else:
            file_name = 'inner_cctv.html'
        html_content = ''
        if not os.path.exists(file_name):
            # create a browser object and load the page
            browser = webdriver.Chrome()
            browser.get(url)
            # grab and decode the rendered page source
            html = browser.page_source.encode('utf-8').decode()
            time.sleep(1)
            # (leftover from an earlier requests-based version)
            # html = r.content.decode('utf-8')
            # write the file with an explicit encoding
            with open('./' + file_name, 'w', encoding='utf-8') as f:
                f.write(html)
            # close the browser
            browser.quit()
            html_content = html
        else:
            # read the cached file instead of fetching again
            with open('./' + file_name, encoding='utf-8') as f:
                content = f.read()
            html_content = content
        self.get_xpath(html_content)

    # extract the inner-page URLs from the front page
    def get_xpath(self, html):
        # parse the HTML string
        html = etree.HTML(html)
        # match the inner-page URLs
        html_data_url = html.xpath('//span[@class="l"]/a/@href')
        # declare that we are modifying the global variable
        global global_url_list
        global_url_list = html_data_url
        # also store the list on the class attribute
        self.encloseing_url_list = html_data_url

    # crawl a single inner page
    def get_inner(self, url):
        # plain HTTP request; thanks to the monkey patch this yields
        # to other greenlets while waiting for the response
        r = requests.get(url)
        html = r.content.decode('utf-8')
        # match the article title with a regex
        regex = re.compile('<h1>(.+?)</h1>', re.I)
        print(regex.findall(html))

if __name__ == "__main__":
    # instantiate the spider
    geventspider = GeventSpider()
    url_list = ['http://military.cctv.com/']
    # a single front-page request does not need a coroutine
    geventspider.run(url_list[0])
    # reassign, then crawl the fourteen inner pages concurrently; the URLs
    # are available two ways: from the class attribute, or from the global
    # variable global_url_list defined above
    url_list = geventspider.encloseing_url_list
    # url_list = global_url_list
    # list comprehension collects all spawned greenlets into a list
    job_list = [gevent.spawn(geventspider.get_inner, item) for item in url_list]
    # block until every greenlet has finished
    gevent.joinall(job_list)
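The point of monkey.patch_all() is that the fourteen requests.get() calls inside get_inner() no longer block one another: while one greenlet waits on the network, the others run. A standalone sketch of the same spawn/joinall pattern, separated from the spider (the loop count of five is an arbitrary illustrative choice):

# minimal sketch of the gevent pattern used above; the patch must
# come before requests is imported
from gevent import monkey
monkey.patch_all()

import time
import gevent
import requests

def fetch(i):
    # blocks only this greenlet; the others keep running
    r = requests.get('http://military.cctv.com/')
    print(i, r.status_code)

start = time.time()
jobs = [gevent.spawn(fetch, i) for i in range(5)]
gevent.joinall(jobs)
# with the monkey patch the five requests overlap, so the total is
# roughly one round trip rather than five
print('elapsed: %.2fs' % (time.time() - start))

This is also why the spider only uses selenium for the JavaScript-rendered front page and falls back to plain requests for the inner pages: greenlets parallelize cheap HTTP calls well, while each selenium instance is a whole browser process.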
Crawling dynamic web page data with selenium and gevent
Original article: https://www.cnblogs.com/Niuxingyu/p/10509697.html