9.3.2 Web Crawler

Web crawlers are commonly used to fetch pages or files of interest from the internet; combined with data processing and analysis techniques, the collected data can yield deeper insights. The code below implements a simple crawler that extracts all links from a specified page, and lets you supply filter keywords and a crawl depth.
import os
import re
import urllib.request as lib

def craw_links(url, depth, keywords, processed):
    '''
    :param url:       the URL to crawl
    :param depth:     crawl depth
    :param keywords:  tuple of keywords to filter pages by
    :param processed: list of URLs that have already been crawled
    :return:
    '''
    if url.startswith(('http://', 'https://')):
        if url not in processed:
            # mark this url as processed
            processed.append(url)
        else:
            # avoid processing the same url again
            return

        print('Crawling ' + url + '...')
        fp = lib.urlopen(url)                        # send a request to the url

        # Python 3 returns bytes, so we need to decode
        contents_decoded = fp.read().decode('utf-8')
        fp.close()                                   # the page text has now been read

        pattern = '|'.join(keywords)

        # if this page contains any of the keywords, save it to a file
        searched = None
        flag = False
        if pattern:
            # use a regex to match the keywords against the returned page text
            searched = re.search(pattern, contents_decoded)
        else:
            # if no filter keywords were given, save the current page unconditionally
            flag = True

        if flag or searched:
            filename = url.replace(':', '_').replace('/', '_')
            with open(os.path.join('craw', filename), 'w', encoding='utf-8') as out:
                out.write(contents_decoded)

        # find all the links in the current page
        links = re.findall('href="(.*?)"', contents_decoded)

        # crawl all links found in the current page
        for link in links:
            # resolve relative paths against the current url
            if not link.startswith(('http://', 'https://')):
                try:
                    index = url.rindex('/')
                    link = url[0:index + 1] + link
                except ValueError:
                    pass
            if depth > 0 and link.endswith(('.htm', '.html')):
                craw_links(link, depth - 1, keywords, processed)

if __name__ == '__main__':
    processed = []
    keywords = ('datetime', 'KeyWord2')
    if not os.path.isdir('craw'):
        os.mkdir('craw')
    craw_links(r'https://docs.python.org/3/library/index.html', 1, keywords, processed)
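Two parts of this crawler are quick heuristics: the regex href="(.*?)" misses single-quoted attributes, and the rindex('/') trick mishandles relative paths such as ../page.html. A minimal sketch of a more robust alternative using only the standard library's html.parser and urllib.parse.urljoin follows; the LinkExtractor class and extract_links helper are illustrative names, not part of the original code.

from html.parser import HTMLParser
from urllib.parse import urljoin

class LinkExtractor(HTMLParser):
    '''Collects the href value of every <a> tag fed to the parser.'''
    def __init__(self):
        super().__init__()
        self.links = []

    def handle_starttag(self, tag, attrs):
        # attrs is a list of (name, value) pairs for the tag's attributes
        if tag == 'a':
            for name, value in attrs:
                if name == 'href' and value:
                    self.links.append(value)

def extract_links(base_url, html_text):
    '''Return the absolute URLs of all links found in html_text.'''
    parser = LinkExtractor()
    parser.feed(html_text)
    # urljoin resolves relative paths (including '../x.html') against base_url
    return [urljoin(base_url, link) for link in parser.links]

With such a helper, the re.findall call and the whole relative-path branch inside craw_links could collapse to a single line: links = extract_links(url, contents_decoded).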
Original source: https://www.cnblogs.com/avention/p/8986368.html