#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Crawl novels from quanshuwang.com and store them in a MySQL database.

For each of the nine category pages (``/map/1.html`` .. ``/map/9.html``) the
script collects the novel links, inserts one ``novel`` row per novel, then
downloads every chapter of that novel and inserts it into ``chapter``.

NOTE: importing this module opens the MySQL connection (``Sql.conn`` is a
class attribute), exactly as the original script did.
"""
import re
import socket
import urllib.request

import MySQLdb

domain = 'http://www.quanshuwang.com'
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36'
}

# Default socket timeout, in seconds, applied before the first request.
REQUEST_TIMEOUT = 30

# NOTE(review): the connection settings were redacted in the source
# (``port=x``, ``user='x'``, ``password='x'``).  The bare ``x`` for the port
# was an undefined name; 3306 is MySQL's default port.  Replace all three
# with real values before running.
DB_HOST = 'localhost'
DB_PORT = 3306
DB_USER = 'x'
DB_PASSWORD = 'x'

# Category number (used in the /map/<n>.html URL) -> category name stored
# in the ``sort`` column.
SORT_NAMES = {
    1: "玄幻魔法",
    2: "武侠修真",
    3: "历史军事",
    4: "女频言情",
    5: "侦探推理",
    6: "网络动漫",
    7: "科幻小说",
    8: "恐怖灵异",
    9: "美文同人",
}

# Compiled once at import time instead of on every call.
_NOVEL_LINK_RE = re.compile(r'<a href="(/book/.+?)" target="_blank">(.+?)</a>')
_CHAPTER_LINK_RE = re.compile(r'<li><a href="(.+?)" title="(.+?)">(.+?)</a></li>')
_CHAPTER_BODY_RE = re.compile(
    r'style5\(\);</script>(.*?)<script type="text/javascript">style6\(\)',
    re.S,
)


def _fetch_html(url, errors='strict'):
    """Download *url* and return its body decoded as GBK.

    The headers are passed through the ``Request`` constructor rather than
    by overwriting ``req.headers`` with the shared module-level dict, and the
    response is closed as soon as it has been read.
    """
    req = urllib.request.Request(url, headers=headers)
    with urllib.request.urlopen(req) as res:
        return res.read().decode('gbk', errors)


def getTypeList(type):
    """Return the novels listed on one category page.

    Args:
        type: Category number substituted into ``/map/<type>.html``.  The
            parameter name shadows the builtin but is kept so existing
            keyword callers keep working.

    Returns:
        A list of ``(href, novelname)`` tuples, where ``href`` starts with
        ``/book/``.
    """
    html = _fetch_html('http://www.quanshuwang.com/map/%s.html' % type)
    return _NOVEL_LINK_RE.findall(html)


def getNovelList(href):
    """Return the chapter links found on a novel's index page.

    Args:
        href: Site-relative path of the novel index page.

    Returns:
        A list of ``(url, title_attribute, link_text)`` tuples, one per
        ``<li><a ...>`` chapter entry.
    """
    html = _fetch_html(domain + href)
    return _CHAPTER_LINK_RE.findall(html)


def getNovelContent(url):
    """Return the raw HTML of one chapter body.

    The body is the text between the ``style5();`` and ``style6()`` script
    markers.  Undecodable GBK bytes are ignored.

    Args:
        url: Site-relative path of the chapter page.

    Raises:
        IndexError: If the page does not contain the expected markers (the
            same exception type the original ``findall(...)[0]`` raised).
    """
    page_url = domain + url
    html = _fetch_html(page_url, errors='ignore')
    print(page_url)
    match = _CHAPTER_BODY_RE.search(html)
    if match is None:
        raise IndexError('chapter body markers not found in %s' % page_url)
    return match.group(1)


class Sql(object):
    """Thin wrapper around the MySQL connection used to store the crawl."""

    # Opened once, when the class body is executed, and shared by all
    # instances.
    conn = MySQLdb.connect(
        host=DB_HOST,
        port=DB_PORT,
        user=DB_USER,
        password=DB_PASSWORD,
        db='novel',
        charset='utf8',
    )

    def addnovels(self, sort, novelname):
        """Insert a row into ``novel`` and return its auto-generated id.

        The values are passed as query parameters so that quotes in scraped
        text can neither break the statement nor inject SQL.
        """
        cur = self.conn.cursor()
        try:
            cur.execute(
                "insert into novel(sort,novelname) values(%s,%s)",
                (sort, novelname),
            )
            lastrowid = cur.lastrowid
        finally:
            # Close the cursor even when the insert fails.
            cur.close()
        self.conn.commit()
        return lastrowid

    def addchapters(self, novelid, chaptername, content):
        """Insert one chapter row for the novel identified by *novelid*.

        The values are passed as query parameters so that quotes in scraped
        text can neither break the statement nor inject SQL.
        """
        cur = self.conn.cursor()
        try:
            cur.execute(
                "insert into chapter(novelid,chaptername,content) values(%s,%s,%s)",
                (novelid, chaptername, content),
            )
        finally:
            # Close the cursor even when the insert fails.
            cur.close()
        self.conn.commit()


mysql = Sql()


def main():
    """Crawl categories 1-9 and store every novel and chapter found."""
    # Set the timeout before the first request; the original only set it
    # after the first chapter had already been downloaded.
    socket.setdefaulttimeout(REQUEST_TIMEOUT)

    for type_id in range(1, 10):
        sort = SORT_NAMES.get(type_id)
        if sort is None:
            print("请求的小说类型有误!!!")
            # Skip the category instead of continuing with an unbound or
            # stale category name.
            continue

        for href, novelname in getTypeList(type_id):
            lastrowid = mysql.addnovels(sort, novelname)
            # Each entry is (url, title attribute, link text); the link text
            # is what gets stored as the chapter name.
            for url, _title_attr, title in getNovelList(href):
                try:
                    print("正在爬取------------%s 《%s》 %s" % (sort, novelname, title))
                    content = getNovelContent(href.replace('index.html', url))
                    mysql.addchapters(novelid=lastrowid, chaptername=title, content=content)
                except Exception as e:
                    # Best effort: report the failed chapter and carry on
                    # with the next one.
                    print("连接中断,发生错误:%s !!!!" % (e,))


if __name__ == '__main__':
    main()
# 爬取小说网站整站小说内容 -《狗嗨默示录》-
# 原文地址: http://www.cnblogs.com/LiGoHi/p/7451815.html