In version 1.01, I ran into a race condition that was hard to resolve: each callback inserted into the database as soon as it found data, and there was no good way to wait for all callbacks to finish either, because new TopicSpider instances are launched from more than one place.
So I decided to separate reading the data from writing to the DB: the crawler handles the first part, and insertDB.js handles the second.
This sidesteps both the write contention and the hard problem of detecting when all of the crawler's callbacks have finished.
In insertDB.js, on the other hand, there is a workable way to wait for all the file-read callbacks: controlled parallel execution. Reading the individual files is simple enough that each read can be wrapped in a task, as sketched below.
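Here is a minimal, self-contained sketch of that completion-counter pattern; the dummy task list and the setTimeout calls are stand-ins for illustration, and insertDB.js below applies the same idea to file reads:

var tasks=[];       // one function per asynchronous job
var finished=0;     // how many callbacks have come back

function checkIfFinished(){
    finished++;
    if(finished==tasks.length){
        console.log('all '+tasks.length+' callbacks finished');
    }
}

// build three dummy asynchronous tasks; setTimeout stands in
// for any async job such as fs.readFile
for(var i=0;i<3;i++){
    tasks.push(function(){
        setTimeout(checkIfFinished,Math.random()*100);
    });
}

// fire them all; the "finished" message prints exactly once,
// after the last callback has reported in
for(var t in tasks){
    tasks[t]();
}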
Here is the crawler code:
//======================================================
// 理想论坛 (55188.com) topic crawler 1.02
// Target URL: http://www.55188.com/forum-8-1.html
// Data collection and DB insertion are now separated:
// this file finds the data and saves it to files,
// while insertDB.js handles the database insertion.
// April 16, 2018
//======================================================

// built-in http module
var http=require("http");
// for decompressing gzip pages (declared but not used in this version)
var zlib=require('zlib');
// built-in file-system module
var fs=require('fs');
// for transcoding: non-UTF-8 pages such as gb2312 would otherwise show garbled text
var iconv=require('iconv-lite');
// cheerio provides jQuery-like selectors for finding post info and the next page in the HTML
var cheerio=require("cheerio");

// total number of spiders started
var spiderCount=0;
// number of spiders that have finished
var spiderFinished=0;
// directory the data files are written to
var folder="";

//-------------------------------
// Spider class
// index: spider number
// title: topic title
// url: topic URL
// pageNo: page number within the topic
//-------------------------------
function TopicSpider(title,url){
    var obj=new Object();
    spiderCount++;
    obj.index=spiderCount;  // number
    obj.title=title;        // title
    obj.url=url;            // URL
    obj.pageNo=1;           // page number

    obj.crawl=function(){
        // derive hostname and path from the URL
        var currUrl=obj.url.replace("http://","");
        var pos=currUrl.indexOf("/");
        var hostname=currUrl.slice(0,pos);
        var path=currUrl.slice(pos);

        // request options (local, so concurrent spiders do not overwrite each other's state)
        var options={
            hostname:hostname,
            port:80,
            path:path,      // sub-path
            method:'GET',
        };

        // send the request and handle the response
        var req=http.request(options,function(resp){
            var html=[];

            resp.on("data",function(data){
                html.push(data);
            })
            resp.on("end",function(){
                var buffer=Buffer.concat(html);
                var body=iconv.decode(buffer,'gb2312');
                var $=cheerio.load(body);

                var infos=[];   // poster info collected from this page

                // find the next page
                var nexturl=null;
                $(".pages_btns .pages strong").each(function(index,element){
                    if(nexturl==null){
                        obj.pageNo=$(element).text();

                        var nextNode=$(element).next();
                        if(nextNode && nextNode.attr("href")){
                            // if there is a next page, start a new spider for it
                            nexturl='http://www.55188.com/'+nextNode.attr("href");
                            var nextspider=new TopicSpider(obj.title,nexturl);
                            nextspider.crawl();
                        }
                    }
                })

                // collect poster info
                $(".postinfo").each(function(index,element){
                    var content=$(element).text();
                    content=content.replace(/\s+/g,' ');    // collapse whitespace to single spaces
                    var arr=content.split(" ");             // split on spaces

                    var info;
                    if(arr.length==7){
                        info={'url':obj.url,
                              'title':obj.title,
                              '楼层':arr[1],
                              '作者':arr[2].replace('只看:',''),
                              '日期':arr[4],
                              '时间':arr[5]};
                        infos.push(info);
                    }else if(arr.length==8){
                        info={'url':obj.url,
                              'title':obj.title,
                              '楼层':arr[1],
                              '作者':arr[2].replace('只看:',''),
                              '日期':arr[5],
                              '时间':arr[6]};
                        infos.push(info);
                    }
                })

                // done collecting; print this spider's report
                var msg='';
                msg+='Spider index: '+obj.index+'\n';
                msg+='Topic title: '+obj.title+'\n';
                msg+='Page number: '+obj.pageNo+'\n';
                msg+='Topic URL: '+obj.url+'\n';
                msg+='Records found: '+infos.length+'\n';

                obj.savefile(obj.index,infos);  // save to file
                msg+='Spider #'+obj.index+' done\n';
                console.log(msg);

                // progress ratio
                spiderFinished++;
                var percent=(spiderFinished*100/spiderCount).toFixed(2);
                console.log('\nSpiders started: '+spiderCount+', finished: '+spiderFinished+', progress: '+percent+"%");

            }).on("error",function(err){
                console.log(coloredText('Request failed: '+err,'red'));
            })
        });

        // timeout handling
        req.setTimeout(7500,function(){
            req.abort();
        });

        // error handling
        req.on('error',function(err){
            console.log(coloredText('Request error: '+err,'red'));
        });

        // finish the request
        req.end();
    };

    // save the collected info to a file
    obj.savefile=function(index,infos){
        var text=JSON.stringify(infos);
        var filename='./'+folder+'/'+index+'.dat';
        fs.writeFile(filename,text,function(err){
            if(err){
                console.log('Failed to write file '+filename+': '+err);
            }
        });
    }

    return obj;
}

//-------------------------------
// Find the topics on one forum page
// pageUrl: URL of the forum page
//-------------------------------
function findTopicsInPage(pageUrl){
    console.log("Current page="+pageUrl);

    // derive hostname and path
    var currUrl=pageUrl.replace("http://","");
    var pos=currUrl.indexOf("/");
    var hostname=currUrl.slice(0,pos);
    var path=currUrl.slice(pos);

    // request options
    var options={
        hostname:hostname,
        port:80,
        path:path,      // sub-path
        method:'GET',
    };

    // send the request and handle the response
    var req=http.request(options,function(resp){
        var html=[];
        resp.on("data",function(data){
            html.push(data);
        })
        resp.on("end",function(){
            var buffer=Buffer.concat(html);
            var body=iconv.decode(buffer,'gb2312');
            var $=cheerio.load(body);
            $(".forumdisplay a").each(function(index,element){
                var topicUrl='http://www.55188.com/'+$(element).attr("href");
                var topicTitle=$(element).text();
                // start a spider for the topic
                var spider=new TopicSpider(topicTitle,topicUrl);
                spider.crawl();
            })
        }).on("error",function(err){
            console.log("findTopicsInPage failed: "+err);
        })
    });

    // timeout handling
    req.setTimeout(7500,function(){
        req.abort();
    });

    // error handling
    req.on('error',function(err){
        console.log('Request error: '+err);
    });

    // finish the request
    req.end();
}

//-------------------------------
// Wrap text in ANSI codes for colored console output
// text: the text
// color: foreground color
//-------------------------------
function coloredText(text,color){
    var dic=new Array();
    dic["white"]=['\x1B[37m','\x1B[39m'];
    dic["grey"]=['\x1B[90m','\x1B[39m'];
    dic["black"]=['\x1B[30m','\x1B[39m'];
    dic["blue"]=['\x1B[34m','\x1B[39m'];
    dic["cyan"]=['\x1B[36m','\x1B[39m'];
    dic["green"]=['\x1B[32m','\x1B[39m'];
    dic["magenta"]=['\x1B[35m','\x1B[39m'];
    dic["red"]=['\x1B[31m','\x1B[39m'];
    dic["yellow"]=['\x1B[33m','\x1B[39m'];
    return dic[color][0]+text+dic[color][1];
}

//-------------------------------
// Entry function
// start: first page, counting from 1
// end: last page, > start
//-------------------------------
function main(start,end){
    console.log(coloredText('Crawl started','yellow'));
    folder='infos('+getNowFormatDate()+")";

    // create the output directory
    fs.mkdir('./'+folder,function(err){
        if(err){
            console.log("Directory "+folder+" already exists");
        }
    });

    for(var i=start;i<=end;i++){
        var pageUrl='http://www.55188.com/forum-8-'+i+'.html';
        findTopicsInPage(pageUrl);
    }
}

//-------------------------------
// Test function for the TopicSpider class
//-------------------------------
function testTopicSpider(){
    var spider=new TopicSpider('test','http://www.55188.com/thread-8325667-1-1.html');
    spider.crawl();
}

//--------------------------------------
// Get the current time as a formatted string
//--------------------------------------
function getNowFormatDate(){
    var date=new Date();
    var seperator1="-";
    var seperator2="_";
    var month=date.getMonth()+1;
    var strDate=date.getDate();
    if(month>=1 && month<=9){
        month="0"+month;
    }
    if(strDate>=0 && strDate<=9){
        strDate="0"+strDate;
    }
    var currentdate=date.getFullYear()+seperator1+month+seperator1+strDate
            +" "+date.getHours()+seperator2+date.getMinutes()
            +seperator2+date.getSeconds();
    return currentdate;
}

// start
main(1,5);
//testTopicSpider();
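As an aside, the manual hostname/path slicing in crawl() and findTopicsInPage() could also be delegated to Node's built-in url module; a small sketch (not part of the crawler above):

var url=require('url');

// build http.request options from a page URL (equivalent to the
// replace/indexOf slicing used in the crawler above)
function makeOptions(pageUrl){
    var parts=url.parse(pageUrl);   // legacy but stable API in this era of Node
    return {
        hostname:parts.hostname,    // e.g. www.55188.com
        port:parts.port||80,
        path:parts.path,            // e.g. /forum-8-1.html
        method:'GET',
    };
}

console.log(makeOptions('http://www.55188.com/forum-8-1.html'));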
A screenshot of the generated files is shown below:
These files are in JSON format; for example:
[{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"楼主","作者":"先见天机","日期":"2018-4-15","时间":"15:13"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"2楼","作者":"短线冲","日期":"2018-4-15","时间":"15:27"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"3楼","作者":"勇儿马甲","日期":"2018-4-15","时间":"15:52"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"4楼","作者":"崇心开始","日期":"2018-4-15","时间":"15:52"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"5楼","作者":"老法儿","日期":"2018-4-15","时间":"16:28"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"6楼","作者":"金山水财","日期":"2018-4-15","时间":"16:33"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"7楼","作者":"生活如愿","日期":"2018-4-15","时间":"17:19"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"8楼","作者":"李汶安","日期":"2018-4-15","时间":"17:41"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"9楼","作者":"极品之星","日期":"2018-4-16","时间":"07:57"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"10楼","作者":"叼叼狼","日期":"2018-4-16","时间":"08:54"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"11楼","作者":"先见天机","日期":"2018-4-16","时间":"09:47"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"12楼","作者":"先见天机","日期":"2018-4-16","时间":"09:56"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"13楼","作者":"先见天机","日期":"2018-4-16","时间":"10:10"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"14楼","作者":"先见天机","日期":"2018-4-16","时间":"10:23"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"15楼","作者":"先见天机","日期":"2018-4-16","时间":"10:35"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"16楼","作者":"先见天机","日期":"2018-4-16","时间":"10:51"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"17楼","作者":"先见天机","日期":"2018-4-16","时间":"10:58"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"18楼","作者":"先见天机","日期":"2018-4-16","时间":"11:00"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"19楼","作者":"先见天机","日期":"2018-4-16","时间":"11:10"},{"url":"http://www.55188.com/thread-8337207-1-5.html","title":" 大势分析 ","楼层":"20楼","作者":"先见天机","日期":"2018-4-16","时间":"11:17"}]
The code of insertDB.js is as follows:
//======================================================
// Reads the data files produced by crawler 1.02
// and inserts their contents into the database.
// April 17, 2018
//======================================================
var fs=require('fs');   // built-in file-system module
var allInfos=[];        // all records collected from the data files
var folder;             // directory holding the data files
var tasks=[];           // task array
var finished=0;         // number of finished tasks

//--------------------------------------
// Insert the data into the database
//--------------------------------------
function insertDB(){
    console.log("Start inserting into DB");
    console.log('In total '+allInfos.length+' records will be inserted into the database');

    var mysql=require('mysql');     // MySQL driver module
    var conn=mysql.createConnection({
        host:'127.0.0.1',
        port:'3306',
        database:'test',
        user:'root',
        password:'12345678',
    });
    conn.connect(function(err){
        if(err){
            console.log('Failed to connect to MySQL');
        }else{
            console.log('Connected to MySQL');
            // "?" placeholders let the driver escape values such as titles containing quotes
            var sql="insert into test.topic7(floor,author,tdate,ttime,addtime,url,title)"
                   +" values (?,?,?,?,now(),?,?)";
            for(var i=0;i<allInfos.length;i++){
                var info=allInfos[i];
                var values=[info['楼层'],info['作者'],info['日期'],info['时间'],info['url'],info['title']];
                conn.query(sql,values,function(err,result){
                    if(err){
                        console.log(err);
                    }
                })
            }
            conn.end();     // ends gracefully after the queued queries run
        }
    });
}

//--------------------------------------
// Check whether all read tasks are done
//--------------------------------------
function checkIfFinished(){
    finished++;
    // insert into the DB only after every read task has finished
    if(finished==tasks.length){
        console.log("Total records: "+allInfos.length);
        insertDB();
    }
}

//--------------------------------------
// Entry function
//--------------------------------------
function main(){
    folder='./'+'infos(2018-04-17 5_38_18)';    // directory holding the data files
    fs.readdir(folder,function(err,files){
        if(err) throw err;
        for(var index in files){
            var task=(function(file){
                return function(){
                    fs.readFile(file,'utf8',function(err,data){
                        if(err){
                            console.log('Failed to read file '+file+': '+err);
                        }else{
                            var infos=JSON.parse(data);
                            allInfos=allInfos.concat(infos);
                        }
                        checkIfFinished();  // count every callback, success or failure
                    });
                }
            })(folder+'/'+files[index]);
            tasks.push(task);
        }
        for(var task in tasks){
            tasks[task]();
        }
    });
}

// start
main();
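For comparison, the same "wait for every read, then insert" logic can also be written with Promise.all instead of a hand-rolled counter; a sketch assuming Node.js 8+ for util.promisify:

var util=require('util');
var fs=require('fs');
var readdir=util.promisify(fs.readdir);
var readFile=util.promisify(fs.readFile);

// read every data file in folder, flatten the per-file arrays into one,
// and resolve with the combined list (ready to hand to an insertDB-style function)
function loadAllInfos(folder){
    return readdir(folder).then(function(files){
        return Promise.all(files.map(function(f){
            return readFile(folder+'/'+f,'utf8').then(JSON.parse);
        }));
    }).then(function(lists){
        return [].concat.apply([],lists);   // [[...],[...]] -> [...]
    });
}

loadAllInfos('./infos(2018-04-17 5_38_18)').then(function(allInfos){
    console.log('total records: '+allInfos.length);
});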
Below is a screenshot of the database:
With reading and writing separated like this, the original problem is solved; a Python version of the crawler could take the same approach.
08:45, April 17, 2018
[Nodejs] 理想论坛 Topic Crawler 1.02
Original article: https://www.cnblogs.com/xiandedanteng/p/8861515.html