首先执行 cnpm init,创建一个 package.json。
引入 cheerio 模块:cnpm install --save cheerio
然后开始编写代码
// Simple crawler: downloads one search-results page over HTTP, extracts every
// `.result` entry with cheerio, and writes the collected entries to
// output.txt as a JSON array.
const cheerio = require('cheerio');
const http = require('http');
const fs = require('fs');

// Search URL without the page number; the page number is appended below.
const url = 'http://so.8264.com/cse/search?q=2&s=9963133823733045431&p=';
const page = 1;

http.get(url + page, (res) => {
  // Accumulates the full HTML of the requested page.
  let html = '';

  // Decode the body as UTF-8 so Chinese text is not garbled.
  res.setEncoding('utf-8');

  // 'data' fires once per received chunk.
  res.on('data', (chunk) => {
    html += chunk;
  });

  // 'end' fires once the whole page has been received; parse and persist it.
  res.on('end', () => {
    const $ = cheerio.load(html, {
      decodeEntities: false,
    });
    const results = [];

    $('.result').each((index, element) => {
      const item = $(element);
      // .attr() returns undefined when the element or attribute is missing,
      // so fall back to '' before trimming instead of throwing a TypeError.
      const href = item.find('a').attr('href') || '';
      const imgSrc = item.find('img').attr('src') || '';

      results.push({
        title: item.find('.c-title').text().trim(),
        src: href.trim(),
        img: imgSrc.trim(),
        describe: item.find('.c-abstract').text().trim(),
      });
    });

    const writerStream = fs.createWriteStream('output.txt');
    // Without a listener, a stream 'error' event would crash the process.
    writerStream.on('error', (err) => {
      console.log(err);
    });
    writerStream.write(JSON.stringify(results), 'UTF8');
    writerStream.end();
  });
}).on('error', (err) => {
  console.log(err);
});
node 利用http和cheerio编写简易爬虫
原文地址:https://www.cnblogs.com/lmyt/p/9928492.html