This is a Maven project, so you don't have to hunt around for dependency jars. pom.xml:
<project xmlns="http://maven.apache.org/POM/4.0.0"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>com.zhaowu</groupId>
    <artifactId>pachong01</artifactId>
    <version>0.0.1-SNAPSHOT</version>
    <dependencies>
        <!-- https://mvnrepository.com/artifact/org.apache.httpcomponents/httpclient -->
        <dependency>
            <groupId>org.apache.httpcomponents</groupId>
            <artifactId>httpclient</artifactId>
            <version>4.5.3</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/commons-io/commons-io -->
        <dependency>
            <groupId>commons-io</groupId>
            <artifactId>commons-io</artifactId>
            <version>2.6</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.quartz-scheduler/quartz -->
        <dependency>
            <groupId>org.quartz-scheduler</groupId>
            <artifactId>quartz</artifactId>
            <version>2.3.0</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.apache.poi/poi -->
        <dependency>
            <groupId>org.apache.poi</groupId>
            <artifactId>poi</artifactId>
            <version>3.17</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/net.sourceforge.htmlunit/htmlunit -->
        <dependency>
            <groupId>net.sourceforge.htmlunit</groupId>
            <artifactId>htmlunit</artifactId>
            <version>2.29</version>
        </dependency>
    </dependencies>
</project>
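With this pom.xml in place, running mvn compile (or importing the project into an IDE as a Maven project) pulls all of the jars above from the central repository automatically.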
Straight to the code. RenWu.java:
package com.zhaowu.renwu2;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class RenWu {
    // Number of result pages to crawl
    private static int N = 6;
    // Search keyword
    private static String keyWord = "爬虫";
    // First page of search results
    private static HtmlPage firstBaiduPage;
    // Baidu's links to its result pages contain "&pn=1", "&pn=2", "&pn=3" and so on;
    // extracting one such link and processing it yields a template for addressing any result page
    private static String template = "";

    public static void main(String[] args) {
        goSearch(N, keyWord);
    }

    private static void goSearch(final int n, final String keyWord) {
        Thread thread = new Thread(new Runnable() {
            public void run() {
                // Number of pages
                int x = n;
                System.out.println("爬取百度关于关键字“" + keyWord + "”搜索结果的前" + x + "页");
                FileUtil.toFile("爬取百度关于关键字“" + keyWord + "”搜索结果的前" + x + "页\n");

                // 1. Fetch and print the first page of Baidu results
                Elements firstElementsLink = null;
                try {
                    firstElementsLink = getFirstPage(keyWord);
                } catch (Exception e) {
                    e.printStackTrace();
                }
                for (Element link : firstElementsLink) {
                    // Link URL
                    String linkHref = link.attr("href");
                    // Link title
                    String linkText = link.text();
                    if (linkHref.length() > 13 && linkText.length() > 4) {
                        String content = "链接url: " + linkHref + "\n\t链接标题: " + linkText + "\n";
                        System.out.println(content);
                        FileUtil.toFile(content);
                    }
                }

                // 2. Preprocessing for the second and later pages:
                // derive template, i.e. the page-URL format, from firstBaiduPage
                nextHref(firstBaiduPage);

                // 3. Fetch the result pages after the first one
                for (int i = 1; i < x; i++) {
                    System.out.println("\n---------百度搜索关键字“" + keyWord + "”第" + (i + 1) + "页结果------");
                    FileUtil.toFile("\n---------百度搜索关键字“" + keyWord + "”第" + (i + 1) + "页结果------" + "\n");
                    // Build the link for the next page from the known template
                    String tempURL = template.replaceAll("&pn=1", "&pn=" + i);
                    // Show the address of this page
                    System.out.println("\t该页地址为:" + tempURL);
                    RenWu renWu = new RenWu();
                    // Fetch the page source
                    String htmls = renWu.getPageSource(tempURL, "utf-8");
                    // Parse the page into a Jsoup Document
                    Document doc = Jsoup.parse(htmls);
                    // Extract the result links of this page
                    Elements links = doc.select("a[data-click]");
                    // Same handling as in getFirstPage above
                    for (Element link : links) {
                        // Link URL
                        String linkHref = link.attr("href");
                        // Link title
                        String linkText = link.text();
                        if (linkHref.length() > 13 && linkText.length() > 4) {
                            String content = "链接url: " + linkHref + "\n\t链接标题: " + linkText + "\n";
                            System.out.println(content);
                            FileUtil.toFile(content);
                        }
                    }
                }
            }
        });
        thread.start();
    }

    public String getPageSource(String pageURL, String encoding) {
        // Input: URL and encoding; output: the content of that page
        StringBuffer sb = new StringBuffer();
        try {
            // Build a URL object
            URL url = new URL(pageURL);
            // Open a stream with openStream and wrap it in a BufferedReader,
            // using the requested encoding
            InputStream in = url.openStream();
            InputStreamReader ir = new InputStreamReader(in, encoding);
            BufferedReader br = new BufferedReader(ir);
            String line;
            while ((line = br.readLine()) != null) {
                sb.append(line);
                sb.append("\n");
            }
            br.close();
        } catch (Exception e) {
            e.printStackTrace();
        }
        return sb.toString();
    }

    /*
     * Fetch the first page of Baidu search results
     */
    public static Elements getFirstPage(String keyWord) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        // Set the browser User-Agent
        WebClient webClient = new WebClient(BrowserVersion.FIREFOX_52);
        // HtmlUnit's JavaScript support is poor; turn it off
        webClient.getOptions().setJavaScriptEnabled(false);
        // HtmlUnit's CSS support is poor; turn it off
        webClient.getOptions().setCssEnabled(false);
        // Baidu home page
        HtmlPage htmlPage = webClient.getPage("http://www.baidu.com/");
        // Locate the search box (element names come from the page source)
        HtmlInput input = htmlPage.getHtmlElementById("kw");
        // Type the keyword into the Baidu search box (element id as above)
        input.setValueAttribute(keyWord);
        // Locate the search button
        HtmlInput btn = htmlPage.getHtmlElementById("su");
        // Simulate the button click to obtain the HTML of the first result page
        firstBaiduPage = btn.click();
        // Dump the first result page:
        // page.asXml() returns the page source, page.asText() the visible text
        String content = firstBaiduPage.asXml();
        // Parse into a Jsoup Document
        Document doc = Jsoup.parse(content);
        System.out.println("---------百度搜索关键字“" + keyWord + "”第1页结果--------");
        FileUtil.toFile("---------百度搜索关键字“" + keyWord + "”第1页结果--------" + "\n");
        // Select elements like <a ... data-click="..." ...>
        Elements firstElementsLink = doc.select("a[data-click]");
        // Return these links, i.e. the Baidu result links of the first page
        return firstElementsLink;
    }

    /*
     * Derive the next-page address
     */
    public static void nextHref(HtmlPage firstBaiduPage) {
        // HTML of the pager at the bottom of the first result page
        String morelinks = firstBaiduPage.getElementById("page").asXml();
        // Parse into a Jsoup Document
        Document doc = Jsoup.parse(morelinks);
        // Extract the <a href="..."> parts of this HTML
        Elements links = doc.select("a[href]");
        // Take the page-link template only once
        boolean getTemplate = true;
        for (Element e : links) {
            // Pull the link out of the extracted <a> tag
            String linkHref = e.attr("href");
            if (getTemplate) {
                // Complete the template
                template = "http://www.baidu.com" + linkHref;
                getTemplate = false;
            }
        }
    }
}
The helper class FileUtil.java, which exports content to a local file (appending at the end):
package com.zhaowu.renwu2;

import java.io.File;
import java.io.FileWriter;
import java.io.IOException;

public class FileUtil {
    public static void toFile(String content) {
        File file = null;
        FileWriter fw = null;
        file = new File("/home/acer/桌面/aaa");
        try {
            if (!file.exists()) {
                file.createNewFile();
            }
            fw = new FileWriter(file, true);
            fw.write(content); // Append the content to the file
            fw.flush();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (fw != null) {
                try {
                    fw.close();
                } catch (IOException e) {
                    e.printStackTrace();
                }
            }
        }
    }
}
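Since FileWriter creates the file itself when it doesn't exist, the exists()/createNewFile() check is redundant, and on Java 7+ the same append behavior can be written more compactly with try-with-resources. A sketch, keeping the hard-coded path from the original:

package com.zhaowu.renwu2;

import java.io.FileWriter;
import java.io.IOException;

public class FileUtil2 {
    public static void toFile(String content) {
        // The second constructor argument switches FileWriter to append mode;
        // try-with-resources closes the writer even if write() throws
        try (FileWriter fw = new FileWriter("/home/acer/桌面/aaa", true)) {
            fw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
}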
Crawler task 2: using HtmlUnit and Jsoup, crawl the news titles and URLs returned by a Baidu keyword search and save them to a local file (the main body is adapted from material found online).
Original article: https://www.cnblogs.com/sutao/p/9012361.html