分享web开发知识

注册/登录|最近发布|今日推荐

主页 IT知识网页技术软件开发前端开发代码编程运营维护技术分享教程案例
当前位置:首页 > 运营维护

HtmlUnit---网页抓取工具解析及使用

发布时间:2023-09-06 01:33 | 责任编辑:熊小新 | 关键词:暂无标签

网页的抓取是网络爬虫的核心功能之一,本文介绍一下 HtmlUnit 的使用。

1.jar包:

从链接:http://sourceforge.net/projects/htmlunit/files/htmlunit/

下载最新的bin文件 htmlunit-2.29.jar;htmlunit-core-js-2.28.jar

2.获取页面的TITLE、XML代码、文本

package htmlunit;

import java.io.IOException;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Fetches a single page with HtmlUnit and prints, in order, its title,
 * its XML serialization and its plain-text rendering to standard output.
 */
public class HtmlUnitTest {

    /**
     * Entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        String url = "http://www.weather.com.cn/alarm/newalarmlist.shtml?areaId=10126";

        // try-with-resources guarantees the client is closed even when an
        // unchecked exception escapes the body.
        try (WebClient webclient = new WebClient()) {
            // The original author disables JavaScript and CSS because HtmlUnit's
            // support for them is considered unreliable.
            webclient.getOptions().setJavaScriptEnabled(false);
            webclient.getOptions().setCssEnabled(false);

            HtmlPage page = webclient.getPage(url);

            // Page title.
            System.out.println(page.getTitleText());
            // Page as XML.
            System.out.println(page.asXml());
            // Page as plain text.
            System.out.println(page.asText());
        } catch (FailingHttpStatusCodeException | IOException e) {
            e.printStackTrace();
        }
    }

}

3.使用不同版本的浏览器打开

package htmlunit;

import java.io.IOException;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

public class HtmlUnitTest2 {
public static void main(String[] args){
String str="";
//使用FireFox浏览器
WebClient webclient=new WebClient(BrowserVersion.FIREFOX_45);
//htmlunit对css和js支持不好,就关闭
webclient.getOptions().setCssEnabled(false);
webclient.getOptions().setJavaScriptEnabled(false);
try {
//获取页面
HtmlPage page=webclient.getPage("http://www.baidu.com/");
str = page.getTitleText();
???????System.out.println(str);
???????//关闭webclient
???????webclient.close();
???
} catch (FailingHttpStatusCodeException | IOException e) {
// TODO Auto-generated catch block
e.printStackTrace();
}
}

}

4.找到页面中特定元素

package htmlunit;

import java.io.IOException;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Looks up a specific element of a page by its id and prints the
 * element's default value to standard output.
 */
public class HtmlUnitTest3 {

    /**
     * Entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes the client even if the page load or the
        // element lookup throws.
        try (WebClient webclient = new WebClient()) {
            // The original author disables CSS and JavaScript because HtmlUnit's
            // support for them is considered unreliable.
            webclient.getOptions().setCssEnabled(false);
            webclient.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = webclient.getPage("http://www.baidu.com/");

            // Fetch the search button by its id.
            HtmlInput btn = (HtmlInput) page.getHtmlElementById("su");
            System.out.println(btn.getDefaultValue());
        } catch (FailingHttpStatusCodeException | IOException e) {
            e.printStackTrace();
        }
    }

}

5.元素检索

package htmlunit;

import java.io.IOException;
import java.util.List;

import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlDivision;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;


/**
 * Demonstrates XPath-based element retrieval: prints the first {@code div}
 * of the page and the {@code input} whose id is {@code su}.
 */
public class HtmlClientTest4 {

    /**
     * Entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // try-with-resources closes the client on every exit path.
        try (WebClient webclient = new WebClient(BrowserVersion.CHROME)) {
            webclient.getOptions().setCssEnabled(false);
            webclient.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = webclient.getPage("http://www.baidu.com/");

            // All div elements; only the first one is printed.
            List<?> divList = page.getByXPath("//div");
            if (divList.isEmpty()) {
                System.err.println("No <div> element found");
            } else {
                HtmlDivision firstDiv = (HtmlDivision) divList.get(0);
                System.out.println(firstDiv.toString());
            }

            // The input with id "su". The XPath string literal must use plain
            // ASCII quotes; typographic quotes make the expression invalid.
            List<?> inputList = page.getByXPath("//input[@id='su']");
            if (inputList.isEmpty()) {
                System.err.println("No <input id='su'> element found");
            } else {
                HtmlInput input = (HtmlInput) inputList.get(0);
                System.out.println(input.toString());
            }
        } catch (FailingHttpStatusCodeException | IOException e) {
            e.printStackTrace();
        }
    }
}

6.提交搜索

package htmlunit;

import java.io.IOException;

import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.HtmlInput;
import com.gargoylesoftware.htmlunit.html.HtmlPage;

/**
 * Submits a search: fills the input with id {@code kw}, clicks the button
 * with id {@code su}, and prints the text of the page returned by the click.
 */
public class HtmlUntilTest5 {

    /**
     * Entry point.
     *
     * @param args command-line arguments (unused)
     */
    public static void main(String[] args) {
        // The client was previously never closed; try-with-resources releases
        // it on every exit path.
        try (WebClient webclient = new WebClient()) {
            webclient.getOptions().setCssEnabled(false);
            webclient.getOptions().setJavaScriptEnabled(false);

            HtmlPage page = webclient.getPage("http://www.baidu.com/");

            // Locate the search box and set the query; the element is printed
            // before and after so the value change is visible.
            HtmlInput input = page.getHtmlElementById("kw");
            System.out.println(input.toString());
            input.setValueAttribute("HtmlUnit 下载");
            System.out.println(input.toString());

            // Locate the search button the same way as the input, so a missing
            // element is reported by the lookup itself rather than as a
            // NullPointerException on click().
            HtmlInput btn = page.getHtmlElementById("su");
            HtmlPage page2 = btn.click();

            // Text of the page produced by the click.
            System.out.println(page2.asText());
        } catch (FailingHttpStatusCodeException | IOException e) {
            e.printStackTrace();
        }
    }

}

HtmlUnit---网页抓取工具解析及使用

原文地址:https://www.cnblogs.com/yuan-tao/p/8143879.html

知识推荐

我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8 不良信息举报平台 互联网安全管理备案 Copyright 2023 www.wodecom.cn All Rights Reserved