详细的介绍已经有很多前辈总结,引用一下该篇文章:https://blog.csdn.net/zhuwukai/article/details/78644484
下面是一个代码的示例:
package com.http.client;

import java.io.IOException;

import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.params.ConnRouteParams;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.params.CoreConnectionPNames;
import org.apache.http.util.EntityUtils;
import org.apache.log4j.Logger;

/**
 * Fetches the HTML source of a web page with Apache HttpClient, routing the
 * request through an HTTP proxy so the crawler's own IP is not exposed to the
 * target site (and is therefore less likely to be banned).
 *
 * @author oo
 * @date 2018-04-04
 */
public class MyHttpClient {

    private static final Logger logger = Logger.getLogger(MyHttpClient.class);

    /**
     * Entry point: download http://www.itcast.cn/ and print its source.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        // Create the HttpClient instance.
        HttpClient hclient = new DefaultHttpClient();
        // Configure the connect timeout, the socket (read) timeout, and a
        // proxy server. The proxy hides the crawler's IP from the target site.
        // NOTE(review): proxy host/port are hard-coded and may be stale.
        hclient.getParams().setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT, 20000)
                .setParameter(CoreConnectionPNames.SO_TIMEOUT, 20000)
                .setParameter(ConnRouteParams.DEFAULT_PROXY, new HttpHost("111.155.116.237", 8123));
        HttpGet hGet = new HttpGet("http://www.itcast.cn/");
        String content = "";
        try {
            // Send the request and obtain the response.
            HttpResponse execute = hclient.execute(hGet);
            // Decode the response entity into a UTF-8 string.
            content = EntityUtils.toString(execute.getEntity(), "utf-8");
        } catch (ClientProtocolException e) {
            // Pass the exception as the second argument so the full stack
            // trace goes to the log instead of printStackTrace() on stderr.
            logger.error("********ClientProtocolException", e);
        } catch (IOException e) {
            logger.error("********IOException", e);
        } finally {
            // Release the connection resources held by the client.
            hclient.getConnectionManager().shutdown();
        }
        System.out.println(content);
    }
}
使用Jsoup进行请求:
package com.http.client;

import java.io.IOException;

import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Fetches a page with Jsoup and prints the text and {@code href} target of
 * every anchor ({@code <a>}) element on it.
 */
public class MyJsoup {

    private static final Logger logger = Logger.getLogger(MyJsoup.class);

    /**
     * Entry point: crawl http://www.itcast.cn and list all of its links.
     *
     * @param args unused
     */
    public static void main(String[] args) {
        try {
            // Issue the GET request and parse the response into a DOM.
            Document document = Jsoup.connect("http://www.itcast.cn").get();
            // Collect every <a> element in the document.
            Elements elements = document.getElementsByTag("a");
            // Combined text of all anchors, space-separated.
            String val = elements.text();
            System.out.println(val);
            // Print each anchor's text together with its link target.
            for (Element element : elements) {
                System.out.println(element.text() + ":" + element.attr("href"));
            }
        } catch (IOException e) {
            // Log the full stack trace rather than printStackTrace() to stderr.
            logger.error("***********IOException: 连接失败", e);
        }
    }
}
HttpClient 结合Jsoup:
package com.http.client;

import java.io.IOException;

import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Combines Apache HttpClient (to download the raw HTML) with Jsoup (to parse
 * the string and select elements via a CSS selector).
 */
public class HttpCLientAndJsoup {

    /**
     * Entry point: download http://www.itcast.cn/ and print the text of every
     * {@code <li>} under {@code <div class="salary_con">}.
     *
     * @param args unused
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection or read failure
     */
    public static void main(String[] args) throws ClientProtocolException, IOException {
        // HttpClient downloads the page; Jsoup only parses the string.
        HttpClient hClient = new DefaultHttpClient();
        // Crawler targets are mostly plain GET requests.
        HttpGet hget = new HttpGet("http://www.itcast.cn/");
        try {
            // Send the request and read the page source.
            HttpResponse response = hClient.execute(hget);
            // Decode the response entity into a UTF-8 string.
            String content = EntityUtils.toString(response.getEntity(), "utf-8");
            // Parse the HTML into a DOM tree.
            Document doc = Jsoup.parse(content);
            // CSS selector: every <li> inside <div class="salary_con">.
            Elements elements = doc.select("div.salary_con li");
            for (Element element : elements) {
                System.out.println(element.text());
            }
        } finally {
            // Always release the client's connection resources, even when the
            // request throws (the original leaked the connection manager).
            hClient.getConnectionManager().shutdown();
        }
    }
}
HttpClient&Jsoup爬虫的简单应用
原文地址:https://www.cnblogs.com/lyc-smile/p/8744237.html