最近经常听说或者接触到关于网络爬虫的问题,但一直只是在看别人写的代码,没有真正动手实践过。
昨天做了一下尝试,其中采用了网络上流行的扩展类库 Html Agility Pack:http://html-agility-pack.net/?z=codeplex
遇到的问题是:部分网站禁止爬虫,或者设有验证规则,无法通过模拟 HTTP 请求获取 HTML。
本测试案例通过模拟 HTTP 请求获取 HTML,再通过 Html Agility Pack 分析节点,获取对应节点的值。本案例采用的是赶集网的数据。
代码如下:
/// <summary>
/// Parses a second-hand-housing listing page and inserts one row per listing
/// into the <c>room</c> table.
/// </summary>
/// <param name="html">Raw HTML of the listing page.</param>
private static void ClearnHtml(string html)
{
    // Nothing to parse.
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    var htmlDoc = new HtmlAgilityPack.HtmlDocument();
    htmlDoc.LoadHtml(html);

    // SelectNodes returns null (not an empty collection) when nothing matches,
    // e.g. when the site served a verification page instead of the listing.
    HtmlAgilityPack.HtmlNodeCollection htmlBody =
        htmlDoc.DocumentNode.SelectNodes("*//div[@class='f-list-item ershoufang-list']");
    if (htmlBody == null)
    {
        return;
    }

    var list = new List<Room>();
    foreach (HtmlAgilityPack.HtmlNode roomitem in htmlBody)
    {
        var room = new Room();

        // The original stored the title in an unused local, so room.title was never populated.
        room.title = ReadNodeText(roomitem, "*//a[@class='js-title value title-font']", "0");
        room.Type = ReadNodeText(roomitem, "*//span[@class='first js-huxing']", "1");

        // The size <dd> alternates value spans with separator spans, hence the odd indexes.
        room.buju = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[3]", "0");
        room.mianji = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[5]", "0");
        room.Direction = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[7]", "0");
        room.Floor = ReadNodeText(roomitem, "*//dd[@class='dd-item size']/span[9]", "0");

        room.zhuangxiu = ReadNodeText(roomitem, "*//span[@class='last']", "0");
        room.area = ReadNodeText(roomitem, "*//span[@class='area']", "0");
        room.feature = ReadNodeText(roomitem, "*//dd[@class='dd-item feature']", "0", true);

        // NOTE(review): the original applied Replace(" ", "") twice here; as written the second
        // call is a no-op. It may have targeted a non-breaking space lost in the paste - confirm.
        room.Price = ReadNodeText(roomitem, "*//div[@class='price']/span[1]", "0");

        list.Add(room);
    }

    // An empty batch would otherwise send an empty command to the server.
    if (list.Count == 0)
    {
        return;
    }

    // Scraped text is untrusted: bind it as parameters instead of splicing it into the SQL.
    const string insertSql =
        "insert into room(title,Type,buju,mianji,Direction,Floor,zhuangxiu,area,feature,Price)" +
        "values(@title,@Type,@buju,@mianji,@Direction,@Floor,@zhuangxiu,@area,@feature,@Price);";

    // Project to anonymous objects so the parameter names are guaranteed to be public
    // properties, regardless of how Room declares its members.
    var rows = list
        .Select(r => new
        {
            r.title,
            r.Type,
            r.buju,
            r.mianji,
            r.Direction,
            r.Floor,
            r.zhuangxiu,
            r.area,
            r.feature,
            r.Price
        })
        .ToList();

    // TODO: move the connection string (and its password) out of source into configuration.
    using (var connection = new MySqlConnection("Server=127.0.0.1;Database=personal;Uid=ken;Pwd=123456;"))
    {
        // Passing a sequence makes Dapper run the insert once per element.
        connection.Execute(insertSql, rows);
    }
}

/// <summary>
/// Returns the inner text of the first node matching <paramref name="xpath"/> below
/// <paramref name="scope"/>, with newlines and spaces removed.
/// </summary>
/// <param name="scope">Node the XPath is evaluated against.</param>
/// <param name="xpath">XPath expression selecting the wanted node.</param>
/// <param name="fallback">Value returned when no node matches.</param>
/// <param name="trim">When true, surrounding whitespace is trimmed before the replacements.</param>
/// <returns>The cleaned text, or <paramref name="fallback"/> when nothing matches.</returns>
private static string ReadNodeText(HtmlAgilityPack.HtmlNode scope, string xpath, string fallback, bool trim = false)
{
    // SelectSingleNode yields null when nothing matches, so a missing field falls back
    // to the default instead of throwing and discarding the whole listing.
    HtmlAgilityPack.HtmlNode node = scope.SelectSingleNode(xpath);
    if (node == null)
    {
        return fallback;
    }

    string text = trim ? node.InnerText.Trim() : node.InnerText;
    return text.Replace("\n", "").Replace(" ", "");
}
.NET 爬虫
原文地址:https://www.cnblogs.com/yanwuming/p/9606628.html