分享web开发知识

注册/登录|最近发布|今日推荐

主页 | IT知识 | 网页技术 | 软件开发 | 前端开发 | 代码编程 | 运营维护 | 技术分享 | 教程案例
当前位置:首页 > 代码编程

怎么写爬虫,怎么找网站练手抓取链家、中原、安居客、我爱我家,今年5月份开始写论文啦!!!

发布时间:2023-09-06 01:34 | 责任编辑:蔡小小 | 关键词:爬虫

//设置请求时间

// Downloads the page at `url` and returns its HTML text.
// Result: the page text on success, an empty string when the response status is not 200 OK,
// and null when the request or the read throws.
// NOTE(review): `url`, `encode` and `log` are declared outside this excerpt — presumably the
// enclosing method's parameters and a class-level logger; confirm against the full source.
string html = string.Empty;
try
{
    HttpWebRequest request = HttpWebRequest.Create(url) as HttpWebRequest; // build the simulated browser request
    request.Timeout = 30 * 1000; // 30-second timeout (value is in milliseconds)
    request.UserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36";
    request.ContentType = "text/html; charset=utf-8";
    using (HttpWebResponse response = request.GetResponse() as HttpWebResponse) // send the request
    {
        if (response.StatusCode != HttpStatusCode.OK)
        {
            log.Error("抓取{0}地址返回失败,响应状态为{1}", url, response.StatusCode);
        }
        else
        {
            try
            {
                // `using` disposes the reader even when ReadToEnd throws;
                // the original only closed it on the success path.
                using (StreamReader sr = new StreamReader(response.GetResponseStream(), encode))
                {
                    html = sr.ReadToEnd(); // read the whole response body
                }
            }
            catch (Exception ex)
            {
                log.Error("抓取{0}失败", url, ex);
                html = null;
            }
        }
    }
}
catch (Exception ex)
{
    log.Error("抓取{0}出现异常", url, ex);
    html = null;
}
return html;

//抓取链家、中原、安居客、我爱我家

// Excerpt: downloads one listing page and collects one TrojanHorse per matched list item.
// NOTE(review): `pageurl`, `houseList` and `log` are declared outside this excerpt.
string html = HttpHelper.DownloadUrl(pageurl);
if (html == null)
{
    return houseList;
}
HtmlDocument doc = new HtmlDocument();
doc.LoadHtml(html);
// Lianjia: one <li> per listed community.
// XPath string literals need plain ASCII quotes; the typographic quotes in the published
// snippet are not valid XPath.
string psht = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(psht);
if (noneNodeList == null)
{
    log.ErrorAsync("数据为空!");
    return houseList;
}
foreach (var item in noneNodeList)
{
    // Re-parse the item on its own so the "//" paths below only search inside this item.
    HtmlDocument docChild = new HtmlDocument();
    docChild.LoadHtml(item.OuterHtml);

    // Lianjia: title link (community name).
    string urlPath = @"//*[@class='info']/div[@class='title']/a";
    HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
    if (urlNode == null)
    {
        // No title link: skip this item instead of failing on a null dereference.
        continue;
    }

    // Lianjia: span inside the totalPrice element.
    string strs = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
    HtmlNode urlNodes = docChild.DocumentNode.SelectSingleNode(strs);

    // Lianjia: span inside the totalSellCount link.
    string strst = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";
    HtmlNode urlNodest = docChild.DocumentNode.SelectSingleNode(strst);

    TrojanHorse house = new TrojanHorse();
    house.title = urlNode.InnerText;
    // Missing nodes are recorded as the literal text "null".
    house.price = urlNodes == null ? "null" : urlNodes.InnerText;
    house.remark = urlNodest == null ? "null" : urlNodest.InnerText;
    houseList.Add(house);
}
} // closes a block that was opened before this excerpt begins

/// <summary>
/// Downloads one listing page and extracts one <see cref="TrojanHorse"/> per matched list item.
/// </summary>
/// <param name="pageurl">URL of the listing page to download and parse.</param>
/// <returns>
/// The items collected from the page. Never null: the list is empty when the download returns
/// null or no list items match, and holds whatever was collected so far when an exception is
/// caught (the exception message is logged).
/// </returns>
private static List<TrojanHorse> GetTrojanHorseList(string pageurl)
{
    List<TrojanHorse> houseList = new List<TrojanHorse>();
    try
    {
        string html = HttpHelper.DownloadUrl(pageurl);
        if (html == null)
        {
            return houseList;
        }
        HtmlDocument doc = new HtmlDocument();
        doc.LoadHtml(html);

        // Lianjia: one <li> per listed community.
        // XPath string literals need plain ASCII quotes; the typographic quotes in the
        // published snippet are not valid XPath.
        //
        // Selectors for Centaline (Zhongyuan) pages, kept from the earlier commented-out variant:
        //   list items   : //*[@class='section-wrap section-houselists']/div[@class='section']/div[@class='house-item clearfix']
        //   title        : //*[@class='item-info fl']/h4/a
        //   price field  : //*[@class='item-pricearea fr']/p[@class='tc f666 f12 mt_10']/a
        //   remark field : //*[@class='item-pricearea fr']/p[@class='price-nub cRed tc']/span
        string psht = @"//*[@class='content']/div[@class='leftContent']/ul[@class='listContent']/li[@class='clear xiaoquListItem']";
        HtmlNodeCollection noneNodeList = doc.DocumentNode.SelectNodes(psht);
        if (noneNodeList == null)
        {
            log.ErrorAsync("数据为空!");
            return houseList;
        }

        foreach (var item in noneNodeList)
        {
            // Re-parse the item on its own so the "//" paths below only search inside this item.
            HtmlDocument docChild = new HtmlDocument();
            docChild.LoadHtml(item.OuterHtml);

            // Lianjia: title link (community name).
            string urlPath = @"//*[@class='info']/div[@class='title']/a";
            HtmlNode urlNode = docChild.DocumentNode.SelectSingleNode(urlPath);
            if (urlNode == null)
            {
                // No title link: skip this item rather than letting a null dereference
                // abort the rest of the page.
                continue;
            }

            // Lianjia: span inside the totalPrice element.
            string strs = @"//*[@class='xiaoquListItemPrice']/div[@class='totalPrice']/span";
            HtmlNode urlNodes = docChild.DocumentNode.SelectSingleNode(strs);

            // Lianjia: span inside the totalSellCount link.
            string strst = @"//*[@class='xiaoquListItemRight']/div[@class='xiaoquListItemSellCount']/a[@class='totalSellCount']/span";
            HtmlNode urlNodest = docChild.DocumentNode.SelectSingleNode(strst);

            TrojanHorse house = new TrojanHorse();
            house.title = urlNode.InnerText;
            // Missing nodes are recorded as the literal text "null".
            house.price = urlNodes == null ? "null" : urlNodes.InnerText;
            house.remark = urlNodest == null ? "null" : urlNodest.InnerText;
            houseList.Add(house);
        }
    }
    catch (Exception ex)
    {
        log.ErrorAsync("服务器异常,异常信息:" + ex.Message);
    }

    // The published snippet stops after the catch block; the method must still return its
    // result on every path.
    return houseList;
}

怎么写爬虫,怎么找网站练手抓取链家、中原、安居客、我爱我家,今年5月份开始写论文啦!!!

原文地址:https://www.cnblogs.com/hsha/p/8183442.html

知识推荐

我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8 不良信息举报平台 互联网安全管理备案 Copyright 2023 www.wodecom.cn All Rights Reserved