
Step by Step: How to Build a Website Clone (Site Mirroring) Tool


A couple of days ago a friend asked me to reproduce a website. At first I went page by page, viewing the source, copying it and saving it, which ate up a lot of time. In a word: exhausting. To cut down the workload I wrote a website "clone tool" that does the job with one click. Compared with doing it by hand, efficiency went up by more than 200% and accuracy improved a great deal as well. Below I share how my "website clone tool" is implemented.

A quick preview of the interface first:

Development environment: Visual Studio 2012 (WinForms)

1. Create the UrlModel model

public class UrlModel
{
    public string RelatedPath { get; set; }
    public string AbsoluteUri { get; set; }
    public string CurrPath { get; set; }
    public string RootPath { get; set; }
    public string Host { get; set; }
    public int Port { get; set; }
    public string Scheme { get; set; }
}

2. Create the UrlParser parser

public class UrlParser
{
    public static UrlModel Parse(string url)
    {
        UrlModel model = new UrlModel();
        // Basic checks and defaults
        if (url.Length < 8)
            throw new Exception("Invalid url argument");
        else if (!url.ToLower().StartsWith("http:") && !url.ToLower().StartsWith("https:"))
            throw new Exception("The url format is invalid");
        if (url.LastIndexOf('/') < 8)
            url = url + "/";
        Regex reg = new Regex("(?<scheme>(http|https))://(?<host>.+?)/", RegexOptions.Singleline);
        if (reg.IsMatch(url))
        {
            string scheme = reg.Match(url).Groups["scheme"].Value;
            string host = reg.Match(url).Groups["host"].Value;
            if (host.Contains(":"))
            {
                var aa = host.Split(':');
                if (aa.Length == 2)
                {
                    model.Host = aa[0];
                    model.Port = int.Parse(aa[1]);
                }
            }
            else
            {
                model.Host = host;
                model.Port = 80;
            }
            int index = url.IndexOf('/', 8);
            model.RelatedPath = url.Substring(index);
            model.AbsoluteUri = url;
            model.Scheme = scheme;
            model.CurrPath = url.Substring(0, url.LastIndexOf("/"));
            if (80 == model.Port)
            {
                model.RootPath = string.Format("{0}://{1}", model.Scheme, model.Host);
            }
            else
            {
                model.RootPath = string.Format("{0}://{1}:{2}", model.Scheme, model.Host, model.Port);
            }
        }
        else
        {
            throw new Exception("Failed to parse the url!");
        }
        return model;
    }
}
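To see what the parser produces, here is a minimal usage sketch; the address below is a made-up example, and the values in the comments follow from the Parse method above:

// Minimal usage sketch (sample address is illustrative only)
UrlModel model = UrlParser.Parse("http://www.example.com/news/list.html");
Console.WriteLine(model.Scheme);      // http
Console.WriteLine(model.Host);        // www.example.com
Console.WriteLine(model.Port);        // 80
Console.WriteLine(model.RootPath);    // http://www.example.com
Console.WriteLine(model.CurrPath);    // http://www.example.com/news
Console.WriteLine(model.RelatedPath); // /news/list.html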

3. The WebPageService page-processing helper

/// <summary>
/// Web page processing service
/// </summary>
public class WebPageService
{
    private static string[] excludekeys = { "http:", "https:", "//", "#", "javascript:", "?", "tel:", "mailto:" };

    /// <summary>
    /// Collects the href attribute values of all html elements; only same-site links are kept, external ones are skipped
    /// </summary>
    /// <param name="html">The html source of the page</param>
    /// <returns></returns>
    public static List<UrlModel> GetLocalHrefs(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return new List<UrlModel>();
        Dictionary<string, UrlModel> urls = GetHrefs(url, html);
        List<UrlModel> newUrls = new List<UrlModel>();
        if (null != urls)
        {
            foreach (string key in urls.Keys)
            {
                string newkey = key.ToLower();
                bool iscontained = false;
                foreach (var exkey in excludekeys)
                {
                    if (newkey.IndexOf(exkey) == 0)
                    {
                        iscontained = true;
                        break;
                    }
                }
                if (!iscontained)
                {
                    // Keep local paths only
                    newUrls.Add(urls[key]);
                }
            }
        }
        return newUrls;
    }

    /// <summary>
    /// Collects the src attribute values of all html elements; only same-site links are kept, external ones are skipped
    /// </summary>
    /// <param name="html">The html source of the page</param>
    /// <returns></returns>
    public static List<UrlModel> GetLocalSrcs(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return new List<UrlModel>();
        Dictionary<string, UrlModel> urls = GetSrc(url, html);
        List<UrlModel> newUrls = new List<UrlModel>();
        if (null != urls)
        {
            foreach (string key in urls.Keys)
            {
                string newkey = key.ToLower();
                bool iscontained = false;
                foreach (var exkey in excludekeys)
                {
                    if (newkey.IndexOf(exkey) == 0)
                    {
                        iscontained = true;
                        break;
                    }
                }
                if (!iscontained)
                {
                    // Keep local paths only
                    newUrls.Add(urls[key]);
                }
            }
        }
        return newUrls;
    }

    private static Dictionary<string, UrlModel> GetHrefs(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return null;
        UrlModel currUrl = UrlParser.Parse(url);
        Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
        Regex reg = new Regex("href=\"(?<Url>.+?)\"", RegexOptions.IgnoreCase);
        if (currUrl != null)
        {
            AddUrlModel(html, currUrl, urls, reg);
        }
        return urls;
    }

    private static Dictionary<string, UrlModel> GetSrc(string url, string html)
    {
        if (string.IsNullOrEmpty(html))
            return null;
        UrlModel currUrl = UrlParser.Parse(url);
        Dictionary<string, UrlModel> urls = new Dictionary<string, UrlModel>();
        Regex reg = new Regex("(src=\"(?<Url>.+?)\"|url\\((?<Url>.+?)\\))", RegexOptions.IgnoreCase);
        if (currUrl != null)
        {
            AddUrlModel(html, currUrl, urls, reg);
        }
        return urls;
    }

    private static void AddUrlModel(string html, UrlModel currUrl, Dictionary<string, UrlModel> urls, Regex reg)
    {
        if (reg.IsMatch(html))
        {
            MatchCollection matchs = reg.Matches(html);
            foreach (Match item in matchs)
            {
                try
                {
                    string strUrl = item.Groups["Url"].Value;
                    UrlModel model = new UrlModel();
                    model.RelatedPath = strUrl;
                    model.CurrPath = currUrl.CurrPath;
                    model.RootPath = currUrl.RootPath;
                    model.Scheme = currUrl.Scheme;
                    model.Port = currUrl.Port;
                    model.Host = currUrl.Host;
                    if (strUrl.StartsWith("/"))
                    {
                        // Absolute path (relative to the site root)
                        model.AbsoluteUri = string.Format("{0}{1}", model.RootPath, model.RelatedPath);
                    }
                    else
                    {
                        // Relative path
                        string currPath = model.CurrPath;
                        int depth = 0;
                        string path = model.RelatedPath;
                        if (path.StartsWith(".."))
                        {
                            try
                            {
                                while (path.StartsWith(".."))
                                {
                                    depth++;
                                    path = path.Substring(3);
                                    currPath = currPath.Substring(0, currPath.LastIndexOf("/"));
                                }
                                model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
                            }
                            catch
                            {
                            }
                        }
                        else
                        {
                            model.AbsoluteUri = string.Format("{0}/{1}", currPath, path);
                        }
                    }
                    strUrl = strUrl.Trim().ToLower();
                    urls.Add(strUrl, model);
                }
                catch
                {
                }
            }
        }
    }
}
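As a quick illustration of how the collector behaves, the following sketch feeds it a hand-written HTML fragment; the page address and markup are purely illustrative:

string pageUrl = "http://www.example.com/index.html";   // made-up page address
string html =
    "<a href=\"/about.html\">About</a>" +
    "<a href=\"https://other-site.com/\">External</a>" +
    "<img src=\"images/logo.png\" />";

// Only "/about.html" passes the excludekeys filter; the external link is dropped
List<UrlModel> hrefs = WebPageService.GetLocalHrefs(pageUrl, html);
// hrefs[0].AbsoluteUri == "http://www.example.com/about.html"

// src attributes (and css url(...) references) are collected separately
List<UrlModel> srcs = WebPageService.GetLocalSrcs(pageUrl, html);
// srcs[0].AbsoluteUri == "http://www.example.com/images/logo.png"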

4. Create the website clone interface

interface IWebCloneWorker
{
    void Start();
    void Cancel();
}

5. Create the implementation

public class WebCloneWorker : IWebCloneWorker
{
    // Clone depth (e.g. 0 = home page, 1 = category pages, 2 = detail pages)
    public static int depth = 0;

    // The address of the website to clone
    public string Url { get; set; }

    // The local directory the clone is saved to
    public string SavePath { get; set; }

    private BackgroundWorker backgroundWorker1 = null;

    public event UrlChangedEventHandler UrlChanged;
    public event FileSavedSuccessEventHandler FileSavedSuccess;
    public event FileSavedFailEventHandler FileSavedFail;
    public event DownloadCompletedEventHandler DownloadCompleted;
    public event CollectingUrlEventHandler CollectingUrl;
    public event CollectedUrlEventHandler CollectedUrl;
    public event ProgressChangedEventHandler ProgressChanged;

    // All collected page and resource file addresses
    private Dictionary<string, UrlModel> _Hrefs = new Dictionary<string, UrlModel>();
    /// <summary>
    /// All collected page and resource file addresses
    /// </summary>
    public Dictionary<string, UrlModel> Hrefs
    {
        get { return _Hrefs; }
        set { _Hrefs = value; }
    }

    // Request encoding for pages, UTF-8 by default
    private string _Encoding = "utf-8";
    // Request encoding for pages, UTF-8 by default
    public string Encoding
    {
        get { return _Encoding; }
        set { _Encoding = value; }
    }

    public WebCloneWorker() { }

    public WebCloneWorker(string url, string path)
    {
        // Set the website address and save path
        this.Url = url;
        this.SavePath = path;
        if (string.IsNullOrEmpty(this.Url))
            throw new Exception("Please enter a website address");
        if (string.IsNullOrEmpty(this.SavePath))
            throw new Exception("Please choose a directory to save to");
        backgroundWorker1 = new BackgroundWorker();
        // Enable progress reporting and cancellation
        backgroundWorker1.WorkerReportsProgress = true;
        backgroundWorker1.WorkerSupportsCancellation = true;
        // Register the worker (thread body) method
        backgroundWorker1.DoWork += backgroundWorker1_DoWork;
        // Register the UI-update method
        backgroundWorker1.ProgressChanged += backgroundWorker1_ProgressChanged;
        // Completion handler
        backgroundWorker1.RunWorkerCompleted += backgroundWorker1_RunWorkerCompleted;
    }

    void backgroundWorker1_RunWorkerCompleted(object sender, RunWorkerCompletedEventArgs e)
    {
        if (e.Cancelled)
        {
            return;
        }
        if (this.DownloadCompleted != null)
        {
            DownloadCompletedEventArgs eventArgs = new DownloadCompletedEventArgs(e.Result, e.Error, e.Cancelled);
            this.DownloadCompleted(this, eventArgs);
        }
    }

    void backgroundWorker1_ProgressChanged(object sender, ProgressChangedEventArgs e)
    {
        // Progress callback
        if (this.ProgressChanged != null)
            this.ProgressChanged(this, e);
        UrlModel model = (UrlModel)e.UserState;
        if (this.UrlChanged != null)
        {
            // Callback after the current url changes
            UrlChangedEventArgs eventArgs = new UrlChangedEventArgs(model);
            this.UrlChanged(this, eventArgs);
        }
        try
        {
            string dir = this.SavePath;
            string url = model.AbsoluteUri;
            string AbsolutePath = url.Substring(url.IndexOf('/', 8));
            string fileName = "";
            if (url.IndexOf('?') > 0)
            {
                string path = AbsolutePath.Substring(0, model.RelatedPath.IndexOf('?'));
                fileName = System.IO.Path.GetFileName(path);
            }
            else
            {
                fileName = System.IO.Path.GetFileName(AbsolutePath);
            }
            // Default to the home page
            if (string.IsNullOrEmpty(fileName) || fileName.IndexOf(".") < 0)
            {
                fileName = "index.html";
                if (!AbsolutePath.EndsWith("/"))
                    AbsolutePath = AbsolutePath + "/";
            }
            fileName = System.Web.HttpUtility.UrlDecode(fileName);
            string localPath = string.Format("{0}{1}", dir, System.IO.Path.GetDirectoryName(AbsolutePath));
            if (!System.IO.Directory.Exists(localPath))
            {
                System.IO.Directory.CreateDirectory(localPath);
            }
            // If the file already exists, skip the download
            string path2 = Path.Combine(localPath, fileName);
            if (File.Exists(path2))
            {
                return;
            }
            // Download the page, image or resource file
            HttpTool.DownFile(url, localPath, fileName);
            // Callback after a successful save
            if (this.FileSavedSuccess != null)
            {
                FileSavedSuccessEventArgs eventArgs = new FileSavedSuccessEventArgs(model);
                this.FileSavedSuccess(this, eventArgs);
            }
        }
        catch (Exception ex)
        {
            // Callback after a failed save
            if (this.FileSavedFail != null)
            {
                FileSavedFailEventArgs eventArgs = new FileSavedFailEventArgs(ex);
                this.FileSavedFail(this, eventArgs);
            }
        }
    }

    void backgroundWorker1_DoWork(object sender, DoWorkEventArgs e)
    {
        // Collect the resources
        GetResource();
        int index = 1;
        if (this.Hrefs.Keys.Count > 0)
        {
            foreach (var k in this.Hrefs.Keys)
            {
                // Cancellation requested
                if (backgroundWorker1.CancellationPending)
                {
                    e.Cancel = true;
                    return;
                }
                backgroundWorker1.ReportProgress(index, this.Hrefs[k]);
                index++;
                // Pause the current thread for 200 ms
                Thread.Sleep(200);
            }
        }
    }

    public void Start()
    {
        if (this.backgroundWorker1.IsBusy)
            return;
        this.backgroundWorker1.RunWorkerAsync();
    }

    public void Cancel()
    {
        if (this.backgroundWorker1.CancellationPending)
            return;
        this.backgroundWorker1.CancelAsync();
    }

    private void GetResource()
    {
        string url = this.Url;
        string referer = this.Url;
        string msg = "";
        string html = HttpTool.HttpGet(url, referer, this.Encoding, out msg);
        // Collect the page links
        GetHrefs(0, url, html);
        // Collection finished
        if (null != CollectedUrl)
        {
            UrlModel urlModel = new UrlModel();
            CollectedUrlEventArgs eventArgs = new CollectedUrlEventArgs(urlModel);
            this.CollectedUrl(this, eventArgs);
        }
    }

    private void GetHrefs(int level, string url, string html)
    {
        #region Add the current page
        UrlModel currUrl = UrlParser.Parse(url);
        try
        {
            // Cancellation requested
            if (backgroundWorker1.CancellationPending)
                return;
            this.Hrefs.Add(currUrl.RelatedPath, currUrl);
            // Collecting callback
            if (null != CollectingUrl)
            {
                CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(currUrl);
                this.CollectingUrl(this, eventArgs);
            }
        }
        catch
        {
        }
        #endregion

        // Related links (elements with an href attribute)
        List<UrlModel> list1 = WebPageService.GetLocalHrefs(url, html);
        // Images, files and other resources (elements with a src attribute)
        List<UrlModel> listSrcs = WebPageService.GetLocalSrcs(url, html);

        #region Resources at the current level
        if (listSrcs != null)
        {
            for (int i = 0; i < listSrcs.Count; i++)
            {
                UrlModel urlModel = listSrcs[i];
                try
                {
                    // Cancellation requested
                    if (backgroundWorker1.CancellationPending)
                        return;
                    this.Hrefs.Add(urlModel.RelatedPath, urlModel);
                    // Collecting callback
                    if (null != CollectingUrl)
                    {
                        CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                        this.CollectingUrl(this, eventArgs);
                    }
                }
                catch
                { }
            }
        }
        #endregion

        #region Resources from child pages
        // Second level
        if (list1 != null)
        {
            for (int i = 0; i < list1.Count; i++)
            {
                UrlModel urlModel = list1[i];
                try
                {
                    // Cancellation requested
                    if (backgroundWorker1.CancellationPending)
                        return;
                    this.Hrefs.Add(urlModel.RelatedPath, urlModel);
                    // Collecting callback
                    if (null != CollectingUrl)
                    {
                        CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel);
                        this.CollectingUrl(this, eventArgs);
                    }
                }
                catch
                { }
                string msg = "";
                html = HttpTool.HttpGet(urlModel.AbsoluteUri, urlModel.AbsoluteUri, this.Encoding, out msg);

                #region Resource files on the child page
                /*
                 * Collect second-level resource files
                 * */
                listSrcs = WebPageService.GetLocalSrcs(urlModel.AbsoluteUri, html); // resource files
                if (listSrcs != null)
                {
                    for (int j = 0; j < listSrcs.Count; j++)
                    {
                        UrlModel urlModel2 = listSrcs[j];
                        try
                        {
                            // Cancellation requested
                            if (backgroundWorker1.CancellationPending)
                                return;
                            this.Hrefs.Add(urlModel2.RelatedPath, urlModel2);
                            // Collecting callback
                            if (null != CollectingUrl)
                            {
                                CollectingUrlEventArgs eventArgs = new CollectingUrlEventArgs(urlModel2);
                                this.CollectingUrl(this, eventArgs);
                            }
                        }
                        catch
                        { }
                        // Pause the thread for 20 ms
                        Thread.Sleep(20);
                    }
                }
                #endregion

                // Pause the thread for 20 ms
                Thread.Sleep(20);
                // Exit once the configured depth is reached
                if (level >= depth)
                    return;
                // Recurse into the next level
                GetHrefs(level + 1, urlModel.AbsoluteUri, html);
            }
        }
        #endregion
    }
}
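One piece is not shown in the article: the HttpTool helper used above (HttpTool.HttpGet and HttpTool.DownFile), which ships with the downloadable source. As a rough idea of its shape, here is a minimal sketch, assuming only the signatures implied by the calls above; the real implementation in the source package will likely differ (headers, timeouts, error handling):

using System;
using System.IO;
using System.Net;

public class HttpTool
{
    // Fetches the html of a page; msg carries the error message when the request fails.
    public static string HttpGet(string url, string referer, string encoding, out string msg)
    {
        msg = "";
        try
        {
            HttpWebRequest request = (HttpWebRequest)WebRequest.Create(url);
            request.Referer = referer;
            request.UserAgent = "Mozilla/5.0";
            using (WebResponse response = request.GetResponse())
            using (StreamReader reader = new StreamReader(response.GetResponseStream(),
                                                          System.Text.Encoding.GetEncoding(encoding)))
            {
                return reader.ReadToEnd();
            }
        }
        catch (Exception ex)
        {
            msg = ex.Message;
            return "";
        }
    }

    // Downloads a page, image or other resource into localPath\fileName.
    public static void DownFile(string url, string localPath, string fileName)
    {
        using (WebClient client = new WebClient())
        {
            client.DownloadFile(url, Path.Combine(localPath, fileName));
        }
    }
}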

6. That is quite a lot of code; if you need it, download the full source, read through it and run it yourself.
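Before you do, here is a rough idea of how the worker is driven from a WinForms form. The address and save path below are placeholders, and the custom event-args classes (FileSavedSuccessEventArgs and friends) come from the downloadable source, so only the standard ProgressChanged arguments are accessed directly:

// Hypothetical example: starting a clone from a button click handler
WebCloneWorker worker = new WebCloneWorker("http://www.example.com/", @"D:\site-clone");

// ProgressChanged forwards the BackgroundWorker args: UserState holds the current UrlModel
worker.ProgressChanged += (s, e) =>
{
    UrlModel current = (UrlModel)e.UserState;
    Console.WriteLine("{0}: {1}", e.ProgressPercentage, current.AbsoluteUri);
};
// DownloadCompleted, FileSavedSuccess, FileSavedFail, CollectingUrl and CollectedUrl can be
// subscribed the same way; their event-args classes are defined in the source download
worker.DownloadCompleted += (s, e) => Console.WriteLine("Clone finished.");

worker.Start();   // runs on the BackgroundWorker; worker.Cancel() requests cancellation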

Baidu Netdisk: https://pan.baidu.com/s/1hja1rl9UEcl0dzTqVFt0dg (password: 7s6r)


Original article: https://www.cnblogs.com/jonlan/p/9533116.html
