Cloning an entire website is an interesting exercise, and a number of conditions have to be met.
The clone must be browsable statically, which requires every path inside the HTML files to be a relative path.
That in turn means rewriting the links in each HTML file: without the rewrite, opening a page locally would still pull its resources from the live site.
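The core of the rewrite is just path arithmetic. As a minimal standalone sketch (the example.com paths are made up; the real crawler below wraps the same idea in path2relative), java.nio.file.Path.relativize computes the link from one local copy to another:

import java.nio.file.Path;
import java.nio.file.Paths;

public class RelativizeDemo {
    public static void main(String[] args) {
        // Hypothetical local copies of a page and an image it references
        Path htmlFile = Paths.get("site/www.example.com/docs/page.html");
        Path imageFile = Paths.get("site/www.example.com/img/logo.png");
        // Relative path from the page's directory to the image,
        // ready to be written back into the src attribute
        String rel = htmlFile.getParent().relativize(imageFile)
                             .toString().replace('\\', '/');
        System.out.println(rel); // prints: ../img/logo.png
    }
}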
One big pitfall: if redirects are not disabled, the crawl can spin into an infinite loop.
You fetch page a, a links to b, and b redirects back to a, so the saved paths grow into a/a/a/a/a...
The simplest fix is to disable redirects outright; the more involved one is to detect the loop and stop crawling once a path cycle appears.
The crux is that after a redirect the recorded URL is no longer the right URL, so every local path derived from it is wrong.
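Disabling redirects is what the crawler below does via setMaxRedirect(0). If you wanted the detection route instead, one rough heuristic is to refuse URLs whose path repeats the same segment too many times in a row; the helper name and threshold here are made up:

public class LoopDetector {
    /**
     * Hypothetical check: true if some path segment repeats more than
     * `limit` times consecutively, as in http://host/a/a/a/a/page.htm.
     */
    static boolean looksLikeLoop(String url, int limit) {
        String[] parts = url.replaceFirst("^[a-z]+://", "").split("/");
        int run = 1;
        for (int i = 1; i < parts.length; i++) {
            run = parts[i].equals(parts[i - 1]) ? run + 1 : 1;
            if (run > limit) return true;
        }
        return false;
    }

    public static void main(String[] args) {
        System.out.println(looksLikeLoop("http://x.com/a/a/a/a/p.htm", 3)); // true
        System.out.println(looksLikeLoop("http://x.com/docs/p.htm", 3));    // false
    }
}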
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>
    <groupId>wyf</groupId>
    <artifactId>CloneSite</artifactId>
    <version>1.0-SNAPSHOT</version>

    <dependencies>
        <!-- https://mvnrepository.com/artifact/cn.edu.hfut.dmic.webcollector/WebCollector -->
        <dependency>
            <groupId>cn.edu.hfut.dmic.webcollector</groupId>
            <artifactId>WebCollector</artifactId>
            <version>2.71</version>
        </dependency>
        <!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
        <dependency>
            <groupId>org.jsoup</groupId>
            <artifactId>jsoup</artifactId>
            <version>1.11.2</version>
        </dependency>
    </dependencies>

    <build>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                    <encoding>utf-8</encoding>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
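Two libraries do the work: WebCollector drives the crawl (the BreadthCrawler subclassed below comes from its Berkeley DB plugin, which appears to persist the crawl queue on disk under the given crawlPath), and jsoup parses the HTML and rewrites the links. The compiler plugin just pins the build to Java 8.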
import cn.edu.hfut.dmic.webcollector.conf.Configuration;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatum;
import cn.edu.hfut.dmic.webcollector.model.CrawlDatums;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.plugin.berkeley.BreadthCrawler;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;

import java.io.BufferedWriter;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.Charset;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;

public class Main extends BreadthCrawler {

    static String seed = "http://www.xqbase.com/computer.htm";
    // Prefix shared by the HTML pages that should be crawled
    static String prefix = "http://www.xqbase.com/computer";
    static Path targetFolder = Paths.get("haha").toAbsolutePath();
    int maxRedirect = 0;

    public Main(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        this.getConf().setMaxRedirect(maxRedirect);
        this.addSeed(seed);
    }

    boolean isInvalidPathChar(char c) {
        return c < ' ' || "<>:\"|?*".indexOf(c) != -1;
    }

    boolean isInvalidPath(String path) {
        for (int i = 0; i < path.length(); i++) {
            if (isInvalidPathChar(path.charAt(i))) return true;
        }
        return false;
    }

    /**
     * Convert a URL to a local path, used when saving page content to disk.
     *
     * @param url  absolute URL
     * @param type file type, decides the extension the file is saved with
     */
    Path url2path(String url, String type) {
        int beg = url.indexOf(":") + 3; // skip the "scheme://" part
        String path = url.substring(beg);
        // If the name contains characters illegal in file paths, fall back to its hash code
        if (isInvalidPath(path)) {
            path = path.hashCode() + "";
        }
        if (type != null && !path.endsWith("." + type)) {
            path += '.' + type;
        }
        return targetFolder.resolve(path);
    }

    /**
     * htmlUrl is the URL of the current HTML page, resourceUrl the URL of a resource file;
     * returns the relative location between their two local copies.
     * resourceType, when non-null, forces the resource's saved extension.
     */
    String path2relative(String htmlUrl, String resourceUrl, String resourceType) {
        return url2path(htmlUrl, "html").getParent()
                .relativize(url2path(resourceUrl, resourceType))
                .toString().replace('\\', '/');
    }

    /**
     * Recursively create directories so a file can be written.
     */
    void mkdir(Path p) {
        p = p.toAbsolutePath();
        if (Files.exists(p)) return;
        if (Files.notExists(p.getParent())) mkdir(p.getParent());
        try {
            Files.createDirectory(p);
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Save a text file.
     */
    void writeFile(Path path, String content, Charset encoding) {
        mkdir(path.getParent());
        try (BufferedWriter cout = Files.newBufferedWriter(path, encoding)) {
            cout.write(content);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Save a binary file.
     */
    void writeFile(Path path, byte[] data) {
        mkdir(path.getParent());
        try (OutputStream cout = Files.newOutputStream(path)) {
            cout.write(data);
        } catch (Exception e) {
            e.printStackTrace();
        }
    }

    /**
     * Rewrite src attributes and queue the referenced files as binary downloads.
     */
    void src(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] tags = {"script", "svg", "img"};
        for (String tag : tags) {
            for (Element e : doc.select(tag)) {
                if (!e.hasAttr("src")) continue;
                String s = e.absUrl("src");
                if (s.trim().length() == 0) continue;
                e.attr("src", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    /**
     * Rewrite href attributes of resource links (stylesheets etc.) and queue them as binary downloads.
     */
    void hrefOfResource(Page page, CrawlDatums crawlDatums, Document doc) {
        String[] tags = {"link"};
        for (String tag : tags) {
            for (Element e : doc.select(tag)) {
                if (!e.hasAttr("href")) continue;
                String s = e.absUrl("href");
                if (s.trim().length() == 0) continue;
                e.attr("href", path2relative(page.url(), s, null));
                crawlDatums.add(new CrawlDatum(s, "binary"));
            }
        }
    }

    /**
     * Rewrite links to other HTML pages; only pages under the prefix are crawled further.
     */
    void hrefOfHtml(Page page, CrawlDatums crawlDatums, Document doc) {
        for (Element e : doc.select("a")) {
            if (!e.hasAttr("href")) continue;
            String s = e.absUrl("href");
            if (s.trim().length() == 0) continue;
            e.attr("href", path2relative(page.url(), s, "html"));
            if (s.startsWith(prefix)) {
                crawlDatums.add(s);
            }
        }
    }

    @Override
    public void visit(Page page, CrawlDatums crawlDatums) {
        if (page.matchType("binary")) {
            writeFile(url2path(page.url(), null), page.content());
        } else {
            Document doc = page.doc();
            src(page, crawlDatums, doc);
            hrefOfResource(page, crawlDatums, doc);
            hrefOfHtml(page, crawlDatums, doc);
            writeFile(url2path(page.url(), "html"), doc.html(), doc.charset());
        }
    }

    public static void main(String[] args) throws Exception {
        // autoParse decides whether the engine itself extracts and follows URLs
        Main blog = new Main("webcollector", false);
        Configuration conf = blog.getConf();
        conf.setConnectTimeout(3000);
        blog.start(Integer.MAX_VALUE);
    }
}
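Because url2path strips only the scheme, everything is saved under haha/www.xqbase.com/; the entry page, for instance, should end up as haha/www.xqbase.com/computer.htm.html, since url2path appends the .html suffix when it is missing. Note that setMaxRedirect(0) in the constructor is exactly the "disable redirects" fix discussed above.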