/**
 * Crawler prototype.
 *
 * Fetches $url through the project's `http` client, hands the response body
 * to $callback, then follows every <a href> link found in that body. Each
 * followed link is crawled with one level less of remaining depth.
 *
 * Links are fetched depth-first in document order. Duplicate links on the
 * same page are followed once; pages reached through different parents may
 * still be fetched more than once.
 *
 * @author Lee <complet@163.com>
 *
 * @param string   $url      Initial URL.
 * @param callable $callback Business-logic callback; invoked once per fetched
 *                           page with the response body as its only argument.
 * @param int      $depth    How many levels of pages to fetch (default 3).
 *                           A value <= 0 fetches nothing.
 *
 * @return void
 */
function crawl($url, $callback, $depth = 3)
{
    if ($depth <= 0) {
        return;
    }
    $depth--;

    $http = new http($url);
    $content = $http->get()->exec();

    // Business processing start
    call_user_func($callback, $content);
    // Business processing end

    // At the last level the recursive calls would return immediately, so
    // there is no point in extracting links. A non-string body (for example
    // a failed request) has no links either.
    if ($depth <= 0 || !is_string($content)) {
        return;
    }

    foreach (_crawl_extract_links($content) as $link) {
        $next = _crawl_resolve_url($url, $link);
        if ($next !== null) {
            crawl($next, $callback, $depth);
        }
    }
}

/**
 * Returns the distinct href values of all <a> tags in an HTML string,
 * in document order, with HTML entities decoded.
 *
 * @param string $content HTML markup.
 *
 * @return array List of raw (possibly relative) link targets.
 */
function _crawl_extract_links($content)
{
    // "<a" must be followed by whitespace so <abbr>, <address>, <area> ... do
    // not match, and href must be its own attribute inside the same tag.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*["\']?([^>"\'\s]+)/i';
    if (!preg_match_all($pattern, $content, $matches)) {
        return array();
    }

    $links = array();
    foreach ($matches[1] as $href) {
        // Attribute values are HTML-encoded ("&amp;" stands for "&").
        $links[] = html_entity_decode($href, ENT_QUOTES, 'UTF-8');
    }

    return array_unique($links);
}

/**
 * Resolves a link found on the page $base into an absolute http(s) URL.
 *
 * @param string $base URL of the page the link was found on.
 * @param string $link Link target as written in the href attribute.
 *
 * @return string|null Absolute URL, or null when the link should not be
 *                     crawled (empty, fragment-only, non-http scheme such as
 *                     mailto: or javascript:, or an unparsable base URL).
 */
function _crawl_resolve_url($base, $link)
{
    $link = trim($link);

    // A fragment addresses a position inside a page, not another page.
    $hashPos = strpos($link, '#');
    if ($hashPos !== false) {
        $link = substr($link, 0, $hashPos);
    }
    if ($link === '') {
        return null;
    }

    // Link with its own scheme: only http and https can be fetched.
    if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $link, $m)) {
        $scheme = strtolower($m[1]);
        return ($scheme === 'http' || $scheme === 'https') ? $link : null;
    }

    // A start URL written without a scheme ("example.com/x", "//example.com/x")
    // is treated as http, matching the original default.
    if (!preg_match('/^[a-z][a-z0-9+.\-]*:\/\//i', $base)) {
        $base = 'http://' . ltrim($base, '/');
    }
    $info = parse_url($base);
    if (!is_array($info) || !isset($info['scheme']) || !isset($info['host'])) {
        return null;
    }

    // Protocol-relative link: inherits only the scheme of the base.
    if (strncmp($link, '//', 2) === 0) {
        return $info['scheme'] . ':' . $link;
    }

    $origin = $info['scheme'] . '://';
    if (isset($info['user'])) {
        $origin .= $info['user'];
        if (isset($info['pass'])) {
            $origin .= ':' . $info['pass'];
        }
        $origin .= '@';
    }
    $origin .= $info['host'];
    if (isset($info['port'])) {
        $origin .= ':' . $info['port'];
    }

    $basePath = (isset($info['path']) && $info['path'] !== '') ? $info['path'] : '/';

    $queryPos = strpos($link, '?');
    $query = ($queryPos === false) ? '' : substr($link, $queryPos);
    $path = ($queryPos === false) ? $link : substr($link, 0, $queryPos);

    if ($path === '') {
        // Query-only link: same document, different query string.
        return $origin . $basePath . $query;
    }
    if ($path[0] !== '/') {
        // Relative to the directory of the base document, not to the
        // document itself.
        $path = substr($basePath, 0, strrpos($basePath, '/') + 1) . $path;
    }

    // Collapse "." and ".." segments.
    $segments = array();
    $parts = explode('/', substr($path, 1));
    $lastIndex = count($parts) - 1;
    foreach ($parts as $i => $part) {
        if ($part === '.' || $part === '..') {
            if ($part === '..') {
                array_pop($segments);
            }
            if ($i === $lastIndex) {
                // A trailing "." or ".." denotes a directory.
                $segments[] = '';
            }
            continue;
        }
        $segments[] = $part;
    }

    return $origin . '/' . implode('/', $segments) . $query;
}
// PHP crawler prototype
// Original article: http://blog.51cto.com/12173069/2125359