分享web开发知识

注册/登录|最近发布|今日推荐

主页 | IT知识 | 网页技术 | 软件开发 | 前端开发 | 代码编程 | 运营维护 | 技术分享 | 教程案例
当前位置:首页 > 运营维护

php爬虫原型

发布时间:2023-09-06 01:58 | 责任编辑:顾先生 | 关键词:爬虫
/**
 * Crawler prototype.
 *
 * Fetches $url, hands the response body to $callback, then follows the
 * <a href="..."> links found in the body, up to $depth levels of pages
 * (the start page counts as level one).
 *
 * Pages are visited breadth-first and each distinct URL is fetched at most
 * once per call, so pages that link to each other do not cause repeated
 * fetches or repeated callback invocations.
 *
 * Fetching is delegated to the project's `http` class, used as
 * `(new http($url))->get()->exec()`; that class is not defined here.
 *
 * @author Lee <complet@163.com>
 *
 * @param string   $url      Initial URL.
 * @param callable $callback Business callback; invoked as $callback($content)
 *                           once for every fetched page.
 * @param int      $depth    How many levels of pages to fetch. Default 3.
 *                           A value <= 0 fetches nothing.
 *
 * @return void
 */
function crawl($url, $callback, $depth = 3)
{
    if ($depth <= 0) {
        return;
    }

    // The fragment never reaches the server, so it must not make two
    // otherwise identical URLs look different in the visited set.
    $hashPos = strpos($url, '#');
    $rootKey = $hashPos === false ? $url : substr($url, 0, $hashPos);

    $visited = array($rootKey => true);
    $queue = array(array($url, $depth));

    // Index-based FIFO: the queue grows while it is being walked.
    for ($i = 0; $i < count($queue); $i++) {
        list($pageUrl, $remaining) = $queue[$i];

        $http = new http($pageUrl);
        $content = $http->get()->exec();

        // Business processing.
        call_user_func($callback, $content);

        $remaining--;
        if ($remaining <= 0 || !is_string($content) || $content === '') {
            continue;
        }

        foreach (crawlExtractLinks($content) as $href) {
            $next = crawlResolveUrl($pageUrl, $href);
            if ($next === null || isset($visited[$next])) {
                continue;
            }
            $visited[$next] = true;
            $queue[] = array($next, $remaining);
        }
    }
}

/**
 * Returns the raw href values of all <a> tags in $html, in document order.
 *
 * Handles double-quoted, single-quoted and unquoted attribute values and
 * decodes HTML entities (e.g. "&amp;" in query strings).
 *
 * @param string $html
 *
 * @return array List of href strings (possibly empty strings or duplicates).
 */
function crawlExtractLinks($html)
{
    // "<a" must be followed by whitespace so <abbr>, <area>, <article> etc.
    // are not mistaken for anchors; [^>] keeps the match inside one tag.
    // (?| ... ) is a branch-reset group: the value is always capture group 1.
    $pattern = '/<a\s+(?:[^>]*?\s)?href\s*=\s*(?|"([^"]*)"|\'([^\']*)\'|([^\s"\'>]+))/i';

    if (!preg_match_all($pattern, $html, $matches)) {
        return array();
    }

    $links = array();
    foreach ($matches[1] as $href) {
        $links[] = trim(html_entity_decode($href));
    }

    return $links;
}

/**
 * Resolves $link, as found on the page at $base, to an absolute http(s) URL.
 *
 * @param string $base Absolute URL of the page the link was found on.
 * @param string $link Raw href value.
 *
 * @return string|null Absolute URL without fragment, or null when the link
 *                     should not be crawled (empty, fragment-only, non-http
 *                     scheme such as mailto:/javascript:, or unusable base).
 */
function crawlResolveUrl($base, $link)
{
    $link = trim($link);

    $hashPos = strpos($link, '#');
    if ($hashPos !== false) {
        $link = substr($link, 0, $hashPos);
    }
    if ($link === '') {
        return null;
    }

    // Link carries its own scheme: follow only http/https.
    if (preg_match('/^([a-z][a-z0-9+.\-]*):/i', $link, $m)) {
        $linkScheme = strtolower($m[1]);

        return ($linkScheme === 'http' || $linkScheme === 'https') ? $link : null;
    }

    $parts = parse_url($base);
    if ($parts === false || empty($parts['host'])) {
        return null;
    }

    $scheme = !empty($parts['scheme']) ? $parts['scheme'] : 'http';

    // Protocol-relative link: "//host/path".
    if (strpos($link, '//') === 0) {
        return $scheme . ':' . $link;
    }

    $origin = $scheme . '://';
    if (isset($parts['user'])) {
        $origin .= $parts['user'];
        if (isset($parts['pass'])) {
            $origin .= ':' . $parts['pass'];
        }
        $origin .= '@';
    }
    $origin .= $parts['host'];
    if (isset($parts['port'])) {
        $origin .= ':' . $parts['port'];
    }

    $basePath = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

    if ($link[0] === '/') {
        // Root-relative.
        $target = $link;
    } elseif ($link[0] === '?') {
        // Query-only: same document, different query string.
        $target = $basePath . $link;
    } else {
        // Path-relative: resolve against the directory of the base document,
        // not against the document itself.
        $target = substr($basePath, 0, strrpos($basePath, '/') + 1) . $link;
    }

    $query = '';
    $queryPos = strpos($target, '?');
    if ($queryPos !== false) {
        $query = substr($target, $queryPos);
        $target = substr($target, 0, $queryPos);
    }

    return $origin . crawlRemoveDotSegments($target) . $query;
}

/**
 * Collapses "." and ".." segments in an absolute URL path.
 *
 * @param string $path Path starting with "/".
 *
 * @return string Normalised path; never climbs above the root.
 */
function crawlRemoveDotSegments($path)
{
    $segments = explode('/', $path);
    $last = count($segments) - 1;
    $out = array();

    foreach ($segments as $i => $segment) {
        if ($segment === '.' || $segment === '..') {
            // $out[0] is the empty segment before the leading "/": keep it.
            if ($segment === '..' && count($out) > 1) {
                array_pop($out);
            }
            // A trailing dot segment denotes a directory: keep the final "/".
            if ($i === $last) {
                $out[] = '';
            }
            continue;
        }
        $out[] = $segment;
    }

    return implode('/', $out);
}

php爬虫原型

原文地址:http://blog.51cto.com/12173069/2125359

知识推荐

我的编程学习网——分享web前端后端开发技术知识。 垃圾信息处理邮箱 tousu563@163.com 网站地图
icp备案号 闽ICP备2023006418号-8 不良信息举报平台 互联网安全管理备案 Copyright 2023 www.wodecom.cn All Rights Reserved