PHP 爬虫:抓取前端网页及其静态资源(get_html.php)
<!DOCTYPE html>
<html>
<head>
<title>spider</title>
</head>
<body>
<form method="get" action="get_html.php">
    crawl web html address:<input type="text" name="url" >
    <input type="submit" value="crawl">
</form>
<?php
date_default_timezone_set("PRC");

/**
 * Debug helper: pretty-print a value inside a <pre> block and stop the
 * script, printing the current timestamp as the exit message.
 *
 * @param mixed $var value to inspect
 */
function dump($var)
{
    echo "<pre>";
    var_dump($var);
    echo "</pre>";
    exit(date("Y-m-d H:i:s", time()));
}

/**
 * Mirrors a single web page into a local directory named after its host.
 *
 * Layout produced next to this script:
 *   <host>/<page>.html      the page with asset links rewritten
 *   <host>/style/css/*.css  stylesheets referenced by <link href>
 *   <host>/style/js/*.js    scripts referenced by <script src>
 *   <host>/style/img/*      gif/png/jpg images referenced by <img src>
 *
 * NOTE(review): files and directories are created with mode 0777, as in the
 * original script; tighten this if the output directory is web-accessible.
 */
class spider
{
    /** @var string page URL as given by the caller (trimmed) */
    public $url;
    /** @var string URL scheme, "http" or "https" */
    public $http;
    /** @var string host name without port; also used as the output directory name */
    public $host;
    /** @var string raw HTML of the fetched page */
    public $html;
    /** @var string absolute path of the output directory */
    public $path;
    /** @var string title of the mirror; currently identical to the host */
    public $title;

    /** @var string host plus optional ":port", used when resolving asset URLs */
    private $authority;

    /**
     * Tag patterns shared by the download step and the rewrite step so both
     * always agree on which assets exist. In every pattern the whole match is
     * the tag up to its closing ">" and group 1 is the asset URL up to and
     * including the file extension (any cache-busting query string that
     * follows the extension is deliberately left out of the group).
     * [^>] keeps a match inside a single tag; the look-behind stops
     * attributes such as data-src from being mistaken for src.
     */
    private static $patterns = array(
        'css' => '#<link[^>]*?href=["\']([^"\'>]*?\.css\b)[^>]*>#i',
        'js'  => '#<script[^>]*?(?<![\w-])src=["\']([^"\'>]*?\.js\b)[^>]*>#i',
        'img' => '#<img[^>]*?(?<![\w-])src=["\']([^"\'>]*?\.(?:gif|png|jpg)\b)[^>]*>#i',
    );

    /**
     * Fetch the page and prepare the output directories.
     * Terminates the script with a message when the URL is missing, is not
     * an http(s) URL, cannot be downloaded, or the output directory cannot
     * be created.
     *
     * @param string $url        page to mirror (http or https)
     * @param string $imagesPath unused; kept for backward compatibility
     */
    public function __construct($url, $imagesPath = "")
    {
        set_time_limit(60);

        $url = trim((string) $url);
        if ($url === '') {
            exit("Please input url!");
        }
        // Validate before touching $matches; the host must start with an
        // alphanumeric so it can never be "." or ".." when used as a directory.
        if (!preg_match('#^(https?)://([a-z0-9][\w.-]*)(:\d+)?#i', $url, $matches)) {
            exit("Invalid url: only http:// and https:// addresses are supported.");
        }

        $this->http = strtolower($matches[1]);
        $this->host = $matches[2];
        $this->authority = $this->host . (isset($matches[3]) ? $matches[3] : '');
        $this->url = $url;
        $this->title = $this->host;

        $this->html = $this->fetch($url);
        if ($this->html === false || $this->html === '') {
            exit("could not load html webpage.");
        }

        $this->path = dirname(__FILE__) . "/" . $this->title;
        if (!$this->ensure_dir($this->path) || !$this->ensure_dir($this->path . "/style")) {
            exit("could not create output directory.");
        }
    }

    /**
     * Download each URL and store it directly in the output directory under
     * the URL's base name. URLs that cannot be fetched are skipped.
     *
     * @param array $url_array list of absolute URLs
     */
    public function get_resource($url_array)
    {
        foreach ($url_array as $url) {
            $name = basename($url);
            if ($name === '') {
                continue;
            }
            $data = $this->fetch($url);
            if ($data === false) {
                continue;
            }
            $this->save($this->path . "/" . $name, $data);
        }
    }

    /** Download every gif/png/jpg referenced by an <img src> into style/img. */
    public function get_image()
    {
        $this->download_assets('img');
        echo "<br />get image over.";
    }

    /** Download every stylesheet referenced by a <link href> into style/css. */
    public function get_css()
    {
        $this->download_assets('css');
        echo "<br />get css over.";
    }

    /**
     * Download every script referenced by a <script src> into style/js.
     * A query string after ".js" is usually just a cache-buster so browsers
     * do not reuse a stale copy; it is ignored when naming the local file.
     */
    public function get_js()
    {
        $this->download_assets('js');
        echo "<br />get js over.";
    }

    /**
     * Rewrite css/js/img references in the page to the local ./style/...
     * copies and save the result as <page>.html in the output directory.
     */
    public function formate_html()
    {
        $html = (string) $this->html;

        $rewriters = array(
            // The whole <link> tag is replaced by a canonical stylesheet link.
            'css' => function ($m) {
                return '<link href="./style/css/' . basename($m[1])
                    . '" rel="stylesheet" type="text/css" />';
            },
            // Only the opening tag is replaced; the closing </script> stays.
            'js' => function ($m) {
                return '<script type="text/javascript" src="./style/js/'
                    . basename($m[1]) . '">';
            },
            // Only the src attribute changes so alt/class/size attributes survive.
            'img' => function ($m) {
                $local = 'src="./style/img/' . basename($m[1]) . '"';
                // Callback form avoids "$"/"\" in a file name being read as a
                // back-reference in the replacement string.
                return preg_replace_callback(
                    '#(?<![\w-])src=["\'][^"\'>]*["\']#i',
                    function () use ($local) {
                        return $local;
                    },
                    $m[0],
                    1
                );
            },
        );

        foreach ($rewriters as $kind => $rewriter) {
            $out = preg_replace_callback(self::$patterns[$kind], $rewriter, $html);
            // preg_* returns null on an engine error; keep the previous text then.
            if ($out !== null) {
                $html = $out;
            }
        }

        // Name the file after the last URL segment without query string,
        // fragment or extension.
        $page = preg_replace('/[?#].*$/s', '', $this->url);
        $file_name = preg_replace('/\.\w+$/', '', basename($page));
        if ($file_name === '' || $file_name === null) {
            $file_name = 'index';
        }

        $this->save($this->path . "/" . $file_name . ".html", $html);
    }

    /**
     * Find all assets of one kind in the page and store them under
     * style/<kind>. Assets that fail to download are skipped instead of
     * being written as empty files.
     *
     * @param string $kind key of self::$patterns: "css", "js" or "img"
     */
    private function download_assets($kind)
    {
        if (!preg_match_all(self::$patterns[$kind], (string) $this->html, $matches)) {
            return;
        }
        $dir = $this->path . "/style/" . $kind;
        if (!$this->ensure_dir($dir)) {
            return;
        }
        // The same asset is often referenced several times; fetch it once.
        foreach (array_unique($matches[1]) as $url) {
            $name = basename($url);
            if ($name === '') {
                continue;
            }
            $data = $this->fetch($this->absolute_url($url));
            if ($data === false) {
                continue;
            }
            $this->save($dir . "/" . $name, $data);
        }
    }

    /**
     * Turn an asset reference from the page into an absolute URL.
     * Paths without a leading slash are resolved against the site root,
     * not against the page's directory.
     *
     * @param string $url reference as written in the HTML
     * @return string absolute URL
     */
    private function absolute_url($url)
    {
        if (strpos($url, "//") === 0) {
            // Protocol-relative: reuse the page's scheme. This must be tested
            // before the single-slash case, which it would otherwise satisfy.
            return $this->http . ":" . $url;
        }
        if (strpos($url, "/") === 0) {
            return $this->http . "://" . $this->authority . $url;
        }
        if (strpos($url, "//") === false) {
            return $this->http . "://" . $this->authority . "/" . $url;
        }
        return $url;
    }

    /**
     * GET a URL with a 5 second timeout.
     *
     * @param string $url absolute URL
     * @return string|false response body, or false on transport error or
     *                      an HTTP status of 400 and above
     */
    private function fetch($url)
    {
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
        curl_setopt($ch, CURLOPT_TIMEOUT, 5);
        // Do not store error pages as if they were the requested asset.
        curl_setopt($ch, CURLOPT_FAILONERROR, true);
        // URLs originate from user input and remote HTML: never let cURL
        // use file://, gopher:// and similar schemes.
        curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
        $body = curl_exec($ch);
        curl_close($ch);
        return $body;
    }

    /**
     * Create a directory (mode 0777) unless it already exists.
     *
     * @param string $dir directory path
     * @return bool true when the directory exists afterwards
     */
    private function ensure_dir($dir)
    {
        if (!file_exists($dir)) {
            if (!mkdir($dir)) {
                return false;
            }
            chmod($dir, 0777);
        }
        return is_dir($dir);
    }

    /**
     * Write data to a file and make it world-accessible (mode 0777).
     *
     * @param string $file destination path
     * @param string $data file contents
     * @return bool true on success
     */
    private function save($file, $data)
    {
        if (file_put_contents($file, $data) === false) {
            return false;
        }
        chmod($file, 0777);
        return true;
    }
}

/**
 * Mirror one page: download its css, js and images, then save the page
 * itself with links pointing at the local copies.
 *
 * @param string $url page to mirror
 */
function crawl($url)
{
    $spider = new spider($url);
    $spider->get_css();
    $spider->get_js();
    $spider->get_image();
    $spider->formate_html();
}

// NOTE(review): this fetches whatever URL the visitor submits, so the server
// can be made to request internal hosts (SSRF). Only http(s) is allowed, but
// the endpoint should still be access-restricted or use a host allow-list.
if (!empty($_GET["url"])) {
    crawl($_GET["url"]);
}
?>
</body>
</html>
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: php 采集函数,很好的php采集函数
- 下一篇: 使用phpQuery 抓取HTML 页面内容