牛骨文教育服务平台(让学习变得简单)
博文笔记

获取前端网页 php爬虫 get_html.php

创建时间:2016-10-04 投稿人: 浏览次数:1068
<!DOCTYPE html>
<html>
<head>
<title>spider</title>
</head>
<body>
<!-- Crawl form: submits the typed address to get_html.php via GET as the "url" query parameter. -->
<form method="get" action="get_html.php">
crawl web html address:<input type="text" name="url" >
<input type="submit" value="crawl">
</form>
<?php
//"PRC" is only a backward-compatibility alias; "Asia/Shanghai" is the canonical identifier for the same zone.
date_default_timezone_set("Asia/Shanghai");
/**
 * Debug helper: print a variable inside a <pre> block, then stop the script.
 *
 * The script terminates via exit(), which also prints the current timestamp
 * (Y-m-d H:i:s) so successive debug runs can be told apart.
 *
 * @param mixed $var value to inspect with var_dump()
 */
function dump($var){
	echo "<pre>";
	var_dump($var);
	//close the block opened above (the old code emitted a second opening tag)
	echo "</pre>";
	exit(date("Y-m-d H:i:s"));
}
//Mirror a fetched web page and its css/js/image resources into a local project directory.
class spider{
	/** @var string page address passed to the constructor (trimmed) */
	public $url;
	/** @var string scheme of the page address, "http" or "https" */
	public $http;
	/** @var string host name parsed from the page address */
	public $host;
	/** @var string raw html of the fetched page */
	public $html;
	/** @var string local directory that receives the mirrored files */
	public $path;
	/** @var string name of the local directory; currently always equal to $host */
	public $title;

	/**
	 * One pattern per resource kind, shared by the downloaders and by formate_html()
	 * so that both always agree on which references exist. Capture group 1 is the
	 * resource address up to and including its extension.
	 *
	 * - Character classes keep a match inside one tag and one quoted attribute value.
	 * - The lookahead requires the extension to end the path while still allowing a
	 *   cache-busting "?v=..." suffix, which sites add so browsers do not reuse a stale copy.
	 * - In the img pattern \K drops everything before the quoted src value from the
	 *   reported match, so a replacement only touches that value and keeps the other attributes.
	 */
	private static $patterns=array(
		"css"=>'~<link[^>]*?href=["\']([^"\'>]*?\.css)(?=["\'?#])[^>]*>~i',
		"js"=>'~<script[^>]*?src=["\']([^"\'>]*?\.js)(?=["\'?#])[^>]*>~i',
		"img"=>'~<img[^>]*?src=\K["\']([^"\'>]*?\.(?:gif|png|jpg))(?=["\'?#])[^"\'>]*["\']~i',
	);

	/**
	 * Fetch the page and prepare "<script dir>/<host>/style" for the mirrored files.
	 *
	 * Terminates the script with a short message when the address is missing,
	 * is not an http(s) address, or the page cannot be loaded.
	 *
	 * @param string $url        absolute http(s) address of the page to mirror
	 * @param string $imagesPath unused; kept so existing callers keep working
	 */
	function __construct($url,$imagesPath=""){
		set_time_limit(60);
		$url=is_string($url)?trim($url):"";
		if($url===""){
			exit("Please input url!");
		}
		//anchored so that only real http(s) addresses ever reach curl
		$matches=array();
		if(!preg_match('~^(https?)://([\w.-]+)~i',$url,$matches)){
			exit("Please input a valid http(s) url!");
		}
		$this->url=$url;
		$this->http=strtolower($matches[1]);
		$this->host=$matches[2];
		$this->title=$this->host;

		$res=$this->fetch($url);
		if(!$res){
			exit("could not load html webpage.");
		}
		$this->html=$res;
		$this->path=dirname(__FILE__)."/".$this->title;
		$this->ensure_dir($this->path);
		$this->ensure_dir($this->path."/style");
	}

	/**
	 * Download every address in the list into the root of the mirror directory.
	 *
	 * @param array $url_array absolute addresses to download
	 */
	function get_resource($url_array){
		foreach ($url_array as $url) {
			$res=$this->fetch($url);
			if($res===false){
				continue;
			}
			$this->save($this->path,$url,$res);
		}
	}

	/** Download the gif/png/jpg images referenced by the page into style/img. */
	function get_image(){
		$this->download("img");
		echo "<br />get image over.";
	}

	/** Download the stylesheets referenced by the page into style/css. */
	function get_css(){
		$this->download("css");
		echo "<br />get css over.";
	}

	/** Download the scripts referenced by the page into style/js. */
	function get_js(){
		$this->download("js");
		echo "<br />get js over.";
	}

	/**
	 * Point the page's css/js/img references at the local copies and save the page.
	 *
	 * link and script tags are replaced as a whole; for images only the src value changes.
	 * The file is written to "<path>/<basename of url without its extension>.html".
	 */
	function formate_html(){
		$res=(string)$this->html;
		//local names are derived with basename(), exactly like save() does when downloading
		$res=preg_replace_callback(self::$patterns["css"], function ($m){
			return "<link href=\"./style/css/".basename($m[1])."\" rel=\"stylesheet\" type=\"text/css\" />";
		}, $res);
		$res=preg_replace_callback(self::$patterns["js"], function ($m){
			return "<script type=\"text/javascript\" src=\"./style/js/".basename($m[1])."\">";
		}, $res);
		$res=preg_replace_callback(self::$patterns["img"], function ($m){
			return "\"./style/img/".basename($m[1])."\"";
		}, $res);

		$file_name=preg_replace('/\.\w+$/', "", basename($this->url));
		$file=$this->path."/".$file_name.".html";
		if(file_put_contents($file, $res)!==false){
			chmod($file, 0777);
		}
	}

	/**
	 * GET an address with a 5 second timeout.
	 *
	 * @param string $url absolute address
	 * @return string|false response body, or false when the transfer failed
	 */
	private function fetch($url){
		$ch=curl_init($url);
		if($ch===false){
			return false;
		}
		curl_setopt($ch, CURLOPT_HEADER, 0);
		curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
		curl_setopt($ch, CURLOPT_TIMEOUT, 5);
		$res=curl_exec($ch);
		curl_close($ch);
		return $res;
	}

	/** Create a directory (and missing parents) with the permissive mode used for all mirrored files. */
	private function ensure_dir($dir){
		if(!file_exists($dir)){
			mkdir($dir, 0777, true);
			//mkdir's mode is filtered by the umask, chmod is not
			chmod($dir, 0777);
		}
	}

	/** Write downloaded content into $dir under the basename of the address it came from. */
	private function save($dir,$url,$content){
		$this->ensure_dir($dir);
		$file=$dir."/".basename($url);
		if(file_put_contents($file, $content)!==false){
			chmod($file, 0777);
		}
	}

	/**
	 * Turn a reference found in the page into an absolute address.
	 *
	 * Relative references are resolved against the site root, not the page's directory.
	 */
	private function absolute_url($url){
		//protocol-relative ("//cdn.example.com/x.js") must be tested before the single-slash case
		if(strpos($url, "//")===0){
			return $this->http.":".$url;
		}
		if(strpos($url, "/")===0){
			return $this->http."://".$this->host.$url;
		}
		if(strpos($url, "//")===false){
			return $this->http."://".$this->host."/".$url;
		}
		return $url;
	}

	/**
	 * Download every resource of one kind ("css", "js" or "img") into style/<kind>.
	 * Failed transfers are skipped instead of leaving empty files behind.
	 */
	private function download($kind){
		$matches=array();
		preg_match_all(self::$patterns[$kind], (string)$this->html, $matches);
		foreach (array_unique($matches[1]) as $url) {
			$res=$this->fetch($this->absolute_url($url));
			if($res===false){
				continue;
			}
			$this->save($this->path."/style/".$kind, $url, $res);
		}
	}
}

/**
 * Mirror one page: build a spider for the address, then run its steps in order
 * (stylesheets, scripts, images, and finally the rewritten html file).
 *
 * @param string $url address of the page to mirror
 */
function crawl($url){
	$spider=new spider($url);
	$steps=array("get_css","get_js","get_image","formate_html");
	foreach ($steps as $step) {
		$spider->$step();
	}
}

//Crawl only when the form supplied a non-empty string; "?url[]=x" would otherwise hand an array to the crawler.
if (!empty($_GET["url"]) && is_string($_GET["url"])) {
	crawl($_GET["url"]);
}

?>
</body>
</html>


声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。