牛骨文教育服务平台(让学习变得简单)
博文笔记

PHP采集指定HTML标签代码内容分享

创建时间:2012-09-04 投稿人: 浏览次数:4246

引用www.dachunblog.com

说明:采用curl方式读取URL内容,可采集部分带防盗链的地址内容,并对gzip压缩的内容进行解压,然后采集指定HTML标签的内容。HTML解析采用strpos实现,效率和响应速度优于正则采集。

下一版本预告:1. 防盗链图片采集方法;2. 通用新闻页正文采集(同一段代码可以采集大部分新闻页正文)。请关注大春博客 blog.dachun.net。

如果有问题或有更好的实现方式,请到 http://blog.dachun.net/?p=182 留言交流。

<?php
// Send the response headers: mark the page as already expired, declare
// UTF-8 HTML output, and tell clients/proxies not to cache it.
$responseHeaders = array(
    "Expires: Thu, 01 Jan 1970 00:00:01 GMT",
    "Content-type: text/html; charset=utf-8",
    "Cache-Control: no-cache, must-revalidate, max-age=0",
    "Pragma: no-cache",
);
foreach ($responseHeaders as $headerLine) {
    header($headerLine);
}
/**
 * PHP scraper utility class v1.0
 * Author: 大春 (blog.dachun.net)
 *
 * Fetches a URL with cURL, inflates gzip-compressed bodies, converts the
 * body to the configured charset and locates HTML elements inside it.
 * Element lookup is done with strpos() scans rather than regular
 * expressions.
 */
class caiji{

    /**
     * Charset the fetched document is converted to. It is also the name
     * searched for in the response headers to decide whether a conversion
     * is needed.
     */
    public $unicode = "utf-8";

    /**
     * Fetch a URL with cURL and return its body in the $unicode charset.
     *
     * @param string $url Absolute http:// or https:// URL.
     * @return string|false Response body, or false when the URL is not
     *                      http(s), cURL cannot be configured, or the
     *                      transfer fails.
     */
    public function openformit($url){
        // '#' delimiters avoid having to escape the slashes of "://".
        if(!preg_match('#^https?://#i', $url)) return false;

        $ch = curl_init();
        if(false === $ch) return false;

        $options = array(
            CURLOPT_URL            => $url,
            CURLOPT_RETURNTRANSFER => 1,
            CURLOPT_HEADER         => true, // headers are needed for the gzip/charset checks below
            CURLOPT_CONNECTTIMEOUT => 10,
        );
        if(!curl_setopt_array($ch, $options)){ return false; }

        $response = curl_exec($ch);
        if(false === $response){ return false; }

        // The response is "headers + body"; split it at the reported header size.
        $headerSize = curl_getinfo($ch, CURLINFO_HEADER_SIZE);
        $header     = (string)substr($response, 0, $headerSize);
        $html       = (string)substr($response, $headerSize);

        // my_gzdecode() leaves non-gzip data untouched, so a loose header
        // match is safe here.
        if(false !== stripos($header, "gzip")){
            $html = $this->my_gzdecode($html);
        }
        // Header values such as "charset=UTF-8" vary in case, hence stripos().
        if(false === stripos($header, $this->unicode)){
            $html = mb_convert_encoding($html,$this->unicode,"utf-8,gb2312,gbk,big5");
        }
        return $html;
    }

    /**
     * Inflate a gzip stream.
     *
     * Uses gzdecode() when the runtime provides it, otherwise skips the
     * gzip header by hand and inflates the rest with gzinflate().
     *
     * @param string $data Possibly gzip-compressed bytes.
     * @return string The decompressed data, or $data unchanged when it does
     *                not start with the gzip magic bytes or cannot be
     *                decompressed.
     */
    public function my_gzdecode($data){
        // A gzip stream has a 10-byte fixed header starting with 0x1f 0x8b.
        if (strlen ( $data ) < 10 || "\x1f\x8b" !== substr ( $data, 0, 2 )) {
            return $data;
        }
        if (function_exists ( "gzdecode" )) {
            $unpacked = gzdecode ( $data );
            return (false === $unpacked) ? $data : $unpacked;
        }

        // Manual fallback: compute where the deflate payload begins.
        $flags = ord ( substr ( $data, 3, 1 ) );
        $headerlen = 10;
        if ($flags & 4) { // Extra field: 2-byte little-endian length + payload
            $extralen = unpack ( "v", substr ( $data, 10, 2 ) );
            $headerlen += 2 + $extralen [1];
        }
        if ($flags & 8) { // Zero-terminated original file name
            $headerlen = strpos ( $data, chr ( 0 ), $headerlen ) + 1;
        }
        if ($flags & 16) { // Zero-terminated comment
            $headerlen = strpos ( $data, chr ( 0 ), $headerlen ) + 1;
        }
        if ($flags & 2) { // 2-byte header CRC
            $headerlen += 2;
        }
        $unpacked = gzinflate ( substr ( $data, $headerlen ) );
        if ($unpacked === FALSE) {
            $unpacked = $data;
        }
        return $unpacked;
    }

    /**
     * Check whether a string is well-formed UTF-8.
     *
     * Of the ASCII range, only tab, LF, CR and 0x20-0x7E are accepted.
     *
     * @param string $string Bytes to validate.
     * @return int|false preg_match() result: 1 when valid, 0 when not,
     *                   false if the regex engine reports an error.
     */
    public function is_utf8($string){
        // Single-quoted so the \xNN escapes reach PCRE untouched.
        return preg_match ( '%^(?:
              [\x09\x0A\x0D\x20-\x7E]            # ASCII
            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
            |  \xE0[\xA0-\xBF][\x80-\xBF]        # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
            |  \xED[\x80-\x9F][\x80-\xBF]        # excluding surrogates
            |  \xF0[\x90-\xBF][\x80-\xBF]{2}     # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
            |  \xF4[\x80-\x8F][\x80-\xBF]{2}     # plane 16
        )*$%xs', $string );
    }

    /**
     * Locate HTML elements and report where each one sits in the source.
     *
     * Matching is case-insensitive. $tag is the start of the opening tag
     * without the "<", optionally including attributes exactly as they
     * appear in the markup, e.g. 'ul class="news_list"'. Nested elements
     * of the same name are balanced, so a match ends at its own closing
     * tag. Openings that have no closing tag are skipped.
     *
     * @param string $html HTML source.
     * @param string $tag  Tag selector, e.g. 'div' or 'ul class="news_list"'.
     * @return array List of array("s" => start offset, "e" => length), so
     *               that substr($html, $m["s"], $m["e"]) yields the element
     *               including its opening and closing tags. Empty when
     *               nothing matches.
     */
    public function getTags($html, $tag){
        $matches = array();
        $tag     = strtolower($tag);
        if("" === $tag) return $matches;
        $haystack = strtolower($html);

        // The closing tag uses only the element name, not the attributes.
        $spacePos = strpos($tag, " ");
        $name     = (false !== $spacePos && $spacePos > 0) ? substr($tag, 0, $spacePos) : $tag;

        $open     = "<".$tag;
        $openLen  = strlen($open);
        $closeLen = strlen($name) + 3; // strlen("</" . $name . ">")
        $offset   = 0;

        while(false !== ($start = strpos($haystack, $open, $offset))){
            // Resume right after this opening so nested matches are reported too.
            $offset = $start + $openLen;
            $end    = $this->findClosingTag($haystack, $name, $offset);
            if(false === $end) continue;
            $matches[] = array("s" => $start, "e" => $end + $closeLen - $start);
        }
        return $matches;
    }

    /**
     * Find the closing tag that balances an element already opened before $from.
     *
     * @param string $haystack Lower-cased HTML.
     * @param string $name     Lower-cased element name.
     * @param int    $from     Offset just past the element's opening.
     * @return int|false Offset of the matching "</name>", or false if none.
     */
    private function findClosingTag($haystack, $name, $from){
        $open     = "<".$name;
        $close    = "</".$name.">";
        $openLen  = strlen($open);
        $closeLen = strlen($close);
        $depth    = 1;
        $cursor   = $from;

        while(true){
            $closePos = strpos($haystack, $close, $cursor);
            if(false === $closePos) return false;

            // Next real opening of the same element before that closing tag;
            // look-alikes such as "<ulx" are skipped.
            $openPos = strpos($haystack, $open, $cursor);
            while(false !== $openPos && $openPos < $closePos
                && false === strpos(" \t\r\n\f/>", $haystack[$openPos + $openLen])){
                $openPos = strpos($haystack, $open, $openPos + $openLen);
            }

            if(false !== $openPos && $openPos < $closePos){
                $depth++;
                $cursor = $openPos + $openLen;
                continue;
            }
            $depth--;
            if(0 === $depth) return $closePos;
            $cursor = $closePos + $closeLen;
        }
    }

    /**
     * Count opening tags that carry attributes.
     *
     * Counts case-sensitive occurrences of "<$tag " (note the trailing
     * space), so bare tags such as "<ul>" are not counted.
     *
     * @param string $html HTML source.
     * @param string $tag  Element name.
     * @return int Number of occurrences.
     */
    public function countTag($html,$tag){
        return substr_count($html, "<".$tag." ");
    }
}

// Demo: scrape the headline news list from the Sina military channel.
$url = "http://mil.news.sina.com.cn/";
$cj = new caiji; // instantiate the scraper class
$html = $cj->openformit($url);
if(!$html){
    echo "URL地址无法打开";
    exit;
}
// Locate <ul class="news_list">...</ul>. The selector is single-quoted so
// the double quotes around the class value do not end the string literal.
$uls = $cj->getTags($html,'ul class="news_list"');
if(!isset($uls[0]["s"])){
    echo "无法获取标签内容";
    exit;
}
// "s" is the start offset and "e" the length of the first match.
$content = substr($html,$uls[0]["s"],$uls[0]["e"]);
echo $content;
?>



声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。