PHP采集指定HTML标签代码内容分享
引用www.dachunblog.com
说明:采用curl方式读取URL内容,可采集一些防盗链地址的内容,并对gzip压缩的内容进行解压,采集部分HTML标签内容。HTML解析采用strpos,效率高于正则采集(响应时间更短)。
下一版本预告:1.防盗链图片采集方法,2.通用新闻页正文采集(同一段代码可以采集大部分新闻页正文)请关注blog.dachun.net大春博客
如果有问题或有更好的方式,请到 http://blog.dachun.net/?p=182 留言交流
<?php
// Response headers, sent in this order: an Expires date in the past, the
// UTF-8 HTML content type, and the two no-cache directives.
$responseHeaders = array(
    "Expires: Thu, 01 Jan 1970 00:00:01 GMT",
    "Content-type: text/html; charset=utf-8",
    "Cache-Control: no-cache, must-revalidate, max-age=0",
    "Pragma: no-cache",
);
foreach ($responseHeaders as $headerLine) {
    header($headerLine);
}
/**
 * PHP scraper utility class v1.0
 * Author: Dachun (blog.dachun.net)
 *
 * Fetches a URL with cURL, un-gzips the response body when required, converts
 * it to the target charset and locates HTML tag ranges by strpos-based
 * scanning rather than regular expressions.
 *
 * Planned for the next version (per the author): hotlink-protected image
 * fetching and a generic news-article body extractor.
 */
class caiji{
    /** Target character encoding the fetched HTML is converted to. */
    public $unicode = "utf-8";

    /**
     * Fetch the content of a URL with cURL.
     *
     * The response headers are requested together with the body so that the
     * content encoding and charset can be inspected; only the body is returned.
     *
     * @param string $url Absolute http:// or https:// URL.
     * @return string|false Body converted to $this->unicode, or false when the
     *                      URL is not http(s) or the transfer fails.
     */
    public function openformit($url){
        // '#' is used as the delimiter so the slashes in "://" need no escaping.
        if (!is_string($url) || !preg_match('#^https?://#i', $url)) {
            return false;
        }
        $ch = curl_init();
        if ($ch === false) {
            return false;
        }
        $options = array(
            CURLOPT_URL => $url,
            CURLOPT_RETURNTRANSFER => true,
            CURLOPT_HEADER => true,
            CURLOPT_CONNECTTIMEOUT => 10,
        );
        if (!curl_setopt_array($ch, $options)) {
            curl_close($ch);
            return false;
        }
        $response = curl_exec($ch);
        $info = curl_getinfo($ch);
        // Release the handle on every path once the transfer data is captured.
        curl_close($ch);
        if ($response === false) {
            return false;
        }
        $headerSize = isset($info["header_size"]) ? (int) $info["header_size"] : 0;
        $header = (string) substr($response, 0, $headerSize);
        // Cast because substr() may return false for an empty body on old PHP.
        $html = (string) substr($response, $headerSize);
        if (preg_match('/^Content-Encoding:\s*(?:x-)?gzip\b/mi', $header)) {
            $html = $this->my_gzdecode($html);
        }
        // Header names/values are case-insensitive ("UTF-8" vs "utf-8").
        if (stripos($header, $this->unicode) === false) {
            $html = mb_convert_encoding($html, $this->unicode, "utf-8,gb2312,gbk,big5");
        }
        return $html;
    }

    /**
     * Decompress gzip data, falling back to manual header parsing plus
     * gzinflate() when the native gzdecode() is unavailable.
     *
     * This is best-effort: data that is not gzip, or that fails to decode,
     * is returned unchanged.
     *
     * @param string $data Possibly gzip-compressed bytes.
     * @return string Decompressed bytes, or $data itself when decoding is not possible.
     */
    public function my_gzdecode($data){
        // A gzip member starts with the magic bytes 1F 8B and a 10-byte fixed header.
        if (!is_string($data) || strlen($data) < 10 || substr($data, 0, 2) !== "\x1f\x8b") {
            return $data;
        }
        if (function_exists("gzdecode")) {
            $decoded = gzdecode($data);
            return ($decoded === false) ? $data : $decoded;
        }
        if (!function_exists("gzinflate")) {
            return $data;
        }
        $flags = ord($data[3]);
        $headerlen = 10;
        if ($flags & 4) { // Extra field: 2-byte little-endian length, then payload
            $extra = unpack("v", substr($data, 10, 2));
            $headerlen += 2 + $extra[1];
        }
        // Filename (8) and comment (16) are both zero-terminated strings.
        foreach (array(8, 16) as $bit) {
            if ($flags & $bit) {
                if ($headerlen > strlen($data)) {
                    return $data;
                }
                $terminator = strpos($data, chr(0), $headerlen);
                if ($terminator === false) {
                    return $data; // truncated header
                }
                $headerlen = $terminator + 1;
            }
        }
        if ($flags & 2) { // Header CRC16
            $headerlen += 2;
        }
        $unpacked = gzinflate(substr($data, $headerlen));
        return ($unpacked === false) ? $data : $unpacked;
    }

    /**
     * Check whether a string is well-formed UTF-8 (restricted to tab, LF, CR
     * and printable characters in the ASCII range).
     *
     * @param string $string Bytes to validate.
     * @return int|false 1 when valid, 0 when not, false on a PCRE error.
     */
    public function is_utf8($string){
        // Single-quoted so the \xNN escapes reach PCRE intact.
        return preg_match('%^(?:
              [\x09\x0A\x0D\x20-\x7E]            # ASCII
            | [\xC2-\xDF][\x80-\xBF]             # non-overlong 2-byte
            | \xE0[\xA0-\xBF][\x80-\xBF]         # excluding overlongs
            | [\xE1-\xEC\xEE\xEF][\x80-\xBF]{2}  # straight 3-byte
            | \xED[\x80-\x9F][\x80-\xBF]         # excluding surrogates
            | \xF0[\x90-\xBF][\x80-\xBF]{2}      # planes 1-3
            | [\xF1-\xF3][\x80-\xBF]{3}          # planes 4-15
            | \xF4[\x80-\x8F][\x80-\xBF]{2}      # plane 16
        )*$%xs', $string);
    }

    /**
     * Locate every occurrence of an HTML element, matched case-insensitively.
     *
     * $tag is the opening tag text without the angle brackets and may include
     * attributes, e.g. 'ul class="news_list"'. Nested elements with the same
     * tag name are balanced so the range ends at the matching closing tag.
     * Elements without a matching closing tag are skipped.
     *
     * @param string $html HTML source.
     * @param string $tag  Opening tag text, e.g. 'div' or 'ul class="x"'.
     * @return array List of array("s" => byte offset of "<", "e" => byte length
     *               of the element including its closing tag), suitable for
     *               substr($html, $r["s"], $r["e"]). Empty array when nothing matches.
     */
    public function getTags($html, $tag){
        $matches = array();
        $tag = strtolower(trim($tag));
        if ($tag === "") {
            return $matches;
        }
        $haystack = strtolower($html);
        // The element name is everything before the first space in $tag.
        $parts = explode(" ", $tag, 2);
        $name = $parts[0];
        $open = "<" . $tag;
        $openLen = strlen($open);
        $nestedOpen = "<" . $name;
        $nestedLen = strlen($nestedOpen);
        $close = "</" . $name . ">";
        $closeLen = strlen($close);
        $bareName = ($tag === $name);
        $offset = 0;
        while (true) {
            // A bare tag name must end at a boundary so "ul" does not match "<ultra>".
            $start = $bareName
                ? $this->findOpening($haystack, $open, $offset)
                : strpos($haystack, $open, $offset);
            if ($start === false) {
                return $matches;
            }
            $offset = $start + $openLen;
            // Walk forward, tracking nesting depth, until the matching close tag.
            $depth = 1;
            $cursor = $offset;
            while ($depth > 0) {
                $nextClose = strpos($haystack, $close, $cursor);
                if ($nextClose === false) {
                    break; // unbalanced markup
                }
                $nextOpen = $this->findOpening($haystack, $nestedOpen, $cursor);
                if ($nextOpen !== false && $nextOpen < $nextClose) {
                    $depth++;
                    $cursor = $nextOpen + $nestedLen;
                } else {
                    $depth--;
                    $cursor = $nextClose + $closeLen;
                }
            }
            if ($depth === 0) {
                $matches[] = array("s" => $start, "e" => $cursor - $start);
            }
        }
    }

    /**
     * Count opening tags, matched case-insensitively.
     *
     * An occurrence is counted when "<$tag" is followed by whitespace, ">",
     * "/" or the end of the input.
     *
     * @param string $html HTML source.
     * @param string $tag  Tag name (optionally with attributes), without "<".
     * @return int Number of opening tags found.
     */
    public function countTag($html, $tag){
        $tag = strtolower(trim($tag));
        if ($tag === "") {
            return 0;
        }
        $haystack = strtolower($html);
        $needle = "<" . $tag;
        $needleLen = strlen($needle);
        $count = 0;
        $offset = 0;
        while (($pos = $this->findOpening($haystack, $needle, $offset)) !== false) {
            $count++;
            $offset = $pos + $needleLen;
        }
        return $count;
    }

    /**
     * Find the next occurrence of $needle that ends at a tag boundary
     * (whitespace, ">", "/" or end of input).
     *
     * @param string $haystack Text to search (already lower-cased by callers).
     * @param string $needle   Non-empty text such as "<div".
     * @param int    $offset   Byte offset to start searching from.
     * @return int|false Byte offset of the match, or false when there is none.
     */
    private function findOpening($haystack, $needle, $offset){
        $total = strlen($haystack);
        $needleLen = strlen($needle);
        while ($offset <= $total) {
            $pos = strpos($haystack, $needle, $offset);
            if ($pos === false) {
                return false;
            }
            $after = $pos + $needleLen;
            if ($after >= $total || strpos(" \t\r\n>/", $haystack[$after]) !== false) {
                return $pos;
            }
            $offset = $pos + 1;
        }
        return false;
    }
}
// Demo: scrape the headline news list from the Sina military channel.
$url = "http://mil.news.sina.com.cn/";
$cj = new caiji; // instantiate the scraper class
if(!$html=$cj->openformit($url)){
    echo "URL地址无法打开";
    exit;
}
// Single quotes are required here: the tag text itself contains double quotes.
// Locates the range of <ul class="news_list">...</ul>.
$uls = $cj->getTags($html,'ul class="news_list"');
if(!isset($uls[0]["s"])){
    echo "无法获取标签内容";
    exit;
}
// "s" is the start offset and "e" the length passed to substr().
$content = substr($html,$uls[0]["s"],$uls[0]["e"]);
echo $content;
?>声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: 得到最后一次SQL执行语句
- 下一篇: php获取当前html页面某个字段的值
