QueryList抓取尝试
<?php
/**
 * QueryList usage example: crawl a three-level subject ranking tree.
 *
 * Getting-started tutorial: http://doc.querylist.cc/site/index/doc/4
 *
 * QueryList::Query(target page, rules[, range selector][, output encoding][, input encoding][, remove head])
 *
 * Rule format:
 * $rules = array(
 *     "rule name"  => array("jQuery selector", "attribute to collect"[, "tag filter list"][, "callback"]),
 *     "rule name2" => array("jQuery selector", "attribute to collect"[, "tag filter list"][, "callback"]),
 *     ..........
 *     [, "callback" => "global callback"]
 * );
 */

// QueryList v3 lives in the QL namespace; without the separator the alias is
// never created and every QueryList::Query() call fails with "class not found".
use QL\QueryList;

set_time_limit(0);
require "vendor/autoload.php";

/**
 * Return $str as UTF-8, converting from GBK when it is not already valid UTF-8.
 *
 * Validity is checked with mb_check_encoding() instead of guessing with
 * mb_detect_encoding(), and the conversion uses mb_convert_encoding(), which
 * substitutes undecodable bytes instead of returning false like iconv() does.
 *
 * @param mixed $str Text to normalise; non-string and empty values are returned untouched.
 * @return mixed The UTF-8 string, or the original value when no conversion applies.
 */
function convToUtf8($str)
{
    if (!is_string($str) || $str === "") {
        return $str;
    }

    if (mb_check_encoding($str, "UTF-8")) {
        return $str;
    }

    return mb_convert_encoding($str, "UTF-8", "GBK");
}

// Level 1: discipline categories listed on the entry page.
$rules = array(
    "parent" => array(".PageBody>table>tr>td:eq(0)>p", "text"),
    "href"   => array(".PageBody>table>tr>td:eq(0)>p>a", "href"),
);

// Level 2: subjects listed in the left-hand panel of a category page.
$rulesxueke = array(
    "child"   => array("#leftgundong>table>tr>td>p", "text"),
    "hrefurl" => array("#leftgundong>table>tr>td>p>a", "href"),
);

// Level 3: table rows of a subject page.
$rulesthree = array(
    "threename" => array(".PageBody>table>tr>td:eq(2)>table>tr:eq(3)>td>div>table>tr", "text"),
);

// All collected hrefs are relative to this directory.
$baseUrl = "http://www.chinadegrees.cn/webrms/pages/Ranking/";

$html = $baseUrl . "xkpmGXZJ.jsp";
$data = QueryList::Query($html, $rules)->data; // level 1

// Walk level 1 -> level 2 -> level 3, attaching each child list to its parent row.
foreach ($data as $k => $v) {
    if (isset($v["parent"])) {
        $data[$k]["parent"] = convToUtf8($v["parent"]);
    }

    // Always expose the key so consumers can iterate without isset() checks.
    $data[$k]["twoleveldata"] = array();

    if (empty($v["href"])) {
        continue; // row without a link: nothing to follow
    }

    $levelTwoHtml = file_get_contents($baseUrl . $v["href"]);
    if ($levelTwoHtml === false) {
        continue; // fetch failed: keep the empty list instead of parsing `false`
    }

    $levelTwo = QueryList::Query($levelTwoHtml, $rulesxueke)->data; // level 2

    foreach ($levelTwo as $tk => $tv) {
        if (isset($tv["child"])) {
            $levelTwo[$tk]["child"] = convToUtf8($tv["child"]);
        }

        // NOTE: "chreelist" looks like a typo, but it is the key the script has
        // always emitted, so it is kept as is.
        $levelTwo[$tk]["chreelist"] = array();

        if (empty($tv["hrefurl"])) {
            continue;
        }

        $levelThreeHtml = file_get_contents($baseUrl . $tv["hrefurl"]);
        if ($levelThreeHtml === false) {
            continue;
        }

        $levelThree = QueryList::Query($levelThreeHtml, $rulesthree)->data; // level 3

        foreach ($levelThree as $tck => $tcv) {
            if (isset($tcv["threename"])) {
                $levelThree[$tck]["threename"] = convToUtf8($tcv["threename"]);
            }
        }

        // Assigned once after conversion rather than on every inner iteration.
        $levelTwo[$tk]["chreelist"] = $levelThree;
    }

    $data[$k]["twoleveldata"] = $levelTwo;
}

print_r($data);
die();

// Earlier experiment kept for reference: fetch a single filtered page directly.
/*
$html = "http://www.chinadegrees.cn/webrms/pages/Ranking/xkpmGXZJ.jsp?yjxkdm=0201&xkdm=01,02,03,04,05,06";
$data = QueryList::Query($html,$rules)->data; // level 1
foreach($data as $k=>$v) {
    $data[$k]["parent"] = convToUtf8($v["parent"]);
}
print_r($data);die();
*/
一次抓取大学录取数据的尝试
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。