QueryList抓取尝试
<?php
/**
* QueryList使用示例
*
* 入门教程:http://doc.querylist.cc/site/index/doc/4
*
* QueryList::Query(采集的目标页面,采集规则[,区域选择器][,输出编码][,输入编码][,是否移除头部])
* //采集规则
* $rules = array(
* "规则名" => array("jQuery选择器","要采集的属性"[,"标签过滤列表"][,"回调函数"]),
* "规则名2" => array("jQuery选择器","要采集的属性"[,"标签过滤列表"][,"回调函数"]),
* ..........
* [,"callback"=>"全局回调函数"]
* );
*/
// Remove PHP's execution time limit (0 = unlimited); the crawl below issues many HTTP requests in sequence.
set_time_limit(0);
// Load vendor/autoload.php (the conventional Composer autoloader location) so the QueryList class can be resolved.
require "vendor/autoload.php";
/**
 * Return the given text as UTF-8, treating anything that is not valid UTF-8 as GBK.
 *
 * The scraped pages may be served in GBK, so text that is not well-formed UTF-8
 * is converted from GBK. Text that already validates as UTF-8 is returned unchanged.
 *
 * @param string $str Raw text as extracted from a page.
 * @return string UTF-8 text. Empty or non-string input is returned as-is.
 */
function convToUtf8($str) {
    // Nothing to convert: keep missing/empty values exactly as they were.
    if (!is_string($str) || $str === "") {
        return $str;
    }
    // Strict validation instead of mb_detect_encoding(): detection is a heuristic,
    // and a candidate list containing ISO-8859-1 accepts any byte sequence.
    if (mb_check_encoding($str, "UTF-8")) {
        return $str;
    }
    // mb_convert_encoding() substitutes undecodable bytes rather than returning
    // false the way iconv() does on a malformed or truncated GBK sequence.
    return mb_convert_encoding($str, "UTF-8", "GBK");
}
// QueryList v3 lives in the QL namespace; without the separator the class cannot be resolved.
use QL\QueryList;

// Directory URL that every relative link scraped from the ranking pages is appended to.
$baseUrl = "http://www.chinadegrees.cn/webrms/pages/Ranking/";

// Level 1: entry text and its link, taken from the first cell of the page body table.
$rules = array(
    "parent" => array(".PageBody>table>tr>td:eq(0)>p","text"),
    "href" => array(".PageBody>table>tr>td:eq(0)>p>a","href"),
);
// Level 2: entry text and its link, taken from the #leftgundong list.
$rulesxueke = array(
    "child" => array("#leftgundong>table>tr>td>p","text"),
    "hrefurl" => array("#leftgundong>table>tr>td>p>a","href"),
);
// Level 3: text of each row of the table nested in the third cell of the page body table.
$rulesthree = array(
    "threename" => array(".PageBody>table>tr>td:eq(2)>table>tr:eq(3)>td>div>table>tr","text"),
);

/**
 * Download a page and extract rows from it with the given QueryList rules.
 *
 * @param string $url   Absolute URL of the page to download.
 * @param array  $rules QueryList rule set to apply to the downloaded HTML.
 * @return array Extracted rows; an empty array when the download fails or returns nothing.
 */
function fetchRankingRows($url, $rules) {
    $page = file_get_contents($url);
    // file_get_contents() returns false on failure; never hand that to the parser.
    if ($page === false || $page === "") {
        return array();
    }
    return QueryList::Query($page, $rules)->data;
}

$html = $baseUrl."xkpmGXZJ.jsp";
$data = QueryList::Query($html,$rules)->data; // level 1

// Walk level 1 -> level 2 -> level 3, nesting each level's rows under its parent row.
foreach ($data as $k => $v) {
    if (isset($v["parent"])) {
        $data[$k]["parent"] = convToUtf8($v["parent"]);
    }

    // A row whose <p> had no link has no "href"; it simply gets no children.
    $secondLevel = array();
    if (isset($v["href"]) && $v["href"] !== "") {
        $secondLevel = fetchRankingRows($baseUrl.$v["href"], $rulesxueke);
    }

    foreach ($secondLevel as $kk => $vv) {
        if (isset($vv["child"])) {
            $secondLevel[$kk]["child"] = convToUtf8($vv["child"]);
        }

        $thirdLevel = array();
        if (isset($vv["hrefurl"]) && $vv["hrefurl"] !== "") {
            $thirdLevel = fetchRankingRows($baseUrl.$vv["hrefurl"], $rulesthree);
        }
        foreach ($thirdLevel as $tck => $tcv) {
            if (isset($tcv["threename"])) {
                $thirdLevel[$tck]["threename"] = convToUtf8($tcv["threename"]);
            }
        }
        // Assigned once, after conversion, so the key exists (possibly empty) on every level-2 row.
        // The key spelling "chreelist" is kept because it is part of the printed output.
        $secondLevel[$kk]["chreelist"] = $thirdLevel;
    }

    $data[$k]["twoleveldata"] = $secondLevel;
}

print_r($data);
die();
//根据二级去抓取三级
/* $html = "http://www.chinadegrees.cn/webrms/pages/Ranking/xkpmGXZJ.jsp?yjxkdm=0201&xkdm=01,02,03,04,05,06";
$data = QueryList::Query($html,$rules)->data; //一级
foreach($data as $k=>$v) {
$data[$k]["parent"] = convToUtf8($v["parent"]);
}
print_r($data);die(); */
一次抓取大学录取数据的尝试
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
