牛骨文教育服务平台(让学习变得简单)
博文笔记

QueryList抓取尝试

创建时间:2017-03-21 投稿人: 浏览次数:1565
<?php
/**
 * QueryList使用示例
 *
 * 入门教程:http://doc.querylist.cc/site/index/doc/4
 *
 * QueryList::Query(采集的目标页面,采集规则[,区域选择器][,输出编码][,输入编码][,是否移除头部])
 * //采集规则
 * $rules = array(
 *   "规则名" => array("jQuery选择器","要采集的属性"[,"标签过滤列表"][,"回调函数"]),
 *   "规则名2" => array("jQuery选择器","要采集的属性"[,"标签过滤列表"][,"回调函数"]),
 *    ..........
 *    [,"callback"=>"全局回调函数"]
 * );
 */
// Remove the execution time limit (0 means "no limit"): the crawl below
// performs many sequential HTTP requests.
set_time_limit(0);
// Pull in the autoloader (presumably Composer's) so the QueryList class can be
// resolved; the path is relative, so it depends on the include path / working directory.
require "vendor/autoload.php";

/**
 * Return $str as UTF-8, converting from GBK when it is not already valid UTF-8.
 *
 * The decision is made with a strict validity check instead of encoding
 * "detection": detection is a heuristic that can misclassify short GBK
 * strings, whereas validating the bytes as UTF-8 is deterministic.
 *
 * @param mixed $str Text to normalise. Non-string and empty values are
 *                   returned untouched.
 * @return mixed The input itself when it is already valid UTF-8 (or not a
 *               non-empty string); otherwise the GBK -> UTF-8 conversion.
 */
function convToUtf8($str) {
    // Nothing to convert for non-strings or empty strings.
    if (!is_string($str) || $str === "") {
        return $str;
    }

    // Already well-formed UTF-8 (this includes plain ASCII): leave as is.
    if (mb_check_encoding($str, "UTF-8")) {
        return $str;
    }

    // Not UTF-8, so treat the bytes as GBK. mb_convert_encoding substitutes
    // unmappable bytes instead of failing, so the caller always gets a
    // string back (iconv would return false and emit a notice here).
    return mb_convert_encoding($str, "UTF-8", "GBK");
}


// Import QueryList from its QL namespace. Without the namespace separator the
// statement is a no-op and the QueryList::Query() calls below cannot resolve.
use QL\QueryList;

// Level 1 rules: text of each <p> in the first cell of the page-body table,
// and the href of the link inside it.
$rules = array(
    'parent' => array('.PageBody>table>tr>td:eq(0)>p', 'text'),
    'href'   => array('.PageBody>table>tr>td:eq(0)>p>a', 'href'),
);

// Level 2 rules: text of each <p> inside #leftgundong, and the href of the
// link inside it.
$rulesxueke = array(
    'child'   => array('#leftgundong>table>tr>td>p', 'text'),
    'hrefurl' => array('#leftgundong>table>tr>td>p>a', 'href'),
);

// Level 3 rules: text of every row of the nested table in the third cell of
// the page-body table.
$rulesthree = array(
    'threename' => array(
        '.PageBody>table>tr>td:eq(2)>table>tr:eq(3)>td>div>table>tr',
        'text',
    ),
);

// Entry page of the ranking site, and the directory that the relative links
// found on each page are resolved against.
$html = "http://www.chinadegrees.cn/webrms/pages/Ranking/xkpmGXZJ.jsp";
$baseUrl = "http://www.chinadegrees.cn/webrms/pages/Ranking/";

// Level 1: scrape the entry page.
$data = QueryList::Query($html,$rules)->data;

// Walk level 1 -> level 2 -> level 3, nesting the results:
//   $data[i]["twoleveldata"][j]["chreelist"][k]["threename"]
foreach( $data as $k=>$v) {
	if( isset($v["parent"]) ) {
		$data[$k]["parent"] = convToUtf8($v["parent"]);
	}
	// Always present, so every level-1 row has the same shape.
	$data[$k]["twoleveldata"] = array();

	// A row without a link has nothing to follow.
	if( empty($v["href"]) ) {
		continue;
	}

	// Level 2: fetch the page the level-1 link points to.
	$htmlurl = file_get_contents($baseUrl.$v["href"]);
	if( $htmlurl === false ) {
		// Download failed; keep the empty list rather than parsing `false`.
		continue;
	}
	$twoleve = QueryList::Query($htmlurl,$rulesxueke)->data;

	foreach($twoleve as $kk=>$vv) {
		if( isset($vv["child"]) ) {
			$twoleve[$kk]["child"] = convToUtf8($vv["child"]);
		}
		// Always present, even when level 3 yields nothing.
		$twoleve[$kk]["chreelist"] = array();

		if( empty($vv["hrefurl"]) ) {
			continue;
		}

		// Level 3: fetch the page the level-2 link points to.
		$thtmlurl = file_get_contents($baseUrl.$vv["hrefurl"]);
		if( $thtmlurl === false ) {
			continue;
		}
		$threeleve = QueryList::Query($thtmlurl,$rulesthree)->data;

		foreach($threeleve as $tck=>$tcv) {
			$threeleve[$tck]["threename"] = convToUtf8($tcv["threename"]);
		}
		// Attach once, after every row has been converted.
		$twoleve[$kk]["chreelist"] = $threeleve;
	}

	$data[$k]["twoleveldata"] = $twoleve;
}

// Dump the collected tree and stop; nothing below this point is executed.
print_r($data);die(); 


//根据二级去抓取三级


/* $html = "http://www.chinadegrees.cn/webrms/pages/Ranking/xkpmGXZJ.jsp?yjxkdm=0201&xkdm=01,02,03,04,05,06";
$data = QueryList::Query($html,$rules)->data; //一级

foreach($data as $k=>$v) {
	$data[$k]["parent"] = convToUtf8($v["parent"]);
}
print_r($data);die(); */


一次抓取大学录取数据的尝试

声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。