phpspider 简单使用
phpspider 是一款优秀的、用 PHP 开发的蜘蛛爬虫框架
官方下载地址:https://github.com/owner888/phpspider
官方开发手册:https://doc.phpspider.org/
关于这个爬虫的使用:下载下来之后,里面有写好的实例。我在这里只是说一下我遇到的一个坑。
// Example crawler script for the GitHub-download layout of phpspider:
// it loads the bundled autoloader, builds a configuration array for
// qiushibaike.com and starts the spider.
//
// NOTE: the URLs below use http://, as in the upstream demo. The article text
// following this example reports that the site has moved to https:// and that
// the demo only worked after these URLs were changed to https.
require_once __DIR__ . "/../autoloader.php";

use phpspider\core\phpspider;

// NOTE(review): the two comments below are kept verbatim because the upstream
// demo explicitly asks that they not be removed; the reason is not visible here.
/* Do NOT delete this comment */
/* 不要删除这段注释 */

$configs = array(
    "name" => "糗事百科",
    "log_show" => true,
    "tasknum" => 1,
    //"save_running_state" => true,
    "domains" => array(
        "qiushibaike.com",
        "www.qiushibaike.com"
    ),
    "scan_urls" => array(
        "http://www.qiushibaike.com/"
    ),
    // Single-quoted so the regex escapes (\d, \?) reach the matcher unchanged.
    "list_url_regexes" => array(
        'http://www.qiushibaike.com/8hr/page/\d+\?s=\d+'
    ),
    "content_url_regexes" => array(
        'http://www.qiushibaike.com/article/\d+',
    ),
    "max_try" => 5,
    //"proxies" => array(
    //    "http://H784U84R444YABQD:57A8B0B743F9B4D2@proxy.abuyun.com:9010"
    //),
    "export" => array(
        "type" => "csv",
        "file" => "../data/qiushibaike.csv",
    ),
    // Alternative export targets, left disabled:
    //"export" => array(
    //    "type" => "sql",
    //    "file" => "../data/qiushibaike.sql",
    //    "table" => "content",
    //),
    //"export" => array(
    //    "type" => "db",
    //    "table" => "content",
    //),
    "db_config" => array(
        "host" => "127.0.0.1",
        "port" => 3306,
        "user" => "root",
        "pass" => "123456",
        "name" => "spider",
    ),
    //"queue_config" => array(
    //    "host" => "127.0.0.1",
    //    "port" => 6379,
    //    "pass" => "",
    //    "db" => 5,
    //    "prefix" => "phpspider",
    //    "timeout" => 30,
    //),
    // XPath selectors are single-quoted PHP strings so that the double quotes
    // inside the XPath expressions do not terminate the PHP string.
    "fields" => array(
        array(
            "name" => "article_title",
            "selector" => '//*[@id="single-next-link"]//div[contains(@class,"content")]/text()[1]',
            "required" => true,
        ),
        array(
            "name" => "article_author",
            "selector" => '//div[contains(@class,"author")]//h2',
            "required" => true,
        ),
        array(
            "name" => "article_headimg",
            "selector" => '//div[contains(@class,"author")]//a[1]',
            "required" => true,
        ),
        array(
            "name" => "article_content",
            "selector" => '//*[@id="single-next-link"]//div[contains(@class,"content")]',
            "required" => true,
        ),
        array(
            "name" => "article_publish_time",
            "selector" => '//div[contains(@class,"author")]//h2',
            "required" => true,
        ),
        array(
            "name" => "url",
            // Placeholder selector; per the original note it is replaced in
            // the on_extract_field callback.
            "selector" => '//div[contains(@class,"author")]//h2',
            "required" => true,
        ),
    ),
);

$spider = new phpspider($configs);
$spider->start();
这是官网文档中的一个实例。使用说明中提到,爬虫文件只能在命令行模式下运行。
结果我运行了一下demo并没有成功
之后发现糗事百科已经换成 https 协议,但是代码中的 URL 还是 http 的
我抱着试试的想法改成 https
果然成功了 爬虫已经可以正常运行了
具体的参数详情还是得看官网的文档
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。