纵横小说分布式采集-Lucene案例开发

转载请注明出处：http://blog.csdn.net/xiaojimanman/article/details/46812645

http://www.llwjy.com/blogdetail/9df464b20cca5405c7ce07e2fb2d768f.html

个人博客站已经上线了，网址 www.llwjy.com ~欢迎各位吐槽~

在前面的几篇博客中，我们已经介绍了如何采集纵横小说网站上的信息以及如何把这些信息持久化到数据库中，现在我们就开始介绍如何做分布式采集，让各个模块之间可以完美地配合。

采集类修改

在开始介绍分布式采集之前，我们需要对之前介绍的采集类添加一些方法，也就是返回上一篇博客中介绍的小说javabean，具体源码还请参照个人网站上的博客源码。

1.简介页

简介页需要添加一个方法，让它返回简介页的数据信息，具体如下：

	/**
	 * Analyses the intro page and bundles the extracted data into one bean.
	 *
	 * @return a newly created {@link NovelIntroModel} filled from this page's getters
	 * @Author:lulei
	 */
	public NovelIntroModel getNovelIntro() {
		NovelIntroModel intro = new NovelIntroModel();

		// Identifier derived from the page URL via ParseMD5.parseStrToMd5L32
		intro.setMd5Id(ParseMD5.parseStrToMd5L32(this.pageUrl));

		// Basic descriptive fields
		intro.setName(getName());
		intro.setAuthor(getAuthor());
		intro.setDescription(getDesc());
		intro.setType(getType());

		// Chapter-related fields
		intro.setLastChapter(getLatestChapter());
		intro.setChapterlisturl(getChapterListUrl());

		// Remaining statistics and keywords
		intro.setWordCount(getWordCount());
		intro.setKeyWords(keyWords());

		return intro;
	}

2.阅读页

阅读页内同样需要添加一个方法，让它返回阅读页内的数据信息，具体如下：

	/**
	 * Analyses the read page and bundles the extracted data into one bean.
	 *
	 * @return a newly created {@link NovelReadModel} carrying the title,
	 *         word count and content reported by this page's getters
	 * @Author:lulei
	 */
	public NovelReadModel getNovelRead() {
		NovelReadModel readModel = new NovelReadModel();
		// Copy the three parsed values in the same order the page exposes them
		readModel.setTitle(getTitle());
		readModel.setWordCount(getWordCount());
		readModel.setContent(getContent());
		return readModel;
	}

这些方法都是对之前类中的方法做一个整合，将之前分析到的数据组装成一个javabean返回，方便后面的操作。

各页采集线程类

在实现分布式采集的时候，就需要编写各个页面的采集线程类，让它来控制各页面的采集业务，下面我们就一一介绍：

1.更新列表页线程

这个线程的主要功能就是监控更新列表页的数据，提取页面上的简介页URL，认为它们是有更新的页面，将对应的信息持久化到数据库中，具体实现如下：

 /**  
 *@Description:    更新列表页线程
 */ 
package com.lulei.crawl.novel.zongheng;  

import java.util.List;
import java.util.concurrent.TimeUnit;

import com.lulei.db.novel.zongheng.ZonghengDb;
  
/**
 * Worker thread that repeatedly crawls one "update list" page and stores the
 * intro-page URLs found on it through {@link ZonghengDb#saveInfoUrls}.
 *
 * <p>The thread pauses {@code frequency} seconds after every pass, including
 * passes that failed, and terminates when it is interrupted.
 */
public class UpdateListThread extends Thread{
	// Loop switch: set to true when run() starts, cleared when the thread is interrupted
	private boolean flag = false;
	// URL of the update-list page to crawl
	private final String url;
	// Pause between two crawl passes, in seconds
	private final int frequency;

	/**
	 * @param name      thread name
	 * @param url       URL of the update-list page to crawl
	 * @param frequency pause between two crawl passes, in seconds
	 */
	public UpdateListThread(String name, String url, int frequency){
		super(name);
		this.url = url;
		this.frequency = frequency;
	}

	/**
	 * Crawls the list page in a loop until the thread is interrupted.
	 */
	@Override
	public void run() {
		flag = true;
		ZonghengDb db = new ZonghengDb();
		while (flag){
			try {
				UpdateList updateList = new UpdateList(url);
				List<String> urls = updateList.getPageUrls(true);
				db.saveInfoUrls(urls);
			} catch (Exception e) {
				// A failed pass must not end the worker; report it and retry after the pause
				e.printStackTrace();
			}
			// Sleep outside the crawl try-block so that a failing pass is throttled too
			// instead of being retried immediately in a tight loop.
			try {
				TimeUnit.SECONDS.sleep(frequency);
			} catch (InterruptedException e) {
				// Interruption is the stop request: restore the status and leave the loop
				Thread.currentThread().interrupt();
				flag = false;
			}
		}
	}

	/**
	 * Manual entry point: crawls a single hard-coded list page every 60 seconds.
	 */
	public static void main(String[] args) {
		UpdateListThread thread = new UpdateListThread("llist", "http://book.zongheng.com/store/c0/c0/b9/u0/p1/v0/s9/t0/ALL.html", 60);
		thread.start();
	}

}

2.简介页&章节列表页线程类

由于一个简介页就对应一个章节列表页，所以我们就把这两个线程合为一个线程，让其实现小说简介信息的采集以及小说章节列表信息的采集，具体实现如下：

 /**  
 *@Description:  小说简介信息线程
 */ 
package com.lulei.crawl.novel.zongheng;  

import java.util.List;
import java.util.concurrent.TimeUnit;

import com.lulei.crawl.novel.zongheng.model.NovelIntroModel;
import com.lulei.db.novel.zongheng.ZonghengDb;
  
/**
 * Worker thread that crawls novel intro pages together with their chapter
 * list pages and persists the results through {@link ZonghengDb}.
 *
 * <p>A failure while processing one page is reported and the loop continues;
 * the thread terminates when it is interrupted.
 */
public class IntroPageThread extends Thread {
	// Pause after a page was processed, in milliseconds
	private static final long BUSY_PAUSE_MILLIS = 500;
	// Pause when nothing was waiting or the pass failed, in milliseconds
	private static final long IDLE_PAUSE_MILLIS = 1000;

	// Loop switch: set to true when run() starts, cleared when the thread is interrupted
	private boolean flag = false;

	/**
	 * @param name thread name
	 */
	public IntroPageThread(String name) {
		super(name);
	}

	/**
	 * Processes pending intro pages in a loop until the thread is interrupted.
	 */
	@Override
	public void run() {
		flag = true;
		ZonghengDb db = new ZonghengDb();
		while (flag) {
			long pauseMillis = IDLE_PAUSE_MILLIS;
			// The try-block lives inside the loop so that one broken page
			// does not permanently terminate this worker.
			try {
				// Randomly pick one intro page URL that is waiting to be crawled
				String url = db.getRandIntroPageUrl(1);
				if (url != null) {
					IntroPage intro = new IntroPage(url);
					NovelIntroModel bean = intro.getNovelIntro();
					// Crawl the chapter list page that belongs to this novel
					ChapterPage chapterPage = new ChapterPage(bean.getChapterlisturl());
					List<String[]> chapters = chapterPage.getChaptersInfo();
					bean.setChapterCount(chapters == null ? 0 : chapters.size());
					// Update the novel's intro information
					db.updateInfo(bean);
					// Insert the chapters that are waiting to be crawled;
					// a null list means there is nothing to insert
					if (chapters != null) {
						db.saveChapters(chapters);
					}
					pauseMillis = BUSY_PAUSE_MILLIS;
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
			// Sleep one interval when a page was processed and another one otherwise
			try {
				TimeUnit.MILLISECONDS.sleep(pauseMillis);
			} catch (InterruptedException e) {
				// Interruption is the stop request: restore the status and leave the loop
				Thread.currentThread().interrupt();
				flag = false;
			}
		}
	}

	/**
	 * Manual entry point: starts a single intro-page worker.
	 */
	public static void main(String[] args) {
		IntroPageThread thread = new IntroPageThread("novelinfo");
		thread.start();
	}

}

3.阅读页线程

这个线程的主要功能就是将小说阅读页的信息采集并持久化到数据库中，具体如下：

 /**  
 *@Description: 小说阅读页线程
 */ 
package com.lulei.crawl.novel.zongheng;  

import java.util.concurrent.TimeUnit;

import com.lulei.crawl.novel.zongheng.model.NovelChapterModel;
import com.lulei.crawl.novel.zongheng.model.NovelReadModel;
import com.lulei.db.novel.zongheng.ZonghengDb;
import com.lulei.util.ParseMD5;
  
  
/**
 * Worker thread that crawls novel read pages (chapter content) and persists
 * them through {@link ZonghengDb}.
 *
 * <p>Every pass is followed by a pause, including passes that failed or
 * produced no data; the thread terminates when it is interrupted.
 */
public class ReadPageThread extends Thread {
	// Pause after a chapter was stored, in milliseconds
	private static final long BUSY_PAUSE_MILLIS = 500;
	// Pause when nothing was stored (no pending chapter, no data, or a failure), in milliseconds
	private static final long IDLE_PAUSE_MILLIS = 1000;

	// Loop switch: set to true when run() starts, cleared when the thread is interrupted
	private boolean flag = false;

	/**
	 * @param name thread name
	 */
	public ReadPageThread(String name) {
		super(name);
	}

	/**
	 * Processes pending read pages in a loop until the thread is interrupted.
	 */
	@Override
	public void run() {
		flag = true;
		ZonghengDb db = new ZonghengDb();
		while (flag) {
			long pauseMillis = IDLE_PAUSE_MILLIS;
			try {
				// Randomly pick one read page that is waiting to be crawled
				NovelChapterModel chapter = db.getRandReadPageUrl(1);
				if (chapter != null) {
					ReadPage read = new ReadPage(chapter.getUrl());
					NovelReadModel novel = read.getNovelRead();
					// Without parsed data there is nothing to store; fall through to the
					// pause instead of immediately retrying in a tight loop
					if (novel != null) {
						novel.setChapterId(chapter.getChapterId());
						novel.setTime(chapter.getTime());
						novel.setUrl(chapter.getUrl());
						// Save the read page information
						db.saveNovelRead(novel);
						// Switch the chapter state to "no crawl needed"
						db.updateChapterState(ParseMD5.parseStrToMd5L32(novel.getUrl()), 0);
						pauseMillis = BUSY_PAUSE_MILLIS;
					}
				}
			} catch (Exception e) {
				e.printStackTrace();
			}
			// Sleep one interval when a chapter was stored and another one otherwise
			try {
				TimeUnit.MILLISECONDS.sleep(pauseMillis);
			} catch (InterruptedException e) {
				// Interruption is the stop request: restore the status and leave the loop
				Thread.currentThread().interrupt();
				flag = false;
			}
		}
	}

	/**
	 * Manual entry point: starts a single read-page worker.
	 */
	public static void main(String[] args) {
		ReadPageThread thread = new ReadPageThread("novel read page");
		thread.start();
	}

}

分布式采集

上面已经介绍完了各个线程完成的工作，下面就需要一个类来控制管理这些线程，让其运行起来，具体代码如下：

 /**  
 *@Description:     
 */ 
package com.lulei.crawl.novel.zongheng;  

import java.util.List;

import com.lulei.crawl.novel.zongheng.model.CrawlListInfo;
import com.lulei.db.novel.zongheng.ZonghengDb;
  
/**
 * Starts the crawler worker threads: one {@link UpdateListThread} per
 * configured list page, plus fixed numbers of {@link IntroPageThread} and
 * {@link ReadPageThread} workers. Each group is started at most once per
 * process, guarded by a static flag.
 */
public class CrawStart {
	// Set once the update-list workers have been launched
	private static boolean booleanCrawlList = false;
	// Set once the intro-page workers have been launched
	private static boolean booleanCrawlIntro = false;
	// Number of intro-page crawl threads
	private static int crawlIntroThreadNum = 2;
	// Set once the read-page workers have been launched
	private static boolean booleanCrawlRead = false;
	// Number of read-page crawl threads
	private static int crawlReadThreadNum = 10;

	/**
	 * Starts one update-list worker for every list configuration returned by
	 * {@link ZonghengDb#getCrawlListInfos()}; entries without a URL are skipped.
	 *
	 * <p>The "already started" flag is only set after the configuration has been
	 * loaded, so a call that gets no configuration can simply be repeated later.
	 *
	 * @Author:lulei
	 */
	public void startCrawlList(){
		if (booleanCrawlList) {
			return;
		}
		ZonghengDb db = new ZonghengDb();
		List<CrawlListInfo> infos = db.getCrawlListInfos();
		if (infos == null) {
			// Nothing was started, so leave the flag untouched to allow a retry
			return;
		}
		booleanCrawlList = true;
		for (CrawlListInfo info : infos) {
			String url = info.getUrl();
			if (url == null || "".equals(url)) {
				continue;
			}
			UpdateListThread thread = new UpdateListThread(info.getInfo(), url, info.getFrequency());
			thread.start();
		}
	}

	/**
	 * Starts the workers for novel intro pages and chapter list pages.
	 *
	 * @Author:lulei
	 */
	public void startCrawlIntro() {
		if (booleanCrawlIntro) {
			return;
		}
		booleanCrawlIntro = true;
		for (int i = 0; i < crawlIntroThreadNum; i++) {
			IntroPageThread thread = new IntroPageThread("novel info thread" + i);
			thread.start();
		}
	}

	/**
	 * Starts the workers for novel read pages.
	 *
	 * @Author:lulei
	 */
	public void startCrawlRead() {
		if (booleanCrawlRead) {
			return;
		}
		booleanCrawlRead = true;
		for (int i = 0; i < crawlReadThreadNum; i++) {
			ReadPageThread thread = new ReadPageThread("novel read page" + i);
			thread.start();
		}
	}

	/**
	 * Entry point: launches all three worker groups.
	 */
	public static void main(String[] args) {
		CrawStart start = new CrawStart();
		start.startCrawlList();
		start.startCrawlIntro();
		start.startCrawlRead();
	}

}

运行结果

通过上面的这几个步骤，纵横小说的分布式采集程序已经完成，下面就为大家展示一下采集后的数据库截图

写在最后

在上面的线程实现中，有很多的配置信息，比如说线程中的两个请求之间的间隔时间以及各类线程的数量，像这些信息我们都可以将其写到配置文件中，方便之后的修改（这里写到程序中是方便大家的理解，还请见谅）。

ps：最近发现其他网站可能会转载本博客，但转载的内容中并没有附上源链接；如想查看更多基于Lucene的案例开发文章，请点击这里，或访问网址 http://blog.csdn.net/xiaojimanman/article/category/2841877 或 http://www.llwjy.com/blogtype/lucene.html

小福利

个人在极客学院上《Lucene案例开发》课程已经上线了（目前上线到第二课），欢迎大家吐槽~
第一课：Lucene概述
 第二课：Lucene 常用功能介绍