Java 根据正则表达式解析 HTML 网页内容
仅供参考:
import java.io.DataInputStream; import java.io.File; import java.io.FileOutputStream; import java.io.IOException; import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.regex.Matcher; import java.util.regex.Pattern; import com.cms.common.entity.HttpRespons; public class test { public static void main(String[] args) { getLyric("逐浪飞花"); } /** * 获取可下载的歌词信息 * @param songName 歌曲名称 * */ public static List<Map<String, String>> getLyric(String songName) { List<Map<String, String>> reqMap = new ArrayList<Map<String,String>>(); try { HttpRequester request = new HttpRequester(); String urlNameString = "http://www.lrcgc.com/so/?q="+ songName; HttpRespons hr = request.sendGet(urlNameString); String content = hr.getContent(); //返回内容 // System.out.println(content); //获取歌曲信息的结果 如: <a href="/lyric-26228-242158.html" target="_blank"><em>逐浪</em><em>飞花</em></a>, String regex = "<a[^>]*href="/lyric-[^>]*>.*?</a>"; List<String> link = getContentByRegex(content,regex); // System.out.println("路径:"+link); //歌曲名称列表 List<String> songUrlList = match(link.toString(), "a", "href"); // System.out.println("值:"+songUrlList); //歌手列表 List<String> songNameList = getLabelValues(link.toString(),regex); // System.out.println(songNameList); regex = "<a[^>]*href="/songlist-[^>]*>.*?</a>"; link = getContentByRegex(content,regex); // System.out.println("歌手:"+link); //歌手列表 List<String> singerList = getLabelValues(link.toString(),regex); // System.out.println(singerList); for (int i = 0; i < singerList.size(); i++) { Map<String, String> map = new HashMap<String, String>(); map.put("singerName", singerList.get(i).replace("&", "&")); map.put("songName", songNameList.get(i)); //下载链接 TODO map.put("songUrl", geciDownlrc(songUrlList.get(i))); reqMap.add(map); } } catch (Exception e) { e.printStackTrace(); } return reqMap; } /** * * 下载 歌词信息 */ private static String geciDownlrc(String songUrl) { 
try { HttpRequester request = new HttpRequester(); //歌曲名称 // http://www.lrcgc.com/lyric-26228-242158.html String urlNameString = "http://www.lrcgc.com/"+songUrl; HttpRespons hr = request.sendGet(urlNameString); //请求链接 String content = hr.getContent(); //返回内容 // System.out.println(content); //获取歌曲信息的结果 如: <a href="/lyric-26228-242158.html" target="_blank"><em>逐浪</em><em>飞花</em></a>, String regex = "<a[^>]*id="J_downlrc"[^>]*>.*?</a>"; List<String> link = getContentByRegex(content,regex); // System.out.println(link); List<String> list = match(link.toString(), "a", "href"); // System.out.println("值:"+list); String fileName = ""; String fileUrl = ""; //获取文件名称 if (list != null && list.size() > 0) { fileUrl = list.get(0).replace("&", "&"); fileName = fileUrl.substring(fileUrl.indexOf("/")+1,fileUrl.length()); fileUrl = "http://www.lrcgc.com//"+fileUrl; } // System.out.println("fileUrl:"+fileUrl); // System.out.println("fileName:"+fileName); return fileUrl; } catch (Exception e) { e.printStackTrace(); } return ""; } /** * 传入要下载的文件的url,将url所对应的文件下载到本地 * @param urlString 下载的文件的url * @param fileName 文件名称 */ public static void downloadFile(String urlString,String fileName) { String localFilePath = "C:\Users\Administrator\Desktop\"+fileName; try { URL url = new URL(urlString); DataInputStream dataInputStream = new DataInputStream(url.openStream()); FileOutputStream fileOutputStream = new FileOutputStream(new File(localFilePath)); byte[] buffer = new byte[1024]; int length; while ((length = dataInputStream.read(buffer)) > 0) { fileOutputStream.write(buffer, 0, length); } dataInputStream.close(); fileOutputStream.close(); } catch (MalformedURLException e) { e.printStackTrace(); } catch (IOException e) { e.printStackTrace(); } } /** * * @param html * @return 获得网页标题 */ public static String getTitle( String html) { String regex; String title = ""; final List<String> list = new ArrayList<String>(); regex = "<title>.*?</title>"; final Pattern pa = Pattern.compile(regex, 
Pattern.CANON_EQ); final Matcher ma = pa.matcher(html); while (ma.find()) { list.add(ma.group()); } for (int i = 0; i < list.size(); i++) { title = title + list.get(i); } return title.replaceAll("<.*?>", ""); } /** * 获取标签中的值 * @param html 内容 * @param regex 正则表达式 * @return */ public static List<String> getLabelValues(String html,String regex) { // String regex; final List<String> list = new ArrayList<String>(); // regex = "<a[^>]*href="/lyric-[^>]*>(.*?)</a>"; final Pattern pa = Pattern.compile(regex, Pattern.CANON_EQ); final Matcher ma = pa.matcher(html); while (ma.find()) { list.add(ma.group().replaceAll("<.*?>", "")); } return list; } /** * 获取匹配的正则表达式 * @param s 内容 * @param regex 正则表达式 * @return */ public static List<String> getContentByRegex(String s,String regex) { final List<String> list = new ArrayList<String>(); //获得页面所有的链接 final Pattern pa = Pattern.compile(regex, Pattern.DOTALL); final Matcher ma = pa.matcher(s); while (ma.find()) { list.add(ma.group()); } return list; } /** * * @param s * @return 获得所有的超链接 */ public List<String> getLink(final String s) { String regex; final List<String> list = new ArrayList<String>(); regex = "<a[^>]*href=("([^"]*)"|"([^"]*)"|([^\s>]*))[^>]*>(.*?)</a>"; final Pattern pa = Pattern.compile(regex, Pattern.DOTALL); final Matcher ma = pa.matcher(s); while (ma.find()) { list.add(ma.group()); } return list; } /** * 获取指定HTML标签的指定属性的值 * @param source 要匹配的源文本 * @param element 标签名称 * @param attr 标签的属性名称 * @return 属性值列表 */ public static List<String> match(String source, String element, String attr) { List<String> result = new ArrayList<String>(); String reg = "<" + element + "[^<>]*?\s" + attr + "=[""]?(.*?)[""]?(\s.*?)?>"; Matcher m = Pattern.compile(reg).matcher(source); while (m.find()) { String r = m.group(1); result.add(r); } return result; } }
HttpRespons hr = request.sendGet(urlNameString);
上面这行用到的 HttpRequester.sendGet 方法以及 HttpRespons 类,请参考:http://blog.csdn.net/qq_27292113/article/details/71534346 ,其中有详细的代码。
声明:该文观点仅代表作者本人,牛骨文系教育信息发布平台,牛骨文仅提供信息存储空间服务。
- 上一篇: 正则表达式分析网页数据
- 下一篇: 正则表达式 提取 html 标签的内容