程式人生 > java提取(獲取)部落格資訊(內容)

java提取(獲取)部落格資訊(內容)

package com.wbg.my.service;
import java.io.*;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Scrapes the post list of a cnblogs blog and exports it as one static HTML index page.
 *
 * <p>{@link #main(String[])} walks list pages 1..{@link #PAGE_COUNT}, collects every
 * post-detail URL found on them, downloads each post to read its title and publication
 * date, and finally writes an HTML file to {@link #file}. The generated page embeds the
 * collected posts as a JavaScript array and renders one table row per post.
 *
 * <p>All working state lives in mutable static fields and no synchronization is performed.
 *
 * @author Jack Chen
 */
public class BlogUtil {

    /** Prefix of the paginated post-list URL; the page number is appended to it. */
    public final static String URL_PAGE = "https://www.cnblogs.com/weibanggang/default.html?page=";

    /**
     * Regular expression for a post-detail URL. Group 1 captures the file name
     * (digits followed by {@code .html}). The dots are escaped so they only match a literal dot.
     */
    public final static String URL_PAGE_DETAIL = "https://www\\.cnblogs\\.com/weibanggang/p/([0-9]+\\.html)";

    /** Number of list pages to scan, starting at page 1. */
    public final static int PAGE_COUNT = 20;

    /** All post-detail URLs found so far; a sorted set, so duplicates are dropped. */
    public static Set<String> urlLists = new TreeSet<String>();

    /** Compiled form of {@link #URL_PAGE_DETAIL}. */
    public final static Pattern p = Pattern.compile(URL_PAGE_DETAIL);

    /** Output file path. Note there is no separator after the drive letter. */
    public static String file = "d:index.html";

    /** One JavaScript array literal per post, filled by {@link #createFile(String)}. */
    static String[] arr = null;

    /** Index of the next free slot in {@link #arr}. */
    static int sun = 0;

    /** Separator placed in front of the next entry: empty for the first one, "," afterwards. */
    static String fh = "";

    /**
     * Runs the whole export: collect URLs, download every post, assemble and write the page.
     *
     * @param args ignored
     * @throws Exception if a list page or a post cannot be downloaded or parsed
     */
    public static void main(String[] args) throws Exception {
        for (int page = 1; page <= PAGE_COUNT; page++) {
            getUrls(page);
        }

        System.out.println("開始獲取內容!");
        arr = new String[urlLists.size()];
        // Start from a clean slate so a repeated call does not write past the end of arr.
        sun = 0;
        fh = "";
        for (String url : urlLists) {
            createFile(url);
            sun++;
        }
        System.out.println("獲取內容完畢!");

        System.out.println("開始寫入檔案!");
        StringBuffer stringBuffer = new StringBuffer(kais());
        for (int i = 0; i < arr.length; i++) {
            stringBuffer.append(arr[i]);
        }
        stringBuffer.append(jiehun());
        System.out.println("寫入檔案完畢!");

        System.out.println("開始匯出檔案!");
        createFile(file, stringBuffer);
        System.out.println("匯出檔案完畢!");
        System.out.println("輸出檔案地址為:" + file);
    }

    /**
     * Writes {@code buffer} to {@code file} as UTF-8, replacing any existing content.
     *
     * <p>The page declares {@code charset="utf-8"}, so the bytes are encoded as UTF-8 rather
     * than with the platform default charset. Failures are reported on {@code System.err}
     * and not rethrown.
     *
     * @param file   path of the file to write
     * @param buffer content to write
     */
    private static void createFile(String file, StringBuffer buffer) {
        File target = new File(file);
        // Opening the stream truncates an existing file, so no delete/create step is needed.
        try (FileOutputStream stream = new FileOutputStream(target);
             Writer out = new OutputStreamWriter(stream, "utf-8")) {
            out.write(buffer.toString());
        } catch (IOException e) {
            System.err.println("建立檔案:" + target + "失敗");
            e.printStackTrace();
        }
    }

    /**
     * Returns the first part of the generated page: the HTML head, styles, the empty table
     * and the opening of the JavaScript array {@code sum}.
     *
     * @return the page text up to and including {@code var sum=[}
     */
    public static String kais() {
        return "<!DOCTYPE html>\n" +
                "<html>\n" +
                "<head>\n" +
                " <meta charset=\"utf-8\">\n" +
                " <title>weibanggang.github.io</title>\n" +
                " <meta name=\"renderer\" content=\"webkit\">\n" +
                " <meta http-equiv=\"X-UA-Compatible\" content=\"IE=edge,chrome=1\">\n" +
                " <meta name=\"viewport\" content=\"width=device-width, initial-scale=1, maximum-scale=1\">\n" +
                " <style>\n" +
                " html,body{width:100%;height: 100%}\n" +
                " table{width: 1150px;height:500px;margin: auto}\n" +
                " table,td,th{border: 1px solid #e6e6e6;border-collapse:collapse; }\n" +
                " body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat} body{-moz-background-size:100% 100%; background-size:100% 100%;background-image:url(\"link.jpg\");background-repeat: no-repeat}\n" +
                " * { margin: 0; padding: 0; }\n" +
                " table { border-collapse: collapse; text-align: center; }\n" +
                " /*關鍵設定 tbody出現滾動條*/\n" +
                " table tbody {\n" +
                " display: block;\n" +
                " height: 500px;\n" +
                " overflow-y: scroll;overflow-x:hidden;\n" +
                " }\n" +
                " table thead, tbody tr { display: table;width: 100%; table-layout: fixed; }\n" +
                " table thead th { height: 40px }\n" +
                " table tbody td {height: 30px }\n" +
                " </style>\n" +
                "</head>\n" +
                "\n" +
                "<body>\n" +
                "<marquee><h1 style=\"color:white;\">本網頁僅作為參考部落格、github等地址</h1></marquee>\n" +
                "<table width=\"80%\" border=\"1\">\n" +
                " <thead>\n" +
                " <tr>\n" +
                " <th style=\"width:230px\">序號</th>\n" +
                " <th style=\"width:231px\">標題連結</th>\n" +
                " <th style=\"width:231px\">時間</th>\n" +
                " <th style=\"width:231px\">來源</th>\n" +
                " <th style=\"width:249px\">備註</th>\n" +
                " </tr>\n" +
                " </thead>\n" +
                " <tbody>\n" +
                "\n" +
                " </tbody>\n" +
                "</table>\n" +
                "</body>\n" +
                "<script src=\"js/jquery.js\"></script>\n" +
                "<script>\n" +
                " var sum=[";
    }

    /**
     * Returns the last part of the generated page: the end of the JavaScript array and the
     * script that turns every array entry into a table row.
     *
     * @return the page text from the closing {@code ];} to {@code </html>}
     */
    public static String jiehun() {
        return " ];\n" +
                " \n" +
                " for(var i=0;i<sum.length;i++){\n" +
                " var tr=$(\"<tr/>\");\n" +
                " //序號\n" +
                " $(\"<td/>\").html(i+1).appendTo(tr);\n" +
                " //標題連結\n" +
                " var a=\"<a href='\"+sum[i][0]+\"' target='_blank'>\"+sum[i][1]+\"</a>\"\n" +
                " $(\"<td/>\").html(a).appendTo(tr);\n" +
                " //時間\n" +
                " $(\"<td/>\").html(sum[i][2]).appendTo(tr);\n" +
                " //來源\n" +
                " $(\"<td/>\").html(sum[i][3]).appendTo(tr);\n" +
                " //備註\n" +
                " $(\"<td/>\").html(sum[i][4]).appendTo(tr);\n" +
                " $(\"table tbody\").append(tr);\n" +
                " }\n" +
                "</script>\n" +
                "</html>";
    }

    /**
     * Downloads one post and stores its JavaScript array entry in {@code arr[sun]}.
     *
     * <p>The entry has the form {@code ["href","title","date","部落格","正常"]} and is prefixed
     * with {@link #fh} so that consecutive entries are comma-separated.
     *
     * @param url post-detail URL; must match {@link #URL_PAGE_DETAIL}
     * @throws IllegalArgumentException if {@code url} is not a post-detail URL
     * @throws Exception if the post cannot be downloaded
     */
    private static void createFile(String url) throws Exception {
        Matcher m = p.matcher(url);
        if (!m.find()) {
            throw new IllegalArgumentException("Not a post-detail URL: " + url);
        }
        String fileName = m.group(1);

        // Lines are joined without separators, exactly as the title/date lookups expect.
        StringBuffer page = new StringBuffer();
        for (String line : readLines(url)) {
            page.append(line);
        }

        String href = "https://www.cnblogs.com/weibanggang/p/" + fileName;
        String title = getTitle(page);
        String date = getDate(page);
        arr[sun] = fh + "[\"" + escapeJs(href) + "\",\"" + escapeJs(title) + "\",\""
                + escapeJs(date) + "\",\"部落格\",\"正常\"]";
        fh = ",";
    }

    /**
     * Extracts the publication date: the text between {@code <span id="post-date">} and the
     * next {@code </span>}.
     *
     * @param sb page content
     * @return the date text, or an empty string when the span or its closing tag is missing
     */
    public static String getDate(StringBuffer sb) {
        final String marker = "<span id=\"post-date\">";
        int start = sb.indexOf(marker);
        if (start < 0) {
            return "";
        }
        start += marker.length();
        int end = sb.indexOf("</span>", start);
        if (end < 0) {
            return "";
        }
        return sb.substring(start, end);
    }

    /**
     * Extracts the post title: the content of the {@code <title>} element with the trailing
     * blog-name suffix removed.
     *
     * @param sb page content
     * @return the title without the suffix; the full title when the suffix is absent; an
     *         empty string when there is no complete {@code <title>} element
     */
    public static String getTitle(StringBuffer sb) {
        final String open = "<title>";
        int start = sb.indexOf(open);
        if (start < 0) {
            return "";
        }
        start += open.length();
        int end = sb.indexOf("</title>", start);
        if (end < 0) {
            return "";
        }
        String full = sb.substring(start, end);
        // NOTE(review): confirm this suffix matches what the site actually emits; when it
        // does not, the full title is returned instead of failing.
        int suffix = full.lastIndexOf("- 韋邦槓 - 部落格園");
        return suffix < 0 ? full : full.substring(0, suffix);
    }

    /**
     * Scans one list page and adds every post-detail URL on it to {@link #urlLists}.
     *
     * @param idx page number appended to {@link #URL_PAGE}
     * @throws Exception if the page cannot be downloaded
     */
    private static void getUrls(int idx) throws Exception {
        for (String line : readLines(URL_PAGE + "" + idx)) {
            Matcher m = p.matcher(line);
            // A single line may hold several links; collect all of them.
            while (m.find()) {
                urlLists.add(m.group());
            }
        }
    }

    /**
     * Downloads {@code address} and returns its lines decoded as UTF-8. The reader and the
     * connection are released even when reading fails.
     */
    private static List<String> readLines(String address) throws IOException {
        List<String> lines = new ArrayList<String>();
        HttpURLConnection conn = (HttpURLConnection) new URL(address).openConnection();
        try {
            conn.connect();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(conn.getInputStream(), "utf-8"))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    lines.add(line);
                }
            }
        } finally {
            conn.disconnect();
        }
        return lines;
    }

    /**
     * Escapes text for use inside a double-quoted JavaScript string literal that is embedded
     * in a {@code <script>} element.
     */
    private static String escapeJs(String text) {
        return text.replace("\\", "\\\\")
                .replace("\"", "\\\"")
                .replace("</", "<\\/"); // keeps "</script>" in a title from ending the script
    }
}