Java爬取某姐的小視訊
阿新 • • 發佈:2018-12-28
最近認真復(學)習了Java的IO、網路、正則表示式等,感覺如果不找點東西練手,過段時間就會忘掉,於是就想到了爬蟲。剛好以前用Python爬過百思不得姐的小視訊,於是打算用Java把它重新實現一遍。如果想看Python版的,可以參照本人的博文《Python爬取百思不得姐的視訊》。話不多說,直接放碼(Talk is cheap. Show me the code.)。
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls video listing pages and downloads the MP4 files they reference.
 *
 * <p>A listing page is scanned line by line for {@code data-title="..."} and
 * {@code data-mp4="..."} attributes; every MP4 URL is stored under the most recently seen
 * title and then saved as {@code <title>.mp4} in the target directory.
 *
 * <p>If the site rejects the default Java user agent, set a browser-like {@code User-Agent}
 * request property on the connection created in {@link #open(String)}.
 *
 * @author cxd
 */
public class WebSpiderDemo1 {

    /** Listing URL used when no command-line argument is given; the page number is appended. */
    private static final String DEFAULT_SOURCE = "http://www.budejie.com/video/";

    /** Target directory used when no second command-line argument is given. */
    private static final String DEFAULT_DEST_DIR = "D:/rob/";

    /** Upper bounds so a stalled server cannot hang the crawler forever. */
    private static final int CONNECT_TIMEOUT_MILLIS = 10_000;
    private static final int READ_TIMEOUT_MILLIS = 30_000;

    /** Captures the video URL (up to and including the first ".mp4") of a data-mp4 attribute. */
    private static final Pattern MP4_URL = Pattern.compile("data-mp4=\"(.*?\\.mp4)");

    /** Captures the value of a data-title attribute, wherever it occurs in the line. */
    private static final Pattern TITLE = Pattern.compile("data-title=\"([^\"]*)");

    /** Characters that are not allowed in file names on common file systems. */
    private static final Pattern ILLEGAL_FILE_NAME_CHARS =
            Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /**
     * Downloads every video found on the first listing page.
     *
     * @param args optional: {@code args[0]} listing URL prefix, {@code args[1]} target directory
     * @throws Exception if the listing page itself cannot be read
     */
    public static void main(String[] args) throws Exception {
        String source = args.length > 0 ? args[0] : DEFAULT_SOURCE;
        File destDir = new File(args.length > 1 ? args[1] : DEFAULT_DEST_DIR);

        for (Map.Entry<String, String> entry : getUrlInSource(source).entrySet()) {
            String url = entry.getValue();
            File destFile = new File(destDir, sanitizeFileName(entry.getKey()) + ".mp4");
            try {
                download(url, destFile);
            } catch (IOException e) {
                // One broken video must not abort the remaining downloads.
                System.err.println("Failed to download " + url + " to " + destFile + ": " + e);
            }
        }
    }

    /**
     * Downloads the resource at {@code url} and stores it in {@code destFile}.
     *
     * <p>Missing parent directories of {@code destFile} are created. Both streams are closed
     * even when the transfer fails.
     *
     * @param url      URL of the video
     * @param destFile file the video is written to (overwritten if it exists)
     * @throws IOException if the directory cannot be created or the transfer fails
     */
    public static void download(String url, File destFile) throws IOException {
        File parent = destFile.getAbsoluteFile().getParentFile();
        if (parent != null && !parent.mkdirs() && !parent.isDirectory()) {
            throw new IOException("Cannot create directory " + parent);
        }

        try (InputStream in = open(url);
                FileOutputStream out = new FileOutputStream(destFile)) {
            byte[] buffer = new byte[8192];
            int read;
            while ((read = in.read(buffer)) != -1) {
                out.write(buffer, 0, read);
            }
        }
    }

    /**
     * Collects the videos of the first listing page.
     *
     * @param source listing URL prefix; the page number is appended to it
     * @return map from video title to video URL
     * @throws IOException if the page cannot be read
     */
    public static Map<String, String> getUrlInSource(String source) throws IOException {
        return getUrlInSource(source, 1);
    }

    /**
     * Collects the videos of listing pages {@code 1..pageCount}.
     *
     * <p>Each MP4 URL is stored under the most recent title seen on the same page. When no
     * (non-empty) title has been seen yet, the file name inside the URL is used instead.
     *
     * @param source    listing URL prefix; the page number is appended to it
     * @param pageCount number of pages to crawl; values below 1 yield an empty map
     * @return map from video title to video URL
     * @throws IOException if a page cannot be read
     */
    public static Map<String, String> getUrlInSource(String source, int pageCount)
            throws IOException {
        Map<String, String> videos = new HashMap<>();
        for (int page = 1; page <= pageCount; page++) {
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(open(source + page), StandardCharsets.UTF_8))) {
                String title = null;
                String line;
                // readLine() returns null only at end of stream; blank lines come back as "".
                while ((line = reader.readLine()) != null) {
                    Matcher titleMatcher = TITLE.matcher(line);
                    if (titleMatcher.find()) {
                        title = titleMatcher.group(1).trim();
                    }
                    Matcher urlMatcher = MP4_URL.matcher(line);
                    if (urlMatcher.find()) {
                        String videoUrl = urlMatcher.group(1);
                        boolean untitled = title == null || title.isEmpty();
                        videos.put(untitled ? fallbackTitle(videoUrl) : title, videoUrl);
                    }
                }
            }
        }
        return videos;
    }

    /**
     * Turns a video title into a usable file name by replacing characters that file systems
     * reject with {@code '_'}.
     *
     * @param title raw title taken from the page
     * @return the cleaned title, or {@code "video"} if nothing but whitespace is left
     */
    static String sanitizeFileName(String title) {
        String cleaned = ILLEGAL_FILE_NAME_CHARS.matcher(title).replaceAll("_").trim();
        return cleaned.isEmpty() ? "video" : cleaned;
    }

    /** Derives a title from the last path segment of the video URL, without ".mp4". */
    private static String fallbackTitle(String videoUrl) {
        String name = videoUrl.substring(videoUrl.lastIndexOf('/') + 1);
        return name.endsWith(".mp4") ? name.substring(0, name.length() - ".mp4".length()) : name;
    }

    /** Opens {@code address} for reading with connect and read timeouts applied. */
    private static InputStream open(String address) throws IOException {
        URLConnection connection = new URL(address).openConnection();
        connection.setConnectTimeout(CONNECT_TIMEOUT_MILLIS);
        connection.setReadTimeout(READ_TIMEOUT_MILLIS);
        return connection.getInputStream();
    }
}
爬取結果如下:
因為只是自己學習練手,所以只爬取了首頁。若想爬取全部頁面,請參考程式碼中的相關註釋。
注:此程式碼只能作為學習交流之用,千萬不能作惡,千萬不能作惡,千萬不能作惡,千萬不能作惡,千萬不能作惡,千萬不能作惡。