1. 程式人生 > java網路程式設計：最簡單的爬蟲（爬取網站美女圖片）

java網路程式設計：最簡單的爬蟲（爬取網站美女圖片）

package com.company.reptile;

import java.io.BufferedReader;
import java.io.DataInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * A deliberately simple single-page image crawler, intended as a learning example.
 *
 * <p>It fetches one HTML page, extracts the absolute {@code http}/{@code https} URLs found in
 * the {@code src} attribute of {@code <img>} tags, and downloads each of them into a local
 * directory. It does not follow links, so no breadth-first or depth-first traversal is involved.
 *
 * @author God
 */
public class JavaReptileUtil {

    /** Page whose images are downloaded by {@link #main(String[])}. */
    private static final String WEB_SITE = "http://www.4493.com";

    /** Directory used by {@link #downloadImage(List)}. */
    private static final String DOWNLOAD_DIR = "F:\\beauty\\";

    /** Connect and read timeout so that a stalled server cannot hang the crawl forever. */
    private static final int TIMEOUT_MILLIS = 15000;

    /**
     * Matches one {@code <img ...>} tag up to and including its {@code src} attribute value.
     * The value ends up in group 1 (double-quoted), 2 (single-quoted) or 3 (unquoted).
     * {@code [^>]*?} keeps the match inside a single tag, and the {@code \s} before {@code src}
     * prevents attributes such as {@code data-src} from being mistaken for {@code src}.
     */
    private static final Pattern IMAGE_SRC_PATTERN = Pattern.compile(
            "<img\\b[^>]*?\\ssrc\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s>\"']+))",
            Pattern.CASE_INSENSITIVE);

    /** Accepts only absolute http/https URLs without embedded whitespace. */
    private static final Pattern ABSOLUTE_HTTP_URL =
            Pattern.compile("^https?://\\S+$", Pattern.CASE_INSENSITIVE);

    /** Extracts the charset name from a {@code Content-Type} header value. */
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*[\"']?([\\w.:-]+)", Pattern.CASE_INSENSITIVE);

    /** Characters that are not allowed in file names on common file systems. */
    private static final Pattern UNSAFE_FILE_CHARS = Pattern.compile("[\\\\/:*?\"<>|\\p{Cntrl}]");

    /**
     * Runs the crawler against {@code WEB_SITE}.
     *
     * @param args unused
     * @throws Exception if the page cannot be fetched or the download directory cannot be created
     */
    public static void main(String[] args) throws Exception {
        // 1. Fetch the HTML of the start page.
        String htmlInfo = getHtmlInfo(WEB_SITE);
        // 2. Collect the image URLs it references.
        List<String> imageSrc = getImageSrc(htmlInfo);
        // 3. Download every image.
        downloadImage(imageSrc);
    }

    /**
     * Downloads the document at the given URL and returns it as text.
     *
     * <p>The charset announced in the {@code Content-Type} header is used for decoding, falling
     * back to UTF-8 when none is given or the name is unknown. Lines are joined with
     * {@code '\n'} so that tokens on adjacent lines are not fused together.
     *
     * @param host absolute URL of the page to fetch
     * @return the page content, each line terminated by {@code '\n'}
     * @throws IOException if the URL is malformed or the page cannot be read
     */
    public static String getHtmlInfo(String host) throws IOException {
        URLConnection connection = openWithTimeouts(new URL(host));
        try (InputStream in = connection.getInputStream();
             BufferedReader reader =
                     new BufferedReader(new InputStreamReader(in, charsetOf(connection)))) {
            StringBuilder html = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                html.append(line).append('\n');
            }
            return html.toString();
        }
    }

    /**
     * Extracts the absolute http/https URLs found in the {@code src} attribute of every
     * {@code <img>} tag, in document order. Relative URLs and tags without a {@code src}
     * attribute are skipped; duplicates are kept.
     *
     * @param htmlInfo HTML text to scan; {@code null} is treated as empty
     * @return the image URLs found, never {@code null}
     */
    public static List<String> getImageSrc(String htmlInfo) {
        List<String> imageSrc = new ArrayList<String>();
        if (htmlInfo == null || htmlInfo.isEmpty()) {
            return imageSrc;
        }
        Matcher matcher = IMAGE_SRC_PATTERN.matcher(htmlInfo);
        while (matcher.find()) {
            String value = matcher.group(1);
            if (value == null) {
                value = matcher.group(2);
            }
            if (value == null) {
                value = matcher.group(3);
            }
            // Attribute values are HTML-escaped, so "&amp;" stands for a literal '&' in the URL.
            String candidate = value.trim().replace("&amp;", "&");
            if (ABSOLUTE_HTTP_URL.matcher(candidate).matches()) {
                imageSrc.add(candidate);
            }
        }
        return imageSrc;
    }

    /**
     * Downloads every URL in the list into the default download directory.
     *
     * @param imageSrc URLs to download; {@code null} or empty means nothing to do
     * @throws IOException if the download directory cannot be created
     */
    public static void downloadImage(List<String> imageSrc) throws IOException {
        downloadImage(imageSrc, new File(DOWNLOAD_DIR));
    }

    /**
     * Downloads every URL in the list into {@code targetDir}, creating the directory if needed.
     *
     * <p>The crawl is best-effort: a malformed URL or a failed transfer is reported on
     * {@code System.err} and skipped, so one broken image does not abort the remaining ones.
     *
     * @param imageSrc  URLs to download; {@code null} or empty means nothing to do
     * @param targetDir directory that receives the files
     * @throws IOException if {@code targetDir} does not exist and cannot be created
     */
    public static void downloadImage(List<String> imageSrc, File targetDir) throws IOException {
        if (imageSrc == null || imageSrc.isEmpty()) {
            return;
        }
        // mkdirs() returns false when the directory already exists, hence the second check.
        if (!targetDir.mkdirs() && !targetDir.isDirectory()) {
            throw new IOException("Cannot create download directory: " + targetDir);
        }
        for (String src : imageSrc) {
            URL url;
            try {
                url = new URL(src);
            } catch (MalformedURLException e) {
                System.err.println("Skipping invalid image URL: " + src);
                continue;
            }
            File target = new File(targetDir, fileNameOf(url));
            // Progress is printed once per image rather than once per buffer read.
            System.out.println("下載中....");
            try {
                copy(url, target);
                System.out.println("下載完成...");
            } catch (IOException e) {
                System.err.println("Failed to download " + src + ": " + e);
            }
        }
    }

    /** Opens a connection to {@code url} with connect and read timeouts applied. */
    private static URLConnection openWithTimeouts(URL url) throws IOException {
        URLConnection connection = url.openConnection();
        connection.setConnectTimeout(TIMEOUT_MILLIS);
        connection.setReadTimeout(TIMEOUT_MILLIS);
        return connection;
    }

    /** Returns the charset named in the response's Content-Type header, or UTF-8 if unusable. */
    private static Charset charsetOf(URLConnection connection) {
        String contentType = connection.getContentType();
        if (contentType != null) {
            Matcher matcher = CHARSET_PATTERN.matcher(contentType);
            if (matcher.find()) {
                try {
                    return Charset.forName(matcher.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Unknown or malformed charset name: fall through to the default.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /** Streams the content behind {@code url} into {@code target}, closing both ends. */
    private static void copy(URL url, File target) throws IOException {
        URLConnection connection = openWithTimeouts(url);
        try (InputStream in = connection.getInputStream();
             OutputStream out = new FileOutputStream(target)) {
            byte[] buffer = new byte[8192];
            int length;
            while ((length = in.read(buffer)) != -1) {
                out.write(buffer, 0, length);
            }
        }
    }

    /**
     * Derives a local file name from the last path segment of {@code url}. The query string and
     * fragment are not part of {@link URL#getPath()}, so they never reach the file name.
     */
    private static String fileNameOf(URL url) {
        String path = url.getPath();
        int end = path.length();
        while (end > 0 && path.charAt(end - 1) == '/') {
            end--;
        }
        String name = path.substring(path.lastIndexOf('/', end - 1) + 1, end);
        if (name.isEmpty()) {
            // No path at all (e.g. "http://host/"): fall back to the host name.
            name = url.getHost();
        }
        name = UNSAFE_FILE_CHARS.matcher(name).replaceAll("_");
        if (name.isEmpty() || ".".equals(name) || "..".equals(name)) {
            return "image";
        }
        return name;
    }

}

//

package com.company.reptile;

/**
 * Small helpers for working with URL strings.
 */
public class NetUtil {

    /**
     * Returns the name of the resource a URL points to, i.e. its last non-empty path segment.
     *
     * <p>A query string ({@code ?...}) or fragment ({@code #...}) is ignored, as are trailing
     * slashes, so {@code "http://host/img/a.jpg?v=1"} yields {@code "a.jpg"} and
     * {@code "http://host/img/"} yields {@code "img"}.
     *
     * @param url URL or path string; may be {@code null}
     * @return the last path segment, or an empty string when {@code url} is {@code null},
     *         empty, or consists only of slashes
     */
    public static String getStrName(String url) {
        if (url == null) {
            return "";
        }
        // Cut the string at the first '?' or '#': what follows is not part of the path.
        int end = url.length();
        for (int i = 0; i < end; i++) {
            char c = url.charAt(i);
            if (c == '?' || c == '#') {
                end = i;
                break;
            }
        }
        // Ignore trailing slashes so that "a/b/" resolves to "b".
        while (end > 0 && url.charAt(end - 1) == '/') {
            end--;
        }
        // lastIndexOf returns -1 when there is no slash (or end is 0), giving start == 0.
        int start = url.lastIndexOf('/', end - 1) + 1;
        return url.substring(start, end);
    }
}


//執行結果

//圖片