1. 程式人生 > 網路爬蟲之獲取圖片到本地

網路爬蟲之獲取圖片到本地

/*
 * Created on Aug 26, 2011 2:41:26 PM
 *
 * HtmlSourceGetter.java
 *
 * NOTICE OF PROPRIETARY RIGHTS
 *
 * This program is a confidential trade secret and the property of author. Use, examination,
 * reproduction, disassembly, decompiling, transfer and/or disclosure to others of
 * all or any part of this software program are strictly prohibited except by express
 * written agreement with author.
 *
 * --------------------------------------------------------------------------------------
 * Modification History
 * Date            Author        Version        Description
 * Aug 26, 2011        Cross        1.0        New
 * --------------------------------------------------------------------------------------
 */

package com.cross.tools;

import java.io.BufferedInputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.ImageTag;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
import org.htmlparser.util.SimpleNodeIterator;

public class HtmlSourceGetter {
    
    // Connection and stream handles kept as mutable static fields, so they are
    // shared class-wide rather than owned by a single method call.
    // NOTE(review): method bodies not visible in this excerpt may also assign
    // these fields — confirm before changing or removing them.
    private static HttpURLConnection con = null;
    private static BufferedInputStream bis = null;
    private static OutputStream out = null;
    
    public static void getSource(String url) {
    
    public static void parseHTML(String url, String keyword) {
    
    private static void processNodeList(NodeList list, String keyword) {
    
    public static void extractLinks(String url) {
    try {
        
        Parser parser = new Parser(url);
        parser.setEncoding("UTF-8");
        
        // frame filter
        NodeFilter frameFilter = new NodeFilter() {
        @Override
        public boolean accept(Node node) {
            if(node.getText().startsWith("frame src=")) {
            return true;
            }
            return false;
        }
        
        };
        
        // image filter;
        NodeFilter imageFilter = new NodeClassFilter(ImageTag.class);
        
        // href filter;
        NodeFilter linkFilter = new NodeClassFilter(LinkTag.class);
        
        
        // link or image filter
//        OrFilter orFilter = new OrFilter(new NodeClassFilter(LinkTag.class),new NodeClassFilter(ImageTag.class));
        
        // link or image or frame filter
//        OrFilter allFilter = new OrFilter(orFilter,frameFilter);
        
        
        NodeList nodeList = parser.extractAllNodesThatMatch(imageFilter);
        
        for (int i = 0; i < nodeList.size(); i++) {
        Node tag = nodeList.elementAt(i);
        
        // <a href> tag
//        if(tag instanceof LinkTag) {
//           LinkTag link = (LinkTag)tag;
//           String linkURL = link.getLink();
//           String linkText = link.getLinkText();
//           System.out.println("linkURL:"+linkURL);
//           System.out.println("linkText:"+linkText);
//        }
        
        // <img src> tag
//        else if(tag instanceof ImageTag) {
            ImageTag image = (ImageTag)tag;
            String imageURL = image.getImageURL();
            String imageText = image.getText();
            System.out.println("imageURL:"+imageURL);
            System.out.println("imageText:"+imageText);
            
            con = (HttpURLConnection)(new URL(imageURL).openConnection());
            con.connect();
            
            bis = new BufferedInputStream(con.getInputStream());  
            out = new FileOutputStream(new File("c:/cross/" + i + "_" +System.currentTimeMillis()  +imageURL.substring(imageURL.lastIndexOf("."))));
            
            byte[] buf = new byte[1024];
            int size = 0;

            while((size = bis.read(buf)) != -1){
            out.write(buf, 0, size);
                }  
//            out.flush();
            
//        } else { // <frame src> tag eg:<frame src="test.html"/>
//            String frame = tag.getText();
//            String frameURL = frame.split("\"")[1];
//            System.out.println("frameURL:"+frameURL);
//            
//        }
        
        }
    } catch (Exception e) {
        System.err.println(e.getStackTrace());
    } finally {
        try {
        out.close();
        bis.close();
        con.disconnect();
        } catch (IOException e) {
        e.printStackTrace();
        }
    }
    }

    
    public static void main(String[] args) {
//    HtmlSourceGetter.parseHTML("http://localhost:8080/test/", "@");
//    HtmlSourceGetter.parseHTML("http://localhost:8080/test/", "img");
    HtmlSourceGetter.extractLinks("http://localhost:8080/");
//    HtmlSourceGetter.extractLinks("http://localhost:8080/test/");
    }

}