
Crawler Notes (4): Crawling and Downloading Images with Multiple Threads

This post continues with the code from the previous articles.
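The code relies on CrawlerUtils.downLoadFromUrl and IOUtils.saveFile from those earlier posts, which are not repeated here. For readers who do not have that code handy, here is a minimal stand-in based purely on how those helpers are used below; the class names and implementation details are assumptions, not the original implementations:

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.URL;

// Hypothetical stand-ins for the helpers defined in the earlier posts
class CrawlerUtilsSketch {
    // Opens an HTTP connection and returns the response body as a stream
    static InputStream downLoadFromUrl(String url) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(url).openConnection();
        conn.setConnectTimeout(5000);
        conn.setReadTimeout(10000);
        return conn.getInputStream();
    }
}

class IOUtilsSketch {
    // Copies the stream to the target file and closes both resources
    static void saveFile(InputStream in, File file) throws IOException {
        try (InputStream input = in; FileOutputStream out = new FileOutputStream(file)) {
            byte[] buf = new byte[8192];
            int len;
            while ((len = input.read(buf)) != -1) {
                out.write(buf, 0, len);
            }
        }
    }
}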

When the number of images to crawl gets large, we need to download them with multiple threads. Here we use a ForkJoinPool to handle the concurrency.
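By default a ForkJoinPool creates roughly one worker thread per available CPU core, which suits CPU-bound work. Image downloading is I/O-bound, so it can pay off to pass an explicit, larger parallelism level; the value 16 below is only an illustration, not something taken from the original code:

import java.util.concurrent.ForkJoinPool;

class PoolSizing {
    public static void main(String[] args) {
        // default: parallelism equals the number of available processors
        ForkJoinPool defaultPool = new ForkJoinPool();
        // explicit parallelism for I/O-bound downloads (16 is arbitrary)
        ForkJoinPool ioPool = new ForkJoinPool(16);
        System.out.println(defaultPool.getParallelism() + " vs " + ioPool.getParallelism());
    }
}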

1. DownloadTask: the download task class

package com.dyw.crawler.util;

import java.io.File;
import java.io.InputStream;
import java.util.List;
import java.util.concurrent.RecursiveAction;

/**
 * Downloads images in parallel with a ForkJoin pool.
 * Created by dyw on 2017/9/7.
 */
public class DownloadTask extends RecursiveAction {

    // Maximum number of URLs a single task downloads itself
    private static final int THRESHOLD = 8;
    // Full list of image URLs
    private List<String> urls;
    // Start index (inclusive)
    private int start;
    // End index (exclusive)
    private int end;
    // Directory to save the images in
    private String path;

    /**
     * @param urls  list of image URLs
     * @param start start index
     * @param end   end index
     * @param path  save directory
     */
    public DownloadTask(List<String> urls, int start, int end, String path) {
        this.urls = urls;
        this.start = start;
        this.end = end;
        this.path = path;
    }

    @Override
    protected void compute() {
        if (end - start < THRESHOLD) {
            // Small enough: download each image in this range directly
            for (int i = start; i < end; i++) {
                String url = urls.get(i);
                String[] split = url.split("/");
                String imgName = split[split.length - 1];
                try {
                    // save the file under its original name
                    File file = new File(path + "/" + imgName);
                    InputStream inputStream = CrawlerUtils.downLoadFromUrl(url);
                    IOUtils.saveFile(inputStream, file);
                    System.out.println("success:" + url);
                } catch (Exception e) {
                    System.out.println("fail:" + url);
                }
            }
        } else {
            // The difference between end and start exceeds THRESHOLD:
            // split the big task into two smaller ones
            int middle = (start + end) / 2;
            DownloadTask left = new DownloadTask(urls, start, middle, path);
            DownloadTask right = new DownloadTask(urls, middle, end, path);
            // run the two subtasks in parallel
            left.fork();
            right.fork();
        }
    }
}
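Note that compute() forks the two subtasks but never joins them, so the parent action returns immediately and the main method below has to wait for the pool to drain via shutdown() and awaitTermination(). If you prefer the root task to block until the whole subtree has finished, the divide branch can use invokeAll() instead. This is a hedged alternative, not the code from the post:

        } else {
            int middle = (start + end) / 2;
            DownloadTask left = new DownloadTask(urls, start, middle, path);
            DownloadTask right = new DownloadTask(urls, middle, end, path);
            // invokeAll() forks both subtasks and joins them, so compute()
            // only returns once both halves are done
            invokeAll(left, right);
        }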

2. The main method

package com.dyw.crawler.project;

import com.dyw.crawler.util.CrawlerUtils;
import com.dyw.crawler.util.DownloadTask;
import com.dyw.crawler.util.IOUtils;
import com.dyw.crawler.util.RegularUtils;

import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.ForkJoinPool;
import java.util.concurrent.TimeUnit;

/**
 * Downloads images using multiple threads
 * Created by dyw on 2017/9/7.
 */
public class Project3 {

    public static void main(String[] args) {
        ForkJoinPool forkJoinPool = new ForkJoinPool();
        String path = "C:\\Users\\dyw\\Desktop\\crawler\\photo";
        String path1 = "C:\\Users\\dyw\\Desktop\\crawler\\photo1";
        String url = "http://www.tuigirlba.cc/page/show/";
        List<String> list = new ArrayList<>();
        try {
            for (int i = 330; i < 380; i++) {
                String htmlContent = CrawlerUtils.get(url + i);
                List<String> imgUrls = RegularUtils.getIMGUrl(htmlContent);
                list.addAll(imgUrls);
            }
            long l = System.currentTimeMillis();
            forkJoinPool.execute(new DownloadTask(list, 0, list.size(), path));
            forkJoinPool.shutdown();
            // wait up to 20 seconds for the forkJoinPool tasks to finish
            forkJoinPool.awaitTermination(20, TimeUnit.SECONDS);
            long l1 = System.currentTimeMillis() - l;
            long l2 = System.currentTimeMillis();
            // plain sequential download, for comparison
            list.forEach(imgUrl -> {
                String[] split = imgUrl.split("/");
                String imgName = split[split.length - 1];
                try {
                    File file1 = new File(path1 + "/" + imgName);
                    InputStream inputStream = CrawlerUtils.downLoadFromUrl(imgUrl);
                    IOUtils.saveFile(inputStream, file1);
                    System.out.println("success:" + imgUrl);
                } catch (Exception e) {
                    System.out.println("fail:" + imgUrl);
                }
            });
            long l3 = System.currentTimeMillis() - l2;
            System.out.println("forkjoin處理時間:"+l1);
            System.out.println("沒有並行處理時間:"+l3);
        } catch (Exception e) {
            throw new RuntimeException("Failed to fetch page content!", e);
        }
    }
}
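One caveat with the timing above: awaitTermination() caps the wait at 20 seconds, so the measured ForkJoin time can be clipped if the downloads take longer. If DownloadTask joins its subtasks (for example with the invokeAll() variant sketched earlier), the run can instead be driven by a single blocking invoke() call, which returns only when the whole task tree has completed. A minimal sketch under that assumption:

            long startMillis = System.currentTimeMillis();
            // invoke() blocks until the root task and all joined subtasks are done
            forkJoinPool.invoke(new DownloadTask(list, 0, list.size(), path));
            System.out.println("ForkJoin time: " + (System.currentTimeMillis() - startMillis));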

3. Results

As the two screenshots below show, the ForkJoin version is much faster than the synchronous one!

(Screenshots: console output showing the ForkJoin time and the sequential time.)

If you have any suggestions for improving the code, please leave me a comment! ☺☺☺