1. 程式人生 > >訪問url獲取頁面內容工具類

訪問url獲取頁面內容工具類

package com.guanyong.fbimonitor.test;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.ObjectInputStream;
import java.net.SocketTimeoutException;
import java.net.URI;
import java.net.URISyntaxException;
import java.nio.charset.Charset;
import java.security.cert.CertificateException;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TimeZone;
import java.util.concurrent.TimeUnit;
import java.util.function.Consumer;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.zip.GZIPInputStream;
import java.util.zip.InflaterInputStream;

import javax.net.ssl.SSLContext;

import org.apache.http.Header;
import org.apache.http.HttpEntity;
import org.apache.http.HttpHost;
import org.apache.http.HttpResponse;
import org.apache.http.HttpStatus;
import org.apache.http.NameValuePair;
import org.apache.http.auth.AuthScope;
import org.apache.http.auth.UsernamePasswordCredentials;
import org.apache.http.client.config.CookieSpecs;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.methods.HttpRequestBase;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.client.utils.URIUtils;
import org.apache.http.config.ConnectionConfig;
import org.apache.http.config.MessageConstraints;
import org.apache.http.config.Registry;
import org.apache.http.config.RegistryBuilder;
import org.apache.http.conn.socket.ConnectionSocketFactory;
import org.apache.http.conn.socket.PlainConnectionSocketFactory;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.BasicHttpEntity;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.DefaultHttpResponseFactory;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.impl.client.BasicAuthCache;
import org.apache.http.impl.client.BasicCookieStore;
import org.apache.http.impl.client.BasicCredentialsProvider;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.conn.DefaultHttpClientConnectionOperator;
import org.apache.http.impl.conn.DefaultHttpResponseParser;
import org.apache.http.impl.conn.DefaultHttpResponseParserFactory;
import org.apache.http.impl.conn.ManagedHttpClientConnectionFactory;
import org.apache.http.impl.conn.PoolingHttpClientConnectionManager;
import org.apache.http.io.HttpMessageParser;
import org.apache.http.io.HttpMessageParserFactory;
import org.apache.http.io.SessionInputBuffer;
import org.apache.http.message.BasicLineParser;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.ssl.TrustStrategy;
import org.apache.http.util.ByteArrayBuffer;
import org.apache.http.util.CharArrayBuffer;
import org.apache.http.util.EntityUtils;

public class FetchWebData {
	/** GBK; chosen for pages whose declared charset name starts with {@code gb}. */
	public static final Charset gbkCharset = Charset.forName("GBK");
	/** UTF-8; the fallback charset for request and response entities. */
	public static final Charset utf8Charset = java.nio.charset.StandardCharsets.UTF_8;
	/** ISO-8859-1; used to view raw bytes as text while sniffing a charset declaration. */
	public static final Charset iso8859Charset = java.nio.charset.StandardCharsets.ISO_8859_1;

//	protected static final Logger LOGGER = LoggerFactory.getLogger(FetchWebData.class);
//	protected static final Logger LOGGER = null;
	
	// meant to be used by a single thread only!  In particular httpCtx is not thread-safe
	// Cache of authentication schemes; attached to httpCtx in the constructor and filled by enableProxy().
	private final BasicAuthCache authCache = new BasicAuthCache();
	private final HttpClientContext httpCtx; // used across several requests sent with this fd
	// Credentials handed to the client; proxy credentials are registered by enableProxy().
	private final BasicCredentialsProvider credProvider = new BasicCredentialsProvider();
	// Cookie store installed as the client's default store; read by the getCookies*() methods.
	private final BasicCookieStore cookieStore = new BasicCookieStore();
	// HTTP client built in the constructor from CrawlerHttpClientBuilder.
	private final CloseableHttpClient client;
    // User-Agent sent unless setUserAgent() replaces it.
    public static final String defaultUserAgent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:27.0) Gecko/20100101 Firefox/27.0";
    private String userAgent = defaultUserAgent;
    private Charset forceCharset = null; // if non-null, will be used in request and response entities; if null, reasonable defaults will be used.
    private String cookieSpec = CookieSpecs.DEFAULT; // FIXME: what about SINGLE_COOKIE_HEADER?
    // Raw Cookie header value added to every request when non-empty (see initRequestCommon()).
    private String strCookie;
	// Number of attempts made per request by the retry loops; never below minTryNumber when set via setRetryNum().
	private int RetryNum;
	private int timeoutMs; // each FetchWebData instance can have its own timeout in milliseconds
	// Extra request headers applied last in initRequestCommon(), so they override the defaults set there.
	private Map<String,String> extraHeaders; 
	protected boolean isVerboseUrl = true; // whether each url passed here is shown
	public void setVerboseUrl(boolean value) { this.isVerboseUrl = value; }
	private final 	Integer maxReadMillSeconds =4*60*1000;// upper bound, in milliseconds, allowed for one HTTP exchange

	// NOTE(review): not read anywhere in the visible part of the file; presumably filters application/* responses — confirm.
	private boolean isFilterApplicationResponse;
    // Final URL of the last getGzipContentAPI() call when it differs from the requested URL, otherwise null.
    private String redirectUrl; 
	// Stored by setReferer(); NOTE(review): the request methods in view take the referer as a parameter instead — confirm where this is read.
	private String refererUrl;
	// Last-Modified header of the last getGzipContentAPI() response; reformatted to "yyyy-MM-dd HH:mm:ss" when it parses as a GMT date.
	public String lastModified;
	
	private boolean shouldTryOtherProxy = true;// whether other proxies should be tried; some callers do not need this
	
//	private int maxPageSize= 3*1024*1024;// maximum page size
	
	private String contentFileType ;// type of the binary file (the image subtype, see getHttpBytes())
	
	// Lower bound enforced by setRetryNum(); changed through setMinRetryNum().
	private Integer minTryNumber = 1;
	
	/**
	 * Returns the content file type. {@code getHttpBytes} resets it to {@code null} and then
	 * stores the Content-Type value with {@code "image/"} removed when the response is an image.
	 */
	public String getContentFileType() {
		return this.contentFileType;
	}

	/** Overrides the stored content file type. */
	public void setContentFileType(String contentFileType) {
		this.contentFileType = contentFileType;
	}

	/** Stores a referer URL on this instance. */
	public void setReferer(String refererUrl) {
		this.refererUrl = refererUrl;
	}

	/**
	 * Returns the URL the last {@code getGzipContentAPI} call ended up at, or {@code null}
	 * when that URL equals the requested one (compared ignoring case).
	 */
	public String getRedirectUrl() {
		return this.redirectUrl;
	}

	/** Returns {@code lastRedirURL}, i.e. the resolved URL recorded by the last {@code getGzipContentAPI} call. */
	public String getLastRequestUrl() {
		return this.lastRedirURL;
	}

	/** Returns the "filter application response" flag. */
	public boolean isFilterApplicationResponse() {
		return this.isFilterApplicationResponse;
	}

	/** Sets the "filter application response" flag. */
	public void setFilterApplicationResponse(boolean isFilterApplicationResponse) {
		this.isFilterApplicationResponse = isFilterApplicationResponse;
	}


	/** Returns whether other proxies should be tried (defaults to {@code true}). */
	public boolean isShouldTryOtherProxy() {
		return this.shouldTryOtherProxy;
	}

	/** Sets whether other proxies should be tried. */
	public void setShouldTryOtherProxy(boolean shouldTryOtherProxy) {
		this.shouldTryOtherProxy = shouldTryOtherProxy;
	}

	
	/**
	 * Sets the number of attempts made per request.
	 *
	 * @param retryNum requested number of attempts; values below the configured minimum
	 *                 ({@code minTryNumber}) are raised to that minimum
	 */
	public void setRetryNum(int retryNum){
		// Clamp to the floor instead of rejecting the value.
		this.RetryNum = Math.max(retryNum, minTryNumber);
	}
	
	/**
	 * Sets the lower bound applied by {@code setRetryNum}. An already stored retry count is
	 * not re-clamped by this call.
	 *
	 * @param minTryNumber requested lower bound; values below 1 are treated as 1
	 */
	public void setMinRetryNum(int minTryNumber){
		this.minTryNumber = Math.max(minTryNumber, 1);
	}
	    
	/** Retry policy keyed by HTTP status code: {@code true} means "retry", {@code false} means "do not retry". */
	private HashMap<Integer, Boolean> httpRetryModeMap = new HashMap<>();

	/**
	 * Records whether a response with the given status code should be retried.
	 *
	 * @param code        HTTP status code
	 * @param shouldRetry whether that status should lead to another attempt
	 */
	public void setRetryMode(int code, boolean shouldRetry) {
		httpRetryModeMap.put(Integer.valueOf(code), Boolean.valueOf(shouldRetry));
	}

	// Instance initializer: every new instance starts with "404 is not retried".
	{
		setRetryMode(404, false);
	}
	
	/**
	 * Extra HTTP status codes that are treated as success: for these, getGzipContentAPI
	 * returns the entity content just as it does for 200.
	 */
	private HashSet<Integer> acceptedHttpStatuses = new HashSet<>();

	/**
	 * Marks a status code as acceptable.
	 *
	 * @param code HTTP status code to accept
	 */
	public void addAcceptedHttpStatus(int code) {
		acceptedHttpStatuses.add(Integer.valueOf(code));
	}
	
	// NOTE: Although HttpState.getCookies() currently returns a newly created cookie array, the underlying cookies may still be modified.
	// Consequently, the cookie information should be consumed immediately after issuing a request, usually by using getCookiesTxtStr() instead.
    //public Cookie[] getCookies() { return cookies; } // meant to be passed to aria2c, etc.
    
    /**
     * Renders the current cookie store in the tab-separated cookies.txt layout, one cookie per
     * line: domain, subdomain flag (always TRUE), path, secure flag, expiry in epoch seconds
     * (0 when the cookie has no expiry date), name, value. Values are written as-is; escaping
     * is left to the caller.
     *
     * @return the cookies.txt text, empty when the store holds no cookies
     */
    public String getCookiesTxtStr() {
        final StringBuilder out = new StringBuilder();
        for (Cookie c : cookieStore.getCookies()) {
            final Date expires = c.getExpiryDate();
            final long expirySeconds = (expires == null) ? 0L : expires.getTime() / 1000;
            final String secureFlag = c.isSecure() ? "TRUE" : "FALSE";
            out.append(c.getDomain()).append('\t')
               .append("TRUE\t") // whether the cookie is usable in subdomains
               .append(c.getPath()).append('\t')
               .append(secureFlag).append('\t')
               .append(expirySeconds).append('\t')
               .append(c.getName()).append('\t')
               .append(c.getValue()).append('\n');
        }
        return out.toString();
    }
    
   /**
    * Returns the cookies currently held by this instance's cookie store.
    *
    * @return whatever {@code cookieStore.getCookies()} returns at the time of the call
    */
   public List<Cookie> getCookies() {
       final List<Cookie> current = cookieStore.getCookies();
       return current;
   }
    
     /**
      * Formats the current cookies as {@code name=value; } pairs, in store order. Every pair,
      * including the last one, is followed by {@code "; "}.
      *
      * @return the concatenated pairs, empty when the store holds no cookies
      */
     public String getCookiesStr() {
         final StringBuilder header = new StringBuilder();
         for (Cookie c : cookieStore.getCookies()) {
             header.append(c.getName()).append("=").append(c.getValue()).append("; ");
         }
         return header.toString();
     }

    // All connections from this manager are not affected by Heritrix's recording facilities, which allows
    // only one open connection (even free ones) per thread.
    // Now that we are no longer using Heritrix, MultiThreadedHttpConnectionManager should fit our needs.
    public static class CrawlerHttpClientConnectionManager extends PoolingHttpClientConnectionManager {
        private static Registry<ConnectionSocketFactory> getSocketFactoryRegistry() { // same as that used in the parent class
        	
        	//trust all host
        	SSLConnectionSocketFactory sslsf =null;
        	try {
				SSLContext sslContext = new SSLContextBuilder()
						.loadTrustMaterial(null, new TrustStrategy() {
							@Override
							public boolean isTrusted(
									java.security.cert.X509Certificate[] chain,String authType)
									throws CertificateException { 
								      return true;
							}
						}).build();
				sslsf = new SSLConnectionSocketFactory(sslContext);
			} catch (Exception e) {
				sslsf = SSLConnectionSocketFactory.getSocketFactory();
			}
			return RegistryBuilder.<ConnectionSocketFactory>create()
                    .register("http", PlainConnectionSocketFactory.getSocketFactory())
                    .register("https", sslsf)
                    .build();
        }
        private static final HttpMessageParserFactory<HttpResponse> myParserFactory = new DefaultHttpResponseParserFactory() {
            @Override public HttpMessageParser<HttpResponse> create(final SessionInputBuffer buffer, final MessageConstraints constraints) {
                return new DefaultHttpResponseParser(buffer, BasicLineParser.INSTANCE, DefaultHttpResponseFactory.INSTANCE, constraints) {
					// when the http server returns garbage (e.g. http://livestream.freshfm.com.au:8004/;stream.mp3), we don't want to wait indefinitely
					private int ngarbage = 0;
					@Override protected boolean reject(CharArrayBuffer line, int count) {
						ngarbage += (line.length() + 1); // line does not include the line delimiter
						if (ngarbage >= 16384 || count >= 256) return true; // don't accept any more garbage
						return false;
					}
				};
            }
        };
        
        private static  Long timeToLiveSeconds  = 60L;//一個連結最多可以保留的有效秒數
        
    	private CrawlerHttpClientConnectionManager() {
    		// The pooled connections are given 1-minute TTL just to be safe; httpclient already uses setValidateAfterInactivity() to validate the connection every 2 seconds,
    		// so we don't need to worry about the NAT router forgetting about the connection. 
    		super(new DefaultHttpClientConnectionOperator(getSocketFactoryRegistry(), null, null),
    			  new ManagedHttpClientConnectionFactory(null, myParserFactory, null, null), timeToLiveSeconds, TimeUnit.SECONDS);
    		this.setDefaultMaxPerRoute(2000); // essentially unlimited; we let upper levels handle the scheduling, and blocking here is bad
    		this.setMaxTotal(2000); // must be larger than the total number of threads, but should not be too large (particularly when multiple crawler instances are being run), or we'd run out of port numbers
    		// Connection/so timeouts are now set in CrawlerHttpClientBuilder
    		new IdleHttpClientConnectionMonitor(this,timeToLiveSeconds);
    	}
	    private static class Helper {
	    	static final CrawlerHttpClientConnectionManager inst = new CrawlerHttpClientConnectionManager();  
	    }
	    public static CrawlerHttpClientConnectionManager getInstance() { return Helper.inst; }
    }    

	/** Connect and socket timeout of the baseline request configuration, in milliseconds (20 seconds). */
	private static final int globalDefaultTimeoutMs = 20000;

	/** Baseline request configuration; installed as the default of every client built by CrawlerHttpClientBuilder. */
	public static final RequestConfig globalDefaultReqCfg = RequestConfig.custom()
			.setConnectTimeout(globalDefaultTimeoutMs)
			.setSocketTimeout(globalDefaultTimeoutMs)
			.build();
    /**
     * Client builder preconfigured for this crawler: it uses the shared
     * {@code CrawlerHttpClientConnectionManager} (marked as shared, so the built clients have
     * shared connections and need not be closed), UTF-8 as the connection charset, and
     * {@code globalDefaultReqCfg} as the default request configuration.
     */
    public static class CrawlerHttpClientBuilder extends HttpClientBuilder {

        protected CrawlerHttpClientBuilder() {
            super();
            final ConnectionConfig utf8ConnectionConfig = ConnectionConfig.custom().setCharset(utf8Charset).build();
            this.setConnectionManager(CrawlerHttpClientConnectionManager.getInstance());
            this.setConnectionManagerShared(true);
            this.setDefaultConnectionConfig(utf8ConnectionConfig);
            // to customize, copy from defaultReqCfg
            this.setDefaultRequestConfig(globalDefaultReqCfg);
        }

        /** Returns a new preconfigured builder. */
        public static CrawlerHttpClientBuilder create() {
            return new CrawlerHttpClientBuilder();
        }
    }
    
    /** When true (the default), requests carry {@code Pragma: no-cache} and {@code Cache-Control: max-age=0}. */
    private boolean isNoCache = true;

    /** Turns the cache-busting request headers on or off. */
    public void setNoCache(boolean isNoCache) {
        this.isNoCache = isNoCache;
    }

    /** Sets a raw {@code Cookie} header value added to every request; null or empty adds none. */
    public void setStrCookie(String strCookie) {
        this.strCookie = strCookie;
    }

    /**
     * Forces the charset used for request and response entities.
     *
     * @param encoding a charset name accepted by {@link Charset#forName(String)}, or
     *                 {@code null} to go back to the defaults
     */
    public void setEncoding(String encoding) {
        if (encoding == null) {
            this.forceCharset = null;
            return;
        }
        this.forceCharset = Charset.forName(encoding);
    }

    /** Replaces the {@code User-Agent} header value sent with every request. */
    public void setUserAgent(String userAgent) {
        this.userAgent = userAgent;
    }

    /**
     * Sets extra request headers (name to value). They are applied after the built-in headers,
     * so an entry with the same name replaces the built-in value.
     */
    public void setParams(Map<String, String> extraHeaders) {
        this.extraHeaders = extraHeaders;
    }

	public FetchWebData() { 
		this(1, 3*3600*1000); 
	}
	public FetchWebData(int retryNum, int timeoutMs) { // timeoutMs can be -1 to use the default
		// NOTE: SSL seems to be working just fine out-of-the-box
//		ProtocolSocketFactory fcty = new MySecureProtocolSocketFactory();
//      Protocol.registerProtocol("https", new Protocol("https", fcty, 443));
		this.RetryNum = retryNum; 
		this.timeoutMs = timeoutMs; 
		this.setRetryNum(retryNum); 
		this.httpCtx = new HttpClientContext(); 
		httpCtx.setAuthCache(this.authCache);
		this.client = CrawlerHttpClientBuilder.create().setDefaultCredentialsProvider(credProvider).setDefaultCookieStore(cookieStore).build(); 
    }
	/** Switches subsequent requests to the {@code IGNORE_COOKIES} cookie spec. */
	public void ignoreCookies() {
		cookieSpec = CookieSpecs.IGNORE_COOKIES;
	}

	/** Switches subsequent requests to the {@code STANDARD} cookie spec. */
	public void acceptRfcCookies() {
		cookieSpec = CookieSpecs.STANDARD;
	}
	
 
//	public static final String proxyHost = "10.11.0.5"; // In east asia, sometimes fast but frequently unavailable
	/** Default proxy port constant offered to callers. */
	public static final int defaultProxyPort = 3128;

	/** Proxy applied to every request configuration; {@code null} means no proxy. */
	private HttpHost proxyHost = null;

	/**
	 * Routes subsequent requests through the given proxy without proxy authentication.
	 *
	 * @param ip   proxy host name or address
	 * @param port proxy port
	 */
	public void enableProxy(String ip, int port) {
		enableProxy(ip, port, null, null);
	}

	/**
	 * Routes subsequent requests through the given proxy. When both a non-empty user name and
	 * a non-empty password are given, they are registered for the proxy and basic
	 * authentication is cached for it so that it is sent preemptively.
	 *
	 * @param ip       proxy host name or address
	 * @param port     proxy port
	 * @param userName proxy user name; may be null or empty
	 * @param pwd      proxy password; may be null or empty
	 */
	public void enableProxy(String ip, int port, String userName, String pwd) {
		proxyHost = new HttpHost(ip, port);
		final boolean hasCredentials = userName != null && !userName.isEmpty() && pwd != null && !pwd.isEmpty();
		if (!hasCredentials) {
			return; // no proxy authentication required
		}
		credProvider.setCredentials(new AuthScope(proxyHost), new UsernamePasswordCredentials(userName, pwd));
		// enable preemptive authentication for the proxy (FIXME: need to test its usefulness?)
		authCache.put(proxyHost, new BasicScheme());
	}


	
	/**
	 * Starts a per-request configuration from {@code globalDefaultReqCfg} and applies this
	 * instance's timeout, cookie spec and proxy.
	 *
	 * @return a builder the caller may customize further before calling {@code build()}
	 */
	private RequestConfig.Builder newReqCfgB() {
		// The instance timeout is used, but never less than 15 seconds.
		final int effectiveTimeoutMs = Math.max(timeoutMs,15*1000);
		final RequestConfig.Builder builder = RequestConfig.copy(globalDefaultReqCfg);
		// Connect timeout, in milliseconds.
		builder.setConnectTimeout(effectiveTimeoutMs);
		// Socket (read) timeout, in milliseconds: give up when no data arrives within this time.
		builder.setSocketTimeout(effectiveTimeoutMs);
		// Timeout, in milliseconds, for obtaining a connection from the shared connection manager.
		builder.setConnectionRequestTimeout(60*1000);
		// NOTE: content charset and http element charset default to UTF-8 for now, since this is used at most sites
		builder.setCookieSpec(cookieSpec);
		if (proxyHost != null) {
			builder.setProxy(proxyHost);
		}
		return builder;
	}

	/**
	 * Applies the settings shared by GET and POST requests: the request configuration and the
	 * standard headers.
	 *
	 * @param req     request to initialise
	 * @param referer value for the {@code Referer} header; skipped when null or empty
	 * @param reqCfgC optional hook that may adjust the request configuration before it is built
	 */
	private void initRequestCommon(HttpRequestBase req, String referer, Consumer<RequestConfig.Builder> reqCfgC) {
		final RequestConfig.Builder configBuilder = newReqCfgB();
		if (reqCfgC != null) {
			reqCfgC.accept(configBuilder);
		}
		req.setConfig(configBuilder.build());

		req.setHeader("User-Agent", this.userAgent);
		req.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
		final boolean hasReferer = referer != null && !referer.isEmpty();
		if (hasReferer) {
			req.setHeader("Referer", referer);
		}
		req.setHeader("Accept-Language", "zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3");
		if (isNoCache) {
			req.setHeader("Pragma","no-cache");
			req.setHeader("Cache-Control", "max-age=0");
		}
		final boolean hasRawCookie = this.strCookie != null && !this.strCookie.isEmpty();
		if (hasRawCookie) {
			// added, not set: an existing Cookie header on the request is left in place
			req.addHeader("Cookie", this.strCookie);
		}
		if (extraHeaders == null) {
			return;
		}
		// Caller-supplied headers go last so they replace any default of the same name.
		for (Map.Entry<String, String> extra : extraHeaders.entrySet()) {
			req.setHeader(extra.getKey(), extra.getValue());
		}
	}
	
    /**
     * Creates a GET request carrying the common configuration and headers, and advertising
     * gzip as the only accepted content encoding.
     *
     * @param url     request URL
     * @param referer value for the {@code Referer} header; skipped when null or empty
     * @param reqCfgC optional hook that may adjust the request configuration
     * @return the prepared request
     */
    private HttpGet newGetRequest(String url, String referer, Consumer<RequestConfig.Builder> reqCfgC) {
        final HttpGet get = new HttpGet(url);
        initRequestCommon(get, referer, reqCfgC);
        // what is sdch?  We are currently unable to handle deflate properly
        get.setHeader("Accept-Encoding", "gzip");
        return get;
    }
    
	/**
	 * Creates a POST request with the common configuration and headers and a body chosen by
	 * {@code flag}.
	 *
	 * <p>The request configuration is set once, by {@code initRequestCommon}; the earlier
	 * version also built and set a configuration up front that was always overwritten.
	 *
	 * @param url     request URL
	 * @param referer value for the {@code Referer} header; skipped when null or empty
	 * @param pairs   form fields, used when {@code flag} is 1
	 * @param flag    1 = url-encoded form built from {@code pairs}; 2 = binary body streamed
	 *                from {@code is}; 3 = {@code body} sent as application/octet-stream text
	 * @param is      body stream, used when {@code flag} is 2
	 * @param body    body text, used when {@code flag} is 3
	 * @param reqCfgC optional hook that may adjust the request configuration
	 * @return the prepared request
	 * @throws RuntimeException when {@code flag} is not 1, 2 or 3
	 */
	private HttpPost newPostRequest(String url,String referer, NameValuePair[] pairs,int flag,InputStream is,String body, Consumer<RequestConfig.Builder> reqCfgC) {
		final HttpPost req = new HttpPost(url);
		// The forced charset, when set, also encodes the request body.
		final Charset entityCharset = (this.forceCharset != null) ? this.forceCharset : utf8Charset;
		switch (flag) {
		case 1: {
			// NOTE: Arrays.asList doesn't make a copy; it is a fixed-size view over the array.
			req.setEntity(new UrlEncodedFormEntity(Arrays.asList(pairs), entityCharset));
			break;
		}
		case 2: {
			final BasicHttpEntity streamEntity = new BasicHttpEntity();
			streamEntity.setContent(is);
			streamEntity.setContentType("application/binary");
			req.setEntity(streamEntity);
			break;
		}
		case 3: {
			req.setEntity(new StringEntity(body, ContentType.create("application/octet-stream", entityCharset)));
			break;
		}
		default:
			throw new RuntimeException("Invalid flag value in newPostRequest(): " + flag);
		}
		initRequestCommon(req, referer, reqCfgC);
		// String bodies (flag 3) are sent without advertising gzip support.
		if (flag != 3) {
			req.setHeader("Accept-Encoding", "gzip");
		}
		return req;
	}
    
     
    
    /**
     * Sends a POST request and returns the response body as text, always decoded as UTF-8.
     *
     * <p>The request is attempted up to {@code RetryNum} times; only an {@link IOException}
     * (including the read-time limit being exceeded) leads to another attempt. A response
     * whose status is not 200, or that has no entity, ends the loop with {@code null}.
     *
     * <p>NOTE(review): with {@code flag == 2} every attempt reuses the same {@code is}; a
     * retry after a partially sent body presumably sees a consumed stream — confirm callers.
     *
     * @param URL     request URL
     * @param referer value for the {@code Referer} header; skipped when null or empty
     * @param pairs   form fields, used when {@code flag} is 1
     * @param flag    body kind understood by {@code newPostRequest} (1, 2 or 3)
     * @param is      body stream, used when {@code flag} is 2
     * @param body    body text, used when {@code flag} is 3
     * @return the response text with line terminators removed, or {@code null}
     * @throws Exception any non-I/O failure raised while executing the request
     */
    public String getGzipPostContent2(String URL,String referer,NameValuePair[] pairs,int flag,InputStream is,String body) throws Exception {
        return executePostForText(URL, referer, pairs, flag, is, body, false, utf8Charset);
    }

    /**
     * Sends a url-encoded form POST and returns the response body as text, decoded with the
     * forced charset when one is set and UTF-8 otherwise.
     *
     * <p>Same retry rules as {@code getGzipPostContent2}, except that a 302 response with an
     * entity is also read.
     *
     * @param URL     request URL
     * @param referer value for the {@code Referer} header; skipped when null or empty
     * @param pairs   form fields
     * @return the response text with line terminators removed, or {@code null}
     * @throws Exception any non-I/O failure raised while executing the request
     */
    public String getGzipPostContent(String URL,String referer,NameValuePair[] pairs) throws Exception{
        final Charset responseCharset = (this.forceCharset != null) ? this.forceCharset : utf8Charset;
        return executePostForText(URL, referer, pairs, 1, null, null, true, responseCharset);
    }

    /** Shared retry loop of the two POST entry points; {@code acceptFound} additionally allows status 302. */
    private String executePostForText(String url, String referer, NameValuePair[] pairs, int flag, InputStream is, String body,
            boolean acceptFound, Charset responseCharset) throws Exception {
        for (int attempt = 1; attempt <= RetryNum; attempt++) {
            final HttpPost req = newPostRequest(url, referer, pairs, flag, is, body, null);
            BufferedReader reader = null;
            try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                    new FutureTaskHelper.Task<CloseableHttpResponse>() {
                        @Override
                        public CloseableHttpResponse execute() throws Exception {
                            return client.execute(req, httpCtx);
                        }
                    }, maxReadMillSeconds)) {
                final int status = resp.getStatusLine().getStatusCode();
                final HttpEntity entity = resp.getEntity();
                final boolean statusAccepted = status == HttpStatus.SC_OK
                        || (acceptFound && status == HttpStatus.SC_MOVED_TEMPORARILY);
                if (!statusAccepted || entity == null) {
                    return null; // an unacceptable response is not retried
                }
                reader = new BufferedReader(new InputStreamReader(openDecodedPostBody(resp, entity), responseCharset));
                return readPostBodyText(reader);
            } catch (IOException ex) {
                // No logger is wired into this class; report the failure and try again.
                ex.printStackTrace();
            } finally {
                try {
                    req.abort();
                } catch (Exception ignored) {
                    // best effort: the response has already been closed at this point
                }
                // The reader is closed only after the response, as before, and a failure to
                // close it must not replace the result or the exception already in flight.
                if (reader != null) {
                    try {
                        reader.close();
                    } catch (Exception ignored) {
                        // nothing useful can be done about a failed close
                    }
                }
            }
        }
        return null;
    }

    /**
     * Returns the entity content, decompressed according to the {@code Content-Encoding}
     * header: gzip/x-gzip and deflate are unwrapped, anything else is passed through. The
     * earlier code treated every non-null encoding as gzip, which failed on other values.
     */
    private static InputStream openDecodedPostBody(HttpResponse resp, HttpEntity entity) throws IOException {
        final InputStream raw = entity.getContent();
        final Header encodingHeader = resp.getFirstHeader("Content-Encoding");
        final String headerValue = (encodingHeader != null) ? encodingHeader.getValue() : null;
        if (headerValue == null) {
            return raw;
        }
        final String encoding = headerValue.trim().toLowerCase(Locale.ROOT);
        if (encoding.equals("gzip") || encoding.equals("x-gzip")) {
            return new GZIPInputStream(raw);
        }
        if (encoding.equals("deflate")) {
            return new InflaterInputStream(raw);
        }
        return raw;
    }

    /**
     * Reads the whole body line by line and concatenates the lines without their terminators.
     *
     * @throws SocketTimeoutException when reading has taken {@code maxReadMillSeconds} or longer
     */
    private String readPostBodyText(BufferedReader reader) throws IOException {
        final StringBuilder text = new StringBuilder();
        final long startedAt = System.currentTimeMillis();
        String line;
        while ((line = reader.readLine()) != null) {
            if ((System.currentTimeMillis() - startedAt) >= this.maxReadMillSeconds) {
                throw new SocketTimeoutException("exception occurs after fetching "+text.length()+"b data");
            }
            text.append(line);
        }
        return text.toString();
    }
    
    /**
     * Fetches a URL with a single GET attempt and returns the raw response body.
     *
     * <p>As a side effect the content file type is reset to {@code null} and, when the
     * response's {@code Content-Type} contains "image", set to that header value with
     * {@code "image/"} removed.
     *
     * @param url     URL to fetch
     * @param referer value for the {@code Referer} header; skipped when null or empty
     * @return the body bytes, or {@code null} when the status is not 200 or there is no entity
     * @throws Exception any failure raised while executing the request or reading the body
     */
    public byte[] getHttpBytes(String url,String referer) throws Exception{
        final HttpGet req = newGetRequest(url,referer,null);
        this.contentFileType = null;
        try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                new FutureTaskHelper.Task<CloseableHttpResponse>() {
                    @Override
                    public CloseableHttpResponse execute() throws Exception {
                        return client.execute(req, httpCtx);
                    }
                }, maxReadMillSeconds)) {
            final Header contentTypeHeader = resp.getFirstHeader("Content-Type");
            if (contentTypeHeader != null) {
                final String contentTypeValue = contentTypeHeader.getValue();
                if (contentTypeValue.contains("image")) {
                    this.contentFileType = contentTypeValue.replace("image/", "");
                }
            }
            final boolean isOk = resp.getStatusLine().getStatusCode() == HttpStatus.SC_OK;
            final HttpEntity entity = resp.getEntity();
            if (!isOk || entity == null) {
                return null;
            }
            return EntityUtils.toByteArray(entity);
        } finally {
            try {
                req.abort();
            } catch (Exception ignored) {
                // best effort: aborting must not hide the outcome of the request
            }
        }
    }
    

 
    // if needContent is false, we only check that the content is available, but don't actually fetch it.  Returns an empty string upon success.
    // NOTE(review): no needContent parameter exists in the visible code; this remark presumably describes
    // what GetGzipContentOptions.defaultChkMode now controls — confirm or remove.

    // Finds a charset declaration (charset=NAME, optionally quoted) in the fetched body; the name is captured in group "name".
    private static final Pattern charsetPat = Pattern.compile("charset=['\"]?(?<name>[-_A-Za-z0-9]+)['\"]?");
    // HTTP status of the last getGzipContentAPI() call; reset to 0 when a call starts.
    public int lastHttpStatus;
    // Path and full URL of the last getGzipContentAPI() request after redirects were resolved; reset to null when a call starts.
    public String lastRedirPath, lastRedirURL;
    // NOTE(review): not assigned in the visible part of the file; presumably the last response body — confirm.
    public String lastResponseStr = null;
    
    /** Filter applied to the fetched bytes before they are decoded into text. */
    public interface ByteArrayContentHandler {
        /**
         * @param in the fetched bytes
         * @return the bytes to decode; getGzipContentAPI returns {@code null} when this is {@code null}
         */
        byte[] handleBytes(byte[] in);
    }

    /**
     * How much of the response getGzipContentAPI inspects: {@code FULL} fetches and returns the
     * content, {@code NONE} returns "" without reading, {@code NONEMPTY} requires at least one
     * byte, and {@code NOT_HTML} requires four bytes that are not {@code <htm}.
     */
    public enum WebContentCheckMode {
        NONE,
        NONEMPTY,
        NOT_HTML,
        FULL
    }

    /** Per-call options for getGzipContentAPI. */
    public static class GetGzipContentOptions {
        /**
         * Check mode used for every URL unless {@link #getChkMode(String)} is overridden. If not
         * FULL, no content is returned: the result is "" upon success and null upon failure.
         */
        public WebContentCheckMode defaultChkMode = WebContentCheckMode.FULL;

        /** Whether redirects are followed by the HTTP client. */
        public boolean allowRedirect = true;

        /** If non-null, the fetched bytes are passed through this filter. */
        public ByteArrayContentHandler bytesHandler = null;

        /**
         * Returns the check mode to apply.
         *
         * @param redirURL final URL of the request after redirects; ignored by this implementation
         * @return {@link #defaultChkMode}
         */
        public WebContentCheckMode getChkMode(String redirURL) {
            return this.defaultChkMode;
        }
    }
  
	public String getGzipContentAPI(String URL,String referer, GetGzipContentOptions opts) throws Exception{
    	lastHttpStatus = 0; 
    	lastRedirPath = null; 
    	lastRedirURL = null;
    	String strRs = null; 
    	
		byte[] data;
		Charset httpCharset = null; 
	    for(int k=1;k<=RetryNum;k++){  
		    	final HttpGet req = newGetRequest(URL, referer, (reqCfgB) -> {
		    		reqCfgB.setRedirectsEnabled(opts.allowRedirect); // if opts.allowRedirect is true, we won't actually get 302's then
		    	}); 
		    	InputStream ins = null;
		    	try(  CloseableHttpResponse resp= FutureTaskHelper.getInstance().executeHttpUrgentTask( 
						          new FutureTaskHelper.Task<CloseableHttpResponse>(){ 
											@Override
											public CloseableHttpResponse execute() throws Exception {
											    return  client.execute(req, httpCtx); 
											} 
						          },maxReadMillSeconds) 
		    	   ) { 
		    		lastRedirURL = null; 
		    		int status = resp.getStatusLine().getStatusCode();
		            lastHttpStatus = status;
		            // req.getRequestLine().getUri() or req.getURI() both gives the original, non-redirected URL.
		            // See http://hc.apache.org/httpcomponents-client-4.5.x/tutorial/html/fundamentals.html#d5e334 for the proper approach
		            final HttpHost targetHost = httpCtx.getTargetHost();
		            List<URI> redirLocs = httpCtx.getRedirectLocations();
		            URI redirUri = URIUtils.resolve(req.getURI(), targetHost, redirLocs);
		        	lastRedirPath = redirUri.getPath(); // this should be the path after following redirects
		        	lastRedirURL = redirUri.toString(); // should include the query string... 
		        	if(URL.equalsIgnoreCase(lastRedirURL)){
		        		redirectUrl = null;
		        	}else{
		        		redirectUrl = lastRedirURL;
		        	}
		        	final HttpEntity entity = resp.getEntity();
		            if(! ((status == HttpStatus.SC_OK || (opts.allowRedirect && status == 302) || acceptedHttpStatuses.contains(status)) && entity != null)) {
//		            	if(URL==null){
//		            		LOGGER.warn(status);  return null;
//		            	}
//		            	LOGGER.warn("FetchWebData: " + URL + ": " + status); 
		            	return null;
		            }
		        	final ContentType httpContentType = ContentType.get(entity); // can be null
		        	httpCharset = (httpContentType != null) ? httpContentType.getCharset() : null;
		        	final InputStream rawIns = entity.getContent();
		        	final Header hr = resp.getFirstHeader("Content-Encoding"); 
					if(hr!=null){ // As of httpclient-4.5.2, decompression should already have been done by the httpclient library
						String encName = hr.getValue();
						if (encName != null) {
							if (encName.equals("deflate")) ins = new InflaterInputStream(rawIns); // is this correct?
							else if (encName.equals("gzip")) ins = new GZIPInputStream(rawIns);
							else if (encName.toLowerCase().equals("utf-8")) {} // xiami's file servers return this
//							else LOGGER.warn("Unknown Content-Encoding: " + encName)
							;
						}
					}
					lastModified = (null == resp.getFirstHeader("Last-Modified") ? null :resp.getFirstHeader("Last-Modified").getValue());
					
	            	if(null != lastModified && lastModified.contains("GMT")){
	            		try {
	                		SimpleDateFormat sdf = new SimpleDateFormat("EEE, d MMM yyyy HH:mm:ss 'GMT'", Locale.US);
	                    	sdf.setTimeZone(TimeZone.getTimeZone("GMT"));
	                    	Date ftime = null;
	                		DateFormat df = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
	                		ftime = sdf.parse(lastModified);
	                		lastModified = df.format(ftime);
	
	                	} catch (Exception e) {
	                	 
	                	}
	            	}
					if (ins == null){
						ins = rawIns;
					}
				   ByteArrayBuffer buf = new ByteArrayBuffer(16384);
				   //final InputStream finalIns = ins;
				
			  /*     FutureTaskHelper.getInstance().executeHttpUrgentTask( 
					          new FutureTaskHelper.Task<String>(){ 
										@Override
										public String execute() throws Exception {
										   
										
											return "";
										}
					          },maxReadMillSeconds);
					*/
			       
			       //future task starts
			        int totalInsSize = 0;//一共接收了多少個size
					final WebContentCheckMode chkMode = opts.getChkMode(lastRedirURL);
					long startTime = System.currentTimeMillis();
					if (chkMode != WebContentCheckMode.FULL) {
						if (chkMode == WebContentCheckMode.NONE) return "";
						final int nbyte; // number of bytes to probe; NOTE: if this number of bytes is not available, probe fails
						if (chkMode == WebContentCheckMode.NOT_HTML) nbyte = 4;
						else nbyte = 1;
						final byte[] probeBuf = new byte[nbyte];
						try {
							int nread = 0;
							while (nread < nbyte) {
								if((System.currentTimeMillis() - startTime) >= this.maxReadMillSeconds){
									throw new SocketTimeoutException("exception occurs after fetching "+nread+"b data");
								}
								int result = ins.read(probeBuf, nread, nbyte-nread);
								if (result <= 0) return null;
								nread += result;
							}
							if (chkMode == WebContentCheckMode.NOT_HTML && probeBuf[0] == '<' && probeBuf[1] == 'h' && probeBuf[2] == 't' && probeBuf[3] == 'm')
								return null;
							return "";
						} finally {
							req.abort(); // NOTE: not really necessary; closing resp without closing ins would abort the connection in any case 
						}
					}
					// Fetch the entire response body; must not use EntityUtils.toByteArray() due to possible compression
				
					startTime = System.currentTimeMillis();
					try { // EOFException can occur when fetching badly compressed pages, e.g. http://www.hunantv.com/v/2/53080/f/678703.html
						byte[] inBuf = new byte[1024];
						while (true) {
							if((System.currentTimeMillis() - startTime) >= this.maxReadMillSeconds){
								throw new SocketTimeoutException("exception occurs after fetching "+totalInsSize+"b data");
							}
							int curLen = ins.read(inBuf);
							if (curLen == -1) break; // EOF
							buf.append(inBuf, 0, curLen);
							totalInsSize+=curLen;
						}
//						if(!BusinessUtil.isExcludeUrl(URL)){
//							if(totalInsSize >= maxPageSize){
//								throw new PageBiggerThanMaxSizeException(totalInsSize);
//							}
//						}
					
					} catch (EOFException ex) {
//						LOGGER.error(URL + ": exception after reading " + buf.length() + " bytes: " + ex.toString()); 
					}finally{
					 
					} 
				    //future task ends
					
				    data = buf.toByteArray();
				    
					if (opts.bytesHandler != null) {
						data = opts.bytesHandler.handleBytes(data);  
						if (data == null) return null;
					}
					if (this.forceCharset != null) {
						strRs = new String(data, this.forceCharset);
					}
					String rawStr = new String(data, iso8859Charset);
					Charset charset = null;
					Matcher m;
					if ((m = charsetPat.matcher(rawStr)).find()) {
						String name = m.group("name").toLowerCase();
						if (name.startsWith("gb")) charset = gbkCharset;
						else if (name.equals("utf-8") || name.equals("utf8")){
							charset = utf8Charset;
						}else{
							if(name!=null && !name.trim().equals("")){
								try {
									charset = Charset.forName(name);
								} catch (Exception e) {
									 
								}
							}
							
						}
					}
					if (charset == null && httpCharset != null){
						charset = httpCharset;
					}
					if (charset == null) { 
						charset = utf8Charset; 
					}
					
					boolean shouldCheckCode = false;
					 if(this.forceCharset==null  ){
						 shouldCheckCode = true; 
					}else if(httpCharset!=null && httpCharset.compareTo(forceCharset) != 0){
						 shouldCheckCode = true; 
					} else if(charset!=null && charset.compareTo(forceCharset) != 0){
						 shouldCheckCode = true; 
					} 
					 if(shouldCheckCode){
							if (httpCharset!=null  && (strRs==null  || (MyStringUtils.isMessyCode(strRs) 
									&& httpCharset.compareTo(forceCharset) != 0))){
								 strRs = new String(data,httpCharset);
							} 
							
							if(strRs==null  || MyStringUtils.isMessyCode(strRs)) { // guess from html content
								strRs = new String(data, charset);
							} 
					 }
					 
					
				    break; 
	    	  }catch(IOException ex ) {
	 		      ex.printStackTrace();
	          }finally {	    	
	        	 
	        	  if(req!=null){
	        		  try {
	        			  req.abort();  
	        		  } catch (Exception e) {
						 
						}finally{
						 
						} 
	        	  }
	        	  
		  		  if(ins!=null)   {
		  		    try {
						ins.close();
						
					} catch (Exception e) {
						 
					}finally{
						ins = null;
					}
		  		   } // in case of an abortion, the stream may have been closed already
		  	 
	    	} 
	      }
	    lastResponseStr = strRs;
	    return strRs;
    }
    
    /**
     * Tells whether a request that ended with the given HTTP status should be retried.
     *
     * @param code HTTP status code to look up in {@code httpRetryModeMap}
     * @return the flag stored for {@code code}, or {@code true} when the map has no entry for it
     */
    private boolean getShouldRetry(int code) {
        final Boolean configured = httpRetryModeMap.get(code);
        if (configured == null) {
            // No explicit rule for this status: retry by default.
            return true;
        }
        return configured.booleanValue();
    }
    
    
    /**
     * Issues a GET request for {@code URL} and returns the HTTP status code of the response.
     *
     * <p>The request is executed through {@code FutureTaskHelper.executeHttpUrgentTask} with
     * {@code maxReadMillSeconds} as its limit argument (presumably a timeout in milliseconds;
     * confirm against {@code FutureTaskHelper}). The request is always aborted afterwards, so the
     * response body is never consumed.
     *
     * @param URL     address to request
     * @param referer referer handed to {@code newGetRequest}; may be {@code null}
     * @return the status code taken from the response status line
     * @throws Exception whatever the task helper or the HTTP client throws
     */
    public int getStatus(String URL, String referer) throws Exception {
        // Forward the caller-supplied referer instead of silently discarding it.
        final HttpGet req = newGetRequest(URL, referer, null);
        try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                new FutureTaskHelper.Task<CloseableHttpResponse>() {
                    @Override
                    public CloseableHttpResponse execute() throws Exception {
                        return client.execute(req, httpCtx);
                    }
                }, maxReadMillSeconds)) {
            return resp.getStatusLine().getStatusCode();
        } finally {
            if (req != null) {
                try {
                    req.abort();
                } catch (Exception ignored) {
                    // Best-effort cleanup: a failing abort must not mask the real outcome.
                }
            }
        }
    }
    
    /**
     * POSTs {@code postContent} to {@code url} and decodes a "KXT" style response body.
     *
     * <p>The first byte of the body is a type marker: {@code 0} means the remainder is plain text,
     * {@code 1} means the remainder is gzip-compressed. The text is decoded with
     * {@code forceCharset} when set, otherwise with {@code utf8Charset}, and its lines are
     * concatenated without line separators.
     *
     * @param url         address to post to
     * @param referer     referer handed to {@code newPostRequest}
     * @param postContent request payload handed to {@code newPostRequest}
     * @return the decoded body, or {@code null} when the status is neither 200 nor 302 or the
     *         response carries no entity
     * @throws RuntimeException if the type marker is neither 0 nor 1
     * @throws Exception        whatever the task helper, the HTTP client or the stream throws
     */
    public String getKXTPostContent(String url, String referer, String postContent) throws Exception {
        final HttpPost req = newPostRequest(url, referer, null, 3, null, postContent, null);
        try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                new FutureTaskHelper.Task<CloseableHttpResponse>() {
                    @Override
                    public CloseableHttpResponse execute() throws Exception {
                        return client.execute(req, httpCtx);
                    }
                }, maxReadMillSeconds)) {
            final int status = resp.getStatusLine().getStatusCode();
            final HttpEntity entity = resp.getEntity();
            final boolean acceptedStatus =
                    status == HttpStatus.SC_OK || status == HttpStatus.SC_MOVED_TEMPORARILY;
            if (!acceptedStatus || entity == null) {
                return null;
            }

            final InputStream rawIns = entity.getContent();
            final int typeByte = rawIns.read();
            final InputStream bodyIns;
            switch (typeByte) {
                case 0: // not compressed; used e.g. when there is no result
                    bodyIns = rawIns;
                    break;
                case 1:
                    bodyIns = new GZIPInputStream(rawIns);
                    break;
                default:
                    throw new RuntimeException("Unexpected KXT type byte: " + typeByte);
            }

            final Charset bodyCharset = (this.forceCharset == null) ? utf8Charset : this.forceCharset;
            final BufferedReader reader = new BufferedReader(new InputStreamReader(bodyIns, bodyCharset));
            final StringBuilder joined = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                joined.append(line);
            }
            // Closed on the success path only; on failure the response close/abort tears the
            // connection down without touching the stream.
            reader.close();
            return joined.toString();
        } finally {
            if (req != null) {
                try {
                    req.abort();
                } catch (Exception ignored) {
                    // best-effort cleanup
                }
            }
        }
    }
    
    /**
     * POSTs {@code postContent} to {@code url} and returns the response body as text.
     *
     * <p>The body is decoded with {@code forceCharset} when set, otherwise with
     * {@code utf8Charset}; its lines are concatenated without line separators.
     *
     * @param url         address to post to
     * @param referer     referer handed to {@code newPostRequest}
     * @param postContent request payload handed to {@code newPostRequest}
     * @return the decoded body, or {@code null} when the status is neither 200 nor 302 or the
     *         response carries no entity
     * @throws Exception whatever the task helper, the HTTP client or the stream throws
     */
    public String getPPTVPostContent(String url, String referer, String postContent) throws Exception {
        final HttpPost req = newPostRequest(url, referer, null, 3, null, postContent, null);
        try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                new FutureTaskHelper.Task<CloseableHttpResponse>() {
                    @Override
                    public CloseableHttpResponse execute() throws Exception {
                        return client.execute(req, httpCtx);
                    }
                }, maxReadMillSeconds)) {
            final int status = resp.getStatusLine().getStatusCode();
            final HttpEntity entity = resp.getEntity();
            final boolean acceptedStatus =
                    status == HttpStatus.SC_OK || status == HttpStatus.SC_MOVED_TEMPORARILY;
            if (!acceptedStatus || entity == null) {
                return null;
            }

            final InputStream rawIns = entity.getContent();
            final Charset bodyCharset = (this.forceCharset == null) ? utf8Charset : this.forceCharset;
            final BufferedReader reader = new BufferedReader(new InputStreamReader(rawIns, bodyCharset));
            final StringBuilder joined = new StringBuilder();
            for (String line = reader.readLine(); line != null; line = reader.readLine()) {
                joined.append(line);
            }
            // Closed on the success path only, exactly when the whole body has been read.
            reader.close();
            return joined.toString();
        } finally {
            if (req != null) {
                try {
                    req.abort();
                } catch (Exception ignored) {
                    // best-effort cleanup
                }
            }
        }
    }
    
    //get gzip content
    /**
     * Fetches the content of {@code URL} with default options, retrying up to {@code RetryNum}
     * times.
     *
     * <p>If {@code URL} does not parse as a {@link URI}, every {@code |} in it is replaced by
     * {@code %7C} before fetching. An attempt that throws {@link IOException} has its stack trace
     * printed and counts as a failed attempt.
     *
     * @param URL     address to fetch
     * @param referer referer handed to {@code getGzipContentAPI}
     * @return the first non-null result of {@code getGzipContentAPI}, or {@code null} when every
     *         attempt failed or the last HTTP status is marked as not retryable
     * @throws Exception any non-{@link IOException} failure raised by {@code getGzipContentAPI}
     */
    public String getGzipContent(String URL, String referer) throws Exception {
        try {
            // Parsed only to validate the syntax; the resulting URI is not needed.
            new URI(URL);
        } catch (URISyntaxException ex) {
            URL = URL.replace("|", "%7C");
        }

        String content = null;
        int attempt = 0;
        while (attempt < RetryNum) {
            attempt++;
            try {
                content = this.getGzipContentAPI(URL, referer, new GetGzipContentOptions());
                // A non-null result (even an empty one) is final; a null result is only retried
                // when the last status is not considered permanent.
                if (content != null || !getShouldRetry(lastHttpStatus)) {
                    break;
                }
            } catch (IOException ex) {
                ex.printStackTrace();
            }
        }
        return content;
    }

    // returns true if the content can be fetched properly
    /**
     * Checks whether the content behind {@code URL} can be fetched under the given check mode.
     *
     * <p>The same options object is reused for every attempt; at most {@code RetryNum} attempts
     * are made, and retrying stops early when the last HTTP status is marked as not retryable.
     *
     * @param URL           address to probe
     * @param referer       referer handed to {@code getGzipContentAPI}
     * @param chkMode       check mode stored in the options as {@code defaultChkMode}
     * @param allowRedirect stored in the options as {@code allowRedirect}
     * @return {@code true} as soon as one attempt yields a non-null result, otherwise {@code false}
     * @throws Exception whatever {@code getGzipContentAPI} throws
     */
    public boolean checkGetContentMode(String URL, String referer, WebContentCheckMode chkMode, boolean allowRedirect) throws Exception {
        final GetGzipContentOptions opts = new GetGzipContentOptions();
        opts.defaultChkMode = chkMode;
        opts.allowRedirect = allowRedirect;

        for (int attempt = 0; attempt < RetryNum; attempt++) {
            final boolean fetched = getGzipContentAPI(URL, referer, opts) != null;
            if (fetched) {
                return true;
            }
            if (!getShouldRetry(lastHttpStatus)) {
                return false; // the error is likely permanent
            }
        }
        return false;
    }
    /**
     * Convenience form of {@code checkGetContentMode} that uses
     * {@code WebContentCheckMode.NONEMPTY}.
     *
     * @param URL           address to probe
     * @param referer       referer passed through unchanged
     * @param allowRedirect passed through unchanged
     * @return the result of {@code checkGetContentMode}
     * @throws Exception whatever {@code checkGetContentMode} throws
     */
    public boolean checkGetContent(String URL, String referer, boolean allowRedirect) throws Exception {
        final WebContentCheckMode mode = WebContentCheckMode.NONEMPTY;
        return checkGetContentMode(URL, referer, mode, allowRedirect);
    }
    
    //get page encoding
    /**
     * Fetches a page and extracts the charset announced by its first {@code <meta>} tag.
     *
     * <p>Both the self-closing form
     * ({@code <meta http-equiv="Content-Type" content="text/html; charset=gb2312" />}) and the
     * plain form ({@code <meta charset="utf-8">}) are recognised.
     *
     * @param url address of the page, fetched through {@code getGzipContent} with an empty referer
     * @return the charset name exactly as written in the page, or {@code "gbk"} when the page is
     *         empty, cannot be fetched, or declares no charset in a meta tag
     * @throws Exception whatever {@code getGzipContent} throws
     */
    public String getPageEncoding(String url) throws Exception {
        final String fallback = "gbk";
        final String content = this.getGzipContent(url, "");
        if (content == null || content.isEmpty()) {
            return fallback;
        }
        // The closing slash is optional: HTML5 pages usually write <meta charset="utf-8"> without
        // it, and requiring "/>" made those pages fall back to the default.
        final String regX = "(?is)<meta[^>]*?charset=[\"']?([\\w-]+)[^>]*?/?>";
        final Matcher m = Pattern.compile(regX).matcher(content);
        // group(1) holds at least one character, so no further emptiness check is needed.
        return m.find() ? m.group(1) : fallback;
    }
    
    //Get Location; returns null if this is not a redirect or if an error has occurred
    /**
     * Convenience form of {@code getLocation(String, String)} that passes a {@code null} referer.
     *
     * @param url address to request
     * @return the result of the two-argument form
     * @throws Exception whatever the two-argument form throws
     */
    public String getLocation(String url) throws Exception {
        final String noReferer = null;
        return getLocation(url, noReferer);
    }
    /**
     * Requests {@code url} with redirect following disabled and returns the redirect target.
     *
     * <p>{@code lastHttpStatus} is reset to 0 on entry and set to the status of each response
     * received. Up to {@code RetryNum} attempts are made; only an {@link IOException} (whose stack
     * trace is printed) triggers another attempt. Each request is aborted after use.
     *
     * @param url     address to request
     * @param referer referer handed to {@code newGetRequest}
     * @return the value of the {@code Location} header for a 301 or 302 response; {@code null}
     *         when the header is missing, the status is anything else, or every attempt failed
     * @throws Exception any non-{@link IOException} failure from the task helper or HTTP client
     */
    public String getLocation(String url, String referer) throws Exception {
        this.lastHttpStatus = 0;
        String redirectTarget = null;

        for (int attempt = 0; attempt < RetryNum; attempt++) {
            final HttpGet req = newGetRequest(url, referer, (cfg) -> {
                cfg.setRedirectsEnabled(false);
            });
            try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                    new FutureTaskHelper.Task<CloseableHttpResponse>() {
                        @Override
                        public CloseableHttpResponse execute() throws Exception {
                            return client.execute(req, httpCtx);
                        }
                    }, maxReadMillSeconds)) {
                final int statusCode = resp.getStatusLine().getStatusCode();
                this.lastHttpStatus = statusCode;

                final boolean movedResponse = statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY;
                if (movedResponse) {
                    final Header locationHeader = resp.getFirstHeader("Location");
                    if (locationHeader != null) {
                        redirectTarget = locationHeader.getValue();
                    }
                } else if (statusCode >= 400) {
                    return null;
                }
                // Any response below 400 ends the retry loop; a 200 leaves the result null.
                break;
            } catch (IOException ex) {
                ex.printStackTrace();
            } finally {
                if (req != null) {
                    try {
                        req.abort();
                    } catch (Exception ignored) {
                        // best-effort cleanup
                    }
                }
            }
        }
        return redirectTarget;
    }
    
    //Get Post Location; returns null if this is not a redirect or if an error has occurred
    /**
     * POSTs {@code pairs} to {@code url} and returns the redirect target of the response.
     *
     * <p>{@code lastHttpStatus} is reset to 0 on entry and set to the status of each response
     * received. The request uses {@code this.refererUrl} as referer. Up to {@code RetryNum}
     * attempts are made; only an {@link IOException} (whose stack trace is printed) triggers
     * another attempt. Each request is aborted after use.
     *
     * @param url   address to post to
     * @param pairs form parameters handed to {@code newPostRequest}
     * @return the value of the {@code Location} header for a 301 or 302 response; {@code null}
     *         when the header is missing, the status is anything else, or every attempt failed
     * @throws Exception any non-{@link IOException} failure from the task helper or HTTP client
     */
    public String getPostLocation(String url, NameValuePair[] pairs) throws Exception {
        this.lastHttpStatus = 0;

        for (int attempt = 0; attempt < RetryNum; attempt++) {
            final HttpPost req = newPostRequest(url, this.refererUrl, pairs, 1, null, null, null);
            try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                    new FutureTaskHelper.Task<CloseableHttpResponse>() {
                        @Override
                        public CloseableHttpResponse execute() throws Exception {
                            return client.execute(req, httpCtx);
                        }
                    }, maxReadMillSeconds)) {
                final int statusCode = resp.getStatusLine().getStatusCode();
                this.lastHttpStatus = statusCode;

                final boolean movedResponse = statusCode == HttpStatus.SC_MOVED_PERMANENTLY
                        || statusCode == HttpStatus.SC_MOVED_TEMPORARILY;
                if (movedResponse) {
                    final Header locationHeader = resp.getFirstHeader("Location");
                    return (locationHeader == null) ? null : locationHeader.getValue();
                }
                // Every other response (error or not) ends the attempts without a location.
                return null;
            } catch (IOException ex) {
                ex.printStackTrace();
            } finally {
                if (req != null) {
                    try {
                        req.abort();
                    } catch (Exception ignored) {
                        // best-effort cleanup
                    }
                }
            }
        }
        return null;
    }
    
    /**
     * Deserializes a single object from a byte array produced by Java object serialization.
     *
     * <p>NOTE(security): this uses Java-native deserialization. It must only be fed bytes from a
     * trusted source; untrusted input can trigger arbitrary code paths during
     * {@code readObject()}.
     *
     * @param bytes serialized form of exactly one object; must not be {@code null}
     * @return the deserialized object
     * @throws NullPointerException if {@code bytes} is {@code null}
     * @throws Exception            an {@code IOException} for a corrupt or truncated stream, or a
     *                              {@code ClassNotFoundException} for an unknown class
     */
    public static Object bytesToObject(byte[] bytes) throws Exception {
        // try-with-resources closes the stream on every path, including a failing readObject().
        try (ObjectInputStream objectInputStream = new ObjectInputStream(new ByteArrayInputStream(bytes))) {
            return objectInputStream.readObject();
        }
    }


    /**
     * Downloads {@code remoteFileName} with a GET request and stores the body in
     * {@code localFileName}.
     *
     * <p>The body is streamed straight to the file rather than buffered in memory, so the size of
     * the download is not limited by the heap. If writing fails after the file has been opened,
     * the incomplete file is deleted. The request is always aborted afterwards.
     *
     * @param remoteFileName address of the file to download
     * @param localFileName  path of the local file to create or overwrite
     * @return {@code true} when the response status is 200, an entity is present and it was
     *         written completely; {@code false} for any other status or on any exception
     */
    public boolean downLoad(String remoteFileName, String localFileName) {
        final HttpGet req = newGetRequest(remoteFileName, null, null);
        try (CloseableHttpResponse resp = FutureTaskHelper.getInstance().executeHttpUrgentTask(
                new FutureTaskHelper.Task<CloseableHttpResponse>() {
                    @Override
                    public CloseableHttpResponse execute() throws Exception {
                        return client.execute(req, httpCtx);
                    }
                }, maxReadMillSeconds)) {
            final int status = resp.getStatusLine().getStatusCode();
            final HttpEntity entity = resp.getEntity();
            if (status != HttpStatus.SC_OK || entity == null) {
                return false;
            }

            final File storeFile = new File(localFileName);
            boolean opened = false;
            boolean completed = false;
            try (final FileOutputStream output = new FileOutputStream(storeFile)) {
                opened = true;
                // Copies the body in chunks instead of materialising it as one byte array.
                entity.writeTo(output);
                completed = true;
            } finally {
                // The stream is already closed here. Only remove a file this call started
                // writing; never touch the path when it could not even be opened.
                if (opened && !completed) {
                    storeFile.delete();
                }
            }
            return true;
        } catch (Exception e) {
            // Deliberately reported through the return value only.
            return false;
        } finally {
            if (req != null) {
                try {
                    req.abort();
                } catch (Exception ignored) {
                    // best-effort cleanup
                }
            }
        }
    }
    
 
	 /**
	  * Manual smoke-test entry point; currently does nothing.
	  *
	  * <p>The sample below is kept commented out. Parts of the cookie string were mangled by
	  * e-mail obfuscation (the {@code [email protected]} fragments) and cannot be used as-is.
	  *
	  * @param args unused
	  */
	 public static void main(String[] args) throws Exception{
//		 FetchWebData fetch  = new FetchWebData();
//		 fetch.setEncoding("GBK");
//		 //fetch.setStrCookie("SUV=1469117584605270; IPLOC=CN3100; ssuid=8162140256; CXID=93F24933B1CF48682C40DA22E37CB8A2; [email protected]@@@@@@@@@; SUID=94F455655FC00D0A000000005790F48F; [email protected]@@@@@@@@@; ABTEST=0|1509435669|v1; weixinIndexVisited=1; JSESSIONID=aaaU39RywHKY27cilGv8v; PHPSESSID=5mje3e3ttdmhcg177csj7pp1h2; SUIR=E193958732376EE1C097CD153376B26F; SNUID=384B4F5DEAEFB7325DA2765EEAB9CF43; sct=30");
//
//		 // use a proxy taken from the pool
//		 Proxy proxy = ProxyUtil.getProxyFromPool();
//		 fetch.enableProxy(proxy.proxyHost, proxy.proxyPort,proxy.username,proxy.password);
//
////		 String content = fetch.getGzipContent("http://weixin.sogou.com/weixin?type=1&s_from=input&query=%E6%89%8B%E6%9C%BA%E5%88%9B%E4%B8%9A&ie=utf8&_sug_=n&_sug_type_=","");
//		 System.exit(0);
//		 //System.out.println(fetch.getRedirectUrl());
	 }
}