1. 程式人生 > >活躍度的爬蟲開發(一)

活躍度的爬蟲開發(一)

      爬蟲最簡單的實現就是發起一個 HTTP 請求(request),然後解析 response,最後根據樣式或者其他規則進行匹配、提取資訊,並判斷是否需要連結到其他頁面繼續爬取資訊。

      我在 Git 上寫了一個通過關鍵字查詢活躍度的程式,目前還在優化中,暫時支援 CMD 查詢。

       基礎實現

/**
 * Posts a keyword query to {@code url} with {@link HttpURLConnection}, prints the raw
 * response HTML to standard output and hands it to {@code Contanst.RegexString}.
 *
 * <p>The form body is {@code kw=<keyWord>&page=1&by=0}, with the keyword URL-encoded as
 * UTF-8. Failures are logged and not rethrown; whatever was read before the failure
 * (possibly nothing) is still passed to {@code Contanst.RegexString}.
 *
 * @param url     address the form is posted to
 * @param keyWord keyword sent as the {@code kw} form field
 * @return always {@code null}; this method does not build a {@code SearchDto}
 */
public SearchDto keyWordSearchTest(String url,String keyWord){
		StringBuilder resHtml = new StringBuilder();
		try{
			// Percent-encode the keyword so spaces, '&', '=' and non-ASCII text cannot
			// corrupt the form body, and encode the body explicitly as UTF-8 instead of
			// relying on the platform default charset.
			String form = "kw=" + java.net.URLEncoder.encode(keyWord, "UTF-8") + "&page=1&by=0";
			byte[] body = form.getBytes("UTF-8");

			URL realUrl = new URL(url);
			HttpURLConnection urlConnection = (HttpURLConnection) realUrl.openConnection();
			urlConnection.setRequestMethod("POST");
			urlConnection.setDoOutput(true);
			urlConnection.setDoInput(true);
			urlConnection.setRequestProperty("Host", "s.tool.chinaz.com");
			urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0");
			urlConnection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
			urlConnection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
			urlConnection.setRequestProperty("Connection", "keep-alive");
			urlConnection.setRequestProperty("Upgrade-Insecure-Requests", "1");
			urlConnection.setRequestProperty("Accept-Charset", "utf-8");
			// Content-Type is sent as a real request header; the body carries only the form fields.
			urlConnection.setRequestProperty("Content-Type", "application/x-www-form-urlencoded; charset=UTF-8");

			// Obtaining the output stream opens the connection, so no explicit connect() is needed.
			try (OutputStream outputStream = urlConnection.getOutputStream()) {
				outputStream.write(body);
			}

			try (BufferedReader in = new BufferedReader(
					new InputStreamReader(urlConnection.getInputStream(), "utf-8"))) {
				String line;
				while ((line = in.readLine()) != null) {
					resHtml.append(line);
				}
			}
			System.out.println(resHtml);

		}catch(Exception e){
			// Log with the throwable so the stack trace is kept in the log output.
			logger.error("keyWordSearchTest failed for keyword " + keyWord, e);
		}
		Contanst.RegexString(resHtml.toString(), "");

		return null;
}

       基礎實現(用外掛最方便的就是不用自己去寫那見鬼的正則,每次寫正則都要翻文件,頭大)

/**
 * Posts a keyword query to {@code url} with Apache HttpClient, parses the returned HTML
 * with Jsoup and copies the figures of the result row whose name equals {@code keyWord}
 * into {@code searchDto}.
 *
 * <p>Rows are selected with {@code .ResultListWrap} / {@code .CiListCent.CiRLlist}; within a
 * row the {@code a[href]} anchors are read by position. Rows with fewer than eight anchors
 * are skipped. Failures are logged and not rethrown, in which case {@code searchDto} is
 * returned with whatever was set before the failure.
 *
 * @param url       address the form is posted to
 * @param keyWord   keyword sent as the {@code kw} form field and matched against row names
 * @param searchDto object that receives the extracted values
 * @return the {@code searchDto} argument
 */
public SearchDto keyWordSearch(String url,String keyWord,SearchDto searchDto){
		// Both the client and the response are closed automatically, including when
		// execute() throws before a response exists.
		try (CloseableHttpClient httpclient = HttpClients.createDefault()) {
			HttpPost post = new HttpPost(url);
			List<NameValuePair> nvps = new ArrayList<NameValuePair>();
			nvps.add(new BasicNameValuePair("kw",keyWord));
			nvps.add(new BasicNameValuePair("by","0"));
			post.setEntity(new UrlEncodedFormEntity(nvps,Consts.UTF_8));
			post.setHeader("Host", "s.tool.chinaz.com");
			post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0");
			post.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8");
			post.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3");
			post.setHeader("Connection", "keep-alive");
			post.setHeader("Upgrade-Insecure-Requests", "1");
			post.setHeader("Accept-Charset", "utf-8");
			post.setHeader("contentType", "utf-8");

			String htmlStr;
			try (CloseableHttpResponse response = httpclient.execute(post)) {
				// Fall back to UTF-8 when the response does not declare a charset.
				htmlStr = EntityUtils.toString(response.getEntity(), Consts.UTF_8);
			}

			Document doc = Jsoup.parse(htmlStr);
			System.out.println("================");
			Elements result = doc.select(".ResultListWrap ");
			Elements resultList = result.select(".CiListCent.CiRLlist");
			for (org.jsoup.nodes.Element element : resultList) {
				Elements urlList = element.select("a[href]");
				// Anchor positions: 0 = name, 3 = overall index, 4 = PC index,
				// 5 = mobile index, 6 = indexed count, 7 = first-ranked site.
				if (urlList.size() < 8) {
					// Row does not have the expected columns; skip it instead of
					// failing on an out-of-range index.
					continue;
				}
				String keyName = urlList.get(0).text();
				if (keyWord.equals(keyName)) {
					searchDto.setAllIndex(urlList.get(3).text());
					searchDto.setPcIndex(urlList.get(4).text());
					searchDto.setMoveIndex(urlList.get(5).text());
					searchDto.setCollNum(urlList.get(6).text());
					searchDto.setWebFirst(urlList.get(7).text());
					break;
				}
			}
		} catch (Exception e) {
			// Log with the throwable so the cause is not lost.
			logger.error("keyWordSearch failed for keyword " + keyWord, e);
		}

		return searchDto;
}