活躍度的爬蟲開發(一)
阿新 • • 發佈:2018-12-31
爬蟲最簡單的實現就是一個http連線request,然後解析response,最後根據樣式或者什麼規則,進行匹配,然後提取資訊,判斷是否連結其他頁面爬取資訊。
我在GIT上面寫了一個關於通過關鍵字查活躍度的爬蟲,暫時在優化中,暫時只支援CMD查詢。
基礎實現
public SearchDto keyWordSearchTest(String url,String keyWord){ SearchDto seD=new SearchDto(); BufferedReader in =null; OutputStream outputStream = null; String reasponseStr=null; StringBuffer resHtml=new StringBuffer(); String line; try{ URL realUrl =new URL(url); HttpURLConnection urlConnection = (HttpURLConnection) realUrl.openConnection(); urlConnection.setRequestProperty("Host", "s.tool.chinaz.com"); urlConnection.setRequestProperty("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"); urlConnection.setRequestProperty("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); urlConnection.setRequestProperty("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); //urlConnection.setRequestProperty("Accept-Encoding", "gzip, deflate"); urlConnection.setRequestProperty("Connection", "keep-alive"); urlConnection.setRequestProperty("Upgrade-Insecure-Requests", "1"); urlConnection.setRequestProperty("Accept-Charset", "utf-8"); urlConnection.setRequestProperty("contentType", "utf-8"); urlConnection.setDoOutput(true); urlConnection.setDoInput(true); urlConnection.setRequestMethod("POST"); StringBuffer str =new StringBuffer(); str.append("Content-Type: application/x-www-form-urlencoded"); str.append("Content-Length: 23"); str.append("\"\r\n\r\n"); str.append("kw="+keyWord+"&page=1&by=0"); str.append("\"\r\n\r\n"); outputStream=urlConnection.getOutputStream(); outputStream.write(str.toString().getBytes()); urlConnection.connect(); in =new BufferedReader(new InputStreamReader(urlConnection.getInputStream(),"utf-8")); while((line=in.readLine())!=null){ resHtml.append(line); } System.out.println(resHtml); }catch(Exception e){ e.printStackTrace(); ((org.slf4j.Logger) logger).error(e.getMessage()); } Contanst.RegexString(resHtml.toString(), ""); return null; }
基礎實現(用外掛最方便的就是,不用自己去寫那些難記的正則,每次寫正則都要翻文件,頭大)
public SearchDto keyWordSearch(String url,String keyWord,SearchDto searchDto){ CloseableHttpResponse response=null; try{ CloseableHttpClient httpclient = HttpClients.createDefault(); HttpPost post = new HttpPost(url); List<NameValuePair> nvps = new ArrayList <NameValuePair>(); nvps.add(new BasicNameValuePair("kw",keyWord)); nvps.add(new BasicNameValuePair("by","0")); post.setEntity(new UrlEncodedFormEntity(nvps,Consts.UTF_8)); post.setHeader("Host", "s.tool.chinaz.com"); post.setHeader("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0"); post.setHeader("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"); post.setHeader("Accept-Language", "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3"); post.setHeader("Connection", "keep-alive"); post.setHeader("Upgrade-Insecure-Requests", "1"); post.setHeader("Accept-Charset", "utf-8"); post.setHeader("contentType", "utf-8"); response = httpclient.execute(post); String htmlStr = EntityUtils.toString(response.getEntity()); //System.out.println(htmlStr); Document doc =Jsoup.parse(htmlStr); System.out.println("================"); Elements result=doc.select(".ResultListWrap "); Elements resultList = result.select(".CiListCent.CiRLlist"); String keyName=null; for( org.jsoup.nodes.Element element: resultList){ Elements urlList = element.select("a[href]"); keyName=urlList.get(0).text(); //0 為NAME 3為整體指數 4為PC指數 5移動指數 6 為收錄量 7為收錄首位 if(keyWord.equals(keyName)){ searchDto.setAllIndex(urlList.get(3).text()); searchDto.setPcIndex(urlList.get(4).text()); searchDto.setMoveIndex(urlList.get(5).text()); searchDto.setCollNum(urlList.get(6).text()); searchDto.setWebFirst(urlList.get(7).text()); break; } } // System.out.println(resultList.toString()); }catch(Exception e){ logger.error("a"); }finally { try { response.close(); } catch (IOException e) { logger.error(e.getMessage()); } } return null; }