1. 程式人生 > >自動抓取並解析一個商品頁

自動抓取並解析一個商品頁

  • 以美國adidas官網為例。
  • 輸入 URL,抓取商品資訊(標題、描述、圖片等);抓取屬性資訊(顏色、尺碼、價格、庫存、skuId)。
  • 思路很簡單,就是開啟頁面,分析各個需要內容的標籤。

獲取頁面

/**
 * Sends a POST request and parses the response body into a jsoup {@link Document}.
 *
 * @param url        target URL, forwarded to {@code getHttpPostResponse}
 * @param referrer   referrer value, forwarded to {@code getHttpPostResponse}
 * @param params     request parameters, forwarded to {@code getHttpPostResponse}
 * @param httpClient client, forwarded to {@code getHttpPostResponse}
 * @return the response body, decoded as UTF-8 and parsed by jsoup
 * @throws IOException if the request fails or the body cannot be read
 */
public static Document getHttpPostResponseWithDocument(String url, String referrer, List<NameValuePair> params,
                                                       DecompressingHttpClient httpClient) throws IOException {
    HttpResponse response = getHttpPostResponse(url, referrer, params, httpClient);
    try {
        return Jsoup.parse(EntityUtils.toString(response.getEntity(), "UTF-8"));
    } finally {
        // Consume the entity on every path, so a failure while reading or parsing
        // the body does not skip the cleanup that the success path performs.
        EntityUtils.consume(response.getEntity());
    }
}

/**
 * Executes a GET request and returns the raw response.
 *
 * <p>The response entity is not consumed here.
 *
 * @param url        target URL
 * @param referrer   value for the {@code Referer} header; the header is only set when this is not blank
 * @param httpClient client used to execute the request
 * @return the response returned by {@code httpClient.execute}
 * @throws IOException if executing the request fails
 */
public static HttpResponse getHttpGetResponse(String url, String referrer, DecompressingHttpClient httpClient)
        throws IOException {
    HttpGet get = new HttpGet(url);
    // NOTE(review): setHeaders is defined elsewhere; presumably it applies the common request headers - confirm.
    setHeaders(get);
    if (!StringUtils.isBlank(referrer)) {
        get.setHeader("Referer", referrer);
    }
    return httpClient.execute(get);
}

判斷是否有貨

/**
 * Checks whether the current page contains an add-to-cart button.
 *
 * <p>Selects the elements matching {@code .addtocart} from {@code doc} and looks for the
 * text {@code add-to-cart-button} in their string form.
 *
 * @return {@code true} only when at least one {@code .addtocart} element exists and its
 *         string form contains {@code add-to-cart-button}
 */
public boolean isInStock() {
    Elements cartArea = doc.select(".addtocart");
    boolean hasCartArea = cartArea != null && !cartArea.isEmpty();
    return hasCartArea && cartArea.toString().contains("add-to-cart-button");
}

顏色獲取

/**
 * Fetches the product page at {@code url} and collects its colour variations into {@code colorMap}.
 *
 * <p>Steps: fetch the page, check stock via {@link #isInStock()}, log the currently selected
 * colour (from {@code .product-color}), then add one entry to {@code colorMap} for every
 * {@code .color-variations-thumb-color} element inside a {@code .color-variation-row},
 * keyed by its {@code data-articleno} value with its {@code title} as the value.
 *
 * @param url      product page URL, passed to {@code getOneSkuInfoPage}
 * @param colorMap output map that receives article number to colour title entries
 * @return {@code ExecInfo.succ()} on success; {@code ExecInfo.fail(...)} when the page fetch
 *         fails, the product is out of stock, or no {@code .product-color} element is found
 */
public ExecInfo parse(String url, Map<String, String> colorMap) {
    ExecResult<Document> execResult = getOneSkuInfoPage(url);
    if (!execResult.isSucc()) {
        // Stop here: without a successfully fetched page there is nothing to parse.
        LogUtils.info(execResult.getMsg());
        return ExecInfo.fail(execResult.getMsg());
    }
    // NOTE(review): the code below reads the `doc` field rather than execResult;
    // presumably getOneSkuInfoPage populates `doc` - confirm.
    if (!isInStock()) {
        LogUtils.info("out of stock!");
        return ExecInfo.fail("out of stock!");
    }

    Elements curColorElements = doc.select(".product-color");
    if (null == curColorElements || curColorElements.isEmpty()) {
        return ExecInfo.fail("獲取當前商品顏色資訊失敗");
    }

    // The current colour is only logged; it is not added to colorMap here.
    String currentColorHtml = curColorElements.toString();
    Matcher currentColorMatcher =
            Pattern.compile("<span class=\"product-color-clear\">([^<]*)</span>").matcher(currentColorHtml);
    Matcher currentSkuMatcher = Pattern.compile("\\(([0-9A-Za-z]*)\\)").matcher(currentColorHtml);
    if (currentColorMatcher.find() && currentSkuMatcher.find()) {
        LogUtils.info("CURRENT COLOR: " + currentSkuMatcher.group(1) + ", " + currentColorMatcher.group(1));
    }

    // Compile the per-variation patterns once instead of on every loop iteration.
    final Pattern variationSkuPattern = Pattern.compile("data-articleno=\"([0-9A-Za-z]*)");
    final Pattern variationTitlePattern = Pattern.compile("title=\"([^\"]*)");

    Elements variationRows = doc.select(".color-variation-row");
    if (null != variationRows) {
        for (Element row : variationRows) {
            for (Element colorElement : row.select(".color-variations-thumb-color")) {
                String colorHtml = colorElement.toString();
                Matcher skuMatcher = variationSkuPattern.matcher(colorHtml);
                Matcher titleMatcher = variationTitlePattern.matcher(colorHtml);
                if (skuMatcher.find() && titleMatcher.find()) {
                    colorMap.put(skuMatcher.group(1), titleMatcher.group(1));
                }
            }
        }
    }
    LogUtils.info(colorMap.toString());
    return ExecInfo.succ();
}