1. 程式人生 > 微醫網爬蟲(一) Java 實現

微醫網爬蟲(一) Java 實現

爬取微醫網醫生的基本資料,獲取每個醫生的URL之後,可以使用以下方法解析:

想要採集醫生歷史問診詳細資訊的同學可以移步我們另一篇部落格:傳送門

/**
 * Downloads a doctor's profile page with jsoup and copies the fields it can find
 * into a new {@code Doctor}.
 *
 * <p>Every section of the page is treated as optional: when an element is missing,
 * the corresponding {@code Doctor} property is simply left untouched. Progress is
 * echoed to standard output (the id, the name when found, and a completion marker).
 *
 * <p>If the page cannot be fetched, the {@link IOException} is printed and the
 * method still returns the {@code Doctor}, which then carries only its id.
 *
 * @param url address of the doctor's profile page; the doctor id is taken to be
 *            everything from character index 30 onward
 *            (NOTE(review): assumes a fixed-length URL prefix of 30 characters —
 *            confirm against the URLs callers actually pass in)
 * @return the populated {@code Doctor}; never {@code null}
 * @throws StringIndexOutOfBoundsException if {@code url} is shorter than 30 characters
 */
public Doctor getDoctorInfor(String url) {
        Doctor doctor = new Doctor();
        // Extract the id: the tail of the URL after the first 30 characters.
        doctor.setId(url.substring(30));
        System.out.print("正在獲取:" + doctor.getId() + "\t");
        try {
            Document doc = Jsoup.connect(url).get();
            if (doc != null) {
                Element detail = doc.selectFirst("div[class=detail word-break]");
                if (detail != null) {
                    // Name and job title live inside the <h1> of the detail block.
                    Element h1 = detail.selectFirst("h1");
                    if (h1 != null) {
                        Element nameEl = h1.selectFirst("Strong");
                        Element jobEl = h1.selectFirst("span");
                        if (nameEl != null) {
                            doctor.setName(nameEl.text());
                            System.out.print(nameEl.text() + "\t");
                        }
                        if (jobEl != null) {
                            doctor.setJob(jobEl.text());
                        }
                    }

                    // Expert flag: "1" when the expert-group link is present, otherwise "0".
                    Element expertLink = detail.selectFirst("a[class=expert-group]");
                    doctor.setIsExpert(expertLink != null ? "1" : "0");

                    // Hospital (first link) and department (second link).
                    Element hosDiv = detail.selectFirst("div[class=hospital]");
                    if (hosDiv != null) {
                        Elements hosLinks = hosDiv.select("a");
                        // Check the size before indexing: the original get(1) threw
                        // IndexOutOfBoundsException when only one link was present.
                        if (hosLinks.size() >= 1) {
                            doctor.setHospital(hosLinks.get(0).text());
                        }
                        if (hosLinks.size() >= 2) {
                            doctor.setRoom(hosLinks.get(1).text());
                        }
                    }

                    // Fields of expertise.
                    Element goodDiv = detail.selectFirst("div[class=goodat]");
                    if (goodDiv != null) {
                        Element goodSpan = goodDiv.selectFirst("span");
                        if (goodSpan != null) {
                            doctor.setGoodAt(goodSpan.text());
                        }
                    }

                    // Summary: prefer the link's data-description attribute when a link
                    // exists, otherwise fall back to the text of the first <span>.
                    Element aboutDiv = detail.selectFirst("div[class=about]");
                    if (aboutDiv != null) {
                        Element aboutLink = aboutDiv.selectFirst("a");
                        if (aboutLink != null) {
                            doctor.setSummary(aboutLink.attr("data-description"));
                        } else {
                            Element aboutSpan = aboutDiv.selectFirst("span");
                            if (aboutSpan != null) {
                                doctor.setSummary(aboutSpan.text());
                            }
                        }
                    }
                }

                // Rating, consultation count and appointment count: only read when
                // exactly three <strong> values are present.
                // NOTE(review): the original comment lists "rating, consultations,
                // appointments" while the setters are called as marks, apoint, ask —
                // confirm the second and third values are not swapped.
                Element statusDiv = doc.selectFirst("div[class=status]");
                if (statusDiv != null) {
                    Element dataDiv = statusDiv.selectFirst("div[class=data]");
                    if (dataDiv != null) {
                        Elements strong = dataDiv.select("strong");
                        if (strong.size() == 3) {
                            doctor.setMarks(strong.get(0).text());
                            doctor.setApoint(strong.get(1).text());
                            doctor.setAsk(strong.get(2).text());
                        }
                    }
                }

                // Follower count.
                Element summaryDiv = doc.selectFirst("div[class=summary]");
                if (summaryDiv != null) {
                    Element markSpan = summaryDiv.selectFirst("span[class=mark-count]");
                    if (markSpan != null) {
                        doctor.setFocus(markSpan.text());
                    }
                }

                // Prices: the first character of each price text is dropped
                // (presumably a currency symbol — verify against the live page).
                Element priceDiv = doc.selectFirst("div[class=consult-type]");
                if (priceDiv != null) {
                    Elements prices = priceDiv.select("p[class=current-price]");
                    if (prices.size() >= 1) {
                        String first = prices.get(0).text();
                        // Guard against empty text: substring(1) on "" would throw.
                        if (!first.isEmpty()) {
                            doctor.setPrice1(first.substring(1));
                        }
                    }
                    // NOTE(review): the second price is only read when there are
                    // exactly two entries; kept as in the original.
                    if (prices.size() == 2) {
                        String second = prices.get(1).text();
                        if (!second.isEmpty()) {
                            doctor.setPrice2(second.substring(1));
                        }
                    }
                }

                // Number of comments.
                Element commentDiv = doc.selectFirst("section[class=grid-section grid-section-outside expert-comment]");
                if (commentDiv != null) {
                    Element tip = commentDiv.selectFirst("div[class=tip]");
                    if (tip != null) {
                        Element count = tip.selectFirst("strong");
                        if (count != null) {
                            doctor.setComment(count.text());
                        }
                    }
                }

                // Consultation / answer counts (currently disabled, see TODO below).
                Element historyAsk = doc.selectFirst("section[class=grid-section grid-section-outside expert-history-ask J_ExpertHistoryAsk]");
                if (historyAsk != null) {
                    Element tipLink = historyAsk.selectFirst("a[class=tip]");
                    if (tipLink != null) {
                        // Kept so the disabled call below can be re-enabled as-is.
                        String aurl = tipLink.attr("href");
                        //getSomNum(aurl, doctor);
                        // getSomNumb(aurl,doctor);    //TODO: restore this line when the consultation and answer counts are needed
                    }
                }

                // Service availability flags. All four default to "0" and are then
                // overwritten from the class attribute of each link in the service block.
                Element service = doc.selectFirst("div[id=service]");
                if (service != null) {
                    Elements serviceLinks = service.select("a");
                    doctor.setIsGhuahao("0");
                    doctor.setIsTuwen("0");
                    doctor.setIsShihua("0");
                    doctor.setIsFuwu("0");
                    for (Element link : serviceLinks) {
                        String cssClass = link.attr("class");
                        // isActive is defined elsewhere; presumably it maps the class
                        // string to "1"/"0" — verify against its implementation.
                        if (cssClass.contains("guahao")) {
                            doctor.setIsGhuahao(isActive(cssClass));
                        }
                        if (cssClass.contains("tuwen")) {
                            doctor.setIsTuwen(isActive(cssClass));
                        }
                        if (cssClass.contains("shipin")) {
                            doctor.setIsShihua(isActive(cssClass));
                        }
                        if (cssClass.contains("servicePkg")) {
                            // Fixed: the original called setIsTuwen here, which clobbered
                            // the tuwen flag and left isFuwu permanently at "0".
                            doctor.setIsFuwu(isActive(cssClass));
                        }
                    }
                }
            }
        } catch (IOException e) {
            // Best-effort scrape: report the failure and return what we have.
            e.printStackTrace();
        }
        System.out.println("---->完成");
        return doctor;
    }

需要原始碼的同學可以聯絡博主QQ(1477517404)。爬取結果: