使用golang+代理IP+goquery開發爬蟲(爬取國外電影網站)
阿新 • • 發佈:2018-11-24
package main import ( "fmt" "github.com/PuerkitoBio/goquery" "net/http" "net/url" "time" "strconv" "strings" "log" ) func main() { for k:= 206044;k<300000;k++{ Bluray(k) } } func Between(str, starting, ending string) string { s := strings.Index(str, starting) if s < 0 { return "" } s += len(starting) e := strings.Index(str[s:], ending) if e < 0 { return "" } return str[s : s+e] } func Bluray(i int){ req_url := "https://www.blu-ray.com/movies/The-Meg-4K-Blu-ray/" + strconv.Itoa(i) LABEL1: fmt.Println("start id:",i) ipAddress := httpGet()//這裡獲取代理IP 返回eg:127.0.0.1:6666 proxy := func(_ *http.Request) (*url.URL, error) { return url.Parse("http://"+ipAddress) } transport := &http.Transport{Proxy: proxy} c := &http.Client{Transport: transport,Timeout:30*time.Second} req, err := http.NewRequest("GET", req_url, nil) if err != nil {//這裡處理異常方式有些不恰當,暫時對golang error沒有詳細瞭解,所以直接goto了 goto LABEL1 }, res, err := c.Do(req) if err != nil { goto LABEL1 } doc, err := goquery.NewDocumentFromReader(res.Body) if err != nil { goto LABEL1 } res.Body.Close() amazonAddress,al := doc.Find("#movie_buylink").Attr("href") if al{ request, err := http.NewRequest("GET", amazonAddress, nil) if err != nil { log.Fatal(err) } resp, err := c.Do(request) if err != nil { log.Fatal(err) } baseURI := resp.Request.URL.Path//這裡被坑了一下,開始使用resp.Request.URL獲取網頁baseuri死活轉不了string,後來看了下原始碼發現path是返回string baseARR := strings.Split(baseURI, "/") asin := baseARR[len(baseARR)-1] fmt.Println(asin) resp.Body.Close() } _,fl := doc.Find("div[itemprop=review][itemscope][itemtype]").Attr("itemtype")//網頁結構不一,這裡使用兩條路徑判斷 var xpath = "td[width='728'][style='padding-top: 3px'][bgcolor='#ffffff']>span.subheading" var xpath2 = "td[width='728'][style='padding-top: 3px'][bgcolor='#ffffff']>table" if fl{ xpath = "div[itemprop=review][itemscope][itemtype]>span.subheading" xpath2 = "div[itemprop=review][itemscope][itemtype]>table" } doc.Find(xpath).Each(func(i int, selection *goquery.Selection) { runtime := selection.Find("#runtime").Text() fmt.Println(runtime) selection.Find("a").Each(func(i int, se *goquery.Selection) { a,_ := se.Attr("href") if strings.Contains(a,"https://www.blu-ray.com/movies/movies.php?studioid="){ studio := se.Text() fmt.Println(studio) } if strings.Contains(a,"https://www.blu-ray.com/movies/movies.php?year="){ years := se.Text() fmt.Println(years) } if strings.Contains(a,"https://www.blu-ray.com/movies/releasedates.php?year="){ uptime := se.Text() fmt.Println(uptime) } }) }) doc.Find(xpath2).Each(func(i int, selection *goquery.Selection) { title := selection.Find("h1[itemprop=itemReviewed]").Text() fmt.Println(title) state,_ := selection.Find("img[src][width][height][title][alt][style]").Attr("title") fmt.Println(state) style,_ := selection.Attr("style") if strings.Contains(style,"margin-bottom: 10px; "){ subtitles := selection.Find("#longsubs").Text() newstr := string([]rune(subtitles)[:]) arr := strings.Split(newstr, "(less)") fmt.Println(arr[0]) audio := selection.Find("#longaudio").Text() newstr2 := string([]rune(audio)[:]) arr2 := strings.Split(newstr2, "(less)") fmt.Println(arr2[0]) //fmt.Println(strings.TrimSpace(audio)) html,_ := selection.Find("td[width='228px']").Html() fi := strings.Split(html, "<br/>")//這裡是為了獲取<br>之間的內容,所以直接切分了 for j:=0;j<len(fi);j++{ if strings.Contains(fi[j],"Codec"){ codec := string([]rune(fi[j])[7:])//這裡也遇坑了,golang中文字元處理必須的rune轉換一下,不然無效 fmt.Println(strings.TrimSpace(codec)) } if strings.Contains(fi[j],"Resolution"){ resolution := string([]rune(fi[j])[11:]) fmt.Println(strings.TrimSpace(resolution)) } if strings.Contains(fi[j],"Region"){ playback := string([]rune(fi[j])[7:]) num := strings.Index(playback, "<") if num != -1{ playback = string([]rune(fi[j])[7:7+num]) } fmt.Println(strings.TrimSpace(playback)) } } } }) fmt.Println("--------------------") return //這裡可以根據實際情況返回一個結構體 ,暫時沒寫返回值 }
第一個爬蟲遇坑比較多,golang瞭解不足,寫得比較粗糙跟耗資源,歡迎大家指正!