1. 程式人生 > 比較爬蟲用的語言Python與Go

比較爬蟲用的語言Python與Go

腳本語言 開始 os x ont mozilla gecko def response pos

Python是我比較喜歡的語言,莫名的喜歡。我對Python的學習,起初可能是從敲錯了網址開始的,哈哈哈~

工作的任務是登錄一個網站後臺、爬取數據,並寫入服務器的Redis中。同事認為我會用PHP來寫,哼!讓你猜到那該多沒意思,於是乎有了如下Python的代碼,你看,50多行就搞定了。

 1 #!/usr/bin/python3
 2 import requests
 3 import re
 4 import redis
 5 from pyquery import PyQuery as pq
 6 
 7 loginUrl = https://manage.xxx.com.cn/home/login
 8 userName = 
xxx 9 passWord = xxx 10 11 redisServer = 192.168.0.2 12 redisPort = 6379 13 redisPass = ‘‘ 14 15 productList = {椰油:CL_Spot,咖啡:COFFEE,工業銅:COPPER} 16 volumeList = {CL_Spot:[0, 0], COFFEE:[0, 0], COPPER:[0, 0]} 17 18 def main(): 19 jsessionid = getCookie() 20 doLogin(jsessionid)
21 dataUrl = https://manage.xxx.cn/?pageNo=1&pageSize=100 22 cookies = {JSESSIONID: jsessionid} 23 r = requests.get(dataUrl, cookies = cookies) 24 dom = pq(r.text) 25 lines = dom(table).eq(1).find(tr).items() 26 for line in lines: 27 line = re.sub(r<!--.*-->
, ‘‘, str(line)) 28 pattern = re.compile(r<td>(.*?)</td>) 29 group = pattern.findall(line) 30 if not group: 31 continue 32 productCode = productList[group[3]] 33 if group[6] == : 34 volumeList[productCode][0]+= int(group[7]) * int(group[8]) 35 if group[6] == : 36 volumeList[productCode][1]+= int(group[7]) * int(group[8]) 37 38 redisClient = redis.Redis(host=redisServer, port=redisPort, password=redisPass) 39 for x in volumeList: 40 keyUp = redis_order_count_u_%s % x 41 keyDown = redis_order_count_d_%s % x 42 redisClient.set(keyUp, int(volumeList[x][0])) 43 redisClient.set(keyDown, int(volumeList[x][1])) 44 45 def getCookie(): 46 ua = {user-agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36} 47 r = requests.get(loginUrl, headers = ua) 48 return r.cookies[JSESSIONID] 49 50 def doLogin(jsessionid): 51 param = {userName: userName, password: passWord} 52 cookies = {JSESSIONID: jsessionid} 53 requests.post(loginUrl, data = param, cookies = cookies) 54 55 56 if __name__ == __main__: 57 main()

另一個服務也有這個需求,於是用最近在看的Golang實現了一次,瞧,寫了100多行。

  1 package main
  2 
  3 import (
  4     "fmt"
  5     "net/http"
  6     "net/url"
  7     "os"
  8     "strings"
  9     "strconv"
 10     "gopkg.in/redis.v4"
 11     "github.com/PuerkitoBio/goquery"
 12 )
 13 
 14 var loginUrl string = "https://manage.xxx.com.cn/home/login"
 15 var dataUrl string = "https://manage.xxx.com.cn/?pageNo=1&pageSize=100"
 16 var userName string = "xxx"
 17 var passWord string = "xxx"
 18 var redisServer string = "192.168.1.2"
 19 var redisPort string = "6379"
 20 var redisPass string = ""
 21 var redisDB   int = 0
 22 
 23 func main() {
 24     productList := make(map[string] string)
 25     productList["椰油"] = "CL_Spot"
 26     productList["咖啡"] = "COFFEE"
 27     productList["工業銅"] = "COPPER"
 28     volumeList := make(map[string] int)
 29     volumeList["u_CL_Spot"] = 0
 30     volumeList["d_CL_Spot"] = 0
 31     volumeList["u_COFFEE"] = 0
 32     volumeList["d_COFFEE"] = 0
 33     volumeList["u_COPPER"] = 0
 34     volumeList["d_COPPER"] = 0
 35     jsessionid := getCookie()
 36     doLogin(jsessionid)
 37 
 38     request, err := http.NewRequest("GET", dataUrl, nil)
 39     request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid})
 40     client := &http.Client{}
 41     response, err := client.Do(request)
 42     if err != nil {
 43         fmt.Println(err.Error())
 44         os.Exit(0)
 45     }
 46     defer response.Body.Close()
 47     doc, err := goquery.NewDocumentFromReader(response.Body)
 48     doc.Find("table").Eq(1).Find("tr").Each(func(i int, tr *goquery.Selection) {
 49         td := tr.Find("td")
 50         name := td.Eq(3).Text()
 51         dir := td.Eq(6).Text()
 52         if val, ok := productList[name]; ok {
 53             buyNum, _ := strconv.Atoi(td.Eq(7).Text())
 54             buyUnit, _ := strconv.Atoi(td.Eq(8).Text())
 55             num :=  buyNum * buyUnit
 56             cacheKey := ""
 57             if dir == "" {
 58                 cacheKey = fmt.Sprintf("u_%s", val)
 59             } else if dir == "" {
 60                 cacheKey = fmt.Sprintf("d_%s", val)
 61             }
 62             volumeList[cacheKey] += num
 63         }
 64     })
 65     redisClient := redis.NewClient(&redis.Options{
 66         Addr:     fmt.Sprintf("%s:%s", redisServer, redisPort),
 67         Password: redisPass,
 68         DB:       redisDB,
 69     })
 70     for k, v := range volumeList {
 71         strKey := fmt.Sprintf("redis_order_count_%s", k)
 72         redisClient.Set(strKey, int(v), 0)
 73     }
 74     fmt.Println("puti volume get success")
 75 }
 76 
 77 func getCookie() string {
 78     jsessionid := ""
 79     response, err := http.Get(loginUrl)
 80     if err != nil {
 81         fmt.Println(err.Error())
 82         os.Exit(0)
 83     }
 84     defer response.Body.Close()
 85     for _, val := range response.Cookies() {
 86         if val.Name == "JSESSIONID" {
 87             jsessionid = val.Value
 88         }
 89     }
 90     return jsessionid
 91 }
 92 
 93 func doLogin(jsessionid string) bool {
 94     data := url.Values{}
 95     data.Set("userName", userName)
 96     data.Add("password", passWord)
 97     request, _ := http.NewRequest("POST", loginUrl, strings.NewReader(data.Encode()))
 98     request.Header.Add("Content-Type", "application/x-www-form-urlencoded")
 99     request.Header.Add("Content-Length", strconv.Itoa(len(data.Encode())))
100     request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid})
101     client := &http.Client{}
102     response, err := client.Do(request)
103     if err != nil {
104         fmt.Println(err.Error())
105         os.Exit(0)
106     }
107     defer response.Body.Close()
108     return true
109 }

Python從實現到上線,半天的功夫就搞定了;Go足足搞了一整天,蹩腳的寫法與不熟悉的語法讓我學到了很多知識點,最後在Mac上編譯、到Linux上執行也給我上了一課。

我覺得入門學習這兩門語言挺好,一個是腳本語言,另一個是編譯型語言,用處都很廣泛。軒軒你準備好了嗎?

比較爬蟲用的語言Python與Go