比較爬蟲用的語言Python與Go
阿新 • • 發佈:2018-11-27
腳本語言 開始 os x ont mozilla gecko def response pos
Python是我比較喜歡的語言,莫名地喜歡,最初開始學Python可能只是因為敲錯了網址,哈哈哈~
工作任務是登錄一個網站後臺、爬取數據,再寫入服務器的Redis中。同事認為我會用PHP來寫,哼!讓你猜到那該多沒意思,於是乎有了如下Python的代碼,你看50多行就搞定了。
#!/usr/bin/python3
"""Scrape the order table of the management backend into Redis.

The script fetches a session cookie, logs in with it, downloads the first
page of the order list and stores, per product, the summed buy and sell
volume under ``redis_order_count_u_<code>`` / ``redis_order_count_d_<code>``.
"""
import re

import redis
import requests
from pyquery import PyQuery as pq

loginUrl = 'https://manage.xxx.com.cn/home/login'
# NOTE(review): this host differs from the one in loginUrl (xxx.cn vs
# xxx.com.cn) -- confirm the session cookie is valid for both.
dataUrl = 'https://manage.xxx.cn/?pageNo=1&pageSize=100'
userName = 'xxx'
passWord = 'xxx'

redisServer = '192.168.0.2'
redisPort = 6379
redisPass = ''

# Seconds to wait for the backend before giving up instead of hanging forever.
requestTimeout = 30

# Product name as shown in the table -> product code used in the Redis keys.
productList = {'椰油': 'CL_Spot', '咖啡': 'COFFEE', '工業銅': 'COPPER'}
# Product code -> [buy volume, sell volume].
volumeList = {'CL_Spot': [0, 0], 'COFFEE': [0, 0], 'COPPER': [0, 0]}
# Direction cell -> index into the per-product volume pair.
sideIndex = {'買': 0, '賣': 1}

# Non-greedy and DOTALL so that each comment is removed on its own, even when
# it spans several lines or shares a line with another comment.
commentPattern = re.compile(r'<!--.*?-->', re.S)
cellPattern = re.compile(r'<td>(.*?)</td>')


def main():
    """Log in, aggregate the order table and write the totals to Redis."""
    jsessionid = getCookie()
    doLogin(jsessionid)

    cookies = {'JSESSIONID': jsessionid}
    r = requests.get(dataUrl, cookies=cookies, timeout=requestTimeout)
    r.raise_for_status()
    dom = pq(r.text)
    for line in dom('table').eq(1).find('tr').items():
        # Drop HTML comments first so commented-out cells are not counted.
        group = cellPattern.findall(commentPattern.sub('', str(line)))
        # Rows without <td> cells, or with too few of them, cannot be indexed.
        if len(group) < 9:
            continue
        # Products outside productList are not tracked; skip them rather than
        # abort the whole run.
        productCode = productList.get(group[3])
        side = sideIndex.get(group[6])
        if productCode is None or side is None:
            continue
        volumeList[productCode][side] += int(group[7]) * int(group[8])

    redisClient = redis.Redis(host=redisServer, port=redisPort, password=redisPass)
    for code, (bought, sold) in volumeList.items():
        redisClient.set('redis_order_count_u_%s' % code, int(bought))
        redisClient.set('redis_order_count_d_%s' % code, int(sold))


def getCookie():
    """Request the login page and return the JSESSIONID cookie it sets.

    Raises KeyError when the response carries no JSESSIONID cookie.
    """
    ua = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    r = requests.get(loginUrl, headers=ua, timeout=requestTimeout)
    r.raise_for_status()
    return r.cookies['JSESSIONID']


def doLogin(jsessionid):
    """Post the credentials so the session identified by jsessionid is logged in."""
    param = {'userName': userName, 'password': passWord}
    cookies = {'JSESSIONID': jsessionid}
    r = requests.post(loginUrl, data=param, cookies=cookies, timeout=requestTimeout)
    r.raise_for_status()


if __name__ == '__main__':
    main()
另一個服務也有同樣的需求,就用最近在看的Golang再實現了一次,瞧,寫了100多行
1 package main 2 3 import ( 4 "fmt" 5 "net/http" 6 "net/url" 7 "os" 8 "strings" 9 "strconv" 10 "gopkg.in/redis.v4" 11 "github.com/PuerkitoBio/goquery" 12 ) 13 14 var loginUrl string = "https://manage.xxx.com.cn/home/login" 15 var dataUrl string = "https://manage.xxx.com.cn/?pageNo=1&pageSize=100" 16 var userName string = "xxx" 17 var passWord string = "xxx" 18 var redisServer string = "192.168.1.2" 19 var redisPort string = "6379" 20 var redisPass string = "" 21 var redisDB int = 0 22 23 func main() { 24 productList := make(map[string] string) 25 productList["椰油"] = "CL_Spot" 26 productList["咖啡"] = "COFFEE" 27 productList["工業銅"] = "COPPER" 28 volumeList := make(map[string] int) 29 volumeList["u_CL_Spot"] = 0 30 volumeList["d_CL_Spot"] = 0 31 volumeList["u_COFFEE"] = 0 32 volumeList["d_COFFEE"] = 0 33 volumeList["u_COPPER"] = 0 34 volumeList["d_COPPER"] = 0 35 jsessionid := getCookie() 36 doLogin(jsessionid) 37 38 request, err := http.NewRequest("GET", dataUrl, nil) 39 request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid}) 40 client := &http.Client{} 41 response, err := client.Do(request) 42 if err != nil { 43 fmt.Println(err.Error()) 44 os.Exit(0) 45 } 46 defer response.Body.Close() 47 doc, err := goquery.NewDocumentFromReader(response.Body) 48 doc.Find("table").Eq(1).Find("tr").Each(func(i int, tr *goquery.Selection) { 49 td := tr.Find("td") 50 name := td.Eq(3).Text() 51 dir := td.Eq(6).Text() 52 if val, ok := productList[name]; ok { 53 buyNum, _ := strconv.Atoi(td.Eq(7).Text()) 54 buyUnit, _ := strconv.Atoi(td.Eq(8).Text()) 55 num := buyNum * buyUnit 56 cacheKey := "" 57 if dir == "買" { 58 cacheKey = fmt.Sprintf("u_%s", val) 59 } else if dir == "賣" { 60 cacheKey = fmt.Sprintf("d_%s", val) 61 } 62 volumeList[cacheKey] += num 63 } 64 }) 65 redisClient := redis.NewClient(&redis.Options{ 66 Addr: fmt.Sprintf("%s:%s", redisServer, redisPort), 67 Password: redisPass, 68 DB: redisDB, 69 }) 70 for k, v := range volumeList { 71 strKey := 
fmt.Sprintf("redis_order_count_%s", k) 72 redisClient.Set(strKey, int(v), 0) 73 } 74 fmt.Println("puti volume get success") 75 } 76 77 func getCookie() string { 78 jsessionid := "" 79 response, err := http.Get(loginUrl) 80 if err != nil { 81 fmt.Println(err.Error()) 82 os.Exit(0) 83 } 84 defer response.Body.Close() 85 for _, val := range response.Cookies() { 86 if val.Name == "JSESSIONID" { 87 jsessionid = val.Value 88 } 89 } 90 return jsessionid 91 } 92 93 func doLogin(jsessionid string) bool { 94 data := url.Values{} 95 data.Set("userName", userName) 96 data.Add("password", passWord) 97 request, _ := http.NewRequest("POST", loginUrl, strings.NewReader(data.Encode())) 98 request.Header.Add("Content-Type", "application/x-www-form-urlencoded") 99 request.Header.Add("Content-Length", strconv.Itoa(len(data.Encode()))) 100 request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid}) 101 client := &http.Client{} 102 response, err := client.Do(request) 103 if err != nil { 104 fmt.Println(err.Error()) 105 os.Exit(0) 106 } 107 defer response.Body.Close() 108 return true 109 }
Python從實現到上線半天的功夫就搞定了,Go足足搞了1整天,蹩腳又不熟悉的語法讓我學到了很多知識點,最後在Mac上交叉編譯到Linux上執行也給我上了一課。
覺得入門學習這兩門語言挺好,一個是腳本語言另一個是編譯語言,用處都很廣泛。軒軒你準備好了嗎?
比較爬蟲用的語言Python與Go