1. 程式人生 > >censys 資料庫地理資訊自定義介面(python版)

censys 資料庫地理資訊自定義介面(python版)

公司內部的ip資訊庫覆蓋面不是很夠
導致日誌處理的時候ip經常查不到
有人推薦,censys比較權威,
但是沒有文件,而且介面不太好用,所以自己寫了一個查ip的介面
首先
到官網逛了逛,censys特殊之處在於註冊了才能用api
註冊以後有Secret,API_ID,在查詢時需要用到
百度了一下發現也沒什麼有用的教程,又看了看官方的介紹以及原始碼
得到了最初的版本

import censys
from censys import *

# Credentials issued after registering a Censys account.
Secret = ""
API_ID = ""

# Build the IPv4 API client and fetch the record for `ip`
# (`ip` must already hold the address to look up).
api = censys.ipv4.CensysIPv4(api_id=API_ID, api_secret=Secret)
res = api.view(ip)
geo = res['location']

後來發現,這個庫不是專業的地理資訊庫,這樣查詢很多ip的地址view不到。。
但是在網頁上面卻是可以顯示地理資訊的,想了想,準備直接用url發請求

import requests
# Request the Censys host page for `ip`; the (API_ID, Secret) tuple is sent
# by requests as HTTP basic-auth credentials.
url="https://www.censys.io/ipv4/%s"%ip
res = requests.get(url, auth=(API_ID, Secret))
# Raw response body, i.e. the HTML that gets parsed afterwards.
s=res.content

也可以用urllib2

import json
import urllib2

# Send the credentials as a JSON request body and read back the response.
values = {"user": API_ID, "passwd": Secret}
jdata = json.dumps(values)
req = urllib2.Request(url, jdata)
response = urllib2.urlopen(req)
s = response.read()

兩者差不太多吧,我用的是第一種
之後就是解析html了
上網找了找,發現神器bs4
搞了搞發現好方便,直接貼程式碼

from bs4 import BeautifulSoup
# Parse the fetched HTML `s` using the html5lib parser backend.
soup = BeautifulSoup(s, "html5lib")

這樣html就被解析出來了,結合censys返回的html,可以解析出地理資訊

# Find the <dl> elements whose class attribute is "dl-horizontal dl-hostbox".
b = soup.find_all("dl", "dl-horizontal dl-hostbox")
if len(b) == 0:
    print("not found")
else:
    # Only index b[0] when a box was actually found; its <dd> children
    # are what the following parsing steps read the fields from.
    geo = b[0].find_all('dd')

接下來繼續解析出所需各項

lat_long = geo[3].string.split(',')
# Split on the LAST space only, so a multi-word name such as
# "United States (US)" becomes ["United States", "(US)"] instead of
# ["United", "States", "(US)"].
country = geo[2].string.rsplit(' ', 1)

json_data = {
    "ip": ip,
    "latitude": float(lat_long[0]),
    "country": str(country[0]),
    # Drop the surrounding parentheses from "(US)".
    "country_code": str(country[1][1:-1]),
    "longitude": float(lat_long[1]),
    "province": str(geo[1].string),
    "city": str(geo[0].string),
}

這樣也就可以用了
但是。。
有的ip地理資訊竟然是殘缺的,所以根據實際情況修改了一下解析過程
思路很簡單,就是如果geo中五項都有,就直接過,缺項的時候,就看看有什麼,然後加什麼,沒有的用“unknow”代替

try:
    # Fast path: all five <dd> entries are present, so read them by position.
    city = str(geo[0].string)
    provice = str(geo[1].string)
    # Split on the last space only so multi-word country names stay intact.
    country = geo[2].string.rsplit(' ', 1)
    lat_long = geo[3].string.split(',')
    Timezone = str(geo[4].string)
except (IndexError, AttributeError):
    # IndexError: fewer than five <dd> entries.
    # AttributeError: a <dd> whose .string is None.
    print("did not get enough info at ip%s" % ip)

    city = "unknow"
    provice = "unknow"
    # The code placeholder is parenthesised so that the [1:-1] slice below
    # yields "unknow" rather than "nkno".
    country = ["unknow", "(unknow)"]
    lat_long = [0.0, 0.0]

    # Pair every <dt> label with the <dd> at the same position and pick the
    # fields by label; labels that are not needed are simply ignored.
    for dt, dd in zip(b[0].find_all('dt'), geo):
        if dd.string is None:
            continue
        label = dt.string
        if label == "City":
            city = str(dd.string)
        elif label == "Province":
            provice = str(dd.string)
        elif label == "Country":
            country = dd.string.rsplit(' ', 1)
        elif label == "Lat/Long":
            lat_long = dd.string.split(',')

json_data = {
    "ip": ip,
    "latitude": float(lat_long[0]),
    "country": str(country[0]),
    "country_code": str(country[1][1:-1]),
    "longitude": float(lat_long[1]),
    "province": provice,
    "city": city,
}

這樣完整的地理資訊就解析出來了
這裡放一下完整的api


import json
import urllib2
import censys
from censys import *
import requests
from bs4 import BeautifulSoup

class censys_ip(object):
    """Look up geographic information for IPv4 addresses through Censys.

    The Censys API client is tried first; when that lookup fails for any
    reason, the host's web page is fetched and its location box is parsed
    instead. Set ``API_ID`` and ``Secret`` to the credentials of a registered
    Censys account before use.
    """

    debug = False
    Secret = ""
    API_ID = ""

    # Labels of the location box, in the order the page lists them.
    _GEO_FIELDS = ("City", "Province", "Country", "Lat/Long", "Timezone")
    # Placeholder for any field that could not be determined.
    _UNKNOWN = "unknow"

    def __init__(self):
        self.api = censys.ipv4.CensysIPv4(api_id=self.API_ID, api_secret=self.Secret)

    @classmethod
    def _split_country(cls, text):
        """Split ``"United States (US)"`` into ``("United States", "US")``.

        Text without a trailing parenthesised code is returned unchanged,
        paired with the unknown placeholder as the code.
        """
        text = text.strip()
        # Split on the last space only so multi-word names stay intact.
        name, _, code = text.rpartition(" ")
        if name and code.startswith("(") and code.endswith(")"):
            return name, code[1:-1]
        return text, cls._UNKNOWN

    @staticmethod
    def _split_lat_long(text):
        """Parse ``"lat,long"`` into two floats; ``(0.0, 0.0)`` if malformed."""
        try:
            lat, lng = text.split(",")[:2]
            return float(lat), float(lng)
        except ValueError:
            # Raised both by a failed unpack (no comma) and by float().
            return 0.0, 0.0

    @classmethod
    def _parse_geo(cls, ip, labels, values):
        """Build the result dict from the location box's labels and values.

        Args:
            ip: The address being looked up (copied into the result).
            labels: Text of each ``<dt>`` in the box (entries may be None).
            values: Text of each ``<dd>`` in the box (entries may be None).

        Returns:
            A dict with the keys ``ip``, ``latitude``, ``longitude``,
            ``country``, ``country_code``, ``province`` and ``city``.
            Missing text fields are ``"unknow"``; missing coordinates are 0.0.
        """
        if len(values) >= len(cls._GEO_FIELDS):
            # Complete box: read the fields by position.
            found = dict(zip(cls._GEO_FIELDS, values))
        else:
            # Incomplete box: match each value to its label; labels that are
            # not one of the known fields are ignored.
            print("did not get enough info at ip%s" % ip)
            found = {
                label: value
                for label, value in zip(labels, values)
                if label in cls._GEO_FIELDS
            }

        country, country_code = cls._split_country(
            found.get("Country") or cls._UNKNOWN
        )
        latitude, longitude = cls._split_lat_long(found.get("Lat/Long") or "")
        return {
            "ip": ip,
            "latitude": latitude,
            "country": country,
            "country_code": country_code,
            "longitude": longitude,
            "province": found.get("Province") or cls._UNKNOWN,
            "city": found.get("City") or cls._UNKNOWN,
        }

    def censys_html_search(self, ip):
        """Fetch the Censys host page for ``ip`` and parse its location box.

        Returns:
            The geo dict described in ``_parse_geo``, or ``{}`` when the page
            cannot be fetched or contains no location box.
        """
        url = "https://www.censys.io/ipv4/%s" % ip
        try:
            # A timeout keeps one unresponsive request from stalling a batch.
            res = requests.get(url, auth=(self.API_ID, self.Secret), timeout=10)
        except requests.RequestException:
            return {}

        soup = BeautifulSoup(res.content, "html5lib")
        boxes = soup.find_all("dl", "dl-horizontal dl-hostbox")
        if not boxes:
            return {}

        def texts(tags):
            # .string is None when a tag does not wrap exactly one string.
            return [None if t.string is None else str(t.string) for t in tags]

        labels = [
            None if label is None else label.strip()
            for label in texts(boxes[0].find_all("dt"))
        ]
        values = texts(boxes[0].find_all("dd"))
        return self._parse_geo(ip, labels, values)

    def search(self, ip):
        """Return the geo dict for ``ip``: API first, web page as fallback."""
        try:
            geo = self.api.view(ip)["location"]
            return {
                "ip": ip,
                "latitude": float(geo["latitude"]),
                "country": geo["country"],
                "country_code": geo["country_code"],
                "longitude": float(geo["longitude"]),
                "province": geo["province"],
                "city": geo["city"],
            }
        except Exception:
            # Any API failure or missing key falls through to the HTML page.
            # Catching Exception (not a bare except) lets Ctrl-C propagate.
            pass
        return self.censys_html_search(ip)

    def get_geo(self, ip):
        """Look up ``ip`` and print the result.

        Returns:
            1 when geo data was found, -1 otherwise.
        """
        json_data = self.search(ip)
        if not json_data:
            print("can not find ip: %s" % ip)
            return -1

        print("get geo of ip: %s" % ip)
        print(json_data)
        return 1

    def main(self, ip_lst):
        """Look up every address in ``ip_lst`` and print those that failed.

        Returns:
            The list of addresses for which no geo data was found.
        """
        lst = []
        for ip in ip_lst:
            print("========================")
            if self.get_geo(ip) == -1:
                lst.append(ip)
        print(lst)
        return lst
if __name__ == '__main__':
    # Demo run: look up one sample address, framed by start/end banners.
    targets = ["8.8.8.8"]
    print("=================================start==========================================")
    client = censys_ip()
    client.main(targets)
    print("=================================end==========================================")

反正還可以湊活著用吧

最後,他們告訴我有GeoIP…………………………………………………

無所謂啦,反正寫著玩的