1. 程式人生 > >機器學習樣本標記 示意代碼

機器學習樣本標記 示意代碼

數據的分布 sep 其他 main input ilo %d ict ups

目標:根據各個字段數據的分布(例如 srcIP 和 dstIP 的 top 10)以及其他特征對樣本進行標註,最終將每個域名的樣本標註為 black/white/white-like/ddos/mddos/cdn/unknown 其中一類。

效果示意:

-------------choose one--------------
sub domain: DNSQueryName(N)
ip: srcip(S) or dstip(D)
length: DNSRequestLength(R1) or DNSReplyLength(R2)
length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)
port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)
code: DNSReplyCode(C2) or DNSRequestRRType(C1)
other: DNSRRClass(RR) or DNSReplyIPv4(V)
-------------label or quit------------
black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)
next(Q) or exit(E)?
***************************************
domain: workgroup. flow count: 206
***************************************
------------srcip-----------------
count 206
unique 9
top 162.105.129.122
freq 150
Name: sourceIP, dtype: object
--------------destip---------------
count 206
unique 12
top 199.7.83.42
freq 82
Name: destIP, dtype: object

代碼:

import sys
import json
import os
import pandas as pd
import tldextract
# import numpy as np


# Column map of the "^"-separated metadata export: one
# "<1-based column number> = <column name>" entry per line.
medata_field = '''
3 = sourceIP
4 = destIP
5 = sourcePort
6 = destPort
7 = protocol
12 = flowStartSeconds
13 = flowEndSecond
54 = DNSReplyCode
55 = DNSQueryName
56 = DNSRequestRRType
57 = DNSRRClass
58 = DNSDelay
59 = DNSReplyTTL
60 = DNSReplyIPv4
61 = DNSReplyIPv6
62 = DNSReplyRRType
77 = DNSReplyName
81 = payload
88 = DNSRequestLength
89 = DNSRequestErrLength
90 = DNSReplyLength
91 = DNSReplyErrLength
'''

# Parallel lists built from the table above: 0-based column positions and
# the matching column names.
medata_field_num = []
medata_field_info = []
for l in medata_field.split("\n"):
    if len(l) == 0:
        continue
    num, info = l.split(" = ")
    medata_field_num.append(int(num) - 1)
    medata_field_info.append(info)
print(medata_field_num)
print(medata_field_info)

# 0-based position of DNSQueryName in a raw "^"-separated line (55 - 1).
_QUERY_NAME_COLUMN = medata_field_num[medata_field_info.index("DNSQueryName")]

_HISTORY_FILE = "history_op.json"
_SAMPLE_ROOT = "/home/bonelee/latest_metadata_sample"

# Label code typed by the operator -> sub-directory receiving the samples.
_LABEL_DIRS = {
    "B": "labeled_black",
    "W": "labeled_white",
    "L": "labeled_white_like",
    "CDN": "labeled_cdn",
    "DDOS": "labeled_ddos",
    "M": "labeled_mddos",
    "U": "labeled_unknown",
}

# Menu shortcut -> column whose value distribution is displayed.
_DIST_COLUMNS = {
    "R1": "DNSRequestLength",
    "R2": "DNSReplyLength",
    "R3": "DNSRequestErrLength",
    "R4": "DNSReplyErrLength",
    "P1": "sourcePort",
    "P2": "destPort",
    "T": "DNSReplyTTL",
    "C2": "DNSReplyCode",
    "C1": "DNSRequestRRType",
    "RR": "DNSRRClass",
    "V": "DNSReplyIPv4",
    "S": "sourceIP",
    "D": "destIP",
    "N": "DNSQueryName",
}

_MENU_LINES = (
    "-------------choose one--------------",
    "sub domain: DNSQueryName(N)",
    "ip: srcip(S) or dstip(D)",
    "length: DNSRequestLength(R1) or DNSReplyLength(R2)",
    "length too: DNSRequestErrLength(R3) or DNSReplyErrLength(R4)",
    "port: sourcePort(P1) or destPort(P2) or DNSReplyTTL(T)",
    "code: DNSReplyCode(C2) or DNSRequestRRType(C1)",
    "other: DNSRRClass(RR) or DNSReplyIPv4(V)",
    "-------------label or quit------------",
    "black(B) or white(W) or cdn(CDN) or ddos(DDOS) or mddos(M) or unknown(U) or white-like(L)",
    "next(Q) or exit(E)?",
)

# The script was written for Python 2 (raw_input); fall back to input() so
# the same code also runs on Python 3.
try:
    _read_input = raw_input
except NameError:
    _read_input = input


def extract_domain(domain):
    """Return the registered domain ("domain.suffix") of a DNS query name.

    When tldextract reports an empty domain part, the suffix alone is
    returned. A name without a recognised suffix keeps a trailing dot
    (e.g. "workgroup."), which is what existing history keys and labeled
    file names were built from. Any failure is reported on stdout and
    mapped to "unknown".

    The result is lower-cased so that grouping, the history keys and the
    comparison in move_to() all use one spelling; main() matches samples
    against a lower-cased name.
    NOTE(review): assumes tldextract may preserve the input's letter case
    - confirm.
    """
    try:
        ext = tldextract.extract(domain)
        if ext.domain == "":
            mdomain = ext.suffix
        else:
            # Explicit fields instead of slicing the result object, so this
            # does not depend on how many fields the result type carries.
            mdomain = ext.domain + "." + ext.suffix
        return mdomain.lower()
    except Exception as e:
        print("extract_domain error: %s" % (e,))
        return "unknown"


def parse_metadata(path):
    """Load one metadata file and group its rows by registered domain.

    The file is read as header-less "^"-separated text; only the columns
    listed in medata_field are kept and renamed, and an "mdomain" column is
    derived from DNSQueryName via extract_domain().

    Returns the pandas GroupBy object keyed by "mdomain". Errors raised by
    pandas.read_csv (for example a missing file) propagate to the caller.
    """
    df = pd.read_csv(path, sep="^", header=None)
    dns_df = df.iloc[:, medata_field_num].copy()
    dns_df.columns = medata_field_info
    dns_df["mdomain"] = dns_df["DNSQueryName"].apply(extract_domain)
    # Group by the column *name*; the bare identifier used previously was
    # undefined and raised NameError.
    return dns_df.groupby("mdomain")


def get_data_dist(df, col="sourceIP"):
    """Print how often each value of column `col` occurs in `df`.

    Prints the type of the counts object, the full per-value counts and
    then the ten most frequent values. Returns None.
    """
    size = df.groupby(by=col).size()
    print(type(size))
    print(size)
    print("-----------top 10-------------")
    print(size.nlargest(10))


def move_to(srcpath, domain, dst_path):
    """Copy every line of `srcpath` whose query name maps to `domain`.

    Despite the name nothing is removed from `srcpath`: matching lines are
    written to `dst_path`, which is created or truncated. Lines with too
    few "^"-separated fields to contain DNSQueryName are skipped rather
    than aborting the copy with IndexError.
    """
    with open(dst_path, "w") as w:
        with open(srcpath) as r:
            for line in r:
                fields = line.split("^")
                if len(fields) <= _QUERY_NAME_COLUMN:
                    continue
                if extract_domain(fields[_QUERY_NAME_COLUMN]) == domain:
                    w.write(line)


def _auto_label(domain):
    """Return the label forced by the name-based rules, or None."""
    if domain.endswith(("win", "site", "vip")):
        return "U"
    for keyword in ("lan", "local", "dhcp", "workgroup", "home"):
        if keyword in domain:
            return "DDOS"
    if "cdn" in domain:
        return "CDN"
    return None


def _labeled_path(label, day, hour, domain):
    """Build the output file path for one labeled domain sample."""
    return "%s/%s/2017-8-%d-%d-%s.txt" % (
        _SAMPLE_ROOT, _LABEL_DIRS[label], day, hour, domain)


def _save_history(history_op):
    """Persist the domain -> label decisions to the history file."""
    with open(_HISTORY_FILE, "w") as f:
        json.dump(history_op, f)
    print("saved history_op.json")


def _label_domain(path, day, hour, domain, group, history_op):
    """Show one domain's statistics and prompt until it is labeled/skipped."""
    print("***************************************")
    print("domain: %s flow count: %d" % (domain, len(group)))
    print("***************************************")
    print("------------srcip-----------------")
    print(group["sourceIP"].describe())
    print("--------------destip---------------")
    print(group["destIP"].describe())

    domain = domain.lower()
    forced = _auto_label(domain)
    has_judged = False
    # Once set, the next successful label also ends the prompt loop.
    need_break = False
    while True:
        for menu_line in _MENU_LINES:
            print(menu_line)

        if forced is not None:
            check = forced
            need_break = True
        else:
            if domain in history_op and not has_judged:
                # Offer the earlier decision once; Enter accepts it.
                print("found history op: %s" % history_op[domain])
                if not _read_input("OK(Enter for Y)?"):
                    check = history_op[domain]
                    need_break = True
                else:
                    check = _read_input("Input:")
            else:
                check = _read_input("Input:")
            has_judged = True

        if check == "Q":
            print("%s next OK!" % path)
            break
        elif check == "E":
            print("%s Exit!" % path)
            _save_history(history_op)
            sys.exit()
        elif check in _LABEL_DIRS:
            move_to(path, domain, _labeled_path(check, day, hour, domain))
            history_op[domain] = check
            print("Saved OK!")
            if need_break:
                break
        elif check in _DIST_COLUMNS:
            get_data_dist(group, _DIST_COLUMNS[check])
        else:
            print("unknown input!Choose the following one:")


def main():
    """Interactively label the domains found in the hourly metadata files.

    Walks the hard-coded day/hour range, groups each file by registered
    domain and, per domain, either applies a name-based rule, replays a
    confirmed decision from history_op.json or asks the operator. Labeled
    samples are copied into the per-label directories. The history is
    written on "E" (exit) and also when the run completes, so decisions
    from a finished run are not lost.
    """
    history_op = {}
    if os.path.exists(_HISTORY_FILE):
        with open(_HISTORY_FILE) as h:
            history_op = json.load(h)
    print(history_op)

    for day in range(15, 17):
        for hour in range(0, 24):
            path = "%s/black_all/black-medata_wanted-2017-08-%d-%d.txt" % (
                _SAMPLE_ROOT, day, hour)
            print("%s running..." % path)
            try:
                domains_info = parse_metadata(path)
            except IOError as e:
                # Missing hourly file: report it and move on.
                print(e)
                continue
            for domain, group in domains_info:
                _label_domain(path, day, hour, domain, group, history_op)
            print("*******************************")
            print("%s check over..." % path)
            print("*******************************")

    _save_history(history_op)


if __name__ == "__main__":
    main()

機器學習樣本標記 示意代碼