netfilter之nat程式碼分析
nat主要在PRE_ROUTING、LOCAL_OUT、LOCAL_IN、POST_ROUTING四個鏈上註冊了hook函式,PRE_ROUTING、LOCAL_OUT這兩個鏈上做DNAT,LOCAL_IN和POST_ROUTING鏈上做SNAT。nat表沒有LOCAL_IN鏈,但在LOCAL_IN上註冊了鉤子函式nf_nat_fn,主要作用是修改資料包的源埠。
/*
 * IPv4 NAT hook registrations.  Each entry binds one handler to one
 * netfilter hook point at either the NAT_DST or the NAT_SRC priority.
 */
static struct nf_hook_ops nf_nat_ops[] __read_mostly = {
	/* Before packet filtering, change destination (DNAT) */
	{
		.hooknum	= NF_INET_PRE_ROUTING,
		.priority	= NF_IP_PRI_NAT_DST,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_in,
		.owner		= THIS_MODULE,
	},
	/* After packet filtering, change source (SNAT) */
	{
		.hooknum	= NF_INET_POST_ROUTING,
		.priority	= NF_IP_PRI_NAT_SRC,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_out,
		.owner		= THIS_MODULE,
	},
	/* Before packet filtering, change destination (DNAT) */
	{
		.hooknum	= NF_INET_LOCAL_OUT,
		.priority	= NF_IP_PRI_NAT_DST,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_local_fn,
		.owner		= THIS_MODULE,
	},
	/* After packet filtering, change source */
	{
		.hooknum	= NF_INET_LOCAL_IN,
		.priority	= NF_IP_PRI_NAT_SRC,
		.pf		= NFPROTO_IPV4,
		.hook		= nf_nat_fn,
		.owner		= THIS_MODULE,
	},
};
1、nf_nat_in
nf_nat_in鉤子函式註冊在PRE_ROUTING鏈上,最終做DNAT的處理函式是nf_nat_fn,這個函式後面再講。如果做了DNAT後目的地址改變,而且資料包沒有被扔掉(也沒有被STOLEN),就呼叫skb_dst_drop,它最終呼叫dst_release將dst_entry的引用計數減1,並將skb->dst設定為NULL。
/*
 * PRE_ROUTING hook: hand the packet to nf_nat_fn() and, if the packet
 * survived and its destination address was rewritten, discard the
 * routing entry cached on the skb.
 *
 * Returns the verdict produced by nf_nat_fn() unchanged.
 */
static unsigned int
nf_nat_in(unsigned int hooknum,
	  struct sk_buff *skb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	/* Remember the destination as it was before translation. */
	const __be32 orig_daddr = ip_hdr(skb)->daddr;
	const unsigned int verdict = nf_nat_fn(hooknum, skb, in, out, okfn);

	/* Do not look at the skb again once it was dropped or stolen. */
	if (verdict == NF_DROP || verdict == NF_STOLEN)
		return verdict;

	/* Destination changed: the cached dst no longer applies. */
	if (ip_hdr(skb)->daddr != orig_daddr)
		skb_dst_drop(skb);

	return verdict;
}
2、nf_nat_out
nf_nat_out註冊在POST_ROUTING鏈上,實現的功能是做SNAT,最終處理的函式也是nf_nat_fn。
static unsigned int nf_nat_out(unsigned int hooknum, struct sk_buff *skb, const struct net_device *in, const struct net_device *out, int (*okfn)(struct sk_buff *)) { #ifdef CONFIG_XFRM const struct nf_conn *ct; enum ip_conntrack_info ctinfo; #endif unsigned int ret; /* root is playing with raw sockets. */ if (skb->len < sizeof(struct iphdr) || ip_hdrlen(skb) < sizeof(struct iphdr)) return NF_ACCEPT; /*做SNAT*/ ret = nf_nat_fn(hooknum, skb, in, out, okfn); #ifdef CONFIG_XFRM if (ret != NF_DROP && ret != NF_STOLEN && (ct = nf_ct_get(skb, &ctinfo)) != NULL) { enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo); if ((ct->tuplehash[dir].tuple.src.u3.ip != ct->tuplehash[!dir].tuple.dst.u3.ip) || (ct->tuplehash[dir].tuple.src.u.all != ct->tuplehash[!dir].tuple.dst.u.all) ) return ip_xfrm_me_harder(skb) == 0 ? ret : NF_DROP; } #endif return ret; }
3、nf_nat_local_fn
nf_nat_local_fn註冊在LOCAL_OUT鏈上,最終也是呼叫nf_nat_fn做DNAT,在LOCAL_OUT鏈之前資料包已經做了路由選擇,因為做DNAT目的地地址改變所以要呼叫ip_route_me_harder重新選擇路由。
static unsigned int
nf_nat_local_fn(unsigned int hooknum,
struct sk_buff *skb,
const struct net_device *in,
const struct net_device *out,
int (*okfn)(struct sk_buff *))
{
const struct nf_conn *ct;
enum ip_conntrack_info ctinfo;
unsigned int ret;
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
ip_hdrlen(skb) < sizeof(struct iphdr))
return NF_ACCEPT;
/*做DNAT*/
ret = nf_nat_fn(hooknum, skb, in, out, okfn);
if (ret != NF_DROP && ret != NF_STOLEN &&
(ct = nf_ct_get(skb, &ctinfo)) != NULL) {
enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
if (ct->tuplehash[dir].tuple.dst.u3.ip !=
ct->tuplehash[!dir].tuple.src.u3.ip) {
/*做DNAT後目的地址改變要重新選路由*/
if (ip_route_me_harder(skb, RTN_UNSPEC))
ret = NF_DROP;
}
#ifdef CONFIG_XFRM
else if (ct->tuplehash[dir].tuple.dst.u.all !=
ct->tuplehash[!dir].tuple.src.u.all)
if (ip_xfrm_me_harder(skb))
ret = NF_DROP;
#endif
}
return ret;
}
4、nf_nat_fn
nf_nat_fn對資料包的連線跟蹤選項的NAT只做一次,後續的資料包根據連結跟蹤做NAT。nf_nat_fn主要做以下幾件事:
(1)判斷資料包的連結跟蹤是否建立,如果沒有建立(或者是不做跟蹤的untracked連結)直接返回;如果連結跟蹤沒有關聯nf_conn_nat而且已經被確認也直接返回,還沒被確認則為它新增nf_conn_nat擴充套件
(2)如果資料包狀態是一個期望連結(IP_CT_RELATED)或者其reply方向,而且協議是icmp就呼叫nf_nat_icmp_reply_translation對icmp做nat
(3)如果資料包的狀態是IP_CT_NEW,就呼叫nf_nat_initialized判斷該資料包的連結跟蹤是否已經做 NAT,如果還沒有做NAT而且是LOCAL_IN鏈上的鉤子函式,就呼叫alloc_null_binding修改連結跟蹤reply方向
(4)呼叫函式nf_nat_rule_find查詢nat表最後由nf_nat_packet根據連結跟蹤做nat
/*
 * Core NAT hook body shared by all four hook points.
 *
 * Steps, in order:
 *  - accept packets that have no conntrack entry or are untracked;
 *  - make sure the conntrack carries a NAT extension (accept the packet
 *    if it cannot be added, or if the conntrack is already confirmed);
 *  - for RELATED ICMP packets, delegate to
 *    nf_nat_icmp_reply_translation() and return its verdict;
 *  - for NEW (and non-ICMP RELATED) packets whose mapping for this
 *    manip type is not set up yet, set it up via alloc_null_binding()
 *    on LOCAL_IN or nf_nat_rule_find() elsewhere;
 *  - finally rewrite the packet with nf_nat_packet().
 *
 * Returns a netfilter verdict.
 */
static unsigned int
nf_nat_fn(unsigned int hooknum,
	  struct sk_buff *skb,
	  const struct net_device *in,
	  const struct net_device *out,
	  int (*okfn)(struct sk_buff *))
{
	struct nf_conn *ct;
	enum ip_conntrack_info ctinfo;
	struct nf_conn_nat *nat_ext;
	/* maniptype == SRC for postrouting. */
	enum nf_nat_manip_type maniptype = HOOK2MANIP(hooknum);

	/* We never see fragments: conntrack defrags on pre-routing
	   and local-out, and nf_nat_out protects post-routing. */
	NF_CT_ASSERT(!(ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)));

	/* Fetch the conntrack entry and the packet's state relative to it. */
	ct = nf_ct_get(skb, &ctinfo);

	/* Can't track?  It's not due to stress, or conntrack would
	   have dropped it.  Hence it's the user's responsibilty to
	   packet filter it out, or implement conntrack/NAT for that
	   protocol. 8) --RR
	   Likewise, don't try to NAT a packet that is not conntracked. */
	if (ct == NULL || ct == &nf_conntrack_untracked)
		return NF_ACCEPT;

	nat_ext = nfct_nat(ct);
	if (nat_ext == NULL) {
		/* NAT module was loaded late: a confirmed conntrack
		   is left alone. */
		if (nf_ct_is_confirmed(ct))
			return NF_ACCEPT;

		nat_ext = nf_ct_ext_add(ct, NF_CT_EXT_NAT, GFP_ATOMIC);
		if (!nat_ext) {
			pr_debug("failed to add NAT extension\n");
			return NF_ACCEPT;
		}
	}

	switch (ctinfo) {
	case IP_CT_RELATED:
	case IP_CT_RELATED + IP_CT_IS_REPLY:
		/* RELATED ICMP gets its own translation routine. */
		if (ip_hdr(skb)->protocol == IPPROTO_ICMP)
			return nf_nat_icmp_reply_translation(ct, ctinfo,
							     hooknum, skb)
				? NF_ACCEPT : NF_DROP;
		/* Fall thru... (Only ICMPs can be IP_CT_IS_REPLY) */
	case IP_CT_NEW:
		/* Seen it before?  This can happen for loopback, retrans,
		   or local packets.. */
		if (nf_nat_initialized(ct, maniptype)) {
			pr_debug("Already setup manip %s for ct %p\n",
				 maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
				 ct);
		} else {
			/* LOCAL_IN hook doesn't have a chain, so it gets
			   a null binding; every other hook consults the
			   nat table. */
			unsigned int verdict =
				(hooknum == NF_INET_LOCAL_IN)
				? alloc_null_binding(ct, hooknum)
				: nf_nat_rule_find(skb, hooknum, in, out, ct);

			if (verdict != NF_ACCEPT)
				return verdict;
		}
		break;

	default:
		/* ESTABLISHED */
		NF_CT_ASSERT(ctinfo == IP_CT_ESTABLISHED ||
			     ctinfo == (IP_CT_ESTABLISHED+IP_CT_IS_REPLY));
	}

	/* Rewrite the packet according to the conntrack tuples. */
	return nf_nat_packet(ct, ctinfo, hooknum, skb);
}
4.1 nf_nat_initialized
nf_nat_initialized判斷連結跟蹤選項是否做了NAT,如做了NAT那麼ct->status就會設定IPS_SRC_NAT_DONE_BIT或IPS_DST_NAT_DONE_BIT。
/*
 * Report whether the NAT mapping of the given manip type has already
 * been set up on this conntrack, by testing the matching *_NAT_DONE bit
 * in ct->status.
 *
 * Returns the result of test_bit(): non-zero when the bit is set.
 */
static inline int nf_nat_initialized(struct nf_conn *ct,
				     enum nf_nat_manip_type manip)
{
	const int done_bit = (manip == IP_NAT_MANIP_SRC)
		? IPS_SRC_NAT_DONE_BIT
		: IPS_DST_NAT_DONE_BIT;

	return test_bit(done_bit, &ct->status);
}
4.2 alloc_null_binding
當資料包的狀態是IP_CT_NEW並且是LOCAL_IN鏈上的就呼叫alloc_null_binding對連結跟蹤做NAT修改reply方向,因為LOCAL_IN是netfilter框架的一個出口,如果這時連結跟蹤沒做NAT那麼資料包出去就會有問題。
unsigned int
alloc_null_binding(struct nf_conn *ct, unsigned int hooknum)
{
/* Force range to this IP; let proto decide mapping for
per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED).
Use reply in case it's already been mangled (eg local packet).
*/
__be32 ip
= (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC
? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip
: ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip);
struct nf_nat_range range
= { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } };
pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip);
/*連結跟蹤做nat,修改tuple的reply方向*/
return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum));
}
4.3 nf_nat_rule_find
連結狀態是IP_CT_NEW、IP_CT_RELATED、IP_CT_RELATED+IP_CT_IS_REPLY,而且不是在LOCAL_IN上就呼叫nf_nat_rule_find查詢NAT表匹配規則,找到就呼叫相應的target函式(ipt_snat_target或者ipt_dnat_target)實現連線跟蹤項的轉換。如果沒有找到就呼叫alloc_null_binding做連結跟蹤的NAT。alloc_null_binding實際呼叫的是nf_nat_setup_info,這個函式下一節再分析。
/*
 * Run the packet through the nat table of the conntrack's network
 * namespace.  If the table accepts the packet but no mapping for this
 * hook's manip type was set up, fall back to alloc_null_binding().
 *
 * Returns the table verdict, or the result of alloc_null_binding() when
 * the fallback is taken.
 */
int nf_nat_rule_find(struct sk_buff *skb,
		     unsigned int hooknum,
		     const struct net_device *in,
		     const struct net_device *out,
		     struct nf_conn *ct)
{
	struct net *net = nf_ct_net(ct);
	int verdict = ipt_do_table(skb, hooknum, in, out,
				   net->ipv4.nat_table);

	/* Anything other than ACCEPT is passed straight back. */
	if (verdict != NF_ACCEPT)
		return verdict;

	/* The table already set the mapping up: nothing more to do. */
	if (nf_nat_initialized(ct, HOOK2MANIP(hooknum)))
		return verdict;

	/* NUL mapping */
	return alloc_null_binding(ct, hooknum);
}
5、nf_nat_packet
當資料包的連結跟蹤已經做了NAT,就呼叫nf_nat_packet根據連結跟蹤修改資料包的ip、埠做NAT。這個函式很巧妙,此時連結跟蹤已經做了NAT,就取dir的反方向的tuple,然後再對tuple中的源ip、目的ip、源埠、目的埠顛倒,得到target,最後呼叫manip_pkt做NAT修改資料包的ip地址和埠。
這個地方有點繞,舉一個例子:比如一個閘道器112.112.112.112,它下面的區域網有一個A裝置ip是192.168.0.100,這個A裝置要訪問一個外網伺服器地址是113.113.113.113,這樣必須由閘道器做SNAT,首先連結跟蹤做了SNAT後tuple如下(先是orig方向,後是reply方向):
src | dst |
192.168.0.100 | 113.113.113.113 |
src | dst |
113.113.113.113 | 112.112.112.112 |
裝置A的資料包訪問伺服器是orig方向:192.168.0.100 -> 113.113.113.113,呼叫nf_nat_packet取反也就是reply:113.113.113.113 -> 112.112.112.112,再顛倒過來得到target:112.112.112.112 -> 113.113.113.113,然後按target:112.112.112.112 -> 113.113.113.113修改資料包的源ip、目的ip完成SNAT轉換。
當外部伺服器有資料包reply:113.113.113.113->112.112.112.112,呼叫nf_nat_packet取反方向也就是orig:192.168.0.100->113.113.113.113,再顛倒過來得到target:113.113.113.113->192.168.0.100,然後按target:113.113.113.113 -> 192.168.0.100修改資料包的源IP、目的IP。
比如110.110.110.110的閘道器地址要做DNAT到內部一個地址192.168.0.200,一個外網地址111.111.111.111訪問閘道器110.110.110.110就會做DNAT到192.168.0.200,連結跟蹤做DNAT後tuple如下(先是orig方向,後是reply方向):
src | dst |
111.111.111.111 | 110.110.110.110 |
src | dst |
192.168.0.200 | 111.111.111.111 |
當外網地址訪問閘道器也就是orig方向:111.111.111.111->110.110.110.110,呼叫nf_nat_packet會取相反方向的tuple也就是reply:192.168.0.200->111.111.111.111,然後顛倒得到target:111.111.111.111->192.168.0.200,然後修改源ip、目的ip完成dnat轉換。
192.168.0.200有回覆包也就是reply方向:192.168.0.200->111.111.111.111,呼叫nf_nat_packet會取相反方向的tuple也就是orig:111.111.111.111->110.110.110.110,然後顛倒得到target:110.110.110.110->111.111.111.111,修改資料包的源地址、目的地地址完成reply。
所以說NAT其實就是基於連結跟蹤實現的。
/*
 * Do packet manipulations according to nf_nat_setup_info.
 *
 * Picks the status bit for this hook's manip type (swapped to the other
 * NAT bit for reply-direction packets).  If that bit is set on the
 * conntrack, builds the target tuple as the inverse of the opposite
 * direction's tuple and rewrites the packet to match it.
 *
 * Returns NF_DROP if the rewrite fails, NF_ACCEPT otherwise.
 */
unsigned int nf_nat_packet(struct nf_conn *ct,
			   enum ip_conntrack_info ctinfo,
			   unsigned int hooknum,
			   struct sk_buff *skb)
{
	const enum ip_conntrack_dir dir = CTINFO2DIR(ctinfo);
	const enum nf_nat_manip_type mtype = HOOK2MANIP(hooknum);
	unsigned long statusbit =
		(mtype == IP_NAT_MANIP_SRC) ? IPS_SRC_NAT : IPS_DST_NAT;
	struct nf_conntrack_tuple target;

	/* Invert if this is reply dir. */
	if (dir == IP_CT_DIR_REPLY)
		statusbit ^= IPS_NAT_MASK;

	/* Non-atomic: these bits don't change. */
	if (!(ct->status & statusbit))
		return NF_ACCEPT;

	/* We are aiming to look like inverse of other direction. */
	nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple);

	/* Rewrite the packet so that it matches the target tuple. */
	if (!manip_pkt(target.dst.protonum, skb, 0, &target, mtype))
		return NF_DROP;

	return NF_ACCEPT;
}
6、manip_pkt
manip_pkt主要根據傳進來的target和maniptype完成三層、四層的NAT轉換。先獲取四層的struct nf_nat_protocol結構體例項,然後呼叫四層協議的manip_pkt完成四層埠的NAT,最後修改IP首部的源地址或目的地址並更新IP首部校驗和。
/*
 * Rewrite one packet so that it matches @target.
 *
 * @proto:     protocol number used to look up the layer-4 NAT handler
 * @skb:       packet to modify
 * @iphdroff:  offset of the IP header from skb->data
 * @target:    tuple holding the new address (and proto part)
 * @maniptype: IP_NAT_MANIP_SRC rewrites the source address, anything
 *             else rewrites the destination address
 *
 * The layer-4 handler runs first; afterwards the chosen IP address is
 * replaced and the IP header checksum is adjusted to match.
 *
 * Returns false if the skb cannot be made writable or the layer-4
 * handler fails, true otherwise.
 */
static bool
manip_pkt(u_int16_t proto,
	  struct sk_buff *skb,
	  unsigned int iphdroff,
	  const struct nf_conntrack_tuple *target,
	  enum nf_nat_manip_type maniptype)
{
	const struct nf_nat_protocol *l4;
	struct iphdr *iph;
	__be32 *addr;
	__be32 new_addr;

	if (!skb_make_writable(skb, iphdroff + sizeof(struct iphdr)))
		return false;

	/* Manipulate protcol part. */

	/* rcu_read_lock()ed by nf_hook_slow */
	l4 = __nf_nat_proto_find(proto);
	if (!l4->manip_pkt(skb, iphdroff, target, maniptype))
		return false;

	/* Locate the IP header only now, after the layer-4 handler ran
	   (presumably because the handler may move the skb data). */
	iph = (void *)skb->data + iphdroff;

	if (maniptype == IP_NAT_MANIP_SRC) {
		/* SNAT: the source address is replaced. */
		addr = &iph->saddr;
		new_addr = target->src.u3.ip;
	} else {
		/* DNAT: the destination address is replaced. */
		addr = &iph->daddr;
		new_addr = target->dst.u3.ip;
	}

	/* Fix up the header checksum before overwriting the old address. */
	csum_replace4(&iph->check, *addr, new_addr);
	*addr = new_addr;

	return true;
}