1. 程式人生 > >linux sysctl 引數實現 暨 ip_forward引數對Linux核心轉發影響分析

linux sysctl 引數實現 暨 ip_forward引數對Linux核心轉發影響分析

在進行Linux核心轉發時,需要在proc檔案系統的/proc/sys目錄設定轉發的引數,可以使用 cat /proc/sys/net/ipv4/ip_forward 檢視該引數的值,該引數的預設值為0。可以使用下面的方法修改該值,使能Linux核心IP層的資料轉發,但是下面的方法在系統重啟後不再生效。

echo 1 > /proc/sys/net/ipv4/ip_forward

Linux系統中也提供了一個系統的配置工具sysctl,使用它可以讀取和配置Linux核心的一些引數。但是該方法和proc檔案系統相關,使用該工具Linux核心需要支援proc檔案系統。下面是使用sysctl配置核心的轉發引數。

# sysctl net.ipv4.ip_forward

net.ipv4.ip_forward = 0

/ #  sysctl -w net.ipv4.ip_forward=1

net.ipv4.ip_forward = 1

/ # sysctl net.ipv4.ip_forward

net.ipv4.ip_forward = 1

/ #

注意,引數 net.ipv4.ip_forward 實際對應的是 proc 檔案 /proc/sys/net/ipv4/ip_forward。選項 -w 表示寫入(配置)該核心引數;不加任何選項時表示讀取該核心引數。

通過上面的方法我們可以設定和讀取IP轉發的引數。但是本文重點不是講該引數如何配置,而是在配置完成後,它在核心的轉發過程中如何生效,以及如何配置到核心中。既然該引數是配置使能IP層的轉發,那應該在Linux核心的轉發部分對該引數進行了判斷。該引數的判斷實際上是在查詢路由時進行的,下面這張圖顯示了其中的呼叫關係,

在查路由的過程中,如果是轉發的資料包,會在函式ip_route_input_slow中呼叫下面的巨集判斷轉發的引數是否開啟:

if (!IN_DEV_FORWARD(in_dev))

            goto e_hostunreach;

看一下該巨集是如何進行定義的,下面的巨集定義在include/linux/inetdevice.h檔案中。

/* Reads the FORWARDING entry of in_dev's IPv4 configuration via IN_DEV_CONF_GET. */
#define IN_DEV_FORWARD(in_dev)	IN_DEV_CONF_GET((in_dev), FORWARDING)

再把IN_DEV_CONF_GET巨集進一步展開來看:

/*
 * Read one IPv4 per-device configuration value.
 *
 * "##" is the preprocessor token-pasting operator: it glues NET_IPV4_CONF_
 * and attr into a single identifier (it does not build a string), so
 * IN_DEV_CONF_GET(dev, FORWARDING) expands to
 * ipv4_devconf_get((dev), NET_IPV4_CONF_FORWARDING).
 *
 * The continuation backslash must be followed directly by the next line of
 * the macro body; a blank line in between ends the definition early and
 * leaves the body as a stray statement.
 */
#define IN_DEV_CONF_GET(in_dev, attr) \
	ipv4_devconf_get((in_dev), NET_IPV4_CONF_ ## attr)

下面是ipv4_devconf_get函式的定義:

/*
 * Return the configuration value stored for `index` in in_dev->cnf.data.
 *
 * `index` is a 1-based NET_IPV4_CONF_* id, while data[] is 0-based, so the
 * slot read is data[index - 1]; for NET_IPV4_CONF_FORWARDING (== 1) that is
 * data[0].
 */
static inline int ipv4_devconf_get(struct in_device *in_dev, int index)
{
	return in_dev->cnf.data[index - 1];
}

1)NET_IPV4_CONF_FORWARDING定義在include/linux/sysctl.h檔案中,它不是巨集,而是一個列舉常數。

/*
 * Ids of the per-device IPv4 configuration entries.
 *
 * The ids are 1-based; code that stores the values in a data[] array uses
 * (id - 1) as the array index.  The values are spelled out explicitly
 * because the numbering is not contiguous: 23 is not assigned in this
 * listing, so __NET_IPV4_CONF_MAX evaluates to 25.
 */
enum {
	NET_IPV4_CONF_FORWARDING          = 1,
	NET_IPV4_CONF_MC_FORWARDING       = 2,
	NET_IPV4_CONF_PROXY_ARP           = 3,
	NET_IPV4_CONF_ACCEPT_REDIRECTS    = 4,
	NET_IPV4_CONF_SECURE_REDIRECTS    = 5,
	NET_IPV4_CONF_SEND_REDIRECTS      = 6,
	NET_IPV4_CONF_SHARED_MEDIA        = 7,
	NET_IPV4_CONF_RP_FILTER           = 8,
	NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE = 9,
	NET_IPV4_CONF_BOOTP_RELAY         = 10,
	NET_IPV4_CONF_LOG_MARTIANS        = 11,
	NET_IPV4_CONF_TAG                 = 12,
	NET_IPV4_CONF_ARPFILTER           = 13,
	NET_IPV4_CONF_MEDIUM_ID           = 14,
	NET_IPV4_CONF_NOXFRM              = 15,
	NET_IPV4_CONF_NOPOLICY            = 16,
	NET_IPV4_CONF_FORCE_IGMP_VERSION  = 17,
	NET_IPV4_CONF_ARP_ANNOUNCE        = 18,
	NET_IPV4_CONF_ARP_IGNORE          = 19,
	NET_IPV4_CONF_PROMOTE_SECONDARIES = 20,
	NET_IPV4_CONF_ARP_ACCEPT          = 21,
	NET_IPV4_CONF_ARP_NOTIFY          = 22,
	NET_IPV4_CONF_SRC_VMARK           = 24,
	__NET_IPV4_CONF_MAX
};

2)對於return in_dev->cnf.data[index];返回的相當於in_dev->cnf.data[0],那下面我們看一下該初始值是如何產生的。

首先,in_dev是怎麼獲取到的,在ip_route_input_slow函式中通過struct in_device *in_dev = in_dev_get(dev);函式獲取,在in_dev_get函式中呼叫__in_dev_get_rcu,通過下面的賦值語句進行賦值struct in_device *in_dev = dev->ip_ptr;

/*
 * Fetch the in_device attached to `dev` through dev->ip_ptr.
 *
 * A non-NULL pointer is passed through rcu_dereference() before being
 * returned; NULL is returned unchanged.  No reference is taken here.
 * The callers shown alongside this helper invoke it between
 * rcu_read_lock() and rcu_read_unlock().
 */
static inline struct in_device *__in_dev_get_rcu(const struct net_device *dev)
{
	struct in_device *idev = dev->ip_ptr;

	if (!idev)
		return idev;

	return rcu_dereference(idev);
}

static __inline__ struct in_device *

in_dev_get(const struct net_device *dev)

{

         struct in_device *in_dev;

         rcu_read_lock();

         in_dev = __in_dev_get_rcu(dev);

         if (in_dev)

                   atomic_inc(&in_dev->refcnt);

         rcu_read_unlock();

         return in_dev;

}

         dev->ip_ptr;又是什麼時候賦值呢?答案是在net_device註冊初始化函式inetdev_init中,

/*
 * Allocate and initialise the IPv4 state (struct in_device) of a net_device
 * and publish it through dev->ip_ptr.
 *
 * The new in_device's configuration (cnf) starts as a copy of the owning
 * network namespace's devconf_dflt.  Must be called with the RTNL lock held
 * (ASSERT_RTNL).
 *
 * Returns the new in_device, or NULL when either allocation fails.
 */
static struct in_device *inetdev_init(struct net_device *dev)

{

         struct in_device *in_dev;

         ASSERT_RTNL();

         in_dev = kzalloc(sizeof(*in_dev), GFP_KERNEL);

         if (!in_dev)

                   goto out;

         memcpy(&in_dev->cnf, dev_net(dev)->ipv4.devconf_dflt,

                            sizeof(in_dev->cnf));// in_dev->cnf is initialised here from the namespace defaults ---1

         in_dev->cnf.sysctl = NULL;

         in_dev->dev = dev;

         if ((in_dev->arp_parms = neigh_parms_alloc(dev, &arp_tbl)) == NULL)

                   goto out_kfree;

         /* LRO is switched off when the copied defaults already enable forwarding */
         if (IPV4_DEVCONF(in_dev->cnf, FORWARDING))

                   dev_disable_lro(dev);

         /* Reference in_dev->dev */

         dev_hold(dev);

         /* Account for reference dev->ip_ptr (below) */

         in_dev_hold(in_dev);

         devinet_sysctl_register(in_dev);

         ip_mc_init_dev(in_dev);

         if (dev->flags & IFF_UP)

                   ip_mc_up(in_dev);

         /* we can receive as soon as ip_ptr is set -- do this last */

         rcu_assign_pointer(dev->ip_ptr, in_dev);// publish in_dev through dev->ip_ptr using the RCU assignment primitive

out:

         return in_dev;

out_kfree:

         /* only in_dev itself has been allocated at this point */
         kfree(in_dev);

         in_dev = NULL;

         goto out;

}

inetdev_init中memcpy的來源dev_net(dev)->ipv4.devconf_dflt,對於初始網路名稱空間來說也就相當於init_net.ipv4.devconf_dflt,而devconf_dflt的初始化是在/net/ipv4/devinet.c檔案的devinet_init_net函式中完成的,

static struct ipv4_devconf ipv4_devconf_dflt = {

         .data = {

                   [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,

                   [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,

                   [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,

                   [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,

                   [NET_IPV4_CONF_ACCEPT_SOURCE_ROUTE - 1] = 1,

         },

};//這裡並沒有對FORWARDING進行賦值操作

/*
 * Per-network-namespace initialisation of the IPv4 device configuration.
 *
 * For init_net the file-scope ipv4_devconf / ipv4_devconf_dflt (and, with
 * CONFIG_SYSCTL, ctl_forward_entry) are used directly; for any other
 * namespace private copies are made with kmemdup() and the copied
 * ip_forward table entry is re-pointed at the namespace's own data.
 * With CONFIG_SYSCTL the "all" and "default" configurations and the
 * ip_forward table are then registered.
 *
 * Returns 0 on success.  The error-unwinding labels are elided in this
 * listing, so the failure paths are not shown here.
 */
static __net_init int devinet_init_net(struct net *net)

{

         int err;

         struct ipv4_devconf *all, *dflt;

#ifdef CONFIG_SYSCTL

         struct ctl_table *tbl = ctl_forward_entry;

         struct ctl_table_header *forw_hdr;

#endif

         err = -ENOMEM;

         all = &ipv4_devconf; // start from the file-scope "all" configuration

         dflt = &ipv4_devconf_dflt;

         if (net != &init_net) {

                   all = kmemdup(all, sizeof(ipv4_devconf), GFP_KERNEL);

                   if (all == NULL)

                            goto err_alloc_all;

                   dflt = kmemdup(dflt, sizeof(ipv4_devconf_dflt), GFP_KERNEL);

                   if (dflt == NULL)

                            goto err_alloc_dflt;

#ifdef CONFIG_SYSCTL

                   tbl = kmemdup(tbl, sizeof(ctl_forward_entry), GFP_KERNEL);

                   if (tbl == NULL)

                            goto err_alloc_ctl;

                   /* make the copied ip_forward entry refer to this namespace's copies */
                   tbl[0].data = &all->data[NET_IPV4_CONF_FORWARDING - 1];

                   tbl[0].extra1 = all;

                   tbl[0].extra2 = net;

#endif

         }

#ifdef CONFIG_SYSCTL

         err = __devinet_sysctl_register(net, "all",

                            NET_PROTO_CONF_ALL, all);

         if (err < 0)

                   goto err_reg_all;

         err = __devinet_sysctl_register(net, "default",

                            NET_PROTO_CONF_DEFAULT, dflt);

         if (err < 0)

                   goto err_reg_dflt;

         err = -ENOMEM;

         forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);

         if (forw_hdr == NULL)

                   goto err_reg_ctl;

         net->ipv4.forw_hdr = forw_hdr;

#endif

         net->ipv4.devconf_all = all;// net->ipv4.devconf_all is initialised here

         net->ipv4.devconf_dflt = dflt;// net->ipv4.devconf_dflt is initialised here

         return 0;

………………………….

}

上面的函式完成net相關功能的初始化。在devinet.c檔案中還有一個和ipv4_devconf_dflt類似的變數ipv4_devconf。IN_DEV_FORWARD(in_dev)巨集讀取的是in_dev->cnf中的值,而in_dev->cnf是在inetdev_init中從ipv4_devconf_dflt複製而來的,所以,如果要在Linux核心中修改轉發引數的預設值,需要在ipv4_devconf_dflt中新增才能生效。

static struct ipv4_devconf ipv4_devconf = {

         .data = {

                   [NET_IPV4_CONF_ACCEPT_REDIRECTS - 1] = 1,

                   [NET_IPV4_CONF_SEND_REDIRECTS - 1] = 1,

                   [NET_IPV4_CONF_SECURE_REDIRECTS - 1] = 1,

                   [NET_IPV4_CONF_SHARED_MEDIA - 1] = 1,

                   [NET_IPV4_CONF_FORCE_IGMP_VERSION-1]=2,

         },

};

3)下面看一下使用echo 1 > /proc/sys/net/ipv4/ip_forward配置語句是如何使Linux核心IP轉發生效的。

在上面的devinet_init_net()函式中,有下面的兩段程式碼

         struct ctl_table *tbl = ctl_forward_entry;

         forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);

其中ctl_forward_entry定義為下面的結構,

static struct ctl_table ctl_forward_entry[] = {

         {

                   .ctl_name         = NET_IPV4_FORWARD,//一個ID

                   .procname        = "ip_forward",//字串,包含在proc/sys下目錄項,實際為proc/sys目錄下的檔名

                   .data                  = &ipv4_devconf.data[

                                               NET_IPV4_CONF_FORWARDING - 1],//回撥函式設定的值

                   .maxlen             = sizeof(int),//設定值的最大長度

                   .mode                = 0644,//檔案的許可權,也就是ip_forward檔案的許可權

                   .proc_handler  = devinet_sysctl_forward,// /proc/sys下面的檔案修改的時候呼叫該回調函式。

                   .strategy = devinet_conf_sysctl,// sysctl讀寫系統引數時候呼叫該回調函式

                   .extra1              = &ipv4_devconf,

                   .extra2              = &init_net,

         },

         { },

};

         forw_hdr = register_net_sysctl_table(net, net_ipv4_path, tbl);用於動態註冊系統控制功能,其中net_ipv4_path定義為下面的形式。也就是proc/sys/下的目錄名,tbl就是上面的ctl_forward_entry[]結構體。

/*
 * Directory path ("net", then "ipv4") passed to register_net_sysctl_table()
 * together with the ip_forward table.  The empty entry terminates the path.
 */
static __net_initdata struct ctl_path net_ipv4_path[] = {
	{
		.procname = "net",
		.ctl_name = CTL_NET,
	},
	{
		.procname = "ipv4",
		.ctl_name = NET_IPV4,
	},
	{ },
};

使用echo 1 > /proc/sys/net/ipv4/ip_forward時會呼叫devinet_sysctl_forward函式進行處理,下面是該函式的定義實現。其中引數write為1表示寫配置,為0表示讀取配置值,buffer是要寫的值,lenp指向buffer的大小,ppos為位置。這裡的__user表示該指標指向使用者空間,核心不應該直接解除該指標的引用,因為在當前地址空間中它可能是沒有意義的,所以對於這種變數,在kernel中要使用copy_to_user和copy_from_user來存取。

/*
 * proc handler for forwarding sysctl entries.
 *
 * The integer is read or written through proc_dointvec().  When a write
 * actually changed the value and the entry is not the namespace's
 * "default" FORWARDING slot, the change is applied under the RTNL lock:
 *   - for the "all" slot, inet_forward_change() pushes it to every device;
 *   - otherwise (a non-zero value on a per-device entry) LRO is disabled on
 *     that device.
 * Afterwards the routing cache of the namespace is flushed.
 *
 * If the RTNL lock cannot be taken immediately, the old value and file
 * position are restored and the system call is restarted.
 *
 * Returns the result of proc_dointvec(), or that of restart_syscall().
 */
static int devinet_sysctl_forward(ctl_table *ctl, int write,

                                       void __user *buffer,

                                       size_t *lenp, loff_t *ppos)

{

         int *valp = ctl->data;// address of the integer this entry controls

         int val = *valp;

         loff_t pos = *ppos;

         int ret = proc_dointvec(ctl, write, buffer, lenp, ppos);// handles the int value; per the article, proc_dostring is the counterpart for strings

/* Example: echo "0" >/proc/sys/net/ipv4/ip_forward while forwarding is on
 * gives write = 1, *valp = 0 (new value) and val = 1 (old value). */

         if (write && *valp != val) {

                   struct net *net = ctl->extra2;

                   if (valp != &IPV4_DEVCONF_DFLT(net, FORWARDING)) {

                            if (!rtnl_trylock()) {

                                     /* Restore the original values before restarting */

                                     *valp = val;

                                     *ppos = pos;

                                     return restart_syscall();

                            }

                            if (valp == &IPV4_DEVCONF_ALL(net, FORWARDING)) {

                                     inet_forward_change(net);// propagate the new value into every device's in_dev->cnf.data

                            } else if (*valp) {

                                     /* per-device entry: extra1 is the cnf embedded in its in_device */
                                     struct ipv4_devconf *cnf = ctl->extra1;

                                     struct in_device *idev =

                                               container_of(cnf, struct in_device, cnf);

                                     dev_disable_lro(idev->dev);

                            }

                            rtnl_unlock();

                            rt_cache_flush(net, 0);

                   }

         }

         return ret;

}

下面這個函式(inet_forward_change)用來修改各個裝置的forward引數:

/*
 * Apply the namespace-wide ("all") FORWARDING value to the rest of the
 * namespace's IPv4 configuration:
 *   - "all" ACCEPT_REDIRECTS is set to the logical inverse of forwarding;
 *   - the "default" FORWARDING slot is set to the same value;
 *   - every net_device in the namespace gets the value written into its
 *     in_device (if it has one), and LRO is disabled on it when forwarding
 *     is being switched on.
 *
 * The device list is walked under dev_base_lock (read side); each in_device
 * is looked up inside an RCU read-side section.
 */
static void inet_forward_change(struct net *net)

{

         struct net_device *dev;

         int on = IPV4_DEVCONF_ALL(net, FORWARDING);// the value that was just configured

         IPV4_DEVCONF_ALL(net, ACCEPT_REDIRECTS) = !on;

         IPV4_DEVCONF_DFLT(net, FORWARDING) = on;// update the namespace's default configuration as well

         read_lock(&dev_base_lock);

         for_each_netdev(net, dev) {

                   struct in_device *in_dev;

                   if (on)

                            dev_disable_lro(dev);

                   rcu_read_lock();

                   in_dev = __in_dev_get_rcu(dev);

                   if (in_dev)

                            IN_DEV_CONF_SET(in_dev, FORWARDING, on);// store the value in this device's in_dev->cnf.data

                   rcu_read_unlock();

         }

         read_unlock(&dev_base_lock);

}

/*
 * Store `val` as the configuration value for `index` in in_dev->cnf.
 *
 * `index` is a 1-based NET_IPV4_CONF_* id; the 0-based slot (index - 1) is
 * used both for the bit set in cnf.state and for the data[] element that
 * receives the value.
 */
static inline void ipv4_devconf_set(struct in_device *in_dev, int index,
				    int val)
{
	const int slot = index - 1;

	set_bit(slot, in_dev->cnf.state);
	in_dev->cnf.data[slot] = val;
}

其呼叫關係如下圖: