Linux Network Architecture
Isaac Y. Tsai <[email protected]>
2010/09/02 © by
Outline
Linux kernel architecture
Network related kernel data structures
Network device driver, using igb as an example
Linux kernel architecture
Network related kernel data structures
Protocol stack processing: struct sk_buff
  defined in <linux/skbuff.h>; source: <kernel src>/net/core/skbuff.c
NIC & protocol stack interface: struct net_device
  defined in <linux/netdevice.h>
NIC I/O bus (e.g. PCI, USB, ...): I/O bus specific data structures
struct sk_buff
Socket buffer (skb): the kernel data structure containing the control information required for packet processing.
Socket buffers are kept on a doubly linked list. When payload data is passed to a socket, a socket buffer is created and the payload data address is stored in the structure.
struct sk_buff
Header file: <linux/skbuff.h>
Implementation: <kernel src>/net/core/skbuff.c
A pointer to the network device:
  struct net_device *dev;
Pointers to the protocol headers:
  sk_buff_data_t transport_header, network_header, mac_header;
Pointers into the packet data:
  sk_buff_data_t tail, end;
  unsigned char *head, *data;
struct sk_buff
struct sk_buff
struct sk_buff_head
head/data/tail/end fields
head: start of the packet buffer
data: start of the packet payload
tail: end of the packet payload
end: end of the packet buffer
len: amount of data the packet contains
sk_buff functions
struct sk_buff *alloc_skb(unsigned int size, int gfp_mask)
void kfree_skb(struct sk_buff *skb)
struct sk_buff *skb_get(struct sk_buff *skb)
struct sk_buff *skb_clone(struct sk_buff *skb, int gfp_mask)
struct sk_buff *skb_copy(const struct sk_buff *skb, int gfp_mask)
struct sk_buff *skb_copy_expand(const struct sk_buff *skb, int new_headroom, int new_tailroom, int gfp_mask)
int skb_cloned(struct sk_buff *skb)
int skb_shared(struct sk_buff *skb)
Operations on lists of sk_buff
struct sk_buff *skb_peek(struct sk_buff_head *list_)
struct sk_buff *skb_peek_tail(struct sk_buff_head *list_)
__u32 skb_queue_len(struct sk_buff_head *list_)
void skb_queue_head(struct sk_buff_head *list_, struct sk_buff *newsk)
void skb_queue_tail(struct sk_buff_head *list_, struct sk_buff *newsk)
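The list operations above can be sketched in userspace C. This is a simplified model, not kernel code: the `mock_` names are hypothetical stand-ins that mirror the kernel's circular doubly linked list layout, where the list head shares the next/prev layout of an element.

```c
#include <assert.h>
#include <stddef.h>

/* Userspace model of the kernel's sk_buff list handling: sk_buff_head is
 * the head of a circular doubly linked list, and the operations mirror
 * skb_queue_head()/skb_queue_tail()/skb_peek(). Simplified sketch. */
struct mock_skb {
    struct mock_skb *next, *prev;
    int id; /* stands in for the packet payload */
};

struct mock_skb_head {
    struct mock_skb *next, *prev; /* same layout trick as the kernel */
    unsigned int qlen;
};

static void mock_queue_init(struct mock_skb_head *list)
{
    list->next = list->prev = (struct mock_skb *)list;
    list->qlen = 0;
}

/* skb_queue_head(): insert right after the head (front of the queue) */
static void mock_queue_head(struct mock_skb_head *list, struct mock_skb *newsk)
{
    struct mock_skb *head = (struct mock_skb *)list;
    newsk->next = head->next;
    newsk->prev = head;
    head->next->prev = newsk;
    head->next = newsk;
    list->qlen++;
}

/* skb_queue_tail(): insert right before the head (back of the queue) */
static void mock_queue_tail(struct mock_skb_head *list, struct mock_skb *newsk)
{
    struct mock_skb *head = (struct mock_skb *)list;
    newsk->prev = head->prev;
    newsk->next = head;
    head->prev->next = newsk;
    head->prev = newsk;
    list->qlen++;
}

/* skb_peek(): first element, or NULL when the queue is empty */
static struct mock_skb *mock_peek(struct mock_skb_head *list)
{
    struct mock_skb *skb = list->next;
    return skb == (struct mock_skb *)list ? NULL : skb;
}
```

An empty queue points back at its own head, which is why skb_peek() must compare against the head before dereferencing.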
Operations on sk_buff data
unsigned char *skb_put(struct sk_buff *skb, int len)
unsigned char *skb_push(struct sk_buff *skb, int len)
unsigned char *skb_pull(struct sk_buff *skb, int len)
void skb_reserve(struct sk_buff *skb, int len)
int skb_headroom(struct sk_buff *skb)
int skb_tailroom(struct sk_buff *skb)
struct sk_buff *skb_cow(struct sk_buff *skb, int headroom)
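The head/data/tail/end arithmetic behind these operations can be sketched in userspace C. This is a mock, not the kernel implementation: a plain array plays the role of the kmalloc'd data area, and the `mock_` names are hypothetical mirrors of the kernel API.

```c
#include <assert.h>
#include <string.h>

/* Userspace sketch of the pointer arithmetic behind skb_reserve(),
 * skb_put(), skb_push() and skb_pull(). No bounds checking, as in the
 * kernel these operations trap on overflow via skb_over_panic(). */
#define MOCK_SKB_SIZE 256

struct mock_skb {
    unsigned char buf[MOCK_SKB_SIZE];
    unsigned char *head, *data, *tail, *end;
    unsigned int len;
};

static void mock_skb_init(struct mock_skb *skb)
{
    skb->head = skb->data = skb->tail = skb->buf;
    skb->end = skb->buf + MOCK_SKB_SIZE;
    skb->len = 0;
}

/* skb_reserve(): open headroom by moving data and tail forward */
static void mock_skb_reserve(struct mock_skb *skb, int len)
{
    skb->data += len;
    skb->tail += len;
}

/* skb_put(): append len bytes at the tail, return where they start */
static unsigned char *mock_skb_put(struct mock_skb *skb, int len)
{
    unsigned char *old_tail = skb->tail;
    skb->tail += len;
    skb->len += len;
    return old_tail;
}

/* skb_push(): prepend len bytes in front of data (e.g. a header) */
static unsigned char *mock_skb_push(struct mock_skb *skb, int len)
{
    skb->data -= len;
    skb->len += len;
    return skb->data;
}

/* skb_pull(): strip len bytes from the front (e.g. a parsed header) */
static unsigned char *mock_skb_pull(struct mock_skb *skb, int len)
{
    skb->data += len;
    skb->len -= len;
    return skb->data;
}

static int mock_skb_headroom(const struct mock_skb *skb) { return skb->data - skb->head; }
static int mock_skb_tailroom(const struct mock_skb *skb) { return skb->end - skb->tail; }
```

Note that only the pointers move; the payload bytes are written once and headers are built around them, which is what makes the scheme copy-free.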
alloc_skb
skb = kmem_cache_alloc(skbuff_head_cache, gfp_mask & ~__GFP_DMA);
...
size = SKB_DATA_ALIGN(size);
data = kmalloc(size + sizeof(struct skb_shared_info), gfp_mask);
a) skb_put()
b) skb_push()
c) skb_pull()
d) skb_reserve()
skb_reserve
skb_reserve(skb, 2)
The Ethernet header is 14 bytes; reserving 2 bytes of headroom keeps the following IP header aligned on a 16-byte boundary.
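The alignment arithmetic can be made explicit with a small sketch. The helper below is hypothetical, but the constants match the kernel's ETH_HLEN and NET_IP_ALIGN definitions.

```c
#include <assert.h>

/* DMA places the 14-byte Ethernet header at the start of the data area,
 * so without adjustment the IP header lands at offset 14. Reserving
 * NET_IP_ALIGN (2) bytes first moves it to offset 16, a 16-byte-aligned
 * boundary, so the stack reads IP fields on aligned addresses. */
#define ETH_HLEN 14
#define NET_IP_ALIGN 2

static int ip_header_offset(int reserved_headroom)
{
    return reserved_headroom + ETH_HLEN;
}
```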
TCP to IP sk_buff operations
UDP to IP sk_buff operations
skb_clone
When an ingress packet needs to be delivered to multiple recipients
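The clone-versus-copy distinction can be modeled in userspace C. This is a simplified sketch with hypothetical `mock_` names: a clone gets its own header but shares the packet data through a reference count (which the kernel keeps in skb_shared_info), while a copy duplicates the data as well.

```c
#include <assert.h>
#include <stdlib.h>
#include <string.h>

/* Userspace model of skb_clone() vs skb_copy() data sharing. */
struct mock_data {
    int users;              /* reference count on the shared data area */
    unsigned char bytes[64];
};

struct mock_skb {
    struct mock_data *data;
};

static struct mock_skb *mock_alloc_skb(void)
{
    struct mock_skb *skb = malloc(sizeof(*skb));
    skb->data = calloc(1, sizeof(*skb->data));
    skb->data->users = 1;
    return skb;
}

/* skb_clone(): new header, shared data, bumped refcount */
static struct mock_skb *mock_skb_clone(struct mock_skb *skb)
{
    struct mock_skb *n = malloc(sizeof(*n));
    n->data = skb->data;
    n->data->users++;
    return n;
}

/* skb_copy(): new header and a private copy of the data */
static struct mock_skb *mock_skb_copy(const struct mock_skb *skb)
{
    struct mock_skb *n = mock_alloc_skb();
    memcpy(n->data->bytes, skb->data->bytes, sizeof(n->data->bytes));
    return n;
}

/* kfree_skb(): free the data only when the last user drops it */
static void mock_kfree_skb(struct mock_skb *skb)
{
    if (--skb->data->users == 0)
        free(skb->data);
    free(skb);
}
```

This is why a recipient that only reads the packet can take a cheap clone, while one that modifies the payload needs a full copy.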
a) pskb_copy()
b) skb_copy()
Network Device Driver
Network Device Driver
struct net_device
Represents a network interface card
Header: <linux/netdevice.h>
Name and index of the network device
State of the device
Device mtu: maximum transmission unit, the maximum size of frame the device can handle
Pointers to device driver functions
Network device interface
Kernel net_device structure
dev_base stores the registered network devices
struct net_device
Activation: open, close, ioctl
Data transfer: hard_start_xmit, poll
Watchdog: tx_timeout, watchdog_timeo
Statistics: get_stats, get_wireless_stats
Configuration: ethtool_ops, change_mtu
Bus specific: mem_start, mem_end
struct net_device_ops
struct net_device
struct net_device {
    char name[IFNAMSIZ];
    struct hlist_node name_hlist;
    char *ifalias;
    unsigned long mem_end, mem_start, base_addr;
    unsigned int irq;
    unsigned char if_port;
    unsigned char dma;
    unsigned long state;
    struct list_head dev_list, napi_list, unreg_list;
    unsigned long features;
    int ifindex, iflink;
    struct net_device_stats stats;
#ifdef CONFIG_WIRELESS_EXT
    const struct iw_handler_def *wireless_handlers;
    struct iw_public_data *wireless_data;
#endif
    const struct net_device_ops *netdev_ops;
    const struct ethtool_ops *ethtool_ops;
    const struct header_ops *header_ops;
struct net_device (cont'd)
    unsigned int flags;
    unsigned short gflags, priv_flags, padded;
    unsigned char operstate, link_mode;
    unsigned mtu;
    unsigned short type;
    unsigned short hard_header_len;
    unsigned short needed_headroom, needed_tailroom;
    struct net_device *master;
    unsigned char perm_addr[MAX_ADDR_LEN], addr_len;
    unsigned short dev_id;
    struct netdev_hw_addr_list uc;
    int uc_promisc;
    spinlock_t addr_list_lock;
    struct dev_addr_list *mc_list;
    int mc_count;
    unsigned int promiscuity, allmulti;
    ...
struct net_device_ops
struct net_device_ops {
    int (*ndo_init)(struct net_device *dev);
    void (*ndo_uninit)(struct net_device *dev);
    int (*ndo_open)(struct net_device *dev);
    int (*ndo_stop)(struct net_device *dev);
    netdev_tx_t (*ndo_start_xmit)(struct sk_buff *skb, struct net_device *dev);
    u16 (*ndo_select_queue)(struct net_device *dev, struct sk_buff *skb);
    void (*ndo_change_rx_flags)(struct net_device *dev, int flags);
    void (*ndo_set_rx_mode)(struct net_device *dev);
    void (*ndo_set_multicast_list)(struct net_device *dev);
    int (*ndo_set_mac_address)(struct net_device *dev, void *addr);
    int (*ndo_validate_addr)(struct net_device *dev);
    int (*ndo_do_ioctl)(struct net_device *dev, struct ifreq *ifr, int cmd);
    int (*ndo_set_config)(struct net_device *dev, struct ifmap *map);
    int (*ndo_change_mtu)(struct net_device *dev, int new_mtu);
    int (*ndo_neigh_setup)(struct net_device *dev, struct neigh_parms *);
    void (*ndo_tx_timeout)(struct net_device *dev);
    struct net_device_stats *(*ndo_get_stats)(struct net_device *dev);
    void (*ndo_vlan_rx_register)(struct net_device *dev, struct vlan_group *grp);
    void (*ndo_vlan_rx_add_vid)(struct net_device *dev, unsigned short vid);
    void (*ndo_vlan_rx_kill_vid)(struct net_device *dev, unsigned short vid);
Interrupt based device driver
Device driver code flow
1. Detecting the hardware device:
Once a network driver is loaded into the kernel, it probes for the hardware device it supports (I/O ports and IRQ line). The devices found are registered with the kernel.
2. Registration with the kernel:
Usually a Linux driver registers itself with the kernel once it is loaded. During registration it asks for a unique major/minor number, and a corresponding file appears in the /dev directory with that major/minor number (e.g. /dev/hda for a hard disk partition). A network driver, however, does not ask for major/minor numbers when it is loaded. There is no "everything is a file" concept for network devices (there is no /dev/eth0 file analogous to /dev/hda). Instead, the network driver inserts a data structure (struct net_device) for each newly detected interface into a global list of network devices. This structure describes the characteristics of the device.
3. Filling in the net_device structure:
The kernel takes care of some Ethernet defaults through the ether_setup() function, which fills several fields in the net_device structure. Device-specific fields are filled in by the driver itself.
Device driver code flow (cont’d)
4. Opening the device ("open" method):
(a) The driver requests and is allocated its memory region and IRQs.
(b) The hardware address (popularly known as the "MAC address") is copied from the real hardware into the net_device structure.
(c) The transmit queue of the device is started (netif_start_queue) so that it accepts packets for transmission.
Note: Before the network device can be used, it must be opened by the kernel in response to an "ifconfig / ifup" command. This command assigns an IP address to the device and brings the device up. Assigning the IP address happens at OSI layer 3 (the network layer, IP), so the device driver (OSI layer 2, MAC) has nothing to do with that. But to bring the device up, the IFF_UP flag of the net_device structure is set, and the kernel calls the open method of the device to do so.
Device driver code flow (cont’d)
5. Transmission of a packet ("hard_start_xmit" method):
(a) Whenever the kernel needs to transmit a data packet, it calls the "hard_start_xmit" method to put the data on an outgoing queue.
(b) The kernel puts the data (packet) in the form of a structure called the socket buffer (struct sk_buff).
(c) The device driver does not modify this data; it performs some sanity checks only, then transmits the data by calling highly hardware-dependent routines of the device.
Note 1: The "hard_start_xmit" function is protected from concurrent calls by a spinlock (xmit_lock).
Note 2: The hardware interface (Ethernet card) has limited memory for outgoing packets. When this memory is exhausted, the driver tells the kernel (netif_stop_queue) not to start any more transmissions until the hardware is ready to accept new data. Once the driver has stopped its queue, it must arrange to restart the queue at some point in the future, when it is again able to accept packets for transmission. To do so, it calls netif_wake_queue.
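The stop/wake cycle described in Note 2 can be sketched as a userspace model. The `mock_` names and the single-flag queue state are hypothetical simplifications of the kernel's netif_stop_queue()/netif_wake_queue() machinery.

```c
#include <assert.h>

/* Userspace sketch of transmit flow control: hard_start_xmit stops the
 * queue when its hardware ring fills, and the tx-completion path wakes
 * it once a descriptor is reclaimed. The "stopped" flag mirrors the
 * kernel's __QUEUE_STATE_XOFF bit. */
#define RING_SIZE 4

struct mock_tx_queue {
    int in_flight;  /* descriptors handed to the "hardware" */
    int stopped;    /* netif_stop_queue() state */
};

static void mock_netif_stop_queue(struct mock_tx_queue *q) { q->stopped = 1; }
static void mock_netif_wake_queue(struct mock_tx_queue *q) { q->stopped = 0; }
static int mock_netif_queue_stopped(struct mock_tx_queue *q) { return q->stopped; }

/* hard_start_xmit: 0 on success, -1 if the queue is (or becomes) full */
static int mock_hard_start_xmit(struct mock_tx_queue *q)
{
    if (mock_netif_queue_stopped(q))
        return -1;                /* NETDEV_TX_BUSY in the kernel */
    q->in_flight++;
    if (q->in_flight == RING_SIZE)
        mock_netif_stop_queue(q); /* no room for another packet */
    return 0;
}

/* tx completion interrupt: reclaim one descriptor, restart the queue */
static void mock_tx_complete(struct mock_tx_queue *q)
{
    q->in_flight--;
    if (mock_netif_queue_stopped(q))
        mock_netif_wake_queue(q);
}
```

Stopping the queue proactively when the ring fills, rather than rejecting packets, is what keeps the qdisc layer from busy-retrying against a full device.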
Device driver code flow (cont’d)
Note 3: If the current system time exceeds the device's "trans_start" time (set when a packet is transmitted) by at least the timeout period, the networking layer will eventually call the driver's "tx_timeout" method. That method's job is to clear up the problem and to ensure the proper completion of any transmissions that were already in progress.
6. Reception of a packet:
(a) When a packet arrives at the hardware, it triggers the corresponding interrupt, and the interrupt handling routine of the driver is called.
(b) This routine receives a pointer to the data and its length (the packet), which are already available in memory. Its responsibility is to send the packet to the upper layers of the networking code.
Device driver code flow (cont’d)
7. Closing/releasing/stopping the device ("stop" method):
(a) The driver releases the allocated memory and IRQs.
(b) The transmit queue of the device is stopped (netif_stop_queue) from accepting packets for transmission.
Note: This method is called when we issue the "ifdown <dev>" command.
8. Changes in link state:
The networking subsystem needs to know when network links go up or down, and it provides a few functions that the driver may use to convey that information: netif_carrier_off, netif_carrier_on and netif_carrier_ok.
igb_main.c module constructor
static int __init igb_init_module(void)
{
    int ret;
    printk(KERN_INFO "%s - version %s\n", igb_driver_string, igb_driver_version);
    printk(KERN_INFO "%s\n", igb_copyright);
#ifdef IGB_DCA
    dca_register_notify(&dca_notifier);
#endif
    ret = pci_register_driver(&igb_driver);
#ifdef USE_REBOOT_NOTIFIER
    if (ret >= 0) {
        register_reboot_notifier(&igb_notifier_reboot);
    }
#endif
#ifdef ENABLE_TNAPI
    thread_proc_init();
#endif
    return ret;
}
module_init(igb_init_module);
igb_driver variable
static struct pci_driver igb_driver = {
    .name = igb_driver_name,
    .id_table = igb_pci_tbl,
    .probe = igb_probe,
    .remove = __devexit_p(igb_remove),
#ifdef CONFIG_PM
    /* Power Management Hooks */
    .suspend = igb_suspend,
    .resume = igb_resume,
#endif
#ifndef USE_REBOOT_NOTIFIER
    .shutdown = igb_shutdown,
#endif
#ifdef HAVE_PCI_ERS
    .err_handler = &igb_err_handler,
#endif
};
igb_netdev_ops variable
static const struct net_device_ops igb_netdev_ops = {
.ndo_open = igb_open,
.ndo_stop = igb_close,
.ndo_start_xmit = igb_xmit_frame_adv,
.ndo_get_stats = igb_get_stats,
.ndo_set_rx_mode = igb_set_rx_mode,
.ndo_set_multicast_list = igb_set_rx_mode,
.ndo_set_mac_address = igb_set_mac,
.ndo_change_mtu = igb_change_mtu,
.ndo_do_ioctl = igb_ioctl,
.ndo_tx_timeout = igb_tx_timeout,
.ndo_validate_addr = eth_validate_addr,
.ndo_vlan_rx_register = igb_vlan_rx_register,
.ndo_vlan_rx_add_vid = igb_vlan_rx_add_vid,
.ndo_vlan_rx_kill_vid = igb_vlan_rx_kill_vid,
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = igb_netpoll,
#endif
};
igb_main.c module destructor
static void __exit igb_exit_module(void)
{
#ifdef IGB_DCA
    dca_unregister_notify(&dca_notifier);
#endif
#ifdef USE_REBOOT_NOTIFIER
    unregister_reboot_notifier(&igb_notifier_reboot);
#endif
    pci_unregister_driver(&igb_driver);
#ifdef ENABLE_TNAPI
    thread_proc_term();
#endif
}
module_exit(igb_exit_module);
printk: kernel's printf
Header: <linux/kernel.h>
Arguments are the same as printf
Format specifiers: similar to printf, but no float or double
An initial 3-character sequence selects the log level:
KERN_EMERG   "<0>"  /* system is unusable */
KERN_ALERT   "<1>"  /* action must be taken immediately */
KERN_CRIT    "<2>"  /* critical conditions */
KERN_ERR     "<3>"  /* error conditions */
KERN_WARNING "<4>"  /* warning conditions */
KERN_NOTICE  "<5>"  /* normal but significant conditions */
KERN_INFO    "<6>"  /* informational */
KERN_DEBUG   "<7>"  /* debug-level messages */
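The KERN_* level markers are plain string literals, so `printk(KERN_INFO "up\n")` works by C string-literal concatenation: the compiler glues "<6>" onto the format string. A userspace sketch (the `mock_` helpers are hypothetical; only the prefix convention is real):

```c
#include <assert.h>
#include <stdio.h>
#include <string.h>

/* The macros below shadow the kernel names for the demonstration. */
#define KERN_ERR  "<3>"
#define KERN_INFO "<6>"

/* mock printk: format into a buffer so the level prefix is visible */
static int mock_printk(char *out, size_t outlen, const char *fmt, const char *arg)
{
    return snprintf(out, outlen, fmt, arg);
}

/* extract the log level encoded in the first three characters */
static int mock_log_level(const char *msg)
{
    if (msg[0] == '<' && msg[2] == '>')
        return msg[1] - '0';
    return 4; /* messages without a prefix get a default level */
}
```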
Module
Basic headers:
<linux/module.h>
<linux/version.h>
<linux/kernel.h>
Module macros:
MODULE_DEVICE_TABLE(pci, igb_pci_tbl);
MODULE_AUTHOR()
MODULE_DESCRIPTION()
MODULE_LICENSE()
MODULE_VERSION()
Net device registration
Header: <linux/netdevice.h>
Net device storage:
struct net_device *alloc_etherdev(sizeof_priv)
Registering the net device:
int register_netdev(struct net_device *)
void unregister_netdev(struct net_device *)
igb.h struct igb_adapter
struct igb_adapter {
    struct timer_list watchdog_timer, phy_info_timer;
    struct vlan_group *vlgrp;
    u16 mng_vlan_id;
    u32 bd_number, wol, en_mng_pt;
    u16 link_speed, link_duplex;
    unsigned int total_tx_bytes, total_tx_packets, total_rx_bytes, total_rx_packets;
    /* Interrupt Throttle Rate */
    u32 itr, itr_setting;
    u16 tx_itr, rx_itr;
    struct work_struct reset_task, watchdog_task;
    bool fc_autoneg;
    u8 tx_timeout_factor;
#ifdef ETHTOOL_PHYS_ID
    struct timer_list blink_timer;
    unsigned long led_status;
#endif
struct igb_adapter (cont'd)
    /* TX */
    struct igb_ring *tx_ring;  /* One per active queue */
    unsigned int restart_queue;
    unsigned long tx_queue_len;
    u32 tx_timeout_count;
    /* RX */
    struct igb_ring *rx_ring;  /* One per active queue */
    int num_tx_queues, num_rx_queues;
    u64 hw_csum_err, hw_csum_good;
    u32 alloc_rx_buff_failed, max_frame_size, min_frame_size;
    /* OS defined structs */
    struct net_device *netdev;
    struct pci_dev *pdev;
    struct net_device_stats net_stats;
driver init
Init:
allocate ring buffers and associated sk_buffs
allocate and initialize the net_device
register the net_device
get the MAC address and so on from the device EEPROM
request firmware download, if needed:
  int request_firmware(fw, name, device)
register a packet receive interrupt
(mostly postponed until the device is opened)
igb_open()
static int igb_open(struct net_device *netdev)
{
    struct igb_adapter *adapter = netdev_priv(netdev);
    struct e1000_hw *hw = &adapter->hw;
    int err, i;
    if (test_bit(__IGB_TESTING, &adapter->state))
        return -EBUSY;
    err = igb_setup_all_tx_resources(adapter);
    if (err)
        goto err_setup_tx;
    err = igb_setup_all_rx_resources(adapter);
    if (err)
        goto err_setup_rx;
    igb_configure(adapter);
    err = igb_request_irq(adapter);
    if (err)
        goto err_req_irq;
    clear_bit(__IGB_DOWN, &adapter->state);
    for (i = 0; i < adapter->num_q_vectors; i++) {
        struct igb_q_vector *q_vector = adapter->q_vector[i];
        napi_enable(&q_vector->napi);
    }
igb_open() (cont'd)
    igb_configure_lli(adapter);
    E1000_READ_REG(hw, E1000_ICR);
    igb_irq_enable(adapter);
    if (adapter->vfs_allocated_count) {
        u32 reg_data = E1000_READ_REG(hw, E1000_CTRL_EXT);
        reg_data |= E1000_CTRL_EXT_PFRSTD;
        E1000_WRITE_REG(hw, E1000_CTRL_EXT, reg_data);
    }
    netif_tx_start_all_queues(netdev);
    hw->mac.get_link_status = 1;
    mod_timer(&adapter->watchdog_timer, jiffies + 1);
    return E1000_SUCCESS;
err_req_irq:
    igb_release_hw_control(adapter);
    igb_free_all_rx_resources(adapter);
err_setup_rx:
    igb_free_all_tx_resources(adapter);
err_setup_tx:
    igb_reset(adapter);
    return err;
}
igb_close()
static int igb_close(struct net_device *netdev)
{
    struct igb_adapter *adapter = netdev_priv(netdev);
    WARN_ON(test_bit(__IGB_RESETTING, &adapter->state));
    igb_down(adapter);
    igb_free_irq(adapter);
    igb_free_all_tx_resources(adapter);
    igb_free_all_rx_resources(adapter);
    return 0;
}
igb_xmit_frame_adv()
static netdev_tx_t igb_xmit_frame_adv(struct sk_buff *skb, struct net_device *netdev)
{
    struct igb_adapter *adapter = netdev_priv(netdev);
    struct igb_ring *tx_ring;
#ifdef HAVE_TX_MQ
    int r_idx = 0;
    r_idx = skb->queue_mapping & (IGB_ABS_MAX_TX_QUEUES - 1);
    tx_ring = adapter->multi_tx_table[r_idx];
#else
    tx_ring = &adapter->tx_ring[0];
#endif
    /* This goes back to the question of how to logically map a tx queue
     * to a flow. Right now, performance is impacted slightly negatively
     * if using multiple tx queues. If the stack breaks away from a
     * single qdisc implementation, we can look at this again. */
    return igb_xmit_frame_ring_adv(skb, netdev, tx_ring);
}
NAPI
NAPI (New API): a device driver framework for high-speed networking
Interrupt mitigation: disable some interrupts during high traffic
Packet throttling: drop packets before further processing
NAPI interface
static inline void netif_napi_add(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight)
static inline void napi_enable(struct napi_struct *n)
static inline void napi_disable(struct napi_struct *n)
static inline void netif_rx_schedule(struct net_device *dev, struct napi_struct *napi)
static inline void netif_rx_complete(struct net_device *dev, struct napi_struct *napi)
int netif_receive_skb(struct sk_buff *skb)
NAPI in packet reception
When a new packet is available, the interrupt routine should disable any further "packet available" interrupts and tell the network subsystem to poll the driver shortly to pick up all available packets.
Arrange for polling:
void netif_rx_schedule(struct net_device *dev);
Create a poll() method in the driver:
int (*poll)(struct net_device *dev, int *budget);
poll() should process all available incoming packets. Packets should not be passed to netif_rx(); instead, use
int netif_receive_skb(struct sk_buff *skb);
NAPI in packet reception (cont'd)
A new struct net_device field called quota contains the maximum number of packets that the networking subsystem is prepared to receive from your driver at this time. Once you have exhausted that quota, no further packets should be fed to the kernel in this poll() call.
The budget parameter also places a limit on the number of packets which your driver may process. Whichever of budget and quota is lower is the real limit.
Your driver should decrement dev->quota by the number of packets it processed. The value pointed to by the budget parameter should also be decremented by the same amount.
If packets remain to be processed (i.e. the driver used its entire quota), poll() should return a value of one.
If, instead, all packets have been processed, your driver should reenable interrupts, turn off polling, and return zero. Polling is stopped with:
void netif_rx_complete(struct net_device *dev);
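The budget/quota contract above can be sketched as a userspace model. This is a hypothetical mock, with the older API's separate quota bookkeeping folded into a single budget value (as later NAPI versions do): returning the full budget means "more work remains, keep polling"; returning less means the driver finished and may re-enable interrupts.

```c
#include <assert.h>

/* Userspace model of the NAPI poll() loop. */
struct mock_napi {
    int backlog;     /* packets waiting in the rx ring */
    int irq_enabled; /* rx interrupt state */
};

/* the driver's poll() method */
static int mock_poll(struct mock_napi *n, int budget)
{
    int work_done = 0;
    while (n->backlog > 0 && work_done < budget) {
        n->backlog--;       /* "netif_receive_skb()" one packet */
        work_done++;
    }
    if (work_done < budget)
        n->irq_enabled = 1; /* napi_complete() + re-enable interrupts */
    return work_done;
}
```

Under sustained load the driver stays in polling mode and never takes an interrupt, which is exactly the interrupt-mitigation effect NAPI aims for.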
Packet receive interrupt
Receive interrupt handler: minimally handles the received packet
sanity checks
puts back the sk_buffs for re-use
passes the associated sk_buffs (and ring buffers) to the protocol layer via NET_RX_SOFTIRQ:
  int netif_rx(struct sk_buff *)
when network load is heavy, switch to poll mode, if supported
igb_intr()
static irqreturn_t igb_intr(int irq, void *data)
{
    struct igb_adapter *adapter = data;
    struct igb_q_vector *q_vector = adapter->q_vector[0];
    struct e1000_hw *hw = &adapter->hw;
    u32 icr = E1000_READ_REG(hw, E1000_ICR);
    if (!icr)
        return IRQ_NONE;
    igb_write_itr(q_vector);
    if (!(icr & E1000_ICR_INT_ASSERTED))
        return IRQ_NONE;
    if (icr & E1000_ICR_DOUTSYNC) {
        adapter->stats.doosync++;
    }
    if (icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
        hw->mac.get_link_status = 1;
        if (!test_bit(__IGB_DOWN, &adapter->state))
            mod_timer(&adapter->watchdog_timer, jiffies + 1);
    }
    napi_schedule(&q_vector->napi);
    return IRQ_HANDLED;
}
igb_intr_msi()
static irqreturn_t igb_intr_msi(int irq, void *data)
{
    struct igb_adapter *adapter = data;
    struct igb_q_vector *q_vector = adapter->q_vector[0];
    struct e1000_hw *hw = &adapter->hw;
    u32 icr = E1000_READ_REG(hw, E1000_ICR);
    igb_write_itr(q_vector);
    if (icr & E1000_ICR_DOUTSYNC) {
        adapter->stats.doosync++;
    }
    if (icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) {
        hw->mac.get_link_status = 1;
        if (!test_bit(__IGB_DOWN, &adapter->state))
            mod_timer(&adapter->watchdog_timer, jiffies + 1);
    }
    napi_schedule(&q_vector->napi);
    return IRQ_HANDLED;
}
napi_schedule()
kcompat.h:
#define napi_schedule(_napi) netif_rx_schedule(napi_to_poll_dev(_napi))
kcompat.c:
struct net_device *napi_to_poll_dev(struct napi_struct *napi)
{
    struct adapter_q_vector *q_vector = container_of(napi, struct adapter_q_vector, napi);
    return &q_vector->poll_dev;
}
igb_poll()
static int igb_poll(struct napi_struct *napi, int budget)
{
    struct igb_q_vector *q_vector = container_of(napi, struct igb_q_vector, napi);
    int tx_clean_complete = 1, work_done = 0;
#ifdef IGB_DCA
    if (q_vector->adapter->flags & IGB_FLAG_DCA_ENABLED)
        igb_update_dca(q_vector);
#endif
    if (q_vector->tx_ring)
        tx_clean_complete = igb_clean_tx_irq(q_vector);
    if (q_vector->rx_ring)
        igb_clean_rx_irq_adv(q_vector, &work_done, budget);
    if (!tx_clean_complete)
        work_done = budget;
#ifndef HAVE_NETDEV_NAPI_LIST
    if (!netif_running(q_vector->adapter->netdev))
        work_done = 0;
#endif
    if (work_done < budget) {
        napi_complete(napi);
        igb_ring_irq_enable(q_vector);
    }
    return work_done;
}
Calling sequence to install igb_poll()
igb_open()
  igb_request_irq(struct igb_adapter *adapter)
    igb_alloc_q_vectors(struct igb_adapter *adapter)
      netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
igb_sw_init(), igb_resume()
  igb_init_interrupt_scheme(struct igb_adapter *adapter)
    igb_alloc_q_vectors(struct igb_adapter *adapter)
      netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
igb_poll() will never be called if the weight is not initialized and left as zero. Gigabit adapter drivers tend to set the weight to 64; smaller values can be used for slower media.
netif_napi_add()
#define netif_napi_add(_netdev, _napi, _poll, _weight) \
do { \
    struct napi_struct *__napi = (_napi); \
    struct net_device *poll_dev = napi_to_poll_dev(__napi); \
    poll_dev->poll = &(__kc_adapter_clean); \
    poll_dev->priv = (_napi); \
    poll_dev->weight = (_weight); \
    set_bit(__LINK_STATE_RX_SCHED, &poll_dev->state); \
    set_bit(__LINK_STATE_START, &poll_dev->state); \
    dev_hold(poll_dev); \
    _netdev->poll = &(__kc_adapter_clean); \
    _netdev->weight = (_weight); \
    __napi->poll = &(_poll); \
    __napi->weight = (_weight); \
    __napi->dev = (_netdev); \
    set_bit(__LINK_STATE_RX_SCHED, &(_netdev)->state); \
} while (0)
igb_clean_rx_irq_adv()
static bool igb_clean_rx_irq_adv(struct igb_q_vector *q_vector, int *work_done, int budget)
{
    struct igb_adapter *adapter = q_vector->adapter;
    struct net_device *netdev = adapter->netdev;
    struct igb_ring *rx_ring = q_vector->rx_ring;
    struct pci_dev *pdev = rx_ring->pdev;
    union e1000_adv_rx_desc *rx_desc, *next_rxd;
    struct igb_buffer *buffer_info, *next_buffer;
    struct sk_buff *skb;
    bool cleaned = FALSE;
    int cleaned_count = 0;
    unsigned int total_bytes = 0, total_packets = 0, i;
    u32 staterr;
    u16 length;
igb_clean_rx_irq_adv() (cont'd)
    i = rx_ring->next_to_clean;
    buffer_info = &rx_ring->buffer_info[i];
    rx_desc = E1000_RX_DESC_ADV(*rx_ring, i);
    staterr = le32_to_cpu(rx_desc->wb.upper.status_error);
#ifdef ENABLE_TNAPI
    if (!adapter->tnapi.shutdown) {
        wake_up_interruptible(&adapter->tnapi.packet_waitqueue[rx_ring->queue_index]);
        return TRUE;
    }
#endif
    while (staterr & E1000_RXD_STAT_DD) {
        if (*work_done >= budget)
            break;
        (*work_done)++;
        skb = buffer_info->skb;
        prefetch(skb->data - NET_IP_ALIGN);
        buffer_info->skb = NULL;
        i++;
        if (i == rx_ring->count)
            i = 0;
        next_rxd = E1000_RX_DESC_ADV(*rx_ring, i);
        prefetch(next_rxd);
        next_buffer = &rx_ring->buffer_info[i];
        length = le16_to_cpu(rx_desc->wb.upper.length);
        cleaned = TRUE;
        cleaned_count++;
igb_clean_rx_irq_adv() (cont'd)
send_up:
#endif /* CONFIG_IGB_DISABLE_PACKET_SPLIT */
        if (staterr & E1000_RXDEXT_ERR_FRAME_ERR_MASK) {
            dev_kfree_skb_irq(skb);
            goto next_desc;
        }
#ifdef SIOCSHWTSTAMP
        igb_rx_hwtstamp(adapter, staterr, skb);
#endif
        total_bytes += skb->len;
        total_packets++;
        igb_rx_checksum_adv(rx_ring, staterr, skb);
#ifndef ETH_TYPE_TRANS_SETS_DEV
        skb->dev = netdev;
#endif
        skb->protocol = eth_type_trans(skb, netdev);
        igb_receive_skb(rx_ring, staterr, rx_desc, skb);
        netdev->last_rx = jiffies;
igb_receive_skb()
#ifdef IGB_LRO
    lro_vlan_hwaccel_receive_skb(&ring->lro_mgr, skb, adapter->vlgrp,
                                 le16_to_cpu(rx_desc->wb.upper.vlan), rx_desc);
    lro_receive_skb(&ring->lro_mgr, skb, rx_desc);
#endif
    vlan_gro_receive(&q_vector->napi, adapter->vlgrp,
                     le16_to_cpu(rx_desc->wb.upper.vlan), skb);
    napi_gro_receive(&q_vector->napi, skb);
kcompat.h:
#define napi_gro_receive(_napi, _skb) netif_receive_skb(_skb)
lro_receive_skb()
<kernel src>/net/ipv4/inet_lro.c
void lro_receive_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb, void *priv)
{
    if (__lro_proc_skb(lro_mgr, skb, NULL, 0, priv)) {
        if (lro_mgr->features & LRO_F_NAPI)
            netif_receive_skb(skb);
        else
            netif_rx(skb);
    }
}
netif_receive_skb()
<kernel src>/net/core/dev.c
int netif_receive_skb(struct sk_buff *skb)
{
    struct packet_type *ptype, *pt_prev;
    struct net_device *orig_dev, *master, *null_or_orig, *null_or_bond;
    int ret = NET_RX_DROP;
    __be16 type;
    if (!skb->tstamp.tv64)
        net_timestamp(skb);
    if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
        return NET_RX_SUCCESS;
    if (netpoll_receive_skb(skb))
        return NET_RX_DROP;
    if (!skb->skb_iif)
        skb->skb_iif = skb->dev->ifindex;
    null_or_orig = NULL;
    orig_dev = skb->dev;
    master = ACCESS_ONCE(orig_dev->master);
netif_receive_skb() (cont'd)
    if (master) {
        if (skb_bond_should_drop(skb, master))
            null_or_orig = orig_dev;
        else
            skb->dev = master;
    }
    __get_cpu_var(netdev_rx_stat).total++;
    skb_reset_network_header(skb);
    skb_reset_transport_header(skb);
    skb->mac_len = skb->network_header - skb->mac_header;
    pt_prev = NULL;
    rcu_read_lock();
#ifdef CONFIG_NET_CLS_ACT
    if (skb->tc_verd & TC_NCLS) {
        skb->tc_verd = CLR_TC_NCLS(skb->tc_verd);
        goto ncls;
    }
#endif
    list_for_each_entry_rcu(ptype, &ptype_all, list) {
        if (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
            ptype->dev == orig_dev) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }
netif_receive_skb() (cont'd)
#ifdef CONFIG_NET_CLS_ACT
    skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
ncls:
#endif
    skb = handle_bridge(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
    skb = handle_macvlan(skb, &pt_prev, &ret, orig_dev);
    if (!skb)
        goto out;
    null_or_bond = NULL;
    if ((skb->dev->priv_flags & IFF_802_1Q_VLAN) &&
        (vlan_dev_real_dev(skb->dev)->priv_flags & IFF_BONDING)) {
        null_or_bond = vlan_dev_real_dev(skb->dev);
    }
    type = skb->protocol;
netif_receive_skb() (cont'd)
    list_for_each_entry_rcu(ptype, &ptype_base[ntohs(type) & PTYPE_HASH_MASK], list) {
        if (ptype->type == type &&
            (ptype->dev == null_or_orig || ptype->dev == skb->dev ||
             ptype->dev == orig_dev || ptype->dev == null_or_bond)) {
            if (pt_prev)
                ret = deliver_skb(skb, pt_prev, orig_dev);
            pt_prev = ptype;
        }
    }
    if (pt_prev) {
        ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
    } else {
        kfree_skb(skb);
        ret = NET_RX_DROP;
    }
out:
    rcu_read_unlock();
    return ret;
}
Large segment offload
Large send offload (LSO) is a technique for increasing the outbound throughput of high-bandwidth network connections by reducing CPU overhead. It works by queuing up large buffers and letting the NIC split them into separate packets. The technique is called TCP segmentation offload (TSO) when applied to TCP, or generic segmentation offload (GSO) in the generic case. When large chunks of data are to be sent over a computer network, they first need to be broken down into smaller segments that can pass through all the network elements, such as routers and switches, between the source and destination computers. This process is referred to as segmentation, and it is often done by the TCP protocol in the host computer. Offloading this work to the network card is called TCP segmentation offload (TSO).
Large receive offload (LRO) is a technique for increasing the inbound throughput of high-bandwidth network connections by reducing CPU overhead. It works by aggregating multiple incoming packets from a single stream into a larger buffer before they are passed up the networking stack, thus reducing the number of packets that have to be processed. LRO combines multiple Ethernet frames into a single receive in the stack, thereby potentially decreasing CPU utilization for receives.
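The segmentation arithmetic that TSO/GSO offloads to the NIC can be sketched with two small helpers. These functions are hypothetical illustrations, not kernel code; they only compute how many wire packets one large send produces.

```c
#include <assert.h>

/* A large payload is cut into MSS-sized segments, each of which then
 * gets its own headers. With LSO the host hands over one buffer and
 * the NIC emits all of the segments. */
static int tso_segment_count(int payload_len, int mss)
{
    return (payload_len + mss - 1) / mss; /* ceiling division */
}

/* length of the i-th segment (0-based) */
static int tso_segment_len(int payload_len, int mss, int i)
{
    int remaining = payload_len - i * mss;
    return remaining > mss ? mss : remaining;
}
```

For a 64 KiB send over a standard 1500-byte MTU (MSS = 1500 - 40 = 1460 with plain TCP/IP headers), the host would otherwise build 45 separate packets itself; with TSO it builds one.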
LRO and GRO
Generic receive offload (GRO) attempts to replicate the success of the transmit-side offload mechanism TSO (TCP segmentation offload) on the receive side. This is crucial to the success of 10 Gb/s Ethernet, as the standard MTU of 1500 bytes imposes a huge burden on the CPU, which is no longer able to keep up without assistance. TSO is one of the techniques devised to resolve this problem on the transmit side, i.e. the side that is of most interest to servers and data producers. However, as data rates continue to increase, the receive side has also become a bottleneck. Following in the footsteps of LRO (large receive offload), GRO attempts to resolve this problem without causing conflicts with other parts of the network stack, such as forwarding and bridging.
Flow control related APIs
For interaction with the protocol layer
Header: <linux/netdevice.h>
APIs:
void netif_start_queue(struct net_device *)
void netif_stop_queue(struct net_device *)
void netif_wake_queue(struct net_device *)
int netif_queue_stopped(struct net_device *)
netif_start_queue(), netif_stop_queue()
<linux/netdevice.h>
static inline void netif_start_queue(struct net_device *dev)
{
    netif_tx_start_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_start_queue(struct netdev_queue *dev_queue)
{
    clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}
static inline void netif_stop_queue(struct net_device *dev)
{
    netif_tx_stop_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_stop_queue(struct netdev_queue *dev_queue)
{
    set_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}
netif_wake_queue(), netif_queue_stopped()
static inline void netif_wake_queue(struct net_device *dev)
{
    netif_tx_wake_queue(netdev_get_tx_queue(dev, 0));
}
static inline void netif_tx_wake_queue(struct netdev_queue *dev_queue)
{
#ifdef CONFIG_NETPOLL_TRAP
    if (netpoll_trap()) {
        netif_tx_start_queue(dev_queue);
        return;
    }
#endif
    if (test_and_clear_bit(__QUEUE_STATE_XOFF, &dev_queue->state))
        __netif_schedule(dev_queue->qdisc);
}
static inline int netif_queue_stopped(const struct net_device *dev)
{
    return netif_tx_queue_stopped(netdev_get_tx_queue(dev, 0));
}
static inline int netif_tx_queue_stopped(const struct netdev_queue *dev_queue)
{
    return test_bit(__QUEUE_STATE_XOFF, &dev_queue->state);
}
References
Linux Network Architecture
[1] C. Benvenuti, Understanding Linux Network Internals, O'Reilly Media, 2005.
[2] J. Corbet, G. Kroah-Hartman, and A. Rubini, Linux Device Drivers, 3rd ed., O'Reilly, 2005.
[3] K. Wehrle, F. Pahlke, H. Ritter et al., Linux Network Architecture, Prentice Hall, 2004.
TCP Performance
[1] A. P. Foong, T. R. Huff, H. H. Hum et al., "TCP Performance Re-Visited," in International Symposium on Performance Analysis of Systems and Software (ISPASS'03), Austin, Texas, USA, 2003.
[2] L. Grossman, "Large Receive Offload Implementation in Neterion 10GbE Ethernet Driver," in Proceedings of the Linux Symposium, Ottawa, Ontario, Canada, 2005, pp. 195-200.
[3] A. Menon and W. Zwaenepoel, "Optimizing TCP receive performance," in USENIX 2008 Annual Technical Conference, Boston, Massachusetts, USA, 2008, pp. 85-98.
Questions?