From 245f292d71d3fdd7536c2e4986769d5b9b48fb7f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 27 Jul 2012 23:49:30 +0000
Subject: ixgbe: Initialize q_vector cpu and affinity masks correctly

When enabling DCB the rings belonging to a q_vector on CPU 0 were not
reinitializing their DCA registers.  Upon closer inspection the issue was
that the q_vector CPU variable was left at 0 resulting in the driver not
updating the DCA registers.

In order to guarantee the DCA registers will be updated I am adding a
small change so that we initialize the CPU variable to -1, which will
force a DCA update the first time an interrupt fires on that q_vector.

In addition we were setting the CPU affinity hint to all CPUs when we were
not specifying a CPU.  Instead we should leave it as all zeros to avoid any
possible confusion about the fact that we shouldn't be giving a hint.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
index 17ecbcedd54..5e508b614dc 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_lib.c
@@ -802,10 +802,13 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
 	/* setup affinity mask and node */
 	if (cpu != -1)
 		cpumask_set_cpu(cpu, &q_vector->affinity_mask);
-	else
-		cpumask_copy(&q_vector->affinity_mask, cpu_online_mask);
 	q_vector->numa_node = node;
 
+#ifdef CONFIG_IXGBE_DCA
+	/* initialize CPU for DCA */
+	q_vector->cpu = -1;
+
+#endif
 	/* initialize NAPI */
 	netif_napi_add(adapter->netdev, &q_vector->napi,
 		       ixgbe_poll, 64);
-- 
cgit v1.2.3


From 872844ddb9e44a49b759ae3e34250fefbab656f2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Wed, 15 Aug 2012 02:10:43 +0000
Subject: ixgbe: Enable jumbo frames support w/ SR-IOV

This change makes it so that we can have limited support for jumbo frames
when SR-IOV is enabled.  In order to accomplish this it is necessary to
disable all VFs when the PF has jumbo frames enabled.  If the VFs then
request the same maximum frame size as the PF they will be re-enabled.  A
follow on patch will add a means of identifying when a VF can support
spanning buffers and does not need to be worried about the actual supported
max frame size.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Ross Brattain <ross.b.brattain@intel.com>
Tested-by: Robert Garrett <robertx.e.garrett@intel.com>
Tested-by: Sibai Li <Sibai.li@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c  |   4 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |  13 ++-
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 107 ++++++++++++++++++-------
 3 files changed, 93 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
index ae73ef14fdf..252850d9a3e 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_fcoe.c
@@ -800,6 +800,10 @@ int ixgbe_fcoe_enable(struct net_device *netdev)
 		return -EINVAL;
 
 	e_info(drv, "Enabling FCoE offload features.\n");
+
+	if (adapter->flags & IXGBE_FLAG_SRIOV_ENABLED)
+		e_warn(probe, "Enabling FCoE on PF will disable legacy VFs\n");
+
 	if (netif_running(netdev))
 		netdev->netdev_ops->ndo_stop(netdev);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index fa3d552e1f4..e2a6691cbd7 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -3263,6 +3263,11 @@ static void ixgbe_set_rx_buffer_len(struct ixgbe_adapter *adapter)
 		max_frame = IXGBE_FCOE_JUMBO_FRAME_SIZE;
 
 #endif /* IXGBE_FCOE */
+
+	/* adjust max frame to be at least the size of a standard frame */
+	if (max_frame < (ETH_FRAME_LEN + ETH_FCS_LEN))
+		max_frame = (ETH_FRAME_LEN + ETH_FCS_LEN);
+
 	mhadd = IXGBE_READ_REG(hw, IXGBE_MHADD);
 	if (max_frame != (mhadd >> IXGBE_MHADD_MFS_SHIFT)) {
 		mhadd &= ~IXGBE_MHADD_MFS_MASK;
@@ -4828,14 +4833,14 @@ static int ixgbe_change_mtu(struct net_device *netdev, int new_mtu)
 		return -EINVAL;
 
 	/*
-	 * For 82599EB we cannot allow PF to change MTU greater than 1500
-	 * in SR-IOV mode as it may cause buffer overruns in guest VFs that
-	 * don't allocate and chain buffers correctly.
+	 * For 82599EB we cannot allow legacy VFs to enable their receive
+	 * paths when MTU greater than 1500 is configured.  So display a
+	 * warning that legacy VFs will be disabled.
 	 */
 	if ((adapter->flags & IXGBE_FLAG_SRIOV_ENABLED) &&
 	    (adapter->hw.mac.type == ixgbe_mac_82599EB) &&
 	    (max_frame > MAXIMUM_ETHERNET_VLAN_SIZE))
-			return -EINVAL;
+		e_warn(probe, "Setting MTU > 1500 will disable legacy VFs\n");
 
 	e_info(probe, "changing MTU from %d to %d\n", netdev->mtu, new_mtu);
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index dce48bf64d9..420766e28ec 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -150,16 +150,6 @@ void ixgbe_enable_sriov(struct ixgbe_adapter *adapter,
 		adapter->flags2 &= ~(IXGBE_FLAG2_RSC_CAPABLE |
 				     IXGBE_FLAG2_RSC_ENABLED);
 
-#ifdef IXGBE_FCOE
-		/*
-		 * When SR-IOV is enabled 82599 cannot support jumbo frames
-		 * so we must disable FCoE because we cannot support FCoE MTU.
-		 */
-		if (adapter->hw.mac.type == ixgbe_mac_82599EB)
-			adapter->flags &= ~(IXGBE_FLAG_FCOE_ENABLED |
-					    IXGBE_FLAG_FCOE_CAPABLE);
-#endif
-
 		/* enable spoof checking for all VFs */
 		for (i = 0; i < adapter->num_vfs; i++)
 			adapter->vfinfo[i].spoofchk_enabled = true;
@@ -353,31 +343,77 @@ static int ixgbe_set_vf_vlan(struct ixgbe_adapter *adapter, int add, int vid,
 	return adapter->hw.mac.ops.set_vfta(&adapter->hw, vid, vf, (bool)add);
 }
 
-static void ixgbe_set_vf_lpe(struct ixgbe_adapter *adapter, u32 *msgbuf)
+static s32 ixgbe_set_vf_lpe(struct ixgbe_adapter *adapter, u32 *msgbuf, u32 vf)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
-	int new_mtu = msgbuf[1];
+	int max_frame = msgbuf[1];
 	u32 max_frs;
-	int max_frame = new_mtu + ETH_HLEN + ETH_FCS_LEN;
 
-	/* Only X540 supports jumbo frames in IOV mode */
-	if (adapter->hw.mac.type != ixgbe_mac_X540)
-		return;
+	/*
+	 * For 82599EB we have to keep all PFs and VFs operating with
+	 * the same max_frame value in order to avoid sending an oversize
+	 * frame to a VF.  In order to guarantee this is handled correctly
+	 * for all cases we have several special exceptions to take into
+	 * account before we can enable the VF for receive
+	 */
+	if (adapter->hw.mac.type == ixgbe_mac_82599EB) {
+		struct net_device *dev = adapter->netdev;
+		int pf_max_frame = dev->mtu + ETH_HLEN;
+		u32 reg_offset, vf_shift, vfre;
+		s32 err = 0;
+
+#ifdef CONFIG_FCOE
+		if (dev->features & NETIF_F_FCOE_MTU)
+			pf_max_frame = max_t(int, pf_max_frame,
+					     IXGBE_FCOE_JUMBO_FRAME_SIZE);
+
+#endif /* CONFIG_FCOE */
+		/*
+		 * If the PF or VF are running w/ jumbo frames enabled we
+		 * need to shut down the VF Rx path as we cannot support
+		 * jumbo frames on legacy VFs
+		 */
+		if ((pf_max_frame > ETH_FRAME_LEN) ||
+		    (max_frame > (ETH_FRAME_LEN + ETH_FCS_LEN)))
+			err = -EINVAL;
+
+		/* determine VF receive enable location */
+		vf_shift = vf % 32;
+		reg_offset = vf / 32;
+
+		/* enable or disable receive depending on error */
+		vfre = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset));
+		if (err)
+			vfre &= ~(1 << vf_shift);
+		else
+			vfre |= 1 << vf_shift;
+		IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), vfre);
+
+		if (err) {
+			e_err(drv, "VF max_frame %d out of range\n", max_frame);
+			return err;
+		}
+	}
 
 	/* MTU < 68 is an error and causes problems on some kernels */
-	if ((new_mtu < 68) || (max_frame > IXGBE_MAX_JUMBO_FRAME_SIZE)) {
-		e_err(drv, "VF mtu %d out of range\n", new_mtu);
-		return;
+	if (max_frame > IXGBE_MAX_JUMBO_FRAME_SIZE) {
+		e_err(drv, "VF max_frame %d out of range\n", max_frame);
+		return -EINVAL;
 	}
 
-	max_frs = (IXGBE_READ_REG(hw, IXGBE_MAXFRS) &
-		   IXGBE_MHADD_MFS_MASK) >> IXGBE_MHADD_MFS_SHIFT;
-	if (max_frs < new_mtu) {
-		max_frs = new_mtu << IXGBE_MHADD_MFS_SHIFT;
+	/* pull current max frame size from hardware */
+	max_frs = IXGBE_READ_REG(hw, IXGBE_MAXFRS);
+	max_frs &= IXGBE_MHADD_MFS_MASK;
+	max_frs >>= IXGBE_MHADD_MFS_SHIFT;
+
+	if (max_frs < max_frame) {
+		max_frs = max_frame << IXGBE_MHADD_MFS_SHIFT;
 		IXGBE_WRITE_REG(hw, IXGBE_MAXFRS, max_frs);
 	}
 
-	e_info(hw, "VF requests change max MTU to %d\n", new_mtu);
+	e_info(hw, "VF requests change max MTU to %d\n", max_frame);
+
+	return 0;
 }
 
 static void ixgbe_set_vmolr(struct ixgbe_hw *hw, u32 vf, bool aupe)
@@ -532,11 +568,28 @@ static inline void ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf)
 
 	/* enable transmit and receive for vf */
 	reg = IXGBE_READ_REG(hw, IXGBE_VFTE(reg_offset));
-	reg |= (reg | (1 << vf_shift));
+	reg |= 1 << vf_shift;
 	IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), reg);
 
 	reg = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset));
-	reg |= (reg | (1 << vf_shift));
+	reg |= 1 << vf_shift;
+	/*
+	 * The 82599 cannot support a mix of jumbo and non-jumbo PF/VFs.
+	 * For more info take a look at ixgbe_set_vf_lpe
+	 */
+	if (adapter->hw.mac.type == ixgbe_mac_82599EB) {
+		struct net_device *dev = adapter->netdev;
+		int pf_max_frame = dev->mtu + ETH_HLEN;
+
+#ifdef CONFIG_FCOE
+		if (dev->features & NETIF_F_FCOE_MTU)
+			pf_max_frame = max_t(int, pf_max_frame,
+					     IXGBE_FCOE_JUMBO_FRAME_SIZE);
+
+#endif /* CONFIG_FCOE */
+		if (pf_max_frame > ETH_FRAME_LEN)
+			reg &= ~(1 << vf_shift);
+	}
 	IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), reg);
 
 	/* Enable counting of spoofed packets in the SSVPC register */
@@ -633,7 +686,7 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 		                                 hash_list, vf);
 		break;
 	case IXGBE_VF_SET_LPE:
-		ixgbe_set_vf_lpe(adapter, msgbuf);
+		retval = ixgbe_set_vf_lpe(adapter, msgbuf, vf);
 		break;
 	case IXGBE_VF_SET_VLAN:
 		add = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK)
-- 
cgit v1.2.3


From 58a02beedf14776b055d1eee72fcc1c64ebc79d6 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 20 Jul 2012 08:09:17 +0000
Subject: ixgbe: Move message handling routines into their own functions

Instead of trying to maintain one large monolithic function that handles
most of the different messages from the VF it makes sense to break the
message handling function up so that we can just go through one switch
statement and call the correct routine for a given message.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 223 ++++++++++++++-----------
 1 file changed, 130 insertions(+), 93 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 420766e28ec..d641c04cde5 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -255,8 +255,11 @@ void ixgbe_disable_sriov(struct ixgbe_adapter *adapter)
 }
 
 static int ixgbe_set_vf_multicasts(struct ixgbe_adapter *adapter,
-				   int entries, u16 *hash_list, u32 vf)
+				   u32 *msgbuf, u32 vf)
 {
+	int entries = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK)
+		       >> IXGBE_VT_MSGINFO_SHIFT;
+	u16 *hash_list = (u16 *)&msgbuf[1];
 	struct vf_data_storage *vfinfo = &adapter->vfinfo[vf];
 	struct ixgbe_hw *hw = &adapter->hw;
 	int i;
@@ -557,20 +560,31 @@ int ixgbe_vf_configuration(struct pci_dev *pdev, unsigned int event_mask)
 	return 0;
 }
 
-static inline void ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf)
+static int ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf)
 {
 	struct ixgbe_hw *hw = &adapter->hw;
-	u32 reg;
+	unsigned char *vf_mac = adapter->vfinfo[vf].vf_mac_addresses;
+	u32 reg, msgbuf[4];
 	u32 reg_offset, vf_shift;
+	u8 *addr = (u8 *)(&msgbuf[1]);
+
+	e_info(probe, "VF Reset msg received from vf %d\n", vf);
+
+	/* reset the filters for the device */
+	ixgbe_vf_reset_event(adapter, vf);
+
+	/* set vf mac address */
+	ixgbe_set_vf_mac(adapter, vf, vf_mac);
 
 	vf_shift = vf % 32;
 	reg_offset = vf / 32;
 
-	/* enable transmit and receive for vf */
+	/* enable transmit for vf */
 	reg = IXGBE_READ_REG(hw, IXGBE_VFTE(reg_offset));
 	reg |= 1 << vf_shift;
 	IXGBE_WRITE_REG(hw, IXGBE_VFTE(reg_offset), reg);
 
+	/* enable receive for vf */
 	reg = IXGBE_READ_REG(hw, IXGBE_VFRE(reg_offset));
 	reg |= 1 << vf_shift;
 	/*
@@ -592,12 +606,115 @@ static inline void ixgbe_vf_reset_msg(struct ixgbe_adapter *adapter, u32 vf)
 	}
 	IXGBE_WRITE_REG(hw, IXGBE_VFRE(reg_offset), reg);
 
+	/* enable VF mailbox for further messages */
+	adapter->vfinfo[vf].clear_to_send = true;
+
 	/* Enable counting of spoofed packets in the SSVPC register */
 	reg = IXGBE_READ_REG(hw, IXGBE_VMECM(reg_offset));
 	reg |= (1 << vf_shift);
 	IXGBE_WRITE_REG(hw, IXGBE_VMECM(reg_offset), reg);
 
-	ixgbe_vf_reset_event(adapter, vf);
+	/* reply to reset with ack and vf mac address */
+	msgbuf[0] = IXGBE_VF_RESET | IXGBE_VT_MSGTYPE_ACK;
+	memcpy(addr, vf_mac, ETH_ALEN);
+
+	/*
+	 * Piggyback the multicast filter type so VF can compute the
+	 * correct vectors
+	 */
+	msgbuf[3] = hw->mac.mc_filter_type;
+	ixgbe_write_mbx(hw, msgbuf, IXGBE_VF_PERMADDR_MSG_LEN, vf);
+
+	return 0;
+}
+
+static int ixgbe_set_vf_mac_addr(struct ixgbe_adapter *adapter,
+				 u32 *msgbuf, u32 vf)
+{
+	u8 *new_mac = ((u8 *)(&msgbuf[1]));
+
+	if (!is_valid_ether_addr(new_mac)) {
+		e_warn(drv, "VF %d attempted to set invalid mac\n", vf);
+		return -1;
+	}
+
+	if (adapter->vfinfo[vf].pf_set_mac &&
+	    memcmp(adapter->vfinfo[vf].vf_mac_addresses, new_mac,
+		   ETH_ALEN)) {
+		e_warn(drv,
+		       "VF %d attempted to override administratively set MAC address\n"
+		       "Reload the VF driver to resume operations\n",
+		       vf);
+		return -1;
+	}
+
+	return ixgbe_set_vf_mac(adapter, vf, new_mac);
+}
+
+static int ixgbe_set_vf_vlan_msg(struct ixgbe_adapter *adapter,
+				 u32 *msgbuf, u32 vf)
+{
+	struct ixgbe_hw *hw = &adapter->hw;
+	int add = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK) >> IXGBE_VT_MSGINFO_SHIFT;
+	int vid = (msgbuf[1] & IXGBE_VLVF_VLANID_MASK);
+	int err;
+
+	if (adapter->vfinfo[vf].pf_vlan) {
+		e_warn(drv,
+		       "VF %d attempted to override administratively set VLAN configuration\n"
+		       "Reload the VF driver to resume operations\n",
+		       vf);
+		return -1;
+	}
+
+	if (add)
+		adapter->vfinfo[vf].vlan_count++;
+	else if (adapter->vfinfo[vf].vlan_count)
+		adapter->vfinfo[vf].vlan_count--;
+
+	err = ixgbe_set_vf_vlan(adapter, add, vid, vf);
+	if (!err && adapter->vfinfo[vf].spoofchk_enabled)
+		hw->mac.ops.set_vlan_anti_spoofing(hw, true, vf);
+
+	return err;
+}
+
+static int ixgbe_set_vf_macvlan_msg(struct ixgbe_adapter *adapter,
+				    u32 *msgbuf, u32 vf)
+{
+	u8 *new_mac = ((u8 *)(&msgbuf[1]));
+	int index = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK) >>
+		    IXGBE_VT_MSGINFO_SHIFT;
+	int err;
+
+	if (adapter->vfinfo[vf].pf_set_mac && index > 0) {
+		e_warn(drv,
+		       "VF %d requested MACVLAN filter but is administratively denied\n",
+		       vf);
+		return -1;
+	}
+
+	/* A non-zero index indicates the VF is setting a filter */
+	if (index) {
+		if (!is_valid_ether_addr(new_mac)) {
+			e_warn(drv, "VF %d attempted to set invalid mac\n", vf);
+			return -1;
+		}
+
+		/*
+		 * If the VF is allowed to set MAC filters then turn off
+		 * anti-spoofing to avoid false positives.
+		 */
+		if (adapter->vfinfo[vf].spoofchk_enabled)
+			ixgbe_ndo_set_vf_spoofchk(adapter->netdev, vf, false);
+	}
+
+	err = ixgbe_set_vf_macvlan(adapter, vf, index, new_mac);
+	if (err == -ENOSPC)
+		e_warn(drv,
+		       "VF %d has requested a MACVLAN filter but there is no space for it\n",
+		       vf);
+	return err;
 }
 
 static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
@@ -606,10 +723,6 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	u32 msgbuf[IXGBE_VFMAILBOX_SIZE];
 	struct ixgbe_hw *hw = &adapter->hw;
 	s32 retval;
-	int entries;
-	u16 *hash_list;
-	int add, vid, index;
-	u8 *new_mac;
 
 	retval = ixgbe_read_mbx(hw, msgbuf, mbx_size, vf);
 
@@ -630,33 +743,8 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	 * allowed to start any configuration.
 	 */
 
-	if (msgbuf[0] == IXGBE_VF_RESET) {
-		unsigned char *vf_mac = adapter->vfinfo[vf].vf_mac_addresses;
-		new_mac = (u8 *)(&msgbuf[1]);
-		e_info(probe, "VF Reset msg received from vf %d\n", vf);
-		adapter->vfinfo[vf].clear_to_send = false;
-		ixgbe_vf_reset_msg(adapter, vf);
-		adapter->vfinfo[vf].clear_to_send = true;
-
-		if (is_valid_ether_addr(new_mac) &&
-		    !adapter->vfinfo[vf].pf_set_mac)
-			ixgbe_set_vf_mac(adapter, vf, vf_mac);
-		else
-			ixgbe_set_vf_mac(adapter,
-				 vf, adapter->vfinfo[vf].vf_mac_addresses);
-
-		/* reply to reset with ack and vf mac address */
-		msgbuf[0] = IXGBE_VF_RESET | IXGBE_VT_MSGTYPE_ACK;
-		memcpy(new_mac, vf_mac, ETH_ALEN);
-		/*
-		 * Piggyback the multicast filter type so VF can compute the
-		 * correct vectors
-		 */
-		msgbuf[3] = hw->mac.mc_filter_type;
-		ixgbe_write_mbx(hw, msgbuf, IXGBE_VF_PERMADDR_MSG_LEN, vf);
-
-		return retval;
-	}
+	if (msgbuf[0] == IXGBE_VF_RESET)
+		return ixgbe_vf_reset_msg(adapter, vf);
 
 	if (!adapter->vfinfo[vf].clear_to_send) {
 		msgbuf[0] |= IXGBE_VT_MSGTYPE_NACK;
@@ -666,70 +754,19 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 
 	switch ((msgbuf[0] & 0xFFFF)) {
 	case IXGBE_VF_SET_MAC_ADDR:
-		new_mac = ((u8 *)(&msgbuf[1]));
-		if (is_valid_ether_addr(new_mac) &&
-		    !adapter->vfinfo[vf].pf_set_mac) {
-			ixgbe_set_vf_mac(adapter, vf, new_mac);
-		} else if (memcmp(adapter->vfinfo[vf].vf_mac_addresses,
-				  new_mac, ETH_ALEN)) {
-			e_warn(drv, "VF %d attempted to override "
-			       "administratively set MAC address\nReload "
-			       "the VF driver to resume operations\n", vf);
-			retval = -1;
-		}
+		retval = ixgbe_set_vf_mac_addr(adapter, msgbuf, vf);
 		break;
 	case IXGBE_VF_SET_MULTICAST:
-		entries = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK)
-		          >> IXGBE_VT_MSGINFO_SHIFT;
-		hash_list = (u16 *)&msgbuf[1];
-		retval = ixgbe_set_vf_multicasts(adapter, entries,
-		                                 hash_list, vf);
+		retval = ixgbe_set_vf_multicasts(adapter, msgbuf, vf);
+		break;
+	case IXGBE_VF_SET_VLAN:
+		retval = ixgbe_set_vf_vlan_msg(adapter, msgbuf, vf);
 		break;
 	case IXGBE_VF_SET_LPE:
 		retval = ixgbe_set_vf_lpe(adapter, msgbuf, vf);
 		break;
-	case IXGBE_VF_SET_VLAN:
-		add = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK)
-		      >> IXGBE_VT_MSGINFO_SHIFT;
-		vid = (msgbuf[1] & IXGBE_VLVF_VLANID_MASK);
-		if (adapter->vfinfo[vf].pf_vlan) {
-			e_warn(drv, "VF %d attempted to override "
-			       "administratively set VLAN configuration\n"
-			       "Reload the VF driver to resume operations\n",
-			       vf);
-			retval = -1;
-		} else {
-			if (add)
-				adapter->vfinfo[vf].vlan_count++;
-			else if (adapter->vfinfo[vf].vlan_count)
-				adapter->vfinfo[vf].vlan_count--;
-			retval = ixgbe_set_vf_vlan(adapter, add, vid, vf);
-			if (!retval && adapter->vfinfo[vf].spoofchk_enabled)
-				hw->mac.ops.set_vlan_anti_spoofing(hw, true, vf);
-		}
-		break;
 	case IXGBE_VF_SET_MACVLAN:
-		index = (msgbuf[0] & IXGBE_VT_MSGINFO_MASK) >>
-			IXGBE_VT_MSGINFO_SHIFT;
-		if (adapter->vfinfo[vf].pf_set_mac && index > 0) {
-			e_warn(drv, "VF %d requested MACVLAN filter but is "
-				    "administratively denied\n", vf);
-			retval = -1;
-			break;
-		}
-		/*
-		 * If the VF is allowed to set MAC filters then turn off
-		 * anti-spoofing to avoid false positives.  An index
-		 * greater than 0 will indicate the VF is setting a
-		 * macvlan MAC filter.
-		 */
-		if (index > 0 && adapter->vfinfo[vf].spoofchk_enabled)
-			ixgbe_ndo_set_vf_spoofchk(adapter->netdev, vf, false);
-		retval = ixgbe_set_vf_macvlan(adapter, vf, index,
-					      (unsigned char *)(&msgbuf[1]));
-		if (retval == -ENOSPC)
-			e_warn(drv, "VF %d has requested a MACVLAN filter "
-				    "but there is no space for it\n", vf);
+		retval = ixgbe_set_vf_macvlan_msg(adapter, msgbuf, vf);
 		break;
 	default:
 		e_err(drv, "Unhandled Msg %8.8x\n", msgbuf[0]);
-- 
cgit v1.2.3


From 374c65d6664a498c3997616161894bd9f55fab54 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Fri, 20 Jul 2012 08:09:22 +0000
Subject: ixgbe: Add mailbox API version negotiation support to ixgbe PF

This change allows us to add a mailbox versioning API.  This will allow us
to determine the features supported by the VFs from the PF.  For example we
will be implementing a version 1.1 API for the VF that will indicate that
it can support us enabling Jumbo frames as the VF will support buffer
chaining.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Phil Schmitt <phillip.j.schmitt@intel.com>
Tested-by: Robert Garrett <RobertX.Garrett@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h       |  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h   | 21 ++++++++++++++--
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 33 ++++++++++++++++++++++----
 3 files changed, 48 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 30efc9f0f47..ccb850500a9 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -135,6 +135,7 @@ struct vf_data_storage {
 	u16 tx_rate;
 	u16 vlan_count;
 	u8 spoofchk_enabled;
+	unsigned int vf_api;
 };
 
 struct vf_macvlans {
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
index 310bdd96107..d4c842e1db6 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
@@ -62,12 +62,29 @@
 /* bits 23:16 are used for exra info for certain messages */
 #define IXGBE_VT_MSGINFO_MASK     (0xFF << IXGBE_VT_MSGINFO_SHIFT)
 
+/* definitions to support mailbox API version negotiation */
+
+/*
+ * Each element denotes a version of the API; existing numbers may not
+ * change; any additions must go at the end
+ */
+enum ixgbe_pfvf_api_rev {
+	ixgbe_mbox_api_10,	/* API version 1.0, linux/freebsd VF driver */
+	ixgbe_mbox_api_20,	/* API version 2.0, solaris Phase1 VF driver */
+	/* This value should always be last */
+	ixgbe_mbox_api_unknown,	/* indicates that API version is not known */
+};
+
+/* mailbox API, legacy requests */
 #define IXGBE_VF_RESET            0x01 /* VF requests reset */
 #define IXGBE_VF_SET_MAC_ADDR     0x02 /* VF requests PF to set MAC addr */
 #define IXGBE_VF_SET_MULTICAST    0x03 /* VF requests PF to set MC addr */
 #define IXGBE_VF_SET_VLAN         0x04 /* VF requests PF to set VLAN */
-#define IXGBE_VF_SET_LPE          0x05 /* VF requests PF to set VMOLR.LPE */
-#define IXGBE_VF_SET_MACVLAN      0x06 /* VF requests PF for unicast filter */
+
+/* mailbox API, version 1.0 VF requests */
+#define IXGBE_VF_SET_LPE	0x05 /* VF requests PF to set VMOLR.LPE */
+#define IXGBE_VF_SET_MACVLAN	0x06 /* VF requests PF for unicast filter */
+#define IXGBE_VF_API_NEGOTIATE	0x08 /* negotiate API version */
 
 /* length of permanent address message returned from PF */
 #define IXGBE_VF_PERMADDR_MSG_LEN 4
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index d641c04cde5..f563625f1f4 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -469,6 +469,9 @@ static inline void ixgbe_vf_reset_event(struct ixgbe_adapter *adapter, u32 vf)
 	ixgbe_set_rx_mode(adapter->netdev);
 
 	hw->mac.ops.clear_rar(hw, rar_entry);
+
+	/* reset VF api back to the default, version 1.0 */
+	adapter->vfinfo[vf].vf_api = ixgbe_mbox_api_10;
 }
 
 static int ixgbe_set_vf_mac(struct ixgbe_adapter *adapter,
@@ -717,6 +720,24 @@ static int ixgbe_set_vf_macvlan_msg(struct ixgbe_adapter *adapter,
 	return err;
 }
 
+static int ixgbe_negotiate_vf_api(struct ixgbe_adapter *adapter,
+				  u32 *msgbuf, u32 vf)
+{
+	int api = msgbuf[1];
+
+	switch (api) {
+	case ixgbe_mbox_api_10:
+		adapter->vfinfo[vf].vf_api = api;
+		return 0;
+	default:
+		break;
+	}
+
+	e_info(drv, "VF %d requested invalid api version %u\n", vf, api);
+
+	return -1;
+}
+
 static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 {
 	u32 mbx_size = IXGBE_VFMAILBOX_SIZE;
@@ -738,14 +759,13 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	/* flush the ack before we write any messages back */
 	IXGBE_WRITE_FLUSH(hw);
 
+	if (msgbuf[0] == IXGBE_VF_RESET)
+		return ixgbe_vf_reset_msg(adapter, vf);
+
 	/*
 	 * until the vf completes a virtual function reset it should not be
 	 * allowed to start any configuration.
 	 */
-
-	if (msgbuf[0] == IXGBE_VF_RESET)
-		return ixgbe_vf_reset_msg(adapter, vf);
-
 	if (!adapter->vfinfo[vf].clear_to_send) {
 		msgbuf[0] |= IXGBE_VT_MSGTYPE_NACK;
 		ixgbe_write_mbx(hw, msgbuf, 1, vf);
@@ -768,6 +788,9 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 	case IXGBE_VF_SET_MACVLAN:
 		retval = ixgbe_set_vf_macvlan_msg(adapter, msgbuf, vf);
 		break;
+	case IXGBE_VF_API_NEGOTIATE:
+		retval = ixgbe_negotiate_vf_api(adapter, msgbuf, vf);
+		break;
 	default:
 		e_err(drv, "Unhandled Msg %8.8x\n", msgbuf[0]);
 		retval = IXGBE_ERR_MBX;
@@ -782,7 +805,7 @@ static int ixgbe_rcv_msg_from_vf(struct ixgbe_adapter *adapter, u32 vf)
 
 	msgbuf[0] |= IXGBE_VT_MSGTYPE_CTS;
 
-	ixgbe_write_mbx(hw, msgbuf, 1, vf);
+	ixgbe_write_mbx(hw, msgbuf, mbx_size, vf);
 
 	return retval;
 }
-- 
cgit v1.2.3


From ea5ceeabf5621ed36d24e6741b45cfd1e1fce11b Mon Sep 17 00:00:00 2001
From: Tushar Dave <tushar.n.dave@intel.com>
Date: Fri, 14 Sep 2012 03:43:43 +0000
Subject: igb: Correcting and improving small packet check and padding

The current implementation messes up the tail pointer.  This patch sets
skb->tail correctly.
Also, the small packet check and padding are optimized by using unlikely()
and calling skb_pad() directly.

Signed-off-by: Tushar Dave <tushar.n.dave@intel.com>
Tested-by: Jeff Pieper <jeffrey.e.pieper@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index e1ceb37ef12..c611cffa788 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4467,10 +4467,11 @@ static netdev_tx_t igb_xmit_frame(struct sk_buff *skb,
 	 * The minimum packet size with TCTL.PSP set is 17 so pad the skb
 	 * in order to meet this minimum size requirement.
 	 */
-	if (skb->len < 17) {
-		if (skb_padto(skb, 17))
+	if (unlikely(skb->len < 17)) {
+		if (skb_pad(skb, 17 - skb->len))
 			return NETDEV_TX_OK;
 		skb->len = 17;
+		skb_set_tail_pointer(skb, 17);
 	}
 
 	return igb_xmit_frame_ring(skb, igb_tx_queue_mapping(adapter, skb));
-- 
cgit v1.2.3


From b534550a17cda69a1d62acc18fff33370b5eee5b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 05:14:55 +0000
Subject: igb: Split Rx timestamping into two separate functions

In order to support page based receive we will need to split up the two
different types of timestamping into two separate functions.  The first one
will handle legacy timestamps with the value in the register, and the new
one will handle timestamps in the Rx buffer itself.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Acked-by: Matthew Vick <matthew.vick@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h     | 17 ++++++++--
 drivers/net/ethernet/intel/igb/igb_ptp.c | 55 ++++++++++++++++++++++----------
 2 files changed, 53 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 8aad230c059..f6a1cd9d72a 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -442,9 +442,22 @@ extern void igb_ptp_stop(struct igb_adapter *adapter);
 extern void igb_ptp_reset(struct igb_adapter *adapter);
 extern void igb_ptp_tx_work(struct work_struct *work);
 extern void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter);
-extern void igb_ptp_rx_hwtstamp(struct igb_q_vector *q_vector,
-				union e1000_adv_rx_desc *rx_desc,
+extern void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
 				struct sk_buff *skb);
+extern void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector,
+				unsigned char *va,
+				struct sk_buff *skb);
+static inline void igb_ptp_rx_hwtstamp(struct igb_q_vector *q_vector,
+				       union e1000_adv_rx_desc *rx_desc,
+				       struct sk_buff *skb)
+{
+	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
+		igb_ptp_rx_pktstamp(q_vector, skb->data, skb);
+		skb_pull(skb, IGB_TS_HDR_LEN);
+	} else if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TS)) {
+		igb_ptp_rx_rgtstamp(q_vector, skb);
+	}
+}
 extern int igb_ptp_hwtstamp_ioctl(struct net_device *netdev,
 				  struct ifreq *ifr, int cmd);
 #endif /* CONFIG_IGB_PTP */
diff --git a/drivers/net/ethernet/intel/igb/igb_ptp.c b/drivers/net/ethernet/intel/igb/igb_ptp.c
index ee21445157a..a7db4cec7a1 100644
--- a/drivers/net/ethernet/intel/igb/igb_ptp.c
+++ b/drivers/net/ethernet/intel/igb/igb_ptp.c
@@ -441,18 +441,46 @@ void igb_ptp_tx_hwtstamp(struct igb_adapter *adapter)
 	adapter->ptp_tx_skb = NULL;
 }
 
-void igb_ptp_rx_hwtstamp(struct igb_q_vector *q_vector,
-			 union e1000_adv_rx_desc *rx_desc,
+/**
+ * igb_ptp_rx_pktstamp - retrieve Rx per packet timestamp
+ * @q_vector: Pointer to interrupt specific structure
+ * @va: Pointer to address containing Rx buffer
+ * @skb: Buffer containing timestamp and packet
+ *
+ * This function is meant to retrieve a timestamp from the first buffer of an
+ * incoming frame.  The value is stored in little endian format starting on
+ * byte 8.
+ */
+void igb_ptp_rx_pktstamp(struct igb_q_vector *q_vector,
+			 unsigned char *va,
+			 struct sk_buff *skb)
+{
+	u64 *regval = (u64 *)va;
+
+	/*
+	 * The timestamp is recorded in little endian format.
+	 * DWORD: 0        1        2        3
+	 * Field: Reserved Reserved SYSTIML  SYSTIMH
+	 */
+	igb_ptp_systim_to_hwtstamp(q_vector->adapter, skb_hwtstamps(skb),
+				   le64_to_cpu(regval[1]));
+}
+
+/**
+ * igb_ptp_rx_rgtstamp - retrieve Rx timestamp stored in register
+ * @q_vector: Pointer to interrupt specific structure
+ * @skb: Buffer containing timestamp and packet
+ *
+ * This function is meant to retrieve a timestamp from the internal registers
+ * of the adapter and store it in the skb.
+ */
+void igb_ptp_rx_rgtstamp(struct igb_q_vector *q_vector,
 			 struct sk_buff *skb)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
 	struct e1000_hw *hw = &adapter->hw;
 	u64 regval;
 
-	if (!igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP |
-				       E1000_RXDADV_STAT_TS))
-		return;
-
 	/*
 	 * If this bit is set, then the RX registers contain the time stamp. No
 	 * other packet will be time stamped until we read these registers, so
@@ -464,18 +492,11 @@ void igb_ptp_rx_hwtstamp(struct igb_q_vector *q_vector,
 	 * If nothing went wrong, then it should have a shared tx_flags that we
 	 * can turn into a skb_shared_hwtstamps.
 	 */
-	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
-		u32 *stamp = (u32 *)skb->data;
-		regval = le32_to_cpu(*(stamp + 2));
-		regval |= (u64)le32_to_cpu(*(stamp + 3)) << 32;
-		skb_pull(skb, IGB_TS_HDR_LEN);
-	} else {
-		if (!(rd32(E1000_TSYNCRXCTL) & E1000_TSYNCRXCTL_VALID))
-			return;
+	if (!(rd32(E1000_TSYNCRXCTL) & E1000_TSYNCRXCTL_VALID))
+		return;
 
-		regval = rd32(E1000_RXSTMPL);
-		regval |= (u64)rd32(E1000_RXSTMPH) << 32;
-	}
+	regval = rd32(E1000_RXSTMPL);
+	regval |= (u64)rd32(E1000_RXSTMPH) << 32;
 
 	igb_ptp_systim_to_hwtstamp(adapter, skb_hwtstamps(skb), regval);
 }
-- 
cgit v1.2.3


From 1a1c225b9463038ac68b369ef05e4ee7fd9c82a5 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:30:52 +0000
Subject: igb: Do not use header split, instead receive all frames into a
 single buffer

This change makes it so that we no longer use header split.  The idea is to
reduce partial cache line writes by hardware when handling frames larger
than the header size.  We can compensate for the extra overhead of having to
memcpy the header buffer by avoiding the cache misses seen by leaving a
full skb allocated and sitting on the ring.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |  13 +-
 drivers/net/ethernet/intel/igb/igb_ethtool.c |  31 +-
 drivers/net/ethernet/intel/igb/igb_main.c    | 420 ++++++++++++++++++---------
 3 files changed, 312 insertions(+), 152 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index f6a1cd9d72a..72ab9ac34a3 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -174,11 +174,9 @@ struct igb_tx_buffer {
 };
 
 struct igb_rx_buffer {
-	struct sk_buff *skb;
 	dma_addr_t dma;
 	struct page *page;
-	dma_addr_t page_dma;
-	u32 page_offset;
+	unsigned int page_offset;
 };
 
 struct igb_tx_queue_stats {
@@ -251,6 +249,7 @@ struct igb_ring {
 		};
 		/* RX */
 		struct {
+			struct sk_buff *skb;
 			struct igb_rx_queue_stats rx_stats;
 			struct u64_stats_sync rx_syncp;
 		};
@@ -451,13 +450,11 @@ static inline void igb_ptp_rx_hwtstamp(struct igb_q_vector *q_vector,
 				       union e1000_adv_rx_desc *rx_desc,
 				       struct sk_buff *skb)
 {
-	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
-		igb_ptp_rx_pktstamp(q_vector, skb->data, skb);
-		skb_pull(skb, IGB_TS_HDR_LEN);
-	} else if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TS)) {
+	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TS) &&
+	    !igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP))
 		igb_ptp_rx_rgtstamp(q_vector, skb);
-	}
 }
+
 extern int igb_ptp_hwtstamp_ioctl(struct net_device *netdev,
 				  struct ifreq *ifr, int cmd);
 #endif /* CONFIG_IGB_PTP */
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 2ea01284982..0faac423bd5 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -37,6 +37,7 @@
 #include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/pm_runtime.h>
+#include <linux/highmem.h>
 
 #include "igb.h"
 
@@ -1685,16 +1686,24 @@ static void igb_create_lbtest_frame(struct sk_buff *skb,
 	memset(&skb->data[frame_size + 12], 0xAF, 1);
 }
 
-static int igb_check_lbtest_frame(struct sk_buff *skb, unsigned int frame_size)
+static int igb_check_lbtest_frame(struct igb_rx_buffer *rx_buffer,
+				  unsigned int frame_size)
 {
-	frame_size /= 2;
-	if (*(skb->data + 3) == 0xFF) {
-		if ((*(skb->data + frame_size + 10) == 0xBE) &&
-		   (*(skb->data + frame_size + 12) == 0xAF)) {
-			return 0;
-		}
-	}
-	return 13;
+	unsigned char *data;
+	bool match = true;
+
+	frame_size >>= 1;
+
+	data = kmap(rx_buffer->page) + rx_buffer->page_offset;
+
+	if (data[3] != 0xFF ||
+	    data[frame_size + 10] != 0xBE ||
+	    data[frame_size + 12] != 0xAF)
+		match = false;
+
+	kunmap(rx_buffer->page);
+
+	return match;
 }
 
 static int igb_clean_test_rings(struct igb_ring *rx_ring,
@@ -1720,12 +1729,12 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 		/* unmap rx buffer, will be remapped by alloc_rx_buffers */
 		dma_unmap_single(rx_ring->dev,
 				 rx_buffer_info->dma,
-				 IGB_RX_HDR_LEN,
+				 PAGE_SIZE / 2,
 				 DMA_FROM_DEVICE);
 		rx_buffer_info->dma = 0;
 
 		/* verify contents of skb */
-		if (!igb_check_lbtest_frame(rx_buffer_info->skb, size))
+		if (igb_check_lbtest_frame(rx_buffer_info, size))
 			count++;
 
 		/* unmap buffer on tx side */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index c611cffa788..665eafa401d 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -534,25 +534,21 @@ rx_ring_summary:
 
 			if (staterr & E1000_RXD_STAT_DD) {
 				/* Descriptor Done */
-				pr_info("%s[0x%03X]     %016llX %016llX -------"
-					"--------- %p%s\n", "RWB", i,
+				pr_info("%s[0x%03X]     %016llX %016llX ---------------- %s\n",
+					"RWB", i,
 					le64_to_cpu(u0->a),
 					le64_to_cpu(u0->b),
-					buffer_info->skb, next_desc);
+					next_desc);
 			} else {
-				pr_info("%s[0x%03X]     %016llX %016llX %016llX"
-					" %p%s\n", "R  ", i,
+				pr_info("%s[0x%03X]     %016llX %016llX %016llX %s\n",
+					"R  ", i,
 					le64_to_cpu(u0->a),
 					le64_to_cpu(u0->b),
 					(u64)buffer_info->dma,
-					buffer_info->skb, next_desc);
+					next_desc);
 
 				if (netif_msg_pktdata(adapter) &&
-				    buffer_info->dma && buffer_info->skb) {
-					print_hex_dump(KERN_INFO, "",
-						  DUMP_PREFIX_ADDRESS,
-						  16, 1, buffer_info->skb->data,
-						  IGB_RX_HDR_LEN, true);
+				    buffer_info->dma && buffer_info->page) {
 					print_hex_dump(KERN_INFO, "",
 					  DUMP_PREFIX_ADDRESS,
 					  16, 1,
@@ -3111,7 +3107,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
 #else
 	srrctl |= (PAGE_SIZE / 2) >> E1000_SRRCTL_BSIZEPKT_SHIFT;
 #endif
-	srrctl |= E1000_SRRCTL_DESCTYPE_HDR_SPLIT_ALWAYS;
+	srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
 #ifdef CONFIG_IGB_PTP
 	if (hw->mac.type >= e1000_82580)
 		srrctl |= E1000_SRRCTL_TIMESTAMP;
@@ -3305,36 +3301,27 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	unsigned long size;
 	u16 i;
 
+	if (rx_ring->skb)
+		dev_kfree_skb(rx_ring->skb);
+	rx_ring->skb = NULL;
+
 	if (!rx_ring->rx_buffer_info)
 		return;
 
 	/* Free all the Rx ring sk_buffs */
 	for (i = 0; i < rx_ring->count; i++) {
 		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
-		if (buffer_info->dma) {
-			dma_unmap_single(rx_ring->dev,
-			                 buffer_info->dma,
-					 IGB_RX_HDR_LEN,
-					 DMA_FROM_DEVICE);
-			buffer_info->dma = 0;
-		}
 
-		if (buffer_info->skb) {
-			dev_kfree_skb(buffer_info->skb);
-			buffer_info->skb = NULL;
-		}
-		if (buffer_info->page_dma) {
+		if (buffer_info->dma)
 			dma_unmap_page(rx_ring->dev,
-			               buffer_info->page_dma,
+				       buffer_info->dma,
 				       PAGE_SIZE / 2,
 				       DMA_FROM_DEVICE);
-			buffer_info->page_dma = 0;
-		}
-		if (buffer_info->page) {
-			put_page(buffer_info->page);
-			buffer_info->page = NULL;
-			buffer_info->page_offset = 0;
-		}
+		buffer_info->dma = 0;
+		if (buffer_info->page)
+			__free_page(buffer_info->page);
+		buffer_info->page = NULL;
+		buffer_info->page_offset = 0;
 	}
 
 	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
@@ -5906,23 +5893,219 @@ static void igb_rx_vlan(struct igb_ring *ring,
 	}
 }
 
-static inline u16 igb_get_hlen(union e1000_adv_rx_desc *rx_desc)
+/**
+ * igb_get_headlen - determine size of header for LRO/GRO
+ * @data: pointer to the start of the headers
+ * @max_len: total length of section to find headers in
+ *
+ * This function is meant to determine the length of headers that will
+ * be recognized by hardware for LRO, and GRO offloads.  The main
+ * motivation of doing this is to only perform one pull for IPv4 TCP
+ * packets so that we can do basic things like calculating the gso_size
+ * based on the average data per packet.
+ **/
+static unsigned int igb_get_headlen(unsigned char *data,
+				    unsigned int max_len)
+{
+	union {
+		unsigned char *network;
+		/* l2 headers */
+		struct ethhdr *eth;
+		struct vlan_hdr *vlan;
+		/* l3 headers */
+		struct iphdr *ipv4;
+		struct ipv6hdr *ipv6;
+	} hdr;
+	__be16 protocol;
+	u8 nexthdr = 0;	/* default to not TCP */
+	u8 hlen;
+
+	/* this should never happen, but better safe than sorry */
+	if (max_len < ETH_HLEN)
+		return max_len;
+
+	/* initialize network frame pointer */
+	hdr.network = data;
+
+	/* set first protocol and move network header forward */
+	protocol = hdr.eth->h_proto;
+	hdr.network += ETH_HLEN;
+
+	/* handle any vlan tag if present */
+	if (protocol == __constant_htons(ETH_P_8021Q)) {
+		if ((hdr.network - data) > (max_len - VLAN_HLEN))
+			return max_len;
+
+		protocol = hdr.vlan->h_vlan_encapsulated_proto;
+		hdr.network += VLAN_HLEN;
+	}
+
+	/* handle L3 protocols */
+	if (protocol == __constant_htons(ETH_P_IP)) {
+		if ((hdr.network - data) > (max_len - sizeof(struct iphdr)))
+			return max_len;
+
+		/* access ihl as a u8 to avoid unaligned access on ia64 */
+		hlen = (hdr.network[0] & 0x0F) << 2;
+
+		/* verify hlen meets minimum size requirements */
+		if (hlen < sizeof(struct iphdr))
+			return hdr.network - data;
+
+		/* record next protocol */
+		nexthdr = hdr.ipv4->protocol;
+		hdr.network += hlen;
+	} else if (protocol == __constant_htons(ETH_P_IPV6)) {
+		if ((hdr.network - data) > (max_len - sizeof(struct ipv6hdr)))
+			return max_len;
+
+		/* record next protocol */
+		nexthdr = hdr.ipv6->nexthdr;
+		hdr.network += sizeof(struct ipv6hdr);
+	} else {
+		return hdr.network - data;
+	}
+
+	/* finally sort out TCP */
+	if (nexthdr == IPPROTO_TCP) {
+		if ((hdr.network - data) > (max_len - sizeof(struct tcphdr)))
+			return max_len;
+
+		/* access doff as a u8 to avoid unaligned access on ia64 */
+		hlen = (hdr.network[12] & 0xF0) >> 2;
+
+		/* verify hlen meets minimum size requirements */
+		if (hlen < sizeof(struct tcphdr))
+			return hdr.network - data;
+
+		hdr.network += hlen;
+	} else if (nexthdr == IPPROTO_UDP) {
+		if ((hdr.network - data) > (max_len - sizeof(struct udphdr)))
+			return max_len;
+
+		hdr.network += sizeof(struct udphdr);
+	}
+
+	/*
+	 * If everything has gone correctly hdr.network should be the
+	 * data section of the packet and will be the end of the header.
+	 * If not then it probably represents the end of the last recognized
+	 * header.
+	 */
+	if ((hdr.network - data) < max_len)
+		return hdr.network - data;
+	else
+		return max_len;
+}
+
+/**
+ * igb_pull_tail - igb specific version of skb_pull_tail
+ * @rx_ring: rx descriptor ring packet is being transacted on
+ * @skb: pointer to current skb being adjusted
+ *
+ * This function is an igb specific version of __pskb_pull_tail.  The
+ * main difference between this version and the original function is that
+ * this function can make several assumptions about the state of things
+ * that allow for significant optimizations versus the standard function.
+ * As a result we can do things like drop a frag and maintain an accurate
+ * truesize for the skb.
+ */
+static void igb_pull_tail(struct igb_ring *rx_ring,
+			  union e1000_adv_rx_desc *rx_desc,
+			  struct sk_buff *skb)
 {
-	/* HW will not DMA in data larger than the given buffer, even if it
-	 * parses the (NFS, of course) header to be larger.  In that case, it
-	 * fills the header buffer and spills the rest into the page.
+	struct skb_frag_struct *frag = &skb_shinfo(skb)->frags[0];
+	unsigned char *va;
+	unsigned int pull_len;
+
+	/*
+	 * it is valid to use page_address instead of kmap since we are
+	 * working with pages allocated out of the lomem pool per
+	 * alloc_page(GFP_ATOMIC)
 	 */
-	u16 hlen = (le16_to_cpu(rx_desc->wb.lower.lo_dword.hdr_info) &
-	           E1000_RXDADV_HDRBUFLEN_MASK) >> E1000_RXDADV_HDRBUFLEN_SHIFT;
-	if (hlen > IGB_RX_HDR_LEN)
-		hlen = IGB_RX_HDR_LEN;
-	return hlen;
+	va = skb_frag_address(frag);
+
+#ifdef CONFIG_IGB_PTP
+	if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
+		/* retrieve timestamp from buffer */
+		igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb);
+
+		/* update pointers to remove timestamp header */
+		skb_frag_size_sub(frag, IGB_TS_HDR_LEN);
+		frag->page_offset += IGB_TS_HDR_LEN;
+		skb->data_len -= IGB_TS_HDR_LEN;
+		skb->len -= IGB_TS_HDR_LEN;
+
+		/* move va to start of packet data */
+		va += IGB_TS_HDR_LEN;
+	}
+
+#endif
+	/*
+	 * we need the header to contain the greater of either ETH_HLEN or
+	 * 60 bytes if the skb->len is less than 60 for skb_pad.
+	 */
+	pull_len = igb_get_headlen(va, IGB_RX_HDR_LEN);
+
+	/* align pull length to size of long to optimize memcpy performance */
+	skb_copy_to_linear_data(skb, va, ALIGN(pull_len, sizeof(long)));
+
+	/* update all of the pointers */
+	skb_frag_size_sub(frag, pull_len);
+	frag->page_offset += pull_len;
+	skb->data_len -= pull_len;
+	skb->tail += pull_len;
+}
+
+/**
+ * igb_cleanup_headers - Correct corrupted or empty headers
+ * @rx_ring: rx descriptor ring packet is being transacted on
+ * @rx_desc: pointer to the EOP Rx descriptor
+ * @skb: pointer to current skb being fixed
+ *
+ * Address the case where we are pulling data in on pages only
+ * and as such no data is present in the skb header.
+ *
+ * In addition if skb is not at least 60 bytes we need to pad it so that
+ * it is large enough to qualify as a valid Ethernet frame.
+ *
+ * Returns true if an error was encountered and skb was freed.
+ **/
+static bool igb_cleanup_headers(struct igb_ring *rx_ring,
+				union e1000_adv_rx_desc *rx_desc,
+				struct sk_buff *skb)
+{
+
+	if (unlikely((igb_test_staterr(rx_desc,
+				       E1000_RXDEXT_ERR_FRAME_ERR_MASK)))) {
+		struct net_device *netdev = rx_ring->netdev;
+		if (!(netdev->features & NETIF_F_RXALL)) {
+			dev_kfree_skb_any(skb);
+			return true;
+		}
+	}
+
+	/* place header in linear portion of buffer */
+	if (skb_is_nonlinear(skb))
+		igb_pull_tail(rx_ring, rx_desc, skb);
+
+	/* if skb_pad returns an error the skb was freed */
+	if (unlikely(skb->len < 60)) {
+		int pad_len = 60 - skb->len;
+
+		if (skb_pad(skb, pad_len))
+			return true;
+		__skb_put(skb, pad_len);
+	}
+
+	return false;
 }
 
 static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 {
 	struct igb_ring *rx_ring = q_vector->rx.ring;
 	union e1000_adv_rx_desc *rx_desc;
+	struct sk_buff *skb = rx_ring->skb;
 	const int current_node = numa_node_id();
 	unsigned int total_bytes = 0, total_packets = 0;
 	u16 cleaned_count = igb_desc_unused(rx_ring);
@@ -5932,12 +6115,9 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 
 	while (igb_test_staterr(rx_desc, E1000_RXD_STAT_DD)) {
 		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
-		struct sk_buff *skb = buffer_info->skb;
+		struct page *page;
 		union e1000_adv_rx_desc *next_rxd;
 
-		buffer_info->skb = NULL;
-		prefetch(skb->data);
-
 		i++;
 		if (i == rx_ring->count)
 			i = 0;
@@ -5952,52 +6132,57 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 		 */
 		rmb();
 
-		if (!skb_is_nonlinear(skb)) {
-			__skb_put(skb, igb_get_hlen(rx_desc));
-			dma_unmap_single(rx_ring->dev, buffer_info->dma,
-					 IGB_RX_HDR_LEN,
-					 DMA_FROM_DEVICE);
-			buffer_info->dma = 0;
-		}
-
-		if (rx_desc->wb.upper.length) {
-			u16 length = le16_to_cpu(rx_desc->wb.upper.length);
+		page = buffer_info->page;
+		prefetchw(page);
 
-			skb_fill_page_desc(skb, skb_shinfo(skb)->nr_frags,
-						buffer_info->page,
-						buffer_info->page_offset,
-						length);
+		if (likely(!skb)) {
+			void *page_addr = page_address(page) +
+					  buffer_info->page_offset;
 
-			skb->len += length;
-			skb->data_len += length;
-			skb->truesize += PAGE_SIZE / 2;
+			/* prefetch first cache line of first page */
+			prefetch(page_addr);
+#if L1_CACHE_BYTES < 128
+			prefetch(page_addr + L1_CACHE_BYTES);
+#endif
 
-			if ((page_count(buffer_info->page) != 1) ||
-			    (page_to_nid(buffer_info->page) != current_node))
-				buffer_info->page = NULL;
-			else
-				get_page(buffer_info->page);
+			/* allocate a skb to store the frags */
+			skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
+							IGB_RX_HDR_LEN);
+			if (unlikely(!skb)) {
+				rx_ring->rx_stats.alloc_failed++;
+				break;
+			}
 
-			dma_unmap_page(rx_ring->dev, buffer_info->page_dma,
-				       PAGE_SIZE / 2, DMA_FROM_DEVICE);
-			buffer_info->page_dma = 0;
+			/*
+			 * we will be copying header into skb->data in
+			 * pskb_may_pull so it is in our interest to prefetch
+			 * it now to avoid a possible cache miss
+			 */
+			prefetchw(skb->data);
 		}
 
-		if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP)) {
-			struct igb_rx_buffer *next_buffer;
-			next_buffer = &rx_ring->rx_buffer_info[i];
-			buffer_info->skb = next_buffer->skb;
-			buffer_info->dma = next_buffer->dma;
-			next_buffer->skb = skb;
-			next_buffer->dma = 0;
-			goto next_desc;
-		}
+		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
+				buffer_info->page_offset,
+				le16_to_cpu(rx_desc->wb.upper.length),
+				PAGE_SIZE / 2);
 
-		if (unlikely((igb_test_staterr(rx_desc,
-					       E1000_RXDEXT_ERR_FRAME_ERR_MASK))
-			     && !(rx_ring->netdev->features & NETIF_F_RXALL))) {
-			dev_kfree_skb_any(skb);
+		if ((page_count(buffer_info->page) != 1) ||
+		    (page_to_nid(buffer_info->page) != current_node))
+			buffer_info->page = NULL;
+		else
+			get_page(buffer_info->page);
+
+		dma_unmap_page(rx_ring->dev, buffer_info->dma,
+			       PAGE_SIZE / 2, DMA_FROM_DEVICE);
+		buffer_info->dma = 0;
+
+		if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP))
 			goto next_desc;
+
+		/* verify the packet layout is correct */
+		if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
+			skb = NULL;
+			continue;
 		}
 
 #ifdef CONFIG_IGB_PTP
@@ -6010,10 +6195,14 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 		total_bytes += skb->len;
 		total_packets++;
 
+		skb_record_rx_queue(skb, rx_ring->queue_index);
 		skb->protocol = eth_type_trans(skb, rx_ring->netdev);
 
 		napi_gro_receive(&q_vector->napi, skb);
 
+		/* reset skb pointer */
+		skb = NULL;
+
 		budget--;
 next_desc:
 		if (!budget)
@@ -6030,6 +6219,9 @@ next_desc:
 		rx_desc = next_rxd;
 	}
 
+	/* place incomplete frames back on ring for completion */
+	rx_ring->skb = skb;
+
 	rx_ring->next_to_clean = i;
 	u64_stats_update_begin(&rx_ring->rx_syncp);
 	rx_ring->rx_stats.packets += total_packets;
@@ -6044,70 +6236,37 @@ next_desc:
 	return !!budget;
 }
 
-static bool igb_alloc_mapped_skb(struct igb_ring *rx_ring,
-				 struct igb_rx_buffer *bi)
-{
-	struct sk_buff *skb = bi->skb;
-	dma_addr_t dma = bi->dma;
-
-	if (dma)
-		return true;
-
-	if (likely(!skb)) {
-		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
-						IGB_RX_HDR_LEN);
-		bi->skb = skb;
-		if (!skb) {
-			rx_ring->rx_stats.alloc_failed++;
-			return false;
-		}
-
-		/* initialize skb for ring */
-		skb_record_rx_queue(skb, rx_ring->queue_index);
-	}
-
-	dma = dma_map_single(rx_ring->dev, skb->data,
-			     IGB_RX_HDR_LEN, DMA_FROM_DEVICE);
-
-	if (dma_mapping_error(rx_ring->dev, dma)) {
-		rx_ring->rx_stats.alloc_failed++;
-		return false;
-	}
-
-	bi->dma = dma;
-	return true;
-}
-
 static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 				  struct igb_rx_buffer *bi)
 {
 	struct page *page = bi->page;
-	dma_addr_t page_dma = bi->page_dma;
+	dma_addr_t dma = bi->dma;
 	unsigned int page_offset = bi->page_offset ^ (PAGE_SIZE / 2);
 
-	if (page_dma)
+	if (dma)
 		return true;
 
 	if (!page) {
-		page = __skb_alloc_page(GFP_ATOMIC, bi->skb);
-		bi->page = page;
+		page = __skb_alloc_page(GFP_ATOMIC | __GFP_COLD, NULL);
 		if (unlikely(!page)) {
 			rx_ring->rx_stats.alloc_failed++;
 			return false;
 		}
+		bi->page = page;
 	}
 
-	page_dma = dma_map_page(rx_ring->dev, page,
-				page_offset, PAGE_SIZE / 2,
-				DMA_FROM_DEVICE);
+	dma = dma_map_page(rx_ring->dev, page,
+			   page_offset, PAGE_SIZE / 2,
+			   DMA_FROM_DEVICE);
 
-	if (dma_mapping_error(rx_ring->dev, page_dma)) {
+	if (dma_mapping_error(rx_ring->dev, dma)) {
 		rx_ring->rx_stats.alloc_failed++;
 		return false;
 	}
 
-	bi->page_dma = page_dma;
+	bi->dma = dma;
 	bi->page_offset = page_offset;
+
 	return true;
 }
 
@@ -6126,17 +6285,12 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 	i -= rx_ring->count;
 
 	while (cleaned_count--) {
-		if (!igb_alloc_mapped_skb(rx_ring, bi))
+		if (!igb_alloc_mapped_page(rx_ring, bi))
 			break;
 
 		/* Refresh the desc even if buffer_addrs didn't change
 		 * because each write-back erases this info. */
-		rx_desc->read.hdr_addr = cpu_to_le64(bi->dma);
-
-		if (!igb_alloc_mapped_page(rx_ring, bi))
-			break;
-
-		rx_desc->read.pkt_addr = cpu_to_le64(bi->page_dma);
+		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
 
 		rx_desc++;
 		bi++;
-- 
cgit v1.2.3


From db2ee5bdf5c83320fa19f73a38204585f1518798 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:30:57 +0000
Subject: igb: Combine post-processing of skb into a single function

This change is meant to just clean up a number of function calls that were
made at the end of the Rx clean-up path by combining them into a single
function call.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 69 ++++++++++++++++++++-----------
 1 file changed, 44 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 665eafa401d..9e38f147d10 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -5877,22 +5877,6 @@ static inline void igb_rx_hash(struct igb_ring *ring,
 		skb->rxhash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
 }
 
-static void igb_rx_vlan(struct igb_ring *ring,
-			union e1000_adv_rx_desc *rx_desc,
-			struct sk_buff *skb)
-{
-	if (igb_test_staterr(rx_desc, E1000_RXD_STAT_VP)) {
-		u16 vid;
-		if (igb_test_staterr(rx_desc, E1000_RXDEXT_STATERR_LB) &&
-		    test_bit(IGB_RING_FLAG_RX_LB_VLAN_BSWAP, &ring->flags))
-			vid = be16_to_cpu(rx_desc->wb.upper.vlan);
-		else
-			vid = le16_to_cpu(rx_desc->wb.upper.vlan);
-
-		__vlan_hwaccel_put_tag(skb, vid);
-	}
-}
-
 /**
  * igb_get_headlen - determine size of header for LRO/GRO
  * @data: pointer to the start of the headers
@@ -6101,6 +6085,47 @@ static bool igb_cleanup_headers(struct igb_ring *rx_ring,
 	return false;
 }
 
+/**
+ * igb_process_skb_fields - Populate skb header fields from Rx descriptor
+ * @rx_ring: rx descriptor ring packet is being transacted on
+ * @rx_desc: pointer to the EOP Rx descriptor
+ * @skb: pointer to current skb being populated
+ *
+ * This function checks the ring, descriptor, and packet information in
+ * order to populate the hash, checksum, VLAN, timestamp, protocol, and
+ * other fields within the skb.
+ **/
+static void igb_process_skb_fields(struct igb_ring *rx_ring,
+				   union e1000_adv_rx_desc *rx_desc,
+				   struct sk_buff *skb)
+{
+	struct net_device *dev = rx_ring->netdev;
+
+	igb_rx_hash(rx_ring, rx_desc, skb);
+
+	igb_rx_checksum(rx_ring, rx_desc, skb);
+
+#ifdef CONFIG_IGB_PTP
+	igb_ptp_rx_hwtstamp(rx_ring->q_vector, rx_desc, skb);
+#endif /* CONFIG_IGB_PTP */
+
+	if ((dev->features & NETIF_F_HW_VLAN_RX) &&
+	    igb_test_staterr(rx_desc, E1000_RXD_STAT_VP)) {
+		u16 vid;
+		if (igb_test_staterr(rx_desc, E1000_RXDEXT_STATERR_LB) &&
+		    test_bit(IGB_RING_FLAG_RX_LB_VLAN_BSWAP, &rx_ring->flags))
+			vid = be16_to_cpu(rx_desc->wb.upper.vlan);
+		else
+			vid = le16_to_cpu(rx_desc->wb.upper.vlan);
+
+		__vlan_hwaccel_put_tag(skb, vid);
+	}
+
+	skb_record_rx_queue(skb, rx_ring->queue_index);
+
+	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
+}
+
 static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 {
 	struct igb_ring *rx_ring = q_vector->rx.ring;
@@ -6185,18 +6210,12 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 			continue;
 		}
 
-#ifdef CONFIG_IGB_PTP
-		igb_ptp_rx_hwtstamp(q_vector, rx_desc, skb);
-#endif /* CONFIG_IGB_PTP */
-		igb_rx_hash(rx_ring, rx_desc, skb);
-		igb_rx_checksum(rx_ring, rx_desc, skb);
-		igb_rx_vlan(rx_ring, rx_desc, skb);
-
+		/* probably a little skewed due to removing CRC */
 		total_bytes += skb->len;
 		total_packets++;
 
-		skb_record_rx_queue(skb, rx_ring->queue_index);
-		skb->protocol = eth_type_trans(skb, rx_ring->netdev);
+		/* populate checksum, timestamp, VLAN, and protocol */
+		igb_process_skb_fields(rx_ring, rx_desc, skb);
 
 		napi_gro_receive(&q_vector->napi, skb);
 
-- 
cgit v1.2.3


From cbc8e55f6fdae27b667051b36040f66768ef79f2 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:31:02 +0000
Subject: igb: Map entire page and sync half instead of mapping and unmapping
 half pages

This change makes it so that we map the entire page and just sync half of
it for the device at a time.  The advantage to this approach is that we can
avoid the locking on map/unmap seen in many IOMMU implementations.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |   1 +
 drivers/net/ethernet/intel/igb/igb_ethtool.c |  26 ++--
 drivers/net/ethernet/intel/igb/igb_main.c    | 211 +++++++++++++++++++++------
 3 files changed, 181 insertions(+), 57 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 72ab9ac34a3..1d15bb0b1e9 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -239,6 +239,7 @@ struct igb_ring {
 	/* everything past this point are written often */
 	u16 next_to_clean ____cacheline_aligned_in_smp;
 	u16 next_to_use;
+	u16 next_to_alloc;
 
 	union {
 		/* TX */
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 0faac423bd5..96c6df65726 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1694,7 +1694,7 @@ static int igb_check_lbtest_frame(struct igb_rx_buffer *rx_buffer,
 
 	frame_size >>= 1;
 
-	data = kmap(rx_buffer->page) + rx_buffer->page_offset;
+	data = kmap(rx_buffer->page);
 
 	if (data[3] != 0xFF ||
 	    data[frame_size + 10] != 0xBE ||
@@ -1713,9 +1713,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 	union e1000_adv_rx_desc *rx_desc;
 	struct igb_rx_buffer *rx_buffer_info;
 	struct igb_tx_buffer *tx_buffer_info;
-	struct netdev_queue *txq;
 	u16 rx_ntc, tx_ntc, count = 0;
-	unsigned int total_bytes = 0, total_packets = 0;
 
 	/* initialize next to clean and descriptor values */
 	rx_ntc = rx_ring->next_to_clean;
@@ -1726,21 +1724,24 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 		/* check rx buffer */
 		rx_buffer_info = &rx_ring->rx_buffer_info[rx_ntc];
 
-		/* unmap rx buffer, will be remapped by alloc_rx_buffers */
-		dma_unmap_single(rx_ring->dev,
-				 rx_buffer_info->dma,
-				 PAGE_SIZE / 2,
-				 DMA_FROM_DEVICE);
-		rx_buffer_info->dma = 0;
+		/* sync Rx buffer for CPU read */
+		dma_sync_single_for_cpu(rx_ring->dev,
+					rx_buffer_info->dma,
+					PAGE_SIZE / 2,
+					DMA_FROM_DEVICE);
 
 		/* verify contents of skb */
 		if (igb_check_lbtest_frame(rx_buffer_info, size))
 			count++;
 
+		/* sync Rx buffer for device write */
+		dma_sync_single_for_device(rx_ring->dev,
+					   rx_buffer_info->dma,
+					   PAGE_SIZE / 2,
+					   DMA_FROM_DEVICE);
+
 		/* unmap buffer on tx side */
 		tx_buffer_info = &tx_ring->tx_buffer_info[tx_ntc];
-		total_bytes += tx_buffer_info->bytecount;
-		total_packets += tx_buffer_info->gso_segs;
 		igb_unmap_and_free_tx_resource(tx_ring, tx_buffer_info);
 
 		/* increment rx/tx next to clean counters */
@@ -1755,8 +1756,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 		rx_desc = IGB_RX_DESC(rx_ring, rx_ntc);
 	}
 
-	txq = netdev_get_tx_queue(tx_ring->netdev, tx_ring->queue_index);
-	netdev_tx_completed_queue(txq, total_packets, total_bytes);
+	netdev_tx_reset_queue(txring_txq(tx_ring));
 
 	/* re-map buffers to ring, store next to clean values */
 	igb_alloc_rx_buffers(rx_ring, count);
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 9e38f147d10..18ad18fb426 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -2785,6 +2785,7 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	if (!rx_ring->desc)
 		goto err;
 
+	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 
@@ -3312,16 +3313,16 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	for (i = 0; i < rx_ring->count; i++) {
 		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
 
-		if (buffer_info->dma)
-			dma_unmap_page(rx_ring->dev,
-				       buffer_info->dma,
-				       PAGE_SIZE / 2,
-				       DMA_FROM_DEVICE);
-		buffer_info->dma = 0;
-		if (buffer_info->page)
-			__free_page(buffer_info->page);
+		if (!buffer_info->page)
+			continue;
+
+		dma_unmap_page(rx_ring->dev,
+			       buffer_info->dma,
+			       PAGE_SIZE,
+			       DMA_FROM_DEVICE);
+		__free_page(buffer_info->page);
+
 		buffer_info->page = NULL;
-		buffer_info->page_offset = 0;
 	}
 
 	size = sizeof(struct igb_rx_buffer) * rx_ring->count;
@@ -3330,6 +3331,7 @@ static void igb_clean_rx_ring(struct igb_ring *rx_ring)
 	/* Zero out the descriptor ring */
 	memset(rx_ring->desc, 0, rx_ring->size);
 
+	rx_ring->next_to_alloc = 0;
 	rx_ring->next_to_clean = 0;
 	rx_ring->next_to_use = 0;
 }
@@ -5828,6 +5830,104 @@ static bool igb_clean_tx_irq(struct igb_q_vector *q_vector)
 	return !!budget;
 }
 
+/**
+ * igb_reuse_rx_page - page flip buffer and store it back on the ring
+ * @rx_ring: rx descriptor ring to store buffers on
+ * @old_buff: donor buffer to have page reused
+ *
+ * Synchronizes page for reuse by the adapter
+ **/
+static void igb_reuse_rx_page(struct igb_ring *rx_ring,
+			      struct igb_rx_buffer *old_buff)
+{
+	struct igb_rx_buffer *new_buff;
+	u16 nta = rx_ring->next_to_alloc;
+
+	new_buff = &rx_ring->rx_buffer_info[nta];
+
+	/* update, and store next to alloc */
+	nta++;
+	rx_ring->next_to_alloc = (nta < rx_ring->count) ? nta : 0;
+
+	/* transfer page from old buffer to new buffer */
+	memcpy(new_buff, old_buff, sizeof(struct igb_rx_buffer));
+
+	/* sync the buffer for use by the device */
+	dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma,
+					 old_buff->page_offset,
+					 PAGE_SIZE / 2,
+					 DMA_FROM_DEVICE);
+}
+
+/**
+ * igb_add_rx_frag - Add contents of Rx buffer to sk_buff
+ * @rx_ring: rx descriptor ring to transact packets on
+ * @rx_buffer: buffer containing page to add
+ * @rx_desc: descriptor containing length of buffer written by hardware
+ * @skb: sk_buff to place the data into
+ *
+ * This function will add the data contained in rx_buffer->page to the skb.
+ * This is done either through a direct copy if the data in the buffer is
+ * less than the skb header size, otherwise it will just attach the page as
+ * a frag to the skb.
+ *
+ * The function will then update the page offset if necessary and return
+ * true if the buffer can be reused by the adapter.
+ **/
+static bool igb_add_rx_frag(struct igb_ring *rx_ring,
+			    struct igb_rx_buffer *rx_buffer,
+			    union e1000_adv_rx_desc *rx_desc,
+			    struct sk_buff *skb)
+{
+	struct page *page = rx_buffer->page;
+	unsigned int size = le16_to_cpu(rx_desc->wb.upper.length);
+
+	if ((size <= IGB_RX_HDR_LEN) && !skb_is_nonlinear(skb)) {
+		unsigned char *va = page_address(page) + rx_buffer->page_offset;
+
+#ifdef CONFIG_IGB_PTP
+		if (igb_test_staterr(rx_desc, E1000_RXDADV_STAT_TSIP)) {
+			igb_ptp_rx_pktstamp(rx_ring->q_vector, va, skb);
+			va += IGB_TS_HDR_LEN;
+			size -= IGB_TS_HDR_LEN;
+		}
+
+#endif
+		memcpy(__skb_put(skb, size), va, ALIGN(size, sizeof(long)));
+
+		/* we can reuse buffer as-is, just make sure it is local */
+		if (likely(page_to_nid(page) == numa_node_id()))
+			return true;
+
+		/* this page cannot be reused so discard it */
+		put_page(page);
+		return false;
+	}
+
+	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
+			rx_buffer->page_offset, size, PAGE_SIZE / 2);
+
+	/* avoid re-using remote pages */
+	if (unlikely(page_to_nid(page) != numa_node_id()))
+		return false;
+
+	/* if we are only owner of page we can reuse it */
+	if (unlikely(page_count(page) != 1))
+		return false;
+
+	/* flip page offset to other buffer */
+	rx_buffer->page_offset ^= PAGE_SIZE / 2;
+
+	/*
+	 * since we are the only owner of the page and we need to
+	 * increment it, just set the value to 2 in order to avoid
+	 * an unnecessary locked operation
+	 */
+	atomic_set(&page->_count, 2);
+
+	return true;
+}
+
 static inline void igb_rx_checksum(struct igb_ring *ring,
 				   union e1000_adv_rx_desc *rx_desc,
 				   struct sk_buff *skb)
@@ -5985,6 +6085,7 @@ static unsigned int igb_get_headlen(unsigned char *data,
 /**
  * igb_pull_tail - igb specific version of skb_pull_tail
  * @rx_ring: rx descriptor ring packet is being transacted on
+ * @rx_desc: pointer to the EOP Rx descriptor
  * @skb: pointer to current skb being adjusted
  *
  * This function is an igb specific version of __pskb_pull_tail.  The
@@ -6131,7 +6232,6 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 	struct igb_ring *rx_ring = q_vector->rx.ring;
 	union e1000_adv_rx_desc *rx_desc;
 	struct sk_buff *skb = rx_ring->skb;
-	const int current_node = numa_node_id();
 	unsigned int total_bytes = 0, total_packets = 0;
 	u16 cleaned_count = igb_desc_unused(rx_ring);
 	u16 i = rx_ring->next_to_clean;
@@ -6186,20 +6286,25 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 			prefetchw(skb->data);
 		}
 
-		skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
-				buffer_info->page_offset,
-				le16_to_cpu(rx_desc->wb.upper.length),
-				PAGE_SIZE / 2);
-
-		if ((page_count(buffer_info->page) != 1) ||
-		    (page_to_nid(buffer_info->page) != current_node))
-			buffer_info->page = NULL;
-		else
-			get_page(buffer_info->page);
+		/* we are reusing so sync this buffer for CPU use */
+		dma_sync_single_range_for_cpu(rx_ring->dev,
+					      buffer_info->dma,
+					      buffer_info->page_offset,
+					      PAGE_SIZE / 2,
+					      DMA_FROM_DEVICE);
+
+		/* pull page into skb */
+		if (igb_add_rx_frag(rx_ring, buffer_info, rx_desc, skb)) {
+			/* hand second half of page back to the ring */
+			igb_reuse_rx_page(rx_ring, buffer_info);
+		} else {
+			/* we are not reusing the buffer so unmap it */
+			dma_unmap_page(rx_ring->dev, buffer_info->dma,
+				       PAGE_SIZE, DMA_FROM_DEVICE);
+		}
 
-		dma_unmap_page(rx_ring->dev, buffer_info->dma,
-			       PAGE_SIZE / 2, DMA_FROM_DEVICE);
-		buffer_info->dma = 0;
+		/* clear contents of buffer_info */
+		buffer_info->page = NULL;
 
 		if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP))
 			goto next_desc;
@@ -6259,32 +6364,36 @@ static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
 				  struct igb_rx_buffer *bi)
 {
 	struct page *page = bi->page;
-	dma_addr_t dma = bi->dma;
-	unsigned int page_offset = bi->page_offset ^ (PAGE_SIZE / 2);
+	dma_addr_t dma;
 
-	if (dma)
+	/* since we are recycling buffers we should seldom need to alloc */
+	if (likely(page))
 		return true;
 
-	if (!page) {
-		page = __skb_alloc_page(GFP_ATOMIC | __GFP_COLD, NULL);
-		if (unlikely(!page)) {
-			rx_ring->rx_stats.alloc_failed++;
-			return false;
-		}
-		bi->page = page;
+	/* alloc new page for storage */
+	page = __skb_alloc_page(GFP_ATOMIC | __GFP_COLD, NULL);
+	if (unlikely(!page)) {
+		rx_ring->rx_stats.alloc_failed++;
+		return false;
 	}
 
-	dma = dma_map_page(rx_ring->dev, page,
-			   page_offset, PAGE_SIZE / 2,
-			   DMA_FROM_DEVICE);
+	/* map page for use */
+	dma = dma_map_page(rx_ring->dev, page, 0, PAGE_SIZE, DMA_FROM_DEVICE);
 
+	/*
+	 * if mapping failed free memory back to system since
+	 * there isn't much point in holding memory we can't use
+	 */
 	if (dma_mapping_error(rx_ring->dev, dma)) {
+		__free_page(page);
+
 		rx_ring->rx_stats.alloc_failed++;
 		return false;
 	}
 
 	bi->dma = dma;
-	bi->page_offset = page_offset;
+	bi->page = page;
+	bi->page_offset = 0;
 
 	return true;
 }
@@ -6299,17 +6408,23 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 	struct igb_rx_buffer *bi;
 	u16 i = rx_ring->next_to_use;
 
+	/* nothing to do */
+	if (!cleaned_count)
+		return;
+
 	rx_desc = IGB_RX_DESC(rx_ring, i);
 	bi = &rx_ring->rx_buffer_info[i];
 	i -= rx_ring->count;
 
-	while (cleaned_count--) {
+	do {
 		if (!igb_alloc_mapped_page(rx_ring, bi))
 			break;
 
-		/* Refresh the desc even if buffer_addrs didn't change
-		 * because each write-back erases this info. */
-		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma);
+		/*
+		 * Refresh the desc even if buffer_addrs didn't change
+		 * because each write-back erases this info.
+		 */
+		rx_desc->read.pkt_addr = cpu_to_le64(bi->dma + bi->page_offset);
 
 		rx_desc++;
 		bi++;
@@ -6322,17 +6437,25 @@ void igb_alloc_rx_buffers(struct igb_ring *rx_ring, u16 cleaned_count)
 
 		/* clear the hdr_addr for the next_to_use descriptor */
 		rx_desc->read.hdr_addr = 0;
-	}
+
+		cleaned_count--;
+	} while (cleaned_count);
 
 	i += rx_ring->count;
 
 	if (rx_ring->next_to_use != i) {
+		/* record the next descriptor to use */
 		rx_ring->next_to_use = i;
 
-		/* Force memory writes to complete before letting h/w
+		/* update next to alloc since we have filled the ring */
+		rx_ring->next_to_alloc = i;
+
+		/*
+		 * Force memory writes to complete before letting h/w
 		 * know there are new descriptors to fetch.  (Only
 		 * applicable for weak-ordered memory model archs,
-		 * such as IA-64). */
+		 * such as IA-64).
+		 */
 		wmb();
 		writel(i, rx_ring->tail);
 	}
-- 
cgit v1.2.3


From 2e334eee9bef61505b6727b356e724033da55ae3 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:31:07 +0000
Subject: igb: Move rx_buffer related code in Rx cleanup path into separate
 function

In order to try and isolate things a bit further I am moving the code
related to retrieving data from the rx_buffer_info structure into a
separate function.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 206 +++++++++++++++++-------------
 1 file changed, 120 insertions(+), 86 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 18ad18fb426..fa7ddec4cfe 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -5928,6 +5928,74 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring,
 	return true;
 }
 
+static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
+					   union e1000_adv_rx_desc *rx_desc,
+					   struct sk_buff *skb)
+{
+	struct igb_rx_buffer *rx_buffer;
+	struct page *page;
+
+	rx_buffer = &rx_ring->rx_buffer_info[rx_ring->next_to_clean];
+
+	/*
+	 * This memory barrier is needed to keep us from reading
+	 * any other fields out of the rx_desc until we know the
+	 * RXD_STAT_DD bit is set
+	 */
+	rmb();
+
+	page = rx_buffer->page;
+	prefetchw(page);
+
+	if (likely(!skb)) {
+		void *page_addr = page_address(page) +
+				  rx_buffer->page_offset;
+
+		/* prefetch first cache line of first page */
+		prefetch(page_addr);
+#if L1_CACHE_BYTES < 128
+		prefetch(page_addr + L1_CACHE_BYTES);
+#endif
+
+		/* allocate a skb to store the frags */
+		skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
+						IGB_RX_HDR_LEN);
+		if (unlikely(!skb)) {
+			rx_ring->rx_stats.alloc_failed++;
+			return NULL;
+		}
+
+		/*
+		 * we will be copying header into skb->data in
+		 * pskb_may_pull so it is in our interest to prefetch
+		 * it now to avoid a possible cache miss
+		 */
+		prefetchw(skb->data);
+	}
+
+	/* we are reusing so sync this buffer for CPU use */
+	dma_sync_single_range_for_cpu(rx_ring->dev,
+				      rx_buffer->dma,
+				      rx_buffer->page_offset,
+				      PAGE_SIZE / 2,
+				      DMA_FROM_DEVICE);
+
+	/* pull page into skb */
+	if (igb_add_rx_frag(rx_ring, rx_buffer, rx_desc, skb)) {
+		/* hand second half of page back to the ring */
+		igb_reuse_rx_page(rx_ring, rx_buffer);
+	} else {
+		/* we are not reusing the buffer so unmap it */
+		dma_unmap_page(rx_ring->dev, rx_buffer->dma,
+			       PAGE_SIZE, DMA_FROM_DEVICE);
+	}
+
+	/* clear contents of rx_buffer */
+	rx_buffer->page = NULL;
+
+	return skb;
+}
+
 static inline void igb_rx_checksum(struct igb_ring *ring,
 				   union e1000_adv_rx_desc *rx_desc,
 				   struct sk_buff *skb)
@@ -5977,6 +6045,34 @@ static inline void igb_rx_hash(struct igb_ring *ring,
 		skb->rxhash = le32_to_cpu(rx_desc->wb.lower.hi_dword.rss);
 }
 
+/**
+ * igb_is_non_eop - process handling of non-EOP buffers
+ * @rx_ring: Rx ring being processed
+ * @rx_desc: Rx descriptor for current buffer
+ *
+ * This function advances next_to_clean by one descriptor, wrapping back
+ * to 0 at the end of the ring, and prefetches the descriptor it now
+ * points at.  If the buffer is an EOP buffer this function exits
+ * returning false, otherwise it returns true indicating that this is in
+ * fact a non-EOP buffer.
+ **/
+static bool igb_is_non_eop(struct igb_ring *rx_ring,
+			   union e1000_adv_rx_desc *rx_desc)
+{
+	u32 ntc = rx_ring->next_to_clean + 1;
+
+	/* fetch, update, and store next to clean */
+	ntc = (ntc < rx_ring->count) ? ntc : 0;
+	rx_ring->next_to_clean = ntc;
+
+	prefetch(IGB_RX_DESC(rx_ring, ntc));
+
+	if (likely(igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP)))
+		return false;
+
+	return true;
+}
+
 /**
  * igb_get_headlen - determine size of header for LRO/GRO
  * @data: pointer to the start of the headers
@@ -6227,87 +6323,39 @@ static void igb_process_skb_fields(struct igb_ring *rx_ring,
 	skb->protocol = eth_type_trans(skb, rx_ring->netdev);
 }
 
-static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
+static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, const int budget)
 {
 	struct igb_ring *rx_ring = q_vector->rx.ring;
-	union e1000_adv_rx_desc *rx_desc;
 	struct sk_buff *skb = rx_ring->skb;
 	unsigned int total_bytes = 0, total_packets = 0;
 	u16 cleaned_count = igb_desc_unused(rx_ring);
-	u16 i = rx_ring->next_to_clean;
-
-	rx_desc = IGB_RX_DESC(rx_ring, i);
-
-	while (igb_test_staterr(rx_desc, E1000_RXD_STAT_DD)) {
-		struct igb_rx_buffer *buffer_info = &rx_ring->rx_buffer_info[i];
-		struct page *page;
-		union e1000_adv_rx_desc *next_rxd;
-
-		i++;
-		if (i == rx_ring->count)
-			i = 0;
-
-		next_rxd = IGB_RX_DESC(rx_ring, i);
-		prefetch(next_rxd);
 
-		/*
-		 * This memory barrier is needed to keep us from reading
-		 * any other fields out of the rx_desc until we know the
-		 * RXD_STAT_DD bit is set
-		 */
-		rmb();
-
-		page = buffer_info->page;
-		prefetchw(page);
+	do {
+		union e1000_adv_rx_desc *rx_desc;
 
-		if (likely(!skb)) {
-			void *page_addr = page_address(page) +
-					  buffer_info->page_offset;
+		/* return some buffers to hardware, one at a time is too slow */
+		if (cleaned_count >= IGB_RX_BUFFER_WRITE) {
+			igb_alloc_rx_buffers(rx_ring, cleaned_count);
+			cleaned_count = 0;
+		}
 
-			/* prefetch first cache line of first page */
-			prefetch(page_addr);
-#if L1_CACHE_BYTES < 128
-			prefetch(page_addr + L1_CACHE_BYTES);
-#endif
+		rx_desc = IGB_RX_DESC(rx_ring, rx_ring->next_to_clean);
 
-			/* allocate a skb to store the frags */
-			skb = netdev_alloc_skb_ip_align(rx_ring->netdev,
-							IGB_RX_HDR_LEN);
-			if (unlikely(!skb)) {
-				rx_ring->rx_stats.alloc_failed++;
-				break;
-			}
+		if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_DD))
+			break;
 
-			/*
-			 * we will be copying header into skb->data in
-			 * pskb_may_pull so it is in our interest to prefetch
-			 * it now to avoid a possible cache miss
-			 */
-			prefetchw(skb->data);
-		}
+		/* retrieve a buffer from the ring */
+		skb = igb_fetch_rx_buffer(rx_ring, rx_desc, skb);
 
-		/* we are reusing so sync this buffer for CPU use */
-		dma_sync_single_range_for_cpu(rx_ring->dev,
-					      buffer_info->dma,
-					      buffer_info->page_offset,
-					      PAGE_SIZE / 2,
-					      DMA_FROM_DEVICE);
-
-		/* pull page into skb */
-		if (igb_add_rx_frag(rx_ring, buffer_info, rx_desc, skb)) {
-			/* hand second half of page back to the ring */
-			igb_reuse_rx_page(rx_ring, buffer_info);
-		} else {
-			/* we are not reusing the buffer so unmap it */
-			dma_unmap_page(rx_ring->dev, buffer_info->dma,
-				       PAGE_SIZE, DMA_FROM_DEVICE);
-		}
+		/* exit if we failed to retrieve a buffer */
+		if (!skb)
+			break;
 
-		/* clear contents of buffer_info */
-		buffer_info->page = NULL;
+		cleaned_count++;
 
-		if (!igb_test_staterr(rx_desc, E1000_RXD_STAT_EOP))
-			goto next_desc;
+		/* fetch next buffer in frame if non-eop */
+		if (igb_is_non_eop(rx_ring, rx_desc))
+			continue;
 
 		/* verify the packet layout is correct */
 		if (igb_cleanup_headers(rx_ring, rx_desc, skb)) {
@@ -6317,7 +6365,6 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 
 		/* probably a little skewed due to removing CRC */
 		total_bytes += skb->len;
-		total_packets++;
 
 		/* populate checksum, timestamp, VLAN, and protocol */
 		igb_process_skb_fields(rx_ring, rx_desc, skb);
@@ -6327,26 +6374,13 @@ static bool igb_clean_rx_irq(struct igb_q_vector *q_vector, int budget)
 		/* reset skb pointer */
 		skb = NULL;
 
-		budget--;
-next_desc:
-		if (!budget)
-			break;
-
-		cleaned_count++;
-		/* return some buffers to hardware, one at a time is too slow */
-		if (cleaned_count >= IGB_RX_BUFFER_WRITE) {
-			igb_alloc_rx_buffers(rx_ring, cleaned_count);
-			cleaned_count = 0;
-		}
-
-		/* use prefetched values */
-		rx_desc = next_rxd;
-	}
+		/* update budget accounting */
+		total_packets++;
+	} while (likely(total_packets < budget));
 
 	/* place incomplete frames back on ring for completion */
 	rx_ring->skb = skb;
 
-	rx_ring->next_to_clean = i;
 	u64_stats_update_begin(&rx_ring->rx_syncp);
 	rx_ring->rx_stats.packets += total_packets;
 	rx_ring->rx_stats.bytes += total_bytes;
@@ -6357,7 +6391,7 @@ next_desc:
 	if (cleaned_count)
 		igb_alloc_rx_buffers(rx_ring, cleaned_count);
 
-	return !!budget;
+	return (total_packets < budget);
 }
 
 static bool igb_alloc_mapped_page(struct igb_ring *rx_ring,
-- 
cgit v1.2.3


From de78d1f9c83d0aceca42c17abbbf730ebdc2fc6e Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:31:12 +0000
Subject: igb: Lock buffer size at 2K even on systems with larger pages

This change locks us in at 2K buffers even on a system that supports larger
pages.  The reason for this change is to make better use of pages and to
reduce the overall truesize of frames generated by igb.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h         |  7 ++++---
 drivers/net/ethernet/intel/igb/igb_ethtool.c |  4 ++--
 drivers/net/ethernet/intel/igb/igb_main.c    | 27 +++++++++++++++++----------
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index 1d15bb0b1e9..d3fd0127c0c 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -132,9 +132,10 @@ struct vf_data_storage {
 #define MAXIMUM_ETHERNET_VLAN_SIZE 1522
 
 /* Supported Rx Buffer Sizes */
-#define IGB_RXBUFFER_256   256
-#define IGB_RXBUFFER_16384 16384
-#define IGB_RX_HDR_LEN     IGB_RXBUFFER_256
+#define IGB_RXBUFFER_256	256
+#define IGB_RXBUFFER_2048	2048
+#define IGB_RX_HDR_LEN		IGB_RXBUFFER_256
+#define IGB_RX_BUFSZ		IGB_RXBUFFER_2048
 
 /* How many Tx Descriptors do we need to call netif_wake_queue ? */
 #define IGB_TX_QUEUE_WAKE	16
diff --git a/drivers/net/ethernet/intel/igb/igb_ethtool.c b/drivers/net/ethernet/intel/igb/igb_ethtool.c
index 96c6df65726..375c0dad8d2 100644
--- a/drivers/net/ethernet/intel/igb/igb_ethtool.c
+++ b/drivers/net/ethernet/intel/igb/igb_ethtool.c
@@ -1727,7 +1727,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 		/* sync Rx buffer for CPU read */
 		dma_sync_single_for_cpu(rx_ring->dev,
 					rx_buffer_info->dma,
-					PAGE_SIZE / 2,
+					IGB_RX_BUFSZ,
 					DMA_FROM_DEVICE);
 
 		/* verify contents of skb */
@@ -1737,7 +1737,7 @@ static int igb_clean_test_rings(struct igb_ring *rx_ring,
 		/* sync Rx buffer for device write */
 		dma_sync_single_for_device(rx_ring->dev,
 					   rx_buffer_info->dma,
-					   PAGE_SIZE / 2,
+					   IGB_RX_BUFSZ,
 					   DMA_FROM_DEVICE);
 
 		/* unmap buffer on tx side */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index fa7ddec4cfe..0141ef3ea67 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -554,7 +554,7 @@ rx_ring_summary:
 					  16, 1,
 					  page_address(buffer_info->page) +
 						      buffer_info->page_offset,
-					  PAGE_SIZE/2, true);
+					  IGB_RX_BUFSZ, true);
 				}
 			}
 		}
@@ -3103,11 +3103,7 @@ void igb_configure_rx_ring(struct igb_adapter *adapter,
 
 	/* set descriptor configuration */
 	srrctl = IGB_RX_HDR_LEN << E1000_SRRCTL_BSIZEHDRSIZE_SHIFT;
-#if (PAGE_SIZE / 2) > IGB_RXBUFFER_16384
-	srrctl |= IGB_RXBUFFER_16384 >> E1000_SRRCTL_BSIZEPKT_SHIFT;
-#else
-	srrctl |= (PAGE_SIZE / 2) >> E1000_SRRCTL_BSIZEPKT_SHIFT;
-#endif
+	srrctl |= IGB_RX_BUFSZ >> E1000_SRRCTL_BSIZEPKT_SHIFT;
 	srrctl |= E1000_SRRCTL_DESCTYPE_ADV_ONEBUF;
 #ifdef CONFIG_IGB_PTP
 	if (hw->mac.type >= e1000_82580)
@@ -5855,7 +5851,7 @@ static void igb_reuse_rx_page(struct igb_ring *rx_ring,
 	/* sync the buffer for use by the device */
 	dma_sync_single_range_for_device(rx_ring->dev, old_buff->dma,
 					 old_buff->page_offset,
-					 PAGE_SIZE / 2,
+					 IGB_RX_BUFSZ,
 					 DMA_FROM_DEVICE);
 }
 
@@ -5905,18 +5901,19 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring,
 	}
 
 	skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, page,
-			rx_buffer->page_offset, size, PAGE_SIZE / 2);
+			rx_buffer->page_offset, size, IGB_RX_BUFSZ);
 
 	/* avoid re-using remote pages */
 	if (unlikely(page_to_nid(page) != numa_node_id()))
 		return false;
 
+#if (PAGE_SIZE < 8192)
 	/* if we are only owner of page we can reuse it */
 	if (unlikely(page_count(page) != 1))
 		return false;
 
 	/* flip page offset to other buffer */
-	rx_buffer->page_offset ^= PAGE_SIZE / 2;
+	rx_buffer->page_offset ^= IGB_RX_BUFSZ;
 
 	/*
 	 * since we are the only owner of the page and we need to
@@ -5924,6 +5921,16 @@ static bool igb_add_rx_frag(struct igb_ring *rx_ring,
 	 * an unnecessary locked operation
 	 */
 	atomic_set(&page->_count, 2);
+#else
+	/* move offset up to the next cache line */
+	rx_buffer->page_offset += SKB_DATA_ALIGN(size);
+
+	if (rx_buffer->page_offset > (PAGE_SIZE - IGB_RX_BUFSZ))
+		return false;
+
+	/* bump ref count on page before it is given to the stack */
+	get_page(page);
+#endif
 
 	return true;
 }
@@ -5977,7 +5984,7 @@ static struct sk_buff *igb_fetch_rx_buffer(struct igb_ring *rx_ring,
 	dma_sync_single_range_for_cpu(rx_ring->dev,
 				      rx_buffer->dma,
 				      rx_buffer->page_offset,
-				      PAGE_SIZE / 2,
+				      IGB_RX_BUFSZ,
 				      DMA_FROM_DEVICE);
 
 	/* pull page into skb */
-- 
cgit v1.2.3


From 5536d2102a2d37a02e2c233ead4e1e4cabbdcd5b Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:31:17 +0000
Subject: igb: Combine q_vector and ring allocation into a single function

This change combines the allocation of q_vectors and rings into a single
function.  The advantage of this is that we are guaranteed we will avoid
overlap in the L1 cache sets.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb.h      |  42 ++--
 drivers/net/ethernet/intel/igb/igb_main.c | 375 +++++++++++++++---------------
 2 files changed, 215 insertions(+), 202 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb.h b/drivers/net/ethernet/intel/igb/igb.h
index d3fd0127c0c..be1971b1829 100644
--- a/drivers/net/ethernet/intel/igb/igb.h
+++ b/drivers/net/ethernet/intel/igb/igb.h
@@ -204,22 +204,6 @@ struct igb_ring_container {
 	u8 itr;				/* current ITR setting for ring */
 };
 
-struct igb_q_vector {
-	struct igb_adapter *adapter;	/* backlink */
-	int cpu;			/* CPU for DCA */
-	u32 eims_value;			/* EIMS mask value */
-
-	struct igb_ring_container rx, tx;
-
-	struct napi_struct napi;
-
-	u16 itr_val;
-	u8 set_itr;
-	void __iomem *itr_register;
-
-	char name[IFNAMSIZ + 9];
-};
-
 struct igb_ring {
 	struct igb_q_vector *q_vector;	/* backlink to q_vector */
 	struct net_device *netdev;	/* back pointer to net_device */
@@ -231,14 +215,15 @@ struct igb_ring {
 	void *desc;			/* descriptor ring memory */
 	unsigned long flags;		/* ring specific flags */
 	void __iomem *tail;		/* pointer to ring tail register */
+	dma_addr_t dma;			/* phys address of the ring */
+	unsigned int  size;		/* length of desc. ring in bytes */
 
 	u16 count;			/* number of desc. in the ring */
 	u8 queue_index;			/* logical index of the ring*/
 	u8 reg_idx;			/* physical index of the ring */
-	u32 size;			/* length of desc. ring in bytes */
 
 	/* everything past this point are written often */
-	u16 next_to_clean ____cacheline_aligned_in_smp;
+	u16 next_to_clean;
 	u16 next_to_use;
 	u16 next_to_alloc;
 
@@ -256,8 +241,25 @@ struct igb_ring {
 			struct u64_stats_sync rx_syncp;
 		};
 	};
-	/* Items past this point are only used during ring alloc / free */
-	dma_addr_t dma;                /* phys address of the ring */
+} ____cacheline_internodealigned_in_smp;
+
+struct igb_q_vector {
+	struct igb_adapter *adapter;	/* backlink */
+	int cpu;			/* CPU for DCA */
+	u32 eims_value;			/* EIMS mask value */
+
+	u16 itr_val;
+	u8 set_itr;
+	void __iomem *itr_register;
+
+	struct igb_ring_container rx, tx;
+
+	struct napi_struct napi;
+	struct rcu_head rcu;	/* to avoid race with update stats on free */
+	char name[IFNAMSIZ + 9];
+
+	/* for dynamic allocation of rings associated with this q_vector */
+	struct igb_ring ring[0] ____cacheline_internodealigned_in_smp;
 };
 
 enum e1000_ring_flags_t {
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 0141ef3ea67..4a25b8fa908 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -652,80 +652,6 @@ static void igb_cache_ring_register(struct igb_adapter *adapter)
 	}
 }
 
-static void igb_free_queues(struct igb_adapter *adapter)
-{
-	int i;
-
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		kfree(adapter->tx_ring[i]);
-		adapter->tx_ring[i] = NULL;
-	}
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		kfree(adapter->rx_ring[i]);
-		adapter->rx_ring[i] = NULL;
-	}
-	adapter->num_rx_queues = 0;
-	adapter->num_tx_queues = 0;
-}
-
-/**
- * igb_alloc_queues - Allocate memory for all rings
- * @adapter: board private structure to initialize
- *
- * We allocate one ring per queue at run-time since we don't know the
- * number of queues at compile-time.
- **/
-static int igb_alloc_queues(struct igb_adapter *adapter)
-{
-	struct igb_ring *ring;
-	int i;
-
-	for (i = 0; i < adapter->num_tx_queues; i++) {
-		ring = kzalloc(sizeof(struct igb_ring), GFP_KERNEL);
-		if (!ring)
-			goto err;
-		ring->count = adapter->tx_ring_count;
-		ring->queue_index = i;
-		ring->dev = &adapter->pdev->dev;
-		ring->netdev = adapter->netdev;
-		/* For 82575, context index must be unique per ring. */
-		if (adapter->hw.mac.type == e1000_82575)
-			set_bit(IGB_RING_FLAG_TX_CTX_IDX, &ring->flags);
-		adapter->tx_ring[i] = ring;
-	}
-
-	for (i = 0; i < adapter->num_rx_queues; i++) {
-		ring = kzalloc(sizeof(struct igb_ring), GFP_KERNEL);
-		if (!ring)
-			goto err;
-		ring->count = adapter->rx_ring_count;
-		ring->queue_index = i;
-		ring->dev = &adapter->pdev->dev;
-		ring->netdev = adapter->netdev;
-		/* set flag indicating ring supports SCTP checksum offload */
-		if (adapter->hw.mac.type >= e1000_82576)
-			set_bit(IGB_RING_FLAG_RX_SCTP_CSUM, &ring->flags);
-
-		/*
-		 * On i350, i210, and i211, loopback VLAN packets
-		 * have the tag byte-swapped.
-		 * */
-		if (adapter->hw.mac.type >= e1000_i350)
-			set_bit(IGB_RING_FLAG_RX_LB_VLAN_BSWAP, &ring->flags);
-
-		adapter->rx_ring[i] = ring;
-	}
-
-	igb_cache_ring_register(adapter);
-
-	return 0;
-
-err:
-	igb_free_queues(adapter);
-
-	return -ENOMEM;
-}
-
 /**
  *  igb_write_ivar - configure ivar for given MSI-X vector
  *  @hw: pointer to the HW structure
@@ -955,6 +881,35 @@ static void igb_reset_interrupt_capability(struct igb_adapter *adapter)
 	}
 }
 
+/**
+ * igb_free_q_vector - Free memory allocated for specific interrupt vector
+ * @adapter: board private structure that owns the vector
+ * @v_idx: Index of vector to be freed
+ *
+ * Clears the adapter's Tx/Rx ring table entries that point into this vector,
+ * removes the vector's NAPI context, and then frees the q_vector once an RCU
+ * grace period has elapsed.
+ **/
+static void igb_free_q_vector(struct igb_adapter *adapter, int v_idx)
+{
+	struct igb_q_vector *q_vector = adapter->q_vector[v_idx];
+
+	if (q_vector->tx.ring)
+		adapter->tx_ring[q_vector->tx.ring->queue_index] = NULL;
+
+	if (q_vector->rx.ring)
+		adapter->rx_ring[q_vector->rx.ring->queue_index] = NULL;
+
+	adapter->q_vector[v_idx] = NULL;
+	netif_napi_del(&q_vector->napi);
+
+	/*
+	 * igb_get_stats64() might access the rings on this vector,
+	 * we must wait a grace period before freeing it.
+	 */
+	kfree_rcu(q_vector, rcu);
+}
+
 /**
  * igb_free_q_vectors - Free memory allocated for interrupt vectors
  * @adapter: board private structure to initialize
@@ -965,17 +920,14 @@ static void igb_reset_interrupt_capability(struct igb_adapter *adapter)
  **/
 static void igb_free_q_vectors(struct igb_adapter *adapter)
 {
-	int v_idx;
+	int v_idx = adapter->num_q_vectors;
 
-	for (v_idx = 0; v_idx < adapter->num_q_vectors; v_idx++) {
-		struct igb_q_vector *q_vector = adapter->q_vector[v_idx];
-		adapter->q_vector[v_idx] = NULL;
-		if (!q_vector)
-			continue;
-		netif_napi_del(&q_vector->napi);
-		kfree(q_vector);
-	}
+	adapter->num_tx_queues = 0;
+	adapter->num_rx_queues = 0;
 	adapter->num_q_vectors = 0;
+
+	while (v_idx--)
+		igb_free_q_vector(adapter, v_idx);
 }
 
 /**
@@ -986,7 +938,6 @@ static void igb_free_q_vectors(struct igb_adapter *adapter)
  */
 static void igb_clear_interrupt_scheme(struct igb_adapter *adapter)
 {
-	igb_free_queues(adapter);
 	igb_free_q_vectors(adapter);
 	igb_reset_interrupt_capability(adapter);
 }
@@ -1074,95 +1025,181 @@ out:
 	return err;
 }
 
+static void igb_add_ring(struct igb_ring *ring,
+			 struct igb_ring_container *head)
+{
+	head->ring = ring;
+	head->count++;
+}
+
 /**
- * igb_alloc_q_vectors - Allocate memory for interrupt vectors
+ * igb_alloc_q_vector - Allocate memory for a single interrupt vector
  * @adapter: board private structure to initialize
+ * @v_count: q_vectors allocated on adapter, used for ring interleaving
+ * @v_idx: index of vector in adapter struct
+ * @txr_count: total number of Tx rings to allocate
+ * @txr_idx: index of first Tx ring to allocate
+ * @rxr_count: total number of Rx rings to allocate
+ * @rxr_idx: index of first Rx ring to allocate
  *
- * We allocate one q_vector per queue interrupt.  If allocation fails we
- * return -ENOMEM.
+ * We allocate one q_vector.  If allocation fails we return -ENOMEM.
  **/
-static int igb_alloc_q_vectors(struct igb_adapter *adapter)
+static int igb_alloc_q_vector(struct igb_adapter *adapter,
+			      int v_count, int v_idx,
+			      int txr_count, int txr_idx,
+			      int rxr_count, int rxr_idx)
 {
 	struct igb_q_vector *q_vector;
-	struct e1000_hw *hw = &adapter->hw;
-	int v_idx;
+	struct igb_ring *ring;
+	int ring_count, size;
 
-	for (v_idx = 0; v_idx < adapter->num_q_vectors; v_idx++) {
-		q_vector = kzalloc(sizeof(struct igb_q_vector),
-				   GFP_KERNEL);
-		if (!q_vector)
-			goto err_out;
-		q_vector->adapter = adapter;
-		q_vector->itr_register = hw->hw_addr + E1000_EITR(0);
-		q_vector->itr_val = IGB_START_ITR;
-		netif_napi_add(adapter->netdev, &q_vector->napi, igb_poll, 64);
-		adapter->q_vector[v_idx] = q_vector;
+	/* igb only supports 1 Tx and/or 1 Rx queue per vector */
+	if (txr_count > 1 || rxr_count > 1)
+		return -ENOMEM;
+
+	ring_count = txr_count + rxr_count;
+	size = sizeof(struct igb_q_vector) +
+	       (sizeof(struct igb_ring) * ring_count);
+
+	/* allocate q_vector and rings */
+	q_vector = kzalloc(size, GFP_KERNEL);
+	if (!q_vector)
+		return -ENOMEM;
+
+	/* initialize NAPI */
+	netif_napi_add(adapter->netdev, &q_vector->napi,
+		       igb_poll, 64);
+
+	/* tie q_vector and adapter together */
+	adapter->q_vector[v_idx] = q_vector;
+	q_vector->adapter = adapter;
+
+	/* initialize work limits */
+	q_vector->tx.work_limit = adapter->tx_work_limit;
+
+	/* initialize ITR configuration */
+	q_vector->itr_register = adapter->hw.hw_addr + E1000_EITR(0);
+	q_vector->itr_val = IGB_START_ITR;
+
+	/* initialize pointer to rings */
+	ring = q_vector->ring;
+
+	if (txr_count) {
+		/* assign generic ring traits */
+		ring->dev = &adapter->pdev->dev;
+		ring->netdev = adapter->netdev;
+
+		/* configure backlink on ring */
+		ring->q_vector = q_vector;
+
+		/* update q_vector Tx values */
+		igb_add_ring(ring, &q_vector->tx);
+
+		/* For 82575, context index must be unique per ring. */
+		if (adapter->hw.mac.type == e1000_82575)
+			set_bit(IGB_RING_FLAG_TX_CTX_IDX, &ring->flags);
+
+		/* apply Tx specific ring traits */
+		ring->count = adapter->tx_ring_count;
+		ring->queue_index = txr_idx;
+
+		/* assign ring to adapter */
+		adapter->tx_ring[txr_idx] = ring;
+
+		/* push pointer to next ring */
+		ring++;
 	}
 
-	return 0;
+	if (rxr_count) {
+		/* assign generic ring traits */
+		ring->dev = &adapter->pdev->dev;
+		ring->netdev = adapter->netdev;
 
-err_out:
-	igb_free_q_vectors(adapter);
-	return -ENOMEM;
-}
+		/* configure backlink on ring */
+		ring->q_vector = q_vector;
 
-static void igb_map_rx_ring_to_vector(struct igb_adapter *adapter,
-                                      int ring_idx, int v_idx)
-{
-	struct igb_q_vector *q_vector = adapter->q_vector[v_idx];
+		/* update q_vector Rx values */
+		igb_add_ring(ring, &q_vector->rx);
 
-	q_vector->rx.ring = adapter->rx_ring[ring_idx];
-	q_vector->rx.ring->q_vector = q_vector;
-	q_vector->rx.count++;
-	q_vector->itr_val = adapter->rx_itr_setting;
-	if (q_vector->itr_val && q_vector->itr_val <= 3)
-		q_vector->itr_val = IGB_START_ITR;
-}
+		/* set flag indicating ring supports SCTP checksum offload */
+		if (adapter->hw.mac.type >= e1000_82576)
+			set_bit(IGB_RING_FLAG_RX_SCTP_CSUM, &ring->flags);
 
-static void igb_map_tx_ring_to_vector(struct igb_adapter *adapter,
-                                      int ring_idx, int v_idx)
-{
-	struct igb_q_vector *q_vector = adapter->q_vector[v_idx];
+		/*
+		 * On i350, i210, and i211, loopback VLAN packets
+		 * have the tag byte-swapped.
+		 * */
+		if (adapter->hw.mac.type >= e1000_i350)
+			set_bit(IGB_RING_FLAG_RX_LB_VLAN_BSWAP, &ring->flags);
 
-	q_vector->tx.ring = adapter->tx_ring[ring_idx];
-	q_vector->tx.ring->q_vector = q_vector;
-	q_vector->tx.count++;
-	q_vector->itr_val = adapter->tx_itr_setting;
-	q_vector->tx.work_limit = adapter->tx_work_limit;
-	if (q_vector->itr_val && q_vector->itr_val <= 3)
-		q_vector->itr_val = IGB_START_ITR;
+		/* apply Rx specific ring traits */
+		ring->count = adapter->rx_ring_count;
+		ring->queue_index = rxr_idx;
+
+		/* assign ring to adapter */
+		adapter->rx_ring[rxr_idx] = ring;
+	}
+
+	return 0;
 }
 
+
 /**
- * igb_map_ring_to_vector - maps allocated queues to vectors
+ * igb_alloc_q_vectors - Allocate memory for interrupt vectors
+ * @adapter: board private structure to initialize
  *
- * This function maps the recently allocated queues to vectors.
+ * We allocate one q_vector per queue interrupt.  If allocation fails we
+ * return -ENOMEM.
  **/
-static int igb_map_ring_to_vector(struct igb_adapter *adapter)
+static int igb_alloc_q_vectors(struct igb_adapter *adapter)
 {
-	int i;
-	int v_idx = 0;
+	int q_vectors = adapter->num_q_vectors;
+	int rxr_remaining = adapter->num_rx_queues;
+	int txr_remaining = adapter->num_tx_queues;
+	int rxr_idx = 0, txr_idx = 0, v_idx = 0;
+	int err;
 
-	if ((adapter->num_q_vectors < adapter->num_rx_queues) ||
-	    (adapter->num_q_vectors < adapter->num_tx_queues))
-		return -ENOMEM;
+	if (q_vectors >= (rxr_remaining + txr_remaining)) {
+		for (; rxr_remaining; v_idx++) {
+			err = igb_alloc_q_vector(adapter, q_vectors, v_idx,
+						 0, 0, 1, rxr_idx);
 
-	if (adapter->num_q_vectors >=
-	    (adapter->num_rx_queues + adapter->num_tx_queues)) {
-		for (i = 0; i < adapter->num_rx_queues; i++)
-			igb_map_rx_ring_to_vector(adapter, i, v_idx++);
-		for (i = 0; i < adapter->num_tx_queues; i++)
-			igb_map_tx_ring_to_vector(adapter, i, v_idx++);
-	} else {
-		for (i = 0; i < adapter->num_rx_queues; i++) {
-			if (i < adapter->num_tx_queues)
-				igb_map_tx_ring_to_vector(adapter, i, v_idx);
-			igb_map_rx_ring_to_vector(adapter, i, v_idx++);
+			if (err)
+				goto err_out;
+
+			/* update counts and index */
+			rxr_remaining--;
+			rxr_idx++;
 		}
-		for (; i < adapter->num_tx_queues; i++)
-			igb_map_tx_ring_to_vector(adapter, i, v_idx++);
 	}
+
+	for (; v_idx < q_vectors; v_idx++) {
+		int rqpv = DIV_ROUND_UP(rxr_remaining, q_vectors - v_idx);
+		int tqpv = DIV_ROUND_UP(txr_remaining, q_vectors - v_idx);
+		err = igb_alloc_q_vector(adapter, q_vectors, v_idx,
+					 tqpv, txr_idx, rqpv, rxr_idx);
+
+		if (err)
+			goto err_out;
+
+		/* update counts and index */
+		rxr_remaining -= rqpv;
+		txr_remaining -= tqpv;
+		rxr_idx++;
+		txr_idx++;
+	}
+
 	return 0;
+
+err_out:
+	adapter->num_tx_queues = 0;
+	adapter->num_rx_queues = 0;
+	adapter->num_q_vectors = 0;
+
+	while (v_idx--)
+		igb_free_q_vector(adapter, v_idx);
+
+	return -ENOMEM;
 }
 
 /**
@@ -1185,24 +1222,10 @@ static int igb_init_interrupt_scheme(struct igb_adapter *adapter)
 		goto err_alloc_q_vectors;
 	}
 
-	err = igb_alloc_queues(adapter);
-	if (err) {
-		dev_err(&pdev->dev, "Unable to allocate memory for queues\n");
-		goto err_alloc_queues;
-	}
-
-	err = igb_map_ring_to_vector(adapter);
-	if (err) {
-		dev_err(&pdev->dev, "Invalid q_vector to ring mapping\n");
-		goto err_map_queues;
-	}
-
+	igb_cache_ring_register(adapter);
 
 	return 0;
-err_map_queues:
-	igb_free_queues(adapter);
-err_alloc_queues:
-	igb_free_q_vectors(adapter);
+
 err_alloc_q_vectors:
 	igb_reset_interrupt_capability(adapter);
 	return err;
@@ -1225,11 +1248,11 @@ static int igb_request_irq(struct igb_adapter *adapter)
 		if (!err)
 			goto request_done;
 		/* fall back to MSI */
+		igb_free_all_tx_resources(adapter);
+		igb_free_all_rx_resources(adapter);
 		igb_clear_interrupt_scheme(adapter);
 		if (!pci_enable_msi(pdev))
 			adapter->flags |= IGB_FLAG_HAS_MSI;
-		igb_free_all_tx_resources(adapter);
-		igb_free_all_rx_resources(adapter);
 		adapter->num_tx_queues = 1;
 		adapter->num_rx_queues = 1;
 		adapter->num_q_vectors = 1;
@@ -1239,13 +1262,6 @@ static int igb_request_irq(struct igb_adapter *adapter)
 			        "Unable to allocate memory for vectors\n");
 			goto request_done;
 		}
-		err = igb_alloc_queues(adapter);
-		if (err) {
-			dev_err(&pdev->dev,
-			        "Unable to allocate memory for queues\n");
-			igb_free_q_vectors(adapter);
-			goto request_done;
-		}
 		igb_setup_all_tx_resources(adapter);
 		igb_setup_all_rx_resources(adapter);
 	}
@@ -2633,10 +2649,8 @@ int igb_setup_tx_resources(struct igb_ring *tx_ring)
 	tx_ring->size = tx_ring->count * sizeof(union e1000_adv_tx_desc);
 	tx_ring->size = ALIGN(tx_ring->size, 4096);
 
-	tx_ring->desc = dma_alloc_coherent(dev,
-					   tx_ring->size,
-					   &tx_ring->dma,
-					   GFP_KERNEL);
+	tx_ring->desc = dma_alloc_coherent(dev, tx_ring->size,
+					   &tx_ring->dma, GFP_KERNEL);
 	if (!tx_ring->desc)
 		goto err;
 
@@ -2773,15 +2787,12 @@ int igb_setup_rx_resources(struct igb_ring *rx_ring)
 	if (!rx_ring->rx_buffer_info)
 		goto err;
 
-
 	/* Round up to nearest 4K */
 	rx_ring->size = rx_ring->count * sizeof(union e1000_adv_rx_desc);
 	rx_ring->size = ALIGN(rx_ring->size, 4096);
 
-	rx_ring->desc = dma_alloc_coherent(dev,
-					   rx_ring->size,
-					   &rx_ring->dma,
-					   GFP_KERNEL);
+	rx_ring->desc = dma_alloc_coherent(dev, rx_ring->size,
+					   &rx_ring->dma, GFP_KERNEL);
 	if (!rx_ring->desc)
 		goto err;
 
-- 
cgit v1.2.3


From 0c2cc02e571aee1f2193a004508d4d604eff6a8f Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:31:22 +0000
Subject: igb: Move the calls to set the Tx and Rx queues into igb_open

This change helps to address locking issues seen with
netif_set_real_num_tx_queues and netif_set_real_num_rx_queues when used in
the igb_set_interrupt_capability function.  To resolve these locking issues
I have moved the two function calls into __igb_open so that they can be
called while the RTNL lock is held.

An added advantage to this is that the number of queues is not updated
until the last possible moment so if there are any issues in allocating
MSI-X interrupts or resources for the rings we have time to change the
values prior to updating the netdev.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/igb_main.c | 31 ++++++++++++++++++-------------
 1 file changed, 18 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index 4a25b8fa908..e7b10272348 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -948,7 +948,7 @@ static void igb_clear_interrupt_scheme(struct igb_adapter *adapter)
  * Attempt to configure interrupts using the best available
  * capabilities of the hardware and kernel.
  **/
-static int igb_set_interrupt_capability(struct igb_adapter *adapter)
+static void igb_set_interrupt_capability(struct igb_adapter *adapter)
 {
 	int err;
 	int numvecs, i;
@@ -985,7 +985,7 @@ static int igb_set_interrupt_capability(struct igb_adapter *adapter)
 			      adapter->msix_entries,
 			      numvecs);
 	if (err == 0)
-		goto out;
+		return;
 
 	igb_reset_interrupt_capability(adapter);
 
@@ -1015,14 +1015,6 @@ msi_only:
 	adapter->num_q_vectors = 1;
 	if (!pci_enable_msi(adapter->pdev))
 		adapter->flags |= IGB_FLAG_HAS_MSI;
-out:
-	/* Notify the stack of the (possibly) reduced queue counts. */
-	rtnl_lock();
-	netif_set_real_num_tx_queues(adapter->netdev, adapter->num_tx_queues);
-	err = netif_set_real_num_rx_queues(adapter->netdev,
-		adapter->num_rx_queues);
-	rtnl_unlock();
-	return err;
 }
 
 static void igb_add_ring(struct igb_ring *ring,
@@ -1212,9 +1204,7 @@ static int igb_init_interrupt_scheme(struct igb_adapter *adapter)
 	struct pci_dev *pdev = adapter->pdev;
 	int err;
 
-	err = igb_set_interrupt_capability(adapter);
-	if (err)
-		return err;
+	igb_set_interrupt_capability(adapter);
 
 	err = igb_alloc_q_vectors(adapter);
 	if (err) {
@@ -2543,6 +2533,17 @@ static int __igb_open(struct net_device *netdev, bool resuming)
 	if (err)
 		goto err_req_irq;
 
+	/* Notify the stack of the actual queue counts. */
+	err = netif_set_real_num_tx_queues(adapter->netdev,
+					   adapter->num_tx_queues);
+	if (err)
+		goto err_set_queues;
+
+	err = netif_set_real_num_rx_queues(adapter->netdev,
+					   adapter->num_rx_queues);
+	if (err)
+		goto err_set_queues;
+
 	/* From here on the code is the same as igb_up() */
 	clear_bit(__IGB_DOWN, &adapter->state);
 
@@ -2572,6 +2573,8 @@ static int __igb_open(struct net_device *netdev, bool resuming)
 
 	return 0;
 
+err_set_queues:
+	igb_free_irq(adapter);
 err_req_irq:
 	igb_release_hw_control(adapter);
 	igb_power_down_link(adapter);
@@ -6841,7 +6844,9 @@ static int igb_resume(struct device *dev)
 	wr32(E1000_WUS, ~0);
 
 	if (netdev->flags & IFF_UP) {
+		rtnl_lock();
 		err = __igb_open(netdev, true);
+		rtnl_unlock();
 		if (err)
 			return err;
 	}
-- 
cgit v1.2.3


From 6a05004a8a65f187fa5493821d3115397209ae76 Mon Sep 17 00:00:00 2001
From: Alexander Duyck <alexander.h.duyck@intel.com>
Date: Tue, 25 Sep 2012 00:31:27 +0000
Subject: igb: Split igb_update_dca into separate Tx and Rx functions

This change makes it so that igb_update_dca is broken into two halves, one
for Rx and one for Tx.  The advantage to this is primarily readability.

In addition I am enabling relaxed ordering for reads from hardware since
this is supported on all of the igb parts.

Signed-off-by: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Aaron Brown <aaron.f.brown@intel.com>
Signed-off-by: Jeff Kirsher <jeffrey.t.kirsher@intel.com>
---
 drivers/net/ethernet/intel/igb/e1000_82575.h |  3 ++
 drivers/net/ethernet/intel/igb/igb_main.c    | 80 +++++++++++++++++-----------
 2 files changed, 52 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/intel/igb/e1000_82575.h b/drivers/net/ethernet/intel/igb/e1000_82575.h
index e85c453f542..44b76b3b681 100644
--- a/drivers/net/ethernet/intel/igb/e1000_82575.h
+++ b/drivers/net/ethernet/intel/igb/e1000_82575.h
@@ -172,10 +172,13 @@ struct e1000_adv_tx_context_desc {
 #define E1000_DCA_RXCTRL_DESC_DCA_EN (1 << 5) /* DCA Rx Desc enable */
 #define E1000_DCA_RXCTRL_HEAD_DCA_EN (1 << 6) /* DCA Rx Desc header enable */
 #define E1000_DCA_RXCTRL_DATA_DCA_EN (1 << 7) /* DCA Rx Desc payload enable */
+#define E1000_DCA_RXCTRL_DESC_RRO_EN (1 << 9) /* DCA Rx rd Desc Relax Order */
 
 #define E1000_DCA_TXCTRL_CPUID_MASK 0x0000001F /* Tx CPUID Mask */
 #define E1000_DCA_TXCTRL_DESC_DCA_EN (1 << 5) /* DCA Tx Desc enable */
+#define E1000_DCA_TXCTRL_DESC_RRO_EN (1 << 9) /* Tx rd Desc Relax Order */
 #define E1000_DCA_TXCTRL_TX_WB_RO_EN (1 << 11) /* Tx Desc writeback RO bit */
+#define E1000_DCA_TXCTRL_DATA_RRO_EN (1 << 13) /* Tx rd data Relax Order */
 
 /* Additional DCA related definitions, note change in position of CPUID */
 #define E1000_DCA_TXCTRL_CPUID_MASK_82576 0xFF000000 /* Tx CPUID Mask */
diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c
index e7b10272348..87abb573585 100644
--- a/drivers/net/ethernet/intel/igb/igb_main.c
+++ b/drivers/net/ethernet/intel/igb/igb_main.c
@@ -4851,45 +4851,63 @@ static irqreturn_t igb_msix_ring(int irq, void *data)
 }
 
 #ifdef CONFIG_IGB_DCA
+static void igb_update_tx_dca(struct igb_adapter *adapter,
+			      struct igb_ring *tx_ring,
+			      int cpu)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	u32 txctrl = dca3_get_tag(tx_ring->dev, cpu);
+
+	if (hw->mac.type != e1000_82575)
+		txctrl <<= E1000_DCA_TXCTRL_CPUID_SHIFT;
+
+	/*
+	 * We can enable relaxed ordering for reads, but not writes when
+	 * DCA is enabled.  This is due to a known issue in some chipsets
+	 * which will cause the DCA tag to be cleared.
+	 */
+	txctrl |= E1000_DCA_TXCTRL_DESC_RRO_EN |
+		  E1000_DCA_TXCTRL_DATA_RRO_EN |
+		  E1000_DCA_TXCTRL_DESC_DCA_EN;
+
+	wr32(E1000_DCA_TXCTRL(tx_ring->reg_idx), txctrl);
+}
+
+static void igb_update_rx_dca(struct igb_adapter *adapter,
+			      struct igb_ring *rx_ring,
+			      int cpu)
+{
+	struct e1000_hw *hw = &adapter->hw;
+	u32 rxctrl = dca3_get_tag(&adapter->pdev->dev, cpu);
+
+	if (hw->mac.type != e1000_82575)
+		rxctrl <<= E1000_DCA_RXCTRL_CPUID_SHIFT;
+
+	/*
+	 * We can enable relaxed ordering for reads, but not writes when
+	 * DCA is enabled.  This is due to a known issue in some chipsets
+	 * which will cause the DCA tag to be cleared.
+	 */
+	rxctrl |= E1000_DCA_RXCTRL_DESC_RRO_EN |
+		  E1000_DCA_RXCTRL_DESC_DCA_EN;
+
+	wr32(E1000_DCA_RXCTRL(rx_ring->reg_idx), rxctrl);
+}
+
 static void igb_update_dca(struct igb_q_vector *q_vector)
 {
 	struct igb_adapter *adapter = q_vector->adapter;
-	struct e1000_hw *hw = &adapter->hw;
 	int cpu = get_cpu();
 
 	if (q_vector->cpu == cpu)
 		goto out_no_update;
 
-	if (q_vector->tx.ring) {
-		int q = q_vector->tx.ring->reg_idx;
-		u32 dca_txctrl = rd32(E1000_DCA_TXCTRL(q));
-		if (hw->mac.type == e1000_82575) {
-			dca_txctrl &= ~E1000_DCA_TXCTRL_CPUID_MASK;
-			dca_txctrl |= dca3_get_tag(&adapter->pdev->dev, cpu);
-		} else {
-			dca_txctrl &= ~E1000_DCA_TXCTRL_CPUID_MASK_82576;
-			dca_txctrl |= dca3_get_tag(&adapter->pdev->dev, cpu) <<
-			              E1000_DCA_TXCTRL_CPUID_SHIFT;
-		}
-		dca_txctrl |= E1000_DCA_TXCTRL_DESC_DCA_EN;
-		wr32(E1000_DCA_TXCTRL(q), dca_txctrl);
-	}
-	if (q_vector->rx.ring) {
-		int q = q_vector->rx.ring->reg_idx;
-		u32 dca_rxctrl = rd32(E1000_DCA_RXCTRL(q));
-		if (hw->mac.type == e1000_82575) {
-			dca_rxctrl &= ~E1000_DCA_RXCTRL_CPUID_MASK;
-			dca_rxctrl |= dca3_get_tag(&adapter->pdev->dev, cpu);
-		} else {
-			dca_rxctrl &= ~E1000_DCA_RXCTRL_CPUID_MASK_82576;
-			dca_rxctrl |= dca3_get_tag(&adapter->pdev->dev, cpu) <<
-			              E1000_DCA_RXCTRL_CPUID_SHIFT;
-		}
-		dca_rxctrl |= E1000_DCA_RXCTRL_DESC_DCA_EN;
-		dca_rxctrl |= E1000_DCA_RXCTRL_HEAD_DCA_EN;
-		dca_rxctrl |= E1000_DCA_RXCTRL_DATA_DCA_EN;
-		wr32(E1000_DCA_RXCTRL(q), dca_rxctrl);
-	}
+	if (q_vector->tx.ring)
+		igb_update_tx_dca(adapter, q_vector->tx.ring, cpu);
+
+	if (q_vector->rx.ring)
+		igb_update_rx_dca(adapter, q_vector->rx.ring, cpu);
+
 	q_vector->cpu = cpu;
 out_no_update:
 	put_cpu();
-- 
cgit v1.2.3