@@ -1202,6 +1202,14 @@ static int ibmvnic_open(struct net_device *netdev)
/* If device failover is pending, just set device state and return.
* Device operation will be handled by reset routine.
+ *
+ * Note that ->failover_pending is not protected by ->state_lock
+ * because the tasklet (executing ibmvnic_handle_crq()) cannot
+ * block. Even otherwise this can deadlock due to CRQs issued in
+ * ibmvnic_open().
+ *
+ * We check failover_pending again at the end in case of errors.
+ * so its okay if we miss the change to true here.
*/
if (adapter->failover_pending) {
adapter->state = VNIC_OPEN;
@@ -1380,6 +1388,9 @@ static int ibmvnic_close(struct net_device *netdev)
/* If device failover is pending, just set device state and return.
* Device operation will be handled by reset routine.
+ *
+ * Note that ->failover_pending is not protected by ->state_lock
+ * See comments in ibmvnic_open().
*/
if (adapter->failover_pending) {
adapter->state = VNIC_CLOSED;
@@ -1930,6 +1941,14 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
+ /*
+ * TODO: Can this race with a reset? The reset could briefly
+ * set state to PROBED causing us to skip setting the
+ * mac address. When reset complets, we set the old mac
+ * address? Can we check ->resetting bit instead and
+ * save the new mac address in adapter->mac_addr
+ * so reset function can set it when it is done?
+ */
if (adapter->state != VNIC_PROBED) {
ether_addr_copy(adapter->mac_addr, addr->sa_data);
rc = __ibmvnic_set_mac(netdev, addr->sa_data);
@@ -1941,6 +1960,14 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p)
/**
* do_change_param_reset returns zero if we are able to keep processing reset
* events, or non-zero if we hit a fatal error and must halt.
+ *
+ * Notes:
+ * - Regardless of success/failure, this function restores adapter state
+ * to what as it was on entry. In case of failure, it is assumed that
+ * a new hard-reset will be attempted.
+ * - Caller must hold the rtnl lock before calling and release upon
+ * return.
+ *
*/
static int do_change_param_reset(struct ibmvnic_adapter *adapter,
enum ibmvnic_reset_reason reason)
@@ -2039,6 +2066,11 @@ static int do_change_param_reset(struct ibmvnic_adapter *adapter,
/**
* do_reset returns zero if we are able to keep processing reset events, or
* non-zero if we hit a fatal error and must halt.
+ *
+ * Notes:
+ * - Regardless of success/failure, this function restores adapter state
+ * to what as it was on entry. In case of failure, it is assumed that
+ * a new hard-reset will be attempted.
*/
static int do_reset(struct ibmvnic_adapter *adapter,
enum ibmvnic_reset_reason reason)
@@ -2237,6 +2269,17 @@ static int do_reset(struct ibmvnic_adapter *adapter,
return rc;
}
+/**
+ * Perform a hard reset possibly because a prior reset encountered
+ * an error.
+ *
+ * Notes:
+ * - Regardless of success/failure, this function restores adapter state
+ * to what as it was on entry. In case of failure, it is assumed that
+ * a new hard-reset will be attempted.
+ * - Caller must hold the rtnl lock before calling and release upon
+ * return.
+ */
static int do_hard_reset(struct ibmvnic_adapter *adapter,
enum ibmvnic_reset_reason reason)
{
@@ -2651,6 +2694,11 @@ static int ibmvnic_poll(struct napi_struct *napi, int budget)
frames_processed++;
}
+ /* TODO: Can this race with reset and/or is release_rx_pools()?
+ * Is that why we check for VNIC_CLOSING? What if we go to
+ * CLOSING just after we check? We cannot take ->state_lock
+ * since we are in interrupt context.
+ */
if (adapter->state != VNIC_CLOSING &&
((atomic_read(&adapter->rx_pool[scrq_num].available) <
adapter->req_rx_add_entries_per_subcrq / 2) ||
@@ -5358,6 +5406,9 @@ static int ibmvnic_reset_init(struct ibmvnic_adapter *adapter, bool reset)
}
if (adapter->from_passive_init) {
+ /* ibmvnic_reset_init() is always called with ->state_lock
+ * held except from ibmvnic_probe(), so safe to update state.
+ */
adapter->state = VNIC_OPEN;
adapter->from_passive_init = false;
return -1;
@@ -5531,6 +5582,9 @@ static int ibmvnic_remove(struct vio_dev *dev)
adapter->state = VNIC_REMOVING;
spin_unlock_irqrestore(&adapter->remove_lock, rmflags);
+ /* drop state_lock so __ibmvnic_reset() can make progress
+ * during flush_work()
+ */
mutex_unlock(&adapter->state_lock);
flush_work(&adapter->ibmvnic_reset);
@@ -5546,6 +5600,9 @@ static int ibmvnic_remove(struct vio_dev *dev)
release_stats_token(adapter);
release_stats_buffers(adapter);
+ /* Adapter going away. There better be no one checking ->state
+ * or getting state_lock now TODO: Do we need the REMOVED state?
+ */
adapter->state = VNIC_REMOVED;
mutex_destroy(&adapter->state_lock);
rtnl_unlock();
@@ -5627,6 +5684,7 @@ static int ibmvnic_resume(struct device *dev)
struct net_device *netdev = dev_get_drvdata(dev);
struct ibmvnic_adapter *adapter = netdev_priv(netdev);
+ /* resuming from power-down so ignoring state_lock */
if (adapter->state != VNIC_OPEN)
return 0;
@@ -963,6 +963,55 @@ struct ibmvnic_tunables {
u64 mtu;
};
+/**
+ * ->state_lock:
+ * Mutex to serialize read/write of adapter->state field specially
+ * between open and reset functions. If rtnl also needs to be held,
+ * acquire rtnl first and then state_lock (to be consistent with
+ * functions that enter ibmvnic with rtnl already held).
+ *
+ * In general, ->state_lock must be held for all read/writes to the
+ * state field. Exceptions are:
+ * - checks for VNIC_PROBING state - since the adapter is itself
+ * under construction and because we never go _back_ to PROBING.
+ *
+ * - in debug messages involving ->state
+ *
+ * - in ibmvnic_tx_interrupt(), ibmvnic_rx_interrupt() because
+ * a) this is a mutex and b) no (known) impact of getting a stale
+ * state (i.e we will likely recover on the next interrupt).
+ *
+ * - ibmvnic_resume() - we are resuming from a power-down state?
+ *
+ * - ibmvnic_reset() - see ->remove_lock below.
+ *
+ * Besides these, there are couple of TODOs in ibmvnic_poll() and
+ * ibmvnic_set_mac() that need to be investigated separately.
+ *
+ * ->remove_lock
+ * A spin lock used to serialize ibmvnic_reset() and ibmvnic_remove().
+ * ibmvnic_reset() can be called from a tasklet which cannot block,
+ * so it cannot use the ->state_lock. The only states ibmvnic_reset()
+ * checks for are PROBING, REMOVING and REMOVED. PROBING can be ignored
+ * as mentioned above. On entering REMOVING state, ibmvnic_reset()
+ * will skip scheduling resets for the adapter.
+ *
+ * ->pending_resets[], ->next_reset:
+ * A "queue" of pending resets, implemented as a simple array. Resets
+ * are not frequent and even when they do occur, we will usually have
+ * just 1 or 2 entries in the queue at any time. Note that we don't
+ * need/allow duplicates in the queue. In the worst case, we would have
+ * VNIC_RESET_MAX-1 entries (but that means adapter is processing all
+ * valid types of resets at once!) so the slight overhead of the array
+ * is probably acceptable.
+ *
+ * We could still use a linked list but then have to deal with allocation
+ * failure when scheduling a reset. We sometimes enqueue reset from a
+ * tasklet so cannot block when we have allocation failure. Trying to
+ * close the adapter on failure requires us to hold the state_lock, which
+ * then cannot be a mutex (tasklet cannot block) - i.e complicates locking
+ * just for the occasional memory allocation failure.
+ */
struct ibmvnic_adapter {
struct vio_dev *vdev;
struct net_device *netdev;
@@ -1098,6 +1147,6 @@ struct ibmvnic_adapter {
struct ibmvnic_tunables desired;
struct ibmvnic_tunables fallback;
- /* Used for serialization of state field */
+ /* see block comments above */
struct mutex state_lock;
};
Add some comments, notes and TODOs about ->state_lock and RTNL. Signed-off-by: Sukadev Bhattiprolu <sukadev@linux.ibm.com> --- Note: This is fixing lot of comments so not identifying fixes. It "seems" to fit this patch set but can send to net-next if necessary. drivers/net/ethernet/ibm/ibmvnic.c | 58 ++++++++++++++++++++++++++++++ drivers/net/ethernet/ibm/ibmvnic.h | 51 +++++++++++++++++++++++++- 2 files changed, 108 insertions(+), 1 deletion(-)