diff mbox

[V4:,3/3] pci: Provide Multiple Error Received support on AER

Message ID 1244776118.2560.321.camel@ymzhang (mailing list archive)
State Superseded, archived
Headers show

Commit Message

Yanmin Zhang June 12, 2009, 3:08 a.m. UTC
When a root port receive the same errors more than once before kernel
 process them, the Multiple Error Messages Received flags are set by
hardware. Because root port could only save one kind of correctable
error source id and another uncorrectable error source id at the same
time, so the second message sender id is lost if the 2 messages are
sent from 2 different devices. Below patch searches all devices under
the root port when multiple messages are received.

Signed-off-by: Zhang Yanmin <yanmin.zhang@linux.intel.com>

---



--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Comments

Andrew Patterson June 12, 2009, 10:16 p.m. UTC | #1
On Fri, 2009-06-12 at 11:08 +0800, Zhang, Yanmin wrote:
> When a root port receive the same errors more than once before kernel
>  process them, the Multiple Error Messages Received flags are set by
> hardware. Because root port could only save one kind of correctable
> error source id and another uncorrectable error source id at the same
> time, so the second message sender id is lost if the 2 messages are
> sent from 2 different devices. Below patch searches all devices under
> the root port when multiple messages are received.
> 
> Signed-off-by: Zhang Yanmin <yanmin.zhang@linux.intel.com>
> 
> ---
> 
> diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c
> --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:39:24.000000000 +0800
> +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:45:15.000000000 +0800
> @@ -145,13 +145,22 @@ static void set_downstream_devices_error
>  	pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable);
>  }
>  
> +static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> +{
> +	if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> +		e_info->dev[e_info->error_dev_num ++] = dev;

checkpatch reports:
ERROR: space prohibited before that '++' (ctx:WxB)
#46: FILE: drivers/pci/pcie/aer/aerdrv_core.c:151:
+		e_info->dev[e_info->error_dev_num ++] = dev;

Personally I would prefer:
		e_info->dev[e_info->error_dev_num] = dev;
		e_info->error_dev_num++;

> +		return 1;
> +	} else
> +		return 0;
> +}
> +


This function is now doing more than just comparing device ID's.
Perhaps you could rename it or put call add_error_device after
compare_device_id in find_device_iter?
>  static int compare_device_id(struct pci_dev *dev, struct aer_err_info *e_info)
>  {
>  	if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) {
>  		/*
>  		 * Device ID match
>  		 */
> -		e_info->dev = dev;
> +		add_error_device(e_info, dev);
>  		return 1;
>  	}
>  
> @@ -166,20 +175,38 @@ static int find_device_iter(struct pci_d
>  	u32 status;
>  	u32 mask;
>  	u16 reg16;
> +	int result;
>  	struct aer_err_info *e_info = (struct aer_err_info *)data;
>  
>  	/*
>  	 * When bus id is equal to 0, it might be a bad id
>  	 * reported by root port.
>  	 */
> -	if (!nosourceid && (PCI_BUS(e_info->id) != 0))
> -		return compare_device_id(dev, e_info);
> +	if (!nosourceid && (PCI_BUS(e_info->id) != 0)) {
> +		result = compare_device_id(dev, e_info);
> +		/*
> +		 * If there is no multiple error, we stop
> +		 * or continue based on the id comparing.
> +		 */
> +		if (!(e_info->flags & AER_MULTI_ERROR_VALID_FLAG))
> +			return result;
> +
> +		/*
> +		 * If there are multiple errors and id does match,
> +		 * We need continue to search other devices under
> +		 * the root port. Return 0 means that.
> +		 */
> +		if (result)
> +			return 0;
> +	}
>  
>  	/*
> -	 * Next is to check when bus id is equal to 0 or
> -	 * nosourceid==y. Some ports might lose the bus
> -	 * id of error source id. We check AER status
> -	 * registers to find the initial reporter.
> +	 * When either
> +	 * 	1) nosourceid==y;
> +	 * 	2) bus id is equal to 0. Some ports might lose the bus
> +	 * 		id of error source id;
> +	 * 	3) There are multiple errors and prior id comparing fails;
> +	 * We check AER status registers to find the initial reporter.
>  	 */
>  	if (atomic_read(&dev->enable_cnt) == 0)
>  		return 0;
> @@ -208,8 +235,8 @@ static int find_device_iter(struct pci_d
>  				pos + PCI_ERR_COR_MASK,
>  				&mask);
>  		if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) {
> -			e_info->dev = dev;
> -			return 1;
> +			add_error_device(e_info, dev);
> +			goto added;
>  		}
>  	} else {
>  		pci_read_config_dword(dev,
> @@ -219,12 +246,18 @@ static int find_device_iter(struct pci_d
>  				pos + PCI_ERR_UNCOR_MASK,
>  				&mask);
>  		if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) {
> -			e_info->dev = dev;
> -			return 1;
> +			add_error_device(e_info, dev);
> +			goto added;
>  		}
>  	}
>  
>  	return 0;
> +
> +added:
> +	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
> +		return 0;
> +	} else
> +		return 1;

checkpatch reports:
WARNING: braces {} are not necessary for any arm of this statement
#133: FILE: drivers/pci/pcie/aer/aerdrv_core.c:257:
+	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
[...]
+	} else
[...]


>  }
>  
>  /**
> @@ -705,6 +738,30 @@ static int get_device_error_info(struct 
>  	return AER_SUCCESS;
>  }
>  
> +static inline void aer_process_err_devices(struct pcie_device *p_device,
> +			struct aer_err_info *e_info)
> +{
> +	int i;
> +
> +	if (e_info->dev[0] == NULL) {
Minor not. Can we use 
          if (!e_info->dev[0]) {

> +		printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> +				__func__, e_info->id);

I suspect we don't want to embed the function name here, and use
dev_printk.

> +	}
> +
> +	for (i = 0; i < e_info->error_dev_num; i ++) {
checkpatch reports:
ERROR: space prohibited before that '++' (ctx:WxB)
#154: FILE: drivers/pci/pcie/aer/aerdrv_core.c:751:
+	for (i = 0; i < e_info->error_dev_num; i +


> +		if (e_info->dev[i] == NULL)
again		  if (!e_info->dev[i])

You could also put this check in the for loop.


> +			break;
> +
> +		if (get_device_error_info(e_info->dev[i], e_info) ==
> +				AER_SUCCESS) {
> +			aer_print_error(e_info->dev[i], e_info);
> +			handle_error_source(p_device,
> +					e_info->dev[i],
> +					e_info);
> +		}
> +	}
> +}
> +
>  /**
>   * aer_isr_one_error - consume an error detected by root port
>   * @p_device: pointer to error root port service device
> @@ -747,18 +804,7 @@ static void aer_isr_one_error(struct pci
>  			e_info->flags |= AER_MULTI_ERROR_VALID_FLAG;
>  
>  		find_source_device(p_device->port, e_info);
> -		if (e_info->dev == NULL) {
> -			printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> -				__func__, e_info->id);
> -			continue;
> -		}
> -		if (get_device_error_info(e_info->dev, e_info) ==
> -				AER_SUCCESS) {
> -			aer_print_error(e_info->dev, e_info);
> -			handle_error_source(p_device,
> -				e_info->dev,
> -				e_info);
> -		}
> +		aer_process_err_devices(p_device, e_info);
>  	}
>  
>  	kfree(e_info);
> diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h
> --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:39:24.000000000 +0800
> +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:45:15.000000000 +0800
> @@ -57,8 +57,10 @@ struct header_log_regs {
>  	unsigned int dw3;
>  };
>  
> +#define AER_MAX_MULTI_ERR_DEVICES	5
Is this number arbitrary or in the spec somewhere?


>  struct aer_err_info {
> -	struct pci_dev *dev;
> +	struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
> +	int error_dev_num;
>  	u16 id;
>  	int severity;			/* 0:NONFATAL | 1:FATAL | 2:COR */
>  	int flags;
> 
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-pci" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>
Yanmin Zhang June 15, 2009, 1:47 a.m. UTC | #2
On Fri, 2009-06-12 at 22:16 +0000, Andrew Patterson wrote:
> On Fri, 2009-06-12 at 11:08 +0800, Zhang, Yanmin wrote:
> > When a root port receive the same errors more than once before kernel
> >  process them, the Multiple Error Messages Received flags are set by
> > hardware. Because root port could only save one kind of correctable
> > error source id and another uncorrectable error source id at the same
> > time, so the second message sender id is lost if the 2 messages are
> > sent from 2 different devices. Below patch searches all devices under
> > the root port when multiple messages are received.
> > 
> > Signed-off-by: Zhang Yanmin <yanmin.zhang@linux.intel.com>
> > 
> > ---
> > 
> > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c
> > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:39:24.000000000 +0800
> > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:45:15.000000000 +0800
> > @@ -145,13 +145,22 @@ static void set_downstream_devices_error
> >  	pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable);
> >  }
> >  
> > +static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> > +{
> > +	if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> > +		e_info->dev[e_info->error_dev_num ++] = dev;
> 
> checkpatch reports:
> ERROR: space prohibited before that '++' (ctx:WxB)
> #46: FILE: drivers/pci/pcie/aer/aerdrv_core.c:151:
> +		e_info->dev[e_info->error_dev_num ++] = dev;
Thanks. I will change it and use checkpatch to check it.

> 
> Personally I would prefer:
> 		e_info->dev[e_info->error_dev_num] = dev;
> 		e_info->error_dev_num++;
> 
> > +		return 1;
> > +	} else
> > +		return 0;
> > +}
> > +
> 
> 
> This function is now doing more than just comparing device ID's.
> Perhaps you could rename it or put call add_error_device after
> compare_device_id in find_device_iter?
I will move the call add_error_device to find_device_iter.

> >  static int compare_device_id(struct pci_dev *dev, struct aer_err_info *e_info)
> >  {
> >  	if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) {
> >  		/*
> >  		 * Device ID match
> >  		 */
> > -		e_info->dev = dev;
> > +		add_error_device(e_info, dev);
> >  		return 1;
> >  	}
> >  
> > @@ -166,20 +175,38 @@ static int find_device_iter(struct pci_d
> >  	u32 status;
> >  	u32 mask;
> >  	u16 reg16;
> > +	int result;
> >  	struct aer_err_info *e_info = (struct aer_err_info *)data;
> >  
> >  	/*
> >  	 * When bus id is equal to 0, it might be a bad id
> >  	 * reported by root port.
> >  	 */
> > -	if (!nosourceid && (PCI_BUS(e_info->id) != 0))
> > -		return compare_device_id(dev, e_info);
> > +	if (!nosourceid && (PCI_BUS(e_info->id) != 0)) {
> > +		result = compare_device_id(dev, e_info);
> > +		/*
> > +		 * If there is no multiple error, we stop
> > +		 * or continue based on the id comparing.
> > +		 */
> > +		if (!(e_info->flags & AER_MULTI_ERROR_VALID_FLAG))
> > +			return result;
> > +
> > +		/*
> > +		 * If there are multiple errors and id does match,
> > +		 * We need continue to search other devices under
> > +		 * the root port. Return 0 means that.
> > +		 */
> > +		if (result)
> > +			return 0;
> > +	}
> >  
> >  	/*
> > -	 * Next is to check when bus id is equal to 0 or
> > -	 * nosourceid==y. Some ports might lose the bus
> > -	 * id of error source id. We check AER status
> > -	 * registers to find the initial reporter.
> > +	 * When either
> > +	 * 	1) nosourceid==y;
> > +	 * 	2) bus id is equal to 0. Some ports might lose the bus
> > +	 * 		id of error source id;
> > +	 * 	3) There are multiple errors and prior id comparing fails;
> > +	 * We check AER status registers to find the initial reporter.
> >  	 */
> >  	if (atomic_read(&dev->enable_cnt) == 0)
> >  		return 0;
> > @@ -208,8 +235,8 @@ static int find_device_iter(struct pci_d
> >  				pos + PCI_ERR_COR_MASK,
> >  				&mask);
> >  		if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) {
> > -			e_info->dev = dev;
> > -			return 1;
> > +			add_error_device(e_info, dev);
> > +			goto added;
> >  		}
> >  	} else {
> >  		pci_read_config_dword(dev,
> > @@ -219,12 +246,18 @@ static int find_device_iter(struct pci_d
> >  				pos + PCI_ERR_UNCOR_MASK,
> >  				&mask);
> >  		if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) {
> > -			e_info->dev = dev;
> > -			return 1;
> > +			add_error_device(e_info, dev);
> > +			goto added;
> >  		}
> >  	}
> >  
> >  	return 0;
> > +
> > +added:
> > +	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
> > +		return 0;
> > +	} else
> > +		return 1;
> 
> checkpatch reports:
> WARNING: braces {} are not necessary for any arm of this statement
> #133: FILE: drivers/pci/pcie/aer/aerdrv_core.c:257:
I will change it.

> +	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
> [...]
> +	} else
> [...]
> 
> 
> >  }
> >  
> >  /**
> > @@ -705,6 +738,30 @@ static int get_device_error_info(struct 
> >  	return AER_SUCCESS;
> >  }
> >  
> > +static inline void aer_process_err_devices(struct pcie_device *p_device,
> > +			struct aer_err_info *e_info)
> > +{
> > +	int i;
> > +
> > +	if (e_info->dev[0] == NULL) {
> Minor not. Can we use 
Yes. We can.

>           if (!e_info->dev[0]) {
> 
> > +		printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> > +				__func__, e_info->id);
> 
> I suspect we don't want to embed the function name here, and use
> dev_printk.
Ok.

> 
> > +	}
> > +
> > +	for (i = 0; i < e_info->error_dev_num; i ++) {
> checkpatch reports:
> ERROR: space prohibited before that '++' (ctx:WxB)
> #154: FILE: drivers/pci/pcie/aer/aerdrv_core.c:751:
> +	for (i = 0; i < e_info->error_dev_num; i +
I will change it.

> 
> 
> > +		if (e_info->dev[i] == NULL)
> again		  if (!e_info->dev[i])
Will do.

> 
> You could also put this check in the for loop.
I planed to, but one guy helped me test it within a guest OS on XEN and
reported a weired oops of guest OS. She said useing e_info->error_dev_num
could avoid the oops.

> 
> 
> > +			break;
> > +
> > +		if (get_device_error_info(e_info->dev[i], e_info) ==
> > +				AER_SUCCESS) {
> > +			aer_print_error(e_info->dev[i], e_info);
> > +			handle_error_source(p_device,
> > +					e_info->dev[i],
> > +					e_info);
> > +		}
> > +	}
> > +}
> > +
> >  /**
> >   * aer_isr_one_error - consume an error detected by root port
> >   * @p_device: pointer to error root port service device
> > @@ -747,18 +804,7 @@ static void aer_isr_one_error(struct pci
> >  			e_info->flags |= AER_MULTI_ERROR_VALID_FLAG;
> >  
> >  		find_source_device(p_device->port, e_info);
> > -		if (e_info->dev == NULL) {
> > -			printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> > -				__func__, e_info->id);
> > -			continue;
> > -		}
> > -		if (get_device_error_info(e_info->dev, e_info) ==
> > -				AER_SUCCESS) {
> > -			aer_print_error(e_info->dev, e_info);
> > -			handle_error_source(p_device,
> > -				e_info->dev,
> > -				e_info);
> > -		}
> > +		aer_process_err_devices(p_device, e_info);
> >  	}
> >  
> >  	kfree(e_info);
> > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h
> > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:39:24.000000000 +0800
> > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:45:15.000000000 +0800
> > @@ -57,8 +57,10 @@ struct header_log_regs {
> >  	unsigned int dw3;
> >  };
> >  
> > +#define AER_MAX_MULTI_ERR_DEVICES	5
> Is this number arbitrary or in the spec somewhere?
It's arbitrary and not spec. The startpoint is it's very rare that there are more
than 5 devices under the same root port reporting errors at the same time. It's hard
to say number 5 is the best. I just don't want the array is big.

> 
> 
> >  struct aer_err_info {
> > -	struct pci_dev *dev;
> > +	struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
> > +	int error_dev_num;
> >  	u16 id;
> >  	int severity;			/* 0:NONFATAL | 1:FATAL | 2:COR */
> >  	int flags;
> > 


--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Andrew Patterson June 15, 2009, 4:01 a.m. UTC | #3
On Mon, 2009-06-15 at 09:47 +0800, Zhang, Yanmin wrote:
> On Fri, 2009-06-12 at 22:16 +0000, Andrew Patterson wrote:
> > On Fri, 2009-06-12 at 11:08 +0800, Zhang, Yanmin wrote:
> > > When a root port receive the same errors more than once before kernel
> > >  process them, the Multiple Error Messages Received flags are set by
> > > hardware. Because root port could only save one kind of correctable
> > > error source id and another uncorrectable error source id at the same
> > > time, so the second message sender id is lost if the 2 messages are
> > > sent from 2 different devices. Below patch searches all devices under
> > > the root port when multiple messages are received.
> > > 
> > > Signed-off-by: Zhang Yanmin <yanmin.zhang@linux.intel.com>
> > > 
> > > ---
> > > 
> > > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c
> > > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:39:24.000000000 +0800
> > > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:45:15.000000000 +0800
> > > @@ -145,13 +145,22 @@ static void set_downstream_devices_error
> > >  	pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable);
> > >  }
> > >  
> > > +static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
> > > +{
> > > +	if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
> > > +		e_info->dev[e_info->error_dev_num ++] = dev;
> > 
> > checkpatch reports:
> > ERROR: space prohibited before that '++' (ctx:WxB)
> > #46: FILE: drivers/pci/pcie/aer/aerdrv_core.c:151:
> > +		e_info->dev[e_info->error_dev_num ++] = dev;
> Thanks. I will change it and use checkpatch to check it.
> 
> > 
> > Personally I would prefer:
> > 		e_info->dev[e_info->error_dev_num] = dev;
> > 		e_info->error_dev_num++;
> > 
> > > +		return 1;
> > > +	} else
> > > +		return 0;
> > > +}
> > > +
> > 
> > 
> > This function is now doing more than just comparing device ID's.
> > Perhaps you could rename it or put call add_error_device after
> > compare_device_id in find_device_iter?
> I will move the call add_error_device to find_device_iter.
> 
> > >  static int compare_device_id(struct pci_dev *dev, struct aer_err_info *e_info)
> > >  {
> > >  	if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) {
> > >  		/*
> > >  		 * Device ID match
> > >  		 */
> > > -		e_info->dev = dev;
> > > +		add_error_device(e_info, dev);
> > >  		return 1;
> > >  	}
> > >  
> > > @@ -166,20 +175,38 @@ static int find_device_iter(struct pci_d
> > >  	u32 status;
> > >  	u32 mask;
> > >  	u16 reg16;
> > > +	int result;
> > >  	struct aer_err_info *e_info = (struct aer_err_info *)data;
> > >  
> > >  	/*
> > >  	 * When bus id is equal to 0, it might be a bad id
> > >  	 * reported by root port.
> > >  	 */
> > > -	if (!nosourceid && (PCI_BUS(e_info->id) != 0))
> > > -		return compare_device_id(dev, e_info);
> > > +	if (!nosourceid && (PCI_BUS(e_info->id) != 0)) {
> > > +		result = compare_device_id(dev, e_info);
> > > +		/*
> > > +		 * If there is no multiple error, we stop
> > > +		 * or continue based on the id comparing.
> > > +		 */
> > > +		if (!(e_info->flags & AER_MULTI_ERROR_VALID_FLAG))
> > > +			return result;
> > > +
> > > +		/*
> > > +		 * If there are multiple errors and id does match,
> > > +		 * We need continue to search other devices under
> > > +		 * the root port. Return 0 means that.
> > > +		 */
> > > +		if (result)
> > > +			return 0;
> > > +	}
> > >  
> > >  	/*
> > > -	 * Next is to check when bus id is equal to 0 or
> > > -	 * nosourceid==y. Some ports might lose the bus
> > > -	 * id of error source id. We check AER status
> > > -	 * registers to find the initial reporter.
> > > +	 * When either
> > > +	 * 	1) nosourceid==y;
> > > +	 * 	2) bus id is equal to 0. Some ports might lose the bus
> > > +	 * 		id of error source id;
> > > +	 * 	3) There are multiple errors and prior id comparing fails;
> > > +	 * We check AER status registers to find the initial reporter.
> > >  	 */
> > >  	if (atomic_read(&dev->enable_cnt) == 0)
> > >  		return 0;
> > > @@ -208,8 +235,8 @@ static int find_device_iter(struct pci_d
> > >  				pos + PCI_ERR_COR_MASK,
> > >  				&mask);
> > >  		if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) {
> > > -			e_info->dev = dev;
> > > -			return 1;
> > > +			add_error_device(e_info, dev);
> > > +			goto added;
> > >  		}
> > >  	} else {
> > >  		pci_read_config_dword(dev,
> > > @@ -219,12 +246,18 @@ static int find_device_iter(struct pci_d
> > >  				pos + PCI_ERR_UNCOR_MASK,
> > >  				&mask);
> > >  		if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) {
> > > -			e_info->dev = dev;
> > > -			return 1;
> > > +			add_error_device(e_info, dev);
> > > +			goto added;
> > >  		}
> > >  	}
> > >  
> > >  	return 0;
> > > +
> > > +added:
> > > +	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
> > > +		return 0;
> > > +	} else
> > > +		return 1;
> > 
> > checkpatch reports:
> > WARNING: braces {} are not necessary for any arm of this statement
> > #133: FILE: drivers/pci/pcie/aer/aerdrv_core.c:257:
> I will change it.
> 
> > +	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
> > [...]
> > +	} else
> > [...]
> > 
> > 
> > >  }
> > >  
> > >  /**
> > > @@ -705,6 +738,30 @@ static int get_device_error_info(struct 
> > >  	return AER_SUCCESS;
> > >  }
> > >  
> > > +static inline void aer_process_err_devices(struct pcie_device *p_device,
> > > +			struct aer_err_info *e_info)
> > > +{
> > > +	int i;
> > > +
> > > +	if (e_info->dev[0] == NULL) {
> > Minor not. Can we use 
> Yes. We can.
> 
> >           if (!e_info->dev[0]) {
> > 
> > > +		printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> > > +				__func__, e_info->id);
> > 
> > I suspect we don't want to embed the function name here, and use
> > dev_printk.
> Ok.
> 
> > 
> > > +	}
> > > +
> > > +	for (i = 0; i < e_info->error_dev_num; i ++) {
> > checkpatch reports:
> > ERROR: space prohibited before that '++' (ctx:WxB)
> > #154: FILE: drivers/pci/pcie/aer/aerdrv_core.c:751:
> > +	for (i = 0; i < e_info->error_dev_num; i +
> I will change it.
> 
> > 
> > 
> > > +		if (e_info->dev[i] == NULL)
> > again		  if (!e_info->dev[i])
> Will do.
> 
> > 
> > You could also put this check in the for loop.
> I planed to, but one guy helped me test it within a guest OS on XEN and
> reported a weired oops of guest OS. She said useing e_info->error_dev_num
> could avoid the oops.

I think something like:

  for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++)

is functionally equivalent.

> 
> > 
> > 
> > > +			break;
> > > +
> > > +		if (get_device_error_info(e_info->dev[i], e_info) ==
> > > +				AER_SUCCESS) {
> > > +			aer_print_error(e_info->dev[i], e_info);
> > > +			handle_error_source(p_device,
> > > +					e_info->dev[i],
> > > +					e_info);
> > > +		}
> > > +	}
> > > +}
> > > +
> > >  /**
> > >   * aer_isr_one_error - consume an error detected by root port
> > >   * @p_device: pointer to error root port service device
> > > @@ -747,18 +804,7 @@ static void aer_isr_one_error(struct pci
> > >  			e_info->flags |= AER_MULTI_ERROR_VALID_FLAG;
> > >  
> > >  		find_source_device(p_device->port, e_info);
> > > -		if (e_info->dev == NULL) {
> > > -			printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> > > -				__func__, e_info->id);
> > > -			continue;
> > > -		}
> > > -		if (get_device_error_info(e_info->dev, e_info) ==
> > > -				AER_SUCCESS) {
> > > -			aer_print_error(e_info->dev, e_info);
> > > -			handle_error_source(p_device,
> > > -				e_info->dev,
> > > -				e_info);
> > > -		}
> > > +		aer_process_err_devices(p_device, e_info);
> > >  	}
> > >  
> > >  	kfree(e_info);
> > > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h
> > > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:39:24.000000000 +0800
> > > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:45:15.000000000 +0800
> > > @@ -57,8 +57,10 @@ struct header_log_regs {
> > >  	unsigned int dw3;
> > >  };
> > >  
> > > +#define AER_MAX_MULTI_ERR_DEVICES	5
> > Is this number arbitrary or in the spec somewhere?
> It's arbitrary and not spec. 

I suspected so.

> The startpoint is it's very rare that there are more
> than 5 devices under the same root port reporting errors at the same time. 

Agreed.

> It's hard
> to say number 5 is the best. I just don't want the array is big.

I don't have a problem with that decision. But you might add a comment
saying so, e.g.,

#define AER_MAX_MULTI_ERR_DEVICES	5    /* Not likely to have more */

> 
> > 
> > 
> > >  struct aer_err_info {
> > > -	struct pci_dev *dev;
> > > +	struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
> > > +	int error_dev_num;
> > >  	u16 id;
> > >  	int severity;			/* 0:NONFATAL | 1:FATAL | 2:COR */
> > >  	int flags;
> > > 
> 
>
Yanmin Zhang June 15, 2009, 4:37 a.m. UTC | #4
On Mon, 2009-06-15 at 04:01 +0000, Andrew Patterson wrote:
> On Mon, 2009-06-15 at 09:47 +0800, Zhang, Yanmin wrote:
> > On Fri, 2009-06-12 at 22:16 +0000, Andrew Patterson wrote:
> > > On Fri, 2009-06-12 at 11:08 +0800, Zhang, Yanmin wrote:
> > > > When a root port receive the same errors more than once before kernel
> > > >  process them, the Multiple Error Messages Received flags are set by
> > > > hardware. Because root port could only save one kind of correctable
> > > > error source id and another uncorrectable error source id at the same
> > > > time, so the second message sender id is lost if the 2 messages are
> > > > sent from 2 different devices. Below patch searches all devices under
> > > > the root port when multiple messages are received.
> > > > 
> > > > Signed-off-by: Zhang Yanmin <yanmin.zhang@linux.intel.com>
> > > > 
> > > > +	for (i = 0; i < e_info->error_dev_num; i ++) {
> > > checkpatch reports:
> > > ERROR: space prohibited before that '++' (ctx:WxB)
> > > #154: FILE: drivers/pci/pcie/aer/aerdrv_core.c:751:
> > > +	for (i = 0; i < e_info->error_dev_num; i +
> > I will change it.
> > 
> > > 
> > > 
> > > > +		if (e_info->dev[i] == NULL)
> > > again		  if (!e_info->dev[i])
> > Will do.
> > 
> > > 
> > > You could also put this check in the for loop.
> > I planed to, but one guy helped me test it within a guest OS on XEN and
> > reported a weired oops of guest OS. She said useing e_info->error_dev_num
> > could avoid the oops.
> 
> I think something like:
> 
>   for (i = 0; i < e_info->error_dev_num && e_info->dev[i]; i++)
> 
> is functionally equivalent.
I changed it.

> 
> > 
> > > 
> > > 
> > > > +			break;
> > > > +
> > > > +		if (get_device_error_info(e_info->dev[i], e_info) ==
> > > > +				AER_SUCCESS) {
> > > > +			aer_print_error(e_info->dev[i], e_info);
> > > > +			handle_error_source(p_device,
> > > > +					e_info->dev[i],
> > > > +					e_info);
> > > > +		}
> > > > +	}
> > > > +}
> > > > +
> > > >  /**
> > > >   * aer_isr_one_error - consume an error detected by root port
> > > >   * @p_device: pointer to error root port service device
> > > > @@ -747,18 +804,7 @@ static void aer_isr_one_error(struct pci
> > > >  			e_info->flags |= AER_MULTI_ERROR_VALID_FLAG;
> > > >  
> > > >  		find_source_device(p_device->port, e_info);
> > > > -		if (e_info->dev == NULL) {
> > > > -			printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
> > > > -				__func__, e_info->id);
> > > > -			continue;
> > > > -		}
> > > > -		if (get_device_error_info(e_info->dev, e_info) ==
> > > > -				AER_SUCCESS) {
> > > > -			aer_print_error(e_info->dev, e_info);
> > > > -			handle_error_source(p_device,
> > > > -				e_info->dev,
> > > > -				e_info);
> > > > -		}
> > > > +		aer_process_err_devices(p_device, e_info);
> > > >  	}
> > > >  
> > > >  	kfree(e_info);
> > > > diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h
> > > > --- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:39:24.000000000 +0800
> > > > +++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:45:15.000000000 +0800
> > > > @@ -57,8 +57,10 @@ struct header_log_regs {
> > > >  	unsigned int dw3;
> > > >  };
> > > >  
> > > > +#define AER_MAX_MULTI_ERR_DEVICES	5
> > > Is this number arbitrary or in the spec somewhere?
> > It's arbitrary and not spec. 
> 
> I suspected so.
> 
> > The startpoint is it's very rare that there are more
> > than 5 devices under the same root port reporting errors at the same time. 
> 
> Agreed.
> 
> > It's hard
> > to say number 5 is the best. I just don't want the array is big.
> 
> I don't have a problem with that decision. But you might add a comment
> saying so, e.g.,
> 
> #define AER_MAX_MULTI_ERR_DEVICES	5    /* Not likely to have more */
Added.

Thanks Andrew.

--
To unsubscribe from this list: send the line "unsubscribe linux-pci" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
diff mbox

Patch

diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c
--- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:39:24.000000000 +0800
+++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv_core.c	2009-06-12 05:45:15.000000000 +0800
@@ -145,13 +145,22 @@  static void set_downstream_devices_error
 	pci_walk_bus(dev->subordinate, set_device_error_reporting, &enable);
 }
 
+static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
+{
+	if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
+		e_info->dev[e_info->error_dev_num ++] = dev;
+		return 1;
+	} else
+		return 0;
+}
+
 static int compare_device_id(struct pci_dev *dev, struct aer_err_info *e_info)
 {
 	if (e_info->id == ((dev->bus->number << 8) | dev->devfn)) {
 		/*
 		 * Device ID match
 		 */
-		e_info->dev = dev;
+		add_error_device(e_info, dev);
 		return 1;
 	}
 
@@ -166,20 +175,38 @@  static int find_device_iter(struct pci_d
 	u32 status;
 	u32 mask;
 	u16 reg16;
+	int result;
 	struct aer_err_info *e_info = (struct aer_err_info *)data;
 
 	/*
 	 * When bus id is equal to 0, it might be a bad id
 	 * reported by root port.
 	 */
-	if (!nosourceid && (PCI_BUS(e_info->id) != 0))
-		return compare_device_id(dev, e_info);
+	if (!nosourceid && (PCI_BUS(e_info->id) != 0)) {
+		result = compare_device_id(dev, e_info);
+		/*
+		 * If there is no multiple error, we stop
+		 * or continue based on the id comparing.
+		 */
+		if (!(e_info->flags & AER_MULTI_ERROR_VALID_FLAG))
+			return result;
+
+		/*
+		 * If there are multiple errors and id does match,
+		 * We need continue to search other devices under
+		 * the root port. Return 0 means that.
+		 */
+		if (result)
+			return 0;
+	}
 
 	/*
-	 * Next is to check when bus id is equal to 0 or
-	 * nosourceid==y. Some ports might lose the bus
-	 * id of error source id. We check AER status
-	 * registers to find the initial reporter.
+	 * When either
+	 * 	1) nosourceid==y;
+	 * 	2) bus id is equal to 0. Some ports might lose the bus
+	 * 		id of error source id;
+	 * 	3) There are multiple errors and prior id comparing fails;
+	 * We check AER status registers to find the initial reporter.
 	 */
 	if (atomic_read(&dev->enable_cnt) == 0)
 		return 0;
@@ -208,8 +235,8 @@  static int find_device_iter(struct pci_d
 				pos + PCI_ERR_COR_MASK,
 				&mask);
 		if (status & ERR_CORRECTABLE_ERROR_MASK & ~mask) {
-			e_info->dev = dev;
-			return 1;
+			add_error_device(e_info, dev);
+			goto added;
 		}
 	} else {
 		pci_read_config_dword(dev,
@@ -219,12 +246,18 @@  static int find_device_iter(struct pci_d
 				pos + PCI_ERR_UNCOR_MASK,
 				&mask);
 		if (status & ERR_UNCORRECTABLE_ERROR_MASK & ~mask) {
-			e_info->dev = dev;
-			return 1;
+			add_error_device(e_info, dev);
+			goto added;
 		}
 	}
 
 	return 0;
+
+added:
+	if (e_info->flags & AER_MULTI_ERROR_VALID_FLAG) {
+		return 0;
+	} else
+		return 1;
 }
 
 /**
@@ -705,6 +738,30 @@  static int get_device_error_info(struct 
 	return AER_SUCCESS;
 }
 
+static inline void aer_process_err_devices(struct pcie_device *p_device,
+			struct aer_err_info *e_info)
+{
+	int i;
+
+	if (e_info->dev[0] == NULL) {
+		printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
+				__func__, e_info->id);
+	}
+
+	for (i = 0; i < e_info->error_dev_num; i ++) {
+		if (e_info->dev[i] == NULL)
+			break;
+
+		if (get_device_error_info(e_info->dev[i], e_info) ==
+				AER_SUCCESS) {
+			aer_print_error(e_info->dev[i], e_info);
+			handle_error_source(p_device,
+					e_info->dev[i],
+					e_info);
+		}
+	}
+}
+
 /**
  * aer_isr_one_error - consume an error detected by root port
  * @p_device: pointer to error root port service device
@@ -747,18 +804,7 @@  static void aer_isr_one_error(struct pci
 			e_info->flags |= AER_MULTI_ERROR_VALID_FLAG;
 
 		find_source_device(p_device->port, e_info);
-		if (e_info->dev == NULL) {
-			printk(KERN_DEBUG "%s->can't find device of ID%04x\n",
-				__func__, e_info->id);
-			continue;
-		}
-		if (get_device_error_info(e_info->dev, e_info) ==
-				AER_SUCCESS) {
-			aer_print_error(e_info->dev, e_info);
-			handle_error_source(p_device,
-				e_info->dev,
-				e_info);
-		}
+		aer_process_err_devices(p_device, e_info);
 	}
 
 	kfree(e_info);
diff -Nraup linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h
--- linux-2.6_next_aernoid/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:39:24.000000000 +0800
+++ linux-2.6_next_aermultierror/drivers/pci/pcie/aer/aerdrv.h	2009-06-12 05:45:15.000000000 +0800
@@ -57,8 +57,10 @@  struct header_log_regs {
 	unsigned int dw3;
 };
 
+#define AER_MAX_MULTI_ERR_DEVICES	5
 struct aer_err_info {
-	struct pci_dev *dev;
+	struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
+	int error_dev_num;
 	u16 id;
 	int severity;			/* 0:NONFATAL | 1:FATAL | 2:COR */
 	int flags;